From bb00d9136830b3e861fe2b567d2b203e772078e5 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 16 Jun 2025 20:37:51 +0000 Subject: [PATCH 01/73] add README --- pipelinerl/miniwob/README.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 pipelinerl/miniwob/README.md diff --git a/pipelinerl/miniwob/README.md b/pipelinerl/miniwob/README.md new file mode 100644 index 00000000..9f91ddba --- /dev/null +++ b/pipelinerl/miniwob/README.md @@ -0,0 +1,30 @@ +# Miniwob example + +## Prerequesites + +### TapeAgents + +Clone [TapeAgents](https://github.com/ServiceNow/TapeAgents/) in your parent folder and install it. +```bash +cd .. +git clone git@github.com:ServiceNow/TapeAgents.git +cd TapeAgents +git checkout async_web_agent # required until #230 is merged into main +pip install -e . +pip install 'tapeagents[finetune,converters]=0.1.12' +cd ../PipelineRL +``` + +### Miniwob + +see setup here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/README.md + +### Playwright + +The environment server will need to have playwright installed. 
+ +`playwright install` + +## Launch Command + +`python -m pipelinerl.launch --config-name miniwob` From dc81770ddde67527264c30604a65c2a2cb6cd920 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 17 Jun 2025 15:09:07 +0000 Subject: [PATCH 02/73] increase env session inactivity timout --- conf/miniwob.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index a5bf8bc2..7e435c49 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -123,7 +123,7 @@ environment: miniwob_url: file:///home/toolkit/miniwob-plusplus/miniwob/html/miniwob/ n_envs: 64 host: "0.0.0.0" - max_session_inactivity_secs: 300 + max_session_inactivity_secs: 600 web_env_target: examples.rl_webagent.environment.WebEnvironment exp_path: ${output_dir}/env_server headless: true From e60d4c137753fcf176b83f387fd0906afa3d9a9f Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 17 Jun 2025 15:09:22 +0000 Subject: [PATCH 03/73] update readme --- pipelinerl/miniwob/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelinerl/miniwob/README.md b/pipelinerl/miniwob/README.md index 9f91ddba..04e63120 100644 --- a/pipelinerl/miniwob/README.md +++ b/pipelinerl/miniwob/README.md @@ -9,7 +9,6 @@ Clone [TapeAgents](https://github.com/ServiceNow/TapeAgents/) in your parent fol cd .. git clone git@github.com:ServiceNow/TapeAgents.git cd TapeAgents -git checkout async_web_agent # required until #230 is merged into main pip install -e . 
pip install 'tapeagents[finetune,converters]=0.1.12' cd ../PipelineRL From f9e45c26bafb65fc687dcca0f68d5066bc3a6678 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 18 Jun 2025 21:02:28 +0000 Subject: [PATCH 04/73] move miniwob to domains/ --- pipelinerl/{ => domains}/miniwob/README.md | 0 pipelinerl/{ => domains}/miniwob/environment_server.py | 0 pipelinerl/{ => domains}/miniwob/load_tasks.py | 0 pipelinerl/{ => domains}/miniwob/rollouts.py | 0 .../{ => domains}/miniwob/tool_chat_template_llama3.1_json.jinja | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename pipelinerl/{ => domains}/miniwob/README.md (100%) rename pipelinerl/{ => domains}/miniwob/environment_server.py (100%) rename pipelinerl/{ => domains}/miniwob/load_tasks.py (100%) rename pipelinerl/{ => domains}/miniwob/rollouts.py (100%) rename pipelinerl/{ => domains}/miniwob/tool_chat_template_llama3.1_json.jinja (100%) diff --git a/pipelinerl/miniwob/README.md b/pipelinerl/domains/miniwob/README.md similarity index 100% rename from pipelinerl/miniwob/README.md rename to pipelinerl/domains/miniwob/README.md diff --git a/pipelinerl/miniwob/environment_server.py b/pipelinerl/domains/miniwob/environment_server.py similarity index 100% rename from pipelinerl/miniwob/environment_server.py rename to pipelinerl/domains/miniwob/environment_server.py diff --git a/pipelinerl/miniwob/load_tasks.py b/pipelinerl/domains/miniwob/load_tasks.py similarity index 100% rename from pipelinerl/miniwob/load_tasks.py rename to pipelinerl/domains/miniwob/load_tasks.py diff --git a/pipelinerl/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py similarity index 100% rename from pipelinerl/miniwob/rollouts.py rename to pipelinerl/domains/miniwob/rollouts.py diff --git a/pipelinerl/miniwob/tool_chat_template_llama3.1_json.jinja b/pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja similarity index 100% rename from pipelinerl/miniwob/tool_chat_template_llama3.1_json.jinja rename to 
pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja From 8cdbd06a4dbd2ded04f9d42b8dc95425d4e530dc Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 7 Jul 2025 19:04:49 +0000 Subject: [PATCH 05/73] fix --- conf/miniwob.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 7e435c49..86f84c51 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -45,7 +45,7 @@ vllm_config: enforce-eager: "" # speed the actor llm startup a bit actor: - rollout_policy: pipelinerl.miniwob.rollouts.generate_miniwob_rollout + rollout_policy: pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout shared_memory_entry_size: 100000000 preprocess: @@ -119,7 +119,7 @@ agent: # ENVIRONMENT CONFIGURATION start_attempts: 3 # number of attempts to start each task environment: - _target_: pipelinerl.miniwob.environment_server.WebEnvironmentServer + _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer miniwob_url: file:///home/toolkit/miniwob-plusplus/miniwob/html/miniwob/ n_envs: 64 host: "0.0.0.0" @@ -130,7 +130,7 @@ environment: observation_format: html # DATASET CONFIGURATION -dataset_loader: pipelinerl.miniwob.load_tasks.load_tasks +dataset_loader: pipelinerl.domains.miniwob.load_tasks.load_tasks dataset_loader_params: train_split: 0.6 # 0.6 of tasks for training, 0.4 for testing seeds: [0, 42, 1337, 900, 103] From 551098270e3791e9f771e086852663464ebd0ac5 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 7 Jul 2025 19:14:47 +0000 Subject: [PATCH 06/73] fix path --- conf/miniwob.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 86f84c51..658fdd05 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -41,7 +41,7 @@ vllm_config: vllm_kwargs: enable-auto-tool-choice: "" tool-call-parser: llama3_json # use hermes for qwen - chat_template: pipelinerl/miniwob/tool_chat_template_llama3.1_json.jinja # copy 
pasted from https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_llama3.1_json.jinja + chat_template: pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja # copy pasted from https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_llama3.1_json.jinja enforce-eager: "" # speed the actor llm startup a bit actor: From 07e858c3ef8e74e6393c7fd239ecb4e51afa44bb Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 7 Jul 2025 20:27:33 +0000 Subject: [PATCH 07/73] return RuntimeError instead of HTTPException because not pickable --- pipelinerl/domains/miniwob/rollouts.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index bbf68860..fb437658 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -5,6 +5,7 @@ import random import time import aiohttp +from fastapi import HTTPException from hydra.utils import instantiate from omegaconf import DictConfig @@ -73,7 +74,10 @@ async def generate_miniwob_rollout( except Exception as e: start_attempts -= 1 if start_attempts <= 0: - raise e + if isinstance(e, HTTPException): + raise RuntimeError(f"HTTPException: {e.status_code} {e.detail}") + else: + raise e logger.warning(f"Failed to start task, retry after 5 seconds: {e}") await asyncio.sleep(5) logger.info(f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds") From 5e568964cd4f41ccd6852f72e415116f14bffba1 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 8 Jul 2025 14:22:13 +0000 Subject: [PATCH 08/73] add env_call_timeout --- conf/miniwob.yaml | 3 ++- pipelinerl/domains/miniwob/environment_server.py | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 658fdd05..d20ca1b6 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -123,7 +123,8 @@ 
environment: miniwob_url: file:///home/toolkit/miniwob-plusplus/miniwob/html/miniwob/ n_envs: 64 host: "0.0.0.0" - max_session_inactivity_secs: 600 + max_session_inactivity_secs: 600 # kill session after 10 minutes of inactivity + env_call_timeout: 60 # timeout for each environment call (e.g. start_task, act, etc.) web_env_target: examples.rl_webagent.environment.WebEnvironment exp_path: ${output_dir}/env_server headless: true diff --git a/pipelinerl/domains/miniwob/environment_server.py b/pipelinerl/domains/miniwob/environment_server.py index 13839f7a..db0072c5 100644 --- a/pipelinerl/domains/miniwob/environment_server.py +++ b/pipelinerl/domains/miniwob/environment_server.py @@ -14,11 +14,15 @@ def __init__(self, headless: bool = True, observation_format: str = "html", max_session_inactivity_secs: int = 600, + env_call_timeout: int = 60, ): os.environ["MINIWOB_URL"] = miniwob_url + # Remote environment server configuration self.n_envs = n_envs self.host = host self.max_session_inactivity_secs = max_session_inactivity_secs + self.env_call_timeout = env_call_timeout + # Individual web environment configuration self.web_env_target = web_env_target self.exp_path = exp_path self.headless = headless @@ -29,7 +33,7 @@ def launch(self, port: int): """ Serve the web environment in TapeAgent. 
""" - env_server = EnvironmentServer(n_envs=self.n_envs, host=self.host, port=port, max_session_inactivity_secs=self.max_session_inactivity_secs) + env_server = EnvironmentServer(n_envs=self.n_envs, host=self.host, port=port, max_session_inactivity_secs=self.max_session_inactivity_secs, env_call_timeout=self.env_call_timeout) env_server.launch(OmegaConf.create({ "_target_": self.web_env_target, "exp_path": self.exp_path, From c06b768f7e3ada49f9d0658174dd74cba4b1f79e Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 8 Jul 2025 18:39:10 +0000 Subject: [PATCH 09/73] update gpu fractions --- conf/miniwob.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index d20ca1b6..a20c594b 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -2,9 +2,9 @@ defaults: - base world: - actor_fraction: 4 - preprocessor_fraction: 1 - finetune_fraction: 3 + actor_fraction: 2 + preprocessor_fraction: 0 + finetune_fraction: 6 # debug: # mode: actor From b1ad285cf43c8b2fd647146846933ec615f659fc Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 8 Jul 2025 18:49:11 +0000 Subject: [PATCH 10/73] set kl coef to 0 --- conf/miniwob.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index a20c594b..d93edeaf 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -21,7 +21,7 @@ finetune: learning_rate: 1e-6 optim: adamw_torch rl: - kl_coef: 0.01 # GRPO beta coefficient + kl_coef: 0.0 # GRPO beta coefficient reward_minus_kl_coef: 0.0 # RLOO beta coefficient use_advantages: true algo: grpo From c8ac64d59a8e59429449135cb37b26fa08163154 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 8 Jul 2025 19:27:27 +0000 Subject: [PATCH 11/73] update max seq len --- conf/miniwob.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index d93edeaf..4b8c2149 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml 
@@ -15,7 +15,7 @@ model_path: meta-llama/Llama-3.1-8B-Instruct finetune: save_checkpoint_steps: 10 - seq_length: 4096 + seq_length: 8192 train_batch_size: 1 gradient_accumulation_passes: 1024 learning_rate: 1e-6 From b87a6d11102f121e5c5dcfceb7dd35dfd0b77952 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 9 Jul 2025 21:38:17 +0000 Subject: [PATCH 12/73] revert to json instead of tool use agent --- conf/miniwob.yaml | 76 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 24 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 4b8c2149..8a9eb6cd 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -17,7 +17,7 @@ finetune: save_checkpoint_steps: 10 seq_length: 8192 train_batch_size: 1 - gradient_accumulation_passes: 1024 + gradient_accumulation_passes: 512 learning_rate: 1e-6 optim: adamw_torch rl: @@ -28,7 +28,7 @@ finetune: llm: parameters: - max_tokens: 3072 + max_tokens: 4096 temperature: 1.0 test_llm: parameters: @@ -37,12 +37,12 @@ test_llm: top_p: 1.0 top_k: 50 -vllm_config: - vllm_kwargs: - enable-auto-tool-choice: "" - tool-call-parser: llama3_json # use hermes for qwen - chat_template: pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja # copy pasted from https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_llama3.1_json.jinja - enforce-eager: "" # speed the actor llm startup a bit +# vllm_config: +# vllm_kwargs: +# enable-auto-tool-choice: "" +# tool-call-parser: llama3_json # use hermes for qwen +# chat_template: pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja # copy pasted from https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_llama3.1_json.jinja +# enforce-eager: "" # speed the actor llm startup a bit actor: rollout_policy: pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout @@ -68,6 +68,10 @@ agent: allowed_tools: | You have access to the following tools: {tools_description} + allowed_steps: | + You are 
allowed to produce ONLY steps with the following json schemas: + {allowed_steps} + Do not reproduce schema when producing the steps, use it as a reference. thought_format: | Important! Respond with the plain text, do not include any JSON or code. Do not output anything besides what I asked in this message. @@ -75,13 +79,22 @@ agent: - _target_: examples.rl_webagent.agent.WebNode name: set_goal system_prompt: ${agent.templates.system_prompt} + # guidance: | + # Produce the thought that describes the intended solution to the task. In the reasoning lines: + # - review the instructions from the user and the content of the page. + # - outline the main task to be accomplished and the steps to be taken to achieve it. + # - produce definiton of done, that will be checked later to verify if the task was completed. + # ${agent.templates.thought_format} guidance: | - Produce the thought that describes the intended solution to the task. In the reasoning lines: + Produce the reasoning_thought step that describes the intended solution to the task. In the reasoning lines: - review the instructions from the user and the content of the page. - outline the main task to be accomplished and the steps to be taken to achieve it. - produce definiton of done, that will be checked later to verify if the task was completed. - ${agent.templates.thought_format} - steps_prompt: ${agent.templates.allowed_tools} + Produce only one step! + # steps_prompt: ${agent.templates.allowed_tools} + steps_prompt: ${agent.templates.allowed_steps} + steps: + - tapeagents.steps.ReasoningThought trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps - _target_: examples.rl_webagent.agent.WebNode @@ -91,26 +104,41 @@ agent: Review the current state of the page and previous steps to find the best possible next action to accomplish the task. 
Produce the reflection_thought to describe the current page state, reflect on your last action, describe what is left to do, and what will be the immediate next action. Produce only one reflection_thought step! - ${agent.templates.thought_format} - steps_prompt: ${agent.templates.allowed_tools} + # ${agent.templates.thought_format} + # steps_prompt: ${agent.templates.allowed_tools} + steps_prompt: ${agent.templates.allowed_steps} + steps: + - examples.rl_webagent.steps.ReflectionThought trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps - _target_: examples.rl_webagent.agent.WebNode name: act system_prompt: ${agent.templates.system_prompt} + # guidance: | + # Produce the single next tool call to be performed with the current page. + # If you think that the task is solved, call the FinalAnswer. + # You can interact with the page elements using their BIDs or coordinates as arguments for actions. + # HINTS: + # - You can use the BIDs of the elements or the mouse position in x, y coordinates to interact with them. + # - To select value in a dropdown or combobox, ALWAYS use SelectOption tool. + # - To click on a checkbox or radio button, ALWAYS use BID (or coordinates) of the corresponding Text and not the BID (or coordinates) of the element itself. + # - Press enter key to submit the search query. guidance: | - Produce the single next tool call to be performed with the current page. - If you think that the task is solved, call the FinalAnswer. + Produce the next action to be performed with the current page. + If you think that the task is solved, produce the final_answer_action. You can interact with the page elements using their BIDs or coordinates as arguments for actions. HINTS: - You can use the BIDs of the elements or the mouse position in x, y coordinates to interact with them. 
- - To select value in a dropdown or combobox, ALWAYS use SelectOption tool. + - To select value in a dropdown or combobox, ALWAYS use select_action. - To click on a checkbox or radio button, ALWAYS use BID (or coordinates) of the corresponding Text and not the BID (or coordinates) of the element itself. - Press enter key to submit the search query. - use_known_actions: true - use_function_calls: true + - Always produce only one step at a time. + - Step kind is always lowercase and underscore separated. + # steps_prompt: ${agent.templates.allowed_tools} + steps_prompt: ${agent.templates.allowed_steps} + # use_known_actions: true steps: - - examples.rl_webagent.steps.FinalAnswerAction + - examples.rl_webagent.steps.WebAgentAction trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps next_node: reflect @@ -120,11 +148,11 @@ agent: start_attempts: 3 # number of attempts to start each task environment: _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer - miniwob_url: file:///home/toolkit/miniwob-plusplus/miniwob/html/miniwob/ - n_envs: 64 + miniwob_url: ??? + n_envs: 8 host: "0.0.0.0" max_session_inactivity_secs: 600 # kill session after 10 minutes of inactivity - env_call_timeout: 60 # timeout for each environment call (e.g. start_task, act, etc.) + env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) 
web_env_target: examples.rl_webagent.environment.WebEnvironment exp_path: ${output_dir}/env_server headless: true @@ -136,6 +164,6 @@ dataset_loader_params: train_split: 0.6 # 0.6 of tasks for training, 0.4 for testing seeds: [0, 42, 1337, 900, 103] train_dataset_names: - - train + - debug test_dataset_names: - - test + - debug From 824d841861ac1d2314d604ede6ac6f90ac724b24 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 9 Jul 2025 21:38:34 +0000 Subject: [PATCH 13/73] update README --- pipelinerl/domains/miniwob/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/domains/miniwob/README.md b/pipelinerl/domains/miniwob/README.md index 04e63120..0539f078 100644 --- a/pipelinerl/domains/miniwob/README.md +++ b/pipelinerl/domains/miniwob/README.md @@ -26,4 +26,4 @@ The environment server will need to have playwright installed. ## Launch Command -`python -m pipelinerl.launch --config-name miniwob` +`python -m pipelinerl.launch --config-name environment.miniwob_url=file:///PATH/TO/miniwob-plusplus/miniwob/html/miniwob/` From 8d170eccad7d21985d79acd3132302365184b8fe Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 10 Jul 2025 19:53:31 +0000 Subject: [PATCH 14/73] debug overflow counter --- pipelinerl/domains/miniwob/rollouts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index fb437658..22d1511e 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -123,7 +123,7 @@ async def generate_miniwob_rollout( ] # (4) # For each LLM interaction in the tape, make a training example. 
- all_finished = 0 + all_finished = 1 prompt_tokens = [llm_call.prompt_length_tokens for llm_call in llm_calls] output_tokens = [llm_call.output_length_tokens for llm_call in llm_calls] training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] From 21a1b2afc91d8d7dc42eb0292b698d071a094f87 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 10 Jul 2025 19:54:52 +0000 Subject: [PATCH 15/73] fix prompts --- conf/miniwob.yaml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 8a9eb6cd..6ab67eaf 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -75,6 +75,9 @@ agent: thought_format: | Important! Respond with the plain text, do not include any JSON or code. Do not output anything besides what I asked in this message. + json_format: | + Important! Respond with parsable JSON, do not include any text or code. + Do not output anything besides one JSON object. nodes: - _target_: examples.rl_webagent.agent.WebNode name: set_goal @@ -90,7 +93,8 @@ agent: - review the instructions from the user and the content of the page. - outline the main task to be accomplished and the steps to be taken to achieve it. - produce definiton of done, that will be checked later to verify if the task was completed. - Produce only one step! + Produce only one reasoning_thought step! + ${agent.templates.json_format} # steps_prompt: ${agent.templates.allowed_tools} steps_prompt: ${agent.templates.allowed_steps} steps: @@ -104,6 +108,7 @@ agent: Review the current state of the page and previous steps to find the best possible next action to accomplish the task. Produce the reflection_thought to describe the current page state, reflect on your last action, describe what is left to do, and what will be the immediate next action. Produce only one reflection_thought step! 
+ ${agent.templates.json_format} # ${agent.templates.thought_format} # steps_prompt: ${agent.templates.allowed_tools} steps_prompt: ${agent.templates.allowed_steps} @@ -134,11 +139,10 @@ agent: - Press enter key to submit the search query. - Always produce only one step at a time. - Step kind is always lowercase and underscore separated. + ${agent.templates.json_format} # steps_prompt: ${agent.templates.allowed_tools} steps_prompt: ${agent.templates.allowed_steps} - # use_known_actions: true - steps: - - examples.rl_webagent.steps.WebAgentAction + use_known_actions: true trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps next_node: reflect From 05b67941586085a0bde3cd43432dbce38b1e1c12 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 11 Jul 2025 14:44:22 +0000 Subject: [PATCH 16/73] update readme --- pipelinerl/domains/miniwob/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pipelinerl/domains/miniwob/README.md b/pipelinerl/domains/miniwob/README.md index 0539f078..9ff8461c 100644 --- a/pipelinerl/domains/miniwob/README.md +++ b/pipelinerl/domains/miniwob/README.md @@ -14,6 +14,11 @@ pip install 'tapeagents[finetune,converters]=0.1.12' cd ../PipelineRL ``` +Make sure to add the TapeAgent folder to your python path. 
+```bash +export PYTHONPATH="/path/to/TapeAgents:$PYTHONPATH" +``` + ### Miniwob see setup here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/README.md From ef6b2b02687d642d1d9eeec3ed6822f7651aa00d Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 21 Jul 2025 17:52:46 +0000 Subject: [PATCH 17/73] flag tape as invalid instead of raising http errors --- pipelinerl/domains/miniwob/rollouts.py | 62 ++++++++++++++++---------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 22d1511e..5d483a0b 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -33,7 +33,8 @@ def tape_contains_an_error(tape: WebTape) -> bool: - the last step is a PageObservation with an error """ return ( - isinstance(tape.steps[-1], LLMOutputParsingFailureAction) + len(tape.steps) == 0 + or isinstance(tape.steps[-1], LLMOutputParsingFailureAction) or tape.metadata.result.get("error") is not None or (isinstance(tape.steps[-1], PageObservation) and tape.steps[-1].error) ) @@ -63,6 +64,7 @@ async def generate_miniwob_rollout( env_job_url = f"http://{env_job.hostname}:{env_job.port}" # (2) Generate environment, TapeAgent, and run them to get a Tape + no_error = True # track if there was an error in the tape environment = AsyncRemoteEnvironment(server_url=env_job_url) # type: ignore async with environment.acontext(session, wait_for_env=True) as env: start_attempts = cfg.start_attempts @@ -72,26 +74,35 @@ async def generate_miniwob_rollout( tape_dict, _ = await env.start_task(problem) break except Exception as e: + logger.warning(f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']}") start_attempts -= 1 if start_attempts <= 0: - if isinstance(e, HTTPException): - raise RuntimeError(f"HTTPException: {e.status_code} {e.detail}") - else: - raise e - logger.warning(f"Failed to start task, retry after 5 
seconds: {e}") - await asyncio.sleep(5) + no_error = False + tape_dict = {} + break + # if isinstance(e, HTTPException): + # raise RuntimeError(f"HTTPException: {e.status_code} {e.detail}") + # else: + # raise e + else: + logger.warning(f"retry after 5 seconds: {e}") + await asyncio.sleep(5) logger.info(f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds") tape: WebTape = WebTape(**tape_dict) # convert http response dict to WebTape object t = time.perf_counter() - try: - actions = await env.a_actions() - tools_description = await env.a_tools_description() - logger.debug(f"Available tools: {tools_description}") - agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) - agent.llms = {DEFAULT: llm} - tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) - except Exception as e: - logger.error(f"Error occurred while running agent: {e}") + if no_error: # only run the agent if the task started successfully + logger.info(f"Running agent for task {problem['dataset']}/{problem['task']]}/{problem['seed']}") + try: + actions = await env.a_actions() + tools_description = await env.a_tools_description() + logger.debug(f"Available tools: {tools_description}") + agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) + agent.llms = {DEFAULT: llm} + tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) + except Exception as e: + logger.error(f"Error occurred while running agent: {e}") + no_error = False + logger.info(f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds") tape.metadata.result = {"execution_time": time.perf_counter() - t} # save the tape as we go @@ -99,13 +110,18 @@ async def generate_miniwob_rollout( save_json_tape(tape, os.path.join(cfg.output_dir, "tapes"), tape.metadata.id) # (3) 
Compute rewards - last_obs = [step for step in tape if isinstance(step, Observation)][-1] - # in Miniwob, the observation "reward" is defined as RAW_REWARD_GLOBAL > 0 - # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L183 - # Let's take directly the RAW_REWARD_GLOBAL from the metadata - # raw_reward = last_obs.metadata.other.get("reward", 0.0) - raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) - no_error = not tape_contains_an_error(tape) + obs_steps = [step for step in tape if isinstance(step, Observation)] + if obs_steps: + last_obs = obs_steps[-1] + # in Miniwob, the observation "reward" is defined as RAW_REWARD_GLOBAL > 0 + # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L183 + # Let's take directly the RAW_REWARD_GLOBAL from the metadata + # raw_reward = last_obs.metadata.other.get("reward", 0.0) + raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) + else: + raw_reward = -1.0 + + no_error = no_error and not tape_contains_an_error(tape) # get the number of LLMOutputParsingFailureAction in the tape n_step_errors = len([step for step in tape.steps if isinstance(step, LLMOutputParsingFailureAction)]) # get the number of PageObservation steps in the tape From 0abc2b094071130249edce0f9b54e9a75fe9cd31 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 21 Jul 2025 17:59:27 +0000 Subject: [PATCH 18/73] use redis --- conf/miniwob.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 6ab67eaf..faa759ca 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -1,5 +1,7 @@ defaults: - base + - override streams: redis + - _self_ world: actor_fraction: 2 From d3f68893113988c6af2b50bdec97c884336366e2 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 21 Jul 2025 18:04:32 +0000 Subject: 
[PATCH 19/73] track task names instead of data splits --- pipelinerl/domains/miniwob/load_tasks.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pipelinerl/domains/miniwob/load_tasks.py b/pipelinerl/domains/miniwob/load_tasks.py index e5056c80..2c58f0e0 100644 --- a/pipelinerl/domains/miniwob/load_tasks.py +++ b/pipelinerl/domains/miniwob/load_tasks.py @@ -56,20 +56,24 @@ def load_tasks(dataset_names: list[str], train_split: float = 0.6, seeds: list[i for name in dataset_names: if name == "debug": tasks.extend([ - {"dataset": "miniwob.debug", "task": task, "seed": 0} for task in DEBUG_SPLIT + # {"dataset": "miniwob.debug", "task": task, "seed": 0} for task in DEBUG_SPLIT + {"dataset": task, "task": task, "seed": 0} for task in DEBUG_SPLIT ]) elif name == "easy": tasks.extend([ - {"dataset": "miniwob.easy", "task": task, "seed": 0} for task in EASY_SPLIT + # {"dataset": "miniwob.easy", "task": task, "seed": 0} for task in EASY_SPLIT + {"dataset": task, "task": task, "seed": 0} for task in EASY_SPLIT ]) elif name == "train": tasks.extend([ - {"dataset": "miniwob.train", "task": task, "seed": seed} + # {"dataset": "miniwob.train", "task": task, "seed": seed} + {"dataset": task, "task": task, "seed": seed} for task in TRAIN_SPLIT for seed in seeds ]) elif name == "test": tasks.extend([ - {"dataset": "miniwob.test", "task": task, "seed": seed} + # {"dataset": "miniwob.test", "task": task, "seed": seed} + {"dataset": task, "task": task, "seed": seed} for task in TEST_SPLIT for seed in seeds ]) return tasks From 9c319e3eeb95e6d448d57f6e4ad0de1a69623c2e Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 21 Jul 2025 19:31:55 +0000 Subject: [PATCH 20/73] fix --- pipelinerl/domains/miniwob/rollouts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 5d483a0b..e3f039dd 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ 
b/pipelinerl/domains/miniwob/rollouts.py @@ -91,7 +91,7 @@ async def generate_miniwob_rollout( tape: WebTape = WebTape(**tape_dict) # convert http response dict to WebTape object t = time.perf_counter() if no_error: # only run the agent if the task started successfully - logger.info(f"Running agent for task {problem['dataset']}/{problem['task']]}/{problem['seed']}") + logger.info(f"Running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']}") try: actions = await env.a_actions() tools_description = await env.a_tools_description() From 92c8a93b227c65b5f044c04fafc33cc185f5b4d5 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 22 Jul 2025 19:38:27 +0000 Subject: [PATCH 21/73] remove unused var in new tapeagent remote_env --- conf/miniwob.yaml | 1 - pipelinerl/domains/miniwob/environment_server.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index faa759ca..926f176b 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -157,7 +157,6 @@ environment: miniwob_url: ??? n_envs: 8 host: "0.0.0.0" - max_session_inactivity_secs: 600 # kill session after 10 minutes of inactivity env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) 
web_env_target: examples.rl_webagent.environment.WebEnvironment exp_path: ${output_dir}/env_server diff --git a/pipelinerl/domains/miniwob/environment_server.py b/pipelinerl/domains/miniwob/environment_server.py index db0072c5..b30f9ef7 100644 --- a/pipelinerl/domains/miniwob/environment_server.py +++ b/pipelinerl/domains/miniwob/environment_server.py @@ -13,14 +13,12 @@ def __init__(self, exp_path: str, headless: bool = True, observation_format: str = "html", - max_session_inactivity_secs: int = 600, env_call_timeout: int = 60, ): os.environ["MINIWOB_URL"] = miniwob_url # Remote environment server configuration self.n_envs = n_envs self.host = host - self.max_session_inactivity_secs = max_session_inactivity_secs self.env_call_timeout = env_call_timeout # Individual web environment configuration self.web_env_target = web_env_target @@ -33,7 +31,7 @@ def launch(self, port: int): """ Serve the web environment in TapeAgent. """ - env_server = EnvironmentServer(n_envs=self.n_envs, host=self.host, port=port, max_session_inactivity_secs=self.max_session_inactivity_secs, env_call_timeout=self.env_call_timeout) + env_server = EnvironmentServer(n_envs=self.n_envs, host=self.host, port=port, env_call_timeout=self.env_call_timeout) env_server.launch(OmegaConf.create({ "_target_": self.web_env_target, "exp_path": self.exp_path, From edf4d000a10cd166116666659a44fbc9495807d6 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 23 Jul 2025 18:33:11 +0000 Subject: [PATCH 22/73] use BaseMetrics --- pipelinerl/domains/miniwob/rollouts.py | 34 +++++++++++++++++--------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index e3f039dd..b2de5373 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -25,6 +25,18 @@ logger = logging.getLogger(__name__) +class MiniwobMetrics(BaseMetrics): + reward: float + success: bool + no_error: bool + 
no_answer: bool + overflow: bool + n_llm_calls: int + n_step_errors: int + n_page_observations: int + n_steps: int + + def tape_contains_an_error(tape: WebTape) -> bool: """ Returns true if the tape ends with an error, ie if one of the following is true: @@ -149,17 +161,17 @@ async def generate_miniwob_rollout( latency = time.time() - start_time - metrics = { - "reward": reward, - "success": 1 if reward > 0.5 else 0, - "no_error": no_error, - "no_answer": 1 if reward < 0 else 0, - "overflow": 0 if all_finished else 1, - "n_llm_calls": n_llm_calls, - "n_step_errors": n_step_errors, - "n_page_observations": n_page_observations, - "n_steps": len(tape.steps), - } + metrics = MiniwobMetrics( + reward=reward, + success=reward > 0.5, + no_error=no_error, + no_answer=reward < 0, + overflow=not all_finished, + n_llm_calls=n_llm_calls, + n_step_errors=n_step_errors, + n_page_observations=n_page_observations, + n_steps=len(tape.steps), + ) return RolloutResult( training_texts=training_texts, From 28749e0928d6db9b76efcb6e6f67e50bc933dca3 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 23 Jul 2025 18:37:30 +0000 Subject: [PATCH 23/73] fix --- pipelinerl/domains/miniwob/rollouts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index b2de5373..eb6d563b 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -10,7 +10,7 @@ from omegaconf import DictConfig from pipelinerl.async_llm import llm_async_generate, make_training_text -from pipelinerl.rollouts import RolloutResult +from pipelinerl.rollouts import BaseMetrics, RolloutResult from pipelinerl.world import Job from tapeagents.agent import Agent, DEFAULT from tapeagents.core import LLMOutputParsingFailureAction, Observation, LLMCall From a4f9f79bec163b558e24a660636d221c0b39508b Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 23 Jul 2025 21:18:22 +0000 Subject: [PATCH 
24/73] keep track of time taken --- pipelinerl/domains/miniwob/rollouts.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index eb6d563b..43bf70d3 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -35,6 +35,9 @@ class MiniwobMetrics(BaseMetrics): n_step_errors: int n_page_observations: int n_steps: int + total_execution_time: float + agent_execution_time: float + environment_execution_time: float def tape_contains_an_error(tape: WebTape) -> bool: @@ -115,7 +118,7 @@ async def generate_miniwob_rollout( logger.error(f"Error occurred while running agent: {e}") no_error = False logger.info(f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds") - tape.metadata.result = {"execution_time": time.perf_counter() - t} + tape.metadata.result.update({"total_execution_time": time.perf_counter() - t}) # save the tape as we go if cfg.save_tapes: @@ -171,6 +174,9 @@ async def generate_miniwob_rollout( n_step_errors=n_step_errors, n_page_observations=n_page_observations, n_steps=len(tape.steps), + total_execution_time=tape.metadata.result.get("total_execution_time", -1.0), + agent_execution_time=tape.metadata.result.get("agent_execution_time", -1.0), + environment_execution_time=tape.metadata.result.get("environment_execution_time", -1.0), ) return RolloutResult( From 8a6120f1a8261ffac963f9602d2bd503afe86a39 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 24 Jul 2025 11:11:33 +0200 Subject: [PATCH 25/73] send per step times to wandb --- pipelinerl/domains/miniwob/rollouts.py | 45 +++++++++++++++----------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 43bf70d3..5c56c92a 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ 
b/pipelinerl/domains/miniwob/rollouts.py @@ -1,26 +1,24 @@ - import asyncio import logging import os import random import time + import aiohttp -from fastapi import HTTPException +from examples.rl_webagent.steps import WebTape from hydra.utils import instantiate from omegaconf import DictConfig - -from pipelinerl.async_llm import llm_async_generate, make_training_text -from pipelinerl.rollouts import BaseMetrics, RolloutResult -from pipelinerl.world import Job -from tapeagents.agent import Agent, DEFAULT -from tapeagents.core import LLMOutputParsingFailureAction, Observation, LLMCall +from tapeagents.agent import DEFAULT, Agent +from tapeagents.core import LLMCall, LLMOutputParsingFailureAction, Observation +from tapeagents.io import save_json_tape from tapeagents.llms.trainable import TrainableLLM +from tapeagents.orchestrator import async_execute_agent from tapeagents.remote_environment import AsyncRemoteEnvironment from tapeagents.tools.simple_browser import PageObservation -from tapeagents.orchestrator import async_execute_agent -from tapeagents.io import save_json_tape -from examples.rl_webagent.steps import WebTape +from pipelinerl.async_llm import make_training_text +from pipelinerl.rollouts import BaseMetrics, RolloutResult +from pipelinerl.world import Job logger = logging.getLogger(__name__) @@ -38,6 +36,8 @@ class MiniwobMetrics(BaseMetrics): total_execution_time: float agent_execution_time: float environment_execution_time: float + env_step_time: float + agent_step_time: float def tape_contains_an_error(tape: WebTape) -> bool: @@ -102,7 +102,9 @@ async def generate_miniwob_rollout( else: logger.warning(f"retry after 5 seconds: {e}") await asyncio.sleep(5) - logger.info(f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds") + logger.info( + f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds" + ) tape: WebTape = WebTape(**tape_dict) # 
convert http response dict to WebTape object t = time.perf_counter() if no_error: # only run the agent if the task started successfully @@ -117,7 +119,9 @@ async def generate_miniwob_rollout( except Exception as e: logger.error(f"Error occurred while running agent: {e}") no_error = False - logger.info(f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds") + logger.info( + f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds" + ) tape.metadata.result.update({"total_execution_time": time.perf_counter() - t}) # save the tape as we go @@ -148,7 +152,8 @@ async def generate_miniwob_rollout( llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] n_llm_calls = len(llm_calls) llm_calls: list[LLMCall] = [ - LLMCall(**step.metadata.other["llm_call"]) if isinstance(step.metadata.other["llm_call"], dict) + LLMCall(**step.metadata.other["llm_call"]) + if isinstance(step.metadata.other["llm_call"], dict) else step.metadata.other["llm_call"] for step in llm_calls ] @@ -163,7 +168,10 @@ async def generate_miniwob_rollout( all_finished &= 1 if text.input_ids[-1] == llm.tokenizer.eos_token_id else 0 latency = time.time() - start_time - + agent_time = tape.metadata.result.get("agent_execution_time", -1.0) + env_time = tape.metadata.result.get("environment_execution_time", -1.0) + n_observations = len([s for s in tape.steps if isinstance(s, Observation)]) + n_other_steps = len(tape.steps) - n_observations metrics = MiniwobMetrics( reward=reward, success=reward > 0.5, @@ -175,8 +183,10 @@ async def generate_miniwob_rollout( n_page_observations=n_page_observations, n_steps=len(tape.steps), total_execution_time=tape.metadata.result.get("total_execution_time", -1.0), - agent_execution_time=tape.metadata.result.get("agent_execution_time", -1.0), - 
environment_execution_time=tape.metadata.result.get("environment_execution_time", -1.0), + agent_execution_time=agent_time, + environment_execution_time=env_time, + env_step_time=env_time / n_observations if env_time > 0 and n_observations > 0 else -1.0, + agent_step_time=agent_time / n_other_steps if agent_time > 0 and n_other_steps > 0 else -1.0, ) return RolloutResult( @@ -187,4 +197,3 @@ async def generate_miniwob_rollout( prompt_tokens=prompt_tokens, output_tokens=output_tokens, ) - From 5eb3a4eb7b96212f8fad19268b10f09b49fb399e Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 25 Jul 2025 21:05:43 +0000 Subject: [PATCH 26/73] use all miniwob tasks --- conf/miniwob.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 926f176b..48e66108 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -169,6 +169,6 @@ dataset_loader_params: train_split: 0.6 # 0.6 of tasks for training, 0.4 for testing seeds: [0, 42, 1337, 900, 103] train_dataset_names: - - debug + - train test_dataset_names: - - debug + - test From 75d3c9c303a8372e1d02673526b77691a6990ef3 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 28 Jul 2025 14:45:57 +0000 Subject: [PATCH 27/73] default save checkpoints --- conf/miniwob.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 48e66108..16cd1068 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -16,7 +16,6 @@ output_dir: results/miniwob_debug/${now:%Y-%m-%d}/${now:%H-%M-%S} model_path: meta-llama/Llama-3.1-8B-Instruct finetune: - save_checkpoint_steps: 10 seq_length: 8192 train_batch_size: 1 gradient_accumulation_passes: 512 From 6b97c7b0cbb607d83ebb927b84b0ae8b61c8538a Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 28 Jul 2025 15:03:22 +0000 Subject: [PATCH 28/73] update vllm max tokens --- conf/base.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/base.yaml b/conf/base.yaml 
index 3d426f4c..2f20d18c 100644 --- a/conf/base.yaml +++ b/conf/base.yaml @@ -47,7 +47,7 @@ llm: temperature: 1.0 test_llm: parameters: - max_tokens: 16000 + max_tokens: 8192 temperature: 1.0 top_p: 0.95 top_k: 50 @@ -67,6 +67,7 @@ vllm_config: tensor-parallel-size: 1 pipeline-parallel-size: 1 generation-config: vllm + max_model_len: 10000 world: replicas: 1 From d3cf30b9c44ba03fc9edb1ae2507418bad5828cc Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 28 Jul 2025 18:31:18 +0000 Subject: [PATCH 29/73] assert group size is as expected --- pipelinerl/actor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index dad79e0b..6a1f2447 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -462,6 +462,9 @@ def run(self, dataset: list[tuple[str, dict]]): assert isinstance(rollout_results, list) assert isinstance(rollout_results[0], RolloutResult) + assert len(rollout_results) == attempts, ( + f"Expected {attempts} rollouts, got {len(rollout_results)}" + ) group_samples = sum(len(r.training_texts) for r in rollout_results) published_samples += group_samples From 4c50f1f4e9721e1956c2fe794a8f87c2908397e4 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 28 Jul 2025 18:32:10 +0000 Subject: [PATCH 30/73] assert finetuning length is as much as vllm max length --- pipelinerl/launch.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pipelinerl/launch.py b/pipelinerl/launch.py index b03ab8d7..ac87457e 100644 --- a/pipelinerl/launch.py +++ b/pipelinerl/launch.py @@ -71,6 +71,13 @@ def validate_config(cfg: DictConfig): if not hasattr(cfg.finetune.rl, "value_loss_coef") or cfg.finetune.rl.value_loss_coef <= 0.0: raise ValueError("value_loss_coef must be greater than 0 when using causal-language-modeling-with-value-head") + if cfg.finetune.seq_length < cfg.vllm_config.vllm_kwargs.max_model_len: + raise ValueError( + f"seq_length {cfg.finetune.seq_length} must be greater than or equal to " + 
f"vllm_kwargs.max_model_len {cfg.vllm_config.vllm_kwargs.max_model_len}" + ) + + def run_ref_llm(cfg: DictConfig, preprocessor_llm_idx: int, local_idx: int, gpus: list[int], exp_dir: Path): kwargs = cfg.vllm_config.vllm_kwargs From ff61d73dc92ad71cba9bf4d2f6cdf8a5d2f9b00e Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 28 Jul 2025 18:32:47 +0000 Subject: [PATCH 31/73] update finetuning & vllm max lengths --- conf/miniwob.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 16cd1068..225a7e6e 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -16,7 +16,7 @@ output_dir: results/miniwob_debug/${now:%Y-%m-%d}/${now:%H-%M-%S} model_path: meta-llama/Llama-3.1-8B-Instruct finetune: - seq_length: 8192 + seq_length: 16384 train_batch_size: 1 gradient_accumulation_passes: 512 learning_rate: 1e-6 @@ -38,8 +38,9 @@ test_llm: top_p: 1.0 top_k: 50 -# vllm_config: -# vllm_kwargs: +vllm_config: + vllm_kwargs: + max_model_len: 16384 # enable-auto-tool-choice: "" # tool-call-parser: llama3_json # use hermes for qwen # chat_template: pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja # copy pasted from https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_llama3.1_json.jinja From a00e6e63a74d3212c53c41bdf681fad6b5e94162 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 28 Jul 2025 21:19:48 +0000 Subject: [PATCH 32/73] debug agent --- conf/miniwob.yaml | 2 ++ pipelinerl/domains/miniwob/rollouts.py | 4 ---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 225a7e6e..91c89ee6 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -145,6 +145,8 @@ agent: # steps_prompt: ${agent.templates.allowed_tools} steps_prompt: ${agent.templates.allowed_steps} use_known_actions: true + steps: + - examples.rl_webagent.steps.FinalAnswerAction trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in 
prompt messages max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps next_node: reflect diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 5c56c92a..3d3287be 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -95,10 +95,6 @@ async def generate_miniwob_rollout( no_error = False tape_dict = {} break - # if isinstance(e, HTTPException): - # raise RuntimeError(f"HTTPException: {e.status_code} {e.detail}") - # else: - # raise e else: logger.warning(f"retry after 5 seconds: {e}") await asyncio.sleep(5) From 6f149c89e67094fa1e8633072ba72c534082b099 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 8 Aug 2025 15:05:43 +0000 Subject: [PATCH 33/73] use ppo & upd config --- conf/miniwob.yaml | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 91c89ee6..57cd8b89 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -1,6 +1,7 @@ defaults: - base - override streams: redis + - override finetune: ppo - _self_ world: @@ -12,24 +13,15 @@ world: # mode: actor save_tapes: False -output_dir: results/miniwob_debug/${now:%Y-%m-%d}/${now:%H-%M-%S} +output_dir: results/miniwob/${now:%Y-%m-%d}/${now:%H-%M-%S} model_path: meta-llama/Llama-3.1-8B-Instruct finetune: - seq_length: 16384 - train_batch_size: 1 - gradient_accumulation_passes: 512 - learning_rate: 1e-6 - optim: adamw_torch - rl: - kl_coef: 0.0 # GRPO beta coefficient - reward_minus_kl_coef: 0.0 # RLOO beta coefficient - use_advantages: true - algo: grpo + seq_length: 16384 # input + output tokens llm: parameters: - max_tokens: 4096 + max_tokens: 4096 # output tokens temperature: 1.0 test_llm: parameters: @@ -40,7 +32,7 @@ test_llm: vllm_config: vllm_kwargs: - max_model_len: 16384 + max_model_len: 16384 # input + output tokens # enable-auto-tool-choice: "" # tool-call-parser: llama3_json # use hermes for qwen # 
chat_template: pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja # copy pasted from https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_llama3.1_json.jinja @@ -157,9 +149,9 @@ start_attempts: 3 # number of attempts to start each task environment: _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer miniwob_url: ??? - n_envs: 8 + n_envs: 32 host: "0.0.0.0" - env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) + env_call_timeout: 600 # timeout for each environment call (e.g. start_task, act, etc.) web_env_target: examples.rl_webagent.environment.WebEnvironment exp_path: ${output_dir}/env_server headless: true From 2ae2dd8d9fc41a733b06cbbd4bc36b95ea0c2434 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 8 Aug 2025 15:05:59 +0000 Subject: [PATCH 34/73] update readme --- pipelinerl/domains/miniwob/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/domains/miniwob/README.md b/pipelinerl/domains/miniwob/README.md index 9ff8461c..e9af1b42 100644 --- a/pipelinerl/domains/miniwob/README.md +++ b/pipelinerl/domains/miniwob/README.md @@ -31,4 +31,4 @@ The environment server will need to have playwright installed. 
## Launch Command -`python -m pipelinerl.launch --config-name environment.miniwob_url=file:///PATH/TO/miniwob-plusplus/miniwob/html/miniwob/` +`python -m pipelinerl.launch --config-name miniwob environment.miniwob_url=file:///PATH/TO/miniwob-plusplus/miniwob/html/miniwob/` From 913c8e27771d307ba1bf84ea76a211463bd2001b Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 11 Aug 2025 14:03:22 +0000 Subject: [PATCH 35/73] stop training after 1k steps --- conf/miniwob.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 57cd8b89..07943203 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -18,6 +18,7 @@ model_path: meta-llama/Llama-3.1-8B-Instruct finetune: seq_length: 16384 # input + output tokens + max_train_steps: 1000 llm: parameters: From 402eeb239e82694a29f4378be17d75dc2148f322 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 20 Aug 2025 18:17:28 +0000 Subject: [PATCH 36/73] scale up env servers by llm_servers --- conf/base.yaml | 3 ++- pipelinerl/world.py | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/conf/base.yaml b/conf/base.yaml index 2f20d18c..995db7c5 100644 --- a/conf/base.yaml +++ b/conf/base.yaml @@ -76,7 +76,8 @@ world: preprocessor_fraction: 0 finetune_fraction: 4 - env_replicas: 2 + # Number of environment servers per actor VLLM server + env_replicas_per_actor: 1 actor_group_port: 9000 environment_start_port: 7777 diff --git a/pipelinerl/world.py b/pipelinerl/world.py index f41714e4..cc23afd0 100644 --- a/pipelinerl/world.py +++ b/pipelinerl/world.py @@ -188,7 +188,10 @@ def _place_pipeline_stages(self, cfg): self.add_job(kind="preprocessor", replica_idx=worker_idx, node_rank=node, gpus=[], cpu_heavy=True) def _place_environments(self, cfg): - for worker_idx in range(cfg.world.env_replicas): + # Scale environment servers to be the same as llm servers + env_replicas_per_actor = getattr(cfg.world, "env_replicas_per_actor", 1) + total_env_replicas = 
cfg.world.replicas * self.llms_per_actor * env_replicas_per_actor + for worker_idx in range(total_env_replicas): node = self.get_least_busy_node() envs_at_node = len([job for job in self.job_map[node] if job.kind == "environment"]) self.add_job( From 58f31ccd6f441bc286cea798dfe201f2b267265b Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 20 Aug 2025 18:19:30 +0000 Subject: [PATCH 37/73] reweight actor/trainer --- conf/miniwob.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 07943203..8ab18d3a 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -5,9 +5,9 @@ defaults: - _self_ world: - actor_fraction: 2 + actor_fraction: 3 preprocessor_fraction: 0 - finetune_fraction: 6 + finetune_fraction: 5 # debug: # mode: actor From 4101d777bfcb2664d76d1a876eb1e02f959da44f Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 20 Aug 2025 18:21:16 +0000 Subject: [PATCH 38/73] add massimo miniwob split --- pipelinerl/domains/miniwob/load_tasks.py | 136 +++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/pipelinerl/domains/miniwob/load_tasks.py b/pipelinerl/domains/miniwob/load_tasks.py index 2c58f0e0..4bade257 100644 --- a/pipelinerl/domains/miniwob/load_tasks.py +++ b/pipelinerl/domains/miniwob/load_tasks.py @@ -34,6 +34,132 @@ "miniwob.tic-tac-toe", "miniwob.use-autocomplete-nodelay" ] +MASSIMO_TRAIN_SPLIT = [ + "miniwob.ascending-numbers", + "miniwob.bisect-angle", + "miniwob.book-flight", + "miniwob.choose-date", + "miniwob.choose-date-easy", + "miniwob.choose-date-medium", + "miniwob.choose-date-nodelay", + "miniwob.choose-list", + "miniwob.circle-center", + "miniwob.click-button-sequence", + "miniwob.click-checkboxes-soft", + "miniwob.click-checkboxes-transfer", + "miniwob.click-collapsible-2", + "miniwob.click-collapsible-2-nodelay", + "miniwob.click-collapsible-nodelay", + "miniwob.click-color", + "miniwob.click-dialog", + "miniwob.click-dialog-2", + 
"miniwob.click-link", + "miniwob.click-menu", + "miniwob.click-menu-2", + "miniwob.click-scroll-list", + "miniwob.click-shape", + "miniwob.click-tab", + "miniwob.click-tab-2", + "miniwob.click-tab-2-hard", + "miniwob.click-tab-2-medium", + "miniwob.click-test", + "miniwob.click-test-2", + "miniwob.click-test-transfer", + "miniwob.click-widget", + "miniwob.copy-paste", + "miniwob.copy-paste-2", + "miniwob.count-shape", + "miniwob.count-sides", + "miniwob.daily-calendar", + "miniwob.drag-box", + "miniwob.drag-circle", + "miniwob.drag-cube", + "miniwob.drag-items", + "miniwob.drag-items-grid", + "miniwob.drag-shapes", + "miniwob.drag-shapes-2", + "miniwob.drag-sort-numbers", + "miniwob.draw-circle", + "miniwob.draw-line", + "miniwob.email-inbox", + "miniwob.email-inbox-delete", + "miniwob.email-inbox-forward", + "miniwob.email-inbox-forward-nl", + "miniwob.email-inbox-forward-nl-turk", + "miniwob.email-inbox-important", + "miniwob.email-inbox-noscroll", + "miniwob.email-inbox-reply", + "miniwob.email-inbox-star-reply", + "miniwob.enter-date", + "miniwob.enter-text", + "miniwob.enter-text-dynamic", + "miniwob.enter-time", + "miniwob.find-greatest", + "miniwob.find-word", + "miniwob.focus-text-2", + "miniwob.form-sequence", + "miniwob.form-sequence-2", + "miniwob.generate-number", + "miniwob.grid-coordinate", + "miniwob.guess-number", + "miniwob.highlight-text", + "miniwob.hot-cold", + "miniwob.identify-shape", + "miniwob.login-user", + "miniwob.login-user-popup", + "miniwob.multi-layouts", + "miniwob.multi-orderings", + "miniwob.navigate-tree", + "miniwob.odd-or-even", + "miniwob.order-food", + "miniwob.phone-book", + "miniwob.read-table", + "miniwob.read-table-2", + "miniwob.resize-textarea", + "miniwob.right-angle", + "miniwob.scroll-text", + "miniwob.scroll-text-2", + "miniwob.search-engine", + "miniwob.sign-agreement", + "miniwob.simple-algebra", + "miniwob.social-media", + "miniwob.social-media-all", + "miniwob.social-media-some", + "miniwob.text-editor", + 
"miniwob.text-transform", + "miniwob.tic-tac-toe", + "miniwob.use-autocomplete", + "miniwob.use-autocomplete-nodelay", + "miniwob.use-colorwheel", + "miniwob.use-colorwheel-2", + "miniwob.use-spinner", + "miniwob.visual-addition", +] +MASSIMO_TEST_SPLIT = [ + "miniwob.buy-ticket", + "miniwob.click-button", + "miniwob.click-option", + "miniwob.click-pie-nodelay", + "miniwob.drag-single-shape", + "miniwob.email-inbox-nl-turk", + "miniwob.enter-text-2", + "miniwob.find-midpoint", + "miniwob.focus-text", + "miniwob.simple-arithmetic", + "miniwob.stock-market", + "miniwob.use-slider-2", + "miniwob.click-checkboxes", + "miniwob.click-checkboxes-large", + "miniwob.click-collapsible", + "miniwob.click-pie", + "miniwob.click-shades", + "miniwob.click-tab-2-easy", + "miniwob.enter-password", + "miniwob.form-sequence-3", + "miniwob.highlight-text-2", + "miniwob.unicode-test", + "miniwob.use-slider", +] TRAIN_SPLIT = None TEST_SPLIT = None @@ -76,5 +202,15 @@ def load_tasks(dataset_names: list[str], train_split: float = 0.6, seeds: list[i {"dataset": task, "task": task, "seed": seed} for task in TEST_SPLIT for seed in seeds ]) + elif name == "massimo_train": + tasks.extend([ + {"dataset": task, "task": task, "seed": seed} + for task in MASSIMO_TRAIN_SPLIT for seed in seeds + ]) + elif name == "massimo_test": + tasks.extend([ + {"dataset": task, "task": task, "seed": seed} + for task in MASSIMO_TEST_SPLIT for seed in seeds + ]) return tasks From b00e4760a8ed3b41e6a91dfd798269f9f0fdba85 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 20 Aug 2025 18:40:07 +0000 Subject: [PATCH 39/73] cleanup --- conf/miniwob.yaml | 4 - .../tool_chat_template_llama3.1_json.jinja | 120 ------------------ 2 files changed, 124 deletions(-) delete mode 100644 pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 8ab18d3a..08ade1ed 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -34,10 +34,6 @@ test_llm: 
vllm_config: vllm_kwargs: max_model_len: 16384 # input + output tokens -# enable-auto-tool-choice: "" -# tool-call-parser: llama3_json # use hermes for qwen -# chat_template: pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja # copy pasted from https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_llama3.1_json.jinja -# enforce-eager: "" # speed the actor llm startup a bit actor: rollout_policy: pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout diff --git a/pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja b/pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja deleted file mode 100644 index a3bc9f02..00000000 --- a/pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja +++ /dev/null @@ -1,120 +0,0 @@ -{{- bos_token }} -{%- if custom_tools is defined %} - {%- set tools = custom_tools %} -{%- endif %} -{%- if not tools_in_user_message is defined %} - {#- Llama 3.1 doesn't pass all tests if the tools are in the system prompt #} - {%- set tools_in_user_message = true %} -{%- endif %} -{%- if not date_string is defined %} - {%- if strftime_now is defined %} - {%- set date_string = strftime_now("%d %b %Y") %} - {%- else %} - {%- set date_string = "26 Jul 2024" %} - {%- endif %} -{%- endif %} -{%- if not tools is defined %} - {%- set tools = none %} -{%- endif %} - -{#- This block extracts the system message, so we can slot it into the right place. #} -{%- if messages[0]['role'] == 'system' %} - {%- if messages[0]['content'] is string %} - {%- set system_message = messages[0]['content']|trim %} - {%- else %} - {%- set system_message = messages[0]['content'][0]['text']|trim %} - {%- endif %} - {%- set messages = messages[1:] %} -{%- else %} - {%- if tools is not none %} - {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. 
If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} - {%- else %} - {%- set system_message = "" %} - {%- endif %} -{%- endif %} - -{#- System message #} -{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} -{%- if tools is not none %} - {{- "Environment: ipython\n" }} -{%- endif %} -{{- "Cutting Knowledge Date: December 2023\n" }} -{{- "Today Date: " + date_string + "\n\n" }} -{%- if tools is not none and not tools_in_user_message %} - {{- "You have access to the following functions. To call a function, please respond with JSON for a function call. " }} - {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. ' }} - {{- "Do not use variables.\n\n" }} - {%- for t in tools %} - {{- t | tojson(indent=4) }} - {{- "\n\n" }} - {%- endfor %} -{%- endif %} -{{- system_message }} -{{- "<|eot_id|>" }} - -{#- Custom tools are passed in a user message with some extra guidance #} -{%- if tools_in_user_message and not tools is none %} - {#- Extract the first user message so we can plug it in here #} - {%- if messages | length != 0 %} - {%- if messages[0]['content'] is string %} - {%- set first_user_message = messages[0]['content']|trim %} - {%- else %} - {%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %} - {%- endif %} - {%- set messages = messages[1:] %} - {%- else %} - {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} - {%- endif %} - {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} - {{- "Given the following functions, please respond with a JSON for a function call " }} - {{- "with its proper arguments that best answers the given prompt.\n\n" }} - {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its 
value}. ' }} - {{- "Do not use variables.\n\n" }} - {%- for t in tools %} - {{- t | tojson(indent=4) }} - {{- "\n\n" }} - {%- endfor %} - {{- first_user_message + "<|eot_id|>"}} -{%- endif %} - -{%- for message in messages %} - {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} - {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} - {%- if message['content'] is string %} - {{- message['content'] | trim}} - {%- else %} - {%- for content in message['content'] %} - {%- if content['type'] == 'text' %} - {{- content['text'] | trim }} - {%- endif %} - {%- endfor %} - {%- endif %} - {{- '<|eot_id|>' }} - {%- elif 'tool_calls' in message %} - {%- if not message.tool_calls|length == 1 %} - {{- raise_exception("This model only supports single tool-calls at once!") }} - {%- endif %} - {%- set tool_call = message.tool_calls[0].function %} - {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} - {{- '{"name": "' + tool_call.name + '", ' }} - {{- '"parameters": ' }} - {{- tool_call.arguments | tojson }} - {{- "}" }} - {{- "<|eot_id|>" }} - {%- elif message.role == "tool" or message.role == "ipython" %} - {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} - {%- if message.content is string %} - {{- { "output": message.content } | tojson }} - {%- else %} - {%- for content in message['content'] %} - {%- if content['type'] == 'text' %} - {{- { "output": content['text'] } | tojson }} - {%- endif %} - {%- endfor %} - {%- endif %} - {{- "<|eot_id|>" }} - {%- endif %} -{%- endfor %} -{%- if add_generation_prompt %} - {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} -{%- endif %} \ No newline at end of file From 0b561258c946b470149094fe309029e8c7195198 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 21 Aug 2025 15:34:58 +0000 Subject: [PATCH 40/73] update agent reflection node --- conf/miniwob.yaml | 39 +++++++++------------------------------ 1 file changed, 9 insertions(+), 30 deletions(-) 
diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 08ade1ed..a55dfd65 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -47,7 +47,7 @@ agent_max_loops: 10 # max number of agent - environment interactions for each t agent: _target_: tapeagents.agent.Agent name : web_agent - max_iterations: 4 # max number of iterations (make_prompt + llm? + generate_steps) for each loop + max_iterations: 4 # max number of iterations (make_prompt + llm + generate_steps) for each loop store_llm_calls: true templates: system_prompt: | @@ -56,16 +56,10 @@ agent: Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. You will be provided with the content of the current page and a task from the user. Do not express your emotions or opinions about the user question. - allowed_tools: | - You have access to the following tools: - {tools_description} allowed_steps: | You are allowed to produce ONLY steps with the following json schemas: {allowed_steps} Do not reproduce schema when producing the steps, use it as a reference. - thought_format: | - Important! Respond with the plain text, do not include any JSON or code. - Do not output anything besides what I asked in this message. json_format: | Important! Respond with parsable JSON, do not include any text or code. Do not output anything besides one JSON object. @@ -73,12 +67,6 @@ agent: - _target_: examples.rl_webagent.agent.WebNode name: set_goal system_prompt: ${agent.templates.system_prompt} - # guidance: | - # Produce the thought that describes the intended solution to the task. In the reasoning lines: - # - review the instructions from the user and the content of the page. - # - outline the main task to be accomplished and the steps to be taken to achieve it. - # - produce definiton of done, that will be checked later to verify if the task was completed. - # ${agent.templates.thought_format} guidance: | Produce the reasoning_thought step that describes the intended solution to the task. 
In the reasoning lines: - review the instructions from the user and the content of the page. @@ -86,7 +74,6 @@ agent: - produce definiton of done, that will be checked later to verify if the task was completed. Produce only one reasoning_thought step! ${agent.templates.json_format} - # steps_prompt: ${agent.templates.allowed_tools} steps_prompt: ${agent.templates.allowed_steps} steps: - tapeagents.steps.ReasoningThought @@ -96,29 +83,22 @@ agent: name: reflect system_prompt: ${agent.templates.system_prompt} guidance: | - Review the current state of the page and previous steps to find the best possible next action to accomplish the task. - Produce the reflection_thought to describe the current page state, reflect on your last action, describe what is left to do, and what will be the immediate next action. - Produce only one reflection_thought step! + Produce the reasoning_thought step that describes the current state of the page, the previous actions, and what should be the next best action to accomplish the task. In the reasoning lines: + - think about which information could be relevant to the given task, note relevant BIDs and coordinates. + - describe the last action taken, what were its expected effects on the page, versus the actual effects you can observe. Are they the same or not? if not, what could have gone wrong? + - check if you are stuck with repeating the same action over and over again, if so, try something else and change the action. + - check if you think the task is done, if not give a detailed list of actions to do next to accomplish the task. + - finally, if the task is not done, describe the immediate next action to be performed and its expected effect on the page. + Produce only one reasoning_thought step! 
${agent.templates.json_format} - # ${agent.templates.thought_format} - # steps_prompt: ${agent.templates.allowed_tools} steps_prompt: ${agent.templates.allowed_steps} steps: - - examples.rl_webagent.steps.ReflectionThought + - tapeagents.steps.ReasoningThought trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps - _target_: examples.rl_webagent.agent.WebNode name: act system_prompt: ${agent.templates.system_prompt} - # guidance: | - # Produce the single next tool call to be performed with the current page. - # If you think that the task is solved, call the FinalAnswer. - # You can interact with the page elements using their BIDs or coordinates as arguments for actions. - # HINTS: - # - You can use the BIDs of the elements or the mouse position in x, y coordinates to interact with them. - # - To select value in a dropdown or combobox, ALWAYS use SelectOption tool. - # - To click on a checkbox or radio button, ALWAYS use BID (or coordinates) of the corresponding Text and not the BID (or coordinates) of the element itself. - # - Press enter key to submit the search query. guidance: | Produce the next action to be performed with the current page. If you think that the task is solved, produce the final_answer_action. @@ -131,7 +111,6 @@ agent: - Always produce only one step at a time. - Step kind is always lowercase and underscore separated. 
${agent.templates.json_format} - # steps_prompt: ${agent.templates.allowed_tools} steps_prompt: ${agent.templates.allowed_steps} use_known_actions: true steps: From 9b0a74cebeed9f61bf4ca156366b5e75f54b404e Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 22 Aug 2025 15:58:34 +0000 Subject: [PATCH 41/73] towards massimo setup --- pipelinerl/domains/miniwob/load_tasks.py | 4 ++-- pipelinerl/domains/miniwob/rollouts.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pipelinerl/domains/miniwob/load_tasks.py b/pipelinerl/domains/miniwob/load_tasks.py index 4bade257..a056a311 100644 --- a/pipelinerl/domains/miniwob/load_tasks.py +++ b/pipelinerl/domains/miniwob/load_tasks.py @@ -205,12 +205,12 @@ def load_tasks(dataset_names: list[str], train_split: float = 0.6, seeds: list[i elif name == "massimo_train": tasks.extend([ {"dataset": task, "task": task, "seed": seed} - for task in MASSIMO_TRAIN_SPLIT for seed in seeds + for task in MASSIMO_TRAIN_SPLIT for seed in range(3,10) # seeds 0-2 are used for held out goals in Mass setup ]) elif name == "massimo_test": tasks.extend([ {"dataset": task, "task": task, "seed": seed} - for task in MASSIMO_TEST_SPLIT for seed in seeds + for task in MASSIMO_TEST_SPLIT for seed in range(10) ]) return tasks diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 3d3287be..5b590665 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -129,7 +129,7 @@ async def generate_miniwob_rollout( if obs_steps: last_obs = obs_steps[-1] # in Miniwob, the observation "reward" is defined as RAW_REWARD_GLOBAL > 0 - # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L183 + # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L188 # Let's take directly the RAW_REWARD_GLOBAL from the metadata # raw_reward = 
last_obs.metadata.other.get("reward", 0.0) raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) @@ -142,7 +142,12 @@ async def generate_miniwob_rollout( # get the number of PageObservation steps in the tape n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) - reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 + #reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 + # massimo's setup: + reward = float(raw_reward>0) + if reward == 0.0: + reward = -1.0 + reward *= 0.98 ** n_page_observations # (3) Get LLM calls from Tape llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] @@ -166,7 +171,7 @@ async def generate_miniwob_rollout( latency = time.time() - start_time agent_time = tape.metadata.result.get("agent_execution_time", -1.0) env_time = tape.metadata.result.get("environment_execution_time", -1.0) - n_observations = len([s for s in tape.steps if isinstance(s, Observation)]) + n_observations = len([s for s in tape.steps if isinstance(s, Observation)]) # TODO: is this not the same n_page_observations?? 
n_other_steps = len(tape.steps) - n_observations metrics = MiniwobMetrics( reward=reward, From ef46f392ae856bf30bfe3639a7750e13dac7d9b6 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 28 Aug 2025 13:57:23 +0000 Subject: [PATCH 42/73] upd configs --- conf/miniwob.yaml | 4 ++-- conf/miniwob_massimo.yaml | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 conf/miniwob_massimo.yaml diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index a55dfd65..5c090823 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -18,7 +18,7 @@ model_path: meta-llama/Llama-3.1-8B-Instruct finetune: seq_length: 16384 # input + output tokens - max_train_steps: 1000 + max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples llm: parameters: @@ -125,7 +125,7 @@ start_attempts: 3 # number of attempts to start each task environment: _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer miniwob_url: ??? - n_envs: 32 + n_envs: ${actor.llm_max_rollouts} host: "0.0.0.0" env_call_timeout: 600 # timeout for each environment call (e.g. start_task, act, etc.) 
web_env_target: examples.rl_webagent.environment.WebEnvironment diff --git a/conf/miniwob_massimo.yaml b/conf/miniwob_massimo.yaml new file mode 100644 index 00000000..7f2c3da3 --- /dev/null +++ b/conf/miniwob_massimo.yaml @@ -0,0 +1,14 @@ +defaults: + - miniwob + - _self_ + +train_dataset_names: + - massimo_train +test_dataset_names: + - massimo_test + +finetune: + train_batch_size: 1 + gradient_accumulation_passes: 512 + +eval_every_n_versions: 5120 # 512 effective bs * 10 "optim steps" From 1274748dae4c8c36c7d746405471b7746d3021ea Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 28 Aug 2025 13:57:41 +0000 Subject: [PATCH 43/73] upd --- pipelinerl/domains/miniwob/rollouts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 5b590665..8d34e6ec 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -142,7 +142,7 @@ async def generate_miniwob_rollout( # get the number of PageObservation steps in the tape n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) - #reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 + # reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 # massimo's setup: reward = float(raw_reward>0) if reward == 0.0: From b16d45c6b2496a017aa5e47817e57485b8ed83a2 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 28 Aug 2025 18:39:11 +0000 Subject: [PATCH 44/73] revert reward calculation --- pipelinerl/domains/miniwob/rollouts.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 8d34e6ec..0c5a4396 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -142,12 +142,12 @@ async def generate_miniwob_rollout( # get the number of 
PageObservation steps in the tape n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) - # reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 + reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 # massimo's setup: - reward = float(raw_reward>0) - if reward == 0.0: - reward = -1.0 - reward *= 0.98 ** n_page_observations + # reward = float(raw_reward>0) + # if reward == 0.0: + # reward = -1.0 + # reward *= 0.98 ** n_page_observations # (3) Get LLM calls from Tape llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] From 9e61c35f51963de30a963fe3cc5bb46ac2ecccd9 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 28 Aug 2025 19:47:54 +0000 Subject: [PATCH 45/73] update massimo cfg to grpo --- conf/miniwob_massimo.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/conf/miniwob_massimo.yaml b/conf/miniwob_massimo.yaml index 7f2c3da3..99ba0c56 100644 --- a/conf/miniwob_massimo.yaml +++ b/conf/miniwob_massimo.yaml @@ -1,5 +1,6 @@ defaults: - miniwob + - override finetune: grpo - _self_ train_dataset_names: @@ -8,6 +9,8 @@ test_dataset_names: - massimo_test finetune: + seq_length: 16384 # input + output tokens + max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples train_batch_size: 1 gradient_accumulation_passes: 512 From ef884f22cc4f3b655a067df405562a925fc35ed5 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 28 Aug 2025 20:49:15 +0000 Subject: [PATCH 46/73] test with ppo --- conf/miniwob_massimo.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/miniwob_massimo.yaml b/conf/miniwob_massimo.yaml index 99ba0c56..003238be 100644 --- a/conf/miniwob_massimo.yaml +++ b/conf/miniwob_massimo.yaml @@ -1,6 +1,6 @@ defaults: - miniwob - - override finetune: grpo + - override finetune: ppo - _self_ train_dataset_names: From 537ec7a08b5e5f210fda381c99757438cc921ff7 Mon Sep 17 00:00:00 
2001 From: Nicolas Gontier Date: Tue, 2 Sep 2025 14:57:07 +0000 Subject: [PATCH 47/73] update configs --- conf/miniwob_grpo.yaml | 149 ++++++++++++++++++ conf/miniwob_massimo_grpo.yaml | 18 +++ ..._massimo.yaml => miniwob_massimo_ppo.yaml} | 5 +- conf/{miniwob.yaml => miniwob_ppo.yaml} | 5 + pipelinerl/domains/miniwob/rollouts.py | 15 +- 5 files changed, 184 insertions(+), 8 deletions(-) create mode 100644 conf/miniwob_grpo.yaml create mode 100644 conf/miniwob_massimo_grpo.yaml rename conf/{miniwob_massimo.yaml => miniwob_massimo_ppo.yaml} (88%) rename conf/{miniwob.yaml => miniwob_ppo.yaml} (97%) diff --git a/conf/miniwob_grpo.yaml b/conf/miniwob_grpo.yaml new file mode 100644 index 00000000..864dbc75 --- /dev/null +++ b/conf/miniwob_grpo.yaml @@ -0,0 +1,149 @@ +defaults: + - base + - override streams: redis + - override finetune: grpo + - _self_ + +world: + actor_fraction: 3 + preprocessor_fraction: 0 + finetune_fraction: 5 + +# debug: +# mode: actor +save_tapes: False + +output_dir: results/miniwob/${now:%Y-%m-%d}/${now:%H-%M-%S} +model_path: meta-llama/Llama-3.1-8B-Instruct + +finetune: + seq_length: 16384 # input + output tokens + max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples + train_batch_size: 1 + gradient_accumulation_passes: 1024 + +eval_every_n_versions: 10240 # 1024 effective bs * 10 "optim steps" + +llm: + parameters: + max_tokens: 4096 # output tokens + temperature: 1.0 +test_llm: + parameters: + max_tokens: ${...llm.parameters.max_tokens} + temperature: 0.0 + top_p: 1.0 + top_k: 50 + +vllm_config: + vllm_kwargs: + max_model_len: 16384 # input + output tokens + +actor: + rollout_policy: pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout + shared_memory_entry_size: 100000000 + +preprocess: + shared_memory_entry_size: 1000000000 + +# AGENT CONFIGURATION +agent_max_loops: 10 # max number of agent - environment interactions for each task +reward_computation: nico +agent: + _target_: tapeagents.agent.Agent + name : web_agent + 
max_iterations: 4 # max number of iterations (make_prompt + llm + generate_steps) for each loop + store_llm_calls: true + templates: + system_prompt: | + You are an expert AI Agent, your goal is to help the user perform tasks using a web browser. + Your role is to understand user queries and respond in a helpful and accurate manner. + Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. + You will be provided with the content of the current page and a task from the user. + Do not express your emotions or opinions about the user question. + allowed_steps: | + You are allowed to produce ONLY steps with the following json schemas: + {allowed_steps} + Do not reproduce schema when producing the steps, use it as a reference. + json_format: | + Important! Respond with parsable JSON, do not include any text or code. + Do not output anything besides one JSON object. + nodes: + - _target_: examples.rl_webagent.agent.WebNode + name: set_goal + system_prompt: ${agent.templates.system_prompt} + guidance: | + Produce the reasoning_thought step that describes the intended solution to the task. In the reasoning lines: + - review the instructions from the user and the content of the page. + - outline the main task to be accomplished and the steps to be taken to achieve it. + - produce definiton of done, that will be checked later to verify if the task was completed. + Produce only one reasoning_thought step! 
+ ${agent.templates.json_format} + steps_prompt: ${agent.templates.allowed_steps} + steps: + - tapeagents.steps.ReasoningThought + trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages + max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps + - _target_: examples.rl_webagent.agent.WebNode + name: reflect + system_prompt: ${agent.templates.system_prompt} + guidance: | + Produce the reasoning_thought step that describes the current state of the page, the previous actions, and what should be the next best action to accomplish the task. In the reasoning lines: + - think about which information could be relevant to the given task, note relevant BIDs and coordinates. + - describe the last action taken, what were its expected effects on the page, versus the actual effects you can observe. Are they the same or not? if not, what could have gone wrong? + - check if you are stuck with repeating the same action over and over again, if so, try something else and change the action. + - check if you think the task is done, if not give a detailed list of actions to do next to accomplish the task. + - finally, if the task is not done, describe the immediate next action to be performed and its expected effect on the page. + Produce only one reasoning_thought step! + ${agent.templates.json_format} + steps_prompt: ${agent.templates.allowed_steps} + steps: + - tapeagents.steps.ReasoningThought + trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages + max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps + - _target_: examples.rl_webagent.agent.WebNode + name: act + system_prompt: ${agent.templates.system_prompt} + guidance: | + Produce the next action to be performed with the current page. + If you think that the task is solved, produce the final_answer_action. + You can interact with the page elements using their BIDs or coordinates as arguments for actions. 
+ HINTS: + - You can use the BIDs of the elements or the mouse position in x, y coordinates to interact with them. + - To select value in a dropdown or combobox, ALWAYS use select_action. + - To click on a checkbox or radio button, ALWAYS use BID (or coordinates) of the corresponding Text and not the BID (or coordinates) of the element itself. + - Press enter key to submit the search query. + - Always produce only one step at a time. + - Step kind is always lowercase and underscore separated. + ${agent.templates.json_format} + steps_prompt: ${agent.templates.allowed_steps} + use_known_actions: true + steps: + - examples.rl_webagent.steps.FinalAnswerAction + trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages + max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps + next_node: reflect + + +# ENVIRONMENT CONFIGURATION +start_attempts: 3 # number of attempts to start each task +environment: + _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer + miniwob_url: ??? + n_envs: ${actor.llm_max_rollouts} + host: "0.0.0.0" + env_call_timeout: 600 # timeout for each environment call (e.g. start_task, act, etc.) 
+ web_env_target: examples.rl_webagent.environment.WebEnvironment + exp_path: ${output_dir}/env_server + headless: true + observation_format: html + +# DATASET CONFIGURATION +dataset_loader: pipelinerl.domains.miniwob.load_tasks.load_tasks +dataset_loader_params: + train_split: 0.6 # 0.6 of tasks for training, 0.4 for testing + seeds: [0, 42, 1337, 900, 103] +train_dataset_names: + - train +test_dataset_names: + - test diff --git a/conf/miniwob_massimo_grpo.yaml b/conf/miniwob_massimo_grpo.yaml new file mode 100644 index 00000000..761ee43b --- /dev/null +++ b/conf/miniwob_massimo_grpo.yaml @@ -0,0 +1,18 @@ +defaults: + - miniwob_grpo + - _self_ + +train_dataset_names: + - massimo_train +test_dataset_names: + - massimo_test + +reward_computation: massimo + +finetune: + seq_length: 16384 # input + output tokens + max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples + train_batch_size: 1 + gradient_accumulation_passes: 512 + +eval_every_n_versions: 5120 # 512 effective bs * 10 "optim steps" diff --git a/conf/miniwob_massimo.yaml b/conf/miniwob_massimo_ppo.yaml similarity index 88% rename from conf/miniwob_massimo.yaml rename to conf/miniwob_massimo_ppo.yaml index 003238be..8b1fefb8 100644 --- a/conf/miniwob_massimo.yaml +++ b/conf/miniwob_massimo_ppo.yaml @@ -1,6 +1,5 @@ defaults: - - miniwob - - override finetune: ppo + - miniwob_ppo - _self_ train_dataset_names: @@ -8,6 +7,8 @@ train_dataset_names: test_dataset_names: - massimo_test +reward_computation: massimo + finetune: seq_length: 16384 # input + output tokens max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples diff --git a/conf/miniwob.yaml b/conf/miniwob_ppo.yaml similarity index 97% rename from conf/miniwob.yaml rename to conf/miniwob_ppo.yaml index 5c090823..656e7839 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob_ppo.yaml @@ -19,6 +19,10 @@ model_path: meta-llama/Llama-3.1-8B-Instruct finetune: seq_length: 16384 # input + output tokens max_train_steps: 1000 # 1000 optim steps = 1000 * bs 
samples + train_batch_size: 1 + gradient_accumulation_passes: 1024 + +eval_every_n_versions: 10240 # 1024 effective bs * 10 "optim steps" llm: parameters: @@ -44,6 +48,7 @@ preprocess: # AGENT CONFIGURATION agent_max_loops: 10 # max number of agent - environment interactions for each task +reward_computation: nico agent: _target_: tapeagents.agent.Agent name : web_agent diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 0c5a4396..8168bcd5 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -142,12 +142,15 @@ async def generate_miniwob_rollout( # get the number of PageObservation steps in the tape n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) - reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 - # massimo's setup: - # reward = float(raw_reward>0) - # if reward == 0.0: - # reward = -1.0 - # reward *= 0.98 ** n_page_observations + if cfg.reward_computation == "nico": + reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 + elif cfg.reward_computation == "massimo": + reward = float(raw_reward>0) + if reward == 0.0: + reward = -1.0 + reward *= 0.98 ** n_page_observations + else: + raise ValueError(f"Invalid reward configuration: {cfg.reward_computation}") # (3) Get LLM calls from Tape llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] From 7a4e73fb4389dfa87c691a3168d8de21802d0eff Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 2 Sep 2025 20:55:23 +0000 Subject: [PATCH 48/73] add retry mechanism for agent loop --- conf/miniwob_grpo.yaml | 1 + conf/miniwob_ppo.yaml | 1 + pipelinerl/domains/miniwob/rollouts.py | 48 +++++++++++++++++++------- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/conf/miniwob_grpo.yaml b/conf/miniwob_grpo.yaml index 864dbc75..5e82caae 100644 --- a/conf/miniwob_grpo.yaml +++ 
b/conf/miniwob_grpo.yaml @@ -48,6 +48,7 @@ preprocess: # AGENT CONFIGURATION agent_max_loops: 10 # max number of agent - environment interactions for each task +agent_attempts: 3 # number of attempts to run the agent (retry on timeout/errors) reward_computation: nico agent: _target_: tapeagents.agent.Agent diff --git a/conf/miniwob_ppo.yaml b/conf/miniwob_ppo.yaml index 656e7839..05b7ff0d 100644 --- a/conf/miniwob_ppo.yaml +++ b/conf/miniwob_ppo.yaml @@ -48,6 +48,7 @@ preprocess: # AGENT CONFIGURATION agent_max_loops: 10 # max number of agent - environment interactions for each task +agent_attempts: 3 # number of attempts to run the agent (retry on timeout/errors) reward_computation: nico agent: _target_: tapeagents.agent.Agent diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 8168bcd5..a356911f 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -84,7 +84,7 @@ async def generate_miniwob_rollout( async with environment.acontext(session, wait_for_env=True) as env: start_attempts = cfg.start_attempts t = time.perf_counter() - while True: + while start_attempts > 0: try: tape_dict, _ = await env.start_task(problem) break @@ -92,11 +92,12 @@ async def generate_miniwob_rollout( logger.warning(f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']}") start_attempts -= 1 if start_attempts <= 0: + logger.error("Failed to start task after all retry attempts") no_error = False tape_dict = {} break else: - logger.warning(f"retry after 5 seconds: {e}") + logger.warning(f"retry after 5 seconds: {e}, {start_attempts} attempts remaining") await asyncio.sleep(5) logger.info( f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds" @@ -105,16 +106,39 @@ async def generate_miniwob_rollout( t = time.perf_counter() if no_error: # only run the agent if the task started successfully logger.info(f"Running agent 
for task {problem['dataset']}/{problem['task']}/{problem['seed']}") - try: - actions = await env.a_actions() - tools_description = await env.a_tools_description() - logger.debug(f"Available tools: {tools_description}") - agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) - agent.llms = {DEFAULT: llm} - tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) - except Exception as e: - logger.error(f"Error occurred while running agent: {e}") - no_error = False + agent_attempts = cfg.agent_attempts + while agent_attempts > 0: + try: + actions = await env.a_actions() + tools_description = await env.a_tools_description() + agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) + agent.llms = {DEFAULT: llm} + tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) + # Check if the tape has an error from the orchestrator (e.g., SocketTimeoutError) + if tape.metadata.error: + logger.warning(f"Agent execution failed with error: {tape.metadata.error}") + agent_attempts -= 1 + if agent_attempts <= 0: + logger.error("Agent execution failed after all retry attempts") + no_error = False + break + else: + logger.warning(f"Retrying agent execution after 5 seconds, {agent_attempts} attempts remaining") + await asyncio.sleep(5) + continue + else: + # Success - break out of retry loop + break + except Exception as e: + logger.warning(f"Error occurred while running agent: {e}") + agent_attempts -= 1 + if agent_attempts <= 0: + logger.error("Agent execution failed after all retry attempts") + no_error = False + break + else: + logger.warning(f"Retrying agent execution after 5 seconds, {agent_attempts} attempts remaining") + await asyncio.sleep(5) logger.info( f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds" ) From 42e811e66d641f3e2219a2721c8ae6c576f0624f Mon 
Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 3 Sep 2025 20:45:20 +0000 Subject: [PATCH 49/73] add 30min timeout to rollout function --- conf/miniwob_grpo.yaml | 3 +- conf/miniwob_ppo.yaml | 3 +- pipelinerl/domains/miniwob/rollouts.py | 54 ++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/conf/miniwob_grpo.yaml b/conf/miniwob_grpo.yaml index 5e82caae..eb733148 100644 --- a/conf/miniwob_grpo.yaml +++ b/conf/miniwob_grpo.yaml @@ -49,6 +49,7 @@ preprocess: # AGENT CONFIGURATION agent_max_loops: 10 # max number of agent - environment interactions for each task agent_attempts: 3 # number of attempts to run the agent (retry on timeout/errors) +rollout_timeout: 1800 # overall timeout for entire rollout in seconds (30 minutes) reward_computation: nico agent: _target_: tapeagents.agent.Agent @@ -133,7 +134,7 @@ environment: miniwob_url: ??? n_envs: ${actor.llm_max_rollouts} host: "0.0.0.0" - env_call_timeout: 600 # timeout for each environment call (e.g. start_task, act, etc.) + env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) web_env_target: examples.rl_webagent.environment.WebEnvironment exp_path: ${output_dir}/env_server headless: true diff --git a/conf/miniwob_ppo.yaml b/conf/miniwob_ppo.yaml index 05b7ff0d..9a85a8cd 100644 --- a/conf/miniwob_ppo.yaml +++ b/conf/miniwob_ppo.yaml @@ -49,6 +49,7 @@ preprocess: # AGENT CONFIGURATION agent_max_loops: 10 # max number of agent - environment interactions for each task agent_attempts: 3 # number of attempts to run the agent (retry on timeout/errors) +rollout_timeout: 1800 # overall timeout for entire rollout in seconds (30 minutes) reward_computation: nico agent: _target_: tapeagents.agent.Agent @@ -133,7 +134,7 @@ environment: miniwob_url: ??? n_envs: ${actor.llm_max_rollouts} host: "0.0.0.0" - env_call_timeout: 600 # timeout for each environment call (e.g. start_task, act, etc.) 
+ env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) web_env_target: examples.rl_webagent.environment.WebEnvironment exp_path: ${output_dir}/env_server headless: true diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index a356911f..2df03815 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -70,7 +70,29 @@ async def generate_miniwob_rollout( # get training text from llm calls start_time = time.time() + + # Overall timeout for the entire rollout to prevent hanging + rollout_timeout = getattr(cfg, 'rollout_timeout', 1800) # 30 minutes default + try: + # Execute the entire rollout with a timeout + return await asyncio.wait_for( + _execute_rollout_with_timeout(cfg, llm, problem, session, start_time), + timeout=rollout_timeout + ) + except asyncio.TimeoutError: + logger.error(f"Rollout timed out after {rollout_timeout} seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']}") + # Return a failed rollout result + return _create_failed_rollout_result(problem, start_time, "timeout") + + +async def _execute_rollout_with_timeout( + cfg: DictConfig, + llm: TrainableLLM, + problem: dict, + session: aiohttp.ClientSession, + start_time: float, +) -> RolloutResult: # (1) Choose a random environment server env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] # choose the env job randomly @@ -225,3 +247,35 @@ async def generate_miniwob_rollout( prompt_tokens=prompt_tokens, output_tokens=output_tokens, ) + + +def _create_failed_rollout_result(problem: dict, start_time: float, error_type: str) -> RolloutResult: + """Create a failed rollout result for timeout or other errors.""" + latency = time.time() - start_time + + # Create empty training texts and metrics for failed rollout + metrics = MiniwobMetrics( + reward=-1.0, + success=False, + no_error=False, + no_answer=True, + overflow=False, + n_llm_calls=0, + 
n_step_errors=0, + n_page_observations=0, + n_steps=0, + total_execution_time=latency, + agent_execution_time=-1.0, + environment_execution_time=-1.0, + env_step_time=-1.0, + agent_step_time=-1.0, + ) + + return RolloutResult( + training_texts=[], + metrics=metrics, + latency=latency, + dataset_name=problem["dataset"], + prompt_tokens=[], + output_tokens=[], + ) From a4e8f5fd581e0bcfc62bda9f61dffbef21cb7df3 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 5 Sep 2025 02:29:14 +0000 Subject: [PATCH 50/73] upd configs --- conf/{miniwob_ppo.yaml => miniwob.yaml} | 7 +- conf/miniwob_grpo.yaml | 150 +----------------------- 2 files changed, 5 insertions(+), 152 deletions(-) rename conf/{miniwob_ppo.yaml => miniwob.yaml} (97%) diff --git a/conf/miniwob_ppo.yaml b/conf/miniwob.yaml similarity index 97% rename from conf/miniwob_ppo.yaml rename to conf/miniwob.yaml index 9a85a8cd..341512ca 100644 --- a/conf/miniwob_ppo.yaml +++ b/conf/miniwob.yaml @@ -42,6 +42,7 @@ vllm_config: actor: rollout_policy: pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout shared_memory_entry_size: 100000000 + llm_max_rollouts: 32 preprocess: shared_memory_entry_size: 1000000000 @@ -68,7 +69,7 @@ agent: {allowed_steps} Do not reproduce schema when producing the steps, use it as a reference. json_format: | - Important! Respond with parsable JSON, do not include any text or code. + Important! Respond with parsable JSON, do not include any special characters or code. Do not output anything besides one JSON object. nodes: - _target_: examples.rl_webagent.agent.WebNode @@ -96,7 +97,7 @@ agent: - check if you are stuck with repeating the same action over and over again, if so, try something else and change the action. - check if you think the task is done, if not give a detailed list of actions to do next to accomplish the task. - finally, if the task is not done, describe the immediate next action to be performed and its expected effect on the page. 
- Produce only one reasoning_thought step! + Produce only one reasoning_thought step! Be brief and to the point. You can skip some details if they are not relevant for this step. ${agent.templates.json_format} steps_prompt: ${agent.templates.allowed_steps} steps: @@ -132,7 +133,7 @@ start_attempts: 3 # number of attempts to start each task environment: _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer miniwob_url: ??? - n_envs: ${actor.llm_max_rollouts} + n_envs: 64 host: "0.0.0.0" env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) web_env_target: examples.rl_webagent.environment.WebEnvironment diff --git a/conf/miniwob_grpo.yaml b/conf/miniwob_grpo.yaml index eb733148..7837c14b 100644 --- a/conf/miniwob_grpo.yaml +++ b/conf/miniwob_grpo.yaml @@ -1,151 +1,3 @@ defaults: - - base - - override streams: redis + - miniwob - override finetune: grpo - - _self_ - -world: - actor_fraction: 3 - preprocessor_fraction: 0 - finetune_fraction: 5 - -# debug: -# mode: actor -save_tapes: False - -output_dir: results/miniwob/${now:%Y-%m-%d}/${now:%H-%M-%S} -model_path: meta-llama/Llama-3.1-8B-Instruct - -finetune: - seq_length: 16384 # input + output tokens - max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples - train_batch_size: 1 - gradient_accumulation_passes: 1024 - -eval_every_n_versions: 10240 # 1024 effective bs * 10 "optim steps" - -llm: - parameters: - max_tokens: 4096 # output tokens - temperature: 1.0 -test_llm: - parameters: - max_tokens: ${...llm.parameters.max_tokens} - temperature: 0.0 - top_p: 1.0 - top_k: 50 - -vllm_config: - vllm_kwargs: - max_model_len: 16384 # input + output tokens - -actor: - rollout_policy: pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout - shared_memory_entry_size: 100000000 - -preprocess: - shared_memory_entry_size: 1000000000 - -# AGENT CONFIGURATION -agent_max_loops: 10 # max number of agent - environment interactions for each task -agent_attempts: 3 # 
number of attempts to run the agent (retry on timeout/errors) -rollout_timeout: 1800 # overall timeout for entire rollout in seconds (30 minutes) -reward_computation: nico -agent: - _target_: tapeagents.agent.Agent - name : web_agent - max_iterations: 4 # max number of iterations (make_prompt + llm + generate_steps) for each loop - store_llm_calls: true - templates: - system_prompt: | - You are an expert AI Agent, your goal is to help the user perform tasks using a web browser. - Your role is to understand user queries and respond in a helpful and accurate manner. - Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. - You will be provided with the content of the current page and a task from the user. - Do not express your emotions or opinions about the user question. - allowed_steps: | - You are allowed to produce ONLY steps with the following json schemas: - {allowed_steps} - Do not reproduce schema when producing the steps, use it as a reference. - json_format: | - Important! Respond with parsable JSON, do not include any text or code. - Do not output anything besides one JSON object. - nodes: - - _target_: examples.rl_webagent.agent.WebNode - name: set_goal - system_prompt: ${agent.templates.system_prompt} - guidance: | - Produce the reasoning_thought step that describes the intended solution to the task. In the reasoning lines: - - review the instructions from the user and the content of the page. - - outline the main task to be accomplished and the steps to be taken to achieve it. - - produce definiton of done, that will be checked later to verify if the task was completed. - Produce only one reasoning_thought step! 
- ${agent.templates.json_format} - steps_prompt: ${agent.templates.allowed_steps} - steps: - - tapeagents.steps.ReasoningThought - trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages - max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps - - _target_: examples.rl_webagent.agent.WebNode - name: reflect - system_prompt: ${agent.templates.system_prompt} - guidance: | - Produce the reasoning_thought step that describes the current state of the page, the previous actions, and what should be the next best action to accomplish the task. In the reasoning lines: - - think about which information could be relevant to the given task, note relevant BIDs and coordinates. - - describe the last action taken, what were its expected effects on the page, versus the actual effects you can observe. Are they the same or not? if not, what could have gone wrong? - - check if you are stuck with repeating the same action over and over again, if so, try something else and change the action. - - check if you think the task is done, if not give a detailed list of actions to do next to accomplish the task. - - finally, if the task is not done, describe the immediate next action to be performed and its expected effect on the page. - Produce only one reasoning_thought step! - ${agent.templates.json_format} - steps_prompt: ${agent.templates.allowed_steps} - steps: - - tapeagents.steps.ReasoningThought - trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages - max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps - - _target_: examples.rl_webagent.agent.WebNode - name: act - system_prompt: ${agent.templates.system_prompt} - guidance: | - Produce the next action to be performed with the current page. - If you think that the task is solved, produce the final_answer_action. - You can interact with the page elements using their BIDs or coordinates as arguments for actions. 
- HINTS: - - You can use the BIDs of the elements or the mouse position in x, y coordinates to interact with them. - - To select value in a dropdown or combobox, ALWAYS use select_action. - - To click on a checkbox or radio button, ALWAYS use BID (or coordinates) of the corresponding Text and not the BID (or coordinates) of the element itself. - - Press enter key to submit the search query. - - Always produce only one step at a time. - - Step kind is always lowercase and underscore separated. - ${agent.templates.json_format} - steps_prompt: ${agent.templates.allowed_steps} - use_known_actions: true - steps: - - examples.rl_webagent.steps.FinalAnswerAction - trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages - max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps - next_node: reflect - - -# ENVIRONMENT CONFIGURATION -start_attempts: 3 # number of attempts to start each task -environment: - _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer - miniwob_url: ??? - n_envs: ${actor.llm_max_rollouts} - host: "0.0.0.0" - env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) 
- web_env_target: examples.rl_webagent.environment.WebEnvironment - exp_path: ${output_dir}/env_server - headless: true - observation_format: html - -# DATASET CONFIGURATION -dataset_loader: pipelinerl.domains.miniwob.load_tasks.load_tasks -dataset_loader_params: - train_split: 0.6 # 0.6 of tasks for training, 0.4 for testing - seeds: [0, 42, 1337, 900, 103] -train_dataset_names: - - train -test_dataset_names: - - test From 95b735b7e1ee686cee046ba596b0dbe536083168 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 5 Sep 2025 03:21:24 +0000 Subject: [PATCH 51/73] upd --- conf/miniwob.yaml | 5 +++-- conf/miniwob_massimo_ppo.yaml | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 341512ca..c9499b48 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -69,8 +69,9 @@ agent: {allowed_steps} Do not reproduce schema when producing the steps, use it as a reference. json_format: | - Important! Respond with parsable JSON, do not include any special characters or code. - Do not output anything besides one JSON object. + Important! Respond with very simple parsable JSON! + Do not use any special characters or code. Do not use new lines, tabs, or any other formatting inside the JSON. + Do not output anything besides one simple JSON object. 
nodes: - _target_: examples.rl_webagent.agent.WebNode name: set_goal diff --git a/conf/miniwob_massimo_ppo.yaml b/conf/miniwob_massimo_ppo.yaml index 8b1fefb8..b2e3b8ca 100644 --- a/conf/miniwob_massimo_ppo.yaml +++ b/conf/miniwob_massimo_ppo.yaml @@ -1,5 +1,5 @@ defaults: - - miniwob_ppo + - miniwob - _self_ train_dataset_names: From 8616303e9bebe222ce56b82bd4b75d03e99e55ef Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 5 Sep 2025 03:25:27 +0000 Subject: [PATCH 52/73] upd configs --- conf/miniwob_grpo.yaml | 7 +++++++ conf/miniwob_massimo_grpo.yaml | 3 --- conf/miniwob_massimo_ppo.yaml | 3 --- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/conf/miniwob_grpo.yaml b/conf/miniwob_grpo.yaml index 7837c14b..f6cfeed3 100644 --- a/conf/miniwob_grpo.yaml +++ b/conf/miniwob_grpo.yaml @@ -1,3 +1,10 @@ defaults: - miniwob - override finetune: grpo + - _self_ + +finetune: + seq_length: 16384 # input + output tokens + max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples + train_batch_size: 1 + gradient_accumulation_passes: 1024 diff --git a/conf/miniwob_massimo_grpo.yaml b/conf/miniwob_massimo_grpo.yaml index 761ee43b..b61dcf32 100644 --- a/conf/miniwob_massimo_grpo.yaml +++ b/conf/miniwob_massimo_grpo.yaml @@ -10,9 +10,6 @@ test_dataset_names: reward_computation: massimo finetune: - seq_length: 16384 # input + output tokens - max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples - train_batch_size: 1 gradient_accumulation_passes: 512 eval_every_n_versions: 5120 # 512 effective bs * 10 "optim steps" diff --git a/conf/miniwob_massimo_ppo.yaml b/conf/miniwob_massimo_ppo.yaml index b2e3b8ca..53703d56 100644 --- a/conf/miniwob_massimo_ppo.yaml +++ b/conf/miniwob_massimo_ppo.yaml @@ -10,9 +10,6 @@ test_dataset_names: reward_computation: massimo finetune: - seq_length: 16384 # input + output tokens - max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples - train_batch_size: 1 gradient_accumulation_passes: 512 eval_every_n_versions: 5120 
# 512 effective bs * 10 "optim steps" From 923cf6a8c1372a7929cb4ef0ebfb9959da5f717d Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Sat, 6 Sep 2025 02:37:45 +0000 Subject: [PATCH 53/73] reduce n_env --- conf/miniwob.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index c9499b48..cecf7e3e 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -134,7 +134,7 @@ start_attempts: 3 # number of attempts to start each task environment: _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer miniwob_url: ??? - n_envs: 64 + n_envs: 32 host: "0.0.0.0" env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) web_env_target: examples.rl_webagent.environment.WebEnvironment From 44a033f306b37b89a5413e2224326e3891bc7ffd Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Sat, 6 Sep 2025 03:46:23 +0000 Subject: [PATCH 54/73] boost preprocess power --- conf/miniwob.yaml | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index cecf7e3e..af0397fe 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -5,9 +5,9 @@ defaults: - _self_ world: - actor_fraction: 3 + actor_fraction: 2 preprocessor_fraction: 0 - finetune_fraction: 5 + finetune_fraction: 6 # debug: # mode: actor @@ -45,7 +45,21 @@ actor: llm_max_rollouts: 32 preprocess: - shared_memory_entry_size: 1000000000 + n_workers: 16 # Increase from 8 + chunk_n_groups: 4 # Increase from 2 for better throughput + # queue for loaded raw groups + raw_queue_size: 16 # Increase from 8 + # queue for processed chunks of multiple groups + input_queue_size: 64 # Increase from 32 + # queue for ready chunks for multiple groups + output_queue_size: 64 # Increase from 32 + # queue for accumulating samples before further processing + dataset_buffer_size: 512 # Enable buffering (was 0) + # ring buffer to replace old samples with new ones when training is 
slow + ring_buffer_size: 1024 # Increase from 128 + # "virtual" sample queue per lead trainer + max_ready_samples_per_lead: 256 # Increase from 64 + shared_memory_entry_size: 1000000000 # Increase from 100M # AGENT CONFIGURATION agent_max_loops: 10 # max number of agent - environment interactions for each task From 2918d1fe15b72a6930c83c4bfeeffab47f846e7f Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Sat, 6 Sep 2025 04:00:18 +0000 Subject: [PATCH 55/73] pop old data --- conf/miniwob.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index af0397fe..0f13dc64 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -53,8 +53,6 @@ preprocess: input_queue_size: 64 # Increase from 32 # queue for ready chunks for multiple groups output_queue_size: 64 # Increase from 32 - # queue for accumulating samples before further processing - dataset_buffer_size: 512 # Enable buffering (was 0) # ring buffer to replace old samples with new ones when training is slow ring_buffer_size: 1024 # Increase from 128 # "virtual" sample queue per lead trainer From dacaa1f6c366a2996d2db3d88b375de801dca2b3 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Sun, 7 Sep 2025 04:27:41 +0000 Subject: [PATCH 56/73] do not save playwright traces & screenshots --- conf/miniwob.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 0f13dc64..d10fbbb2 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -150,7 +150,7 @@ environment: host: "0.0.0.0" env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) 
web_env_target: examples.rl_webagent.environment.WebEnvironment - exp_path: ${output_dir}/env_server + exp_path: null headless: true observation_format: html From fcee5ee6bb27f4787d9700c109f718261d19a246 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Sun, 7 Sep 2025 05:01:47 +0000 Subject: [PATCH 57/73] return empty aggregate stats if empty stats --- pipelinerl/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipelinerl/utils.py b/pipelinerl/utils.py index 2b0a252c..a6467271 100644 --- a/pipelinerl/utils.py +++ b/pipelinerl/utils.py @@ -239,6 +239,9 @@ def calculate_stats(stats: List | Dict[Any, Any]) -> Dict[str, float]: if not isinstance(stats, list): raise TypeError(f"Expected stats to be a list, got {type(stats)}") + if len(stats) == 0: + return {} + aggregated_stats = { "max": float(max(stats)), "min": float(min(stats)), From 631389f312a737fcf8852b9eea82ce0aa329ad43 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Sun, 7 Sep 2025 05:05:12 +0000 Subject: [PATCH 58/73] increase preprocessor power --- conf/miniwob.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index d10fbbb2..65e23d9c 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -45,10 +45,10 @@ actor: llm_max_rollouts: 32 preprocess: - n_workers: 16 # Increase from 8 - chunk_n_groups: 4 # Increase from 2 for better throughput + n_workers: 32 # Increase from 8 + chunk_n_groups: 8 # Increase from 2 for better throughput # queue for loaded raw groups - raw_queue_size: 16 # Increase from 8 + raw_queue_size: 32 # Increase from 8 # queue for processed chunks of multiple groups input_queue_size: 64 # Increase from 32 # queue for ready chunks for multiple groups From f7912114dd701a0d0a6484161a0bb5feba9ecced Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 8 Sep 2025 18:55:36 +0000 Subject: [PATCH 59/73] better error handling --- pipelinerl/domains/miniwob/rollouts.py | 31 ++++++++++++-------------- 1 file 
changed, 14 insertions(+), 17 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 2df03815..72c52678 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -108,18 +108,24 @@ async def _execute_rollout_with_timeout( t = time.perf_counter() while start_attempts > 0: try: - tape_dict, _ = await env.start_task(problem) + start_result = await env.start_task(problem) + if isinstance(start_result, dict) and "error" in start_result: + raise ValueError(start_result['error']) + elif isinstance(start_result, tuple): + tape_dict, _ = start_result + else: + raise ValueError(f"Invalid start result: {start_result}") break except Exception as e: - logger.warning(f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']}") start_attempts -= 1 + logger.warning(f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']}. {start_attempts} attempts remaining. Error: {e}") if start_attempts <= 0: - logger.error("Failed to start task after all retry attempts") + logger.error(f"Failed to start task after all retry attempts: {e}") no_error = False tape_dict = {} break else: - logger.warning(f"retry after 5 seconds: {e}, {start_attempts} attempts remaining") + logger.warning("Retry start task after 5 seconds.") await asyncio.sleep(5) logger.info( f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds" @@ -138,28 +144,19 @@ async def _execute_rollout_with_timeout( tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) # Check if the tape has an error from the orchestrator (e.g., SocketTimeoutError) if tape.metadata.error: - logger.warning(f"Agent execution failed with error: {tape.metadata.error}") - agent_attempts -= 1 - if agent_attempts <= 0: - logger.error("Agent execution failed after all retry attempts") - no_error = False - break - else: - 
logger.warning(f"Retrying agent execution after 5 seconds, {agent_attempts} attempts remaining") - await asyncio.sleep(5) - continue + raise ValueError(tape.metadata.error) else: # Success - break out of retry loop break except Exception as e: - logger.warning(f"Error occurred while running agent: {e}") agent_attempts -= 1 + logger.warning(f"Error occurred while running agent. {agent_attempts} attempts remaining. Error: {e}") if agent_attempts <= 0: - logger.error("Agent execution failed after all retry attempts") + logger.error(f"Agent execution failed after all retry attempts: {e}") no_error = False break else: - logger.warning(f"Retrying agent execution after 5 seconds, {agent_attempts} attempts remaining") + logger.warning("Retry agent execution after 5 seconds.") await asyncio.sleep(5) logger.info( f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds" From c54d90070583ee920797e1fb1c18bbbfbb8f4cc3 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 8 Sep 2025 19:21:36 +0000 Subject: [PATCH 60/73] fix --- pipelinerl/domains/miniwob/rollouts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 72c52678..4399f6a1 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -111,7 +111,7 @@ async def _execute_rollout_with_timeout( start_result = await env.start_task(problem) if isinstance(start_result, dict) and "error" in start_result: raise ValueError(start_result['error']) - elif isinstance(start_result, tuple): + elif isinstance(start_result, list): tape_dict, _ = start_result else: raise ValueError(f"Invalid start result: {start_result}") From ea4918a67cd0d35532fe0af4a7a3253e7fc3d160 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 9 Sep 2025 03:31:04 +0000 Subject: [PATCH 61/73] reduce timeouts --- conf/miniwob.yaml | 6 +++--- 
pipelinerl/domains/miniwob/rollouts.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 65e23d9c..1454e774 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -61,8 +61,8 @@ preprocess: # AGENT CONFIGURATION agent_max_loops: 10 # max number of agent - environment interactions for each task -agent_attempts: 3 # number of attempts to run the agent (retry on timeout/errors) -rollout_timeout: 1800 # overall timeout for entire rollout in seconds (30 minutes) +agent_attempts: 3 # number of attempts to run the agent (retry on errors) +rollout_timeout: 600 # overall timeout for entire rollout in seconds (10 minutes) reward_computation: nico agent: _target_: tapeagents.agent.Agent @@ -148,7 +148,7 @@ environment: miniwob_url: ??? n_envs: 32 host: "0.0.0.0" - env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) + env_call_timeout: 60 # timeout for each environment call (e.g. start_task, act, etc.) 
web_env_target: examples.rl_webagent.environment.WebEnvironment exp_path: null headless: true diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 4399f6a1..34ded1b6 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -72,7 +72,7 @@ async def generate_miniwob_rollout( start_time = time.time() # Overall timeout for the entire rollout to prevent hanging - rollout_timeout = getattr(cfg, 'rollout_timeout', 1800) # 30 minutes default + rollout_timeout = getattr(cfg, 'rollout_timeout', 600) # 10 minutes default try: # Execute the entire rollout with a timeout From e5fca104d7dfa7c9ea3b9c6e141e4e3e0d82a21b Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 12 Sep 2025 20:44:34 +0000 Subject: [PATCH 62/73] log number of groups done so far --- pipelinerl/actor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index ce63ac72..a329598f 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -196,6 +196,7 @@ async def rollout_and_maybe_produce_result( f"groups in progress: {len(group_rollouts)}, " f"rollouts started so far: {started_rollouts}, " f"rollouts finished so far: {finished_rollouts}, " + f"groups finished so far: {group_id}, " f"max group size in bytes: {result_queue.max_actual_entry_size()}, " ) last_logged = time.time() @@ -482,7 +483,6 @@ def run(self, dataset: list[tuple[str, dict]]): f" {in_progress} groups in progress" ) - self.update_stats(rollout_results=rollout_results) finished_groups += 1 From df66a88a8fb7c0e30907ba242019d0f5b91c9cfa Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 12 Sep 2025 20:45:45 +0000 Subject: [PATCH 63/73] log everything if populate_rl_data fails --- pipelinerl/preprocess.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pipelinerl/preprocess.py b/pipelinerl/preprocess.py index 65e29b4b..cd34b54d 100644 --- 
a/pipelinerl/preprocess.py +++ b/pipelinerl/preprocess.py @@ -160,7 +160,18 @@ def preprocess_dataset( entry["step_index"] = entry["metadata"]["step_index"] if not isinstance(tokenizer.eos_token_id, int): raise ValueError(f"Tokenizer {tokenizer} does not have an eos_token_id") - dataset = populate_rl_data(dataset=dataset, eos_token_id=tokenizer.eos_token_id, config=rl_config) + try: + dataset = populate_rl_data(dataset=dataset, eos_token_id=tokenizer.eos_token_id, config=rl_config) + except Exception as e: + logger.error(f"Error in populate_rl_data: {e}") + logger.error(f"Data: {data}") + logger.error(f"Dataset: {dataset}") + logger.error(f"Tokenizer: {tokenizer}") + logger.error(f"Tokenizer eos_token_id: {tokenizer.eos_token_id}") + logger.error(f"RL config: {rl_config}") + logger.error(f"LLM: {llm}") + logger.error(f"Seq length: {seq_length}") + raise e return dataset From c8d017122fff127c4218b3b5d2b24a0a426897e4 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 12 Sep 2025 20:49:23 +0000 Subject: [PATCH 64/73] monitor env servers and reset if needed --- pipelinerl/domains/miniwob/rollouts.py | 100 ++++++++++++++++++++----- 1 file changed, 83 insertions(+), 17 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 34ded1b6..dff461c1 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -1,4 +1,5 @@ import asyncio +import json import logging import os import random @@ -55,6 +56,41 @@ def tape_contains_an_error(tape: WebTape) -> bool: ) +async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession) -> dict: + """Check environment server health via HTTP API.""" + try: + url = f"http://{env_job.hostname}:{env_job.port}/health" + async with session.get(url, timeout=5) as response: + if response.status == 200: + health_data = await response.json() + return { + "healthy": True, + "active_workers": health_data.get("active_workers", 0), + 
"max_workers": health_data.get("max_workers", 0), + "stopped_workers": health_data.get("stopped_workers", 0) + } + else: + return {"healthy": False, "error": f"HTTP {response.status}"} + except Exception as e: + return {"healthy": False, "error": str(e)} + + +async def reset_env_server(env_job: Job, session: aiohttp.ClientSession) -> bool: + """Reset environment server via HTTP API.""" + try: + url = f"http://{env_job.hostname}:{env_job.port}/reset_all" + async with session.post(url, timeout=10) as response: + if response.status == 200: + logger.info(f"Reset environment server {env_job.hostname}:{env_job.port}") + return True + else: + logger.error(f"Reset failed: HTTP {response.status}") + return False + except Exception as e: + logger.error(f"Reset failed: {e}") + return False + + async def generate_miniwob_rollout( cfg: DictConfig, llm: TrainableLLM, @@ -74,16 +110,52 @@ async def generate_miniwob_rollout( # Overall timeout for the entire rollout to prevent hanging rollout_timeout = getattr(cfg, 'rollout_timeout', 600) # 10 minutes default - try: - # Execute the entire rollout with a timeout - return await asyncio.wait_for( - _execute_rollout_with_timeout(cfg, llm, problem, session, start_time), - timeout=rollout_timeout - ) - except asyncio.TimeoutError: - logger.error(f"Rollout timed out after {rollout_timeout} seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']}") - # Return a failed rollout result - return _create_failed_rollout_result(problem, start_time, "timeout") + env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] + env_jobs_url_tried = [] + + # Try each environment server with health checks until one of them returns a rollout result + for _ in range(len(env_jobs)): + # Choose the next environment server to try randomly from the ones that have not been tried yet + env_job = random.choice([job for job in env_jobs if f"http://{job.hostname}:{job.port}" not in env_jobs_url_tried]) + env_job_url = 
f"http://{env_job.hostname}:{env_job.port}" + env_jobs_url_tried.append(env_job_url) + + # Check server health before using + health = await check_env_server_health(env_job, session) + if not health["healthy"]: + logger.warning(f"Environment server {env_job_url} is unhealthy: {json.dumps(health, indent=2)}") + # Try to reset the server + if await reset_env_server(env_job, session): + logger.info(f"Reset environment server {env_job_url} successfully, retrying health check") + await asyncio.sleep(5) # Wait for server to restart + health = await check_env_server_health(env_job, session) + if not health["healthy"]: + logger.error(f"Environment server {env_job_url} still unhealthy after reset: {json.dumps(health, indent=2)}") + continue + else: + logger.error(f"Failed to reset environment server {env_job_url}") + continue + # Log health status for monitoring + if health["healthy"]: + logger.info(f"Using healthy environment server {env_job_url}: {json.dumps(health, indent=2)}") + + try: + # Execute the entire rollout with a timeout + return await asyncio.wait_for( + _execute_rollout_with_timeout(cfg, llm, problem, session, start_time, env_job_url), + timeout=rollout_timeout + ) + except asyncio.TimeoutError: + health = await check_env_server_health(env_job, session) + logger.warning(f"Rollout timed out after {rollout_timeout} seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {json.dumps(health, indent=2)}. Trying next server.") + continue + except Exception as e: + health = await check_env_server_health(env_job, session) + logger.warning(f"Rollout failed for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {json.dumps(health, indent=2)}. Trying next server.") + continue + # If all servers failed + logger.error(f"All environment servers failed for task {problem['dataset']}/{problem['task']}/{problem['seed']}. 
Returning a failed rollout result.") + return _create_failed_rollout_result(problem, start_time, "all environment servers failed") async def _execute_rollout_with_timeout( @@ -92,14 +164,8 @@ async def _execute_rollout_with_timeout( problem: dict, session: aiohttp.ClientSession, start_time: float, + env_job_url: str, ) -> RolloutResult: - # (1) Choose a random environment server - env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] - # choose the env job randomly - env_job = random.choice(env_jobs) - assert env_job.port is not None - env_job_url = f"http://{env_job.hostname}:{env_job.port}" - # (2) Generate environment, TapeAgent, and run them to get a Tape no_error = True # track if there was an error in the tape environment = AsyncRemoteEnvironment(server_url=env_job_url) # type: ignore From 981cd85a22dff384b011dab47f67b9daedba4c69 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 12 Sep 2025 21:04:32 +0000 Subject: [PATCH 65/73] better health message --- pipelinerl/domains/miniwob/rollouts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index dff461c1..b506a88c 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -70,9 +70,9 @@ async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession) "stopped_workers": health_data.get("stopped_workers", 0) } else: - return {"healthy": False, "error": f"HTTP {response.status}"} + return {"healthy": False, "error_status": f"HTTP {response.status}", "error_message": response.text} except Exception as e: - return {"healthy": False, "error": str(e)} + return {"healthy": False, "error_status": "Unknown", "error_message": str(e)} async def reset_env_server(env_job: Job, session: aiohttp.ClientSession) -> bool: From 9c755ed82803f346aadb7826fda968d7e5909e8d Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Sat, 13 Sep 2025 03:16:00 
+0000 Subject: [PATCH 66/73] small fix --- pipelinerl/domains/miniwob/rollouts.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index b506a88c..b9a9cb74 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -70,7 +70,8 @@ async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession) "stopped_workers": health_data.get("stopped_workers", 0) } else: - return {"healthy": False, "error_status": f"HTTP {response.status}", "error_message": response.text} + error_text = await response.text() + return {"healthy": False, "error_status": f"HTTP {response.status}", "error_message": error_text} except Exception as e: return {"healthy": False, "error_status": "Unknown", "error_message": str(e)} @@ -123,21 +124,21 @@ async def generate_miniwob_rollout( # Check server health before using health = await check_env_server_health(env_job, session) if not health["healthy"]: - logger.warning(f"Environment server {env_job_url} is unhealthy: {json.dumps(health, indent=2)}") + logger.warning(f"Environment server {env_job_url} is unhealthy: {health}") # Try to reset the server if await reset_env_server(env_job, session): logger.info(f"Reset environment server {env_job_url} successfully, retrying health check") await asyncio.sleep(5) # Wait for server to restart health = await check_env_server_health(env_job, session) if not health["healthy"]: - logger.error(f"Environment server {env_job_url} still unhealthy after reset: {json.dumps(health, indent=2)}") + logger.error(f"Environment server {env_job_url} still unhealthy after reset: {health}") continue else: logger.error(f"Failed to reset environment server {env_job_url}") continue # Log health status for monitoring if health["healthy"]: - logger.info(f"Using healthy environment server {env_job_url}: {json.dumps(health, indent=2)}") + logger.info(f"Using healthy 
environment server {env_job_url}: {health}") try: # Execute the entire rollout with a timeout @@ -147,11 +148,11 @@ async def generate_miniwob_rollout( ) except asyncio.TimeoutError: health = await check_env_server_health(env_job, session) - logger.warning(f"Rollout timed out after {rollout_timeout} seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {json.dumps(health, indent=2)}. Trying next server.") + logger.warning(f"Rollout timed out after {rollout_timeout} seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {health}. Trying next server.") continue except Exception as e: health = await check_env_server_health(env_job, session) - logger.warning(f"Rollout failed for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {json.dumps(health, indent=2)}. Trying next server.") + logger.warning(f"Rollout failed for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {health}. Trying next server.") continue # If all servers failed logger.error(f"All environment servers failed for task {problem['dataset']}/{problem['task']}/{problem['seed']}. 
Returning a failed rollout result.") From 0b8a24d3a209cef477442d37ba7f238f36c32839 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 26 Sep 2025 04:31:39 +0000 Subject: [PATCH 67/73] better logs --- pipelinerl/actor.py | 2 +- pipelinerl/domains/miniwob/rollouts.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index a329598f..bcce006b 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -196,7 +196,7 @@ async def rollout_and_maybe_produce_result( f"groups in progress: {len(group_rollouts)}, " f"rollouts started so far: {started_rollouts}, " f"rollouts finished so far: {finished_rollouts}, " - f"groups finished so far: {group_id}, " + f"groups started so far: {group_id}, " f"max group size in bytes: {result_queue.max_actual_entry_size()}, " ) last_logged = time.time() diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index b9a9cb74..3e941dae 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -73,7 +73,10 @@ async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession) error_text = await response.text() return {"healthy": False, "error_status": f"HTTP {response.status}", "error_message": error_text} except Exception as e: - return {"healthy": False, "error_status": "Unknown", "error_message": str(e)} + exception_type = type(e).__name__ + exception_message = str(e) if str(e) else "No message available" + logger.exception(f"Error checking environment server health: {exception_type}: {exception_message}", stack_info=True) + return {"healthy": False, "error_status": f"Exception: {exception_type}", "error_message": exception_message} async def reset_env_server(env_job: Job, session: aiohttp.ClientSession) -> bool: From cd27e30f485594393163ab742758510cd82c14cc Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 26 Sep 2025 19:44:15 +0000 Subject: [PATCH 68/73] always check the 
worker before launching the agent on it + more detailed logs --- pipelinerl/domains/miniwob/rollouts.py | 67 +++++++++++--------------- 1 file changed, 28 insertions(+), 39 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 3e941dae..3243ea1f 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -65,34 +65,17 @@ async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession) health_data = await response.json() return { "healthy": True, - "active_workers": health_data.get("active_workers", 0), - "max_workers": health_data.get("max_workers", 0), - "stopped_workers": health_data.get("stopped_workers", 0) + "health_data": health_data, + "last_check": time.time() } else: error_text = await response.text() - return {"healthy": False, "error_status": f"HTTP {response.status}", "error_message": error_text} + return {"healthy": False, "error_message": f"HTTP {response.status}: {error_text}", "last_check": time.time()} except Exception as e: exception_type = type(e).__name__ exception_message = str(e) if str(e) else "No message available" logger.exception(f"Error checking environment server health: {exception_type}: {exception_message}", stack_info=True) - return {"healthy": False, "error_status": f"Exception: {exception_type}", "error_message": exception_message} - - -async def reset_env_server(env_job: Job, session: aiohttp.ClientSession) -> bool: - """Reset environment server via HTTP API.""" - try: - url = f"http://{env_job.hostname}:{env_job.port}/reset_all" - async with session.post(url, timeout=10) as response: - if response.status == 200: - logger.info(f"Reset environment server {env_job.hostname}:{env_job.port}") - return True - else: - logger.error(f"Reset failed: HTTP {response.status}") - return False - except Exception as e: - logger.error(f"Reset failed: {e}") - return False + return {"healthy": False, "error_message": f"Exception: 
{exception_type}: {exception_message}", "last_check": time.time()} async def generate_miniwob_rollout( @@ -128,17 +111,7 @@ async def generate_miniwob_rollout( health = await check_env_server_health(env_job, session) if not health["healthy"]: logger.warning(f"Environment server {env_job_url} is unhealthy: {health}") - # Try to reset the server - if await reset_env_server(env_job, session): - logger.info(f"Reset environment server {env_job_url} successfully, retrying health check") - await asyncio.sleep(5) # Wait for server to restart - health = await check_env_server_health(env_job, session) - if not health["healthy"]: - logger.error(f"Environment server {env_job_url} still unhealthy after reset: {health}") - continue - else: - logger.error(f"Failed to reset environment server {env_job_url}") - continue + continue # Log health status for monitoring if health["healthy"]: logger.info(f"Using healthy environment server {env_job_url}: {health}") @@ -198,38 +171,54 @@ async def _execute_rollout_with_timeout( logger.warning("Retry start task after 5 seconds.") await asyncio.sleep(5) logger.info( - f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds" + f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds. Worker ID: {env.worker_id}. Tape dict: {tape_dict}" ) tape: WebTape = WebTape(**tape_dict) # convert http response dict to WebTape object t = time.perf_counter() if no_error: # only run the agent if the task started successfully - logger.info(f"Running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']}") + logger.info(f"Running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}") agent_attempts = cfg.agent_attempts while agent_attempts > 0: + # check if the worker is alive. 
+ try: + # this will either raise RuntimeError if worker is not alive anymore, or return a dictionary with the worker status + worker_status = await env.check_worker_alive() + if worker_status.get("status") == "starting": + logger.warning(f"Worker {env.worker_id} for task {problem['dataset']}/{problem['task']}/{problem['seed']} and tape ID {tape.metadata.id} is starting, waiting 5 seconds for it to be fully started.") + await asyncio.sleep(5) + continue + except Exception as e: + # if worker is dead, no need to retry + logger.exception(f"Worker {env.worker_id} for task {problem['dataset']}/{problem['task']}/{problem['seed']} and tape ID {tape.metadata.id} is dead. Error: {e}", stack_info=True) + no_error = False + break + # if worker is alive, run the agent try: actions = await env.a_actions() tools_description = await env.a_tools_description() agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) agent.llms = {DEFAULT: llm} tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) - # Check if the tape has an error from the orchestrator (e.g., SocketTimeoutError) + # Check if the tape has an error from the orchestrator (e.g., SocketTimeoutError, RuntimeError: Worker is not alive, etc.) if tape.metadata.error: + logger.error(f"Agent execution for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id} returned a tape with error: {tape.metadata.error}") raise ValueError(tape.metadata.error) else: # Success - break out of retry loop + logger.info(f"Agent execution for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id} finished successfully") break except Exception as e: agent_attempts -= 1 - logger.warning(f"Error occurred while running agent. {agent_attempts} attempts remaining. 
Error: {e}") + logger.warning(f"Error occurred while running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}. {agent_attempts} attempts remaining. Error: {e}") if agent_attempts <= 0: - logger.error(f"Agent execution failed after all retry attempts: {e}") + logger.error(f"Agent execution failed after all retry attempts for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}: {e}") no_error = False break else: - logger.warning("Retry agent execution after 5 seconds.") + logger.warning(f"Retry agent execution after 5 seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}.") await asyncio.sleep(5) logger.info( - f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds" + f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds with worker ID: {env.worker_id} and tape ID {tape.metadata.id}" ) tape.metadata.result.update({"total_execution_time": time.perf_counter() - t}) From f9ce99e7efa0392085c03cb346db8786031a8de7 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 29 Sep 2025 19:35:17 +0000 Subject: [PATCH 69/73] log stack trace --- pipelinerl/domains/miniwob/rollouts.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 3243ea1f..bdd753a8 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -4,6 +4,7 @@ import os import random import time +import traceback import aiohttp from examples.rl_webagent.steps import WebTape @@ -75,7 +76,7 @@ async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession) exception_type = 
type(e).__name__ exception_message = str(e) if str(e) else "No message available" logger.exception(f"Error checking environment server health: {exception_type}: {exception_message}", stack_info=True) - return {"healthy": False, "error_message": f"Exception: {exception_type}: {exception_message}", "last_check": time.time()} + return {"healthy": False, "error_message": f"Exception: {exception_type}: {exception_message}", "last_check": time.time(), "error_stacktrace": traceback.format_exc()} async def generate_miniwob_rollout( @@ -111,6 +112,7 @@ async def generate_miniwob_rollout( health = await check_env_server_health(env_job, session) if not health["healthy"]: logger.warning(f"Environment server {env_job_url} is unhealthy: {health}") + logger.warning(f"Get health error stacktrace: {health['error_stacktrace']}") continue # Log health status for monitoring if health["healthy"]: @@ -124,10 +126,16 @@ async def generate_miniwob_rollout( ) except asyncio.TimeoutError: health = await check_env_server_health(env_job, session) + if stack_trace := health.get("error_stacktrace"): + logger.warning(f"Get health error stacktrace: {stack_trace}") + logger.warning(f"Rollout timeout error stacktrace: {traceback.format_exc()}") logger.warning(f"Rollout timed out after {rollout_timeout} seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {health}. Trying next server.") continue except Exception as e: health = await check_env_server_health(env_job, session) + if stack_trace := health.get("error_stacktrace"): + logger.warning(f"Get health error stacktrace: {stack_trace}") + logger.warning(f"Rollout failed error stacktrace: {traceback.format_exc()}") logger.warning(f"Rollout failed for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {health}. 
Trying next server.") continue # If all servers failed From 60fb04282d3157eaa771dab5c6282e84a3b0bcb6 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 29 Sep 2025 19:50:02 +0000 Subject: [PATCH 70/73] small cleanup --- pipelinerl/domains/miniwob/rollouts.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index bdd753a8..ec71ff8e 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -159,13 +159,9 @@ async def _execute_rollout_with_timeout( t = time.perf_counter() while start_attempts > 0: try: - start_result = await env.start_task(problem) - if isinstance(start_result, dict) and "error" in start_result: - raise ValueError(start_result['error']) - elif isinstance(start_result, list): - tape_dict, _ = start_result - else: - raise ValueError(f"Invalid start result: {start_result}") + tape_dict, info = await env.start_task(problem) + if info.get("error"): + raise ValueError(info['error']) break except Exception as e: start_attempts -= 1 From 122db3cade1717d92f3ee965c0ac148f4d0054b3 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 14 Oct 2025 18:08:32 +0000 Subject: [PATCH 71/73] add massimo heldout goals --- pipelinerl/domains/miniwob/load_tasks.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pipelinerl/domains/miniwob/load_tasks.py b/pipelinerl/domains/miniwob/load_tasks.py index a056a311..63cbbb8a 100644 --- a/pipelinerl/domains/miniwob/load_tasks.py +++ b/pipelinerl/domains/miniwob/load_tasks.py @@ -207,6 +207,11 @@ def load_tasks(dataset_names: list[str], train_split: float = 0.6, seeds: list[i {"dataset": task, "task": task, "seed": seed} for task in MASSIMO_TRAIN_SPLIT for seed in range(3,10) # seeds 0-2 are used for held out goals in Mass setup ]) + elif name == "massimo_train_heldout_goals": + tasks.extend([ + {"dataset": task, "task": task, "seed": seed} + for task in 
MASSIMO_TRAIN_SPLIT for seed in range(3) # seeds 0-2 are used for held out goals in Mass setup + ]) elif name == "massimo_test": tasks.extend([ {"dataset": task, "task": task, "seed": seed} From ff001e8a7412c2759f5c525b9b4ca809a5edbbf6 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 17 Nov 2025 17:30:12 +0000 Subject: [PATCH 72/73] add back the dataset names & update massimo configs --- conf/miniwob_massimo_grpo.yaml | 1 + conf/miniwob_massimo_ppo.yaml | 1 + pipelinerl/domains/miniwob/load_tasks.py | 18 +++++++----------- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/conf/miniwob_massimo_grpo.yaml b/conf/miniwob_massimo_grpo.yaml index b61dcf32..e96370af 100644 --- a/conf/miniwob_massimo_grpo.yaml +++ b/conf/miniwob_massimo_grpo.yaml @@ -5,6 +5,7 @@ defaults: train_dataset_names: - massimo_train test_dataset_names: + - massimo_train_heldout_goals - massimo_test reward_computation: massimo diff --git a/conf/miniwob_massimo_ppo.yaml b/conf/miniwob_massimo_ppo.yaml index 53703d56..9e07e74d 100644 --- a/conf/miniwob_massimo_ppo.yaml +++ b/conf/miniwob_massimo_ppo.yaml @@ -5,6 +5,7 @@ defaults: train_dataset_names: - massimo_train test_dataset_names: + - massimo_train_heldout_goals - massimo_test reward_computation: massimo diff --git a/pipelinerl/domains/miniwob/load_tasks.py b/pipelinerl/domains/miniwob/load_tasks.py index 63cbbb8a..27c35351 100644 --- a/pipelinerl/domains/miniwob/load_tasks.py +++ b/pipelinerl/domains/miniwob/load_tasks.py @@ -182,39 +182,35 @@ def load_tasks(dataset_names: list[str], train_split: float = 0.6, seeds: list[i for name in dataset_names: if name == "debug": tasks.extend([ - # {"dataset": "miniwob.debug", "task": task, "seed": 0} for task in DEBUG_SPLIT - {"dataset": task, "task": task, "seed": 0} for task in DEBUG_SPLIT + {"dataset": "miniwob.debug", "task": task, "seed": 0} for task in DEBUG_SPLIT ]) elif name == "easy": tasks.extend([ - # {"dataset": "miniwob.easy", "task": task, "seed": 0} for task in 
EASY_SPLIT - {"dataset": task, "task": task, "seed": 0} for task in EASY_SPLIT + {"dataset": "miniwob.easy", "task": task, "seed": 0} for task in EASY_SPLIT ]) elif name == "train": tasks.extend([ - # {"dataset": "miniwob.train", "task": task, "seed": seed} - {"dataset": task, "task": task, "seed": seed} + {"dataset": "miniwob.train", "task": task, "seed": seed} for task in TRAIN_SPLIT for seed in seeds ]) elif name == "test": tasks.extend([ - # {"dataset": "miniwob.test", "task": task, "seed": seed} - {"dataset": task, "task": task, "seed": seed} + {"dataset": "miniwob.test", "task": task, "seed": seed} for task in TEST_SPLIT for seed in seeds ]) elif name == "massimo_train": tasks.extend([ - {"dataset": task, "task": task, "seed": seed} + {"dataset": "miniwob.massimo_train", "task": task, "seed": seed} for task in MASSIMO_TRAIN_SPLIT for seed in range(3,10) # seeds 0-2 are used for held out goals in Mass setup ]) elif name == "massimo_train_heldout_goals": tasks.extend([ - {"dataset": task, "task": task, "seed": seed} + {"dataset": "miniwob.massimo_train_heldout_goals", "task": task, "seed": seed} for task in MASSIMO_TRAIN_SPLIT for seed in range(3) # seeds 0-2 are used for held out goals in Mass setup ]) elif name == "massimo_test": tasks.extend([ - {"dataset": task, "task": task, "seed": seed} + {"dataset": "miniwob.massimo_test", "task": task, "seed": seed} for task in MASSIMO_TEST_SPLIT for seed in range(10) ]) return tasks From 4c1bd4a3c3b8995d42e259072cccb4795d980bf9 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 28 Nov 2025 12:24:57 +0100 Subject: [PATCH 73/73] address review comments --- ...assimo_grpo.yaml => miniwob_uic_grpo.yaml} | 8 +++---- ..._massimo_ppo.yaml => miniwob_uic_ppo.yaml} | 8 +++---- pipelinerl/domains/miniwob/load_tasks.py | 23 ++++++++++--------- pipelinerl/domains/miniwob/rollouts.py | 6 ++--- pipelinerl/preprocess.py | 19 +++++++-------- pipelinerl/world.py | 11 ++++----- 6 files changed, 36 insertions(+), 39 deletions(-) 
rename conf/{miniwob_massimo_grpo.yaml => miniwob_uic_grpo.yaml} (67%) rename conf/{miniwob_massimo_ppo.yaml => miniwob_uic_ppo.yaml} (66%) diff --git a/conf/miniwob_massimo_grpo.yaml b/conf/miniwob_uic_grpo.yaml similarity index 67% rename from conf/miniwob_massimo_grpo.yaml rename to conf/miniwob_uic_grpo.yaml index e96370af..7e7746c6 100644 --- a/conf/miniwob_massimo_grpo.yaml +++ b/conf/miniwob_uic_grpo.yaml @@ -3,12 +3,12 @@ defaults: - _self_ train_dataset_names: - - massimo_train + - uic_train test_dataset_names: - - massimo_train_heldout_goals - - massimo_test + - uic_train_heldout_goals + - uic_test -reward_computation: massimo +reward_computation: uic finetune: gradient_accumulation_passes: 512 diff --git a/conf/miniwob_massimo_ppo.yaml b/conf/miniwob_uic_ppo.yaml similarity index 66% rename from conf/miniwob_massimo_ppo.yaml rename to conf/miniwob_uic_ppo.yaml index 9e07e74d..db0f5792 100644 --- a/conf/miniwob_massimo_ppo.yaml +++ b/conf/miniwob_uic_ppo.yaml @@ -3,12 +3,12 @@ defaults: - _self_ train_dataset_names: - - massimo_train + - uic_train test_dataset_names: - - massimo_train_heldout_goals - - massimo_test + - uic_train_heldout_goals + - uic_test -reward_computation: massimo +reward_computation: uic finetune: gradient_accumulation_passes: 512 diff --git a/pipelinerl/domains/miniwob/load_tasks.py b/pipelinerl/domains/miniwob/load_tasks.py index 27c35351..a0efc261 100644 --- a/pipelinerl/domains/miniwob/load_tasks.py +++ b/pipelinerl/domains/miniwob/load_tasks.py @@ -1,4 +1,5 @@ import random + from browsergym.miniwob import ALL_MINIWOB_TASKS DEBUG_SPLIT = [ @@ -34,7 +35,7 @@ "miniwob.tic-tac-toe", "miniwob.use-autocomplete-nodelay" ] -MASSIMO_TRAIN_SPLIT = [ +UIC_TRAIN_SPLIT = [ "miniwob.ascending-numbers", "miniwob.bisect-angle", "miniwob.book-flight", @@ -135,7 +136,7 @@ "miniwob.use-spinner", "miniwob.visual-addition", ] -MASSIMO_TEST_SPLIT = [ +UIC_TEST_SPLIT = [ "miniwob.buy-ticket", "miniwob.click-button", "miniwob.click-option", 
@@ -198,20 +199,20 @@ def load_tasks(dataset_names: list[str], train_split: float = 0.6, seeds: list[i {"dataset": "miniwob.test", "task": task, "seed": seed} for task in TEST_SPLIT for seed in seeds ]) - elif name == "massimo_train": + elif name == "uic_train": tasks.extend([ - {"dataset": "miniwob.massimo_train", "task": task, "seed": seed} - for task in MASSIMO_TRAIN_SPLIT for seed in range(3,10) # seeds 0-2 are used for held out goals in Mass setup + {"dataset": "miniwob.uic_train", "task": task, "seed": seed} + for task in UIC_TRAIN_SPLIT for seed in range(3,10) # seeds 0-2 are used for held out goals in Mass setup ]) - elif name == "massimo_train_heldout_goals": + elif name == "uic_train_heldout_goals": tasks.extend([ - {"dataset": "miniwob.massimo_train_heldout_goals", "task": task, "seed": seed} - for task in MASSIMO_TRAIN_SPLIT for seed in range(3) # seeds 0-2 are used for held out goals in Mass setup + {"dataset": "miniwob.uic_train_heldout_goals", "task": task, "seed": seed} + for task in UIC_TRAIN_SPLIT for seed in range(3) # seeds 0-2 are used for held out goals in Mass setup ]) - elif name == "massimo_test": + elif name == "uic_test": tasks.extend([ - {"dataset": "miniwob.massimo_test", "task": task, "seed": seed} - for task in MASSIMO_TEST_SPLIT for seed in range(10) + {"dataset": "miniwob.uic_test", "task": task, "seed": seed} + for task in UIC_TEST_SPLIT for seed in range(10) ]) return tasks diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 8f2413d3..ea850814 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -14,16 +14,14 @@ from tapeagents.core import LLMCall, LLMOutputParsingFailureAction, Observation from tapeagents.io import save_json_tape from tapeagents.llms.trainable import TrainableLLM -from tapeagents.core import LLMOutputParsingFailureAction, Observation - from tapeagents.orchestrator import async_execute_agent from 
tapeagents.remote_environment import AsyncRemoteEnvironment from tapeagents.tools.simple_browser import PageObservation from pipelinerl.async_llm import make_training_text +from pipelinerl.llm import LLMCall, TrainableLLM from pipelinerl.rollouts import BaseMetrics, RolloutResult from pipelinerl.world import Job -from pipelinerl.llm import TrainableLLM, LLMCall from .steps import WebTape @@ -255,7 +253,7 @@ async def _execute_rollout_with_timeout( if cfg.reward_computation == "nico": reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 - elif cfg.reward_computation == "massimo": + elif cfg.reward_computation == "uic": reward = float(raw_reward>0) if reward == 0.0: reward = -1.0 diff --git a/pipelinerl/preprocess.py b/pipelinerl/preprocess.py index f622cff9..0a6015e4 100644 --- a/pipelinerl/preprocess.py +++ b/pipelinerl/preprocess.py @@ -160,15 +160,16 @@ def preprocess_dataset( try: dataset = populate_rl_data(dataset=dataset, eos_token_id=tokenizer.eos_token_id, config=rl_config) except Exception as e: - logger.error(f"Error in populate_rl_data: {e}") - logger.error(f"Data: {data}") - logger.error(f"Dataset: {dataset}") - logger.error(f"Tokenizer: {tokenizer}") - logger.error(f"Tokenizer eos_token_id: {tokenizer.eos_token_id}") - logger.error(f"RL config: {rl_config}") - logger.error(f"LLM: {llm}") - logger.error(f"Seq length: {seq_length}") - raise e + logger.error(f"Error in populate_rl_data: {e}", extra={ + "data": data, + "dataset": dataset, + "tokenizer": tokenizer, + "eos_token_id": tokenizer.eos_token_id, + "rl_config": rl_config, + "llm": llm, + "seq_length": seq_length, + }) + raise return dataset diff --git a/pipelinerl/world.py b/pipelinerl/world.py index cc23afd0..992a7c4d 100644 --- a/pipelinerl/world.py +++ b/pipelinerl/world.py @@ -1,9 +1,9 @@ import logging import os -from typing import Literal -from pydantic import BaseModel -from omegaconf import DictConfig + import torch +from omegaconf import DictConfig +from 
pydantic import BaseModel logger = logging.getLogger(__name__) @@ -188,10 +188,7 @@ def _place_pipeline_stages(self, cfg): self.add_job(kind="preprocessor", replica_idx=worker_idx, node_rank=node, gpus=[], cpu_heavy=True) def _place_environments(self, cfg): - # Scale environment servers to be the same as llm servers - env_replicas_per_actor = getattr(cfg.world, "env_replicas_per_actor", 1) - total_env_replicas = cfg.world.replicas * self.llms_per_actor * env_replicas_per_actor - for worker_idx in range(total_env_replicas): + for worker_idx in range(cfg.world.env_replicas): node = self.get_least_busy_node() envs_at_node = len([job for job in self.job_map[node] if job.kind == "environment"]) self.add_job(