From 3d57d2effaaf4073338d13831f2277b4bdb0d970 Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Fri, 25 Jul 2025 17:21:01 +0000
Subject: [PATCH 001/126] processed_entries_queue_popped_data

---
 pipelinerl/preprocess.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pipelinerl/preprocess.py b/pipelinerl/preprocess.py
index 65e29b4b..887bdceb 100644
--- a/pipelinerl/preprocess.py
+++ b/pipelinerl/preprocess.py
@@ -637,6 +637,7 @@ def run_preprocessing_loop(
                 "preprocessor/queue/output": output_queue.qsize(),
                 "preprocessor/filtered_out_samples": num_filtered_out,
                 "preprocessor/total_filtered_out_samples": total_filtered_out,
+                "preprocessor/popped_entries_queue": processed_entries_queue_popped_data,
             }
             if stats_aggregator.has_enough_data():
                 stats.update({"preprocessor/" + k: v for k, v in stats_aggregator.get_stats().items()})

From 4fbc5c7dbcd64ac98aad60c8486c3acd913d2d44 Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Fri, 25 Jul 2025 18:54:27 +0000
Subject: [PATCH 002/126] faster preprocess

---
 conf/base.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/conf/base.yaml b/conf/base.yaml
index 3d426f4c..ac44fdde 100644
--- a/conf/base.yaml
+++ b/conf/base.yaml
@@ -23,9 +23,9 @@ preprocess:
   input: actor
   output: training_data
   n_workers: 8
-  chunk_n_groups: 2
+  chunk_n_groups: 8
   # queue for loaded raw groups
-  raw_queue_size: 8
+  raw_queue_size: 128
   # queue for processed chunks of multiple groups
   input_queue_size: 32
   # queue for ready chunks for multiple groups

From 91acbc4386cf413ed6d5646485b6adbe9b4df799 Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Fri, 25 Jul 2025 19:16:33 +0000
Subject: [PATCH 003/126] more logging

---
 pipelinerl/preprocess.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pipelinerl/preprocess.py b/pipelinerl/preprocess.py
index 887bdceb..a8676e7a 100644
--- a/pipelinerl/preprocess.py
+++ b/pipelinerl/preprocess.py
@@ -170,6 +170,7 @@ def run_dataset_loader(
     check_group_size: int,
     chunk_n_groups: int,
     pop_old_data: bool,
+    wandb_run,
 ):
     old_and_dropped = 0
     last_time_notice = 0
@@ -196,6 +197,8 @@ def run_dataset_loader(
                 if old_and_dropped // 100 != last_time_notice:
                     logger.info(f"So far removed {old_and_dropped} old elements from preprocessor queue")
                     last_time_notice = old_and_dropped // 100
+                if wandb_run is not None:
+                    wandb_run.log({"preprocessor/old_and_dropped": old_and_dropped})
             except Empty:
                 pass
         # Put new element in now that we made space
@@ -382,6 +385,7 @@ def run_preprocessing_loop(
         check_group_size=cfg.attempts,
         chunk_n_groups=cfg.preprocess.chunk_n_groups,
         pop_old_data=pop_old_data,
+        wandb_run=wandb_run,
     )
     # Start the dataset loader thread using Thread
     dataset_loader_thread = threading.Thread(target=dataset_loader_worker_fn)

From fb5a0bd06750d4f4dd6d3573759da03d795b7acc Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Fri, 25 Jul 2025 19:18:11 +0000
Subject: [PATCH 004/126] better namming

---
 pipelinerl/preprocess.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pipelinerl/preprocess.py b/pipelinerl/preprocess.py
index a8676e7a..fc238798 100644
--- a/pipelinerl/preprocess.py
+++ b/pipelinerl/preprocess.py
@@ -198,7 +198,7 @@ def run_dataset_loader(
                     logger.info(f"So far removed {old_and_dropped} old elements from preprocessor queue")
                     last_time_notice = old_and_dropped // 100
                 if wandb_run is not None:
-                    wandb_run.log({"preprocessor/old_and_dropped": old_and_dropped})
+                    wandb_run.log({"preprocessor/dropped_before_preprocessing": old_and_dropped})
             except Empty:
                 pass
         # Put new element in now that we made space
@@ -641,7 +641,7 @@ def run_preprocessing_loop(
                 "preprocessor/queue/output": output_queue.qsize(),
                 "preprocessor/filtered_out_samples": num_filtered_out,
                 "preprocessor/total_filtered_out_samples": total_filtered_out,
-                "preprocessor/popped_entries_queue": processed_entries_queue_popped_data,
+                "preprocessor/dropped_after_preprocessing": processed_entries_queue_popped_data,
             }
             if stats_aggregator.has_enough_data():
                 stats.update({"preprocessor/" + k: v for k, v in stats_aggregator.get_stats().items()})

From 8c78c4517d61d7d8542e334488772d70f94ae946 Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Fri, 25 Jul 2025 19:27:39 +0000
Subject: [PATCH 005/126] clean up

---
 pipelinerl/preprocess.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/pipelinerl/preprocess.py b/pipelinerl/preprocess.py
index fc238798..d758ff36 100644
--- a/pipelinerl/preprocess.py
+++ b/pipelinerl/preprocess.py
@@ -170,7 +170,6 @@ def run_dataset_loader(
     check_group_size: int,
     chunk_n_groups: int,
     pop_old_data: bool,
-    wandb_run,
 ):
     old_and_dropped = 0
     last_time_notice = 0
@@ -197,8 +196,6 @@ def run_dataset_loader(
                 if old_and_dropped // 100 != last_time_notice:
                     logger.info(f"So far removed {old_and_dropped} old elements from preprocessor queue")
                     last_time_notice = old_and_dropped // 100
-                if wandb_run is not None:
-                    wandb_run.log({"preprocessor/dropped_before_preprocessing": old_and_dropped})
             except Empty:
                 pass
         # Put new element in now that we made space
@@ -385,7 +382,6 @@ def run_preprocessing_loop(
         check_group_size=cfg.attempts,
         chunk_n_groups=cfg.preprocess.chunk_n_groups,
         pop_old_data=pop_old_data,
-        wandb_run=wandb_run,
     )
     # Start the dataset loader thread using Thread
     dataset_loader_thread = threading.Thread(target=dataset_loader_worker_fn)

From 1b90a4b9033a9842fdc8c5ae9538c39026c0368f Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Sat, 26 Jul 2025 18:03:48 +0000
Subject: [PATCH 006/126] add groups_in_progress

---
 pipelinerl/actor.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py
index dad79e0b..b0908d08 100644
--- a/pipelinerl/actor.py
+++ b/pipelinerl/actor.py
@@ -498,6 +498,7 @@ def run(self, dataset: list[tuple[str, dict]]):
                 "finished_groups": finished_groups,
                 "trainer_model_version": trainer_version_to_publish,
                 "time_since_start": time.time() - loop_start_time,
+                "groups_in_progress": in_progress,
             }
             trainer_version_to_publish = None
         else:

From 3c8f338e9c5bd7c41106333de785f6ce68a026a7 Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Sat, 26 Jul 2025 18:54:34 +0000
Subject: [PATCH 007/126] raise when finetune is done

---
 pipelinerl/finetune_loop.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pipelinerl/finetune_loop.py b/pipelinerl/finetune_loop.py
index 68bbfb17..0948e056 100644
--- a/pipelinerl/finetune_loop.py
+++ b/pipelinerl/finetune_loop.py
@@ -483,6 +483,7 @@ def run_finetuning_loop(
     finally:
         if actor_update_group:
             dist.destroy_process_group(actor_update_group)
+        raise RuntimeError("Finetuning loop finished, exiting worker thread")


 def rl_finetuning_worker(

From f88dceb9704be64d95dede133bd79f8a1725e430 Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Sun, 27 Jul 2025 21:40:05 +0000
Subject: [PATCH 008/126] cte lr

---
 conf/finetune/base.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/finetune/base.yaml b/conf/finetune/base.yaml
index 237e6d56..6fb09310 100644
--- a/conf/finetune/base.yaml
+++ b/conf/finetune/base.yaml
@@ -36,7 +36,7 @@ learning_rate: 1e-6
 # How much to clip the gradient (no clipping if null)
gradient_clipping_threshold: 0.3 # Learning rate scheduler type (indexed by completed_steps). -lr_scheduler_type: cosine # could be cosine, constant_with_warmup +lr_scheduler_type: constant # could be cosine, constant_with_warmup # Number of warmup (completed) steps in the learning rate schedule. num_warmup_steps: 50 # Number of gradient accumulation steps. From 812aafcc100f4b56a8ebe9e6365e5b109bef4a75 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 15 Aug 2025 22:39:44 +0000 Subject: [PATCH 009/126] first mcp --- .gitignore | 3 +- conf/mcp/python.json | 16 +++ conf/tir_mcp.yaml | 119 +++++++++++++++++++++++ pipelinerl/domains/math/__init__.py | 2 +- pipelinerl/domains/math/rollouts.py | 45 +++++---- pipelinerl/domains/tir_mcp/__init__.py | 1 + pipelinerl/domains/tir_mcp/env_server.py | 44 +++++++++ pipelinerl/domains/tir_mcp/rollouts.py | 77 +++++++++++++++ 8 files changed, 284 insertions(+), 23 deletions(-) create mode 100644 conf/mcp/python.json create mode 100644 conf/tir_mcp.yaml create mode 100644 pipelinerl/domains/tir_mcp/__init__.py create mode 100644 pipelinerl/domains/tir_mcp/env_server.py create mode 100644 pipelinerl/domains/tir_mcp/rollouts.py diff --git a/.gitignore b/.gitignore index 476aab77..1469bc67 100644 --- a/.gitignore +++ b/.gitignore @@ -120,6 +120,7 @@ celerybeat.pid # SageMath parsed files *.sage.py +node_modules/ # Environments .env @@ -185,4 +186,4 @@ results results/ data/ cache/ -dump.rdb \ No newline at end of file +dump.rdb diff --git a/conf/mcp/python.json b/conf/mcp/python.json new file mode 100644 index 00000000..50ccbe69 --- /dev/null +++ b/conf/mcp/python.json @@ -0,0 +1,16 @@ +{ + "mcpServers": { + "python_exec": { + "command": "deno", + "args": [ + "run", + "-N", + "-R=node_modules", + "-W=node_modules", + "--node-modules-dir=auto", + "jsr:@pydantic/mcp-run-python", + "stdio" + ] + } + } +} \ No newline at end of file diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml new file mode 100644 index 00000000..45596607 --- /dev/null +++ b/conf/tir_mcp.yaml @@ -0,0 +1,119 @@ +defaults: + - base + - _self_ + +actor: + rollout_policy: pipelinerl.domains.tir_mcp.generate_math_rollout2 + system_prompt: Please reason step by step, and put your final answer within \boxed{}. + task_template: |- + {task} + +dataset_loader: pipelinerl.domains.math.load_datasets +train_dataset_names: +- open_reasoner_zero_57k +- open_reasoner_zero_extended_72k +test_dataset_names: + - aime_2024 + - amc_2023 + - math_500 + +vllm_config: + use_v1: true + vllm_kwargs: + enable-auto-tool-choice: "" + tool-call-parser: hermes + +environment: + _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer + n_envs: 8 + n_envs_mcp: 7 + n_envs_math: 1 + host: localhost + exp_path: ${output_dir}/env_server + mcp_target: tapeagents.mcp.MCPEnvironment + mcp_config_path: /home/toolkit/research-now-reasoner/pipelinerl/conf/mcp/python.json + mcp_tools_whitelist: + - run_python_code + math_target: pipelinerl.domains.math.MathEnvironment + + +agent_max_loops: 2 +agent: + _target_: tapeagents.agent.Agent + name : mcp_agent + max_iterations: 2 + templates: + system_prompt: | + You are an expert AI Agent trained to assist users with complex information processing tasks. + Your role is to understand user queries and respond in a helpful and accurate manner. + Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. + Do not express emotions or opinions about user questions. 
+ allowed_tools: | + You have access to the following tools: + {tools_description} + thought_format: | + Important! Respond with the plain text, do not include any JSON or code. + Do not output anything besides what I asked in this message. + allowed_steps: | + You have access to the following tools: + {tools_description} + format: > + Output only a single JSON dict. + Do not repeat the last thought again. + If the last action does not change the observation, do not repeat it! + DO NOT OUTPUT ANYTHING BESIDES THE JSON! DO NOT PLACE ANY COMMENTS INSIDE THE JSON. + It will break the system that processes the output. + + nodes: + - _target_: tapeagents.nodes.StandardNode + name: plan + system_prompt: ${agent.templates.system_prompt} + guidance: | + Write a concise multi-step plan explaining which steps should be performed to find the answer for the given task. + Be specific about how each step should be performed. Only describe the intended actions here, do not perform them yet. + Consider that next steps may depend on results of previous steps, so include conditional branching using "if" statements where needed. + Start with the title "Plan". Every step should have short name and description. + ${agent.templates.thought_format} + steps_prompt: ${agent.templates.allowed_tools} + + - _target_: tapeagents.nodes.StandardNode + name: select + system_prompt: ${agent.templates.system_prompt} + trim_obs_except_last_n: 1 + guidance: | + Select the next step to do to move forward with the plan. Describe the expected effect of the proposed action. + ${agent.templates.thought_format} + steps_prompt: ${agent.templates.allowed_tools} + + - _target_: tapeagents.nodes.StandardNode + name: act + system_prompt: ${agent.templates.system_prompt} + trim_obs_except_last_n: 1 + guidance: Then produce single function call for the next step. If the answer is ready, call GaiaAnswer. + steps: + - examples.gaia_agent.steps.GaiaAnswer + use_known_actions: true + use_function_calls: true + + - _target_: tapeagents.nodes.StandardNode + name: summarize + system_prompt: ${agent.templates.system_prompt} + trim_obs_except_last_n: 1 + guidance: | + Summarize last observation. If its an image, thoroughly describe it with all details. + Describe the results of the last action and observed changes + Do not hallucinate or make up any information, only describe what you see in the observation. + Do not guess or assume action effects, describe only visible changes. + ${agent.templates.thought_format} + + - _target_: tapeagents.nodes.StandardNode + name: reflect + system_prompt: ${agent.templates.system_prompt} + trim_obs_except_last_n: 1 + guidance: | + 1. Evaluate the action's success, explain its effect on current step, overall plan and task solution. + 2. If the last action was not successful, describe errors and the possible reasons for failure. + 3. Check if the current plan step is finished. + 4. If the step is finished, update the following steps of the plan with new information and choose the next step. 
+ ${agent.templates.thought_format} + next_node: select \ No newline at end of file diff --git a/pipelinerl/domains/math/__init__.py b/pipelinerl/domains/math/__init__.py index 9aee0b8f..1c7310f2 100644 --- a/pipelinerl/domains/math/__init__.py +++ b/pipelinerl/domains/math/__init__.py @@ -1,3 +1,3 @@ from .load_datasets import load_datasets -from .rollouts import generate_math_rollout, RewardTable +from .rollouts import generate_math_rollout, RewardTable, get_reward from .verifier_api import MathEnvironment, verify_answer, verify_answer_rpc \ No newline at end of file diff --git a/pipelinerl/domains/math/rollouts.py b/pipelinerl/domains/math/rollouts.py index 41a61021..cdb7ba2a 100644 --- a/pipelinerl/domains/math/rollouts.py +++ b/pipelinerl/domains/math/rollouts.py @@ -26,6 +26,28 @@ class RewardTable(BaseModel): correct_answer_finished: float buffer_tokens: int = 0 # 0 means no overlong reward shaping +def get_reward(answer_status: str, finished: bool, reward_table: RewardTable) -> float: + match (answer_status, finished): + case ("wrong", False): + return reward_table.wrong_answer_not_finished + case ("wrong", True): + return reward_table.wrong_answer_finished + case ("no_answer", False): + reward = reward_table.no_answer_not_finished + case ("no_answer", True): + reward = reward_table.no_answer_finished + case ("unparsable", False): + reward = reward_table.unparsable_not_finished + case ("unparsable", True): + reward = reward_table.unparsable_finished + case ("correct", False): + reward = reward_table.correct_answer_not_finished + case ("correct", True): + reward = reward_table.correct_answer_finished + case _: + raise ValueError(f"Invalid answer_status/finished combination: {answer_status}/{trace.finished}") + + def length_penalty(max_length: int, sequence_length: int, buffer_tokens: int) -> float: """ Compute the overlong penalty @@ -51,7 +73,7 @@ async def generate_math_rollout( latency = time.time() - time_start assert llm_call.output.content is not None - rewards = RewardTable(**dict(cfg.rewards)) + reward_table = RewardTable(**dict(cfg.rewards)) discount_factor = cfg.actor.discount_factor # math_verify is a fast environment, no support for environment replicas for now @@ -70,26 +92,7 @@ async def generate_math_rollout( trace = make_training_text(llm, llm_call) # Determine reward based on answer status and finished state - match (answer_status, trace.finished): - case ("wrong", False): - reward = rewards.wrong_answer_not_finished - case ("wrong", True): - reward = rewards.wrong_answer_finished - case ("no_answer", False): - reward = rewards.no_answer_not_finished - case ("no_answer", True): - reward = rewards.no_answer_finished - case ("unparsable", False): - reward = rewards.unparsable_not_finished - case ("unparsable", True): - reward = rewards.unparsable_finished - case ("correct", False): - reward = rewards.correct_answer_not_finished - case ("correct", True): - reward = rewards.correct_answer_finished - case _: - raise ValueError(f"Invalid answer_status/finished combination: {answer_status}/{trace.finished}") - + reward = get_reward(answer_status, trace.finished, reward_table) # Apply discount factor based on output length reward *= discount_factor**llm_call.output_length_tokens overlong_penalty = 0 diff --git a/pipelinerl/domains/tir_mcp/__init__.py b/pipelinerl/domains/tir_mcp/__init__.py new file mode 100644 index 00000000..c558147b --- /dev/null +++ b/pipelinerl/domains/tir_mcp/__init__.py @@ -0,0 +1 @@ +from .rollouts import generate_math_rollout2 \ No newline at end 
of file diff --git a/pipelinerl/domains/tir_mcp/env_server.py b/pipelinerl/domains/tir_mcp/env_server.py new file mode 100644 index 00000000..53259069 --- /dev/null +++ b/pipelinerl/domains/tir_mcp/env_server.py @@ -0,0 +1,44 @@ +import os +from tapeagents.remote_environment import EnvironmentServer +from omegaconf import OmegaConf +from typing import List + +class MCPEnvironmentServer: + + def __init__(self, + n_envs: int, + n_envs_mcp: int, + n_envs_math: int, + host: str, + mcp_target: str, + mcp_config_path: str, + mcp_tools_whitelist: List[str], + math_target: str, + exp_path: str, + env_call_timeout: int = 60, + ): + # Remote environment server configuration + self.n_envs = n_envs + self.host = host + self.env_call_timeout = env_call_timeout + # Individual web environment configuration + self.mcp_target = mcp_target + self.mcp_config_path = mcp_config_path + self.mcp_tools_whitelist = mcp_tools_whitelist + self.exp_path = exp_path + + + def launch(self, port: int): + """ + Serve the environment in TapeAgent. + """ + if port != 7777: + env_server = EnvironmentServer(n_envs=self.n_envs, host=self.host, port=port, env_call_timeout=self.env_call_timeout) + env_server.launch(OmegaConf.create({ + "_target_": self.mcp_target, + "config_path": self.mcp_config_path, + "tools_whitelist": self.mcp_tools_whitelist, + })) + else: + MathEnvironment.launch(port) + diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py new file mode 100644 index 00000000..3a85804a --- /dev/null +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -0,0 +1,77 @@ +import time +import random +import logging + +import aiohttp +from omegaconf import DictConfig +from pydantic import BaseModel +from pipelinerl.world import Job +from tapeagents.core import Prompt +from tapeagents.llms.trainable import TrainableLLM +from tapeagents.remote_environment import AsyncRemoteEnvironment +from pipelinerl.async_llm import llm_async_generate, make_training_text +from tapeagents.orchestrator import async_execute_agent +from tapeagents.agent import DEFAULT, Agent +from hydra.utils import instantiate +from tapeagents.core import StopStep, Tape +from tapeagents.dialog_tape import UserStep + +from pipelinerl.domains.math import verify_answer_rpc, RewardTable, get_reward +from pipelinerl.rollouts import RolloutResult, BaseMetrics + +logger = logging.getLogger(__name__) + + + +async def generate_math_rollout2( + cfg: DictConfig, + llm: TrainableLLM, + problem: dict, + session: aiohttp.ClientSession, +) -> RolloutResult: + # (1) Choose a random environment server + start = time.perf_counter() + env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] + math_job, mcp_jobs = env_jobs[:1], env_jobs[1:] + # choose the env job randomly + mcp_job = random.choice(mcp_jobs) + assert mcp_job.port is not None + mcp_job_url = f"http://{mcp_job.hostname}:{mcp_job.port}" + environment = AsyncRemoteEnvironment(server_url=mcp_job_url) # type: ignore + async with environment.acontext(session, wait_for_env=True) as env: + actions = await env.a_actions() + tools_description = await env.a_tools_description() + logger.debug(f"Available tools: {tools_description}") + agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) + agent.llms = {DEFAULT: llm} + tape = Tape(steps=[UserStep(content=problem["task"])]) + tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) + + reward_table = RewardTable(**dict(cfg.rewards)) + answer_status = 
await verify_answer_rpc( + session=session, + host=math_job.hostname, + port=math_job.port, + prediction=llm_call.output.content, + gold=problem["answer"], + strict=True, + ) + reward = get_reward(answer_status, tape.finished, reward_table) + + metrics = BaseMetrics( + reward=reward, + success=answer_status == "correct", + no_error=answer_status != "unparsable", + no_answer=answer_status == "no_answer", + ) + + training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] + for text in training_texts: + text.reward = reward + latency = time.perf_counter() - start + return RolloutResult( + training_texts=training_texts, + metrics=metrics, + latency=latency, + dataset_name=problem["dataset"], + ) From ca8516b6b9d9b4d98a9e028298613657e50b054f Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Sat, 16 Aug 2025 00:01:58 +0000 Subject: [PATCH 010/126] fix the env server --- conf/tir_mcp.yaml | 11 +++++----- pipelinerl/domains/tir_mcp/env_server.py | 5 ++++- pipelinerl/domains/tir_mcp/rollouts.py | 28 +++++++++++++++++------- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 45596607..f4e95376 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -42,12 +42,11 @@ agent: _target_: tapeagents.agent.Agent name : mcp_agent max_iterations: 2 + store_llm_calls: true templates: system_prompt: | - You are an expert AI Agent trained to assist users with complex information processing tasks. - Your role is to understand user queries and respond in a helpful and accurate manner. - Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. - Do not express emotions or opinions about user questions. + You are an math and code expert AI Agent. + Please reason step by step, and put your final answer within \boxed{}. allowed_tools: | You have access to the following tools: {tools_description} @@ -116,4 +115,6 @@ agent: 3. Check if the current plan step is finished. 4. If the step is finished, update the following steps of the plan with new information and choose the next step. 
${agent.templates.thought_format} - next_node: select \ No newline at end of file + next_node: select + +model_path: Qwen/Qwen3-8B \ No newline at end of file diff --git a/pipelinerl/domains/tir_mcp/env_server.py b/pipelinerl/domains/tir_mcp/env_server.py index 53259069..8c265549 100644 --- a/pipelinerl/domains/tir_mcp/env_server.py +++ b/pipelinerl/domains/tir_mcp/env_server.py @@ -3,6 +3,9 @@ from omegaconf import OmegaConf from typing import List + +from pipelinerl.domains.math import MathEnvironment + class MCPEnvironmentServer: def __init__(self, @@ -40,5 +43,5 @@ def launch(self, port: int): "tools_whitelist": self.mcp_tools_whitelist, })) else: - MathEnvironment.launch(port) + MathEnvironment().launch(port) diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index 3a85804a..43e404c3 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -15,6 +15,7 @@ from hydra.utils import instantiate from tapeagents.core import StopStep, Tape from tapeagents.dialog_tape import UserStep +from tapeagents.core import LLMCall from pipelinerl.domains.math import verify_answer_rpc, RewardTable, get_reward from pipelinerl.rollouts import RolloutResult, BaseMetrics @@ -32,7 +33,7 @@ async def generate_math_rollout2( # (1) Choose a random environment server start = time.perf_counter() env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] - math_job, mcp_jobs = env_jobs[:1], env_jobs[1:] + math_job, mcp_jobs = env_jobs[0], env_jobs[1:] # choose the env job randomly mcp_job = random.choice(mcp_jobs) assert mcp_job.port is not None @@ -48,15 +49,31 @@ async def generate_math_rollout2( tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) reward_table = RewardTable(**dict(cfg.rewards)) + + + llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] + llm_calls: list[LLMCall] = [ + LLMCall(**step.metadata.other["llm_call"]) + if isinstance(step.metadata.other["llm_call"], dict) + else step.metadata.other["llm_call"] + for step in llm_calls + ] + assert len(llm_calls) > 0, "No LLM calls found" + training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] answer_status = await verify_answer_rpc( session=session, host=math_job.hostname, port=math_job.port, - prediction=llm_call.output.content, + prediction=llm_calls[-1].output.content, gold=problem["answer"], strict=True, ) - reward = get_reward(answer_status, tape.finished, reward_table) + tape_finished = True # TODO + reward = get_reward(answer_status, tape_finished, reward_table) + for text in training_texts: + text.reward = reward + + latency = time.perf_counter() - start metrics = BaseMetrics( reward=reward, @@ -64,11 +81,6 @@ async def generate_math_rollout2( no_error=answer_status != "unparsable", no_answer=answer_status == "no_answer", ) - - training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] - for text in training_texts: - text.reward = reward - latency = time.perf_counter() - start return RolloutResult( training_texts=training_texts, metrics=metrics, From f3af1bc9cff436edef31e379581198df3f737477 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Sat, 16 Aug 2025 00:05:00 +0000 Subject: [PATCH 011/126] tweak prompt --- conf/tir_mcp.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index f4e95376..67b025ec 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -45,8 
+45,11 @@ agent: store_llm_calls: true templates: system_prompt: | - You are an math and code expert AI Agent. - Please reason step by step, and put your final answer within \boxed{}. + You are an expert AI Agent trained to assist users with complex information processing tasks. + Your role is to understand user queries and respond in a helpful and accurate manner. + Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. + Do not express emotions or opinions about user questions. + Put your final answer within \boxed{}. allowed_tools: | You have access to the following tools: {tools_description} From 5b10c33e2be57c6f64b7eedae23538c505787fa8 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Sat, 16 Aug 2025 16:49:20 +0000 Subject: [PATCH 012/126] upd --- conf/tir_mcp.yaml | 20 ++++++++------------ pipelinerl/domains/tir_mcp/rollouts.py | 10 ++++++---- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 67b025ec..555523d9 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -48,8 +48,7 @@ agent: You are an expert AI Agent trained to assist users with complex information processing tasks. Your role is to understand user queries and respond in a helpful and accurate manner. Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. - Do not express emotions or opinions about user questions. - Put your final answer within \boxed{}. + Do not express emotions or opinions about user questions. You must use the python tool for computation. allowed_tools: | You have access to the following tools: {tools_description} @@ -65,23 +64,20 @@ agent: If the last action does not change the observation, do not repeat it! DO NOT OUTPUT ANYTHING BESIDES THE JSON! DO NOT PLACE ANY COMMENTS INSIDE THE JSON. It will break the system that processes the output. + nodes: - _target_: tapeagents.nodes.StandardNode name: plan system_prompt: ${agent.templates.system_prompt} guidance: | - Write a concise multi-step plan explaining which steps should be performed to find the answer for the given task. - Be specific about how each step should be performed. Only describe the intended actions here, do not perform them yet. - Consider that next steps may depend on results of previous steps, so include conditional branching using "if" statements where needed. - Start with the title "Plan". Every step should have short name and description. - ${agent.templates.thought_format} + Use python to compute the correct answer steps_prompt: ${agent.templates.allowed_tools} - _target_: tapeagents.nodes.StandardNode name: select system_prompt: ${agent.templates.system_prompt} - trim_obs_except_last_n: 1 + trim_obs_except_last_n: 100 guidance: | Select the next step to do to move forward with the plan. Describe the expected effect of the proposed action. ${agent.templates.thought_format} @@ -90,8 +86,8 @@ agent: - _target_: tapeagents.nodes.StandardNode name: act system_prompt: ${agent.templates.system_prompt} - trim_obs_except_last_n: 1 - guidance: Then produce single function call for the next step. If the answer is ready, call GaiaAnswer. + trim_obs_except_last_n: 100 + guidance: Then produce single function call for the next step. If the answer is ready, call GaiaAnswer. Put your final answer within \boxed{}. 
steps: - examples.gaia_agent.steps.GaiaAnswer use_known_actions: true @@ -100,7 +96,7 @@ agent: - _target_: tapeagents.nodes.StandardNode name: summarize system_prompt: ${agent.templates.system_prompt} - trim_obs_except_last_n: 1 + trim_obs_except_last_n: 100 guidance: | Summarize last observation. If its an image, thoroughly describe it with all details. Describe the results of the last action and observed changes @@ -111,7 +107,7 @@ agent: - _target_: tapeagents.nodes.StandardNode name: reflect system_prompt: ${agent.templates.system_prompt} - trim_obs_except_last_n: 1 + trim_obs_except_last_n: 100 guidance: | 1. Evaluate the action's success, explain its effect on current step, overall plan and task solution. 2. If the last action was not successful, describe errors and the possible reasons for failure. diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index 43e404c3..5dd20104 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -45,18 +45,20 @@ async def generate_math_rollout2( logger.debug(f"Available tools: {tools_description}") agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) agent.llms = {DEFAULT: llm} - tape = Tape(steps=[UserStep(content=problem["task"])]) + + tape = Tape(steps=[ + #UserStep(content=f"{problem['task']}. You have access to the following tools: {tools_description}") + UserStep(content=f"Use run_python_code to compute 32+45") + ]) tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) reward_table = RewardTable(**dict(cfg.rewards)) - - llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] llm_calls: list[LLMCall] = [ LLMCall(**step.metadata.other["llm_call"]) if isinstance(step.metadata.other["llm_call"], dict) else step.metadata.other["llm_call"] - for step in llm_calls + for step in tape.steps if step.metadata.other.get("llm_call") is not None ] assert len(llm_calls) > 0, "No LLM calls found" training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] From d2e6d09deb3a2efddd79d8e42d14ac2ca8e6101f Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 14:33:11 +0000 Subject: [PATCH 013/126] clean up --- conf/tir_mcp.yaml | 4 ++-- pipelinerl/domains/math/rollouts.py | 12 ++++++------ pipelinerl/domains/tir_mcp/rollouts.py | 12 +++++++++--- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 555523d9..03fd7699 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -37,11 +37,11 @@ environment: math_target: pipelinerl.domains.math.MathEnvironment -agent_max_loops: 2 +agent_max_loops: 3 agent: _target_: tapeagents.agent.Agent name : mcp_agent - max_iterations: 2 + max_iterations: 5 store_llm_calls: true templates: system_prompt: | diff --git a/pipelinerl/domains/math/rollouts.py b/pipelinerl/domains/math/rollouts.py index cdb7ba2a..7bc21a8f 100644 --- a/pipelinerl/domains/math/rollouts.py +++ b/pipelinerl/domains/math/rollouts.py @@ -33,17 +33,17 @@ def get_reward(answer_status: str, finished: bool, reward_table: RewardTable) -> case ("wrong", True): return reward_table.wrong_answer_finished case ("no_answer", False): - reward = reward_table.no_answer_not_finished + return reward_table.no_answer_not_finished case ("no_answer", True): - reward = reward_table.no_answer_finished + return reward_table.no_answer_finished case ("unparsable", False): - reward = 
reward_table.unparsable_not_finished + return reward_table.unparsable_not_finished case ("unparsable", True): - reward = reward_table.unparsable_finished + return reward_table.unparsable_finished case ("correct", False): - reward = reward_table.correct_answer_not_finished + return reward_table.correct_answer_not_finished case ("correct", True): - reward = reward_table.correct_answer_finished + return reward_table.correct_answer_finished case _: raise ValueError(f"Invalid answer_status/finished combination: {answer_status}/{trace.finished}") diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index 5dd20104..fcdba1f3 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -23,6 +23,9 @@ logger = logging.getLogger(__name__) +class Metrics(BaseMetrics): + num_tool_calls: int + num_python_calls: int async def generate_math_rollout2( cfg: DictConfig, @@ -47,8 +50,7 @@ async def generate_math_rollout2( agent.llms = {DEFAULT: llm} tape = Tape(steps=[ - #UserStep(content=f"{problem['task']}. You have access to the following tools: {tools_description}") - UserStep(content=f"Use run_python_code to compute 32+45") + UserStep(content=f"{problem['task']}. You have access to the following tools: {tools_description}") ]) tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) @@ -60,6 +62,7 @@ async def generate_math_rollout2( else step.metadata.other["llm_call"] for step in tape.steps if step.metadata.other.get("llm_call") is not None ] + num_tool_call = len([llm_call for llm_call in llm_calls if llm_call.output.tool_calls]) assert len(llm_calls) > 0, "No LLM calls found" training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] answer_status = await verify_answer_rpc( @@ -77,12 +80,15 @@ async def generate_math_rollout2( latency = time.perf_counter() - start - metrics = BaseMetrics( + metrics = Metrics( reward=reward, success=answer_status == "correct", no_error=answer_status != "unparsable", no_answer=answer_status == "no_answer", + num_tool_calls=num_tool_call, + num_python_calls=len([llm_call for llm_call in llm_calls if llm_call.output.tool_calls and llm_call.output.tool_calls[0].function.name != "GaiaAnswer"]) ) + return RolloutResult( training_texts=training_texts, metrics=metrics, From 228cb42d8f389a4102b627c1776967d3f9c383de Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 18:33:55 +0000 Subject: [PATCH 014/126] hard code dino --- conf/mcp/python.json | 2 +- conf/tir_mcp.yaml | 8 ++++-- pipelinerl/domains/math/rollouts.py | 2 +- pipelinerl/domains/tir_mcp/rollouts.py | 37 +++++++++++++++++++++----- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 50ccbe69..f9ff1a04 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -1,7 +1,7 @@ { "mcpServers": { "python_exec": { - "command": "deno", + "command": "/home/toolkit/.deno/bin/deno", "args": [ "run", "-N", diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 03fd7699..1b846b60 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -5,6 +5,7 @@ defaults: actor: rollout_policy: pipelinerl.domains.tir_mcp.generate_math_rollout2 system_prompt: Please reason step by step, and put your final answer within \boxed{}. 
+ llm_max_rollouts: 8 task_template: |- {task} @@ -17,6 +18,9 @@ test_dataset_names: - amc_2023 - math_500 +world: + env_replicas: 16 + vllm_config: use_v1: true vllm_kwargs: @@ -37,11 +41,11 @@ environment: math_target: pipelinerl.domains.math.MathEnvironment -agent_max_loops: 3 +agent_max_loops: 1 agent: _target_: tapeagents.agent.Agent name : mcp_agent - max_iterations: 5 + max_iterations: 4 store_llm_calls: true templates: system_prompt: | diff --git a/pipelinerl/domains/math/rollouts.py b/pipelinerl/domains/math/rollouts.py index 7bc21a8f..7f370214 100644 --- a/pipelinerl/domains/math/rollouts.py +++ b/pipelinerl/domains/math/rollouts.py @@ -96,7 +96,7 @@ async def generate_math_rollout( # Apply discount factor based on output length reward *= discount_factor**llm_call.output_length_tokens overlong_penalty = 0 - if rewards.buffer_tokens > 0: + if reward_table.buffer_tokens > 0: overlong_penalty = length_penalty(llm.parameters['max_tokens'], llm_call.output_length_tokens, rewards.buffer_tokens) reward += overlong_penalty trace.reward = reward diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index fcdba1f3..bd984337 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -1,6 +1,8 @@ import time import random import logging +from collections import Counter +from typing import List, Dict import aiohttp from omegaconf import DictConfig @@ -23,9 +25,29 @@ logger = logging.getLogger(__name__) +def count_tool_calls_by_category(llm_calls: List[LLMCall]) -> Dict[str, int]: + """ + Count the number of tool calls for each function name category. + + Args: + llm_calls: List of LLMCall objects + + Returns: + Dictionary mapping function names to their counts + """ + tool_call_names = [] + + for llm_call in llm_calls: + if llm_call.output.tool_calls: + for tool_call in llm_call.output.tool_calls: + tool_call_names.append(tool_call.function.name) + + return dict(Counter(tool_call_names)) + + class Metrics(BaseMetrics): - num_tool_calls: int - num_python_calls: int + num_python_calls: int = 0 + num_steps: int = 0 async def generate_math_rollout2( cfg: DictConfig, @@ -62,14 +84,13 @@ async def generate_math_rollout2( else step.metadata.other["llm_call"] for step in tape.steps if step.metadata.other.get("llm_call") is not None ] - num_tool_call = len([llm_call for llm_call in llm_calls if llm_call.output.tool_calls]) assert len(llm_calls) > 0, "No LLM calls found" training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] answer_status = await verify_answer_rpc( session=session, host=math_job.hostname, - port=math_job.port, - prediction=llm_calls[-1].output.content, + port=math_job.port, # type: ignore + prediction=llm_calls[-1].output.content, # type: ignore gold=problem["answer"], strict=True, ) @@ -80,13 +101,15 @@ async def generate_math_rollout2( latency = time.perf_counter() - start + tool_call_counts = count_tool_calls_by_category(llm_calls) + metrics = Metrics( reward=reward, success=answer_status == "correct", no_error=answer_status != "unparsable", no_answer=answer_status == "no_answer", - num_tool_calls=num_tool_call, - num_python_calls=len([llm_call for llm_call in llm_calls if llm_call.output.tool_calls and llm_call.output.tool_calls[0].function.name != "GaiaAnswer"]) + num_steps=len(tape.steps), + num_python_calls=tool_call_counts.get("run_python_code", 0), ) return RolloutResult( From fdf3c830f185ce23d31ebcbd7b6969dee0c8e1cc Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 
Aug 2025 18:44:23 +0000 Subject: [PATCH 015/126] less envs --- conf/tir_mcp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 1b846b60..ef4ef28d 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -19,7 +19,7 @@ test_dataset_names: - math_500 world: - env_replicas: 16 + env_replicas: 3 vllm_config: use_v1: true From 1165397c6482d166a3ab174105d18e0a491b8004 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 18:56:20 +0000 Subject: [PATCH 016/126] less envs --- conf/tir_mcp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index ef4ef28d..2cdf6602 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -29,7 +29,7 @@ vllm_config: environment: _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer - n_envs: 8 + n_envs: 2 n_envs_mcp: 7 n_envs_math: 1 host: localhost From 40a144aa716661848087fb9fea784288194b323b Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 19:18:09 +0000 Subject: [PATCH 017/126] longer timeout --- conf/tir_mcp.yaml | 1 + pipelinerl/domains/tir_mcp/env_server.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 2cdf6602..305aa610 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -39,6 +39,7 @@ environment: mcp_tools_whitelist: - run_python_code math_target: pipelinerl.domains.math.MathEnvironment + mcp_read_timeout_seconds: 300 agent_max_loops: 1 diff --git a/pipelinerl/domains/tir_mcp/env_server.py b/pipelinerl/domains/tir_mcp/env_server.py index 8c265549..e1662990 100644 --- a/pipelinerl/domains/tir_mcp/env_server.py +++ b/pipelinerl/domains/tir_mcp/env_server.py @@ -19,6 +19,7 @@ def __init__(self, math_target: str, exp_path: str, env_call_timeout: int = 60, + mcp_read_timeout_seconds: int = 10, ): # Remote environment server configuration self.n_envs = n_envs @@ -29,6 +30,7 @@ def __init__(self, self.mcp_config_path = mcp_config_path self.mcp_tools_whitelist = mcp_tools_whitelist self.exp_path = exp_path + self.mcp_read_timeout_seconds = mcp_read_timeout_seconds def launch(self, port: int): @@ -41,6 +43,7 @@ def launch(self, port: int): "_target_": self.mcp_target, "config_path": self.mcp_config_path, "tools_whitelist": self.mcp_tools_whitelist, + "read_timeout_seconds": self.mcp_read_timeout_seconds, })) else: MathEnvironment().launch(port) From 2d25d8870d9f3e3ae2c0048aedca5c3ace046efd Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 19:30:43 +0000 Subject: [PATCH 018/126] longer seq length --- conf/tir_mcp.yaml | 4 ++++ pipelinerl/preprocess.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 305aa610..5974132b 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -9,6 +9,10 @@ actor: task_template: |- {task} +finetune: + seq_length: 50000 + seq_parallel: 4 + dataset_loader: pipelinerl.domains.math.load_datasets train_dataset_names: - open_reasoner_zero_57k diff --git a/pipelinerl/preprocess.py b/pipelinerl/preprocess.py index 65e29b4b..5f2b4af5 100644 --- a/pipelinerl/preprocess.py +++ b/pipelinerl/preprocess.py @@ -573,6 +573,10 @@ def run_preprocessing_loop( sample_length = len(entry["input_ids"]) if current_length + sample_length > cfg.finetune.seq_length: + if len(current_batch) == 0: + raise ValueError( + f"sample_length is {sample_length}, but cfg.finetune.seq_length is {cfg.finetune.seq_length}" + ) time_to_write = True break # Current micro batch is full From 
20361677a2b560f34123d6af84c290f079c8e6fb Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 19:48:45 +0000 Subject: [PATCH 019/126] more envs --- conf/tir_mcp.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 5974132b..24d5b977 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -10,7 +10,7 @@ actor: {task} finetune: - seq_length: 50000 + seq_length: 60000 seq_parallel: 4 dataset_loader: pipelinerl.domains.math.load_datasets @@ -23,7 +23,7 @@ test_dataset_names: - math_500 world: - env_replicas: 3 + env_replicas: 16 vllm_config: use_v1: true @@ -33,7 +33,7 @@ vllm_config: environment: _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer - n_envs: 2 + n_envs: 8 n_envs_mcp: 7 n_envs_math: 1 host: localhost From 664b53968520b6c5428b968b4b26be8de2778e77 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 19:52:03 +0000 Subject: [PATCH 020/126] more llms per actor --- conf/tir_mcp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 24d5b977..5a26861a 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -5,7 +5,7 @@ defaults: actor: rollout_policy: pipelinerl.domains.tir_mcp.generate_math_rollout2 system_prompt: Please reason step by step, and put your final answer within \boxed{}. - llm_max_rollouts: 8 + llm_max_rollouts: 64 task_template: |- {task} From 4b0db03d50827b8535a7bb090cec297179caa6d5 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 19:59:57 +0000 Subject: [PATCH 021/126] even more envs --- conf/tir_mcp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 5a26861a..44db4fbe 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -33,7 +33,7 @@ vllm_config: environment: _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer - n_envs: 8 + n_envs: 16 n_envs_mcp: 7 n_envs_math: 1 host: localhost From 63d40924642dd56fabfd00b905373a8bb90758a0 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 20:08:45 +0000 Subject: [PATCH 022/126] longer timeout and revert prompt --- conf/tir_mcp.yaml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 44db4fbe..e6cb0349 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -43,7 +43,7 @@ environment: mcp_tools_whitelist: - run_python_code math_target: pipelinerl.domains.math.MathEnvironment - mcp_read_timeout_seconds: 300 + mcp_read_timeout_seconds: 3000 agent_max_loops: 1 @@ -57,7 +57,7 @@ agent: You are an expert AI Agent trained to assist users with complex information processing tasks. Your role is to understand user queries and respond in a helpful and accurate manner. Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. - Do not express emotions or opinions about user questions. You must use the python tool for computation. + Do not express emotions or opinions about user questions. allowed_tools: | You have access to the following tools: {tools_description} @@ -80,7 +80,11 @@ agent: name: plan system_prompt: ${agent.templates.system_prompt} guidance: | - Use python to compute the correct answer + Write a concise multi-step plan explaining which steps should be performed to find the answer for the given task. + Be specific about how each step should be performed. Only describe the intended actions here, do not perform them yet. 
+ Consider that next steps may depend on results of previous steps, so include conditional branching using "if" statements where needed. + Start with the title "Plan". Every step should have short name and description. + ${agent.templates.thought_format} steps_prompt: ${agent.templates.allowed_tools} - _target_: tapeagents.nodes.StandardNode From 6d81456d544734bcc26bf8dd65b78d73728b4702 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 20:23:45 +0000 Subject: [PATCH 023/126] retry task --- pipelinerl/domains/tir_mcp/rollouts.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index bd984337..a22b690a 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -1,3 +1,4 @@ +import asyncio import time import random import logging @@ -74,7 +75,12 @@ async def generate_math_rollout2( tape = Tape(steps=[ UserStep(content=f"{problem['task']}. You have access to the following tools: {tools_description}") ]) - tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) + while True: + try: + tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) + break + except Exception as e: + await asyncio.sleep(5) reward_table = RewardTable(**dict(cfg.rewards)) From 373b0ac16e8ac282dfd877561b1ce1229c01931d Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 21:18:37 +0000 Subject: [PATCH 024/126] pid deno module --- conf/mcp/python.json | 6 +++--- conf/tir_mcp.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index f9ff1a04..f6e79890 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -5,9 +5,9 @@ "args": [ "run", "-N", - "-R=node_modules", - "-W=node_modules", - "--node-modules-dir=auto", + "-R=.mcp_node_modules_$$", + "-W=.mcp_node_modules_$$", + "--node-modules-dir=.mcp_node_modules_$$", "jsr:@pydantic/mcp-run-python", "stdio" ] diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index e6cb0349..9856e5c0 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -23,7 +23,7 @@ test_dataset_names: - math_500 world: - env_replicas: 16 + env_replicas: 64 vllm_config: use_v1: true From e2de76821eb69fa38490c0fc88026332d02bd369 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 21:30:01 +0000 Subject: [PATCH 025/126] diff deno tmp dir --- conf/mcp/python.json | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index f6e79890..59531b79 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -5,12 +5,13 @@ "args": [ "run", "-N", - "-R=.mcp_node_modules_$$", - "-W=.mcp_node_modules_$$", - "--node-modules-dir=.mcp_node_modules_$$", + "--node-modules-dir=auto", "jsr:@pydantic/mcp-run-python", "stdio" - ] + ], + "env": { + "DENO_DIR": "/tmp/deno_cache_mcp_python_$$" + } } } } \ No newline at end of file From 763b594d860184cdde9605c530e405e28ab1df4c Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 21:33:55 +0000 Subject: [PATCH 026/126] none node modules --- conf/mcp/python.json | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 59531b79..44699388 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -5,13 +5,10 @@ "args": [ "run", "-N", - "--node-modules-dir=auto", + "--node-modules-dir=none", 
"jsr:@pydantic/mcp-run-python", "stdio" - ], - "env": { - "DENO_DIR": "/tmp/deno_cache_mcp_python_$$" - } + ] } } } \ No newline at end of file From 07835700ebf21a6d883fd46001452ca1691ed98b Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Tue, 19 Aug 2025 02:07:36 +0000 Subject: [PATCH 027/126] bigger timeout --- conf/tir_mcp.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 9856e5c0..0506a0bf 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -10,7 +10,7 @@ actor: {task} finetune: - seq_length: 60000 + seq_length: 48000 seq_parallel: 4 dataset_loader: pipelinerl.domains.math.load_datasets @@ -23,7 +23,7 @@ test_dataset_names: - math_500 world: - env_replicas: 64 + env_replicas: 5 vllm_config: use_v1: true @@ -33,7 +33,7 @@ vllm_config: environment: _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer - n_envs: 16 + n_envs: 8 n_envs_mcp: 7 n_envs_math: 1 host: localhost @@ -43,6 +43,7 @@ environment: mcp_tools_whitelist: - run_python_code math_target: pipelinerl.domains.math.MathEnvironment + env_call_timeout: 600 # Increased from default 60s to 10 minutes mcp_read_timeout_seconds: 3000 From b284fcb43523474c8d926a06951bea9187f0afd2 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Tue, 19 Aug 2025 10:26:09 +0000 Subject: [PATCH 028/126] diff temp dir for each mcp --- conf/mcp/python.json | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 44699388..b0881201 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -1,13 +1,10 @@ { "mcpServers": { "python_exec": { - "command": "/home/toolkit/.deno/bin/deno", + "command": "bash", "args": [ - "run", - "-N", - "--node-modules-dir=none", - "jsr:@pydantic/mcp-run-python", - "stdio" + "-c", + "mkdir -p /tmp/mcp_work_$$ && cd /tmp/mcp_work_$$ && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio" ] } } From eb48d90ca945af5f6490829496dece843708edcd Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Tue, 19 Aug 2025 17:40:43 +0000 Subject: [PATCH 029/126] 0.0.0.0 --- conf/tir_mcp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 0506a0bf..3d2c9678 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -36,7 +36,7 @@ environment: n_envs: 8 n_envs_mcp: 7 n_envs_math: 1 - host: localhost + host: "0.0.0.0" exp_path: ${output_dir}/env_server mcp_target: tapeagents.mcp.MCPEnvironment mcp_config_path: /home/toolkit/research-now-reasoner/pipelinerl/conf/mcp/python.json From efa271767ff7883bdb352cbdc548a6abe6210037 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Tue, 19 Aug 2025 19:04:36 +0000 Subject: [PATCH 030/126] filter based on port --- pipelinerl/domains/tir_mcp/rollouts.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index a22b690a..417a214c 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -58,10 +58,11 @@ async def generate_math_rollout2( ) -> RolloutResult: # (1) Choose a random environment server start = time.perf_counter() - env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] - math_job, mcp_jobs = env_jobs[0], env_jobs[1:] + mcp_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment" and job["port"] != 7777] + math_jobs = [Job(**job) for job in 
cfg.jobs if job["kind"] == "environment" and job["port"] == 7777] # choose the env job randomly mcp_job = random.choice(mcp_jobs) + math_job = random.choice(math_jobs) assert mcp_job.port is not None mcp_job_url = f"http://{mcp_job.hostname}:{mcp_job.port}" environment = AsyncRemoteEnvironment(server_url=mcp_job_url) # type: ignore From 3d86a28c92edd74c3a3dd6d4bddbdca766b2454b Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 15:01:04 +0000 Subject: [PATCH 031/126] change port to 7778 --- pipelinerl/domains/tir_mcp/env_server.py | 2 +- pipelinerl/domains/tir_mcp/rollouts.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelinerl/domains/tir_mcp/env_server.py b/pipelinerl/domains/tir_mcp/env_server.py index e1662990..d2be2dd8 100644 --- a/pipelinerl/domains/tir_mcp/env_server.py +++ b/pipelinerl/domains/tir_mcp/env_server.py @@ -37,7 +37,7 @@ def launch(self, port: int): """ Serve the environment in TapeAgent. """ - if port != 7777: + if port != 7778: env_server = EnvironmentServer(n_envs=self.n_envs, host=self.host, port=port, env_call_timeout=self.env_call_timeout) env_server.launch(OmegaConf.create({ "_target_": self.mcp_target, diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index 417a214c..27a15b71 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -58,8 +58,8 @@ async def generate_math_rollout2( ) -> RolloutResult: # (1) Choose a random environment server start = time.perf_counter() - mcp_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment" and job["port"] != 7777] - math_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment" and job["port"] == 7777] + mcp_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment" and job["port"] != 7778] + math_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment" and job["port"] == 7778] # choose the env job randomly mcp_job = random.choice(mcp_jobs) math_job = random.choice(math_jobs) From 96a75c176bcfe9dd60a1a8573865f476a30b8d98 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 16:39:36 +0000 Subject: [PATCH 032/126] mcp and verify server --- pipelinerl/domains/tir_mcp/env_server.py | 78 ++++++++++++++++++++---- 1 file changed, 66 insertions(+), 12 deletions(-) diff --git a/pipelinerl/domains/tir_mcp/env_server.py b/pipelinerl/domains/tir_mcp/env_server.py index d2be2dd8..d1f14961 100644 --- a/pipelinerl/domains/tir_mcp/env_server.py +++ b/pipelinerl/domains/tir_mcp/env_server.py @@ -2,9 +2,61 @@ from tapeagents.remote_environment import EnvironmentServer from omegaconf import OmegaConf from typing import List +from fastapi import HTTPException +from pydantic import BaseModel +import logging +import asyncio +from concurrent.futures import ProcessPoolExecutor +from functools import partial +from pipelinerl.domains.math.verifier_api import verify_answer + +logger = logging.getLogger(__name__) + + +class EnvironmentServerWithVerifier(EnvironmentServer): + """Environment server that includes the verify_answer endpoint.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.process_pool = ProcessPoolExecutor(max_workers=4) + + def create_app(self): + app = super().create_app() + + class VerifyAnswerRequest(BaseModel): + prediction: str + gold: str + strict: bool = True + max_prediction_length: int = 1000 + + @app.post("/verify_answer") + async def verify_answer_endpoint(request: VerifyAnswerRequest): + try: + # 
Run verification in the process pool to avoid blocking the main thread + loop = asyncio.get_event_loop() + answer_status = await loop.run_in_executor( + self.process_pool, + partial( + verify_answer, + request.prediction, + request.gold, + request.strict, + request.max_prediction_length + ) + ) + return {"answer_status": answer_status} + except Exception as e: + logger.exception(f"Error in verify_answer: {e}") + raise HTTPException(status_code=500, detail=f"Error verifying answer: {str(e)}") + + return app + + def shutdown(self): + super().shutdown() + if hasattr(self, 'process_pool'): + self.process_pool.shutdown(wait=True) -from pipelinerl.domains.math import MathEnvironment class MCPEnvironmentServer: @@ -35,16 +87,18 @@ def __init__(self, def launch(self, port: int): """ - Serve the environment in TapeAgent. + Serve the environment in TapeAgent with verify_answer endpoint. """ - if port != 7778: - env_server = EnvironmentServer(n_envs=self.n_envs, host=self.host, port=port, env_call_timeout=self.env_call_timeout) - env_server.launch(OmegaConf.create({ - "_target_": self.mcp_target, - "config_path": self.mcp_config_path, - "tools_whitelist": self.mcp_tools_whitelist, - "read_timeout_seconds": self.mcp_read_timeout_seconds, - })) - else: - MathEnvironment().launch(port) + env_server = EnvironmentServerWithVerifier( + n_envs=self.n_envs, + host=self.host, + port=port, + env_call_timeout=self.env_call_timeout + ) + env_server.launch(OmegaConf.create({ + "_target_": self.mcp_target, + "config_path": self.mcp_config_path, + "tools_whitelist": self.mcp_tools_whitelist, + "read_timeout_seconds": self.mcp_read_timeout_seconds, + })) From 0b4c9922fafcfe8984a58dd804fe2917b2f31a92 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 17:37:46 +0000 Subject: [PATCH 033/126] use custom parser --- conf/tir_mcp.yaml | 11 +- pipelinerl/domains/tir_mcp/rollouts.py | 16 ++- pipelinerl/rl_tool_parser_plugin.py | 141 +++++++++++++++++++++++++ 3 files changed, 158 insertions(+), 10 deletions(-) create mode 100644 pipelinerl/rl_tool_parser_plugin.py diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 3d2c9678..c3b2658a 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -2,6 +2,14 @@ defaults: - base - _self_ +llm: + stop: + - "" + +test_llm: + stop: + - "" + actor: rollout_policy: pipelinerl.domains.tir_mcp.generate_math_rollout2 system_prompt: Please reason step by step, and put your final answer within \boxed{}. 
@@ -29,7 +37,8 @@ vllm_config: use_v1: true vllm_kwargs: enable-auto-tool-choice: "" - tool-call-parser: hermes + tool-call-parser: rl_tool + tool-parser-plugin: /home/toolkit/research-now-reasoner/pipelinerl/pipelinerl/rl_tool_parser_plugin.py environment: _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index 27a15b71..f0c751f3 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -58,14 +58,12 @@ async def generate_math_rollout2( ) -> RolloutResult: # (1) Choose a random environment server start = time.perf_counter() - mcp_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment" and job["port"] != 7778] - math_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment" and job["port"] == 7778] + env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] # choose the env job randomly - mcp_job = random.choice(mcp_jobs) - math_job = random.choice(math_jobs) - assert mcp_job.port is not None - mcp_job_url = f"http://{mcp_job.hostname}:{mcp_job.port}" - environment = AsyncRemoteEnvironment(server_url=mcp_job_url) # type: ignore + env_job = random.choice(env_jobs) + assert env_job.port is not None + env_job_url = f"http://{env_job.hostname}:{env_job.port}" + environment = AsyncRemoteEnvironment(server_url=env_job_url) # type: ignore async with environment.acontext(session, wait_for_env=True) as env: actions = await env.a_actions() tools_description = await env.a_tools_description() @@ -95,8 +93,8 @@ async def generate_math_rollout2( training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] answer_status = await verify_answer_rpc( session=session, - host=math_job.hostname, - port=math_job.port, # type: ignore + host=env_job.hostname, + port=env_job.port, # type: ignore prediction=llm_calls[-1].output.content, # type: ignore gold=problem["answer"], strict=True, diff --git a/pipelinerl/rl_tool_parser_plugin.py b/pipelinerl/rl_tool_parser_plugin.py new file mode 100644 index 00000000..23c67d66 --- /dev/null +++ b/pipelinerl/rl_tool_parser_plugin.py @@ -0,0 +1,141 @@ +""" +Tool parser plugin for RL tool calling format. +""" + +import json +import re +from typing import Any, Dict, List, Optional, Union, Sequence + +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser +from vllm.entrypoints.openai.tool_parsers import ToolParserManager +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ExtractedToolCallInformation, + ToolCall, + FunctionCall +) + + +@ToolParserManager.register_module("rl_tool") +class HermesRLToolParser(ToolParser): + """ + Tool parser for RL tool calling format using markers. + """ + + def __init__(self, tokenizer): + super().__init__(tokenizer) + + # Tool call markers + self.tool_call_start_token = "" + self.tool_call_end_token = "" + + # Regex pattern for parsing tool calls + self.tool_call_regex = re.compile( + r"(.*?)|(.*)", re.DOTALL + ) + + # State for streaming + self.current_tool_name_sent = False + self.prev_tool_call_arr = [] + self.current_tool_id = -1 + self.streamed_args_for_tool = [] + + def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation: + """ + Extract tool calls from the model output. 
+ + Args: + model_output: The raw model output string + request: The request object + + Returns: + ExtractedToolCallInformation with tool calls and metadata + """ + # Quick check to avoid unnecessary processing + if self.tool_call_start_token not in model_output: + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=model_output + ) + + try: + # Find all tool call matches + function_call_tuples = self.tool_call_regex.findall(model_output) + + # Parse JSON from matches + tool_calls = [] + for i, match in enumerate(function_call_tuples): + json_str = match[0] if match[0] else match[1] + try: + parsed_call = json.loads(json_str.strip()) + + tool_call = ToolCall( + id=f"call_{i}", + type="function", + function=FunctionCall( + name=parsed_call.get("name", ""), + arguments=json.dumps( + parsed_call.get("arguments", {}), + ensure_ascii=False + ) + ) + ) + tool_calls.append(tool_call) + except json.JSONDecodeError: + continue + + # Extract content before first tool call + content = model_output#[:model_output.find(self.tool_call_end_token)].strip() + if not content: + content = None + + return ExtractedToolCallInformation( + tools_called=bool(tool_calls), + tool_calls=tool_calls, + content=content + ) + + except Exception: + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=model_output + ) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + request + ) -> Optional[Dict[str, Any]]: + """ + Extract tool calls in streaming mode. + + Args: + previous_text: The previous text + current_text: The current complete text + delta_text: The new text delta + request: The request object + + Returns: + Dictionary with streaming tool call information + """ + # Simple streaming implementation + if self.tool_call_start_token not in current_text: + return {"content": delta_text} + + # Check if we're starting a new tool call + if self.tool_call_start_token in delta_text: + self.current_tool_id += 1 + return { + "tool_calls": [{ + "index": self.current_tool_id, + "type": "function", + "id": f"call_{self.current_tool_id}", + "function": {"name": ""} + }] + } + + return {"content": delta_text} \ No newline at end of file From 471d28d4e8f0338720756b27ed819d2a4a8613c3 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 17:44:34 +0000 Subject: [PATCH 034/126] relative path --- conf/tir_mcp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index c3b2658a..fa5cee77 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -38,7 +38,7 @@ vllm_config: vllm_kwargs: enable-auto-tool-choice: "" tool-call-parser: rl_tool - tool-parser-plugin: /home/toolkit/research-now-reasoner/pipelinerl/pipelinerl/rl_tool_parser_plugin.py + tool-parser-plugin: pipelinerl/rl_tool_parser_plugin.py environment: _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer From 8e0eeffc23865e78c8b0c22b858adc37e5e17100 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 17:56:12 +0000 Subject: [PATCH 035/126] test apth --- pipelinerl/launch.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelinerl/launch.py b/pipelinerl/launch.py index b03ab8d7..b0ec5785 100644 --- a/pipelinerl/launch.py +++ b/pipelinerl/launch.py @@ -18,8 +18,9 @@ logger = logging.getLogger(__name__) -# All the launch commands in this file pass the environment to child processes -os.environ["PYTHONPATH"] = f"/home/toolkit/TapeAgents" +# 
TODO: rm debug code +import tapeagents +logger.info(f"TapeAgents loaded from: {tapeagents.__file__}") os.environ["NCCL_CUMEM_ENABLE"] = "0" os.environ["TORCH_DISABLE_SHARE_RDZV_TCP_STORE"] = "1" os.environ["HF_DATASETS_DISABLE_PROGRESS_BARS"] = "1" From f93d7560e6a6449f9e6e8f7b947d9414c59d7e7b Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 18:02:21 +0000 Subject: [PATCH 036/126] typo --- pipelinerl/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/launch.py b/pipelinerl/launch.py index b0ec5785..26f26b70 100644 --- a/pipelinerl/launch.py +++ b/pipelinerl/launch.py @@ -20,7 +20,6 @@ # TODO: rm debug code import tapeagents -logger.info(f"TapeAgents loaded from: {tapeagents.__file__}") os.environ["NCCL_CUMEM_ENABLE"] = "0" os.environ["TORCH_DISABLE_SHARE_RDZV_TCP_STORE"] = "1" os.environ["HF_DATASETS_DISABLE_PROGRESS_BARS"] = "1" @@ -538,6 +537,7 @@ def main(cfg: DictConfig): processes = [] + logger.info(f"TapeAgents loaded from: {tapeagents.__file__}") lead_launcher_stream = SingleStreamSpec(exp_path=exp_dir, topic="launcher_0") init_msg = {"exp_init": "true"} if world_map.my_rank == 0: From 32e3eb62aa58af5e51c6546ce4980863dd987ceb Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 20:09:47 +0000 Subject: [PATCH 037/126] clean up --- conf/tir_mcp.yaml | 12 ++++++------ pipelinerl/domains/math/rollouts.py | 2 +- pipelinerl/domains/tir_mcp/__init__.py | 3 ++- pipelinerl/domains/tir_mcp/rollouts.py | 8 +++++--- pipelinerl/domains/tir_mcp/steps.py | 13 +++++++++++++ 5 files changed, 27 insertions(+), 11 deletions(-) create mode 100644 pipelinerl/domains/tir_mcp/steps.py diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index fa5cee77..11063092 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -11,7 +11,7 @@ test_llm: - "" actor: - rollout_policy: pipelinerl.domains.tir_mcp.generate_math_rollout2 + rollout_policy: pipelinerl.domains.tir_mcp.generate_mcp_rollout system_prompt: Please reason step by step, and put your final answer within \boxed{}. llm_max_rollouts: 64 task_template: |- @@ -41,7 +41,7 @@ vllm_config: tool-parser-plugin: pipelinerl/rl_tool_parser_plugin.py environment: - _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer + _target_: pipelinerl.domains.tir_mcp.MCPEnvironmentServer n_envs: 8 n_envs_mcp: 7 n_envs_math: 1 @@ -56,11 +56,11 @@ environment: mcp_read_timeout_seconds: 3000 -agent_max_loops: 1 +agent_max_loops: 3 agent: _target_: tapeagents.agent.Agent name : mcp_agent - max_iterations: 4 + max_iterations: 3 store_llm_calls: true templates: system_prompt: | @@ -110,9 +110,9 @@ agent: name: act system_prompt: ${agent.templates.system_prompt} trim_obs_except_last_n: 100 - guidance: Then produce single function call for the next step. If the answer is ready, call GaiaAnswer. Put your final answer within \boxed{}. + guidance: Then produce single function call for the next step. If the answer is ready, call MathAnswer. Put your final answer within \boxed{}. 
steps: - - examples.gaia_agent.steps.GaiaAnswer + - pipelinerl.domains.tir_mcp.steps.MathAnswer use_known_actions: true use_function_calls: true diff --git a/pipelinerl/domains/math/rollouts.py b/pipelinerl/domains/math/rollouts.py index 7f370214..c293b36f 100644 --- a/pipelinerl/domains/math/rollouts.py +++ b/pipelinerl/domains/math/rollouts.py @@ -45,7 +45,7 @@ def get_reward(answer_status: str, finished: bool, reward_table: RewardTable) -> case ("correct", True): return reward_table.correct_answer_finished case _: - raise ValueError(f"Invalid answer_status/finished combination: {answer_status}/{trace.finished}") + raise ValueError(f"Invalid answer_status/finished combination: {answer_status}/{finished}") def length_penalty(max_length: int, sequence_length: int, buffer_tokens: int) -> float: diff --git a/pipelinerl/domains/tir_mcp/__init__.py b/pipelinerl/domains/tir_mcp/__init__.py index c558147b..a47458a5 100644 --- a/pipelinerl/domains/tir_mcp/__init__.py +++ b/pipelinerl/domains/tir_mcp/__init__.py @@ -1 +1,2 @@ -from .rollouts import generate_math_rollout2 \ No newline at end of file +from .rollouts import generate_mcp_rollout +from .env_server import MCPEnvironmentServer \ No newline at end of file diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index f0c751f3..5ca29cb8 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -8,6 +8,7 @@ import aiohttp from omegaconf import DictConfig from pydantic import BaseModel +from pipelinerl.domains.tir_mcp.steps import MathAnswer from pipelinerl.world import Job from tapeagents.core import Prompt from tapeagents.llms.trainable import TrainableLLM @@ -50,7 +51,7 @@ class Metrics(BaseMetrics): num_python_calls: int = 0 num_steps: int = 0 -async def generate_math_rollout2( +async def generate_mcp_rollout( cfg: DictConfig, llm: TrainableLLM, problem: dict, @@ -90,6 +91,7 @@ async def generate_math_rollout2( for step in tape.steps if step.metadata.other.get("llm_call") is not None ] assert len(llm_calls) > 0, "No LLM calls found" + tool_call_counts = count_tool_calls_by_category(llm_calls) training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] answer_status = await verify_answer_rpc( session=session, @@ -99,14 +101,14 @@ async def generate_math_rollout2( gold=problem["answer"], strict=True, ) - tape_finished = True # TODO + # Tape should finish with an answer + tape_finished = True if isinstance(tape.steps[-1], MathAnswer) else False reward = get_reward(answer_status, tape_finished, reward_table) for text in training_texts: text.reward = reward latency = time.perf_counter() - start - tool_call_counts = count_tool_calls_by_category(llm_calls) metrics = Metrics( reward=reward, diff --git a/pipelinerl/domains/tir_mcp/steps.py b/pipelinerl/domains/tir_mcp/steps.py new file mode 100644 index 00000000..f33d6efa --- /dev/null +++ b/pipelinerl/domains/tir_mcp/steps.py @@ -0,0 +1,13 @@ +from typing import Any, Literal +from pydantic import Field +from tapeagents.core import StopStep + + +class MathAnswer(StopStep): + """ + Action that indicates the agent has finished solving a math problem. + The final answer must be contained within \\boxed{} format. 
+ """ + + kind: Literal["math_answer_action"] = "math_answer_action" + answer: Any = Field(description="Final answer in \\boxed{} format") \ No newline at end of file From 5a3ab0ee44ff8542025e2060844b0809e088b986 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 20:55:01 +0000 Subject: [PATCH 038/126] clean up --- pipelinerl/rl_tool_parser_plugin.py | 45 ++--------------------------- 1 file changed, 2 insertions(+), 43 deletions(-) diff --git a/pipelinerl/rl_tool_parser_plugin.py b/pipelinerl/rl_tool_parser_plugin.py index 23c67d66..194a5d87 100644 --- a/pipelinerl/rl_tool_parser_plugin.py +++ b/pipelinerl/rl_tool_parser_plugin.py @@ -85,15 +85,10 @@ def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) except json.JSONDecodeError: continue - # Extract content before first tool call - content = model_output#[:model_output.find(self.tool_call_end_token)].strip() - if not content: - content = None - return ExtractedToolCallInformation( tools_called=bool(tool_calls), tool_calls=tool_calls, - content=content + content=model_output ) except Exception: @@ -102,40 +97,4 @@ def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) tool_calls=[], content=model_output ) - - def extract_tool_calls_streaming( - self, - previous_text: str, - current_text: str, - delta_text: str, - request - ) -> Optional[Dict[str, Any]]: - """ - Extract tool calls in streaming mode. - - Args: - previous_text: The previous text - current_text: The current complete text - delta_text: The new text delta - request: The request object - - Returns: - Dictionary with streaming tool call information - """ - # Simple streaming implementation - if self.tool_call_start_token not in current_text: - return {"content": delta_text} - - # Check if we're starting a new tool call - if self.tool_call_start_token in delta_text: - self.current_tool_id += 1 - return { - "tool_calls": [{ - "index": self.current_tool_id, - "type": "function", - "id": f"call_{self.current_tool_id}", - "function": {"name": ""} - }] - } - - return {"content": delta_text} \ No newline at end of file + \ No newline at end of file From 436e2333d8638acd601046fa0159b0e73b913583 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 22 Aug 2025 13:43:51 +0000 Subject: [PATCH 039/126] rename domain to mcp --- conf/{tir_mcp.yaml => mcp.yaml} | 6 +++--- pipelinerl/domains/{tir_mcp => mcp}/__init__.py | 0 pipelinerl/domains/{tir_mcp => mcp}/env_server.py | 0 pipelinerl/domains/{tir_mcp => mcp}/rollouts.py | 5 +++-- pipelinerl/domains/{tir_mcp => mcp}/steps.py | 0 5 files changed, 6 insertions(+), 5 deletions(-) rename conf/{tir_mcp.yaml => mcp.yaml} (96%) rename pipelinerl/domains/{tir_mcp => mcp}/__init__.py (100%) rename pipelinerl/domains/{tir_mcp => mcp}/env_server.py (100%) rename pipelinerl/domains/{tir_mcp => mcp}/rollouts.py (97%) rename pipelinerl/domains/{tir_mcp => mcp}/steps.py (100%) diff --git a/conf/tir_mcp.yaml b/conf/mcp.yaml similarity index 96% rename from conf/tir_mcp.yaml rename to conf/mcp.yaml index 11063092..2ffb097d 100644 --- a/conf/tir_mcp.yaml +++ b/conf/mcp.yaml @@ -11,7 +11,7 @@ test_llm: - "" actor: - rollout_policy: pipelinerl.domains.tir_mcp.generate_mcp_rollout + rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout system_prompt: Please reason step by step, and put your final answer within \boxed{}. 
llm_max_rollouts: 64 task_template: |- @@ -41,7 +41,7 @@ vllm_config: tool-parser-plugin: pipelinerl/rl_tool_parser_plugin.py environment: - _target_: pipelinerl.domains.tir_mcp.MCPEnvironmentServer + _target_: pipelinerl.domains.mcp.MCPEnvironmentServer n_envs: 8 n_envs_mcp: 7 n_envs_math: 1 @@ -112,7 +112,7 @@ agent: trim_obs_except_last_n: 100 guidance: Then produce single function call for the next step. If the answer is ready, call MathAnswer. Put your final answer within \boxed{}. steps: - - pipelinerl.domains.tir_mcp.steps.MathAnswer + - pipelinerl.domains.mcp.steps.MathAnswer use_known_actions: true use_function_calls: true diff --git a/pipelinerl/domains/tir_mcp/__init__.py b/pipelinerl/domains/mcp/__init__.py similarity index 100% rename from pipelinerl/domains/tir_mcp/__init__.py rename to pipelinerl/domains/mcp/__init__.py diff --git a/pipelinerl/domains/tir_mcp/env_server.py b/pipelinerl/domains/mcp/env_server.py similarity index 100% rename from pipelinerl/domains/tir_mcp/env_server.py rename to pipelinerl/domains/mcp/env_server.py diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py similarity index 97% rename from pipelinerl/domains/tir_mcp/rollouts.py rename to pipelinerl/domains/mcp/rollouts.py index 5ca29cb8..099b0abe 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -8,7 +8,7 @@ import aiohttp from omegaconf import DictConfig from pydantic import BaseModel -from pipelinerl.domains.tir_mcp.steps import MathAnswer +from pipelinerl.domains.mcp.steps import MathAnswer from pipelinerl.world import Job from tapeagents.core import Prompt from tapeagents.llms.trainable import TrainableLLM @@ -66,6 +66,7 @@ async def generate_mcp_rollout( env_job_url = f"http://{env_job.hostname}:{env_job.port}" environment = AsyncRemoteEnvironment(server_url=env_job_url) # type: ignore async with environment.acontext(session, wait_for_env=True) as env: + await env.start_task(problem) actions = await env.a_actions() tools_description = await env.a_tools_description() logger.debug(f"Available tools: {tools_description}") @@ -79,7 +80,7 @@ async def generate_mcp_rollout( try: tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) break - except Exception as e: + except Exception: await asyncio.sleep(5) reward_table = RewardTable(**dict(cfg.rewards)) diff --git a/pipelinerl/domains/tir_mcp/steps.py b/pipelinerl/domains/mcp/steps.py similarity index 100% rename from pipelinerl/domains/tir_mcp/steps.py rename to pipelinerl/domains/mcp/steps.py From 366263ba0ad71ef3f620f654d8b7d42b2897050b Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 22 Aug 2025 15:02:18 +0000 Subject: [PATCH 040/126] more envs --- conf/mcp.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index 2ffb097d..b6fc7e1a 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -31,7 +31,7 @@ test_dataset_names: - math_500 world: - env_replicas: 5 + env_replicas: 16 vllm_config: use_v1: true @@ -42,7 +42,7 @@ vllm_config: environment: _target_: pipelinerl.domains.mcp.MCPEnvironmentServer - n_envs: 8 + n_envs: 16 n_envs_mcp: 7 n_envs_math: 1 host: "0.0.0.0" From 371be6ed40464aad773b6836c3a499fc6ef32a1d Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 22 Aug 2025 15:59:23 +0000 Subject: [PATCH 041/126] less env replicas --- conf/mcp.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index b6fc7e1a..2de0318a 100644 
--- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -31,7 +31,7 @@ test_dataset_names: - math_500 world: - env_replicas: 16 + env_replicas: 8 vllm_config: use_v1: true @@ -42,7 +42,7 @@ vllm_config: environment: _target_: pipelinerl.domains.mcp.MCPEnvironmentServer - n_envs: 16 + n_envs: 32 n_envs_mcp: 7 n_envs_math: 1 host: "0.0.0.0" From 46b39d1b196906f659c8ee9ea5328c822ffe4fbd Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 22 Aug 2025 18:56:23 +0000 Subject: [PATCH 042/126] clean up tmp --- conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index b0881201..7b270065 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "mkdir -p /tmp/mcp_work_$$ && cd /tmp/mcp_work_$$ && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio" + "mkdir -p /tmp/mcp_work_$$ && cd /tmp/mcp_work_$$ && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio && rm -rf /tmp/mcp_work_$$" ] } } From af63f51320e7ed1838de19644152cf0b5c4968c2 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 22 Aug 2025 19:38:21 +0000 Subject: [PATCH 043/126] change mcp dir --- conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 7b270065..e1b7dd63 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "mkdir -p /tmp/mcp_work_$$ && cd /tmp/mcp_work_$$ && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio && rm -rf /tmp/mcp_work_$$" + "mkdir -p /home/toolkit/.cache/mcp && cd /home/toolkit/.cache/mcp && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio" ] } } From 55a96e55b4bb79de19af8dcb2c93fa75968c8dd0 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 22 Aug 2025 19:54:41 +0000 Subject: [PATCH 044/126] bigger model len --- conf/mcp.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index 2de0318a..826f2445 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -30,19 +30,17 @@ test_dataset_names: - amc_2023 - math_500 -world: - env_replicas: 8 - vllm_config: use_v1: true vllm_kwargs: enable-auto-tool-choice: "" tool-call-parser: rl_tool tool-parser-plugin: pipelinerl/rl_tool_parser_plugin.py + max_model_len: 48000 environment: _target_: pipelinerl.domains.mcp.MCPEnvironmentServer - n_envs: 32 + n_envs: ${actor.llm_max_rollouts} n_envs_mcp: 7 n_envs_math: 1 host: "0.0.0.0" From dd0ea2bd1c534b3e8b44e9ad35b34043408d64af Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 22 Aug 2025 20:01:46 +0000 Subject: [PATCH 045/126] typo --- conf/mcp.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index 826f2445..f0de57f4 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -31,12 +31,12 @@ test_dataset_names: - math_500 vllm_config: - use_v1: true + use_v1: false vllm_kwargs: enable-auto-tool-choice: "" tool-call-parser: rl_tool tool-parser-plugin: pipelinerl/rl_tool_parser_plugin.py - max_model_len: 48000 + max_model_len: 40960 environment: _target_: pipelinerl.domains.mcp.MCPEnvironmentServer From dc4052d9bd7f51888b656fa1192a91840521efb7 Mon Sep 17 00:00:00 2001 From: 
Alex Piche Date: Sat, 23 Aug 2025 16:09:42 +0000 Subject: [PATCH 046/126] typo --- conf/base.yaml | 2 +- conf/mcp/python.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/base.yaml b/conf/base.yaml index 995db7c5..b91f113b 100644 --- a/conf/base.yaml +++ b/conf/base.yaml @@ -67,7 +67,7 @@ vllm_config: tensor-parallel-size: 1 pipeline-parallel-size: 1 generation-config: vllm - max_model_len: 10000 + max_model_len: 16000 world: replicas: 1 diff --git a/conf/mcp/python.json b/conf/mcp/python.json index e1b7dd63..5e44e30f 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "mkdir -p /home/toolkit/.cache/mcp && cd /home/toolkit/.cache/mcp && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio" + "mkdir -p /home/toolkit/.cache/mcp_$$ && cd /home/toolkit/.cache/mcp_$$ && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio" ] } } From bb4d0c59128900a7f723ba0b9edd1ed7c860ddda Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Tue, 26 Aug 2025 19:51:30 +0000 Subject: [PATCH 047/126] clean up --- conf/finetune/base.yaml | 2 +- conf/mcp.yaml | 15 ++++----------- pipelinerl/domains/mcp/env_server.py | 3 --- 3 files changed, 5 insertions(+), 15 deletions(-) diff --git a/conf/finetune/base.yaml b/conf/finetune/base.yaml index 237e6d56..6fb09310 100644 --- a/conf/finetune/base.yaml +++ b/conf/finetune/base.yaml @@ -36,7 +36,7 @@ learning_rate: 1e-6 # How much to clip the gradient (no clipping if null) gradient_clipping_threshold: 0.3 # Learning rate scheduler type (indexed by completed_steps). -lr_scheduler_type: cosine # could be cosine, constant_with_warmup +lr_scheduler_type: constant # could be cosine, constant_with_warmup # Number of warmup (completed) steps in the learning rate schedule. num_warmup_steps: 50 # Number of gradient accumulation steps. 
diff --git a/conf/mcp.yaml b/conf/mcp.yaml index f0de57f4..e8aa33cd 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -2,13 +2,6 @@ defaults: - base - _self_ -llm: - stop: - - "" - -test_llm: - stop: - - "" actor: rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout @@ -40,20 +33,20 @@ vllm_config: environment: _target_: pipelinerl.domains.mcp.MCPEnvironmentServer - n_envs: ${actor.llm_max_rollouts} - n_envs_mcp: 7 - n_envs_math: 1 + n_envs: 32 host: "0.0.0.0" exp_path: ${output_dir}/env_server mcp_target: tapeagents.mcp.MCPEnvironment mcp_config_path: /home/toolkit/research-now-reasoner/pipelinerl/conf/mcp/python.json mcp_tools_whitelist: - run_python_code - math_target: pipelinerl.domains.math.MathEnvironment env_call_timeout: 600 # Increased from default 60s to 10 minutes mcp_read_timeout_seconds: 3000 +world: + env_replicas_per_actor: 8 + agent_max_loops: 3 agent: _target_: tapeagents.agent.Agent diff --git a/pipelinerl/domains/mcp/env_server.py b/pipelinerl/domains/mcp/env_server.py index d1f14961..fabc5af2 100644 --- a/pipelinerl/domains/mcp/env_server.py +++ b/pipelinerl/domains/mcp/env_server.py @@ -62,13 +62,10 @@ class MCPEnvironmentServer: def __init__(self, n_envs: int, - n_envs_mcp: int, - n_envs_math: int, host: str, mcp_target: str, mcp_config_path: str, mcp_tools_whitelist: List[str], - math_target: str, exp_path: str, env_call_timeout: int = 60, mcp_read_timeout_seconds: int = 10, From ccdcd325c31fd6f1fdcea314d26df150c4b5a064 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Tue, 26 Aug 2025 20:33:50 +0000 Subject: [PATCH 048/126] center reward --- pipelinerl/finetune/rl/__init__.py | 3 +++ pipelinerl/finetune_loop.py | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pipelinerl/finetune/rl/__init__.py b/pipelinerl/finetune/rl/__init__.py index 57aa4fa7..fd014bb9 100644 --- a/pipelinerl/finetune/rl/__init__.py +++ b/pipelinerl/finetune/rl/__init__.py @@ -133,6 +133,7 @@ def rl_step( current_step: int, max_step: int, config: RLConfig, + running_avg_reward: float = 0.0, ) -> tuple[torch.Tensor, dict[str, float]]: """ Perform a single RL step on the model using the given batch and config. 
@@ -211,6 +212,8 @@ def rl_step( # get shifted values and compute ratios rewards = batch.rewards[:, 1:] + # Center rewards using running average + rewards = rewards - running_avg_reward ref_logprobs = batch.ref_logprobs[:, 1:] old_logprobs = batch.old_logprobs[:, 1:] group_tokens = batch.group_tokens[:, 1:] diff --git a/pipelinerl/finetune_loop.py b/pipelinerl/finetune_loop.py index a91d1aa2..32b1dbba 100644 --- a/pipelinerl/finetune_loop.py +++ b/pipelinerl/finetune_loop.py @@ -659,7 +659,7 @@ def toggle_sync(sync: bool): assert batch.seq_boundaries is not None update_ring_flash_attn_params(batch.seq_boundaries, seq_parallel_group) loss, this_step_rl_metrics = rl_step( - model, batch, training_metrics.completed_steps, final_train_steps, rl_config + model, batch, training_metrics.completed_steps, final_train_steps, rl_config, training_metrics.running_avg_reward ) if is_sentinel_batch: # zero out the loss and do not update the metrics @@ -668,6 +668,11 @@ def toggle_sync(sync: bool): # update the metrics for k, v in this_step_rl_metrics.items(): rl_metrics[k].append(v) + + # Update running average reward + current_reward = this_step_rl_metrics.get('reward', 0.0) + alpha = 0.1 # Exponential moving average coefficient + training_metrics.running_avg_reward = (1 - alpha) * training_metrics.running_avg_reward + alpha * current_reward backward(loss, is_final_micro_batch=do_optimizer_step) From 7f5ed953e166d5d91a26f44144cd3404c5374111 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Tue, 26 Aug 2025 20:59:54 +0000 Subject: [PATCH 049/126] running avg reward --- pipelinerl/finetune/types.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelinerl/finetune/types.py b/pipelinerl/finetune/types.py index 33194c90..8c8f85be 100644 --- a/pipelinerl/finetune/types.py +++ b/pipelinerl/finetune/types.py @@ -41,6 +41,7 @@ class TrainingMetrics: best_completed_steps: int = 0 lr: float = 0.0 time_waiting_for_data: float = 0.0 + running_avg_reward: float = 0.0 class PipelineBatchEncoding(BaseModel): From 88a0ee7c291d97f446862f348f91e9a82000b342 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Tue, 26 Aug 2025 21:33:02 +0000 Subject: [PATCH 050/126] start from real mean --- pipelinerl/finetune/rl/__init__.py | 3 +++ pipelinerl/finetune/types.py | 2 +- pipelinerl/finetune_loop.py | 5 ++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pipelinerl/finetune/rl/__init__.py b/pipelinerl/finetune/rl/__init__.py index fd014bb9..499cccf8 100644 --- a/pipelinerl/finetune/rl/__init__.py +++ b/pipelinerl/finetune/rl/__init__.py @@ -11,6 +11,7 @@ from datasets import Dataset from transformers import PreTrainedModel from pipelinerl.finetune.types import PipelineBatchEncoding +from tapeagents.tapeagents.finetune.rl.utils import masked_mean from .utils import ( sum_sum, @@ -213,6 +214,8 @@ def rl_step( # get shifted values and compute ratios rewards = batch.rewards[:, 1:] # Center rewards using running average + if running_avg_reward is None: + running_avg_reward = masked_mean(rewards, masks_shifted).item() rewards = rewards - running_avg_reward ref_logprobs = batch.ref_logprobs[:, 1:] old_logprobs = batch.old_logprobs[:, 1:] diff --git a/pipelinerl/finetune/types.py b/pipelinerl/finetune/types.py index 8c8f85be..2af9edfd 100644 --- a/pipelinerl/finetune/types.py +++ b/pipelinerl/finetune/types.py @@ -41,7 +41,7 @@ class TrainingMetrics: best_completed_steps: int = 0 lr: float = 0.0 time_waiting_for_data: float = 0.0 - running_avg_reward: float = 0.0 + running_avg_reward: float | None = None class 
PipelineBatchEncoding(BaseModel): diff --git a/pipelinerl/finetune_loop.py b/pipelinerl/finetune_loop.py index 32b1dbba..82d9ee31 100644 --- a/pipelinerl/finetune_loop.py +++ b/pipelinerl/finetune_loop.py @@ -672,7 +672,10 @@ def toggle_sync(sync: bool): # Update running average reward current_reward = this_step_rl_metrics.get('reward', 0.0) alpha = 0.1 # Exponential moving average coefficient - training_metrics.running_avg_reward = (1 - alpha) * training_metrics.running_avg_reward + alpha * current_reward + if training_metrics.running_avg_reward is None: + training_metrics.running_avg_reward = current_reward + else: + training_metrics.running_avg_reward = (1 - alpha) * training_metrics.running_avg_reward + alpha * current_reward backward(loss, is_final_micro_batch=do_optimizer_step) From 66bcfbde9f7306357e198eaf4d07ec43688f7786 Mon Sep 17 00:00:00 2001 From: rafapi Date: Thu, 28 Aug 2025 16:19:43 +0000 Subject: [PATCH 051/126] Fix paths --- conf/mcp.yaml | 2 +- conf/mcp/python.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index e8aa33cd..be79fde5 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -37,7 +37,7 @@ environment: host: "0.0.0.0" exp_path: ${output_dir}/env_server mcp_target: tapeagents.mcp.MCPEnvironment - mcp_config_path: /home/toolkit/research-now-reasoner/pipelinerl/conf/mcp/python.json + mcp_config_path: /home/toolkit/PipelineRL/conf/mcp/python.json mcp_tools_whitelist: - run_python_code env_call_timeout: 600 # Increased from default 60s to 10 minutes diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 5e44e30f..580f70ef 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,8 +4,8 @@ "command": "bash", "args": [ "-c", - "mkdir -p /home/toolkit/.cache/mcp_$$ && cd /home/toolkit/.cache/mcp_$$ && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio" - ] + "DIR=$(mktemp -d -p /tmp mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + ] } } } \ No newline at end of file From 3fcb847988ae6e4e27c17fb375f0133d56ed0a4c Mon Sep 17 00:00:00 2001 From: rafapi Date: Thu, 28 Aug 2025 17:00:25 +0000 Subject: [PATCH 052/126] Use relative path --- conf/mcp.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index be79fde5..fc30208e 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -33,11 +33,11 @@ vllm_config: environment: _target_: pipelinerl.domains.mcp.MCPEnvironmentServer - n_envs: 32 + n_envs: 8 host: "0.0.0.0" exp_path: ${output_dir}/env_server mcp_target: tapeagents.mcp.MCPEnvironment - mcp_config_path: /home/toolkit/PipelineRL/conf/mcp/python.json + mcp_config_path: ${hydra:runtime.cwd}/conf/mcp/python.json mcp_tools_whitelist: - run_python_code env_call_timeout: 600 # Increased from default 60s to 10 minutes From 9f239c6b52fea245f89b2408ad928a4253e0ebc3 Mon Sep 17 00:00:00 2001 From: rafapi Date: Thu, 28 Aug 2025 19:26:02 +0000 Subject: [PATCH 053/126] Fix path --- conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 580f70ef..977ab8c3 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "DIR=$(mktemp -d -p /tmp mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N 
-R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + "DIR=$(mktemp -d -p /tmp/mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" ] } } From 020a02172b1859684a8bc3a9f5f4185d3040f268 Mon Sep 17 00:00:00 2001 From: rafapi Date: Thu, 28 Aug 2025 19:50:40 +0000 Subject: [PATCH 054/126] revert mktemp changes --- conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 977ab8c3..580f70ef 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "DIR=$(mktemp -d -p /tmp/mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + "DIR=$(mktemp -d -p /tmp mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" ] } } From 4323f571853c15c8459b4ed127e8979a36b617b4 Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 29 Aug 2025 17:44:27 +0000 Subject: [PATCH 055/126] Fix deno paths --- conf/mcp.yaml | 2 +- conf/mcp/python.json | 2 +- pipelinerl/finetune/rl/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index fc30208e..efdd196a 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -28,7 +28,7 @@ vllm_config: vllm_kwargs: enable-auto-tool-choice: "" tool-call-parser: rl_tool - tool-parser-plugin: pipelinerl/rl_tool_parser_plugin.py + tool-parser-plugin: ${hydra:runtime.cwd}/pipelinerl/rl_tool_parser_plugin.py max_model_len: 40960 environment: diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 580f70ef..b26cd498 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "DIR=$(mktemp -d -p /tmp mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + "JOB_TAG=${SLURM_JOB_ID:-${SLURM_PROCID:-$(hostname)-$$-$(date +%s%N)}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" ] } } diff --git a/pipelinerl/finetune/rl/__init__.py b/pipelinerl/finetune/rl/__init__.py index 499cccf8..289b8f74 100644 --- a/pipelinerl/finetune/rl/__init__.py +++ b/pipelinerl/finetune/rl/__init__.py @@ -11,7 +11,7 @@ from datasets import Dataset from transformers import PreTrainedModel from pipelinerl.finetune.types import PipelineBatchEncoding -from tapeagents.tapeagents.finetune.rl.utils import masked_mean +from tapeagents.finetune.rl.utils import masked_mean from .utils import ( sum_sum, From 2b5e9f5fcee5c824b13867898d8a6b20bc526a53 Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 29 Aug 2025 20:31:01 +0000 Subject: [PATCH 056/126] udt --- 
conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index b26cd498..0029ea0e 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "JOB_TAG=${SLURM_JOB_ID:-${SLURM_PROCID:-$(hostname)-$$-$(date +%s%N)}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + "JOB_TAG=${JOB_ID:-${PROCID:-$(hostname)-$$-$(date +%s%N)}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" ] } } From 565d25c1f7774a5981c49b1e79aea7dfd3f358d0 Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 29 Aug 2025 20:41:40 +0000 Subject: [PATCH 057/126] make the cache tag stable across all processes --- conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 0029ea0e..1224755b 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "JOB_TAG=${JOB_ID:-${PROCID:-$(hostname)-$$-$(date +%s%N)}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + "JOB_TAG=${MCP_JOB_TAG:-${SLURM_JOB_ID:-$HOSTNAME}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && /home/toolkit/.deno/bin/deno cache jsr:@pydantic/mcp-run-python >/dev/null 2>&1 || true; DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" ] } } From e39ff7b3043a3bcf0a220252d100bc0be8cce537 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 29 Aug 2025 22:56:34 +0000 Subject: [PATCH 058/126] remove running avg --- pipelinerl/finetune/rl/__init__.py | 6 ------ pipelinerl/finetune/types.py | 1 - pipelinerl/finetune_loop.py | 10 +--------- 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/pipelinerl/finetune/rl/__init__.py b/pipelinerl/finetune/rl/__init__.py index 289b8f74..57aa4fa7 100644 --- a/pipelinerl/finetune/rl/__init__.py +++ b/pipelinerl/finetune/rl/__init__.py @@ -11,7 +11,6 @@ from datasets import Dataset from transformers import PreTrainedModel from pipelinerl.finetune.types import 
PipelineBatchEncoding -from tapeagents.finetune.rl.utils import masked_mean from .utils import ( sum_sum, @@ -134,7 +133,6 @@ def rl_step( current_step: int, max_step: int, config: RLConfig, - running_avg_reward: float = 0.0, ) -> tuple[torch.Tensor, dict[str, float]]: """ Perform a single RL step on the model using the given batch and config. @@ -213,10 +211,6 @@ def rl_step( # get shifted values and compute ratios rewards = batch.rewards[:, 1:] - # Center rewards using running average - if running_avg_reward is None: - running_avg_reward = masked_mean(rewards, masks_shifted).item() - rewards = rewards - running_avg_reward ref_logprobs = batch.ref_logprobs[:, 1:] old_logprobs = batch.old_logprobs[:, 1:] group_tokens = batch.group_tokens[:, 1:] diff --git a/pipelinerl/finetune/types.py b/pipelinerl/finetune/types.py index 2af9edfd..33194c90 100644 --- a/pipelinerl/finetune/types.py +++ b/pipelinerl/finetune/types.py @@ -41,7 +41,6 @@ class TrainingMetrics: best_completed_steps: int = 0 lr: float = 0.0 time_waiting_for_data: float = 0.0 - running_avg_reward: float | None = None class PipelineBatchEncoding(BaseModel): diff --git a/pipelinerl/finetune_loop.py b/pipelinerl/finetune_loop.py index 82d9ee31..a91d1aa2 100644 --- a/pipelinerl/finetune_loop.py +++ b/pipelinerl/finetune_loop.py @@ -659,7 +659,7 @@ def toggle_sync(sync: bool): assert batch.seq_boundaries is not None update_ring_flash_attn_params(batch.seq_boundaries, seq_parallel_group) loss, this_step_rl_metrics = rl_step( - model, batch, training_metrics.completed_steps, final_train_steps, rl_config, training_metrics.running_avg_reward + model, batch, training_metrics.completed_steps, final_train_steps, rl_config ) if is_sentinel_batch: # zero out the loss and do not update the metrics @@ -668,14 +668,6 @@ def toggle_sync(sync: bool): # update the metrics for k, v in this_step_rl_metrics.items(): rl_metrics[k].append(v) - - # Update running average reward - current_reward = this_step_rl_metrics.get('reward', 0.0) - alpha = 0.1 # Exponential moving average coefficient - if training_metrics.running_avg_reward is None: - training_metrics.running_avg_reward = current_reward - else: - training_metrics.running_avg_reward = (1 - alpha) * training_metrics.running_avg_reward + alpha * current_reward backward(loss, is_final_micro_batch=do_optimizer_step) From fc17df72414ef5ab0bcb8b1b677295442445af60 Mon Sep 17 00:00:00 2001 From: rafapi Date: Sat, 30 Aug 2025 21:31:23 +0000 Subject: [PATCH 059/126] fix --- conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 1224755b..d64cb8eb 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "JOB_TAG=${MCP_JOB_TAG:-${SLURM_JOB_ID:-$HOSTNAME}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && /home/toolkit/.deno/bin/deno cache jsr:@pydantic/mcp-run-python >/dev/null 2>&1 || true; DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + "JOB_TAG=${MCP_JOB_TAG:-${JOB_ID:-$HOSTNAME}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export 
DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && /home/toolkit/.deno/bin/deno cache jsr:@pydantic/mcp-run-python >/dev/null 2>&1 || true; DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" ] } } From f4d8e0d60abfb4ed432f5a29d23db41eeafb47f7 Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 5 Sep 2025 15:13:25 +0000 Subject: [PATCH 060/126] Avoid hot-spotting env; add extra metrics --- pipelinerl/domains/mcp/rollouts.py | 99 +++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 29 deletions(-) diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index 099b0abe..cd82e351 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -1,4 +1,5 @@ import asyncio +from urllib.parse import urlparse import time import random import logging @@ -7,17 +8,15 @@ import aiohttp from omegaconf import DictConfig -from pydantic import BaseModel from pipelinerl.domains.mcp.steps import MathAnswer from pipelinerl.world import Job -from tapeagents.core import Prompt from tapeagents.llms.trainable import TrainableLLM from tapeagents.remote_environment import AsyncRemoteEnvironment -from pipelinerl.async_llm import llm_async_generate, make_training_text +from pipelinerl.async_llm import make_training_text from tapeagents.orchestrator import async_execute_agent from tapeagents.agent import DEFAULT, Agent from hydra.utils import instantiate -from tapeagents.core import StopStep, Tape +from tapeagents.core import Tape from tapeagents.dialog_tape import UserStep from tapeagents.core import LLMCall @@ -50,6 +49,10 @@ def count_tool_calls_by_category(llm_calls: List[LLMCall]) -> Dict[str, int]: class Metrics(BaseMetrics): num_python_calls: int = 0 num_steps: int = 0 + n_llm_calls: int = 0 + total_execution_time: float = -1.0 + agent_execution_time: float = -1.0 + environment_execution_time: float = -1.0 async def generate_mcp_rollout( cfg: DictConfig, @@ -57,31 +60,58 @@ async def generate_mcp_rollout( problem: dict, session: aiohttp.ClientSession, ) -> RolloutResult: - # (1) Choose a random environment server + # choose and retry env servers if one is saturated start = time.perf_counter() env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] - # choose the env job randomly - env_job = random.choice(env_jobs) - assert env_job.port is not None - env_job_url = f"http://{env_job.hostname}:{env_job.port}" - environment = AsyncRemoteEnvironment(server_url=env_job_url) # type: ignore - async with environment.acontext(session, wait_for_env=True) as env: - await env.start_task(problem) - actions = await env.a_actions() - tools_description = await env.a_tools_description() - logger.debug(f"Available tools: {tools_description}") - agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) - agent.llms = {DEFAULT: llm} - - tape = Tape(steps=[ - UserStep(content=f"{problem['task']}. 
You have access to the following tools: {tools_description}") - ]) - while True: + if not env_jobs: + raise RuntimeError("No environment servers available") + + # shuffle to avoid dead-locking a single server + env_urls_all = [f"http://{job.hostname}:{job.port}" for job in env_jobs if job.port is not None] + if not env_urls_all: + raise RuntimeError("Environment server definitions missing ports") + + while True: + env_urls = env_urls_all[:] + random.shuffle(env_urls) + chosen_url = None + for env_url in env_urls: try: - tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) - break - except Exception: - await asyncio.sleep(5) + environment = AsyncRemoteEnvironment( + server_url=env_url, start_timeout_sec=600, start_repeat_delay=5) + context_manager = environment.acontext(session, wait_for_env=True) + env = await context_manager.__aenter__() + try: + await env.start_task(problem) + chosen_url = env_url + actions = await env.a_actions() + tools_description = await env.a_tools_description() + logger.debug(f"Available tools: {tools_description}") + agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) + agent.llms = {DEFAULT: llm} + + tape = Tape(steps=[ + UserStep(content=f"{problem['task']}. You have access to the following tools: {tools_description}") + ]) + t_exec = time.perf_counter() + while True: + try: + tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) + tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) + break + except Exception: + await asyncio.sleep(5) + break # success + finally: + await context_manager.__aexit__(None, None, None) + except Exception as e: + # try the next server on errors (503: busyslots) + logger.warning(f"Env start failed at {env_url}: {e}") + continue + if chosen_url is not None: + break # success + # if none succeeded backoff and retry the whole list + await asyncio.sleep(1.0) reward_table = RewardTable(**dict(cfg.rewards)) @@ -94,11 +124,14 @@ async def generate_mcp_rollout( assert len(llm_calls) > 0, "No LLM calls found" tool_call_counts = count_tool_calls_by_category(llm_calls) training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] + n_llm_calls = len(llm_calls) + parsed = urlparse(chosen_url) + assert parsed.hostname is not None and parsed.port is not None answer_status = await verify_answer_rpc( session=session, - host=env_job.hostname, - port=env_job.port, # type: ignore - prediction=llm_calls[-1].output.content, # type: ignore + host=parsed.hostname, + port=parsed.port, + prediction=llm_calls[-1].output.content, # type: ignore gold=problem["answer"], strict=True, ) @@ -110,6 +143,10 @@ async def generate_mcp_rollout( latency = time.perf_counter() - start + agent_time = tape.metadata.result.get("agent_execution_time", -1.0) + env_time = tape.metadata.result.get("environment_execution_time", -1.0) + total_time = tape.metadata.result.get("total_execution_time", -1.0) + metrics = Metrics( reward=reward, @@ -118,6 +155,10 @@ async def generate_mcp_rollout( no_answer=answer_status == "no_answer", num_steps=len(tape.steps), num_python_calls=tool_call_counts.get("run_python_code", 0), + n_llm_calls=n_llm_calls, + total_execution_time=total_time, + agent_execution_time=agent_time, + environment_execution_time=env_time, ) return RolloutResult( From 23decf758de8ab1bdfd38b58e2092d72b969ab5e Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 5 Sep 2025 15:14:33 +0000 Subject: [PATCH 061/126] 
Print correct policy info

---
 pipelinerl/actor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py
index ce63ac72..46b6606f 100644
--- a/pipelinerl/actor.py
+++ b/pipelinerl/actor.py
@@ -135,7 +135,7 @@ async def schedule_rollouts(
     # Track rollouts per problem group
     group_rollouts = {}
     rollout_policy = hydra.utils.get_method(cfg.actor.rollout_policy)
-    logger.info(f"Use rollout policy: {rollout_policy}")
+    logger.info(f"Use rollout policy: {rollout_policy.__name__}")

From 29118b719b722b2ccbdca4d28185ba7b9fbc0904 Mon Sep 17 00:00:00 2001
From: rafapi
Date: Fri, 5 Sep 2025 15:16:04 +0000
Subject: [PATCH 062/126] Add aime2025

---
 pipelinerl/domains/math/load_datasets.py | 26 ++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/pipelinerl/domains/math/load_datasets.py b/pipelinerl/domains/math/load_datasets.py
index 4b44dfb6..7cbf9c18 100644
--- a/pipelinerl/domains/math/load_datasets.py
+++ b/pipelinerl/domains/math/load_datasets.py
@@ -170,6 +170,26 @@ def _load_aime_dataset(year: int, upsample_factor: int = 0) -> list[dict]:
     return add_ids(samples)
 
 
+def _load_aime_2025_opencompass(upsample_factor: int = 0) -> list[dict]:
+    configs = ["AIME2025-I", "AIME2025-II"]
+    dataset_name = "aime_2025" + ("" if upsample_factor > 0 else "_original")
+
+    samples: list[dict] = []
+    for config_name in configs:
+        ds = load_dataset("opencompass/AIME2025", config_name, split="test")
+        samples.extend([s for s in process_math(ds, dataset_name) if s is not None])
+
+    original_size = len(samples)
+    if upsample_factor > 0:
+        samples *= upsample_factor
+
+    logger.info(
+        f"Loading aime 2025 (OpenCompass) dataset: {len(samples)} samples"
+        + (f" (upsampled from {original_size})" if upsample_factor > 0 else "")
+    )
+    return add_ids(samples)
+
+
 def _load_amc_dataset(year: int, upsample_factor: int = 0) -> list[dict]:
     amc_dataset = load_dataset("AI-MO/aimo-validation-amc", split="train", trust_remote_code=True)
     amc_dataset = amc_dataset.filter(lambda x: str(year) in x["url"])
@@ -335,6 +355,12 @@ def load_datasets(dataset_names: List[str] | str | None, seed: int | None = None
     if "aime_2024_original" in dataset_names:
         datasets += _load_aime_dataset(2024)
 
+    if "aime_2025" in dataset_names:
+        datasets += _load_aime_2025_opencompass(upsample_factor=16)
+
+    if "aime_2025_original" in dataset_names:
+        datasets += _load_aime_2025_opencompass()
+
     if "amc_2022" in dataset_names:
         # TODO: AMC 2022 is 43 problems, is that to be expected?
         datasets += _load_amc_dataset(2022, upsample_factor=16)

From 88828596361fe13ad27b8fb8cc6cbb58dda41ce0 Mon Sep 17 00:00:00 2001
From: rafapi
Date: Fri, 5 Sep 2025 15:16:37 +0000
Subject: [PATCH 063/126] Test on aime2025

---
 conf/mcp.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conf/mcp.yaml b/conf/mcp.yaml
index efdd196a..c4b050b8 100644
--- a/conf/mcp.yaml
+++ b/conf/mcp.yaml
@@ -19,6 +19,7 @@ train_dataset_names:
   - open_reasoner_zero_57k
   - open_reasoner_zero_extended_72k
 test_dataset_names:
+  - aime_2025
   - aime_2024
   - amc_2023
   - math_500

From ea2d393005cd17b6bd2000234fae4cceec94c1c1 Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Mon, 22 Sep 2025 16:37:31 +0000
Subject: [PATCH 064/126] kl new old

---
 pipelinerl/finetune/rl/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pipelinerl/finetune/rl/__init__.py b/pipelinerl/finetune/rl/__init__.py
index 57aa4fa7..3211d579 100644
--- a/pipelinerl/finetune/rl/__init__.py
+++ b/pipelinerl/finetune/rl/__init__.py
@@ -260,6 +260,7 @@ def rl_step(
     )
     approx_kl = torch.exp(log_ratio_ref_new_clamp) - log_ratio_ref_new_clamp - 1  # Schulman KL approx
+    approx_kl_new_old = torch.exp(log_ratio_new_old) - log_ratio_new_old - 1  # Schulman KL approx
     assert torch.isfinite(approx_kl).all(), f"approx_kl is not finite: {approx_kl}"
 
     entropy_bonus_coef = linear_decay_coef(current_step, max_step, config.entropy_bonus, config.final_entropy_bonus)
@@ -337,6 +338,7 @@ def rl_step(
         "max_advantage": advantages[masks_shifted].max().item(),
         "min_advantage": advantages[masks_shifted].min().item(),
         "kl": sum_sum(approx_kl / num_labels_in_seq, masks_shifted, segments).item(),
+        "kl_new_old": sum_sum(approx_kl_new_old / num_labels_in_seq, masks_shifted, segments).item(),
         "max_kl": approx_kl[masks_shifted].max().item(),
         "min_kl": approx_kl[masks_shifted].min().item(),
         "policy_loss": sum_sum(policy_loss / num_labels_in_seq, masks_shifted, segments).item(),

From eb7eb0d37e2f61322ea70003e3c58baa5c48149a Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Thu, 25 Sep 2025 19:25:44 +0000
Subject: [PATCH 065/126] loo

---
 pipelinerl/finetune/rl/__init__.py | 61 ++++++++++++++++++------------
 1 file changed, 37 insertions(+), 24 deletions(-)

diff --git a/pipelinerl/finetune/rl/__init__.py b/pipelinerl/finetune/rl/__init__.py
index 3211d579..e74b9a0b 100644
--- a/pipelinerl/finetune/rl/__init__.py
+++ b/pipelinerl/finetune/rl/__init__.py
@@ -383,14 +383,7 @@ def populate_rl_data(dataset: list[dict[str, Any]], eos_token_id: int, config: R
     """
     Populates a dataset with reinforcement learning specific data columns
     including rewards, advantages, and token weights.
-
-    Args:
-        dataset (Dataset): The input dataset to populate with RL data
-        eos_token_id (int): End of sequence token ID
-        config (RLConfig): Configuration object containing RL training parameters
-
-    Returns:
-        Dataset: The dataset populated with RL-specific columns
+    Uses leave-one-out (LOO) reward mean: each rollout's baseline excludes its own reward.
""" # Convert to pandas for processing df_init = pd.DataFrame(dataset) @@ -398,7 +391,7 @@ def populate_rl_data(dataset: list[dict[str, Any]], eos_token_id: int, config: R # Step 1: calculate group-level statistics df_stats = df_init[["group_id", "rollout_index", "step_index"]].copy() - df_stats["num_tokens"] = df_init["input_ids"].apply(lambda x: len(x)) + df_stats["num_tokens"] = df_init["input_ids"].apply(len) # We assume that rewards for all tokens are the same df_stats["rollout_reward"] = df_init["rewards"].apply(lambda x: x[0]) # Check that the reward is the same for each step in the rollout @@ -408,15 +401,22 @@ def populate_rl_data(dataset: list[dict[str, Any]], eos_token_id: int, config: R df_grouped = ( df_stats.groupby("group_id") .agg( - rollout_reward_mean=("rollout_reward", "mean"), + rollout_reward_sum=("rollout_reward", "sum"), + rollout_reward_count=("rollout_reward", "count"), rollout_reward_std=("rollout_reward", "std"), - group_tokens=("num_tokens", "mean"), + group_tokens=("num_tokens", "mean"), ) .reset_index() ) - assert df_grouped.columns.tolist() == ["group_id", "rollout_reward_mean", "rollout_reward_std", "group_tokens"] - - # Step 2: calculate advantages for each sample + assert df_grouped.columns.tolist() == [ + "group_id", + "rollout_reward_sum", + "rollout_reward_count", + "rollout_reward_std", + "group_tokens", + ] + + # Step 2: calculate advantages for each sample (with LOO mean) df_advantages = pd.merge( df_init[["group_id", "rollout_index", "step_index", "rewards"]], df_grouped, @@ -424,26 +424,37 @@ def populate_rl_data(dataset: list[dict[str, Any]], eos_token_id: int, config: R how="left" ) assert len(df_advantages) == len(df_init) + def calculate_advantages(row): rewards = row["rewards"] - mean = row["rollout_reward_mean"] + group_sum = row["rollout_reward_sum"] + group_count = row["rollout_reward_count"] + current_reward = rewards[0] # same reward across tokens in rollout + + # Leave-one-out mean + if group_count > 1: + loo_mean = (group_sum - current_reward) / (group_count - 1) + else: + loo_mean = current_reward # degenerate case: only one rollout in group + std = row["rollout_reward_std"] if config.divide_advantage_by_std: - advantages = [(reward - mean) / (np.nan_to_num(std) + 1e-4) for reward in rewards] + advantages = [(r - loo_mean) / (np.nan_to_num(std) + 1e-4) for r in rewards] else: - advantages = [(reward - mean) for reward in rewards] + advantages = [(r - loo_mean) for r in rewards] return advantages - df_advantages["advantages"] = df_advantages.apply( - calculate_advantages, - axis=1, + + df_advantages["advantages"] = df_advantages.apply(calculate_advantages, axis=1) + df_advantages = df_advantages.drop( + columns=["rewards", "rollout_reward_sum", "rollout_reward_count", "rollout_reward_std"] ) - df_advantages = df_advantages.drop(columns=["rewards", "rollout_reward_mean", "rollout_reward_std"]) - assert df_advantages.columns.tolist() == ["group_id", "rollout_index", "step_index", "group_tokens", "advantages"] + assert df_advantages.columns.tolist() == [ + "group_id", "rollout_index", "step_index", "group_tokens", "advantages" + ] # Step 3: bring advantages and group level stats back to the main df df = df_init.drop(columns=["advantages", "group_tokens"]) df = pd.merge(df, df_advantages, on=["group_id", "rollout_index", "step_index"], how="left") - # Debug print lengths of all dataframes assert len(df) == len(df_init) # Step 4: make token-level overflow and mean group length information @@ -452,7 +463,9 @@ def calculate_advantages(row): 
axis=1, ) df["group_tokens"] = df.apply(lambda row: [row["group_tokens"]] * len(row["input_ids"]), axis=1) - df["num_labels"] = df.apply(lambda row: [sum(1 for label in row["labels"] if label != -100)] * len(row["input_ids"]), axis=1) + df["num_labels"] = df.apply( + lambda row: [sum(1 for label in row["labels"] if label != -100)] * len(row["input_ids"]), axis=1 + ) # Step 5: move the results back to the dataset advantages_list = df["advantages"].tolist() From 1247360cb1545e9ada775a06ca9bad05442c5816 Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 26 Sep 2025 11:29:57 +0000 Subject: [PATCH 066/126] Add new metrics --- pipelinerl/actor.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 46b6606f..cb9a4434 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -349,6 +349,8 @@ def update_stats(self, rollout_results: List[RolloutResult]): self.model_versions_list.append(result.model_version) domain_agnostic_metrics = self.compute_domain_agnostic_metrics(result) all_metrics = result.metrics.model_dump() | domain_agnostic_metrics + all_metrics["used_python"] = int(all_metrics.get("used_python", False)) + all_metrics["used_math_answer"] = int(all_metrics.get("used_math_answer", False)) for k, v in all_metrics.items(): if isinstance(v, list): self.stats[k][dataset_name][group_id] += v @@ -549,6 +551,21 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict): stats |= loop_stats for k, v in self.sliding_stats.items(): stats[k] = sum(v) / len(v) if v else 0 + + rename_suffixes = { + "num_python_calls_mean": "python_calls_mean", + "used_python_mean": "python_usage_rate", + "num_math_answer_calls_mean": "math_answer_calls_mean", + "used_math_answer_mean": "math_answer_usage_rate", + } + + for key in list(stats.keys()): + for old_suffix, new_suffix in rename_suffixes.items(): + if key.endswith(old_suffix): + prefix = key[: -len(old_suffix)] + stats[f"{prefix}{new_suffix}"] = stats[key] + break + if self.cfg.wandb.use_wandb: wandb.log({f"actor/{k}": v for k, v in stats.items()}) stats_writer.write(stats) @@ -592,11 +609,18 @@ def run_actor_loop(cfg: DictConfig): else: actor_model_path = cfg.model_path + # Align client-side context size with vLLM server max_model_len when available + try: + _context_size = int(cfg.vllm_config.vllm_kwargs.max_model_len) + except Exception: + _context_size = 32000 + train_llms = [ TrainableLLM( base_url=url, model_name=str(actor_model_path), tokenizer_name=str(actor_model_path), + context_size=_context_size, parameters=cfg.llm.parameters, use_cache=False, collect_logprobs=True, @@ -609,6 +633,7 @@ def run_actor_loop(cfg: DictConfig): base_url=url, model_name=str(actor_model_path), tokenizer_name=str(actor_model_path), + context_size=_context_size, parameters=cfg.test_llm.parameters, use_cache=False, collect_logprobs=True, From 61c91c73bac1903b5d6260f5686af7102ff8954c Mon Sep 17 00:00:00 2001 From: rafapi Date: Tue, 30 Sep 2025 18:42:11 +0000 Subject: [PATCH 067/126] Embedded envs --- conf/base.yaml | 2 + conf/mcp.yaml | 151 +++-- pipelinerl/domains/mcp/__init__.py | 2 +- pipelinerl/domains/mcp/env_server.py | 948 ++++++++++++++++++++++++++- pipelinerl/domains/mcp/rollouts.py | 245 +++++-- pipelinerl/domains/mcp/steps.py | 6 +- pipelinerl/launch.py | 41 +- pipelinerl/rl_tool_parser_plugin.py | 189 +++++- pipelinerl/utils.py | 10 +- pipelinerl/vllm0.py | 19 + pipelinerl/world.py | 2 +- 11 files changed, 1445 insertions(+), 170 deletions(-) diff --git a/conf/base.yaml 
b/conf/base.yaml index 82b95d91..638d2c13 100644 --- a/conf/base.yaml +++ b/conf/base.yaml @@ -81,6 +81,8 @@ world: actor_group_port: 9000 environment_start_port: 7777 +# Remote vs embedded environment execution strategy + environment_mode: remote # this will be autocreated based on the config jobs: [] diff --git a/conf/mcp.yaml b/conf/mcp.yaml index c4b050b8..cf85ca18 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -1,18 +1,27 @@ defaults: - base + - override finetune: grpo - _self_ +llm: + parameters: + max_tokens: 8192 + +test_llm: + parameters: + max_tokens: 8192 actor: rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout - system_prompt: Please reason step by step, and put your final answer within \boxed{}. + system_prompt: Please reason step by step, and put your final answer within \boxed{{}}. llm_max_rollouts: 64 task_template: |- {task} + shared_memory_entry_size: 10000000 finetune: - seq_length: 48000 - seq_parallel: 4 + seq_length: 128000 + seq_parallel: 8 dataset_loader: pipelinerl.domains.math.load_datasets train_dataset_names: @@ -20,9 +29,6 @@ train_dataset_names: - open_reasoner_zero_extended_72k test_dataset_names: - aime_2025 - - aime_2024 - - amc_2023 - - math_500 vllm_config: use_v1: false @@ -30,23 +36,26 @@ vllm_config: enable-auto-tool-choice: "" tool-call-parser: rl_tool tool-parser-plugin: ${hydra:runtime.cwd}/pipelinerl/rl_tool_parser_plugin.py - max_model_len: 40960 + max-num-seqs: ${actor.llm_max_rollouts} + max-num-batched-tokens: 4096 + max_model_len: 128000 + gpu-memory-utilization: 0.85 environment: - _target_: pipelinerl.domains.mcp.MCPEnvironmentServer - n_envs: 8 - host: "0.0.0.0" - exp_path: ${output_dir}/env_server - mcp_target: tapeagents.mcp.MCPEnvironment - mcp_config_path: ${hydra:runtime.cwd}/conf/mcp/python.json - mcp_tools_whitelist: + _target_: pipelinerl.domains.mcp.env_server.EmbeddedMCPEnvironment + config_path: ${hydra:runtime.cwd}/conf/mcp/python.json + tools_whitelist: + - run_python_code + read_timeout_seconds: 600 + use_cache: false + runtime_pool_workers: 4 + offload_tools: - run_python_code - env_call_timeout: 600 # Increased from default 60s to 10 minutes - mcp_read_timeout_seconds: 3000 world: env_replicas_per_actor: 8 + environment_mode: embedded agent_max_loops: 3 agent: @@ -56,25 +65,44 @@ agent: store_llm_calls: true templates: system_prompt: | - You are an expert AI Agent trained to assist users with complex information processing tasks. - Your role is to understand user queries and respond in a helpful and accurate manner. + You are a math-focused AI Agent. Solve problems by combining clear symbolic reasoning + with short, deterministic Python code. Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. - Do not express emotions or opinions about user questions. + Always present the final answer in LaTeX \boxed{{}}. + Do not express emotions or opinions about user questions. + + Workflow: + 1. Draft a brief plan in plain text. + 2. Execute one run_python_code call to compute or verify the result. + 3. Finalize by calling MathAnswer with the LaTeX-formatted answer. + + Python execution policy (run_python_code): + - Use Python strictly for pure computation to verify and validate the final answer. + - No network, file system, OS or environment access. + - Keep snippets minimal and self-contained; avoid large outputs and long-running loops; print only the final result. + + Validation: + - Cross-check results (alternative derivation, invariants, higher precision) before finalizing. 
+ - If execution fails, propose the minimal fix and retry. + Keep replies direct and avoid unnecessary text. allowed_tools: | - You have access to the following tools: + You can call the following tools: {tools_description} + - run_python_code: deterministic math code; print only the final value. + - MathAnswer: return the LaTeX \boxed{{}} answer when the solution is verified. + Always verify with run_python_code before invoking MathAnswer. thought_format: | Important! Respond with the plain text, do not include any JSON or code. Do not output anything besides what I asked in this message. allowed_steps: | - You have access to the following tools: - {tools_description} - format: > - Output only a single JSON dict. - Do not repeat the last thought again. - If the last action does not change the observation, do not repeat it! - DO NOT OUTPUT ANYTHING BESIDES THE JSON! DO NOT PLACE ANY COMMENTS INSIDE THE JSON. - It will break the system that processes the output. + Workflow summary: + - Plan briefly in plain text. + - Call run_python_code exactly once per loop to compute/verify. + - Finish with a single MathAnswer tool call carrying the \boxed{{}} result. + format: | + For finalization, reply with a single short sentence that ends in the \boxed{{}} answer, + immediately followed by the MathAnswer function call containing the same \boxed{{}} value. + Never emit unrelated JSON wrappers or duplicate the final thought. nodes: @@ -82,53 +110,52 @@ agent: name: plan system_prompt: ${agent.templates.system_prompt} guidance: | - Write a concise multi-step plan explaining which steps should be performed to find the answer for the given task. - Be specific about how each step should be performed. Only describe the intended actions here, do not perform them yet. - Consider that next steps may depend on results of previous steps, so include conditional branching using "if" statements where needed. - Start with the title "Plan". Every step should have short name and description. + Produce a concise math plan (formulas/checks). You will ALWAYS verify by executing Python code. ${agent.templates.thought_format} steps_prompt: ${agent.templates.allowed_tools} + trim_obs_except_last_n: 2 - _target_: tapeagents.nodes.StandardNode - name: select + name: code system_prompt: ${agent.templates.system_prompt} - trim_obs_except_last_n: 100 guidance: | - Select the next step to do to move forward with the plan. Describe the expected effect of the proposed action. - ${agent.templates.thought_format} - steps_prompt: ${agent.templates.allowed_tools} + ALWAYS call run_python_code once to compute/verify the result. + Use exact, deterministic code; print only the final scalar or tuple. + If code fails, fix minimally and call run_python_code again after reviewing the error. + use_known_actions: true + use_function_calls: true + trim_obs_except_last_n: 2 - _target_: tapeagents.nodes.StandardNode - name: act + name: finalize system_prompt: ${agent.templates.system_prompt} - trim_obs_except_last_n: 100 - guidance: Then produce single function call for the next step. If the answer is ready, call MathAnswer. Put your final answer within \boxed{}. + guidance: | + Read the last Python stdout value. First, state the answer in one short sentence that ends with LaTeX \boxed{{}}. + Immediately after that sentence, call the MathAnswer tool exactly once with: + name: MathAnswer + arguments: {"answer": ""} + Do not add any extra text around the tool call. Once the sentence is emitted, return only the MathAnswer function call. 
steps: - pipelinerl.domains.mcp.steps.MathAnswer use_known_actions: true use_function_calls: true + trim_obs_except_last_n: 2 + next_node: code - - _target_: tapeagents.nodes.StandardNode - name: summarize - system_prompt: ${agent.templates.system_prompt} - trim_obs_except_last_n: 100 - guidance: | - Summarize last observation. If its an image, thoroughly describe it with all details. - Describe the results of the last action and observed changes - Do not hallucinate or make up any information, only describe what you see in the observation. - Do not guess or assume action effects, describe only visible changes. - ${agent.templates.thought_format} +# model_path: Qwen/Qwen3-8B +model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft - - _target_: tapeagents.nodes.StandardNode - name: reflect - system_prompt: ${agent.templates.system_prompt} - trim_obs_except_last_n: 100 - guidance: | - 1. Evaluate the action's success, explain its effect on current step, overall plan and task solution. - 2. If the last action was not successful, describe errors and the possible reasons for failure. - 3. Check if the current plan step is finished. - 4. If the step is finished, update the following steps of the plan with new information and choose the next step. - ${agent.templates.thought_format} - next_node: select +# Local reward shaping for tool usage +python_tool_shaping: + bonus_on_correct_with_python: 0.2 + penalty_on_incorrect_without_python: 0.1 + max_abs: 0.2 -model_path: Qwen/Qwen3-8B \ No newline at end of file +# Encourage concise outputs (penalize long completions) +length_shaping: + target_ratio: 0.1 # 10% of max_tokens; auto scales with max_tokens + min_target_tokens: 256 # lower clamp + max_target_tokens: 2048 # upper clamp + slope: 0.001 # penalty per token beyond target + max_penalty: 0.2 # clamp absolute penalty + bonus_on_short_correct: 0.05 # bonus if correct and concise diff --git a/pipelinerl/domains/mcp/__init__.py b/pipelinerl/domains/mcp/__init__.py index a47458a5..4218ca1b 100644 --- a/pipelinerl/domains/mcp/__init__.py +++ b/pipelinerl/domains/mcp/__init__.py @@ -1,2 +1,2 @@ from .rollouts import generate_mcp_rollout -from .env_server import MCPEnvironmentServer \ No newline at end of file +from .env_server import EmbeddedMCPEnvironment, MCPEnvironmentServer, EmbeddedEnvironmentWorker diff --git a/pipelinerl/domains/mcp/env_server.py b/pipelinerl/domains/mcp/env_server.py index fabc5af2..2298e5cd 100644 --- a/pipelinerl/domains/mcp/env_server.py +++ b/pipelinerl/domains/mcp/env_server.py @@ -1,19 +1,361 @@ -import os -from tapeagents.remote_environment import EnvironmentServer -from omegaconf import OmegaConf -from typing import List -from fastapi import HTTPException -from pydantic import BaseModel -import logging import asyncio +import atexit +import inspect +import json +import logging +import os +import re +import threading +import time +import traceback from concurrent.futures import ProcessPoolExecutor +from contextlib import asynccontextmanager from functools import partial +from typing import Any, AsyncIterator, List + +import multiprocessing + +from fastapi import HTTPException +from hydra.utils import instantiate +from omegaconf import DictConfig, OmegaConf +from pydantic import BaseModel +from tapeagents.core import Action, Observation +from tapeagents.environment import Environment +from tapeagents.mcp import MCPClient, MCPEnvironment, NoTool +from tapeagents.remote_environment import EnvironmentServer +from tapeagents.tool_calling import 
FunctionSpec, ToolCallAction, ToolResult, ToolSpec +from mcp.types import CallToolResult, TextContent from pipelinerl.domains.math.verifier_api import verify_answer +from pipelinerl.domains.mcp.steps import MathAnswer logger = logging.getLogger(__name__) +_CONNECTION_ERROR_PATTERNS = ( + "closedresourceerror", + "brokenresourceerror", + "broken pipe", + "connectionreseterror", + "timed out while waiting for response", +) + + +_MCP_WORKER_STATE: dict[str, Any] | None = None + + +def _shutdown_mcp_worker() -> None: + global _MCP_WORKER_STATE + if not _MCP_WORKER_STATE: + return + loop: asyncio.AbstractEventLoop = _MCP_WORKER_STATE["loop"] + client: MCPClient = _MCP_WORKER_STATE["client"] + try: + loop.run_until_complete(client.close()) + except Exception: + logger.warning("Failed to close MCP client in worker", exc_info=True) + finally: + loop.close() + _MCP_WORKER_STATE = None + + +def _initialize_mcp_worker( + config_path: str, + tools_whitelist: list[str] | tuple[str, ...] | None, + use_cache: bool, + read_timeout_seconds: int, +) -> None: + """Initializer for the ProcessPool workers that own MCP runtimes.""" + global _MCP_WORKER_STATE + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + client = MCPClient( + config_path=config_path, + use_cache=use_cache, + read_timeout_seconds=read_timeout_seconds, + ) + loop.run_until_complete(client.start_servers()) + _MCP_WORKER_STATE = { + "loop": loop, + "client": client, + "tools_whitelist": list(tools_whitelist or []), + } + atexit.register(_shutdown_mcp_worker) + + +def _call_tool_in_worker(tool_name: str, tool_arguments: Any) -> dict[str, Any]: + """Execute an MCP tool call inside a worker process.""" + if not _MCP_WORKER_STATE: + raise RuntimeError("MCP worker not initialized") + loop: asyncio.AbstractEventLoop = _MCP_WORKER_STATE["loop"] + client: MCPClient = _MCP_WORKER_STATE["client"] + whitelist: list[str] = _MCP_WORKER_STATE.get("tools_whitelist", []) + if whitelist and tool_name not in whitelist: + raise NoTool(f"Tool {tool_name} not allowed by whitelist") + result = loop.run_until_complete(client.call_tool(tool_name, tool_arguments)) + return result.model_dump(exclude_none=True) + + +class _RemoteCallError(RuntimeError): + def __init__(self, message: str, details: dict[str, Any] | None = None) -> None: + super().__init__(message) + self.details = details or {} + + +def _invoke_environment_method( + environment: Environment, + method_name: str, + args: tuple[Any, ...], + kwargs: dict[str, Any], + loop: asyncio.AbstractEventLoop, +) -> Any: + attr = getattr(environment, method_name) + if inspect.iscoroutinefunction(attr): + return loop.run_until_complete(attr(*args, **kwargs)) + result = attr(*args, **kwargs) + if inspect.isawaitable(result): + return loop.run_until_complete(result) + return result + + +def _environment_process_main(env_cfg_container: dict[str, Any], conn) -> None: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + env_cfg = OmegaConf.create(env_cfg_container) + environment: Environment = instantiate(env_cfg) + except Exception: + conn.send( + ( + "exception", + { + "type": "EnvironmentBootstrapError", + "message": "Failed to instantiate environment", + "traceback": traceback.format_exc(), + }, + ) + ) + conn.close() + loop.close() + return + + async_methods = { + name + for name in ("ainitialize", "areset", "aclose", "astep", "areact") + if hasattr(environment, name) and inspect.iscoroutinefunction(getattr(environment, name)) + } + sync_methods = { + name + for name in ( + 
"initialize", + "reset", + "close", + "start_task", + "actions", + "tools_description", + "mark_healthy", + "is_healthy", + "step", + "react", + ) + if callable(getattr(environment, name, None)) + } + + conn.send(("capabilities", {"sync": list(sync_methods), "async": list(async_methods)})) + + running = True + while running: + try: + message = conn.recv() + except EOFError: + break + if not isinstance(message, tuple) or len(message) != 3: + continue + command, args, kwargs = message + if command == "__shutdown__": + running = False + conn.send(("ok", None)) + break + try: + result = _invoke_environment_method(environment, command, args, kwargs, loop) + conn.send(("ok", result)) + except Exception as exc: + conn.send( + ( + "exception", + { + "type": exc.__class__.__name__, + "message": str(exc), + "traceback": traceback.format_exc(), + }, + ) + ) + + try: + if "aclose" in async_methods: + loop.run_until_complete(environment.aclose()) + elif "close" in sync_methods: + environment.close() + except Exception: + logger.debug("Failed to close environment during shutdown", exc_info=True) + finally: + conn.close() + loop.close() + + +class _ProcessEnvironmentProxy: + def __init__(self, env_cfg: DictConfig): + self._ctx = multiprocessing.get_context("spawn") + self._parent_conn, child_conn = self._ctx.Pipe() + cfg_container = OmegaConf.to_container(env_cfg, resolve=True) + self._process = self._ctx.Process( + target=_environment_process_main, + args=(cfg_container, child_conn), + ) + self._process.daemon = False + self._process.start() + self._lock = threading.Lock() + self._closed = False + try: + status, payload = self._parent_conn.recv() + except EOFError as error: + raise _RemoteCallError("Environment process terminated prematurely") from error + if status == "exception": + raise _RemoteCallError(payload.get("message", "Environment bootstrap failed"), payload) + if status != "capabilities": + raise _RemoteCallError("Unexpected handshake from environment process") + self._sync_methods = set(payload.get("sync", [])) + self._async_methods = set(payload.get("async", [])) + + def supports_async(self, name: str) -> bool: + return name in self._async_methods + + def supports_sync(self, name: str) -> bool: + return name in self._sync_methods + + def _ensure_alive(self) -> None: + if self._closed: + raise _RemoteCallError("Environment proxy is closed") + if not self._process.is_alive(): + raise _RemoteCallError("Environment process died unexpectedly") + + def _call_remote(self, method: str, *args: Any, **kwargs: Any) -> Any: + self._ensure_alive() + with self._lock: + try: + self._parent_conn.send((method, args, kwargs)) + status, payload = self._parent_conn.recv() + except EOFError as error: + raise _RemoteCallError("Lost connection to environment process") from error + if status == "ok": + return payload + if status == "exception": + raise _RemoteCallError(payload.get("message", "Remote call failed"), payload) + raise _RemoteCallError(f"Unexpected response type: {status}") + + def start_task(self, task: dict) -> dict: + return self._call_remote("start_task", task) + + def actions(self) -> tuple[type[Action], ...]: + return tuple(self._call_remote("actions")) + + def tools_description(self) -> str: + return self._call_remote("tools_description") + + def initialize(self): + if self.supports_sync("initialize"): + return self._call_remote("initialize") + if self.supports_async("ainitialize"): + return self._call_remote("ainitialize") + return None + + async def ainitialize(self) -> None: + loop = 
asyncio.get_running_loop() + await loop.run_in_executor(None, self.initialize) + + def reset(self) -> None: + if self.supports_sync("reset"): + self._call_remote("reset") + elif self.supports_async("areset"): + self._call_remote("areset") + + async def areset(self) -> None: + loop = asyncio.get_running_loop() + await loop.run_in_executor(None, self.reset) + + def step(self, action: Action) -> Observation: + if self.supports_sync("step"): + return self._call_remote("step", action) + if self.supports_async("astep"): + return self._call_remote("astep", action) + raise _RemoteCallError("Remote environment does not support step or astep") + + async def astep(self, action: Action) -> Observation: + loop = asyncio.get_running_loop() + return await loop.run_in_executor(None, self.step, action) + + def react(self, tape) -> Any: + if self.supports_sync("react"): + return self._call_remote("react", tape) + if self.supports_async("areact"): + return self._call_remote("areact", tape) + raise _RemoteCallError("Remote environment does not support react or areact") + + async def areact(self, tape) -> Any: + loop = asyncio.get_running_loop() + return await loop.run_in_executor(None, self.react, tape) + + def mark_healthy(self) -> None: + if self.supports_sync("mark_healthy"): + self._call_remote("mark_healthy") + + def is_healthy(self) -> bool: + if self.supports_sync("is_healthy"): + return bool(self._call_remote("is_healthy")) + return True + + def close(self) -> None: + if self._closed: + return + try: + if self.supports_sync("close"): + self._call_remote("close") + elif self.supports_async("aclose"): + self._call_remote("aclose") + except _RemoteCallError: + logger.debug("Remote close failed", exc_info=True) + finally: + self._shutdown() + + async def aclose(self) -> None: + loop = asyncio.get_running_loop() + await loop.run_in_executor(None, self.close) + + def _shutdown(self) -> None: + if self._closed: + return + try: + with self._lock: + if self._process.is_alive(): + self._parent_conn.send(("__shutdown__", (), {})) + try: + self._parent_conn.recv() + except EOFError: + pass + except Exception: + logger.debug("Failed to send shutdown to environment process", exc_info=True) + finally: + self._parent_conn.close() + self._process.join(timeout=5) + if self._process.is_alive(): + self._process.terminate() + self._closed = True + + def __del__(self) -> None: + try: + self._shutdown() + except Exception: + pass class EnvironmentServerWithVerifier(EnvironmentServer): """Environment server that includes the verify_answer endpoint.""" @@ -99,3 +441,595 @@ def launch(self, port: int): "read_timeout_seconds": self.mcp_read_timeout_seconds, })) + +class EmbeddedMCPEnvironment(MCPEnvironment): + def __init__( + self, + *args, + math_answer_description: str = "Submit the final answer in LaTeX \\boxed{} format.", + **kwargs, + ) -> None: + config_path = kwargs.get("config_path", "") + use_cache = kwargs.get("use_cache", False) + read_timeout_seconds = kwargs.get("read_timeout_seconds", 10) + runtime_pool_workers = kwargs.pop("runtime_pool_workers", 0) + offload_tools = tuple(kwargs.pop("offload_tools", ())) + + super().__init__(*args, **kwargs) + self._broken = False + self._last_failure_reason: str | None = None + self._runtime_guard_installed: bool = False + self._runtime_pool: ProcessPoolExecutor | None = None + self._runtime_pool_lock = threading.Lock() + self._runtime_pool_workers = runtime_pool_workers + self._offload_tools = set(offload_tools) + self._config_path = getattr(self.client, "config_path", 
config_path) + self._use_cache = getattr(self.client, "use_cache", use_cache) + self._read_timeout_seconds = getattr(self.client, "read_timeout_seconds", read_timeout_seconds) + + # try to catch time wasting patterns before execution + self._python_blocklist = ( + (re.compile(r"\bsys\s*\.\s*exit\s*\(", re.IGNORECASE), "sys.exit"), + (re.compile(r"\bos\s*\.\s*_exit\s*\(", re.IGNORECASE), "os._exit"), + (re.compile(r"\bexit\s*\(", re.IGNORECASE), "exit"), + (re.compile(r"\bquit\s*\(", re.IGNORECASE), "quit"), + (re.compile(r"raise\s+systemexit", re.IGNORECASE), "raise SystemExit"), + (re.compile(r"from\s+sys\s+import\s+exit", re.IGNORECASE), "from sys import exit"), + ( + re.compile(r"__import__\s*\(\s*['\"]os['\"]\s*\)\s*\.\s*_exit", re.IGNORECASE), + "__import__('os')._exit", + ), + ( + re.compile(r"__import__\s*\(\s*['\"]sys['\"]\s*\)\s*\.\s*exit", re.IGNORECASE), + "__import__('sys').exit", + ), + ) + self._math_answer_spec = ToolSpec( + function=FunctionSpec( + name="MathAnswer", + description=math_answer_description, + parameters={ + "type": "object", + "properties": { + "answer": { + "type": "string", + "description": "Final answer expressed in LaTeX \\boxed{} format.", + } + }, + "required": ["answer"], + }, + ) + ) + + def initialize(self): + super().initialize() + self._reset_health() + self._ensure_math_answer_tool() + + async def ainitialize(self) -> None: + self.loop = asyncio.get_running_loop() + await super().ainitialize() + self._reset_health() + self._ensure_math_answer_tool() + await self._install_runtime_guard() + + def actions(self): + base_actions = super().actions() + if not any( + getattr(action, "function", None) and action.function.name == "MathAnswer" + for action in base_actions + ): + base_actions = base_actions + (self._math_answer_spec,) + return base_actions + + def _should_offload(self, tool_name: str) -> bool: + return bool(self._runtime_pool_workers) and tool_name in self._offload_tools + + def _ensure_runtime_pool(self) -> ProcessPoolExecutor: + if self._runtime_pool is not None: + return self._runtime_pool + with self._runtime_pool_lock: + if self._runtime_pool is not None: + return self._runtime_pool + cpu_count = os.cpu_count() or 1 + default_workers = max(1, cpu_count // 2) + max_workers = self._runtime_pool_workers or default_workers + whitelist = tuple(self.tools_whitelist) if getattr(self, "tools_whitelist", None) else tuple() + self._runtime_pool = ProcessPoolExecutor( + max_workers=max_workers, + initializer=_initialize_mcp_worker, + initargs=( + self._config_path, + whitelist, + bool(self._use_cache), + int(self._read_timeout_seconds), + ), + ) + return self._runtime_pool + + @staticmethod + def _make_error_call_result(tool_name: str, message: str) -> CallToolResult: + return CallToolResult( + content=[TextContent(type="text", text=message)], + isError=True, + ) + + def _resolve_pool_future_sync(self, future, tool_name: str) -> CallToolResult: + try: + payload = future.result() + return CallToolResult.model_validate(payload) + except NoTool: + logger.exception(f"Tool {tool_name} not found in MCP client") + return self._make_error_call_result(tool_name, f"Tool {tool_name} not found") + except KeyError as error: + logger.exception(f"KeyError when executing MCP tool call: {error}") + return self._make_error_call_result( + tool_name, f"Error executing tool {tool_name}: KeyError {error}" + ) + except Exception as error: + logger.exception(f"Error executing MCP tool call: {error}") + return self._make_error_call_result( + tool_name, f"Error executing 
tool {tool_name}: {error}" + ) + + async def _resolve_pool_future_async(self, future, tool_name: str) -> CallToolResult: + try: + payload = await asyncio.wrap_future(future) + return CallToolResult.model_validate(payload) + except NoTool: + logger.exception(f"Tool {tool_name} not found in MCP client") + return self._make_error_call_result(tool_name, f"Tool {tool_name} not found") + except KeyError as error: + logger.exception(f"KeyError when executing MCP tool call: {error}") + return self._make_error_call_result( + tool_name, f"Error executing tool {tool_name}: KeyError {error}" + ) + except Exception as error: + logger.exception(f"Error executing MCP tool call: {error}") + return self._make_error_call_result( + tool_name, f"Error executing tool {tool_name}: {error}" + ) + + def _shutdown_runtime_pool(self) -> None: + if self._runtime_pool is not None: + self._runtime_pool.shutdown(wait=True) + self._runtime_pool = None + + def _execute_tool_via_pool_sync(self, action: ToolCallAction) -> ToolResult: + start = time.perf_counter() + future = self._ensure_runtime_pool().submit( + _call_tool_in_worker, + action.function.name, + action.function.arguments, + ) + call_result = self._resolve_pool_future_sync(future, action.function.name) + observation = ToolResult(tool_call_id=getattr(action, "id", ""), content=call_result) + observation.metadata.other["action_execution_time"] = time.perf_counter() - start + observation.metadata.other["action_kind"] = action.kind + return observation + + async def _execute_tool_via_pool_async(self, action: ToolCallAction) -> ToolResult: + start = time.perf_counter() + future = self._ensure_runtime_pool().submit( + _call_tool_in_worker, + action.function.name, + action.function.arguments, + ) + call_result = await self._resolve_pool_future_async(future, action.function.name) + observation = ToolResult(tool_call_id=getattr(action, "id", ""), content=call_result) + observation.metadata.other["action_execution_time"] = time.perf_counter() - start + observation.metadata.other["action_kind"] = action.kind + return observation + + def step(self, action: Action) -> Observation: + if not isinstance(action, ToolCallAction): + return super().step(action) + + outcome, message = self._precheck_tool_action(action) + if outcome == "math_answer": + return self._create_math_answer(action) + if outcome == "error": + return self._make_error_tool_result(action, message or "") + + try: + observation = self._execute_tool_call_sync(action) + except BaseException: + self._broken = True + raise + + return self._postprocess_after_tool(action, observation) + + async def astep(self, action: Action) -> Observation: + if not isinstance(action, ToolCallAction): + return await super().astep(action) + + outcome, message = self._precheck_tool_action(action) + if outcome == "math_answer": + return self._create_math_answer(action) + if outcome == "error": + return self._make_error_tool_result(action, message or "") + + try: + observation = await self._execute_tool_call_async(action) + except BaseException: + self._broken = True + raise + + return self._postprocess_after_tool(action, observation) + + def _precheck_tool_action(self, action: ToolCallAction) -> tuple[str, str | None]: + if action.function.name == "MathAnswer": + return "math_answer", None + if self._broken: + return "error", self._backend_unavailable_message() + if action.function.name == "run_python_code": + block_message = self._check_python_safety(action.function.arguments) + if block_message is not None: + return "error", 
block_message + return "ok", None + + def _execute_tool_call_sync(self, action: ToolCallAction) -> Observation: + if self._should_offload(action.function.name): + return self._execute_tool_via_pool_sync(action) + return super().step(action) + + async def _execute_tool_call_async(self, action: ToolCallAction) -> Observation: + if self._should_offload(action.function.name): + return await self._execute_tool_via_pool_async(action) + return await super().astep(action) + + def _postprocess_after_tool( + self, + action: ToolCallAction, + observation: Observation, + ) -> Observation: + if action.function.name != "MathAnswer": + return self._postprocess_tool_observation(action, observation) + return observation + + def _ensure_math_answer_tool(self) -> None: + if not any( + getattr(tool, "function", None) and tool.function.name == "MathAnswer" + for tool in self.tools + ): + self.tools.append(self._math_answer_spec) + + def _reset_health(self) -> None: + self._broken = False + self._last_failure_reason = None + self._runtime_guard_installed = False + + def _create_math_answer(self, action: ToolCallAction) -> MathAnswer: + answer_value = self._extract_answer(action.function.arguments) + math_answer = MathAnswer(answer=answer_value) + math_answer.metadata.other.update({ + "action_kind": "MathAnswer", + "tool_call_id": getattr(action, "id", ""), + "action_execution_time": 0.0, + }) + return math_answer + + def mark_healthy(self) -> None: + self._reset_health() + + def is_healthy(self) -> bool: + return not self._broken + + def close(self) -> None: + self._shutdown_runtime_pool() + super().close() + + async def aclose(self) -> None: + self._shutdown_runtime_pool() + await super().aclose() + + @staticmethod + def _guard_snippet() -> str: + """generate Python code that installs safety guards""" + return ( + "import builtins, sys, os, time, atexit\n" + "try:\n" + " _PIPELINERL_TIME_LIMIT = float(os.environ.get('PIPELINERL_PY_TIMEOUT', '30'))\n" + "except ValueError:\n" + " _PIPELINERL_TIME_LIMIT = 30.0\n" + "_PIPELINERL_START = time.perf_counter()\n" + "class _ExitBlocked(RuntimeError):\n" + " pass\n" + "def _blocked_exit(*_args, **_kwargs):\n" + " raise _ExitBlocked('exit() and os._exit() are disabled in this environment.')\n" + "for _target in (builtins, sys):\n" + " for _name in ('exit', 'quit'):\n" + " if hasattr(_target, _name):\n" + " setattr(_target, _name, _blocked_exit)\n" + "if hasattr(os, '_exit'):\n" + " os._exit = _blocked_exit\n" + "def _pipelinerl_trace(frame, event, arg):\n" + " if event == 'line' and (time.perf_counter() - _PIPELINERL_START) > _PIPELINERL_TIME_LIMIT:\n" + " sys.settrace(None)\n" + " raise RuntimeError(f'Python execution timed out after {_PIPELINERL_TIME_LIMIT} seconds.')\n" + " return _pipelinerl_trace\n" + "sys.settrace(_pipelinerl_trace)\n" + "atexit.register(lambda: sys.settrace(None))\n" + ) + + async def _install_runtime_guard(self) -> None: + """Install runtime safety guard in the Python environment.""" + if self._runtime_guard_installed or not getattr(self, "client", None): + return + try: + snippet = self._guard_snippet() + if self._should_offload("run_python_code"): + future = self._ensure_runtime_pool().submit( + _call_tool_in_worker, + "run_python_code", + {"python_code": snippet}, + ) + await self._resolve_pool_future_async(future, "run_python_code") + else: + await self.client.call_tool( + "run_python_code", + {"python_code": snippet}, + ) + self._runtime_guard_installed = True + logger.debug("Runtime guard installed successfully") + except Exception: + 
logger.warning("Failed to install runtime guard in MCP environment", exc_info=True) + + def _postprocess_tool_observation( + self, + action: ToolCallAction, + observation: Observation, + ) -> Observation: + if not isinstance(observation, ToolResult): + return observation + call_result = observation.content + if not isinstance(call_result, CallToolResult): + return observation + if not getattr(call_result, "isError", False): + return observation + error_text = self._extract_call_result_text(call_result) + if not self._is_connection_error_message(error_text): + return observation + logger.warning( + "MCP backend failure detected for tool %s: %s", + action.function.name, + error_text, + ) + return self._handle_connection_failure(action, observation, error_text) + + @staticmethod + def _extract_call_result_text(call_result: CallToolResult) -> str: + if not isinstance(call_result.content, list): + return "" + parts: list[str] = [] + for block in call_result.content: + if isinstance(block, TextContent) and isinstance(block.text, str): + parts.append(block.text) + return "\n".join(parts).strip() + + @staticmethod + def _is_connection_error_message(message: str) -> bool: + lowered = message.lower() + return any(pattern in lowered for pattern in _CONNECTION_ERROR_PATTERNS) + + def _handle_connection_failure( + self, + action: ToolCallAction, + observation: ToolResult, + error_text: str, + ) -> ToolResult: + """Mark environment as broken and update observation.""" + self._broken = True + failure_message = ( + "Python tool backend became unavailable (connection lost). " + "Environment will restart after this attempt; stop issuing additional tool calls." + ) + if error_text: + failure_message = f"{failure_message}\nOriginal error: {error_text}" + + observation.content = CallToolResult( + content=[TextContent(type="text", text=failure_message)], + isError=True, + ) + observation.metadata.other.setdefault("action_execution_time", observation.metadata.other.get("action_execution_time", 0.0)) + observation.metadata.other["connection_failure"] = True + observation.metadata.other["original_error"] = error_text + self._last_failure_reason = failure_message + return observation + + def _backend_unavailable_message(self) -> str: + """Get message for unavailable backend.""" + return self._last_failure_reason or ( + "Python tool backend is restarting after a connection failure. " + "Abort this attempt and wait for a fresh environment." + ) + + @staticmethod + def _extract_answer(arguments: dict | str | None) -> str: + """Extract answer string from arguments.""" + if arguments is None: + return "" + if isinstance(arguments, str): + try: + parsed = json.loads(arguments) + return str(parsed.get("answer", "")) if isinstance(parsed, dict) else str(parsed) + except json.JSONDecodeError: + return arguments + if isinstance(arguments, dict): + return str(arguments.get("answer", "")) + return str(arguments) + + def _check_python_safety(self, arguments: dict | str | None) -> str | None: + """check for Python code problems""" + code = self._extract_python_code(arguments) + if not code: + return None + for pattern, label in self._python_blocklist: + if pattern.search(code): + return ( + f"Python execution rejected: forbidden call detected ({label}). " + "Use pure computation without exiting the runtime." 
+ ) + return None + + @staticmethod + def _extract_python_code(arguments: dict | str | None) -> str: + if arguments is None: + return "" + if isinstance(arguments, str): + try: + parsed = json.loads(arguments) + if isinstance(parsed, dict): + return str(parsed.get("python_code", parsed.get("code", ""))) + return str(parsed) + except json.JSONDecodeError: + return arguments + if isinstance(arguments, dict): + return str(arguments.get("python_code", arguments.get("code", ""))) + return str(arguments) + + def _make_error_tool_result(self, action: ToolCallAction, message: str) -> ToolResult: + result = CallToolResult( + content=[TextContent(type="text", text=message)], + isError=True, + ) + tool_result = ToolResult( + tool_call_id=getattr(action, "id", ""), + content=result, + ) + tool_result.metadata.other["action_execution_time"] = 0.0 + tool_result.metadata.other["action_kind"] = action.kind + return tool_result + + +class EmbeddedEnvironmentWorker: + def __init__(self, env_cfg: DictConfig, concurrency: int = 1): + # make repeated instantiations stable even if the caller changes its copy + self._env_cfg = OmegaConf.create(env_cfg) + self._cfg_signature = self._make_cfg_signature(self._env_cfg) + self._concurrency = max(1, concurrency) + self._init_lock = asyncio.Lock() + self._available: asyncio.Queue[_ProcessEnvironmentProxy] | None = None + self._all_envs: set[_ProcessEnvironmentProxy] = set() + + @staticmethod + def _make_cfg_signature(cfg: DictConfig) -> str: + try: + container = OmegaConf.to_container(cfg, resolve=True) + except Exception: + container = OmegaConf.to_container(cfg, resolve=False) + return json.dumps(container, sort_keys=True, default=str) + + @property + def concurrency(self) -> int: + return self._concurrency + + def matches(self, env_cfg: DictConfig) -> bool: + return self._cfg_signature == self._make_cfg_signature(env_cfg) + + def set_concurrency(self, concurrency: int) -> None: + self._concurrency = max(1, concurrency) + + async def _ensure_pool(self) -> None: + if self._available is None: + self._available = asyncio.Queue() + if len(self._all_envs) >= self._concurrency: + return + async with self._init_lock: + if len(self._all_envs) >= self._concurrency: + return + missing = self._concurrency - len(self._all_envs) + for _ in range(missing): + environment = _ProcessEnvironmentProxy(self._env_cfg) + try: + await self._init_and_reset(environment) + except Exception: + logger.exception("Failed to initialize embedded environment instance") + await self._close(environment) + raise + self._all_envs.add(environment) + await self._available.put(environment) + + @asynccontextmanager + async def alifecycle(self) -> AsyncIterator[Environment]: + """Context manager for environment lifecycle with automatic health checking.""" + await self._ensure_pool() + assert self._available is not None + + environment = await self._available.get() + try: + await self._reset(environment) + yield environment + finally: + try: + unhealthy = ( + hasattr(environment, "is_healthy") + and not environment.is_healthy() # type: ignore + ) + except Exception: + logger.warning("Failed to query embedded environment health; replacing", exc_info=True) + unhealthy = True + is_healthy = not unhealthy + + if is_healthy: + # try to reset and recycle healthy environment + try: + await self._reset(environment) + if hasattr(environment, "mark_healthy"): + environment.mark_healthy() # type: ignore + await self._available.put(environment) + except Exception: + logger.exception("Failed to recycle embedded 
environment; replacing") + await self._replace(environment) + else: + # environment is unhealthy, replace it + logger.warning("Embedded environment is unhealthy, replacing") + await self._replace(environment) + + async def _replace(self, environment: Environment) -> None: + """Replace a broken environment with a new one.""" + if environment in self._all_envs: + self._all_envs.remove(environment) + try: + await self._close(environment) + except Exception: + logger.exception("Failed to close environment during replacement") + # Refill the pool + await self._ensure_pool() + + async def _init_and_reset(self, env: Environment) -> None: + # init + if hasattr(env, "ainitialize") and inspect.iscoroutinefunction(env.ainitialize): + await env.ainitialize() # type: ignore + else: + loop = asyncio.get_running_loop() + await loop.run_in_executor(None, env.initialize) + + # reset + await self._reset(env) + + async def _reset(self, env: Environment) -> None: + if hasattr(env, "areset") and inspect.iscoroutinefunction(env.areset): + await env.areset() # type: ignore + else: + reset_fn = getattr(env, "reset", None) + if callable(reset_fn): + loop = asyncio.get_running_loop() + await loop.run_in_executor(None, reset_fn) + + async def _close(self, env: Environment) -> None: + loop = asyncio.get_running_loop() + + # try async close first + if hasattr(env, "aclose") and inspect.iscoroutinefunction(env.aclose): + try: + await env.aclose() # type: ignore + return + except Exception as e: + logger.debug(f"Async close failed: {e}, trying sync close") + + # fallback to sync close + try: + await loop.run_in_executor(None, env.close) + except Exception as e: + logger.debug(f"Sync close failed: {e}") diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index cd82e351..f62f0567 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -1,31 +1,46 @@ import asyncio -from urllib.parse import urlparse import time import random import logging from collections import Counter -from typing import List, Dict +from typing import Dict, List import aiohttp +from urllib.parse import urlparse from omegaconf import DictConfig from pipelinerl.domains.mcp.steps import MathAnswer from pipelinerl.world import Job from tapeagents.llms.trainable import TrainableLLM -from tapeagents.remote_environment import AsyncRemoteEnvironment from pipelinerl.async_llm import make_training_text +from tapeagents.environment import Environment from tapeagents.orchestrator import async_execute_agent from tapeagents.agent import DEFAULT, Agent from hydra.utils import instantiate from tapeagents.core import Tape from tapeagents.dialog_tape import UserStep from tapeagents.core import LLMCall +from tapeagents.remote_environment import AsyncRemoteEnvironment -from pipelinerl.domains.math import verify_answer_rpc, RewardTable, get_reward +from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker +from pipelinerl.domains.math import RewardTable, get_reward, verify_answer, verify_answer_rpc from pipelinerl.rollouts import RolloutResult, BaseMetrics logger = logging.getLogger(__name__) +_embedded_worker: EmbeddedEnvironmentWorker | None = None + + +def _get_embedded_worker(env_cfg: DictConfig, concurrency: int) -> EmbeddedEnvironmentWorker: + global _embedded_worker + concurrency = max(1, concurrency) + if _embedded_worker is None or not _embedded_worker.matches(env_cfg): + _embedded_worker = EmbeddedEnvironmentWorker(env_cfg, concurrency=concurrency) + else: + 
_embedded_worker.set_concurrency(concurrency) + return _embedded_worker + + def count_tool_calls_by_category(llm_calls: List[LLMCall]) -> Dict[str, int]: """ Count the number of tool calls for each function name category. @@ -53,6 +68,7 @@ class Metrics(BaseMetrics): total_execution_time: float = -1.0 agent_execution_time: float = -1.0 environment_execution_time: float = -1.0 + overflow: bool = False async def generate_mcp_rollout( cfg: DictConfig, @@ -60,58 +76,90 @@ async def generate_mcp_rollout( problem: dict, session: aiohttp.ClientSession, ) -> RolloutResult: - # choose and retry env servers if one is saturated start = time.perf_counter() - env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] - if not env_jobs: - raise RuntimeError("No environment servers available") - - # shuffle to avoid dead-locking a single server - env_urls_all = [f"http://{job.hostname}:{job.port}" for job in env_jobs if job.port is not None] - if not env_urls_all: - raise RuntimeError("Environment server definitions missing ports") - - while True: - env_urls = env_urls_all[:] - random.shuffle(env_urls) - chosen_url = None - for env_url in env_urls: - try: - environment = AsyncRemoteEnvironment( - server_url=env_url, start_timeout_sec=600, start_repeat_delay=5) - context_manager = environment.acontext(session, wait_for_env=True) - env = await context_manager.__aenter__() + + chosen_url: str | None = None + env_host: str | None = None + env_port: int | None = None + + if cfg.world.environment_mode == "remote": + env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] + if not env_jobs: + raise RuntimeError("No environment servers available") + + env_urls_all = [f"http://{job.hostname}:{job.port}" for job in env_jobs if job.port is not None] + if not env_urls_all: + raise RuntimeError("Environment server definitions missing ports") + + while True: + env_urls = env_urls_all[:] + random.shuffle(env_urls) + chosen_url = None + for env_url in env_urls: + jitter = random.randint(3, 12) try: - await env.start_task(problem) - chosen_url = env_url - actions = await env.a_actions() - tools_description = await env.a_tools_description() - logger.debug(f"Available tools: {tools_description}") - agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) - agent.llms = {DEFAULT: llm} - - tape = Tape(steps=[ - UserStep(content=f"{problem['task']}. 
You have access to the following tools: {tools_description}") - ]) - t_exec = time.perf_counter() - while True: - try: - tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) - tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) - break - except Exception: - await asyncio.sleep(5) - break # success - finally: - await context_manager.__aexit__(None, None, None) - except Exception as e: - # try the next server on errors (503: busyslots) - logger.warning(f"Env start failed at {env_url}: {e}") - continue - if chosen_url is not None: - break # success - # if none succeeded backoff and retry the whole list - await asyncio.sleep(1.0) + environment = AsyncRemoteEnvironment( + server_url=env_url, start_timeout_sec=600, start_repeat_delay=jitter) + context_manager = environment.acontext(session, wait_for_env=True) + env = await context_manager.__aenter__() + try: + await env.start_task(problem) + chosen_url = env_url + actions = await env.a_actions() + tools_description = await env.a_tools_description() + logger.debug(f"Available tools: {tools_description}") + agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) + agent.llms = {DEFAULT: llm} + + tape = Tape(steps=[ + UserStep(content=f"{problem['task']}. You have access to the following tools: {tools_description}") + ]) + t_exec = time.perf_counter() + while True: + try: + tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) + tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) + break + except Exception: + await asyncio.sleep(5) + break # success + finally: + await context_manager.__aexit__(None, None, None) + except Exception as e: + logger.warning(f"Env start failed at {env_url}: {e}") + continue + if chosen_url is not None: + break # success + await asyncio.sleep(1.0) + + parsed = urlparse(chosen_url) + env_host, env_port = parsed.hostname, parsed.port + else: + concurrency = max(1, int(getattr(cfg.world, "env_replicas_per_actor", 1))) + env_worker = _get_embedded_worker(cfg.environment, concurrency) + async with env_worker.alifecycle() as environment: + start_result = environment.start_task(problem) + tape_metadata = start_result if isinstance(start_result, dict) else {} + + actions = environment.actions() + tools_description = environment.tools_description() + logger.debug(f"Embedded tools: {tools_description}") + agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) + agent.llms = {DEFAULT: llm} + tape = Tape( + steps=[ + UserStep( + content=f"{problem['task']}. 
You have access to the following tools: {tools_description}" + ) + ] + ) + if tape_metadata: + tape.metadata.other.update(tape_metadata) + + t_exec = time.perf_counter() + tape = await async_execute_agent(agent, tape, environment, session, max_loops=cfg.agent_max_loops) + tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) + env_host = env_port = None reward_table = RewardTable(**dict(cfg.rewards)) @@ -125,21 +173,87 @@ async def generate_mcp_rollout( tool_call_counts = count_tool_calls_by_category(llm_calls) training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] n_llm_calls = len(llm_calls) - parsed = urlparse(chosen_url) - assert parsed.hostname is not None and parsed.port is not None - answer_status = await verify_answer_rpc( - session=session, - host=parsed.hostname, - port=parsed.port, - prediction=llm_calls[-1].output.content, # type: ignore - gold=problem["answer"], - strict=True, - ) + if env_host and env_port: + answer_status = await verify_answer_rpc( + session=session, + host=env_host, + port=env_port, + prediction=llm_calls[-1].output.content, # type: ignore + gold=problem["answer"], + strict=True, + ) + else: + answer_status = verify_answer( + prediction=llm_calls[-1].output.content, # type: ignore + gold=problem["answer"], + strict=True, + ) # Tape should finish with an answer tape_finished = True if isinstance(tape.steps[-1], MathAnswer) else False - reward = get_reward(answer_status, tape_finished, reward_table) + base_reward = get_reward(answer_status, tape_finished, reward_table) + + # Local reward shaping (configurable in conf/mcp.yaml) + total_shaping = 0.0 + shaping_cfg = getattr(cfg, "python_tool_shaping", None) + if shaping_cfg is not None: + num_python_calls = tool_call_counts.get("run_python_code", 0) + bonus_on_correct_with_python = float(getattr(shaping_cfg, "bonus_on_correct_with_python", 0.0)) + penalty_on_incorrect_without_python = float(getattr(shaping_cfg, "penalty_on_incorrect_without_python", 0.0)) + max_abs = float(getattr(shaping_cfg, "max_abs", 0.2)) + + # Episode-level bonuses/penalties + if answer_status == "correct" and num_python_calls >= 1: + total_shaping += bonus_on_correct_with_python + if answer_status in ("wrong", "unparsable") and num_python_calls == 0: + total_shaping -= penalty_on_incorrect_without_python + + # Clamp total shaping + if total_shaping > max_abs: + total_shaping = max_abs + if total_shaping < -max_abs: + total_shaping = -max_abs + + # Length shaping: discourage very long completions; award concise correct ones + length_cfg = getattr(cfg, "length_shaping", None) + if length_cfg is not None: + try: + # Prefer ratio-based target if provided; otherwise use absolute + if hasattr(length_cfg, "target_ratio"): + ratio = float(getattr(length_cfg, "target_ratio")) + max_gen = int(llm.parameters.get("max_tokens", 2048)) + target_tokens = int(max(1, ratio * max_gen)) + # Optional clamps + min_t = int(getattr(length_cfg, "min_target_tokens", 0)) + max_t = int(getattr(length_cfg, "max_target_tokens", 10**9)) + target_tokens = max(min_t, min(max_t, target_tokens)) + else: + target_tokens = int(getattr(length_cfg, "target_output_tokens", 512)) + slope = float(getattr(length_cfg, "slope", 0.0)) + max_penalty = float(getattr(length_cfg, "max_penalty", 0.0)) + bonus_short_correct = float(getattr(length_cfg, "bonus_on_short_correct", 0.0)) + except Exception: + target_tokens, slope, max_penalty, bonus_short_correct = 512, 0.0, 0.0, 0.0 + + # average output tokens across llm calls for this 
rollout + try: + avg_output_tokens = sum(t.output_tokens for t in training_texts) / max(1, len(training_texts)) + except Exception: + avg_output_tokens = 0.0 + + if slope > 0.0 and max_penalty > 0.0 and avg_output_tokens > target_tokens: + over_by = float(avg_output_tokens - target_tokens) + penalty = min(max_penalty, slope * over_by) + total_shaping -= penalty + + if bonus_short_correct > 0.0 and answer_status == "correct" and avg_output_tokens <= target_tokens: + total_shaping += bonus_short_correct + + reward = base_reward + total_shaping + + # Assign identical reward to all steps in the rollout (pipeline expects uniform rollout_reward) for text in training_texts: text.reward = reward + text.finished = tape_finished latency = time.perf_counter() - start @@ -159,6 +273,7 @@ async def generate_mcp_rollout( total_execution_time=total_time, agent_execution_time=agent_time, environment_execution_time=env_time, + overflow=not tape_finished, ) return RolloutResult( diff --git a/pipelinerl/domains/mcp/steps.py b/pipelinerl/domains/mcp/steps.py index f33d6efa..9b29a717 100644 --- a/pipelinerl/domains/mcp/steps.py +++ b/pipelinerl/domains/mcp/steps.py @@ -1,13 +1,13 @@ from typing import Any, Literal from pydantic import Field -from tapeagents.core import StopStep +from tapeagents.core import FinalObservation -class MathAnswer(StopStep): +class MathAnswer(FinalObservation): """ Action that indicates the agent has finished solving a math problem. The final answer must be contained within \\boxed{} format. """ kind: Literal["math_answer_action"] = "math_answer_action" - answer: Any = Field(description="Final answer in \\boxed{} format") \ No newline at end of file + answer: Any = Field(description="Final answer in \\boxed{} format") diff --git a/pipelinerl/launch.py b/pipelinerl/launch.py index e56c0e80..be5c8faf 100644 --- a/pipelinerl/launch.py +++ b/pipelinerl/launch.py @@ -1,6 +1,7 @@ import logging import math import os +import shlex import shutil import subprocess import sys @@ -157,6 +158,29 @@ def run_actor_llm( str(world_map.weight_update_group_size), ] + # Provide deterministic rendezvous port defaults when env vars are absent. + # vLLM spins up a torch.distributed TCPStore using VLLM_PORT. On the remote + # scheduler we observed replica crashes (store collisions, connection + # refused) because every start script inherited the same default port. By + # exporting VLLM_PORT_BASE/VLLM_PORT_STRIDE we carve out a rendezvous range + # per actor_idx while keeping the public HTTP listener at 8080+local_idx. + env = dict(os.environ) + if "VLLM_PORT_BASE" not in env: + # Each rank gets 1000 ports; 43000 leaves room below. 
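+        # e.g. rank 0 -> base 43000, rank 1 -> 44000; vllm0.run_server then derives
+        # VLLM_PORT = base + VLLM_PORT_STRIDE * actor_llm_idx, so with the default
+        # stride of 20 the fourth actor (idx 3) on rank 1 would land on port 44060.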
+ env["VLLM_PORT_BASE"] = str(43000 + 1000 * world_map.my_rank) + logger.debug( + "Setting default VLLM_PORT_BASE=%s for rank %s", + env["VLLM_PORT_BASE"], world_map.my_rank, + ) + if "VLLM_PORT_STRIDE" not in env: + env["VLLM_PORT_STRIDE"] = "20" + + env_overrides = { + key: str(env[key]) + for key in ("VLLM_PORT_BASE", "VLLM_PORT_STRIDE") + if key in env + } + # Add vLLM kwargs as separate arguments if cfg.vllm_config.vllm_kwargs: for k, v in cfg.vllm_config.vllm_kwargs.items(): @@ -169,13 +193,13 @@ def run_actor_llm( gpu_str = ",".join([str(gpu) for gpu in gpus]) logger.info(f"Running actor_llm with command: {' '.join(cmd)} on gpus: {gpu_str}") - save_command(log_dir, cmd) + save_command(log_dir, cmd, env_overrides or None) log_file_path = os.path.join(log_dir, "stdout.log") err_file_path = os.path.join(log_dir, "stderr.log") with open(log_file_path, "a") as log_file, open(err_file_path, "a") as err_file: yield _popen( cmd, - env={**os.environ, "CUDA_VISIBLE_DEVICES": gpu_str}, + env={**env, "CUDA_VISIBLE_DEVICES": gpu_str}, stdout=log_file, stderr=err_file, ) @@ -372,14 +396,21 @@ def run_redis(cfg: DictConfig): yield _popen(cmd, env=dict(os.environ)) -def save_command(script_dir: Path, cmd): +def save_command(script_dir: Path, cmd, env: dict | None = None): os.makedirs(script_dir, exist_ok=True) script_path = script_dir / "start.sh" with open(script_path, "w") as f: f.write("#!/bin/bash\n") + f.write("set -e\n") + if env: + for key, value in sorted(env.items()): + quoted_value = shlex.quote(value) + f.write(f"export {key}={quoted_value}\n") # Properly quote arguments for the shell script - quoted_cmd = [f"'{arg}'" if " " in arg or "$" in arg else arg for arg in cmd] - f.write(" ".join(quoted_cmd) + "\n") + quoted_cmd = [shlex.quote(arg) for arg in cmd] + f.write("exec ") + f.write(" ".join(quoted_cmd)) + f.write("\n") os.chmod(script_path, 0o755) logger.info(f"Saved start script to {script_path}") diff --git a/pipelinerl/rl_tool_parser_plugin.py b/pipelinerl/rl_tool_parser_plugin.py index 194a5d87..12e6fc2d 100644 --- a/pipelinerl/rl_tool_parser_plugin.py +++ b/pipelinerl/rl_tool_parser_plugin.py @@ -4,7 +4,8 @@ import json import re -from typing import Any, Dict, List, Optional, Union, Sequence +from typing import Any # noqa: F401 +import logging from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser from vllm.entrypoints.openai.tool_parsers import ToolParserManager @@ -20,6 +21,9 @@ class HermesRLToolParser(ToolParser): """ Tool parser for RL tool calling format using markers. + Supports both standard format and Apriel-style formats: + - [{...}, {...}] (preferred if present) + - [BEGIN FINAL RESPONSE] ... [END FINAL RESPONSE] wrapper """ def __init__(self, tokenizer): @@ -34,6 +38,16 @@ def __init__(self, tokenizer): r"(.*?)|(.*)", re.DOTALL ) + # Apriel-specific patterns + self.apriel_final_response_regex = re.compile( + r"\[BEGIN FINAL RESPONSE\](.*?)\[END FINAL RESPONSE\]", re.DOTALL + ) + # Prefer parsing aggregated tool calls from ... + # Be lenient: case-insensitive; tolerate missing closing tag by capturing to end. 
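+        # The captured payload is expected to be a JSON list of calls, e.g. (illustrative)
+        # [{"name": "run_python_code", "arguments": {"code": "1 + 1"}}]; each entry is
+        # converted into a ToolCall with json-serialized arguments further below.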
+ self.apriel_tool_calls_regex = re.compile( + r"\s*(.*?)\s*(?:|$)", re.DOTALL | re.IGNORECASE + ) + # State for streaming self.current_tool_name_sent = False self.prev_tool_call_arr = [] @@ -51,47 +65,180 @@ def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) Returns: ExtractedToolCallInformation with tool calls and metadata """ - # Quick check to avoid unnecessary processing - if self.tool_call_start_token not in model_output: - return ExtractedToolCallInformation( - tools_called=False, - tool_calls=[], - content=model_output - ) - + logger = logging.getLogger("pipelinerl.tool_parser") + # Ensure variable exists for any fallback references below + final_response_match = None + try: + # 1) Apriel aggregated tool calls block has priority + tool_calls_matches = list(self.apriel_tool_calls_regex.finditer(model_output)) + if tool_calls_matches: + # Use the last match (in case of multiple blocks) + last_match = tool_calls_matches[-1] + tool_calls_json = last_match.group(1).strip() + parsed_calls = [] + try: + parsed_calls = json.loads(tool_calls_json) if tool_calls_json else [] + except Exception: + logger.debug("Failed to parse aggregated JSON; falling back", exc_info=True) + parsed_calls = [] + + tool_calls: list[ToolCall] = [] + for i, pc in enumerate(parsed_calls): + try: + name = pc.get("name", "") + args_obj = pc.get("arguments", {}) + if not isinstance(args_obj, (dict, list, str, int, float, bool)): + args_obj = {} + args_str = json.dumps(args_obj, ensure_ascii=False) + call_id = pc.get("id", f"call_{i}") + tool_calls.append( + ToolCall( + id=call_id, + type="function", + function=FunctionCall(name=str(name), arguments=args_str), + ) + ) + except Exception: + logger.debug("Skipping malformed aggregated tool call", exc_info=True) + continue + + # Prefer final response content if present; otherwise empty string + final_response_match = self.apriel_final_response_regex.search(model_output) + content = final_response_match.group(1).strip() if final_response_match else "" + + return ExtractedToolCallInformation( + tools_called=bool(tool_calls), + tool_calls=tool_calls, + content=content, + ) + + # 2) Try bare JSON tool-calls (no tags), but only if tools are declared in the request + # Accept either a list of {name, arguments} or a single dict + try: + tools_declared = bool(getattr(request, "tools", None)) + except Exception: + tools_declared = False + + if tools_declared: + candidate_strings: list[str] = [] + final_response_match = self.apriel_final_response_regex.search(model_output) + if final_response_match: + candidate_strings.append(final_response_match.group(1).strip()) + candidate_strings.append(model_output.strip()) + + for candidate in candidate_strings: + try: + parsed = json.loads(candidate) + except Exception: + continue + parsed_list = [] + if isinstance(parsed, dict) and "name" in parsed and "arguments" in parsed: + parsed_list = [parsed] + elif isinstance(parsed, list) and all(isinstance(it, dict) for it in parsed): + parsed_list = [it for it in parsed if "name" in it and "arguments" in it] + if not parsed_list: + continue + tool_calls: list[ToolCall] = [] + for i, pc in enumerate(parsed_list): + try: + name = pc.get("name", "") + args_obj = pc.get("arguments", {}) + if not isinstance(args_obj, (dict, list, str, int, float, bool)): + args_obj = {} + args_str = json.dumps(args_obj, ensure_ascii=False) + call_id = pc.get("id", f"call_{i}") + tool_calls.append( + ToolCall( + id=call_id, + type="function", + function=FunctionCall(name=str(name), 
arguments=args_str), + ) + ) + except Exception: + logger.debug("Skipping malformed bare-JSON tool call", exc_info=True) + continue + content = final_response_match.group(1).strip() if final_response_match else "" + return ExtractedToolCallInformation( + tools_called=bool(tool_calls), + tool_calls=tool_calls, + content=content, + ) + + # 3) Fallback: look for single blocks (legacy / other models) + content_to_search = model_output + final_response_match = self.apriel_final_response_regex.search(model_output) + if final_response_match: + final_response_content = final_response_match.group(1).strip() + if self.tool_call_start_token in final_response_content: + content_to_search = final_response_content + elif self.tool_call_start_token not in model_output: + # No tool calls found, return final response as content + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=final_response_content + ) + + # Quick check to avoid unnecessary processing + if self.tool_call_start_token not in content_to_search: + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=model_output + ) + # Find all tool call matches - function_call_tuples = self.tool_call_regex.findall(model_output) - + function_call_tuples = self.tool_call_regex.findall(content_to_search) + # Parse JSON from matches tool_calls = [] for i, match in enumerate(function_call_tuples): json_str = match[0] if match[0] else match[1] try: parsed_call = json.loads(json_str.strip()) - + args_obj = parsed_call.get("arguments", {}) + if not isinstance(args_obj, (dict, list, str, int, float, bool)): + args_obj = {} tool_call = ToolCall( id=f"call_{i}", type="function", function=FunctionCall( - name=parsed_call.get("name", ""), - arguments=json.dumps( - parsed_call.get("arguments", {}), - ensure_ascii=False - ) + name=str(parsed_call.get("name", "")), + arguments=json.dumps(args_obj, ensure_ascii=False) ) ) tool_calls.append(tool_call) - except json.JSONDecodeError: + except Exception: + logger.debug("Skipping malformed JSON", exc_info=True) continue - + + # Determine content based on whether we found tool calls + if tool_calls and final_response_match: + # If we found tool calls in final response, use just the tool calls + content = "" + elif final_response_match: + # If we have final response but no tool calls there, use final response + content = final_response_match.group(1).strip() + else: + # Standard processing + content = model_output + return ExtractedToolCallInformation( tools_called=bool(tool_calls), tool_calls=tool_calls, - content=model_output + content=content ) - + except Exception: + # Never propagate exceptions to the server; log and return a safe fallback. + logger.exception("Tool parser encountered an exception; returning safe fallback.") + if final_response_match: + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=final_response_match.group(1).strip() + ) return ExtractedToolCallInformation( tools_called=False, tool_calls=[], diff --git a/pipelinerl/utils.py b/pipelinerl/utils.py index 2b0a252c..7cb58ede 100644 --- a/pipelinerl/utils.py +++ b/pipelinerl/utils.py @@ -293,19 +293,19 @@ def wait_for_inference_servers(urls: list[str]): def wait_for_environments(cfg: DictConfig): - """ - Wait for the verifier to be ready. 
- """ + """Wait for remote environment servers to report healthy.""" + if cfg.world.environment_mode != "remote": + return + env_jobs = [Job(**job) for job in cfg.jobs if job.kind == "environment"] for job in env_jobs: while True: url = f"http://{job.hostname}:{job.port}/health" - # use requests try: response = requests.get(url) if response.status_code == 200: break - except: + except requests.exceptions.RequestException: logger.info(f"Waiting for environment at {url} to be ready...") time.sleep(5.0) diff --git a/pipelinerl/vllm0.py b/pipelinerl/vllm0.py index 92c51085..32c17093 100644 --- a/pipelinerl/vllm0.py +++ b/pipelinerl/vllm0.py @@ -180,6 +180,25 @@ async def run_server(args, **uvicorn_kwargs) -> None: f"invalid tool call parser: {args.tool_call_parser} (chose from {{ {','.join(valide_tool_parses)} }})" ) + # Choose a unique rendezvous port per actor to avoid torch.distributed + # TCPStore collisions across concurrently launched vLLM processes. + try: + if "VLLM_PORT" not in os.environ: + actor_idx = getattr(args, "actor_llm_idx", None) + base_str = os.environ.get("VLLM_PORT_BASE", "") + stride_str = os.environ.get("VLLM_PORT_STRIDE", "10") + if actor_idx is not None and base_str.isdigit(): + base = int(base_str) + stride = int(stride_str) if stride_str.isdigit() else 10 + port = base + stride * int(actor_idx) + os.environ["VLLM_PORT"] = str(port) + logger.info( + "Using VLLM_PORT=%s (base=%s stride=%s actor_idx=%s)", + port, base, stride, actor_idx, + ) + except Exception as e: + logger.warning("Failed to set VLLM_PORT from actor_idx: %s", e) + # workaround to make sure that we bind the port before the engine is set up. # This avoids race conditions with ray. # see https://github.com/vllm-project/vllm/issues/8204 diff --git a/pipelinerl/world.py b/pipelinerl/world.py index cc23afd0..6a06fc9f 100644 --- a/pipelinerl/world.py +++ b/pipelinerl/world.py @@ -71,7 +71,7 @@ def __init__(self, cfg: DictConfig, verbose: bool = False): if place_inference_jobs: self._place_inference_jobs(cfg) self._place_pipeline_stages(cfg) - if cfg.environment: + if cfg.environment and cfg.world.environment_mode == "remote": self._place_environments(cfg) # Place the finetune workers on the remaining gpus, take all remaining GPUs From bd46a7d69c40b9d6a5e108659998c26c8fc9971a Mon Sep 17 00:00:00 2001 From: rafapi Date: Tue, 30 Sep 2025 18:43:01 +0000 Subject: [PATCH 068/126] Remove imports --- pipelinerl/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pipelinerl/utils.py b/pipelinerl/utils.py index 7cb58ede..6243c2c7 100644 --- a/pipelinerl/utils.py +++ b/pipelinerl/utils.py @@ -6,14 +6,13 @@ import time from pathlib import Path import traceback -from typing import Dict, Mapping, List, Any, Union +from typing import Dict, Mapping, List, Any import numpy as np from omegaconf import DictConfig import psutil import requests from importlib.metadata import distributions from transformers import PreTrainedTokenizer -from collections import defaultdict from pipelinerl.world import Job from tapeagents.llms import LLMOutput @@ -321,7 +320,7 @@ def better_crashing(entrypoint_name: str): # get process if of the current process process_id = os.getpid() terminate_with_children(process_id) - logger.error(f"I should not even be here...") + logger.error("I should not even be here...") import sys sys.exit(1) From 724f318daf22e058ef277e2b32f16fc0114f140d Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 1 Oct 2025 17:15:17 +0000 Subject: [PATCH 069/126] sketch of new actor loop class, 
reuse most of the current one --- pipelinerl/actor.py | 105 +++++++++++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 30 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 38b2daf2..d1907ed4 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -4,24 +4,26 @@ import multiprocessing as mp import os import queue -from queue import Empty import random import time from collections import defaultdict from multiprocessing.managers import SharedMemoryManager from pathlib import Path +from queue import Empty +from typing import Dict, List import aiohttp import hydra +import ray import uvloop from omegaconf import DictConfig from pydantic import BaseModel, Field from tapeagents.llms import TrainableLLM -from typing import Dict, List +from tapeagents.orchestrator import save_debug_line import wandb from pipelinerl.finetune.logging_ import flatten_dict_config, init_wandb -from pipelinerl.rollouts import RolloutResult, BaseMetrics +from pipelinerl.rollouts import BaseMetrics, RolloutResult from pipelinerl.shared_memory_array import SharedMemoryQueue from pipelinerl.state import TrainerState from pipelinerl.streams import ( @@ -107,6 +109,10 @@ def make_stats_dict() -> dict: return defaultdict(lambda: defaultdict(list)) +def get_number_of_tokens_in_result(result: RolloutResult) -> int: + return sum(training_text.prompt_tokens + training_text.output_tokens for training_text in result.training_texts) + + async def schedule_rollouts( cfg: DictConfig, attempts: int, @@ -132,6 +138,7 @@ async def schedule_rollouts( active_rollouts = [0] * len(llms) started_rollouts = 0 finished_rollouts = 0 + token_count = 0 # Track rollouts per problem group group_rollouts = {} rollout_policy = hydra.utils.get_method(cfg.actor.rollout_policy) @@ -144,13 +151,16 @@ async def rollout_and_maybe_produce_result( llm_index: int, session: aiohttp.ClientSession, ): - nonlocal started_rollouts, finished_rollouts + nonlocal started_rollouts, finished_rollouts, token_count try: llm = llms[llm_index] model_version = trainer_state.propagated_weight_version assert model_version is not None - rollout_result = await rollout_policy(cfg, llm, problem, session) + logger.info(f"Starting rollout policy for problem {problem['id']}") + rollout_result: RolloutResult = await rollout_policy(cfg, llm, problem, session) + logger.info(f"Finished rollout policy for problem {problem['id']}") rollout_result.model_version = model_version + token_count += get_number_of_tokens_in_result(rollout_result) # Make a group id that will be different from groups made by another rollout maker full_group_id = f"{scheduler_name}_{group_id}" rollout_result.group_id = full_group_id @@ -187,15 +197,20 @@ async def rollout_and_maybe_produce_result( logger.info("Starting rollout scheduler") connector = aiohttp.TCPConnector(limit=50000, limit_per_host=50000, keepalive_timeout=1.0) timeout = aiohttp.ClientTimeout(total=3600.0, connect=3600.0, sock_read=3600.0) + old_finished_rollouts = 0 async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: while True: if time.time() - last_logged > 10.0 and sum(active_rollouts): + if finished_rollouts > old_finished_rollouts: + old_finished_rollouts = finished_rollouts + save_debug_line({"rollouts_finished": finished_rollouts, "tokens_produced": token_count}) logger.info( f"{scheduler_name}: " f"rollouts in progress: {sum(active_rollouts)}, " f"groups in progress: {len(group_rollouts)}, " f"rollouts started so far: {started_rollouts}, " f"rollouts finished so 
far: {finished_rollouts}, " + f"total tokens produced so far: {token_count}, " f"max group size in bytes: {result_queue.max_actual_entry_size()}, " ) last_logged = time.time() @@ -217,7 +232,6 @@ async def rollout_and_maybe_produce_result( await asyncio.sleep(0.01) continue active_rollouts[next_llm] += 1 - started_rollouts += 1 assert problem is not None loop.create_task( rollout_and_maybe_produce_result( @@ -228,6 +242,7 @@ async def rollout_and_maybe_produce_result( session=session, ) ) + started_rollouts += 1 group_rollout_index += 1 logger.info("Rollout scheduler finished") @@ -281,40 +296,41 @@ def __init__( self.sliding_aggregator = SlidingWindowAggregator(window_size=cfg.actor.throughput_window_size) self.llms = llms self.loop_start_time = -1 - self.cfg = cfg + self.cfg: DictConfig = cfg self.is_training = is_training self.is_scheduling_paused = False self.debug_mode = bool(cfg.debug.mode) # Determine the number of processes to use num_processes = min(self.cfg.actor.rollout_workers, len(self.llms)) - attempts = self.cfg.attempts if is_training else 1 # Divide LLMs approximately equally across processes - llm_groups = [[] for _ in range(num_processes)] + self.llm_groups = [[] for _ in range(num_processes)] for i, llm in enumerate(self.llms): - llm_groups[i % num_processes].append((i, llm)) + self.llm_groups[i % num_processes].append((i, llm)) self.smm = SharedMemoryManager() self.smm.start() - + # Use SharedMemoryQueue instead of separate problem_queue, result_queue, and io_buffer self.problem_queue = SharedMemoryQueue(self.smm, self.cfg.actor.problem_queue_size, cfg.actor.shared_memory_entry_size) self.result_queue = SharedMemoryQueue(self.smm, self.cfg.actor.result_queue_size, cfg.actor.shared_memory_entry_size) - + logger.info(f"Initialized {'train' if self.is_training else 'test'} actor loop") logger.info(f"Problem queue size: {self.problem_queue.max_size}, result queue size: {self.result_queue.max_size}") logger.info(f"Result queue buffer size: {self.result_queue.get_memory_size() / 2**30} Gb") + def start_backend(self): # Create and start multiple rollout processes + attempts = self.cfg.attempts if self.is_training else 1 self.rollout_processes = [] - for llm_group in llm_groups: + for llm_group in self.llm_groups: assert llm_group llm_idxs = [llm[0] for llm in llm_group] llms = [llm[1] for llm in llm_group] scheduler_name = ( - f"{'train' if is_training else 'test'} scheduler for llms {','.join([str(i) for i in llm_idxs])}" + f"{'train' if self.is_training else 'test'} scheduler for llms {','.join([str(i) for i in llm_idxs])}" ) process = mp.Process( target=rollout_maker_entrypoint, @@ -328,15 +344,15 @@ def init_stats(self): self.latency_list = [] self.model_versions_list = [] self.sliding_stats = defaultdict(list) - + def compute_domain_agnostic_metrics(self, result: RolloutResult) -> Dict[str, float]: metrics = {} - + metrics['overflow'] = all([not training_text.finished for training_text in result.training_texts ]) metrics['num_turns'] = len(result.training_texts) metrics['prompt_tokens'] = [training_text.prompt_tokens for training_text in result.training_texts] metrics['output_tokens'] = [training_text.output_tokens for training_text in result.training_texts] - + return metrics def update_stats(self, rollout_results: List[RolloutResult]): @@ -347,7 +363,7 @@ def update_stats(self, rollout_results: List[RolloutResult]): group_id = result.group_id self.latency_list.append(result.latency) self.model_versions_list.append(result.model_version) - domain_agnostic_metrics = 
self.compute_domain_agnostic_metrics(result) + domain_agnostic_metrics = self.compute_domain_agnostic_metrics(result) all_metrics = result.metrics.model_dump() | domain_agnostic_metrics all_metrics["used_python"] = int(all_metrics.get("used_python", False)) all_metrics["used_math_answer"] = int(all_metrics.get("used_math_answer", False)) @@ -358,7 +374,7 @@ def update_stats(self, rollout_results: List[RolloutResult]): self.stats[k][dataset_name][group_id].append(v) else: raise ValueError(f"Unsupported metric type: {type(v)} for key {k}") - + prompt_length_tokens = [training_text.prompt_tokens for result in rollout_results for training_text in result.training_texts] output_length_tokens = [training_text.output_tokens for result in rollout_results for training_text in result.training_texts] self.sliding_aggregator.update(prompt_length_tokens, output_length_tokens) @@ -366,7 +382,7 @@ def update_stats(self, rollout_results: List[RolloutResult]): if sliding_window_stats is not None: for k, v in sliding_window_stats.items(): self.sliding_stats[k].append(v) - + def run(self, dataset: list[tuple[str, dict]]): @@ -443,9 +459,9 @@ def run(self, dataset: list[tuple[str, dict]]): try: try: problem = next(problem_iter) - self.problem_queue.put(problem, block=False) + self.submit_problem(problem) submitted_groups += 1 - except queue.Full: + except queue.Full: assert False, "Problem queue was not full just a moment ago, but now it is full" except StopIteration: break @@ -455,7 +471,7 @@ def run(self, dataset: list[tuple[str, dict]]): # Second, try return a result try: # Directly get the result from the SharedMemoryQueue - rollout_results = self.result_queue.get(block=False) + rollout_results = self.check_for_new_results() except queue.Empty: continue @@ -484,14 +500,14 @@ def run(self, dataset: list[tuple[str, dict]]): f" {in_progress} groups in progress" ) - + self.update_stats(rollout_results=rollout_results) finished_groups += 1 time_to_publish_train_stats = ( self.is_training and trainer_version_to_publish is not None - ) or self.debug_mode + ) or self.debug_mode time_to_publish_test_stats = finished_groups == expected_rollouts # Publish stats at every new model version or if all tapes are finished @@ -502,7 +518,7 @@ def run(self, dataset: list[tuple[str, dict]]): "problem_queue_size": self.problem_queue.qsize(), "result_queue_size": self.result_queue.qsize(), "finished_groups": finished_groups, - "trainer_model_version": trainer_version_to_publish, + "trainer_model_version": trainer_version_to_publish, "time_since_start": time.time() - loop_start_time, "groups_in_progress": in_progress, } @@ -520,6 +536,7 @@ def run(self, dataset: list[tuple[str, dict]]): if finished_groups == expected_rollouts: logger.info(f"Finished {expected_rollouts} rollouts, stopping actor loop") + self.stop_tasks() break def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict): @@ -572,6 +589,34 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict): stats_writer.write(stats) self.init_stats() # Reset stats for the next iteration + def submit_problem(self, problem: dict): + self.problem_queue.put(problem, block=False) + + def stop_tasks(self): + pass + + def check_for_new_results(self): + rollout_results = self.result_queue.get(block=False) + return rollout_results + + +class ActorLoop2(ActorLoop): + """ + Loop that runs the ray tasks for n_jobs to perform rollouts in parallel + """ + def start_backend(self): + ray.init(num_cpus=self.cfg.actor.rollout_workers, dashboard_host="0.0.0.0") + + def 
submit_problem(self, problem: dict): + pass + + def stop_tasks(self): + pass + + def check_for_new_results(self): + pass + + def run_actor_loop(cfg: DictConfig): set_streams_backend(**cfg.streams) @@ -609,7 +654,7 @@ def run_actor_loop(cfg: DictConfig): actor_model_path = finetune_model_path else: actor_model_path = cfg.model_path - + # Align client-side context size with vLLM server max_model_len when available try: _context_size = int(cfg.vllm_config.vllm_kwargs.max_model_len) @@ -655,9 +700,8 @@ def run_actor_loop(cfg: DictConfig): train_loop = ActorLoop( data_stream=data_stream, cfg=cfg, trainer_state=trainer_state, stats_stream=stats_stream, llms=train_llms ) - train_loop_run = train_loop.run( - dataset=train_dataset, - ) + train_loop.start_backend() + train_loop_run = train_loop.run(dataset=train_dataset) test_loop = ActorLoop( data_stream=test_data_stream, cfg=cfg, @@ -687,6 +731,7 @@ def run_actor_loop(cfg: DictConfig): and test_loop_run is None ): logger.info("Create test loop") + test_loop.start_backend() test_loop_run = test_loop.run( dataset=test_dataset, ) From b5c8d8917c272a881a4a2ad43ddeec5c6b279145 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 1 Oct 2025 17:16:02 +0000 Subject: [PATCH 070/126] seq len 32k fits 1 h100, use qwen3-8b --- conf/mcp.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index cf85ca18..330c6c9e 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -20,7 +20,7 @@ actor: shared_memory_entry_size: 10000000 finetune: - seq_length: 128000 + seq_length: 32000 seq_parallel: 8 dataset_loader: pipelinerl.domains.math.load_datasets @@ -38,7 +38,7 @@ vllm_config: tool-parser-plugin: ${hydra:runtime.cwd}/pipelinerl/rl_tool_parser_plugin.py max-num-seqs: ${actor.llm_max_rollouts} max-num-batched-tokens: 4096 - max_model_len: 128000 + max_model_len: 32000 gpu-memory-utilization: 0.85 environment: @@ -142,8 +142,8 @@ agent: trim_obs_except_last_n: 2 next_node: code -# model_path: Qwen/Qwen3-8B -model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft +model_path: Qwen/Qwen3-8B +# model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft # Local reward shaping for tool usage python_tool_shaping: From b2fbc2b4cbca8a4670acb382f4fa3919c01bf738 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 1 Oct 2025 17:16:17 +0000 Subject: [PATCH 071/126] debug entrypoint --- debug.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100755 debug.sh diff --git a/debug.sh b/debug.sh new file mode 100755 index 00000000..c1e2822a --- /dev/null +++ b/debug.sh @@ -0,0 +1,18 @@ +#!/bin/bash +python -m pipelinerl.launch \ + output_dir=results/actor_debug1 \ + force_restart=true \ + world.env_replicas_per_actor=1 \ + actor.llm_max_rollouts=16 \ + finetune.seq_parallel=8 \ + eval_every_n_versions=0 \ + actor.rollout_workers=1 \ + debug.mode=actor \ + world.actor_fraction=8 \ + world.finetune_fraction=0 \ + world.preprocessor_fraction=0 \ + --config-name mcp + + # environment.n_envs=4 \ + # environment.mcp_read_timeout_seconds=300 \ + # environment.env_call_timeout=300 \ \ No newline at end of file From 550cb6369558fce34e3838e2b9a6799b67e6106a Mon Sep 17 00:00:00 2001 From: rafapi Date: Thu, 2 Oct 2025 07:57:31 +0000 Subject: [PATCH 072/126] Increase shared_memory_entry_size --- conf/mcp.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index cf85ca18..43ebf586 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml 
@@ -17,7 +17,10 @@ actor: llm_max_rollouts: 64 task_template: |- {task} - shared_memory_entry_size: 10000000 + shared_memory_entry_size: 200000000 + +preprocess: + shared_memory_entry_size: 2000000000 finetune: seq_length: 128000 From c13a71b2c342ff4e6ebf4f75eac15596fb6a5487 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 2 Oct 2025 13:17:54 +0000 Subject: [PATCH 073/126] synchronous rollout policy --- pipelinerl/domains/mcp/__init__.py | 4 +- pipelinerl/domains/mcp/rollouts.py | 189 +++++++++++++++++++++++++---- 2 files changed, 170 insertions(+), 23 deletions(-) diff --git a/pipelinerl/domains/mcp/__init__.py b/pipelinerl/domains/mcp/__init__.py index 4218ca1b..4557fa53 100644 --- a/pipelinerl/domains/mcp/__init__.py +++ b/pipelinerl/domains/mcp/__init__.py @@ -1,2 +1,2 @@ -from .rollouts import generate_mcp_rollout -from .env_server import EmbeddedMCPEnvironment, MCPEnvironmentServer, EmbeddedEnvironmentWorker +from .env_server import EmbeddedEnvironmentWorker, EmbeddedMCPEnvironment, MCPEnvironmentServer +from .rollouts import generate_mcp_rollout, generate_mcp_rollout_with_local_env diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index f62f0567..c867cbc4 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -1,29 +1,28 @@ import asyncio -import time +import logging import random -import logging +import time from collections import Counter from typing import Dict, List +from urllib.parse import urlparse import aiohttp -from urllib.parse import urlparse -from omegaconf import DictConfig -from pipelinerl.domains.mcp.steps import MathAnswer -from pipelinerl.world import Job -from tapeagents.llms.trainable import TrainableLLM -from pipelinerl.async_llm import make_training_text -from tapeagents.environment import Environment -from tapeagents.orchestrator import async_execute_agent -from tapeagents.agent import DEFAULT, Agent from hydra.utils import instantiate -from tapeagents.core import Tape +from omegaconf import DictConfig, OmegaConf +from tapeagents.agent import DEFAULT, Agent +from tapeagents.core import LLMCall, Tape from tapeagents.dialog_tape import UserStep -from tapeagents.core import LLMCall +from tapeagents.llms.trainable import TrainableLLM +from tapeagents.mcp import MCPEnvironment +from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config from tapeagents.remote_environment import AsyncRemoteEnvironment -from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker +from pipelinerl.async_llm import make_training_text from pipelinerl.domains.math import RewardTable, get_reward, verify_answer, verify_answer_rpc -from pipelinerl.rollouts import RolloutResult, BaseMetrics +from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker +from pipelinerl.domains.mcp.steps import MathAnswer +from pipelinerl.rollouts import BaseMetrics, RolloutResult +from pipelinerl.world import Job logger = logging.getLogger(__name__) @@ -44,20 +43,20 @@ def _get_embedded_worker(env_cfg: DictConfig, concurrency: int) -> EmbeddedEnvir def count_tool_calls_by_category(llm_calls: List[LLMCall]) -> Dict[str, int]: """ Count the number of tool calls for each function name category. 
- + Args: llm_calls: List of LLMCall objects - + Returns: Dictionary mapping function names to their counts """ tool_call_names = [] - + for llm_call in llm_calls: if llm_call.output.tool_calls: for tool_call in llm_call.output.tool_calls: tool_call_names.append(tool_call.function.name) - + return dict(Counter(tool_call_names)) @@ -260,8 +259,8 @@ async def generate_mcp_rollout( agent_time = tape.metadata.result.get("agent_execution_time", -1.0) env_time = tape.metadata.result.get("environment_execution_time", -1.0) total_time = tape.metadata.result.get("total_execution_time", -1.0) - - + + metrics = Metrics( reward=reward, success=answer_status == "correct", @@ -282,3 +281,151 @@ async def generate_mcp_rollout( latency=latency, dataset_name=problem["dataset"], ) + + + +def generate_mcp_rollout_with_local_env( + cfg: DictConfig | dict, + llm: TrainableLLM, + problem: dict, +) -> RolloutResult: + start = time.perf_counter() + if isinstance(cfg, dict): + cfg = OmegaConf.create(cfg) + agent, _env = get_agent_and_env_from_config(cfg) + environment: MCPEnvironment = _env + logger.info("Agent and environment loaded") + try: + start_result = environment.start_task(problem) + logger.info("Task started") + tape_metadata = start_result if isinstance(start_result, dict) else {} + agent.llms = {DEFAULT: llm} + tape = Tape( + steps=[ + UserStep( + content=f"{problem['task']}. You have access to the following tools: {environment.tools_description()}" + ) + ] + ) + if tape_metadata: + tape.metadata.other.update(tape_metadata) + + t_exec = time.perf_counter() + logger.info("Running agent..") + tape = execute_agent(agent, tape, environment, max_loops=cfg.agent_max_loops) + logger.info("Agent finished") + tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) + + reward_table = RewardTable(**dict(cfg.rewards)) + + llm_calls: list[LLMCall] = [ + LLMCall(**step.metadata.other["llm_call"]) + if isinstance(step.metadata.other["llm_call"], dict) + else step.metadata.other["llm_call"] + for step in tape.steps if step.metadata.other.get("llm_call") is not None + ] + assert len(llm_calls) > 0, "No LLM calls found" + tool_call_counts = count_tool_calls_by_category(llm_calls) + training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] + n_llm_calls = len(llm_calls) + answer_status = verify_answer( + prediction=llm_calls[-1].output.content, # type: ignore + gold=problem["answer"], + strict=True, + ) + # Tape should finish with an answer + tape_finished = True if isinstance(tape.steps[-1], MathAnswer) else False + base_reward = get_reward(answer_status, tape_finished, reward_table) + + # Local reward shaping (configurable in conf/mcp.yaml) + total_shaping = 0.0 + shaping_cfg = getattr(cfg, "python_tool_shaping", None) + if shaping_cfg is not None: + num_python_calls = tool_call_counts.get("run_python_code", 0) + bonus_on_correct_with_python = float(getattr(shaping_cfg, "bonus_on_correct_with_python", 0.0)) + penalty_on_incorrect_without_python = float(getattr(shaping_cfg, "penalty_on_incorrect_without_python", 0.0)) + max_abs = float(getattr(shaping_cfg, "max_abs", 0.2)) + + # Episode-level bonuses/penalties + if answer_status == "correct" and num_python_calls >= 1: + total_shaping += bonus_on_correct_with_python + if answer_status in ("wrong", "unparsable") and num_python_calls == 0: + total_shaping -= penalty_on_incorrect_without_python + + # Clamp total shaping + if total_shaping > max_abs: + total_shaping = max_abs + if total_shaping < -max_abs: + total_shaping = 
-max_abs + + # Length shaping: discourage very long completions; award concise correct ones + length_cfg = getattr(cfg, "length_shaping", None) + if length_cfg is not None: + try: + # Prefer ratio-based target if provided; otherwise use absolute + if hasattr(length_cfg, "target_ratio"): + ratio = float(getattr(length_cfg, "target_ratio")) + max_gen = int(llm.parameters.get("max_tokens", 2048)) + target_tokens = int(max(1, ratio * max_gen)) + # Optional clamps + min_t = int(getattr(length_cfg, "min_target_tokens", 0)) + max_t = int(getattr(length_cfg, "max_target_tokens", 10**9)) + target_tokens = max(min_t, min(max_t, target_tokens)) + else: + target_tokens = int(getattr(length_cfg, "target_output_tokens", 512)) + slope = float(getattr(length_cfg, "slope", 0.0)) + max_penalty = float(getattr(length_cfg, "max_penalty", 0.0)) + bonus_short_correct = float(getattr(length_cfg, "bonus_on_short_correct", 0.0)) + except Exception: + target_tokens, slope, max_penalty, bonus_short_correct = 512, 0.0, 0.0, 0.0 + + # average output tokens across llm calls for this rollout + try: + avg_output_tokens = sum(t.output_tokens for t in training_texts) / max(1, len(training_texts)) + except Exception: + avg_output_tokens = 0.0 + + if slope > 0.0 and max_penalty > 0.0 and avg_output_tokens > target_tokens: + over_by = float(avg_output_tokens - target_tokens) + penalty = min(max_penalty, slope * over_by) + total_shaping -= penalty + + if bonus_short_correct > 0.0 and answer_status == "correct" and avg_output_tokens <= target_tokens: + total_shaping += bonus_short_correct + + reward = base_reward + total_shaping + + # Assign identical reward to all steps in the rollout (pipeline expects uniform rollout_reward) + for text in training_texts: + text.reward = reward + text.finished = tape_finished + + latency = time.perf_counter() - start + + agent_time = tape.metadata.result.get("agent_execution_time", -1.0) + env_time = tape.metadata.result.get("environment_execution_time", -1.0) + total_time = tape.metadata.result.get("total_execution_time", -1.0) + + metrics = Metrics( + reward=reward, + success=answer_status == "correct", + no_error=answer_status != "unparsable", + no_answer=answer_status == "no_answer", + num_steps=len(tape.steps), + num_python_calls=tool_call_counts.get("run_python_code", 0), + n_llm_calls=n_llm_calls, + total_execution_time=total_time, + agent_execution_time=agent_time, + environment_execution_time=env_time, + overflow=not tape_finished, + ) + + return RolloutResult( + training_texts=training_texts, + metrics=metrics, + latency=latency, + dataset_name=problem["dataset"], + llm_url=llm.get_base_url(), + ) + finally: + environment.close() From 44d6fd4e1267036d8d46e13d80175a6d6c0ca55b Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 2 Oct 2025 13:19:14 +0000 Subject: [PATCH 074/126] fix import --- pipelinerl/async_llm.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pipelinerl/async_llm.py b/pipelinerl/async_llm.py index e375b6a5..aa75d4ed 100644 --- a/pipelinerl/async_llm.py +++ b/pipelinerl/async_llm.py @@ -8,12 +8,16 @@ from tapeagents.core import LLMCall, LLMOutput, Prompt, TokenLogprob from tapeagents.llms.trainable import TrainableLLM -from pipelinerl.finetune.data import MASKED_TOKEN_ID -from pipelinerl.rollouts import TrainingText from pipelinerl.processor_factory import get_processor +from pipelinerl.rollouts import TrainingText logger = logging.getLogger(__name__) +# -100 is the default "ignore_index" in nn.CrossEntropyLoss +# Defined here to avoid 
importing dependencies from finetune.data +# Do not replace. Import from finetune module breaks ray parallelization! +MASKED_TOKEN_ID = -100 + def extract_images_from_messages(messages: list[dict]) -> list[Image.Image]: """Extract PIL Images from multimodal messages.""" From 053a532344989753885d00f6f2792c63d9d8e586 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 3 Oct 2025 17:42:28 +0000 Subject: [PATCH 075/126] llm benchmarking scripts --- llm.sh | 69 +++++++++++++++++++++++++++ llm_bench.py | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 199 insertions(+) create mode 100755 llm.sh create mode 100644 llm_bench.py diff --git a/llm.sh b/llm.sh new file mode 100755 index 00000000..9ed54075 --- /dev/null +++ b/llm.sh @@ -0,0 +1,69 @@ +#!/bin/bash +echo "Run LLM only" + +# python -m pipelinerl.launch \ +# output_dir=results/llm_debug1 \ +# force_restart=true \ +# actor.llm_max_rollouts=16 \ +# finetune.seq_parallel=8 \ +# eval_every_n_versions=0 \ +# debug.mode=llm \ +# world.actor_fraction=8 \ +# world.finetune_fraction=0 \ +# world.preprocessor_fraction=0 \ +# --config-name mcp + + +python -m pipelinerl.entrypoints.run_vllm0 \ + --model Qwen/Qwen3-8B \ + --host 0.0.0.0 \ + --port 8080 \ + --seed 42 \ + --actor-llm-idx 0 \ + --weight-update-group-init-method tcp://localhost:9000 \ + --weight-update-group-world-size 2 \ + --dtype bfloat16 \ + --gpu-memory-utilization 0.9 \ + --num-scheduler-steps 1 \ + --disable-log-requests \ + --disable-frontend-multiprocessing \ + --max-num-seqs 256 \ + --max-num-batched-tokens 32000 \ + --enable-chunked-prefill \ + --return-tokens-as-token-ids \ + --tensor-parallel-size 1 \ + --pipeline-parallel-size 1 \ + --generation-config vllm \ + --max_model_len 32000 \ + --enable-auto-tool-choice \ + --tool-call-parser rl_tool \ + --tool-parser-plugin /home/toolkit/PipelineRL/pipelinerl/rl_tool_parser_plugin.py \ + --disable-weight-update + + +# python -m pipelinerl.entrypoints.run_vllm0 \ +# --model Qwen/Qwen2.5-7B \ +# --host 0.0.0.0 \ +# --port 8080 \ +# --seed 13 \ +# --actor-llm-idx 0 \ +# --weight-update-group-init-method tcp://localhost:9000 \ +# --weight-update-group-world-size 2 \ +# --dtype bfloat16 \ +# --gpu-memory-utilization 0.9 \ +# --num-scheduler-steps 1 \ +# --disable-log-requests \ +# --disable-frontend-multiprocessing \ +# --max-num-seqs 64 \ +# --max-num-batched-tokens 1024 \ +# --enable-chunked-prefill \ +# --return-tokens-as-token-ids \ +# --tensor-parallel-size 1 \ +# --pipeline-parallel-size 1 \ +# --generation-config vllm \ +# --max_model_len 64000 \ +# --disable-weight-update + +# python -m pipelinerl.entrypoints.run_vllm0 --model /mnt/llmd/base_models/Mistral-Small-24B-Base-2501 --host 0.0.0.0 --port 8080 --seed 78 --actor-llm-idx 36 --weight-update-group-init-method tcp://dns-99833624-2133-43c0-a112-07520ffee505-0:9000 --weight-update-group-world-size 49 --dtype bfloat16 --gpu-memory-utilization 0.9 --num-scheduler-steps 1 --disable-log-requests --disable-frontend-multiprocessing --max-num-seqs 256 --max-num-batched-tokens 1024 --enable-chunked-prefill --return-tokens-as-token-ids --tensor-parallel-size 1 --pipeline-parallel-size 1 --generation-config vllm --max_model_len 32768 + + \ No newline at end of file diff --git a/llm_bench.py b/llm_bench.py new file mode 100644 index 00000000..f4a014ee --- /dev/null +++ b/llm_bench.py @@ -0,0 +1,130 @@ +import json +import os +import time + +import numpy as np +import ray +import requests +from tapeagents.llms import TrainableLLM + 
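+# Assumes an inference server (e.g. the one started by llm.sh) is already serving
+# llm_model at llm_url below, and that debug_training_texts.jsonl with
+# {"text": ..., "n_predicted": ...} records exists in the working directory.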
+os.environ["TOKENIZERS_PARALLELISM"] = "false" + +llm_url = "http://localhost:8080" +# llm_model = "Qwen/Qwen3-8B" +llm_model = "Qwen/Qwen2.5-7B" +# exp_name = "qwen3-8b" +exp_name = "qwen2.5-7b" + +def llm_quick_response(prompt: str): + t = time.perf_counter() + r = requests.post( + url=f"{llm_url}/v1/chat/completions", + json={ + "model": llm_model, + "messages": [{"role": "user", "content": prompt}], + "stream": False, + }, + headers={"Content-Type": "application/json"}, + stream=False, + verify=False, + ) + d = r.json() + dt = time.perf_counter() - t + return d["choices"][0]["message"]["content"], dt + + +llm = TrainableLLM(base_url=llm_url, model_name=llm_model) +response = llm.quick_response("Hello, how are you?") +response2, _ = llm_quick_response("Hello, how are you?") +assert len(response) > 0 +assert len(response2) > 0 +assert llm.tokenizer is not None +print("LLM is ready") + + +with open("debug_training_texts.jsonl", "r", encoding="utf-8") as f: + all_dicts = [json.loads(line) for line in f if line.strip()] +total_tokens = 0 +for d in all_dicts: + text = d["text"] + n_predicted = d["n_predicted"] + prompt = text[:-n_predicted] + response = text[-n_predicted:] + tokens = llm.tokenizer.encode(text) + total_tokens += len(tokens) +print(f"Loaded {len(all_dicts)} texts, total tokens: {total_tokens}") + +prompts = [d["text"][:-d["n_predicted"]] for d in all_dicts] +chunk_size = 4 +prompts_chunks = [prompts[i:i+chunk_size] for i in range(0, len(prompts), chunk_size)] +print(f"Chunked to {len(prompts_chunks)} chunks") + + +def benchmark_llm(n_workers: int): + ray.shutdown() + ray.init(num_cpus=n_workers) + + def get_responses(prompts: str): + responses = [] + # local_llm = TrainableLLM(base_url=llm_url, model_name=llm_model) + for i, prompt in enumerate(prompts): + r, dt = llm_quick_response(prompt) + responses.append((prompt + r, dt)) + return responses + + remote_fn = ray.remote(get_responses) + + t = time.perf_counter() + + chunks = prompts_chunks + if n_workers > len(chunks): + multiplier = n_workers // len(chunks) + 1 + chunks = chunks * multiplier + print(f"Multiplied to {len(chunks)} chunks") + unfinished_tasks = [] + for chunk in chunks: + unfinished_tasks.append(remote_fn.remote(chunk)) + + responses = [] + total_tokens = 0 + total_finished = 0 + latencies = [] + print(f"Submitted {len(unfinished_tasks)} tasks") + while unfinished_tasks: + finished_tasks, unfinished_tasks = ray.wait(unfinished_tasks, num_returns=len(unfinished_tasks), timeout=0.1) + for finished_task in finished_tasks: + responses = ray.get(finished_task) + total_finished += 1 + for response, dt in responses: + latencies.append(dt) + tokens = llm.tokenizer.encode(response) + total_tokens += len(tokens) + dt = time.perf_counter() - t + if len(finished_tasks) > 0: + print(f"t: {dt:.2f}s, {total_finished} finished, Total tokens: {total_tokens}, tokens/sec: {total_tokens / dt:.2f}") + # if dt > 600: + # print("Timeout 10 minutes, stopping") + # break + time.sleep(1.0) + + final_time = time.perf_counter() - t + print(f"Final, workers:{n_workers}, t:{final_time:.2f}s, total tokens: {total_tokens}, tokens/sec: {total_tokens / final_time:.2f}") + ray.shutdown() + mean_latency = np.mean(latencies) + return total_tokens, final_time, mean_latency + +stats = {} +for n_workers in [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]: + print(f"Benchmarking {n_workers} workers..") + tokens, dt, mean_latency = benchmark_llm(n_workers) + print(f"Done {n_workers} workers: {tokens} tokens, {dt:.2f}s, speed {tokens / dt:.2f} tokens/sec, 
mean latency: {mean_latency:.2f}s") + stats[n_workers] = {"tokens": tokens, "dt": dt, "mean_latency": mean_latency} + with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}.jsonl", "a") as f: + ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + row = json.dumps({"ts": ts, "n_workers": n_workers, "tokens": tokens, "dt": dt, "mean_latency": mean_latency}) + f.write(row + "\n") + +print("Benchmarking done") +with open(f"llm_token_stats_all_chunk{chunk_size}_{exp_name}.json", "w") as f: + json.dump(stats, f, indent=4) +print("All stats saved") \ No newline at end of file From 0f9bf6a685675063e813fefa34550a5ee8090325 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 13:09:41 +0000 Subject: [PATCH 076/126] move to vllm 0.8.5 to support qwen3 --- pipelinerl/vllm0.py | 38 +++++++++++++++++++------------------- pipelinerl/vllm1.py | 30 +++++++++++++++--------------- pyproject.toml | 5 +++-- 3 files changed, 37 insertions(+), 36 deletions(-) diff --git a/pipelinerl/vllm0.py b/pipelinerl/vllm0.py index 32c17093..4ff219a2 100644 --- a/pipelinerl/vllm0.py +++ b/pipelinerl/vllm0.py @@ -3,39 +3,39 @@ import logging import os import signal -from pydantic import TypeAdapter + import torch +import torch.distributed as dist import uvloop +from pydantic import TypeAdapter from vllm import AsyncLLMEngine -from vllm.utils import FlexibleArgumentParser, set_ulimit -from vllm.entrypoints.openai.cli_args import ( - make_arg_parser, - validate_parsed_serve_args, -) +from vllm._version import version +from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.openai.api_server import ( - run_server, - create_server_socket, build_app, + create_server_socket, init_app_state, + run_server, +) +from vllm.entrypoints.openai.cli_args import ( + make_arg_parser, + validate_parsed_serve_args, ) -from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.openai.tool_parsers import ToolParserManager -from vllm.logger import init_logger -from vllm._version import version -from vllm.worker.worker import Worker -from vllm.executor.multiproc_worker_utils import ProcessWorkerWrapper from vllm.executor.mp_distributed_executor import MultiprocessingDistributedExecutor +from vllm.executor.multiproc_worker_utils import ProcessWorkerWrapper +from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest from vllm.usage.usage_lib import UsageContext -from vllm.worker.multi_step_worker import MultiStepWorker +from vllm.utils import FlexibleArgumentParser, set_ulimit from vllm.worker.multi_step_model_runner import MultiStepModelRunner +from vllm.worker.multi_step_worker import MultiStepWorker +from vllm.worker.worker import Worker - -import torch.distributed as dist -from pipelinerl.finetune_loop import TrainerMessage, WeightUpdateRequest import pipelinerl.torch_utils +from pipelinerl.finetune_loop import TrainerMessage, WeightUpdateRequest logger = logging.getLogger(__name__) # configure this logger individually, in order to avoid messign @@ -247,8 +247,8 @@ async def _receive_weight_update(request: WeightUpdateRequest): await weight_update_manager.receive_weight_update(request) return {"status": "ok"} - model_config = await engine.get_model_config() - await init_app_state(engine, model_config, app.state, args) + # model_config = await engine.get_model_config() + await init_app_state(engine, engine_config, app.state, args) shutdown_task = await serve_http( 
app, sock, diff --git a/pipelinerl/vllm1.py b/pipelinerl/vllm1.py index 80cba297..48311d8e 100644 --- a/pipelinerl/vllm1.py +++ b/pipelinerl/vllm1.py @@ -1,32 +1,32 @@ import logging import signal +from typing import Any, Protocol, runtime_checkable + import torch import uvloop -from vllm.utils import FlexibleArgumentParser, set_ulimit -from vllm.entrypoints.openai.cli_args import ( - make_arg_parser, - validate_parsed_serve_args, -) +from vllm._version import version +from vllm.config import ModelConfig +from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.openai.api_server import ( - run_server, - create_server_socket, build_app, + create_server_socket, init_app_state, + run_server, +) +from vllm.entrypoints.openai.cli_args import ( + make_arg_parser, + validate_parsed_serve_args, ) -from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.openai.tool_parsers import ToolParserManager -from vllm._version import version from vllm.usage.usage_lib import UsageContext -from vllm.config import ModelConfig +from vllm.utils import FlexibleArgumentParser, set_ulimit from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core_client import AsyncMPClient from vllm.v1.worker.gpu_model_runner import GPUModelRunner - -from pipelinerl.finetune_loop import WeightUpdateRequest -from typing import Any, Protocol, runtime_checkable import pipelinerl.torch_utils +from pipelinerl.finetune_loop import WeightUpdateRequest logger = logging.getLogger(__name__) # configure this logger individually, in order to avoid messign @@ -172,8 +172,8 @@ async def _receive_weight_update(request: WeightUpdateRequest): await weight_update_manager.receive_weight_update(request) return {"status": "ok"} - model_config = await engine.get_model_config() - await init_app_state(engine, model_config, app.state, args) + # model_config = await engine.get_model_config() + await init_app_state(engine, engine_config, app.state, args) shutdown_task = await serve_http( app, sock, diff --git a/pyproject.toml b/pyproject.toml index f950d75d..c069389a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,16 +14,17 @@ authors = [ ] dependencies = [ "torch>=2.6", - "vllm==0.8.3", + "vllm==0.8.5.post1", "accelerate==1.7.0", "Tapeagents[finetune]==0.1.15", - "transformers==4.51.0", + "transformers==4.51.1", "flash-attn==2.7.4.post1", "ring-flash-attn==0.1.6", "math-verify[antlr4_9_3]==0.7.0", "orjson==3.10.16", "redis==5.2.1", "hydra-core>=1.3.2", + "ray[default]~=2.47.1", ] [tool.setuptools.packages.find] From 44d0de4b8bf75a053565f5d926a4d058c91161da Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 17:45:41 +0000 Subject: [PATCH 077/126] launch mode to run inference llm only --- pipelinerl/launch.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipelinerl/launch.py b/pipelinerl/launch.py index be5c8faf..e2109e34 100644 --- a/pipelinerl/launch.py +++ b/pipelinerl/launch.py @@ -21,6 +21,7 @@ # TODO: rm debug code import tapeagents + os.environ["NCCL_CUMEM_ENABLE"] = "0" os.environ["TORCH_DISABLE_SHARE_RDZV_TCP_STORE"] = "1" os.environ["HF_DATASETS_DISABLE_PROGRESS_BARS"] = "1" @@ -615,6 +616,8 @@ def main(cfg: DictConfig): if cfg.debug.mode == "finetune": processes.extend(launch_jobs(cfg, world_map, ["finetune"])) + elif cfg.debug.mode == "llm": + processes.extend(launch_jobs(cfg, world_map, ["actor_llm"])) elif cfg.debug.mode == "actor": processes.extend(launch_jobs(cfg, world_map, ["actor", "environment", "actor_llm"])) elif 
cfg.debug.mode == "preprocessor": From 81675bdc60747f440a5bef064dee46ea12668b3c Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 17:46:27 +0000 Subject: [PATCH 078/126] updated ray-based actor loop --- pipelinerl/actor.py | 168 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 140 insertions(+), 28 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index d1907ed4..44b5daa7 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -10,13 +10,14 @@ from multiprocessing.managers import SharedMemoryManager from pathlib import Path from queue import Empty -from typing import Dict, List +from typing import Callable, Dict, List import aiohttp import hydra +import numpy as np import ray import uvloop -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from pydantic import BaseModel, Field from tapeagents.llms import TrainableLLM from tapeagents.orchestrator import save_debug_line @@ -198,12 +199,13 @@ async def rollout_and_maybe_produce_result( connector = aiohttp.TCPConnector(limit=50000, limit_per_host=50000, keepalive_timeout=1.0) timeout = aiohttp.ClientTimeout(total=3600.0, connect=3600.0, sock_read=3600.0) old_finished_rollouts = 0 + start_time = time.time() async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: while True: if time.time() - last_logged > 10.0 and sum(active_rollouts): if finished_rollouts > old_finished_rollouts: old_finished_rollouts = finished_rollouts - save_debug_line({"rollouts_finished": finished_rollouts, "tokens_produced": token_count}) + save_debug_line({"rollouts_finished": finished_rollouts, "tokens_produced": token_count, "dt": time.time() - start_time, "token_speed": token_count / (time.time() - start_time)}) logger.info( f"{scheduler_name}: " f"rollouts in progress: {sum(active_rollouts)}, " @@ -300,32 +302,36 @@ def __init__( self.is_training = is_training self.is_scheduling_paused = False self.debug_mode = bool(cfg.debug.mode) + self.cfg: DictConfig = cfg - # Determine the number of processes to use - num_processes = min(self.cfg.actor.rollout_workers, len(self.llms)) - - # Divide LLMs approximately equally across processes - self.llm_groups = [[] for _ in range(num_processes)] - for i, llm in enumerate(self.llms): - self.llm_groups[i % num_processes].append((i, llm)) + self.smm: SharedMemoryManager | None = None + self.problem_queue: SharedMemoryQueue | None = None + self.result_queue: SharedMemoryQueue | None = None + logger.info(f"Initialized {'train' if self.is_training else 'test'} actor loop") + def start_backend(self): self.smm = SharedMemoryManager() self.smm.start() - # Use SharedMemoryQueue instead of separate problem_queue, result_queue, and io_buffer self.problem_queue = SharedMemoryQueue(self.smm, self.cfg.actor.problem_queue_size, cfg.actor.shared_memory_entry_size) self.result_queue = SharedMemoryQueue(self.smm, self.cfg.actor.result_queue_size, cfg.actor.shared_memory_entry_size) - logger.info(f"Initialized {'train' if self.is_training else 'test'} actor loop") logger.info(f"Problem queue size: {self.problem_queue.max_size}, result queue size: {self.result_queue.max_size}") logger.info(f"Result queue buffer size: {self.result_queue.get_memory_size() / 2**30} Gb") - def start_backend(self): # Create and start multiple rollout processes attempts = self.cfg.attempts if self.is_training else 1 + # Determine the number of processes to use + num_processes = min(self.cfg.actor.rollout_workers, len(self.llms)) + + # Divide LLMs approximately 
equally across processes + llm_groups = [[] for _ in range(num_processes)] + for i, llm in enumerate(self.llms): + llm_groups[i % num_processes].append((i, llm)) + self.rollout_processes = [] - for llm_group in self.llm_groups: + for llm_group in llm_groups: assert llm_group llm_idxs = [llm[0] for llm in llm_group] llms = [llm[1] for llm in llm_group] @@ -455,7 +461,7 @@ def run(self, dataset: list[tuple[str, dict]]): if not self.is_scheduling_paused: while True: blocked_by_lag = submitted_groups == can_submit_before_update and self.is_training - if not blocked_by_lag and not self.problem_queue.full(): + if not blocked_by_lag and self.have_capacity(): try: try: problem = next(problem_iter) @@ -471,7 +477,7 @@ def run(self, dataset: list[tuple[str, dict]]): # Second, try return a result try: # Directly get the result from the SharedMemoryQueue - rollout_results = self.check_for_new_results() + rollout_results = self.get_new_results() except queue.Empty: continue @@ -480,6 +486,8 @@ def run(self, dataset: list[tuple[str, dict]]): raise rollout_results assert isinstance(rollout_results, list) + if len(rollout_results) == 0: + continue assert isinstance(rollout_results[0], RolloutResult) assert len(rollout_results) == attempts, ( f"Expected {attempts} rollouts, got {len(rollout_results)}" @@ -589,37 +597,141 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict): stats_writer.write(stats) self.init_stats() # Reset stats for the next iteration + def have_capacity(self) -> bool: + return not self.problem_queue.full() + def submit_problem(self, problem: dict): self.problem_queue.put(problem, block=False) def stop_tasks(self): pass - def check_for_new_results(self): - rollout_results = self.result_queue.get(block=False) - return rollout_results + def get_new_results(self) -> list[RolloutResult]: + return self.result_queue.get(block=False) -class ActorLoop2(ActorLoop): +class ActorLoopRay(ActorLoop): """ Loop that runs the ray tasks for n_jobs to perform rollouts in parallel """ + ray_ready: bool = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.cfg_dict = OmegaConf.to_container(self.cfg, resolve=True) + self.unfinished_tasks = [] + self.llms_by_url = {llm.get_base_url(): llm for llm in self.llms} + self.llms_utilization = {llm.get_base_url(): 0 for llm in self.llms} + self.problem_id = 0 + self.unfinished_problems = defaultdict(list) # up to `attempts` rollout results for each problem + self.finished_problems = [] + self.token_count = 0 + self.finished_rollouts_count = 0 + def start_backend(self): - ray.init(num_cpus=self.cfg.actor.rollout_workers, dashboard_host="0.0.0.0") + if not self.ray_ready: + logger.info(f"Initializing Ray with {self.cfg.actor.rollout_workers} workers..") + ray_context = ray.init(num_cpus=self.cfg.actor.rollout_workers, dashboard_host="0.0.0.0", include_dashboard=True) + logger.info(f"Ray initialized, dashboard at {ray_context.dashboard_url}") + self.ray_ready = True + else: + logger.info("Ray already initialized") + + rollout_policy: Callable[[DictConfig, TrainableLLM, dict], RolloutResult] = hydra.utils.get_method(self.cfg.actor.rollout_policy) + def rollout_wrapper(cfg: DictConfig, llm: TrainableLLM, problem: dict, problem_id: int) -> RolloutResult: + rollout_result: RolloutResult = rollout_policy(cfg, llm, problem) + ts = time.monotonic() + return rollout_result, llm.get_base_url(), problem_id, ts + self.ray_remote = ray.remote(rollout_wrapper) + self.start_time = time.time() + + def have_capacity(self) -> 
bool: + have_capacity = len(self.unfinished_tasks) < self.cfg.actor.problem_queue_size + have_llm = any(self.llms_utilization[llm_url] < self.cfg.actor.llm_max_rollouts for llm_url in self.llms_utilization) + have_capacity = have_capacity and have_llm + if not have_capacity: + time.sleep(0.1) # sleep for a while to avoid quick loops when no capacity + return have_capacity def submit_problem(self, problem: dict): - pass + attempts = self.cfg.attempts if self.is_training else 1 + for attempt_number in range(attempts): + llm_url, task_count = min(self.llms_utilization.items(), key=lambda x: x[1]) + logger.info(f"Submitting problem attempt {attempt_number} to the least busy LLM {llm_url} with {task_count} tasks") + llm = self.llms_by_url[llm_url] + task_ref = self.ray_remote.remote(self.cfg_dict, llm, problem, self.problem_id) + self.problem_id += 1 + self.llms_utilization[llm_url] += 1 + self.unfinished_tasks.append(task_ref) def stop_tasks(self): - pass - - def check_for_new_results(self): - pass + ray.shutdown() + def receive_finished_tasks(self): + num_returns = min(100, len(self.unfinished_tasks)) + try: + finished_tasks, unfinished_tasks = ray.wait(self.unfinished_tasks, num_returns=num_returns, timeout=0.1) + except Exception as e: + logger.error(f"Error waiting for finished ray tasks: {e}") + return + if len(finished_tasks) > 0: + logger.info(f"Found {len(finished_tasks)} finished tasks, {len(unfinished_tasks)} unfinished tasks left") + self.unfinished_tasks = unfinished_tasks + dt = time.time() - self.start_time + ray_result_latencies = [] + for finished_task in finished_tasks: + try: + rollout_result, llm_url, problem_id, inner_ts = ray.get(finished_task) + outer_ts = time.monotonic() + ray_result_latency = outer_ts - inner_ts + ray_result_latencies.append(ray_result_latency) + except Exception as e: + logger.error(f"Error getting finished ray task: {e}") + continue + if self.llms_utilization[llm_url] > 0: + self.llms_utilization[llm_url] -= 1 + else: + logger.warning(f"LLM {llm_url} utilization is 0, but got a result") + self.token_count += get_number_of_tokens_in_result(rollout_result) + self.finished_rollouts_count += 1 + self.unfinished_problems[problem_id].append(rollout_result) + logger.info(f"Problem {problem_id} has {len(self.unfinished_problems[problem_id])} rollout results") + if len(self.unfinished_problems[problem_id]) == self.cfg.attempts: + logger.info(f"Group for problem {problem_id} finished") + self.finished_problems.append(self.unfinished_problems[problem_id]) + del self.unfinished_problems[problem_id] + logger.info(f"{len(self.finished_problems)} finished problems ready to return") + logger.info( + f"Ray {'train' if self.is_training else 'test'} actor loop: " + f"rollouts in progress: {len(self.unfinished_tasks)}, " + f"problems in progress: {len(self.unfinished_problems)}, " + f"rollouts finished: {self.finished_rollouts_count}, " + f"total tokens: {self.token_count}, " + f"gen speed: {self.token_count / dt:.2f} tokens/sec, " + f"ray latency: {np.mean(ray_result_latencies):.4f} seconds" + ) + save_debug_line({ + "rollouts_finished": self.finished_rollouts_count, + "rollouts_in_progress": len(self.unfinished_tasks), + "problems_in_progress": len(self.unfinished_problems), + "tokens_produced": self.token_count, + "dt": dt, + "token_speed": self.token_count / dt, + "ray_latency": np.mean(ray_result_latencies), + }) + logger.info(f"LLMs utilization: {self.llms_utilization}") + + def get_new_results(self) -> list[list[RolloutResult]]: + self.receive_finished_tasks() + 
if len(self.finished_problems) > 0: + logger.info(f"have {len(self.finished_problems)} finished problems, pop one") + return self.finished_problems.pop(0) + return [] def run_actor_loop(cfg: DictConfig): set_streams_backend(**cfg.streams) + actor_loop_class = ActorLoopRay if cfg.use_ray else ActorLoop # set seed for reproducibility (mostly intended for dataset loading) random.seed(cfg.seed) @@ -697,12 +809,12 @@ def run_actor_loop(cfg: DictConfig): trainer_state.start_listening() trainer_state.wait_for_model_version() - train_loop = ActorLoop( + train_loop = actor_loop_class( data_stream=data_stream, cfg=cfg, trainer_state=trainer_state, stats_stream=stats_stream, llms=train_llms ) train_loop.start_backend() train_loop_run = train_loop.run(dataset=train_dataset) - test_loop = ActorLoop( + test_loop = actor_loop_class( data_stream=test_data_stream, cfg=cfg, trainer_state=trainer_state, From d16222ae969ba3dfb273a4d991bf2b8e37ae0d96 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 17:48:41 +0000 Subject: [PATCH 079/126] rollout debug --- pipelinerl/domains/mcp/rollouts.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index c867cbc4..2758ed15 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -1,4 +1,5 @@ import asyncio +import json import logging import random import time @@ -10,11 +11,12 @@ from hydra.utils import instantiate from omegaconf import DictConfig, OmegaConf from tapeagents.agent import DEFAULT, Agent -from tapeagents.core import LLMCall, Tape +from tapeagents.core import LLMCall, Tape, TrainingText from tapeagents.dialog_tape import UserStep +from tapeagents.llms import LiteLLM from tapeagents.llms.trainable import TrainableLLM from tapeagents.mcp import MCPEnvironment -from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config +from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config, save_debug_tape from tapeagents.remote_environment import AsyncRemoteEnvironment from pipelinerl.async_llm import make_training_text @@ -29,6 +31,8 @@ _embedded_worker: EmbeddedEnvironmentWorker | None = None +class FailedRollout(Exception): + pass def _get_embedded_worker(env_cfg: DictConfig, concurrency: int) -> EmbeddedEnvironmentWorker: global _embedded_worker @@ -294,8 +298,9 @@ def generate_mcp_rollout_with_local_env( cfg = OmegaConf.create(cfg) agent, _env = get_agent_and_env_from_config(cfg) environment: MCPEnvironment = _env - logger.info("Agent and environment loaded") + logger.info(f"Agent and environment loaded, using llm {llm.model_name} at {llm.get_base_url()}") try: + t_exec = time.perf_counter() start_result = environment.start_task(problem) logger.info("Task started") tape_metadata = start_result if isinstance(start_result, dict) else {} @@ -310,12 +315,11 @@ def generate_mcp_rollout_with_local_env( if tape_metadata: tape.metadata.other.update(tape_metadata) - t_exec = time.perf_counter() logger.info("Running agent..") tape = execute_agent(agent, tape, environment, max_loops=cfg.agent_max_loops) logger.info("Agent finished") tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) - + # save_debug_tape(tape) reward_table = RewardTable(**dict(cfg.rewards)) llm_calls: list[LLMCall] = [ @@ -326,6 +330,7 @@ def generate_mcp_rollout_with_local_env( ] assert len(llm_calls) > 0, "No LLM calls 
found" tool_call_counts = count_tool_calls_by_category(llm_calls) + logger.info(f'Use {type(llm)} LLM to generate training texts') training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] n_llm_calls = len(llm_calls) answer_status = verify_answer( @@ -397,6 +402,7 @@ def generate_mcp_rollout_with_local_env( # Assign identical reward to all steps in the rollout (pipeline expects uniform rollout_reward) for text in training_texts: + # debug_save_training_text(text) text.reward = reward text.finished = tape_finished @@ -427,5 +433,16 @@ def generate_mcp_rollout_with_local_env( dataset_name=problem["dataset"], llm_url=llm.get_base_url(), ) + except Exception as e: + err_msg = f"Error generating rollout: {e}" + logger.error(err_msg) + raise FailedRollout(err_msg) finally: - environment.close() + try: + environment.close() + except Exception as e: + logger.error(f"Error closing environment: {e}") + +def debug_save_training_text(text: TrainingText): + with open("debug_training_texts.jsonl", "a") as f: + f.write(json.dumps({"text": text.text, "n_predicted": text.n_predicted}, ensure_ascii=False) + "\n") \ No newline at end of file From 74857cff059866702e7c8c1b3c6b7f9858cd5f86 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 17:49:08 +0000 Subject: [PATCH 080/126] llm benchmark scripts update --- llm.sh | 15 ++-- llm_bench.py | 71 +++++++++------- llm_bench_async.py | 200 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 249 insertions(+), 37 deletions(-) create mode 100644 llm_bench_async.py diff --git a/llm.sh b/llm.sh index 9ed54075..78758724 100755 --- a/llm.sh +++ b/llm.sh @@ -14,7 +14,7 @@ echo "Run LLM only" # --config-name mcp -python -m pipelinerl.entrypoints.run_vllm0 \ +python -m pipelinerl.entrypoints.run_vllm1 \ --model Qwen/Qwen3-8B \ --host 0.0.0.0 \ --port 8080 \ @@ -29,12 +29,12 @@ python -m pipelinerl.entrypoints.run_vllm0 \ --disable-frontend-multiprocessing \ --max-num-seqs 256 \ --max-num-batched-tokens 32000 \ + --max_model_len 32000 \ --enable-chunked-prefill \ --return-tokens-as-token-ids \ --tensor-parallel-size 1 \ --pipeline-parallel-size 1 \ --generation-config vllm \ - --max_model_len 32000 \ --enable-auto-tool-choice \ --tool-call-parser rl_tool \ --tool-parser-plugin /home/toolkit/PipelineRL/pipelinerl/rl_tool_parser_plugin.py \ @@ -45,7 +45,7 @@ python -m pipelinerl.entrypoints.run_vllm0 \ # --model Qwen/Qwen2.5-7B \ # --host 0.0.0.0 \ # --port 8080 \ -# --seed 13 \ +# --seed 42 \ # --actor-llm-idx 0 \ # --weight-update-group-init-method tcp://localhost:9000 \ # --weight-update-group-world-size 2 \ @@ -54,14 +54,17 @@ python -m pipelinerl.entrypoints.run_vllm0 \ # --num-scheduler-steps 1 \ # --disable-log-requests \ # --disable-frontend-multiprocessing \ -# --max-num-seqs 64 \ -# --max-num-batched-tokens 1024 \ +# --max-num-seqs 256 \ +# --max-num-batched-tokens 32000 \ # --enable-chunked-prefill \ # --return-tokens-as-token-ids \ # --tensor-parallel-size 1 \ # --pipeline-parallel-size 1 \ # --generation-config vllm \ -# --max_model_len 64000 \ +# --max_model_len 32000 \ +# --enable-auto-tool-choice \ +# --tool-call-parser rl_tool \ +# --tool-parser-plugin /home/toolkit/PipelineRL/pipelinerl/rl_tool_parser_plugin.py \ # --disable-weight-update # python -m pipelinerl.entrypoints.run_vllm0 --model /mnt/llmd/base_models/Mistral-Small-24B-Base-2501 --host 0.0.0.0 --port 8080 --seed 78 --actor-llm-idx 36 --weight-update-group-init-method tcp://dns-99833624-2133-43c0-a112-07520ffee505-0:9000 
--weight-update-group-world-size 49 --dtype bfloat16 --gpu-memory-utilization 0.9 --num-scheduler-steps 1 --disable-log-requests --disable-frontend-multiprocessing --max-num-seqs 256 --max-num-batched-tokens 1024 --enable-chunked-prefill --return-tokens-as-token-ids --tensor-parallel-size 1 --pipeline-parallel-size 1 --generation-config vllm --max_model_len 32768 diff --git a/llm_bench.py b/llm_bench.py index f4a014ee..e61cf342 100644 --- a/llm_bench.py +++ b/llm_bench.py @@ -1,5 +1,6 @@ import json import os +import random import time import numpy as np @@ -10,13 +11,13 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false" llm_url = "http://localhost:8080" -# llm_model = "Qwen/Qwen3-8B" -llm_model = "Qwen/Qwen2.5-7B" -# exp_name = "qwen3-8b" -exp_name = "qwen2.5-7b" +llm_model = "Qwen/Qwen3-8B" +# llm_model = "Qwen/Qwen2.5-7B" +exp_name = "qwen3-8b-v1" +# exp_name = "qwen2.5-7b" +max_tokens = 8192 def llm_quick_response(prompt: str): - t = time.perf_counter() r = requests.post( url=f"{llm_url}/v1/chat/completions", json={ @@ -29,15 +30,13 @@ def llm_quick_response(prompt: str): verify=False, ) d = r.json() - dt = time.perf_counter() - t - return d["choices"][0]["message"]["content"], dt + return d["choices"][0]["message"]["content"] llm = TrainableLLM(base_url=llm_url, model_name=llm_model) response = llm.quick_response("Hello, how are you?") -response2, _ = llm_quick_response("Hello, how are you?") +response = llm_quick_response("Hello, how are you?") assert len(response) > 0 -assert len(response2) > 0 assert llm.tokenizer is not None print("LLM is ready") @@ -55,10 +54,12 @@ def llm_quick_response(prompt: str): print(f"Loaded {len(all_dicts)} texts, total tokens: {total_tokens}") prompts = [d["text"][:-d["n_predicted"]] for d in all_dicts] +random.seed(42) +random.shuffle(prompts) chunk_size = 4 prompts_chunks = [prompts[i:i+chunk_size] for i in range(0, len(prompts), chunk_size)] print(f"Chunked to {len(prompts_chunks)} chunks") - +too_many_chunks = prompts_chunks * 20 def benchmark_llm(n_workers: int): ray.shutdown() @@ -66,21 +67,24 @@ def benchmark_llm(n_workers: int): def get_responses(prompts: str): responses = [] - # local_llm = TrainableLLM(base_url=llm_url, model_name=llm_model) + # local_llm = TrainableLLM(base_url=llm_url, model_name=llm_model, parameters={"max_tokens": max_tokens}) for i, prompt in enumerate(prompts): - r, dt = llm_quick_response(prompt) + t = time.perf_counter() + # r = local_llm.quick_response(prompt) + r = llm_quick_response(prompt) + dt = time.perf_counter() - t responses.append((prompt + r, dt)) return responses remote_fn = ray.remote(get_responses) - t = time.perf_counter() + start_time = time.perf_counter() - chunks = prompts_chunks - if n_workers > len(chunks): - multiplier = n_workers // len(chunks) + 1 - chunks = chunks * multiplier - print(f"Multiplied to {len(chunks)} chunks") + n_chunks = max(200, n_workers * 2) + chunks = too_many_chunks[:n_chunks] + print(f"Multiplied to {len(chunks)} chunks") + random.seed(42) + random.shuffle(chunks) unfinished_tasks = [] for chunk in chunks: unfinished_tasks.append(remote_fn.remote(chunk)) @@ -95,36 +99,41 @@ def get_responses(prompts: str): for finished_task in finished_tasks: responses = ray.get(finished_task) total_finished += 1 - for response, dt in responses: - latencies.append(dt) + for response, latency in responses: + latencies.append(latency) tokens = llm.tokenizer.encode(response) total_tokens += len(tokens) - dt = time.perf_counter() - t + dt = time.perf_counter() - start_time if len(finished_tasks) 
> 0: - print(f"t: {dt:.2f}s, {total_finished} finished, Total tokens: {total_tokens}, tokens/sec: {total_tokens / dt:.2f}") - # if dt > 600: - # print("Timeout 10 minutes, stopping") - # break - time.sleep(1.0) - - final_time = time.perf_counter() - t + print(f"t: {dt:.2f}s, {total_finished} finished, Total tokens: {total_tokens}, tokens/sec: {total_tokens / dt:.2f}, last 10 latency: {np.mean(latencies[-10:]):.2f}s") + with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}_log.jsonl", "a") as f: + ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + row = json.dumps({"ts": ts, "exp_name": exp_name, "n_workers": n_workers, "tokens": total_tokens, "dt": dt, "mean_latency": np.mean(latencies), "last_10_latency": np.mean(latencies[-10:]), "total_finished": total_finished, "token_speed": total_tokens / dt}) + f.write(row + "\n") + if len(unfinished_tasks) < n_workers: + print(f"Saturation mode ended, stopping") + break + time.sleep(2.0) + + final_time = time.perf_counter() - start_time print(f"Final, workers:{n_workers}, t:{final_time:.2f}s, total tokens: {total_tokens}, tokens/sec: {total_tokens / final_time:.2f}") ray.shutdown() mean_latency = np.mean(latencies) return total_tokens, final_time, mean_latency stats = {} -for n_workers in [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]: +for n_workers in [128]: #[64, 256, 128, 32, 4, 8, 16, 512, 1024]: # most optimal first print(f"Benchmarking {n_workers} workers..") tokens, dt, mean_latency = benchmark_llm(n_workers) print(f"Done {n_workers} workers: {tokens} tokens, {dt:.2f}s, speed {tokens / dt:.2f} tokens/sec, mean latency: {mean_latency:.2f}s") stats[n_workers] = {"tokens": tokens, "dt": dt, "mean_latency": mean_latency} - with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}.jsonl", "a") as f: + with open(f"llm_token_stats_ray_chunk{chunk_size}_{exp_name}.jsonl", "a") as f: ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) row = json.dumps({"ts": ts, "n_workers": n_workers, "tokens": tokens, "dt": dt, "mean_latency": mean_latency}) f.write(row + "\n") + time.sleep(3.0) print("Benchmarking done") -with open(f"llm_token_stats_all_chunk{chunk_size}_{exp_name}.json", "w") as f: +with open(f"llm_token_stats_ray_all_chunk{chunk_size}_{exp_name}.json", "w") as f: json.dump(stats, f, indent=4) print("All stats saved") \ No newline at end of file diff --git a/llm_bench_async.py b/llm_bench_async.py new file mode 100644 index 00000000..0f307576 --- /dev/null +++ b/llm_bench_async.py @@ -0,0 +1,200 @@ +import asyncio +import json +import os +import random +import time + +import aiohttp +import numpy as np +from tapeagents.llms import TrainableLLM + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +llm_url = "http://localhost:8080" +llm_model = "Qwen/Qwen3-8B" +# llm_model = "Qwen/Qwen2.5-7B" +exp_name = "qwen3-8b-v1" +# exp_name = "qwen2.5-7b" +max_tokens = 8192 + + +async def llm_quick_response_async(session: aiohttp.ClientSession, prompt: str): + """Async version of LLM quick response""" + async with session.post( + url=f"{llm_url}/v1/chat/completions", + json={ + "model": llm_model, + "messages": [{"role": "user", "content": prompt}], + "stream": False, + }, + headers={"Content-Type": "application/json"}, + ssl=False, + ) as response: + d = await response.json() + return d["choices"][0]["message"]["content"] + + + + +# Initial LLM test (synchronous) +llm = TrainableLLM(base_url=llm_url, model_name=llm_model) +response = llm.quick_response("Hello, how are you?") +assert len(response) > 0 +assert llm.tokenizer is not None +print("LLM 
is ready") + + +with open("debug_training_texts.jsonl", "r", encoding="utf-8") as f: + all_dicts = [json.loads(line) for line in f if line.strip()] +total_tokens = 0 +for d in all_dicts: + text = d["text"] + n_predicted = d["n_predicted"] + prompt = text[:-n_predicted] + response = text[-n_predicted:] + tokens = llm.tokenizer.encode(text) + total_tokens += len(tokens) +print(f"Loaded {len(all_dicts)} texts, total tokens: {total_tokens}") + +prompts = [d["text"][:-d["n_predicted"]] for d in all_dicts] +random.seed(42) +random.shuffle(prompts) +chunk_size = 4 +prompts_chunks = [prompts[i:i+chunk_size] for i in range(0, len(prompts), chunk_size)] +print(f"Chunked to {len(prompts_chunks)} chunks") +too_many_chunks = prompts_chunks * 20 + + +async def get_responses_async(session: aiohttp.ClientSession, prompts: list[str], tokenizer): + """Process a chunk of prompts asynchronously""" + responses = [] + for prompt in prompts: + t = time.perf_counter() + try: + r = await llm_quick_response_async(session, prompt) + dt = time.perf_counter() - t + responses.append((prompt + r, dt)) + except Exception as e: + print(f"Error processing prompt: {e}") + dt = time.perf_counter() - t + responses.append((prompt, dt)) + return responses + + +async def benchmark_llm_async(n_workers: int): + """Benchmark LLM using async/await with controlled concurrency""" + print(f"Starting async benchmark with {n_workers} concurrent workers") + + start_time = time.perf_counter() + + n_chunks = max(200, n_workers * 2) + chunks = too_many_chunks[:n_chunks] + print(f"Multiplied to {len(chunks)} chunks") + random.seed(42) + random.shuffle(chunks) + + total_tokens = 0 + total_finished = 0 + latencies = [] + + # Create shared aiohttp session with connection pooling + connector = aiohttp.TCPConnector(limit=n_workers, limit_per_host=n_workers) + timeout = aiohttp.ClientTimeout(total=300) # 5 minute timeout + + async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: + # Create all tasks + tasks = [] + for chunk in chunks: + task = asyncio.create_task(get_responses_async(session, chunk, llm.tokenizer)) + tasks.append(task) + + print(f"Created {len(tasks)} tasks") + + # Process tasks with controlled concurrency + pending = set(tasks) + active = set() + + while pending or active: + # Fill up active tasks up to n_workers limit + while len(active) < n_workers and pending: + task = pending.pop() + active.add(task) + + if not active: + break + + # Wait for at least one task to complete + done, active = await asyncio.wait(active, timeout=0.1, return_when=asyncio.FIRST_COMPLETED) + + # Process completed tasks + for finished_task in done: + try: + responses = await finished_task + total_finished += 1 + for response, latency in responses: + latencies.append(latency) + tokens = llm.tokenizer.encode(response) + total_tokens += len(tokens) + except Exception as e: + print(f"Task failed with error: {e}") + total_finished += 1 + + # Log progress + dt = time.perf_counter() - start_time + if len(done) > 0: + print(f"t: {dt:.2f}s, {total_finished} finished, Total tokens: {total_tokens}, tokens/sec: {total_tokens / dt:.2f}, last 10 latency: {np.mean(latencies[-10:]) if latencies else 0:.2f}s") + with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}_log.jsonl", "a") as f: + ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + row = json.dumps({ + "ts": ts, + "exp_name": exp_name, + "n_workers": n_workers, + "tokens": total_tokens, + "dt": dt, + "mean_latency": np.mean(latencies) if latencies else 0, + 
"last_10_latency": np.mean(latencies[-10:]) if latencies else 0, + "total_finished": total_finished, + "token_speed": total_tokens / dt if dt > 0 else 0 + }) + f.write(row + "\n") + + # Check saturation mode + if len(pending) + len(active) < n_workers: + print(f"Saturation mode ended, stopping") + # Cancel remaining tasks + for task in active: + task.cancel() + break + + await asyncio.sleep(2.0) + + final_time = time.perf_counter() - start_time + print(f"Final, workers:{n_workers}, t:{final_time:.2f}s, total tokens: {total_tokens}, tokens/sec: {total_tokens / final_time:.2f}") + mean_latency = np.mean(latencies) if latencies else 0 + return total_tokens, final_time, mean_latency + + +async def run_benchmarks(): + """Run benchmarks for different worker counts""" + stats = {} + for n_workers in [128]: # [64, 256, 128, 32, 4, 8, 16, 512, 1024]: # most optimal first + print(f"Benchmarking {n_workers} workers..") + tokens, dt, mean_latency = await benchmark_llm_async(n_workers) + print(f"Done {n_workers} workers: {tokens} tokens, {dt:.2f}s, speed {tokens / dt:.2f} tokens/sec, mean latency: {mean_latency:.2f}s") + stats[n_workers] = {"tokens": tokens, "dt": dt, "mean_latency": mean_latency} + with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}.jsonl", "a") as f: + ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + row = json.dumps({"ts": ts, "n_workers": n_workers, "tokens": tokens, "dt": dt, "mean_latency": mean_latency}) + f.write(row + "\n") + await asyncio.sleep(3.0) + + print("Benchmarking done") + with open(f"llm_token_stats_all_chunk{chunk_size}_{exp_name}.json", "w") as f: + json.dump(stats, f, indent=4) + print("All stats saved") + + +if __name__ == "__main__": + # Run the async benchmarks + asyncio.run(run_benchmarks()) + From 1332fc29b59f6c8e33d5b59e543c88e2cc4cefbc Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 17:49:30 +0000 Subject: [PATCH 081/126] flag to control ray usage --- conf/base.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/base.yaml b/conf/base.yaml index 638d2c13..5bf30c59 100644 --- a/conf/base.yaml +++ b/conf/base.yaml @@ -5,6 +5,7 @@ defaults: - _self_ seed: 42 +use_ray: false finetune: seed: ${..seed} From 2163313c70faa8f4b17bde39320341d72c10ae19 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 17:51:23 +0000 Subject: [PATCH 082/126] mcp config with ray and local envs --- conf/mcp.yaml | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index 330c6c9e..fa911e1a 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -3,7 +3,10 @@ defaults: - override finetune: grpo - _self_ +use_ray: true + llm: + use_cache: false parameters: max_tokens: 8192 @@ -12,9 +15,11 @@ test_llm: max_tokens: 8192 actor: - rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout + rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout_with_local_env system_prompt: Please reason step by step, and put your final answer within \boxed{{}}. 
- llm_max_rollouts: 64 + rollout_workers: 64 + llm_max_rollouts: 256 + problem_queue_size: 256 task_template: |- {task} shared_memory_entry_size: 10000000 @@ -31,26 +36,23 @@ test_dataset_names: - aime_2025 vllm_config: - use_v1: false + use_v1: true vllm_kwargs: enable-auto-tool-choice: "" tool-call-parser: rl_tool tool-parser-plugin: ${hydra:runtime.cwd}/pipelinerl/rl_tool_parser_plugin.py - max-num-seqs: ${actor.llm_max_rollouts} - max-num-batched-tokens: 4096 + max-num-seqs: 256 + max-num-batched-tokens: 32000 max_model_len: 32000 - gpu-memory-utilization: 0.85 + gpu-memory-utilization: 0.9 environment: - _target_: pipelinerl.domains.mcp.env_server.EmbeddedMCPEnvironment + _target_: tapeagents.mcp.MCPEnvironment config_path: ${hydra:runtime.cwd}/conf/mcp/python.json tools_whitelist: - run_python_code read_timeout_seconds: 600 use_cache: false - runtime_pool_workers: 4 - offload_tools: - - run_python_code world: From e6f232911759b43d159eb1742571967819bb8a62 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 17:51:47 +0000 Subject: [PATCH 083/126] update debug entrypoint --- debug.sh | 69 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 8 deletions(-) diff --git a/debug.sh b/debug.sh index c1e2822a..a2aca44b 100755 --- a/debug.sh +++ b/debug.sh @@ -1,18 +1,71 @@ #!/bin/bash -python -m pipelinerl.launch \ - output_dir=results/actor_debug1 \ +echo "Run 32 workers" +DEBUG_FILE=timing_debug_workers32_3.jsonl python -m pipelinerl.launch \ + output_dir=results/actor_debug32_3 \ force_restart=true \ - world.env_replicas_per_actor=1 \ - actor.llm_max_rollouts=16 \ + actor.llm_max_rollouts=256 \ finetune.seq_parallel=8 \ eval_every_n_versions=0 \ - actor.rollout_workers=1 \ + actor.rollout_workers=32 \ debug.mode=actor \ world.actor_fraction=8 \ world.finetune_fraction=0 \ world.preprocessor_fraction=0 \ --config-name mcp - # environment.n_envs=4 \ - # environment.mcp_read_timeout_seconds=300 \ - # environment.env_call_timeout=300 \ \ No newline at end of file +# echo "Run 10 workers" +# DEBUG_FILE=timing_debug_gpt_workers10.jsonl python -m pipelinerl.launch \ +# output_dir=results/actor_debug2 \ +# force_restart=true \ +# actor.llm_max_rollouts=16 \ +# finetune.seq_parallel=8 \ +# eval_every_n_versions=0 \ +# actor.rollout_workers=10 \ +# debug.mode=actor \ +# world.actor_fraction=8 \ +# world.finetune_fraction=0 \ +# world.preprocessor_fraction=0 \ +# --config-name mcp + + +# echo "Run 5 workers" +# DEBUG_FILE=timing_debug_gpt_workers5.jsonl python -m pipelinerl.launch \ +# output_dir=results/actor_debug2 \ +# force_restart=true \ +# actor.llm_max_rollouts=16 \ +# finetune.seq_parallel=8 \ +# eval_every_n_versions=0 \ +# actor.rollout_workers=5 \ +# debug.mode=actor \ +# world.actor_fraction=8 \ +# world.finetune_fraction=0 \ +# world.preprocessor_fraction=0 \ +# --config-name mcp + +# echo "Run 40 workers" +# DEBUG_FILE=timing_debug_gpt_workers40.jsonl python -m pipelinerl.launch \ +# output_dir=results/actor_debug2 \ +# force_restart=true \ +# actor.llm_max_rollouts=16 \ +# finetune.seq_parallel=8 \ +# eval_every_n_versions=0 \ +# actor.rollout_workers=40 \ +# debug.mode=actor \ +# world.actor_fraction=8 \ +# world.finetune_fraction=0 \ +# world.preprocessor_fraction=0 \ +# --config-name mcp + +# echo "Run 30 workers" +# DEBUG_FILE=timing_debug_gpt_workers30.jsonl python -m pipelinerl.launch \ +# output_dir=results/actor_debug2 \ +# force_restart=true \ +# actor.llm_max_rollouts=16 \ +# finetune.seq_parallel=8 \ +# 
eval_every_n_versions=0 \ +# actor.rollout_workers=30 \ +# debug.mode=actor \ +# world.actor_fraction=8 \ +# world.finetune_fraction=0 \ +# world.preprocessor_fraction=0 \ +# --config-name mcp \ No newline at end of file From 3dcdf096f6a0f0e5cc4e7e67aaeea2e0952ed1d1 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 8 Oct 2025 12:16:02 +0000 Subject: [PATCH 084/126] better timing logging --- debug.sh | 8 ++++---- pipelinerl/actor.py | 20 +++++++++++++------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/debug.sh b/debug.sh index a2aca44b..b72841c1 100755 --- a/debug.sh +++ b/debug.sh @@ -1,12 +1,12 @@ #!/bin/bash -echo "Run 32 workers" -DEBUG_FILE=timing_debug_workers32_3.jsonl python -m pipelinerl.launch \ - output_dir=results/actor_debug32_3 \ +echo "Run 40 workers" +DEBUG_FILE=timing_debug_workers40_1.jsonl python -m pipelinerl.launch \ + output_dir=results/actor_debug40_1 \ force_restart=true \ actor.llm_max_rollouts=256 \ finetune.seq_parallel=8 \ eval_every_n_versions=0 \ - actor.rollout_workers=32 \ + actor.rollout_workers=40 \ debug.mode=actor \ world.actor_fraction=8 \ world.finetune_fraction=0 \ diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 44b5daa7..b7193be1 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -627,6 +627,8 @@ def __init__(self, *args, **kwargs): self.finished_problems = [] self.token_count = 0 self.finished_rollouts_count = 0 + self.task_latencies = [] + self.ray_result_latencies = [] def start_backend(self): if not self.ray_ready: @@ -639,9 +641,10 @@ def start_backend(self): rollout_policy: Callable[[DictConfig, TrainableLLM, dict], RolloutResult] = hydra.utils.get_method(self.cfg.actor.rollout_policy) def rollout_wrapper(cfg: DictConfig, llm: TrainableLLM, problem: dict, problem_id: int) -> RolloutResult: + start_ts = time.monotonic() rollout_result: RolloutResult = rollout_policy(cfg, llm, problem) ts = time.monotonic() - return rollout_result, llm.get_base_url(), problem_id, ts + return rollout_result, llm.get_base_url(), problem_id, ts, start_ts self.ray_remote = ray.remote(rollout_wrapper) self.start_time = time.time() @@ -678,13 +681,14 @@ def receive_finished_tasks(self): logger.info(f"Found {len(finished_tasks)} finished tasks, {len(unfinished_tasks)} unfinished tasks left") self.unfinished_tasks = unfinished_tasks dt = time.time() - self.start_time - ray_result_latencies = [] for finished_task in finished_tasks: try: - rollout_result, llm_url, problem_id, inner_ts = ray.get(finished_task) + rollout_result, llm_url, problem_id, stop_ts, start_ts = ray.get(finished_task) + task_dt = stop_ts - start_ts + self.task_latencies.append(task_dt) outer_ts = time.monotonic() - ray_result_latency = outer_ts - inner_ts - ray_result_latencies.append(ray_result_latency) + ray_result_latency = outer_ts - stop_ts + self.ray_result_latencies.append(ray_result_latency) except Exception as e: logger.error(f"Error getting finished ray task: {e}") continue @@ -708,7 +712,8 @@ def receive_finished_tasks(self): f"rollouts finished: {self.finished_rollouts_count}, " f"total tokens: {self.token_count}, " f"gen speed: {self.token_count / dt:.2f} tokens/sec, " - f"ray latency: {np.mean(ray_result_latencies):.4f} seconds" + f"task latency: {np.mean(self.task_latencies[-10:]):.2f} sec, " + f"ray delay: {np.mean(self.ray_result_latencies[-10:]):.4f} sec" ) save_debug_line({ "rollouts_finished": self.finished_rollouts_count, @@ -717,7 +722,8 @@ def receive_finished_tasks(self): "tokens_produced": self.token_count, "dt": dt, 
"token_speed": self.token_count / dt, - "ray_latency": np.mean(ray_result_latencies), + "ray_latency": np.mean(self.ray_result_latencies[-10:]), + "task_latency": np.mean(self.task_latencies[-10:]), }) logger.info(f"LLMs utilization: {self.llms_utilization}") From ec567cc830f7b7ad076d627b40d7b67c0831ede1 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 8 Oct 2025 17:25:46 +0000 Subject: [PATCH 085/126] fixes --- pipelinerl/actor.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index b7193be1..c44e6fd3 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -495,7 +495,7 @@ def run(self, dataset: list[tuple[str, dict]]): group_samples = sum(len(r.training_texts) for r in rollout_results) published_samples += group_samples - samples_in_queue = self.result_queue.qsize() * attempts + samples_in_queue = self.results_ready_to_publish() all_text_dumps = [] for r in rollout_results: for text in r.training_texts: @@ -609,6 +609,9 @@ def stop_tasks(self): def get_new_results(self) -> list[RolloutResult]: return self.result_queue.get(block=False) + def results_ready_to_publish(self) -> int: + return self.result_queue.qsize() * self.cfg.attempts + class ActorLoopRay(ActorLoop): """ @@ -623,6 +626,7 @@ def __init__(self, *args, **kwargs): self.llms_by_url = {llm.get_base_url(): llm for llm in self.llms} self.llms_utilization = {llm.get_base_url(): 0 for llm in self.llms} self.problem_id = 0 + self.attempts = self.cfg.attempts if self.is_training else 1 self.unfinished_problems = defaultdict(list) # up to `attempts` rollout results for each problem self.finished_problems = [] self.token_count = 0 @@ -644,28 +648,28 @@ def rollout_wrapper(cfg: DictConfig, llm: TrainableLLM, problem: dict, problem_i start_ts = time.monotonic() rollout_result: RolloutResult = rollout_policy(cfg, llm, problem) ts = time.monotonic() + logger.info(f"Problem {problem_id} finished in {ts - start_ts:.2f} seconds") return rollout_result, llm.get_base_url(), problem_id, ts, start_ts self.ray_remote = ray.remote(rollout_wrapper) self.start_time = time.time() def have_capacity(self) -> bool: have_capacity = len(self.unfinished_tasks) < self.cfg.actor.problem_queue_size - have_llm = any(self.llms_utilization[llm_url] < self.cfg.actor.llm_max_rollouts for llm_url in self.llms_utilization) - have_capacity = have_capacity and have_llm + have_llm_capacity = any(self.llms_utilization[llm_url] < (self.cfg.actor.llm_max_rollouts - self.attempts) for llm_url in self.llms_utilization) + have_capacity = have_capacity and have_llm_capacity if not have_capacity: time.sleep(0.1) # sleep for a while to avoid quick loops when no capacity return have_capacity def submit_problem(self, problem: dict): - attempts = self.cfg.attempts if self.is_training else 1 - for attempt_number in range(attempts): + for attempt_number in range(self.attempts): llm_url, task_count = min(self.llms_utilization.items(), key=lambda x: x[1]) - logger.info(f"Submitting problem attempt {attempt_number} to the least busy LLM {llm_url} with {task_count} tasks") + logger.info(f"Submitting problem {self.problem_id} attempt {attempt_number}/{self.attempts} to the least busy LLM {llm_url} with {task_count} tasks") llm = self.llms_by_url[llm_url] task_ref = self.ray_remote.remote(self.cfg_dict, llm, problem, self.problem_id) - self.problem_id += 1 self.llms_utilization[llm_url] += 1 self.unfinished_tasks.append(task_ref) + self.problem_id += 1 def stop_tasks(self): ray.shutdown() @@ 
-701,7 +705,7 @@ def receive_finished_tasks(self): self.unfinished_problems[problem_id].append(rollout_result) logger.info(f"Problem {problem_id} has {len(self.unfinished_problems[problem_id])} rollout results") if len(self.unfinished_problems[problem_id]) == self.cfg.attempts: - logger.info(f"Group for problem {problem_id} finished") + logger.info(f"Problem {problem_id} group finished") self.finished_problems.append(self.unfinished_problems[problem_id]) del self.unfinished_problems[problem_id] logger.info(f"{len(self.finished_problems)} finished problems ready to return") @@ -734,6 +738,9 @@ def get_new_results(self) -> list[list[RolloutResult]]: return self.finished_problems.pop(0) return [] + def results_ready_to_publish(self) -> int: + return len(self.finished_problems) * self.cfg.attempts + def run_actor_loop(cfg: DictConfig): set_streams_backend(**cfg.streams) From 72c04a6715bfc7d512f5848a155d9a77f3cadb33 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 8 Oct 2025 17:58:53 +0000 Subject: [PATCH 086/126] fixes --- pipelinerl/actor.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index c44e6fd3..90c30d31 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -314,8 +314,8 @@ def start_backend(self): self.smm.start() # Use SharedMemoryQueue instead of separate problem_queue, result_queue, and io_buffer - self.problem_queue = SharedMemoryQueue(self.smm, self.cfg.actor.problem_queue_size, cfg.actor.shared_memory_entry_size) - self.result_queue = SharedMemoryQueue(self.smm, self.cfg.actor.result_queue_size, cfg.actor.shared_memory_entry_size) + self.problem_queue = SharedMemoryQueue(self.smm, self.cfg.actor.problem_queue_size, self.cfg.actor.shared_memory_entry_size) + self.result_queue = SharedMemoryQueue(self.smm, self.cfg.actor.result_queue_size, self.cfg.actor.shared_memory_entry_size) logger.info(f"Problem queue size: {self.problem_queue.max_size}, result queue size: {self.result_queue.max_size}") logger.info(f"Result queue buffer size: {self.result_queue.get_memory_size() / 2**30} Gb") @@ -523,8 +523,8 @@ def run(self, dataset: list[tuple[str, dict]]): if self.is_training: loop_stats = { "published_samples": published_samples, - "problem_queue_size": self.problem_queue.qsize(), - "result_queue_size": self.result_queue.qsize(), + "problem_queue_size": self.problem_queue_size(), + "result_queue_size": self.result_queue_size(), "finished_groups": finished_groups, "trainer_model_version": trainer_version_to_publish, "time_since_start": time.time() - loop_start_time, @@ -610,7 +610,13 @@ def get_new_results(self) -> list[RolloutResult]: return self.result_queue.get(block=False) def results_ready_to_publish(self) -> int: - return self.result_queue.qsize() * self.cfg.attempts + return self.result_queue_size() * self.cfg.attempts + + def problem_queue_size(self) -> int: + return self.problem_queue.qsize() + + def result_queue_size(self) -> int: + return self.result_queue.qsize() class ActorLoopRay(ActorLoop): @@ -643,6 +649,7 @@ def start_backend(self): else: logger.info("Ray already initialized") + assert self.trainer_state.propagated_weight_version is not None rollout_policy: Callable[[DictConfig, TrainableLLM, dict], RolloutResult] = hydra.utils.get_method(self.cfg.actor.rollout_policy) def rollout_wrapper(cfg: DictConfig, llm: TrainableLLM, problem: dict, problem_id: int) -> RolloutResult: start_ts = time.monotonic() @@ -688,6 +695,7 @@ def receive_finished_tasks(self): for 
finished_task in finished_tasks: try: rollout_result, llm_url, problem_id, stop_ts, start_ts = ray.get(finished_task) + rollout_result.model_version = self.trainer_state.propagated_weight_version task_dt = stop_ts - start_ts self.task_latencies.append(task_dt) outer_ts = time.monotonic() @@ -738,8 +746,11 @@ def get_new_results(self) -> list[list[RolloutResult]]: return self.finished_problems.pop(0) return [] - def results_ready_to_publish(self) -> int: - return len(self.finished_problems) * self.cfg.attempts + def problem_queue_size(self) -> int: + return len(self.unfinished_tasks) + + def result_queue_size(self) -> int: + return len(self.finished_problems) def run_actor_loop(cfg: DictConfig): From 68c95343a9d80bd99ea59f61bf60390a28dfcea2 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 9 Oct 2025 11:43:06 +0000 Subject: [PATCH 087/126] faster mcp server startup, significant speedup --- conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index d64cb8eb..fcbb4dcf 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "JOB_TAG=${MCP_JOB_TAG:-${JOB_ID:-$HOSTNAME}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && /home/toolkit/.deno/bin/deno cache jsr:@pydantic/mcp-run-python >/dev/null 2>&1 || true; DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + "deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio" ] } } From 872059d2abf5552e9440beafcdf8c229438a1574 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 9 Oct 2025 12:37:45 +0000 Subject: [PATCH 088/126] fix training texts metadata --- pipelinerl/actor.py | 16 +++++++++++++++- pipelinerl/domains/mcp/rollouts.py | 3 +-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 90c30d31..56834363 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -74,6 +74,7 @@ def update(self, prompt_tokens: list[int], output_tokens: list[int]): def get_stats(self): if len(self.data.prompt_tokens_window) < self.window_size: + logger.warning(f"Not enough data to compute sliding stats, window size: {self.window_size}, data length: {len(self.data.prompt_tokens_window)}") return None # 1. How many samples do we produce per second? 
@@ -592,6 +593,7 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict): stats[f"{prefix}{new_suffix}"] = stats[key] break + logger.info(f"Publish actor stats to wandb: {stats}") if self.cfg.wandb.use_wandb: wandb.log({f"actor/{k}": v for k, v in stats.items()}) stats_writer.write(stats) @@ -631,6 +633,7 @@ def __init__(self, *args, **kwargs): self.unfinished_tasks = [] self.llms_by_url = {llm.get_base_url(): llm for llm in self.llms} self.llms_utilization = {llm.get_base_url(): 0 for llm in self.llms} + self.scheduler_name = f"{'train' if self.is_training else 'test'} ray scheduler" self.problem_id = 0 self.attempts = self.cfg.attempts if self.is_training else 1 self.unfinished_problems = defaultdict(list) # up to `attempts` rollout results for each problem @@ -696,6 +699,15 @@ def receive_finished_tasks(self): try: rollout_result, llm_url, problem_id, stop_ts, start_ts = ray.get(finished_task) rollout_result.model_version = self.trainer_state.propagated_weight_version + full_group_id = f"{self.scheduler_name}_{problem_id}" + rollout_result.group_id = full_group_id + rollout_index = len(self.unfinished_problems[problem_id]) + for step_index, sample in enumerate(rollout_result.training_texts): + # Downstream in the pipeline we'll need these fields in every sample + sample.metadata["model_version"] = rollout_result.model_version + sample.metadata["rollout_index"] = rollout_index + sample.metadata["step_index"] = step_index + sample.group_id = full_group_id task_dt = stop_ts - start_ts self.task_latencies.append(task_dt) outer_ts = time.monotonic() @@ -714,7 +726,9 @@ def receive_finished_tasks(self): logger.info(f"Problem {problem_id} has {len(self.unfinished_problems[problem_id])} rollout results") if len(self.unfinished_problems[problem_id]) == self.cfg.attempts: logger.info(f"Problem {problem_id} group finished") - self.finished_problems.append(self.unfinished_problems[problem_id]) + group = self.unfinished_problems[problem_id] + random.shuffle(group) + self.finished_problems.append(group) del self.unfinished_problems[problem_id] logger.info(f"{len(self.finished_problems)} finished problems ready to return") logger.info( diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index 2758ed15..360290cb 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -430,8 +430,7 @@ def generate_mcp_rollout_with_local_env( training_texts=training_texts, metrics=metrics, latency=latency, - dataset_name=problem["dataset"], - llm_url=llm.get_base_url(), + dataset_name=problem["dataset"] ) except Exception as e: err_msg = f"Error generating rollout: {e}" From b702489b3bc0ad8e2fb4d54bf4bd21a19c1367cb Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 9 Oct 2025 13:00:50 +0000 Subject: [PATCH 089/126] fixes --- debug.sh | 8 ++++---- pipelinerl/actor.py | 13 ++++++++++--- pipelinerl/domains/mcp/rollouts.py | 5 +---- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/debug.sh b/debug.sh index b72841c1..115a5bb2 100755 --- a/debug.sh +++ b/debug.sh @@ -1,12 +1,12 @@ #!/bin/bash -echo "Run 40 workers" -DEBUG_FILE=timing_debug_workers40_1.jsonl python -m pipelinerl.launch \ - output_dir=results/actor_debug40_1 \ +echo "Run 38 workers" +DEBUG_FILE=timing_debug_workers38_3.jsonl python -m pipelinerl.launch \ + output_dir=results/actor_debug38_3 \ force_restart=true \ actor.llm_max_rollouts=256 \ finetune.seq_parallel=8 \ eval_every_n_versions=0 \ - actor.rollout_workers=40 \ + actor.rollout_workers=38 \ 
debug.mode=actor \ world.actor_fraction=8 \ world.finetune_fraction=0 \ diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 56834363..19d3bc34 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -1,4 +1,5 @@ import asyncio +import json import logging import math import multiprocessing as mp @@ -20,7 +21,6 @@ from omegaconf import DictConfig, OmegaConf from pydantic import BaseModel, Field from tapeagents.llms import TrainableLLM -from tapeagents.orchestrator import save_debug_line import wandb from pipelinerl.finetune.logging_ import flatten_dict_config, init_wandb @@ -45,6 +45,11 @@ logger = logging.getLogger(__name__) +def save_debug_line(data:dict): + data["ts"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + fname = os.environ.get("DEBUG_FILE", "timing_debug.jsonl") + with open(fname, "a") as f: + f.write(json.dumps(data, ensure_ascii=False) + "\n") class SlidingWindowData(BaseModel): prompt_tokens_window: list[list[int]] = Field( @@ -73,9 +78,11 @@ def update(self, prompt_tokens: list[int], output_tokens: list[int]): self.data.timestamps.pop(0) def get_stats(self): - if len(self.data.prompt_tokens_window) < self.window_size: - logger.warning(f"Not enough data to compute sliding stats, window size: {self.window_size}, data length: {len(self.data.prompt_tokens_window)}") + if len(self.data.prompt_tokens_window) < 2: + logger.warning("Not enough data to compute sliding stats") return None + elif len(self.data.prompt_tokens_window) < self.window_size: + logger.warning(f"Compute sliding stats over just {len(self.data.prompt_tokens_window)} samples") # 1. How many samples do we produce per second? # 2. How many output tokens do we produce per second? diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index 360290cb..d3e998c2 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -13,10 +13,9 @@ from tapeagents.agent import DEFAULT, Agent from tapeagents.core import LLMCall, Tape, TrainingText from tapeagents.dialog_tape import UserStep -from tapeagents.llms import LiteLLM from tapeagents.llms.trainable import TrainableLLM from tapeagents.mcp import MCPEnvironment -from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config, save_debug_tape +from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config from tapeagents.remote_environment import AsyncRemoteEnvironment from pipelinerl.async_llm import make_training_text @@ -43,7 +42,6 @@ def _get_embedded_worker(env_cfg: DictConfig, concurrency: int) -> EmbeddedEnvir _embedded_worker.set_concurrency(concurrency) return _embedded_worker - def count_tool_calls_by_category(llm_calls: List[LLMCall]) -> Dict[str, int]: """ Count the number of tool calls for each function name category. 
@@ -319,7 +317,6 @@ def generate_mcp_rollout_with_local_env( tape = execute_agent(agent, tape, environment, max_loops=cfg.agent_max_loops) logger.info("Agent finished") tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) - # save_debug_tape(tape) reward_table = RewardTable(**dict(cfg.rewards)) llm_calls: list[LLMCall] = [ From dea891a1c50e064ef26d574573fcfdd97015ff02 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 9 Oct 2025 13:03:27 +0000 Subject: [PATCH 090/126] fix --- pipelinerl/actor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 19d3bc34..b5cdb596 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -64,8 +64,9 @@ class SlidingWindowData(BaseModel): class SlidingWindowAggregator: - def __init__(self, window_size: int): + def __init__(self, window_size: int, min_samples: int = 5): self.window_size = window_size + self.min_samples = min_samples self.data = SlidingWindowData() def update(self, prompt_tokens: list[int], output_tokens: list[int]): @@ -78,7 +79,7 @@ def update(self, prompt_tokens: list[int], output_tokens: list[int]): self.data.timestamps.pop(0) def get_stats(self): - if len(self.data.prompt_tokens_window) < 2: + if len(self.data.prompt_tokens_window) < self.min_samples: logger.warning("Not enough data to compute sliding stats") return None elif len(self.data.prompt_tokens_window) < self.window_size: From e6723127558584d5fcf50326a36b6c5ebe872081 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 9 Oct 2025 13:19:39 +0000 Subject: [PATCH 091/126] make exp dir with all my scripts --- debug.sh => experiments/olmer/env_speed/debug.sh | 4 ++-- llm.sh => experiments/olmer/env_speed/llm.sh | 0 llm_bench.py => experiments/olmer/env_speed/llm_bench.py | 0 .../olmer/env_speed/llm_bench_async.py | 0 4 files changed, 2 insertions(+), 2 deletions(-) rename debug.sh => experiments/olmer/env_speed/debug.sh (95%) rename llm.sh => experiments/olmer/env_speed/llm.sh (100%) rename llm_bench.py => experiments/olmer/env_speed/llm_bench.py (100%) rename llm_bench_async.py => experiments/olmer/env_speed/llm_bench_async.py (100%) diff --git a/debug.sh b/experiments/olmer/env_speed/debug.sh similarity index 95% rename from debug.sh rename to experiments/olmer/env_speed/debug.sh index 115a5bb2..c79ba094 100755 --- a/debug.sh +++ b/experiments/olmer/env_speed/debug.sh @@ -1,7 +1,7 @@ #!/bin/bash echo "Run 38 workers" -DEBUG_FILE=timing_debug_workers38_3.jsonl python -m pipelinerl.launch \ - output_dir=results/actor_debug38_3 \ +DEBUG_FILE=timing_debug_workers38_4.jsonl python -m pipelinerl.launch \ + output_dir=results/actor_debug38_4 \ force_restart=true \ actor.llm_max_rollouts=256 \ finetune.seq_parallel=8 \ diff --git a/llm.sh b/experiments/olmer/env_speed/llm.sh similarity index 100% rename from llm.sh rename to experiments/olmer/env_speed/llm.sh diff --git a/llm_bench.py b/experiments/olmer/env_speed/llm_bench.py similarity index 100% rename from llm_bench.py rename to experiments/olmer/env_speed/llm_bench.py diff --git a/llm_bench_async.py b/experiments/olmer/env_speed/llm_bench_async.py similarity index 100% rename from llm_bench_async.py rename to experiments/olmer/env_speed/llm_bench_async.py From 4ad27f9d6895bd147a2b5b22e560bf87b365f988 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 10 Oct 2025 17:00:13 +0200 Subject: [PATCH 092/126] move personal scripts out --- experiments/olmer/env_speed/debug.sh | 71 ------- experiments/olmer/env_speed/llm.sh | 
72 ------- experiments/olmer/env_speed/llm_bench.py | 139 ------------ .../olmer/env_speed/llm_bench_async.py | 200 ------------------ 4 files changed, 482 deletions(-) delete mode 100755 experiments/olmer/env_speed/debug.sh delete mode 100755 experiments/olmer/env_speed/llm.sh delete mode 100644 experiments/olmer/env_speed/llm_bench.py delete mode 100644 experiments/olmer/env_speed/llm_bench_async.py diff --git a/experiments/olmer/env_speed/debug.sh b/experiments/olmer/env_speed/debug.sh deleted file mode 100755 index c79ba094..00000000 --- a/experiments/olmer/env_speed/debug.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -echo "Run 38 workers" -DEBUG_FILE=timing_debug_workers38_4.jsonl python -m pipelinerl.launch \ - output_dir=results/actor_debug38_4 \ - force_restart=true \ - actor.llm_max_rollouts=256 \ - finetune.seq_parallel=8 \ - eval_every_n_versions=0 \ - actor.rollout_workers=38 \ - debug.mode=actor \ - world.actor_fraction=8 \ - world.finetune_fraction=0 \ - world.preprocessor_fraction=0 \ - --config-name mcp - -# echo "Run 10 workers" -# DEBUG_FILE=timing_debug_gpt_workers10.jsonl python -m pipelinerl.launch \ -# output_dir=results/actor_debug2 \ -# force_restart=true \ -# actor.llm_max_rollouts=16 \ -# finetune.seq_parallel=8 \ -# eval_every_n_versions=0 \ -# actor.rollout_workers=10 \ -# debug.mode=actor \ -# world.actor_fraction=8 \ -# world.finetune_fraction=0 \ -# world.preprocessor_fraction=0 \ -# --config-name mcp - - -# echo "Run 5 workers" -# DEBUG_FILE=timing_debug_gpt_workers5.jsonl python -m pipelinerl.launch \ -# output_dir=results/actor_debug2 \ -# force_restart=true \ -# actor.llm_max_rollouts=16 \ -# finetune.seq_parallel=8 \ -# eval_every_n_versions=0 \ -# actor.rollout_workers=5 \ -# debug.mode=actor \ -# world.actor_fraction=8 \ -# world.finetune_fraction=0 \ -# world.preprocessor_fraction=0 \ -# --config-name mcp - -# echo "Run 40 workers" -# DEBUG_FILE=timing_debug_gpt_workers40.jsonl python -m pipelinerl.launch \ -# output_dir=results/actor_debug2 \ -# force_restart=true \ -# actor.llm_max_rollouts=16 \ -# finetune.seq_parallel=8 \ -# eval_every_n_versions=0 \ -# actor.rollout_workers=40 \ -# debug.mode=actor \ -# world.actor_fraction=8 \ -# world.finetune_fraction=0 \ -# world.preprocessor_fraction=0 \ -# --config-name mcp - -# echo "Run 30 workers" -# DEBUG_FILE=timing_debug_gpt_workers30.jsonl python -m pipelinerl.launch \ -# output_dir=results/actor_debug2 \ -# force_restart=true \ -# actor.llm_max_rollouts=16 \ -# finetune.seq_parallel=8 \ -# eval_every_n_versions=0 \ -# actor.rollout_workers=30 \ -# debug.mode=actor \ -# world.actor_fraction=8 \ -# world.finetune_fraction=0 \ -# world.preprocessor_fraction=0 \ -# --config-name mcp \ No newline at end of file diff --git a/experiments/olmer/env_speed/llm.sh b/experiments/olmer/env_speed/llm.sh deleted file mode 100755 index 78758724..00000000 --- a/experiments/olmer/env_speed/llm.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash -echo "Run LLM only" - -# python -m pipelinerl.launch \ -# output_dir=results/llm_debug1 \ -# force_restart=true \ -# actor.llm_max_rollouts=16 \ -# finetune.seq_parallel=8 \ -# eval_every_n_versions=0 \ -# debug.mode=llm \ -# world.actor_fraction=8 \ -# world.finetune_fraction=0 \ -# world.preprocessor_fraction=0 \ -# --config-name mcp - - -python -m pipelinerl.entrypoints.run_vllm1 \ - --model Qwen/Qwen3-8B \ - --host 0.0.0.0 \ - --port 8080 \ - --seed 42 \ - --actor-llm-idx 0 \ - --weight-update-group-init-method tcp://localhost:9000 \ - --weight-update-group-world-size 2 \ - 
--dtype bfloat16 \ - --gpu-memory-utilization 0.9 \ - --num-scheduler-steps 1 \ - --disable-log-requests \ - --disable-frontend-multiprocessing \ - --max-num-seqs 256 \ - --max-num-batched-tokens 32000 \ - --max_model_len 32000 \ - --enable-chunked-prefill \ - --return-tokens-as-token-ids \ - --tensor-parallel-size 1 \ - --pipeline-parallel-size 1 \ - --generation-config vllm \ - --enable-auto-tool-choice \ - --tool-call-parser rl_tool \ - --tool-parser-plugin /home/toolkit/PipelineRL/pipelinerl/rl_tool_parser_plugin.py \ - --disable-weight-update - - -# python -m pipelinerl.entrypoints.run_vllm0 \ -# --model Qwen/Qwen2.5-7B \ -# --host 0.0.0.0 \ -# --port 8080 \ -# --seed 42 \ -# --actor-llm-idx 0 \ -# --weight-update-group-init-method tcp://localhost:9000 \ -# --weight-update-group-world-size 2 \ -# --dtype bfloat16 \ -# --gpu-memory-utilization 0.9 \ -# --num-scheduler-steps 1 \ -# --disable-log-requests \ -# --disable-frontend-multiprocessing \ -# --max-num-seqs 256 \ -# --max-num-batched-tokens 32000 \ -# --enable-chunked-prefill \ -# --return-tokens-as-token-ids \ -# --tensor-parallel-size 1 \ -# --pipeline-parallel-size 1 \ -# --generation-config vllm \ -# --max_model_len 32000 \ -# --enable-auto-tool-choice \ -# --tool-call-parser rl_tool \ -# --tool-parser-plugin /home/toolkit/PipelineRL/pipelinerl/rl_tool_parser_plugin.py \ -# --disable-weight-update - -# python -m pipelinerl.entrypoints.run_vllm0 --model /mnt/llmd/base_models/Mistral-Small-24B-Base-2501 --host 0.0.0.0 --port 8080 --seed 78 --actor-llm-idx 36 --weight-update-group-init-method tcp://dns-99833624-2133-43c0-a112-07520ffee505-0:9000 --weight-update-group-world-size 49 --dtype bfloat16 --gpu-memory-utilization 0.9 --num-scheduler-steps 1 --disable-log-requests --disable-frontend-multiprocessing --max-num-seqs 256 --max-num-batched-tokens 1024 --enable-chunked-prefill --return-tokens-as-token-ids --tensor-parallel-size 1 --pipeline-parallel-size 1 --generation-config vllm --max_model_len 32768 - - \ No newline at end of file diff --git a/experiments/olmer/env_speed/llm_bench.py b/experiments/olmer/env_speed/llm_bench.py deleted file mode 100644 index e61cf342..00000000 --- a/experiments/olmer/env_speed/llm_bench.py +++ /dev/null @@ -1,139 +0,0 @@ -import json -import os -import random -import time - -import numpy as np -import ray -import requests -from tapeagents.llms import TrainableLLM - -os.environ["TOKENIZERS_PARALLELISM"] = "false" - -llm_url = "http://localhost:8080" -llm_model = "Qwen/Qwen3-8B" -# llm_model = "Qwen/Qwen2.5-7B" -exp_name = "qwen3-8b-v1" -# exp_name = "qwen2.5-7b" -max_tokens = 8192 - -def llm_quick_response(prompt: str): - r = requests.post( - url=f"{llm_url}/v1/chat/completions", - json={ - "model": llm_model, - "messages": [{"role": "user", "content": prompt}], - "stream": False, - }, - headers={"Content-Type": "application/json"}, - stream=False, - verify=False, - ) - d = r.json() - return d["choices"][0]["message"]["content"] - - -llm = TrainableLLM(base_url=llm_url, model_name=llm_model) -response = llm.quick_response("Hello, how are you?") -response = llm_quick_response("Hello, how are you?") -assert len(response) > 0 -assert llm.tokenizer is not None -print("LLM is ready") - - -with open("debug_training_texts.jsonl", "r", encoding="utf-8") as f: - all_dicts = [json.loads(line) for line in f if line.strip()] -total_tokens = 0 -for d in all_dicts: - text = d["text"] - n_predicted = d["n_predicted"] - prompt = text[:-n_predicted] - response = text[-n_predicted:] - tokens = 
llm.tokenizer.encode(text) - total_tokens += len(tokens) -print(f"Loaded {len(all_dicts)} texts, total tokens: {total_tokens}") - -prompts = [d["text"][:-d["n_predicted"]] for d in all_dicts] -random.seed(42) -random.shuffle(prompts) -chunk_size = 4 -prompts_chunks = [prompts[i:i+chunk_size] for i in range(0, len(prompts), chunk_size)] -print(f"Chunked to {len(prompts_chunks)} chunks") -too_many_chunks = prompts_chunks * 20 - -def benchmark_llm(n_workers: int): - ray.shutdown() - ray.init(num_cpus=n_workers) - - def get_responses(prompts: str): - responses = [] - # local_llm = TrainableLLM(base_url=llm_url, model_name=llm_model, parameters={"max_tokens": max_tokens}) - for i, prompt in enumerate(prompts): - t = time.perf_counter() - # r = local_llm.quick_response(prompt) - r = llm_quick_response(prompt) - dt = time.perf_counter() - t - responses.append((prompt + r, dt)) - return responses - - remote_fn = ray.remote(get_responses) - - start_time = time.perf_counter() - - n_chunks = max(200, n_workers * 2) - chunks = too_many_chunks[:n_chunks] - print(f"Multiplied to {len(chunks)} chunks") - random.seed(42) - random.shuffle(chunks) - unfinished_tasks = [] - for chunk in chunks: - unfinished_tasks.append(remote_fn.remote(chunk)) - - responses = [] - total_tokens = 0 - total_finished = 0 - latencies = [] - print(f"Submitted {len(unfinished_tasks)} tasks") - while unfinished_tasks: - finished_tasks, unfinished_tasks = ray.wait(unfinished_tasks, num_returns=len(unfinished_tasks), timeout=0.1) - for finished_task in finished_tasks: - responses = ray.get(finished_task) - total_finished += 1 - for response, latency in responses: - latencies.append(latency) - tokens = llm.tokenizer.encode(response) - total_tokens += len(tokens) - dt = time.perf_counter() - start_time - if len(finished_tasks) > 0: - print(f"t: {dt:.2f}s, {total_finished} finished, Total tokens: {total_tokens}, tokens/sec: {total_tokens / dt:.2f}, last 10 latency: {np.mean(latencies[-10:]):.2f}s") - with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}_log.jsonl", "a") as f: - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - row = json.dumps({"ts": ts, "exp_name": exp_name, "n_workers": n_workers, "tokens": total_tokens, "dt": dt, "mean_latency": np.mean(latencies), "last_10_latency": np.mean(latencies[-10:]), "total_finished": total_finished, "token_speed": total_tokens / dt}) - f.write(row + "\n") - if len(unfinished_tasks) < n_workers: - print(f"Saturation mode ended, stopping") - break - time.sleep(2.0) - - final_time = time.perf_counter() - start_time - print(f"Final, workers:{n_workers}, t:{final_time:.2f}s, total tokens: {total_tokens}, tokens/sec: {total_tokens / final_time:.2f}") - ray.shutdown() - mean_latency = np.mean(latencies) - return total_tokens, final_time, mean_latency - -stats = {} -for n_workers in [128]: #[64, 256, 128, 32, 4, 8, 16, 512, 1024]: # most optimal first - print(f"Benchmarking {n_workers} workers..") - tokens, dt, mean_latency = benchmark_llm(n_workers) - print(f"Done {n_workers} workers: {tokens} tokens, {dt:.2f}s, speed {tokens / dt:.2f} tokens/sec, mean latency: {mean_latency:.2f}s") - stats[n_workers] = {"tokens": tokens, "dt": dt, "mean_latency": mean_latency} - with open(f"llm_token_stats_ray_chunk{chunk_size}_{exp_name}.jsonl", "a") as f: - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - row = json.dumps({"ts": ts, "n_workers": n_workers, "tokens": tokens, "dt": dt, "mean_latency": mean_latency}) - f.write(row + "\n") - time.sleep(3.0) - -print("Benchmarking done") 
-with open(f"llm_token_stats_ray_all_chunk{chunk_size}_{exp_name}.json", "w") as f: - json.dump(stats, f, indent=4) -print("All stats saved") \ No newline at end of file diff --git a/experiments/olmer/env_speed/llm_bench_async.py b/experiments/olmer/env_speed/llm_bench_async.py deleted file mode 100644 index 0f307576..00000000 --- a/experiments/olmer/env_speed/llm_bench_async.py +++ /dev/null @@ -1,200 +0,0 @@ -import asyncio -import json -import os -import random -import time - -import aiohttp -import numpy as np -from tapeagents.llms import TrainableLLM - -os.environ["TOKENIZERS_PARALLELISM"] = "false" - -llm_url = "http://localhost:8080" -llm_model = "Qwen/Qwen3-8B" -# llm_model = "Qwen/Qwen2.5-7B" -exp_name = "qwen3-8b-v1" -# exp_name = "qwen2.5-7b" -max_tokens = 8192 - - -async def llm_quick_response_async(session: aiohttp.ClientSession, prompt: str): - """Async version of LLM quick response""" - async with session.post( - url=f"{llm_url}/v1/chat/completions", - json={ - "model": llm_model, - "messages": [{"role": "user", "content": prompt}], - "stream": False, - }, - headers={"Content-Type": "application/json"}, - ssl=False, - ) as response: - d = await response.json() - return d["choices"][0]["message"]["content"] - - - - -# Initial LLM test (synchronous) -llm = TrainableLLM(base_url=llm_url, model_name=llm_model) -response = llm.quick_response("Hello, how are you?") -assert len(response) > 0 -assert llm.tokenizer is not None -print("LLM is ready") - - -with open("debug_training_texts.jsonl", "r", encoding="utf-8") as f: - all_dicts = [json.loads(line) for line in f if line.strip()] -total_tokens = 0 -for d in all_dicts: - text = d["text"] - n_predicted = d["n_predicted"] - prompt = text[:-n_predicted] - response = text[-n_predicted:] - tokens = llm.tokenizer.encode(text) - total_tokens += len(tokens) -print(f"Loaded {len(all_dicts)} texts, total tokens: {total_tokens}") - -prompts = [d["text"][:-d["n_predicted"]] for d in all_dicts] -random.seed(42) -random.shuffle(prompts) -chunk_size = 4 -prompts_chunks = [prompts[i:i+chunk_size] for i in range(0, len(prompts), chunk_size)] -print(f"Chunked to {len(prompts_chunks)} chunks") -too_many_chunks = prompts_chunks * 20 - - -async def get_responses_async(session: aiohttp.ClientSession, prompts: list[str], tokenizer): - """Process a chunk of prompts asynchronously""" - responses = [] - for prompt in prompts: - t = time.perf_counter() - try: - r = await llm_quick_response_async(session, prompt) - dt = time.perf_counter() - t - responses.append((prompt + r, dt)) - except Exception as e: - print(f"Error processing prompt: {e}") - dt = time.perf_counter() - t - responses.append((prompt, dt)) - return responses - - -async def benchmark_llm_async(n_workers: int): - """Benchmark LLM using async/await with controlled concurrency""" - print(f"Starting async benchmark with {n_workers} concurrent workers") - - start_time = time.perf_counter() - - n_chunks = max(200, n_workers * 2) - chunks = too_many_chunks[:n_chunks] - print(f"Multiplied to {len(chunks)} chunks") - random.seed(42) - random.shuffle(chunks) - - total_tokens = 0 - total_finished = 0 - latencies = [] - - # Create shared aiohttp session with connection pooling - connector = aiohttp.TCPConnector(limit=n_workers, limit_per_host=n_workers) - timeout = aiohttp.ClientTimeout(total=300) # 5 minute timeout - - async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: - # Create all tasks - tasks = [] - for chunk in chunks: - task = 
asyncio.create_task(get_responses_async(session, chunk, llm.tokenizer)) - tasks.append(task) - - print(f"Created {len(tasks)} tasks") - - # Process tasks with controlled concurrency - pending = set(tasks) - active = set() - - while pending or active: - # Fill up active tasks up to n_workers limit - while len(active) < n_workers and pending: - task = pending.pop() - active.add(task) - - if not active: - break - - # Wait for at least one task to complete - done, active = await asyncio.wait(active, timeout=0.1, return_when=asyncio.FIRST_COMPLETED) - - # Process completed tasks - for finished_task in done: - try: - responses = await finished_task - total_finished += 1 - for response, latency in responses: - latencies.append(latency) - tokens = llm.tokenizer.encode(response) - total_tokens += len(tokens) - except Exception as e: - print(f"Task failed with error: {e}") - total_finished += 1 - - # Log progress - dt = time.perf_counter() - start_time - if len(done) > 0: - print(f"t: {dt:.2f}s, {total_finished} finished, Total tokens: {total_tokens}, tokens/sec: {total_tokens / dt:.2f}, last 10 latency: {np.mean(latencies[-10:]) if latencies else 0:.2f}s") - with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}_log.jsonl", "a") as f: - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - row = json.dumps({ - "ts": ts, - "exp_name": exp_name, - "n_workers": n_workers, - "tokens": total_tokens, - "dt": dt, - "mean_latency": np.mean(latencies) if latencies else 0, - "last_10_latency": np.mean(latencies[-10:]) if latencies else 0, - "total_finished": total_finished, - "token_speed": total_tokens / dt if dt > 0 else 0 - }) - f.write(row + "\n") - - # Check saturation mode - if len(pending) + len(active) < n_workers: - print(f"Saturation mode ended, stopping") - # Cancel remaining tasks - for task in active: - task.cancel() - break - - await asyncio.sleep(2.0) - - final_time = time.perf_counter() - start_time - print(f"Final, workers:{n_workers}, t:{final_time:.2f}s, total tokens: {total_tokens}, tokens/sec: {total_tokens / final_time:.2f}") - mean_latency = np.mean(latencies) if latencies else 0 - return total_tokens, final_time, mean_latency - - -async def run_benchmarks(): - """Run benchmarks for different worker counts""" - stats = {} - for n_workers in [128]: # [64, 256, 128, 32, 4, 8, 16, 512, 1024]: # most optimal first - print(f"Benchmarking {n_workers} workers..") - tokens, dt, mean_latency = await benchmark_llm_async(n_workers) - print(f"Done {n_workers} workers: {tokens} tokens, {dt:.2f}s, speed {tokens / dt:.2f} tokens/sec, mean latency: {mean_latency:.2f}s") - stats[n_workers] = {"tokens": tokens, "dt": dt, "mean_latency": mean_latency} - with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}.jsonl", "a") as f: - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - row = json.dumps({"ts": ts, "n_workers": n_workers, "tokens": tokens, "dt": dt, "mean_latency": mean_latency}) - f.write(row + "\n") - await asyncio.sleep(3.0) - - print("Benchmarking done") - with open(f"llm_token_stats_all_chunk{chunk_size}_{exp_name}.json", "w") as f: - json.dump(stats, f, indent=4) - print("All stats saved") - - -if __name__ == "__main__": - # Run the async benchmarks - asyncio.run(run_benchmarks()) - From 41a080dead71a3ffd451b9b010fbc37ff6803b6a Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 10 Oct 2025 16:03:55 +0000 Subject: [PATCH 093/126] Remove test reward shaping --- conf/mcp.yaml | 21 +++------ pipelinerl/domains/mcp/rollouts.py | 68 ++++++------------------------ 2 files changed, 
17 insertions(+), 72 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index 43ebf586..a2fa2bb4 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -11,6 +11,10 @@ test_llm: parameters: max_tokens: 8192 +rewards: + correct_answer_not_finished: 0.0 + buffer_tokens: 2000 + actor: rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout system_prompt: Please reason step by step, and put your final answer within \boxed{{}}. @@ -146,19 +150,4 @@ agent: next_node: code # model_path: Qwen/Qwen3-8B -model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft - -# Local reward shaping for tool usage -python_tool_shaping: - bonus_on_correct_with_python: 0.2 - penalty_on_incorrect_without_python: 0.1 - max_abs: 0.2 - -# Encourage concise outputs (penalize long completions) -length_shaping: - target_ratio: 0.1 # 10% of max_tokens; auto scales with max_tokens - min_target_tokens: 256 # lower clamp - max_target_tokens: 2048 # upper clamp - slope: 0.001 # penalty per token beyond target - max_penalty: 0.2 # clamp absolute penalty - bonus_on_short_correct: 0.05 # bonus if correct and concise +model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft \ No newline at end of file diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index f62f0567..861c5fae 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -22,7 +22,7 @@ from tapeagents.remote_environment import AsyncRemoteEnvironment from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker -from pipelinerl.domains.math import RewardTable, get_reward, verify_answer, verify_answer_rpc +from pipelinerl.domains.math import RewardTable, get_reward, verify_answer, verify_answer_rpc, length_penalty from pipelinerl.rollouts import RolloutResult, BaseMetrics logger = logging.getLogger(__name__) @@ -192,63 +192,19 @@ async def generate_mcp_rollout( tape_finished = True if isinstance(tape.steps[-1], MathAnswer) else False base_reward = get_reward(answer_status, tape_finished, reward_table) - # Local reward shaping (configurable in conf/mcp.yaml) - total_shaping = 0.0 - shaping_cfg = getattr(cfg, "python_tool_shaping", None) - if shaping_cfg is not None: - num_python_calls = tool_call_counts.get("run_python_code", 0) - bonus_on_correct_with_python = float(getattr(shaping_cfg, "bonus_on_correct_with_python", 0.0)) - penalty_on_incorrect_without_python = float(getattr(shaping_cfg, "penalty_on_incorrect_without_python", 0.0)) - max_abs = float(getattr(shaping_cfg, "max_abs", 0.2)) + reward = base_reward - # Episode-level bonuses/penalties - if answer_status == "correct" and num_python_calls >= 1: - total_shaping += bonus_on_correct_with_python - if answer_status in ("wrong", "unparsable") and num_python_calls == 0: - total_shaping -= penalty_on_incorrect_without_python + discount_factor = float(getattr(cfg.actor, "discount_factor", 1.0)) + if discount_factor != 1.0: + total_generated_tokens = sum(getattr(call, "output_length_tokens", 0) for call in llm_calls) + reward *= discount_factor ** total_generated_tokens - # Clamp total shaping - if total_shaping > max_abs: - total_shaping = max_abs - if total_shaping < -max_abs: - total_shaping = -max_abs - - # Length shaping: discourage very long completions; award concise correct ones - length_cfg = getattr(cfg, "length_shaping", None) - if length_cfg is not None: - try: - # Prefer ratio-based target if provided; otherwise use absolute - if hasattr(length_cfg, "target_ratio"): - ratio = 
float(getattr(length_cfg, "target_ratio")) - max_gen = int(llm.parameters.get("max_tokens", 2048)) - target_tokens = int(max(1, ratio * max_gen)) - # Optional clamps - min_t = int(getattr(length_cfg, "min_target_tokens", 0)) - max_t = int(getattr(length_cfg, "max_target_tokens", 10**9)) - target_tokens = max(min_t, min(max_t, target_tokens)) - else: - target_tokens = int(getattr(length_cfg, "target_output_tokens", 512)) - slope = float(getattr(length_cfg, "slope", 0.0)) - max_penalty = float(getattr(length_cfg, "max_penalty", 0.0)) - bonus_short_correct = float(getattr(length_cfg, "bonus_on_short_correct", 0.0)) - except Exception: - target_tokens, slope, max_penalty, bonus_short_correct = 512, 0.0, 0.0, 0.0 - - # average output tokens across llm calls for this rollout - try: - avg_output_tokens = sum(t.output_tokens for t in training_texts) / max(1, len(training_texts)) - except Exception: - avg_output_tokens = 0.0 - - if slope > 0.0 and max_penalty > 0.0 and avg_output_tokens > target_tokens: - over_by = float(avg_output_tokens - target_tokens) - penalty = min(max_penalty, slope * over_by) - total_shaping -= penalty - - if bonus_short_correct > 0.0 and answer_status == "correct" and avg_output_tokens <= target_tokens: - total_shaping += bonus_short_correct - - reward = base_reward + total_shaping + buffer_tokens = getattr(reward_table, "buffer_tokens", 0) + if buffer_tokens: + max_tokens = int(llm.parameters.get("max_tokens", 0)) + total_output_tokens = sum(getattr(text, "output_tokens", 0) for text in training_texts) + if max_tokens > 0: + reward += length_penalty(max_tokens, total_output_tokens, buffer_tokens) # Assign identical reward to all steps in the rollout (pipeline expects uniform rollout_reward) for text in training_texts: From bd234113564717b0982f475f2a075d3f38a70ca1 Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 10 Oct 2025 16:29:17 +0000 Subject: [PATCH 094/126] Fix imports --- conf/mcp.yaml | 4 ++-- pipelinerl/domains/mcp/rollouts.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index a2fa2bb4..27c48a42 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -149,5 +149,5 @@ agent: trim_obs_except_last_n: 2 next_node: code -# model_path: Qwen/Qwen3-8B -model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft \ No newline at end of file +model_path: Qwen/Qwen3-8B +# model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft \ No newline at end of file diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index 861c5fae..b3116672 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -21,7 +21,10 @@ from tapeagents.core import LLMCall from tapeagents.remote_environment import AsyncRemoteEnvironment +from pipelinerl.async_llm import make_training_text from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker +from pipelinerl.domains.mcp.steps import MathAnswer +from pipelinerl.world import Job from pipelinerl.domains.math import RewardTable, get_reward, verify_answer, verify_answer_rpc, length_penalty from pipelinerl.rollouts import RolloutResult, BaseMetrics From 50f3ff9699dec81d2f3a3b30254154c7b947d10b Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 10 Oct 2025 16:35:03 +0000 Subject: [PATCH 095/126] Fix conflicts --- conf/mcp.yaml | 8 ++++---- pipelinerl/domains/mcp/rollouts.py | 6 ------ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index 
ec44ace0..ca6da70e 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -151,6 +151,7 @@ agent: trim_obs_except_last_n: 2 next_node: code +<<<<<<< HEAD <<<<<<< HEAD model_path: Qwen/Qwen3-8B # model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft @@ -169,7 +170,6 @@ length_shaping: slope: 0.001 # penalty per token beyond target max_penalty: 0.2 # clamp absolute penalty bonus_on_short_correct: 0.05 # bonus if correct and concise -======= -# model_path: Qwen/Qwen3-8B -model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft ->>>>>>> mcp_tir + +model_path: Qwen/Qwen3-8B +# model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index a71a7d2f..782d4978 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -18,18 +18,12 @@ from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config from tapeagents.remote_environment import AsyncRemoteEnvironment -<<<<<<< HEAD from pipelinerl.async_llm import make_training_text -from pipelinerl.domains.math import RewardTable, get_reward, verify_answer, verify_answer_rpc from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker from pipelinerl.domains.mcp.steps import MathAnswer -from pipelinerl.rollouts import BaseMetrics, RolloutResult from pipelinerl.world import Job -======= -from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker from pipelinerl.domains.math import RewardTable, get_reward, verify_answer, verify_answer_rpc, length_penalty from pipelinerl.rollouts import RolloutResult, BaseMetrics ->>>>>>> mcp_tir logger = logging.getLogger(__name__) From d9a65b3105be673f0aee1d6187f9d210153c6d39 Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 10 Oct 2025 18:38:33 +0000 Subject: [PATCH 096/126] Fix --- pipelinerl/domains/math/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/domains/math/__init__.py b/pipelinerl/domains/math/__init__.py index 1c7310f2..7a9809b7 100644 --- a/pipelinerl/domains/math/__init__.py +++ b/pipelinerl/domains/math/__init__.py @@ -1,3 +1,3 @@ from .load_datasets import load_datasets -from .rollouts import generate_math_rollout, RewardTable, get_reward +from .rollouts import generate_math_rollout, RewardTable, get_reward, length_penalty from .verifier_api import MathEnvironment, verify_answer, verify_answer_rpc \ No newline at end of file From 154ae639150e29fc279b5ac93217b0ce2dc450c9 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 13 Oct 2025 11:40:54 +0000 Subject: [PATCH 097/126] save tape in the rollout fun --- pipelinerl/actor.py | 5 +++-- pipelinerl/domains/mcp/rollouts.py | 7 +++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index b5cdb596..ea96c12b 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -662,8 +662,9 @@ def start_backend(self): assert self.trainer_state.propagated_weight_version is not None rollout_policy: Callable[[DictConfig, TrainableLLM, dict], RolloutResult] = hydra.utils.get_method(self.cfg.actor.rollout_policy) - def rollout_wrapper(cfg: DictConfig, llm: TrainableLLM, problem: dict, problem_id: int) -> RolloutResult: + def rollout_wrapper(cfg: DictConfig, llm: TrainableLLM, problem: dict, problem_id: int, attempt_number: int) -> RolloutResult: start_ts = time.monotonic() + problem["_pipeline_rl_id"] = 
f"problem_{problem_id}_attempt_{attempt_number}" rollout_result: RolloutResult = rollout_policy(cfg, llm, problem) ts = time.monotonic() logger.info(f"Problem {problem_id} finished in {ts - start_ts:.2f} seconds") @@ -684,7 +685,7 @@ def submit_problem(self, problem: dict): llm_url, task_count = min(self.llms_utilization.items(), key=lambda x: x[1]) logger.info(f"Submitting problem {self.problem_id} attempt {attempt_number}/{self.attempts} to the least busy LLM {llm_url} with {task_count} tasks") llm = self.llms_by_url[llm_url] - task_ref = self.ray_remote.remote(self.cfg_dict, llm, problem, self.problem_id) + task_ref = self.ray_remote.remote(self.cfg_dict, llm, problem, self.problem_id, attempt_number) self.llms_utilization[llm_url] += 1 self.unfinished_tasks.append(task_ref) self.problem_id += 1 diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index 782d4978..7cd69137 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -4,6 +4,7 @@ import random import time from collections import Counter +from pathlib import Path from typing import Dict, List from urllib.parse import urlparse @@ -13,6 +14,7 @@ from tapeagents.agent import DEFAULT, Agent from tapeagents.core import LLMCall, Tape, TrainingText from tapeagents.dialog_tape import UserStep +from tapeagents.io import save_json_tape from tapeagents.llms.trainable import TrainableLLM from tapeagents.mcp import MCPEnvironment from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config @@ -250,6 +252,8 @@ def generate_mcp_rollout_with_local_env( start = time.perf_counter() if isinstance(cfg, dict): cfg = OmegaConf.create(cfg) + tapes_dir = Path(cfg.output_dir) / "actor" / "tapes" + tapes_dir.mkdir(parents=True, exist_ok=True) agent, _env = get_agent_and_env_from_config(cfg) environment: MCPEnvironment = _env logger.info(f"Agent and environment loaded, using llm {llm.model_name} at {llm.get_base_url()}") @@ -365,6 +369,9 @@ def generate_mcp_rollout_with_local_env( env_time = tape.metadata.result.get("environment_execution_time", -1.0) total_time = tape.metadata.result.get("total_execution_time", -1.0) + tape_name = problem.get("_pipeline_rl_id", tape.metadata.id) + save_json_tape(tape, tapes_dir.as_posix(), tape_name) + metrics = Metrics( reward=reward, success=answer_status == "correct", From 0370f3cbf630165447a57cb4cacab25cac787716 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 13 Oct 2025 12:42:09 +0000 Subject: [PATCH 098/126] remove debug print to separate file --- pipelinerl/actor.py | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index ea96c12b..c731e771 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -45,12 +45,6 @@ logger = logging.getLogger(__name__) -def save_debug_line(data:dict): - data["ts"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - fname = os.environ.get("DEBUG_FILE", "timing_debug.jsonl") - with open(fname, "a") as f: - f.write(json.dumps(data, ensure_ascii=False) + "\n") - class SlidingWindowData(BaseModel): prompt_tokens_window: list[list[int]] = Field( default_factory=list, @@ -208,13 +202,11 @@ async def rollout_and_maybe_produce_result( connector = aiohttp.TCPConnector(limit=50000, limit_per_host=50000, keepalive_timeout=1.0) timeout = aiohttp.ClientTimeout(total=3600.0, connect=3600.0, sock_read=3600.0) old_finished_rollouts = 0 - start_time = time.time() async with 
aiohttp.ClientSession(connector=connector, timeout=timeout) as session: while True: if time.time() - last_logged > 10.0 and sum(active_rollouts): if finished_rollouts > old_finished_rollouts: old_finished_rollouts = finished_rollouts - save_debug_line({"rollouts_finished": finished_rollouts, "tokens_produced": token_count, "dt": time.time() - start_time, "token_speed": token_count / (time.time() - start_time)}) logger.info( f"{scheduler_name}: " f"rollouts in progress: {sum(active_rollouts)}, " @@ -748,19 +740,10 @@ def receive_finished_tasks(self): f"total tokens: {self.token_count}, " f"gen speed: {self.token_count / dt:.2f} tokens/sec, " f"task latency: {np.mean(self.task_latencies[-10:]):.2f} sec, " - f"ray delay: {np.mean(self.ray_result_latencies[-10:]):.4f} sec" + f"ray delay: {np.mean(self.ray_result_latencies[-10:]):.4f} sec," + f"time elapsed: {dt:.2f} sec,\n" + f"LLMs utilization: {self.llms_utilization}" ) - save_debug_line({ - "rollouts_finished": self.finished_rollouts_count, - "rollouts_in_progress": len(self.unfinished_tasks), - "problems_in_progress": len(self.unfinished_problems), - "tokens_produced": self.token_count, - "dt": dt, - "token_speed": self.token_count / dt, - "ray_latency": np.mean(self.ray_result_latencies[-10:]), - "task_latency": np.mean(self.task_latencies[-10:]), - }) - logger.info(f"LLMs utilization: {self.llms_utilization}") def get_new_results(self) -> list[list[RolloutResult]]: self.receive_finished_tasks() From 096896cc3d9821049d7660fccb83b1de5161869e Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 13 Oct 2025 12:51:16 +0000 Subject: [PATCH 099/126] remove old mcp rollout function, leave only new one --- conf/mcp.yaml | 2 +- pipelinerl/domains/mcp/__init__.py | 2 +- pipelinerl/domains/mcp/rollouts.py | 198 +---------------------------- 3 files changed, 4 insertions(+), 198 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index 5f8680b1..6b942e4c 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -19,7 +19,7 @@ rewards: buffer_tokens: 2000 actor: - rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout_with_local_env + rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout system_prompt: Please reason step by step, and put your final answer within \boxed{{}}. 
rollout_workers: 64 llm_max_rollouts: 256 diff --git a/pipelinerl/domains/mcp/__init__.py b/pipelinerl/domains/mcp/__init__.py index 4557fa53..c1aabe54 100644 --- a/pipelinerl/domains/mcp/__init__.py +++ b/pipelinerl/domains/mcp/__init__.py @@ -1,2 +1,2 @@ from .env_server import EmbeddedEnvironmentWorker, EmbeddedMCPEnvironment, MCPEnvironmentServer -from .rollouts import generate_mcp_rollout, generate_mcp_rollout_with_local_env +from .rollouts import generate_mcp_rollout diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index 7cd69137..b1056a80 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -1,5 +1,4 @@ import asyncio -import json import logging import random import time @@ -12,7 +11,7 @@ from hydra.utils import instantiate from omegaconf import DictConfig, OmegaConf from tapeagents.agent import DEFAULT, Agent -from tapeagents.core import LLMCall, Tape, TrainingText +from tapeagents.core import LLMCall, Tape from tapeagents.dialog_tape import UserStep from tapeagents.io import save_json_tape from tapeagents.llms.trainable import TrainableLLM @@ -73,178 +72,8 @@ class Metrics(BaseMetrics): environment_execution_time: float = -1.0 overflow: bool = False -async def generate_mcp_rollout( - cfg: DictConfig, - llm: TrainableLLM, - problem: dict, - session: aiohttp.ClientSession, -) -> RolloutResult: - start = time.perf_counter() - - chosen_url: str | None = None - env_host: str | None = None - env_port: int | None = None - - if cfg.world.environment_mode == "remote": - env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] - if not env_jobs: - raise RuntimeError("No environment servers available") - - env_urls_all = [f"http://{job.hostname}:{job.port}" for job in env_jobs if job.port is not None] - if not env_urls_all: - raise RuntimeError("Environment server definitions missing ports") - - while True: - env_urls = env_urls_all[:] - random.shuffle(env_urls) - chosen_url = None - for env_url in env_urls: - jitter = random.randint(3, 12) - try: - environment = AsyncRemoteEnvironment( - server_url=env_url, start_timeout_sec=600, start_repeat_delay=jitter) - context_manager = environment.acontext(session, wait_for_env=True) - env = await context_manager.__aenter__() - try: - await env.start_task(problem) - chosen_url = env_url - actions = await env.a_actions() - tools_description = await env.a_tools_description() - logger.debug(f"Available tools: {tools_description}") - agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) - agent.llms = {DEFAULT: llm} - - tape = Tape(steps=[ - UserStep(content=f"{problem['task']}. 
You have access to the following tools: {tools_description}") - ]) - t_exec = time.perf_counter() - while True: - try: - tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) - tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) - break - except Exception: - await asyncio.sleep(5) - break # success - finally: - await context_manager.__aexit__(None, None, None) - except Exception as e: - logger.warning(f"Env start failed at {env_url}: {e}") - continue - if chosen_url is not None: - break # success - await asyncio.sleep(1.0) - - parsed = urlparse(chosen_url) - env_host, env_port = parsed.hostname, parsed.port - else: - concurrency = max(1, int(getattr(cfg.world, "env_replicas_per_actor", 1))) - env_worker = _get_embedded_worker(cfg.environment, concurrency) - async with env_worker.alifecycle() as environment: - start_result = environment.start_task(problem) - tape_metadata = start_result if isinstance(start_result, dict) else {} - - actions = environment.actions() - tools_description = environment.tools_description() - logger.debug(f"Embedded tools: {tools_description}") - agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) - agent.llms = {DEFAULT: llm} - tape = Tape( - steps=[ - UserStep( - content=f"{problem['task']}. You have access to the following tools: {tools_description}" - ) - ] - ) - if tape_metadata: - tape.metadata.other.update(tape_metadata) - - t_exec = time.perf_counter() - tape = await async_execute_agent(agent, tape, environment, session, max_loops=cfg.agent_max_loops) - tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) - env_host = env_port = None - reward_table = RewardTable(**dict(cfg.rewards)) - - llm_calls: list[LLMCall] = [ - LLMCall(**step.metadata.other["llm_call"]) - if isinstance(step.metadata.other["llm_call"], dict) - else step.metadata.other["llm_call"] - for step in tape.steps if step.metadata.other.get("llm_call") is not None - ] - assert len(llm_calls) > 0, "No LLM calls found" - tool_call_counts = count_tool_calls_by_category(llm_calls) - training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] - n_llm_calls = len(llm_calls) - if env_host and env_port: - answer_status = await verify_answer_rpc( - session=session, - host=env_host, - port=env_port, - prediction=llm_calls[-1].output.content, # type: ignore - gold=problem["answer"], - strict=True, - ) - else: - answer_status = verify_answer( - prediction=llm_calls[-1].output.content, # type: ignore - gold=problem["answer"], - strict=True, - ) - # Tape should finish with an answer - tape_finished = True if isinstance(tape.steps[-1], MathAnswer) else False - base_reward = get_reward(answer_status, tape_finished, reward_table) - - reward = base_reward - - discount_factor = float(getattr(cfg.actor, "discount_factor", 1.0)) - if discount_factor != 1.0: - total_generated_tokens = sum(getattr(call, "output_length_tokens", 0) for call in llm_calls) - reward *= discount_factor ** total_generated_tokens - - buffer_tokens = getattr(reward_table, "buffer_tokens", 0) - if buffer_tokens: - max_tokens = int(llm.parameters.get("max_tokens", 0)) - total_output_tokens = sum(getattr(text, "output_tokens", 0) for text in training_texts) - if max_tokens > 0: - reward += length_penalty(max_tokens, total_output_tokens, buffer_tokens) - - # Assign identical reward to all steps in the rollout (pipeline expects uniform rollout_reward) - for text in training_texts: - 
text.reward = reward - text.finished = tape_finished - - latency = time.perf_counter() - start - - agent_time = tape.metadata.result.get("agent_execution_time", -1.0) - env_time = tape.metadata.result.get("environment_execution_time", -1.0) - total_time = tape.metadata.result.get("total_execution_time", -1.0) - - - metrics = Metrics( - reward=reward, - success=answer_status == "correct", - no_error=answer_status != "unparsable", - no_answer=answer_status == "no_answer", - num_steps=len(tape.steps), - num_python_calls=tool_call_counts.get("run_python_code", 0), - n_llm_calls=n_llm_calls, - total_execution_time=total_time, - agent_execution_time=agent_time, - environment_execution_time=env_time, - overflow=not tape_finished, - ) - - return RolloutResult( - training_texts=training_texts, - metrics=metrics, - latency=latency, - dataset_name=problem["dataset"], - ) - - - -def generate_mcp_rollout_with_local_env( +def generate_mcp_rollout( cfg: DictConfig | dict, llm: TrainableLLM, problem: dict, @@ -301,24 +130,6 @@ def generate_mcp_rollout_with_local_env( # Local reward shaping (configurable in conf/mcp.yaml) total_shaping = 0.0 - shaping_cfg = getattr(cfg, "python_tool_shaping", None) - if shaping_cfg is not None: - num_python_calls = tool_call_counts.get("run_python_code", 0) - bonus_on_correct_with_python = float(getattr(shaping_cfg, "bonus_on_correct_with_python", 0.0)) - penalty_on_incorrect_without_python = float(getattr(shaping_cfg, "penalty_on_incorrect_without_python", 0.0)) - max_abs = float(getattr(shaping_cfg, "max_abs", 0.2)) - - # Episode-level bonuses/penalties - if answer_status == "correct" and num_python_calls >= 1: - total_shaping += bonus_on_correct_with_python - if answer_status in ("wrong", "unparsable") and num_python_calls == 0: - total_shaping -= penalty_on_incorrect_without_python - - # Clamp total shaping - if total_shaping > max_abs: - total_shaping = max_abs - if total_shaping < -max_abs: - total_shaping = -max_abs # Length shaping: discourage very long completions; award concise correct ones length_cfg = getattr(cfg, "length_shaping", None) @@ -359,7 +170,6 @@ def generate_mcp_rollout_with_local_env( # Assign identical reward to all steps in the rollout (pipeline expects uniform rollout_reward) for text in training_texts: - # debug_save_training_text(text) text.reward = reward text.finished = tape_finished @@ -401,7 +211,3 @@ def generate_mcp_rollout_with_local_env( environment.close() except Exception as e: logger.error(f"Error closing environment: {e}") - -def debug_save_training_text(text: TrainingText): - with open("debug_training_texts.jsonl", "a") as f: - f.write(json.dumps({"text": text.text, "n_predicted": text.n_predicted}, ensure_ascii=False) + "\n") \ No newline at end of file From 135a3c5b0398098d3df3aa74a2b80b3498acc358 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 14 Oct 2025 08:57:52 +0000 Subject: [PATCH 100/126] address review comments --- pipelinerl/launch.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pipelinerl/launch.py b/pipelinerl/launch.py index e2109e34..c788bc7c 100644 --- a/pipelinerl/launch.py +++ b/pipelinerl/launch.py @@ -19,9 +19,6 @@ logger = logging.getLogger(__name__) -# TODO: rm debug code -import tapeagents - os.environ["NCCL_CUMEM_ENABLE"] = "0" os.environ["TORCH_DISABLE_SHARE_RDZV_TCP_STORE"] = "1" os.environ["HF_DATASETS_DISABLE_PROGRESS_BARS"] = "1" @@ -576,7 +573,6 @@ def main(cfg: DictConfig): processes = [] - logger.info(f"TapeAgents loaded from: {tapeagents.__file__}") lead_launcher_stream = 
SingleStreamSpec(exp_path=exp_dir, topic="launcher_0") init_msg = {"exp_init": "true"} if world_map.my_rank == 0: From d8f3e6852bb8b4e6061aef3b6562243a45da761f Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 14 Oct 2025 09:00:42 +0000 Subject: [PATCH 101/126] new rollouts in actor, both async and multiprocess --- conf/base.yaml | 2 + conf/mcp.yaml | 2 +- pipelinerl/actor.py | 91 +++++++++---- pipelinerl/domains/mcp/__init__.py | 2 +- pipelinerl/domains/mcp/rollouts.py | 209 ++++++++++++++++++++++++++--- 5 files changed, 262 insertions(+), 44 deletions(-) diff --git a/conf/base.yaml b/conf/base.yaml index 5bf30c59..274846f8 100644 --- a/conf/base.yaml +++ b/conf/base.yaml @@ -19,6 +19,8 @@ actor: result_queue_size: 64 throughput_window_size: 50 shared_memory_entry_size: 10000000 + async_batch_size: 4 + environment: null preprocess: input: actor diff --git a/conf/mcp.yaml b/conf/mcp.yaml index 6b942e4c..b916a054 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -19,7 +19,7 @@ rewards: buffer_tokens: 2000 actor: - rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout + rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout_async system_prompt: Please reason step by step, and put your final answer within \boxed{{}}. rollout_workers: 64 llm_max_rollouts: 256 diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index c731e771..d3384e98 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -1,5 +1,4 @@ import asyncio -import json import logging import math import multiprocessing as mp @@ -629,6 +628,7 @@ class ActorLoopRay(ActorLoop): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + assert self.cfg.attempts % self.cfg.actor.async_batch_size == 0, "attempts must be divisible by actor.async_batch_size" self.cfg_dict = OmegaConf.to_container(self.cfg, resolve=True) self.unfinished_tasks = [] self.llms_by_url = {llm.get_base_url(): llm for llm in self.llms} @@ -654,14 +654,45 @@ def start_backend(self): assert self.trainer_state.propagated_weight_version is not None rollout_policy: Callable[[DictConfig, TrainableLLM, dict], RolloutResult] = hydra.utils.get_method(self.cfg.actor.rollout_policy) - def rollout_wrapper(cfg: DictConfig, llm: TrainableLLM, problem: dict, problem_id: int, attempt_number: int) -> RolloutResult: + def rollout_wrapper(cfg_dict: dict, llm: TrainableLLM, problem: dict, problem_id: int, attempt_number: int) -> RolloutResult: + cfg = OmegaConf.create(cfg_dict) start_ts = time.monotonic() - problem["_pipeline_rl_id"] = f"problem_{problem_id}_attempt_{attempt_number}" rollout_result: RolloutResult = rollout_policy(cfg, llm, problem) ts = time.monotonic() - logger.info(f"Problem {problem_id} finished in {ts - start_ts:.2f} seconds") + logger.info(f"Problem {problem_id}_{attempt_number} finished in {ts - start_ts:.2f} seconds") return rollout_result, llm.get_base_url(), problem_id, ts, start_ts - self.ray_remote = ray.remote(rollout_wrapper) + + async def run_multiple_rollouts(cfg: DictConfig, llm: TrainableLLM, problems: list[dict], session: aiohttp.ClientSession) -> RolloutResult: + # Run all rollouts in parallel using asyncio.gather + async def run_rollout(problem): + logger.info(f"Running async rollout loop for problem {problem['_task_id']}") + start_ts = time.monotonic() + rollout_result = await rollout_policy(cfg, llm, problem, session) + stop_ts = time.monotonic() + latency = stop_ts - start_ts + return rollout_result, latency + + tasks = [run_rollout(problem) for problem in problems] + results_with_latencies = await 
asyncio.gather(*tasks) + rollout_results = [res for res, _ in results_with_latencies] + task_latencies = [latency for _, latency in results_with_latencies] + return rollout_results, task_latencies + + async def run_rollouts_with_session(cfg: DictConfig, llm: TrainableLLM, problems: list[dict]) -> RolloutResult: + connector = aiohttp.TCPConnector(limit=cfg.actor.async_batch_size, limit_per_host=cfg.actor.async_batch_size, keepalive_timeout=1.0) + timeout = aiohttp.ClientTimeout(total=3600.0, connect=3600.0, sock_read=3600.0) + async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: + rollout_results, task_latencies = await run_multiple_rollouts(cfg, llm, problems, session) + return rollout_results, task_latencies + + def rollout_async_batch_wrapper(cfg_dict: dict, llm: TrainableLLM, problems: list[dict], problem_id: int) -> RolloutResult: + cfg = OmegaConf.create(cfg_dict) + logger.info(f"Running async rollouts for {len(problems)} problems") + results, task_latencies = asyncio.run(run_rollouts_with_session(cfg, llm, problems)) + stop_ts = time.monotonic() + return results, llm.get_base_url(), problem_id, task_latencies, stop_ts + + self.ray_remote = ray.remote(rollout_async_batch_wrapper) self.start_time = time.time() def have_capacity(self) -> bool: @@ -673,12 +704,22 @@ def have_capacity(self) -> bool: return have_capacity def submit_problem(self, problem: dict): - for attempt_number in range(self.attempts): + # Make a list of cfg.attempts identical problems (deepcopies can be used if necessary) + problems = [] + for n in range(self.attempts): + p = problem.copy() + p["_task_id"] = f"problem_{self.problem_id}_attempt_{n}" + problems.append(p) + # Split problems into batches of up to cfg.async_batch_size + batches = [problems[i:i + self.cfg.actor.async_batch_size] for i in range(0, len(problems), self.cfg.actor.async_batch_size)] + for batch_idx, problem_batch in enumerate(batches): llm_url, task_count = min(self.llms_utilization.items(), key=lambda x: x[1]) - logger.info(f"Submitting problem {self.problem_id} attempt {attempt_number}/{self.attempts} to the least busy LLM {llm_url} with {task_count} tasks") + logger.info( + f"Submitting problem {self.problem_id} batch {batch_idx + 1}/{len(batches)} to the least busy LLM {llm_url} with {task_count} tasks" + ) llm = self.llms_by_url[llm_url] - task_ref = self.ray_remote.remote(self.cfg_dict, llm, problem, self.problem_id, attempt_number) - self.llms_utilization[llm_url] += 1 + task_ref = self.ray_remote.remote(self.cfg_dict, llm, problem_batch, self.problem_id) + self.llms_utilization[llm_url] += len(problem_batch) self.unfinished_tasks.append(task_ref) self.problem_id += 1 @@ -698,7 +739,12 @@ def receive_finished_tasks(self): dt = time.time() - self.start_time for finished_task in finished_tasks: try: - rollout_result, llm_url, problem_id, stop_ts, start_ts = ray.get(finished_task) + rollout_results, llm_url, problem_id, task_latencies, stop_ts = ray.get(finished_task) + except Exception as e: + logger.error(f"Error getting finished ray task: {e}") + continue + self.ray_result_latencies.append(time.monotonic() - stop_ts) + for rollout_result in rollout_results: rollout_result.model_version = self.trainer_state.propagated_weight_version full_group_id = f"{self.scheduler_name}_{problem_id}" rollout_result.group_id = full_group_id @@ -709,21 +755,14 @@ def receive_finished_tasks(self): sample.metadata["rollout_index"] = rollout_index sample.metadata["step_index"] = step_index sample.group_id = full_group_id - 
task_dt = stop_ts - start_ts - self.task_latencies.append(task_dt) - outer_ts = time.monotonic() - ray_result_latency = outer_ts - stop_ts - self.ray_result_latencies.append(ray_result_latency) - except Exception as e: - logger.error(f"Error getting finished ray task: {e}") - continue - if self.llms_utilization[llm_url] > 0: - self.llms_utilization[llm_url] -= 1 - else: - logger.warning(f"LLM {llm_url} utilization is 0, but got a result") - self.token_count += get_number_of_tokens_in_result(rollout_result) - self.finished_rollouts_count += 1 - self.unfinished_problems[problem_id].append(rollout_result) + self.task_latencies += task_latencies + if self.llms_utilization[llm_url] > 0: + self.llms_utilization[llm_url] -= 1 + else: + logger.warning(f"LLM {llm_url} utilization is 0, but got a result") + self.token_count += get_number_of_tokens_in_result(rollout_result) + self.finished_rollouts_count += 1 + self.unfinished_problems[problem_id].append(rollout_result) logger.info(f"Problem {problem_id} has {len(self.unfinished_problems[problem_id])} rollout results") if len(self.unfinished_problems[problem_id]) == self.cfg.attempts: logger.info(f"Problem {problem_id} group finished") @@ -744,7 +783,7 @@ def receive_finished_tasks(self): f"time elapsed: {dt:.2f} sec,\n" f"LLMs utilization: {self.llms_utilization}" ) - + def get_new_results(self) -> list[list[RolloutResult]]: self.receive_finished_tasks() if len(self.finished_problems) > 0: diff --git a/pipelinerl/domains/mcp/__init__.py b/pipelinerl/domains/mcp/__init__.py index c1aabe54..a3865d85 100644 --- a/pipelinerl/domains/mcp/__init__.py +++ b/pipelinerl/domains/mcp/__init__.py @@ -1,2 +1,2 @@ from .env_server import EmbeddedEnvironmentWorker, EmbeddedMCPEnvironment, MCPEnvironmentServer -from .rollouts import generate_mcp_rollout +from .rollouts import generate_mcp_rollout, generate_mcp_rollout_async, generate_mcp_rollouts_shared_env diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index b1056a80..c602a9f1 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -1,11 +1,9 @@ import asyncio import logging -import random import time from collections import Counter from pathlib import Path from typing import Dict, List -from urllib.parse import urlparse import aiohttp from hydra.utils import instantiate @@ -17,14 +15,12 @@ from tapeagents.llms.trainable import TrainableLLM from tapeagents.mcp import MCPEnvironment from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config -from tapeagents.remote_environment import AsyncRemoteEnvironment from pipelinerl.async_llm import make_training_text +from pipelinerl.domains.math import RewardTable, get_reward, verify_answer from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker from pipelinerl.domains.mcp.steps import MathAnswer -from pipelinerl.world import Job -from pipelinerl.domains.math import RewardTable, get_reward, verify_answer, verify_answer_rpc, length_penalty -from pipelinerl.rollouts import RolloutResult, BaseMetrics +from pipelinerl.rollouts import BaseMetrics, RolloutResult logger = logging.getLogger(__name__) @@ -34,15 +30,6 @@ class FailedRollout(Exception): pass -def _get_embedded_worker(env_cfg: DictConfig, concurrency: int) -> EmbeddedEnvironmentWorker: - global _embedded_worker - concurrency = max(1, concurrency) - if _embedded_worker is None or not _embedded_worker.matches(env_cfg): - _embedded_worker = EmbeddedEnvironmentWorker(env_cfg, 
concurrency=concurrency) - else: - _embedded_worker.set_concurrency(concurrency) - return _embedded_worker - def count_tool_calls_by_category(llm_calls: List[LLMCall]) -> Dict[str, int]: """ Count the number of tool calls for each function name category. @@ -179,7 +166,7 @@ def generate_mcp_rollout( env_time = tape.metadata.result.get("environment_execution_time", -1.0) total_time = tape.metadata.result.get("total_execution_time", -1.0) - tape_name = problem.get("_pipeline_rl_id", tape.metadata.id) + tape_name = problem.get("_task_id", tape.metadata.id) save_json_tape(tape, tapes_dir.as_posix(), tape_name) metrics = Metrics( @@ -211,3 +198,193 @@ def generate_mcp_rollout( environment.close() except Exception as e: logger.error(f"Error closing environment: {e}") + + +async def generate_mcp_rollout_async( + cfg: DictConfig, + llm: TrainableLLM, + problem: dict, + session: aiohttp.ClientSession, +) -> RolloutResult: + environment: MCPEnvironment = instantiate(cfg.environment) + await environment.ainitialize() + logger.info(f"Environment tools: {environment.tools_description()}") + agent: Agent = instantiate( + cfg.agent, known_actions=environment.actions(), tools_description=environment.tools_description() + ) + logger.info(f"Agent and environment loaded, using llm {llm.model_name} at {llm.get_base_url()}") + try: + result = await generate_mcp_rollout_with_agent_and_env(cfg, environment, agent, llm, problem, session) + finally: + try: + await environment.aclose() + except Exception as e: + logger.error(f"Error closing environment: {e}") + return result + + +async def generate_mcp_rollout_with_agent_and_env( + cfg: DictConfig, + environment: MCPEnvironment, + agent: Agent, + llm: TrainableLLM, + problem: dict, + session: aiohttp.ClientSession, +) -> RolloutResult: + tapes_dir = Path(cfg.output_dir) / "actor" / "tapes" + tapes_dir.mkdir(parents=True, exist_ok=True) + try: + start_time = time.perf_counter() + start_result = environment.start_task(problem) + logger.info("Task started") + tape_metadata = start_result if isinstance(start_result, dict) else {} + agent.llms = {DEFAULT: llm} + tape = Tape( + steps=[ + UserStep( + content=f"{problem['task']}. 
You have access to the following tools: {environment.tools_description()}" + ) + ] + ) + if tape_metadata: + tape.metadata.other.update(tape_metadata) + + logger.info("Running agent..") + tape = await async_execute_agent(agent, tape, environment, session, max_loops=cfg.agent_max_loops) + logger.info("Agent finished") + tape.metadata.result.update({"total_execution_time": time.perf_counter() - start_time}) + reward_table = RewardTable(**dict(cfg.rewards)) + + llm_calls: list[LLMCall] = [ + LLMCall(**step.metadata.other["llm_call"]) + if isinstance(step.metadata.other["llm_call"], dict) + else step.metadata.other["llm_call"] + for step in tape.steps if step.metadata.other.get("llm_call") is not None + ] + assert len(llm_calls) > 0, "No LLM calls found" + tool_call_counts = count_tool_calls_by_category(llm_calls) + logger.info(f'Use {type(llm)} LLM to generate training texts') + training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] + n_llm_calls = len(llm_calls) + answer_status = verify_answer( + prediction=llm_calls[-1].output.content, # type: ignore + gold=problem["answer"], + strict=True, + ) + # Tape should finish with an answer + tape_finished = True if isinstance(tape.steps[-1], MathAnswer) else False + base_reward = get_reward(answer_status, tape_finished, reward_table) + + # Local reward shaping (configurable in conf/mcp.yaml) + total_shaping = 0.0 + + # Length shaping: discourage very long completions; award concise correct ones + length_cfg = getattr(cfg, "length_shaping", None) + if length_cfg is not None: + try: + # Prefer ratio-based target if provided; otherwise use absolute + if hasattr(length_cfg, "target_ratio"): + ratio = float(getattr(length_cfg, "target_ratio")) + max_gen = int(llm.parameters.get("max_tokens", 2048)) + target_tokens = int(max(1, ratio * max_gen)) + # Optional clamps + min_t = int(getattr(length_cfg, "min_target_tokens", 0)) + max_t = int(getattr(length_cfg, "max_target_tokens", 10**9)) + target_tokens = max(min_t, min(max_t, target_tokens)) + else: + target_tokens = int(getattr(length_cfg, "target_output_tokens", 512)) + slope = float(getattr(length_cfg, "slope", 0.0)) + max_penalty = float(getattr(length_cfg, "max_penalty", 0.0)) + bonus_short_correct = float(getattr(length_cfg, "bonus_on_short_correct", 0.0)) + except Exception: + target_tokens, slope, max_penalty, bonus_short_correct = 512, 0.0, 0.0, 0.0 + + # average output tokens across llm calls for this rollout + try: + avg_output_tokens = sum(t.output_tokens for t in training_texts) / max(1, len(training_texts)) + except Exception: + avg_output_tokens = 0.0 + + if slope > 0.0 and max_penalty > 0.0 and avg_output_tokens > target_tokens: + over_by = float(avg_output_tokens - target_tokens) + penalty = min(max_penalty, slope * over_by) + total_shaping -= penalty + + if bonus_short_correct > 0.0 and answer_status == "correct" and avg_output_tokens <= target_tokens: + total_shaping += bonus_short_correct + + reward = base_reward + total_shaping + + # Assign identical reward to all steps in the rollout (pipeline expects uniform rollout_reward) + for text in training_texts: + text.reward = reward + text.finished = tape_finished + + latency = time.perf_counter() - start_time + + agent_time = tape.metadata.result.get("agent_execution_time", -1.0) + env_time = tape.metadata.result.get("environment_execution_time", -1.0) + total_time = tape.metadata.result.get("total_execution_time", -1.0) + + tape_name = problem.get("_task_id", tape.metadata.id) + save_json_tape(tape, 
tapes_dir.as_posix(), tape_name) + + metrics = Metrics( + reward=reward, + success=answer_status == "correct", + no_error=answer_status != "unparsable", + no_answer=answer_status == "no_answer", + num_steps=len(tape.steps), + num_python_calls=tool_call_counts.get("run_python_code", 0), + n_llm_calls=n_llm_calls, + total_execution_time=total_time, + agent_execution_time=agent_time, + environment_execution_time=env_time, + overflow=not tape_finished, + ) + + return RolloutResult( + training_texts=training_texts, + metrics=metrics, + latency=latency, + dataset_name=problem["dataset"] + ) + except Exception as e: + err_msg = f"Error generating rollout: {e}" + logger.error(err_msg) + raise FailedRollout(err_msg) + + +async def generate_mcp_rollouts_shared_env( + cfg: DictConfig, + llm: TrainableLLM, + problems: list[dict], + session: aiohttp.ClientSession, +) -> RolloutResult: + """Caution: this function should be used only with stateless environment, as it shares it between multiple agents""" + environment: MCPEnvironment = instantiate(cfg.environment) + await environment.ainitialize() + logger.info(f"Shared environment loaded for {len(problems)} problems") + try: + async def run_rollout(problem): + logger.info(f"Running async rollout loop for problem {problem['_task_id']}") + start_ts = time.monotonic() + agent: Agent = instantiate( + cfg.agent, known_actions=environment.actions(), tools_description=environment.tools_description() + ) + logger.info(f"Agent created with llm {llm.model_name} at {llm.get_base_url()}") + rollout_result = await generate_mcp_rollout_with_agent_and_env(cfg, environment, agent, llm, problem, session) + stop_ts = time.monotonic() + latency = stop_ts - start_ts + return rollout_result, latency + + tasks = [run_rollout(problem) for problem in problems] + results_with_latencies = await asyncio.gather(*tasks) + finally: + try: + await environment.aclose() + except Exception as e: + logger.error(f"Error closing environment: {e}") + rollout_results = [res for res, _ in results_with_latencies] + task_latencies = [latency for _, latency in results_with_latencies] + return rollout_results, task_latencies From b09e42ef97d466717d03466a8c77dd33fcaed498 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 14 Oct 2025 09:05:06 +0000 Subject: [PATCH 102/126] remove unused env server --- pipelinerl/domains/mcp/__init__.py | 1 - pipelinerl/domains/mcp/env_server.py | 1035 -------------------------- pipelinerl/domains/mcp/rollouts.py | 3 - 3 files changed, 1039 deletions(-) delete mode 100644 pipelinerl/domains/mcp/env_server.py diff --git a/pipelinerl/domains/mcp/__init__.py b/pipelinerl/domains/mcp/__init__.py index a3865d85..cb1d3fc4 100644 --- a/pipelinerl/domains/mcp/__init__.py +++ b/pipelinerl/domains/mcp/__init__.py @@ -1,2 +1 @@ -from .env_server import EmbeddedEnvironmentWorker, EmbeddedMCPEnvironment, MCPEnvironmentServer from .rollouts import generate_mcp_rollout, generate_mcp_rollout_async, generate_mcp_rollouts_shared_env diff --git a/pipelinerl/domains/mcp/env_server.py b/pipelinerl/domains/mcp/env_server.py deleted file mode 100644 index 2298e5cd..00000000 --- a/pipelinerl/domains/mcp/env_server.py +++ /dev/null @@ -1,1035 +0,0 @@ -import asyncio -import atexit -import inspect -import json -import logging -import os -import re -import threading -import time -import traceback -from concurrent.futures import ProcessPoolExecutor -from contextlib import asynccontextmanager -from functools import partial -from typing import Any, AsyncIterator, List - -import 
multiprocessing - -from fastapi import HTTPException -from hydra.utils import instantiate -from omegaconf import DictConfig, OmegaConf -from pydantic import BaseModel -from tapeagents.core import Action, Observation -from tapeagents.environment import Environment -from tapeagents.mcp import MCPClient, MCPEnvironment, NoTool -from tapeagents.remote_environment import EnvironmentServer -from tapeagents.tool_calling import FunctionSpec, ToolCallAction, ToolResult, ToolSpec -from mcp.types import CallToolResult, TextContent - -from pipelinerl.domains.math.verifier_api import verify_answer -from pipelinerl.domains.mcp.steps import MathAnswer - -logger = logging.getLogger(__name__) - - -_CONNECTION_ERROR_PATTERNS = ( - "closedresourceerror", - "brokenresourceerror", - "broken pipe", - "connectionreseterror", - "timed out while waiting for response", -) - - -_MCP_WORKER_STATE: dict[str, Any] | None = None - - -def _shutdown_mcp_worker() -> None: - global _MCP_WORKER_STATE - if not _MCP_WORKER_STATE: - return - loop: asyncio.AbstractEventLoop = _MCP_WORKER_STATE["loop"] - client: MCPClient = _MCP_WORKER_STATE["client"] - try: - loop.run_until_complete(client.close()) - except Exception: - logger.warning("Failed to close MCP client in worker", exc_info=True) - finally: - loop.close() - _MCP_WORKER_STATE = None - - -def _initialize_mcp_worker( - config_path: str, - tools_whitelist: list[str] | tuple[str, ...] | None, - use_cache: bool, - read_timeout_seconds: int, -) -> None: - """Initializer for the ProcessPool workers that own MCP runtimes.""" - global _MCP_WORKER_STATE - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - client = MCPClient( - config_path=config_path, - use_cache=use_cache, - read_timeout_seconds=read_timeout_seconds, - ) - loop.run_until_complete(client.start_servers()) - _MCP_WORKER_STATE = { - "loop": loop, - "client": client, - "tools_whitelist": list(tools_whitelist or []), - } - atexit.register(_shutdown_mcp_worker) - - -def _call_tool_in_worker(tool_name: str, tool_arguments: Any) -> dict[str, Any]: - """Execute an MCP tool call inside a worker process.""" - if not _MCP_WORKER_STATE: - raise RuntimeError("MCP worker not initialized") - loop: asyncio.AbstractEventLoop = _MCP_WORKER_STATE["loop"] - client: MCPClient = _MCP_WORKER_STATE["client"] - whitelist: list[str] = _MCP_WORKER_STATE.get("tools_whitelist", []) - if whitelist and tool_name not in whitelist: - raise NoTool(f"Tool {tool_name} not allowed by whitelist") - result = loop.run_until_complete(client.call_tool(tool_name, tool_arguments)) - return result.model_dump(exclude_none=True) - - -class _RemoteCallError(RuntimeError): - def __init__(self, message: str, details: dict[str, Any] | None = None) -> None: - super().__init__(message) - self.details = details or {} - - -def _invoke_environment_method( - environment: Environment, - method_name: str, - args: tuple[Any, ...], - kwargs: dict[str, Any], - loop: asyncio.AbstractEventLoop, -) -> Any: - attr = getattr(environment, method_name) - if inspect.iscoroutinefunction(attr): - return loop.run_until_complete(attr(*args, **kwargs)) - result = attr(*args, **kwargs) - if inspect.isawaitable(result): - return loop.run_until_complete(result) - return result - - -def _environment_process_main(env_cfg_container: dict[str, Any], conn) -> None: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - try: - env_cfg = OmegaConf.create(env_cfg_container) - environment: Environment = instantiate(env_cfg) - except Exception: - conn.send( - ( - "exception", 
- { - "type": "EnvironmentBootstrapError", - "message": "Failed to instantiate environment", - "traceback": traceback.format_exc(), - }, - ) - ) - conn.close() - loop.close() - return - - async_methods = { - name - for name in ("ainitialize", "areset", "aclose", "astep", "areact") - if hasattr(environment, name) and inspect.iscoroutinefunction(getattr(environment, name)) - } - sync_methods = { - name - for name in ( - "initialize", - "reset", - "close", - "start_task", - "actions", - "tools_description", - "mark_healthy", - "is_healthy", - "step", - "react", - ) - if callable(getattr(environment, name, None)) - } - - conn.send(("capabilities", {"sync": list(sync_methods), "async": list(async_methods)})) - - running = True - while running: - try: - message = conn.recv() - except EOFError: - break - if not isinstance(message, tuple) or len(message) != 3: - continue - command, args, kwargs = message - if command == "__shutdown__": - running = False - conn.send(("ok", None)) - break - try: - result = _invoke_environment_method(environment, command, args, kwargs, loop) - conn.send(("ok", result)) - except Exception as exc: - conn.send( - ( - "exception", - { - "type": exc.__class__.__name__, - "message": str(exc), - "traceback": traceback.format_exc(), - }, - ) - ) - - try: - if "aclose" in async_methods: - loop.run_until_complete(environment.aclose()) - elif "close" in sync_methods: - environment.close() - except Exception: - logger.debug("Failed to close environment during shutdown", exc_info=True) - finally: - conn.close() - loop.close() - - -class _ProcessEnvironmentProxy: - def __init__(self, env_cfg: DictConfig): - self._ctx = multiprocessing.get_context("spawn") - self._parent_conn, child_conn = self._ctx.Pipe() - cfg_container = OmegaConf.to_container(env_cfg, resolve=True) - self._process = self._ctx.Process( - target=_environment_process_main, - args=(cfg_container, child_conn), - ) - self._process.daemon = False - self._process.start() - self._lock = threading.Lock() - self._closed = False - try: - status, payload = self._parent_conn.recv() - except EOFError as error: - raise _RemoteCallError("Environment process terminated prematurely") from error - if status == "exception": - raise _RemoteCallError(payload.get("message", "Environment bootstrap failed"), payload) - if status != "capabilities": - raise _RemoteCallError("Unexpected handshake from environment process") - self._sync_methods = set(payload.get("sync", [])) - self._async_methods = set(payload.get("async", [])) - - def supports_async(self, name: str) -> bool: - return name in self._async_methods - - def supports_sync(self, name: str) -> bool: - return name in self._sync_methods - - def _ensure_alive(self) -> None: - if self._closed: - raise _RemoteCallError("Environment proxy is closed") - if not self._process.is_alive(): - raise _RemoteCallError("Environment process died unexpectedly") - - def _call_remote(self, method: str, *args: Any, **kwargs: Any) -> Any: - self._ensure_alive() - with self._lock: - try: - self._parent_conn.send((method, args, kwargs)) - status, payload = self._parent_conn.recv() - except EOFError as error: - raise _RemoteCallError("Lost connection to environment process") from error - if status == "ok": - return payload - if status == "exception": - raise _RemoteCallError(payload.get("message", "Remote call failed"), payload) - raise _RemoteCallError(f"Unexpected response type: {status}") - - def start_task(self, task: dict) -> dict: - return self._call_remote("start_task", task) - - def actions(self) 
-> tuple[type[Action], ...]: - return tuple(self._call_remote("actions")) - - def tools_description(self) -> str: - return self._call_remote("tools_description") - - def initialize(self): - if self.supports_sync("initialize"): - return self._call_remote("initialize") - if self.supports_async("ainitialize"): - return self._call_remote("ainitialize") - return None - - async def ainitialize(self) -> None: - loop = asyncio.get_running_loop() - await loop.run_in_executor(None, self.initialize) - - def reset(self) -> None: - if self.supports_sync("reset"): - self._call_remote("reset") - elif self.supports_async("areset"): - self._call_remote("areset") - - async def areset(self) -> None: - loop = asyncio.get_running_loop() - await loop.run_in_executor(None, self.reset) - - def step(self, action: Action) -> Observation: - if self.supports_sync("step"): - return self._call_remote("step", action) - if self.supports_async("astep"): - return self._call_remote("astep", action) - raise _RemoteCallError("Remote environment does not support step or astep") - - async def astep(self, action: Action) -> Observation: - loop = asyncio.get_running_loop() - return await loop.run_in_executor(None, self.step, action) - - def react(self, tape) -> Any: - if self.supports_sync("react"): - return self._call_remote("react", tape) - if self.supports_async("areact"): - return self._call_remote("areact", tape) - raise _RemoteCallError("Remote environment does not support react or areact") - - async def areact(self, tape) -> Any: - loop = asyncio.get_running_loop() - return await loop.run_in_executor(None, self.react, tape) - - def mark_healthy(self) -> None: - if self.supports_sync("mark_healthy"): - self._call_remote("mark_healthy") - - def is_healthy(self) -> bool: - if self.supports_sync("is_healthy"): - return bool(self._call_remote("is_healthy")) - return True - - def close(self) -> None: - if self._closed: - return - try: - if self.supports_sync("close"): - self._call_remote("close") - elif self.supports_async("aclose"): - self._call_remote("aclose") - except _RemoteCallError: - logger.debug("Remote close failed", exc_info=True) - finally: - self._shutdown() - - async def aclose(self) -> None: - loop = asyncio.get_running_loop() - await loop.run_in_executor(None, self.close) - - def _shutdown(self) -> None: - if self._closed: - return - try: - with self._lock: - if self._process.is_alive(): - self._parent_conn.send(("__shutdown__", (), {})) - try: - self._parent_conn.recv() - except EOFError: - pass - except Exception: - logger.debug("Failed to send shutdown to environment process", exc_info=True) - finally: - self._parent_conn.close() - self._process.join(timeout=5) - if self._process.is_alive(): - self._process.terminate() - self._closed = True - - def __del__(self) -> None: - try: - self._shutdown() - except Exception: - pass -class EnvironmentServerWithVerifier(EnvironmentServer): - """Environment server that includes the verify_answer endpoint.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.process_pool = ProcessPoolExecutor(max_workers=4) - - def create_app(self): - app = super().create_app() - - class VerifyAnswerRequest(BaseModel): - prediction: str - gold: str - strict: bool = True - max_prediction_length: int = 1000 - - @app.post("/verify_answer") - async def verify_answer_endpoint(request: VerifyAnswerRequest): - try: - # Run verification in the process pool to avoid blocking the main thread - loop = asyncio.get_event_loop() - answer_status = await 
loop.run_in_executor( - self.process_pool, - partial( - verify_answer, - request.prediction, - request.gold, - request.strict, - request.max_prediction_length - ) - ) - return {"answer_status": answer_status} - except Exception as e: - logger.exception(f"Error in verify_answer: {e}") - raise HTTPException(status_code=500, detail=f"Error verifying answer: {str(e)}") - - return app - - def shutdown(self): - super().shutdown() - if hasattr(self, 'process_pool'): - self.process_pool.shutdown(wait=True) - - -class MCPEnvironmentServer: - - def __init__(self, - n_envs: int, - host: str, - mcp_target: str, - mcp_config_path: str, - mcp_tools_whitelist: List[str], - exp_path: str, - env_call_timeout: int = 60, - mcp_read_timeout_seconds: int = 10, - ): - # Remote environment server configuration - self.n_envs = n_envs - self.host = host - self.env_call_timeout = env_call_timeout - # Individual web environment configuration - self.mcp_target = mcp_target - self.mcp_config_path = mcp_config_path - self.mcp_tools_whitelist = mcp_tools_whitelist - self.exp_path = exp_path - self.mcp_read_timeout_seconds = mcp_read_timeout_seconds - - - def launch(self, port: int): - """ - Serve the environment in TapeAgent with verify_answer endpoint. - """ - env_server = EnvironmentServerWithVerifier( - n_envs=self.n_envs, - host=self.host, - port=port, - env_call_timeout=self.env_call_timeout - ) - env_server.launch(OmegaConf.create({ - "_target_": self.mcp_target, - "config_path": self.mcp_config_path, - "tools_whitelist": self.mcp_tools_whitelist, - "read_timeout_seconds": self.mcp_read_timeout_seconds, - })) - - -class EmbeddedMCPEnvironment(MCPEnvironment): - def __init__( - self, - *args, - math_answer_description: str = "Submit the final answer in LaTeX \\boxed{} format.", - **kwargs, - ) -> None: - config_path = kwargs.get("config_path", "") - use_cache = kwargs.get("use_cache", False) - read_timeout_seconds = kwargs.get("read_timeout_seconds", 10) - runtime_pool_workers = kwargs.pop("runtime_pool_workers", 0) - offload_tools = tuple(kwargs.pop("offload_tools", ())) - - super().__init__(*args, **kwargs) - self._broken = False - self._last_failure_reason: str | None = None - self._runtime_guard_installed: bool = False - self._runtime_pool: ProcessPoolExecutor | None = None - self._runtime_pool_lock = threading.Lock() - self._runtime_pool_workers = runtime_pool_workers - self._offload_tools = set(offload_tools) - self._config_path = getattr(self.client, "config_path", config_path) - self._use_cache = getattr(self.client, "use_cache", use_cache) - self._read_timeout_seconds = getattr(self.client, "read_timeout_seconds", read_timeout_seconds) - - # try to catch time wasting patterns before execution - self._python_blocklist = ( - (re.compile(r"\bsys\s*\.\s*exit\s*\(", re.IGNORECASE), "sys.exit"), - (re.compile(r"\bos\s*\.\s*_exit\s*\(", re.IGNORECASE), "os._exit"), - (re.compile(r"\bexit\s*\(", re.IGNORECASE), "exit"), - (re.compile(r"\bquit\s*\(", re.IGNORECASE), "quit"), - (re.compile(r"raise\s+systemexit", re.IGNORECASE), "raise SystemExit"), - (re.compile(r"from\s+sys\s+import\s+exit", re.IGNORECASE), "from sys import exit"), - ( - re.compile(r"__import__\s*\(\s*['\"]os['\"]\s*\)\s*\.\s*_exit", re.IGNORECASE), - "__import__('os')._exit", - ), - ( - re.compile(r"__import__\s*\(\s*['\"]sys['\"]\s*\)\s*\.\s*exit", re.IGNORECASE), - "__import__('sys').exit", - ), - ) - self._math_answer_spec = ToolSpec( - function=FunctionSpec( - name="MathAnswer", - description=math_answer_description, - parameters={ - "type": 
"object", - "properties": { - "answer": { - "type": "string", - "description": "Final answer expressed in LaTeX \\boxed{} format.", - } - }, - "required": ["answer"], - }, - ) - ) - - def initialize(self): - super().initialize() - self._reset_health() - self._ensure_math_answer_tool() - - async def ainitialize(self) -> None: - self.loop = asyncio.get_running_loop() - await super().ainitialize() - self._reset_health() - self._ensure_math_answer_tool() - await self._install_runtime_guard() - - def actions(self): - base_actions = super().actions() - if not any( - getattr(action, "function", None) and action.function.name == "MathAnswer" - for action in base_actions - ): - base_actions = base_actions + (self._math_answer_spec,) - return base_actions - - def _should_offload(self, tool_name: str) -> bool: - return bool(self._runtime_pool_workers) and tool_name in self._offload_tools - - def _ensure_runtime_pool(self) -> ProcessPoolExecutor: - if self._runtime_pool is not None: - return self._runtime_pool - with self._runtime_pool_lock: - if self._runtime_pool is not None: - return self._runtime_pool - cpu_count = os.cpu_count() or 1 - default_workers = max(1, cpu_count // 2) - max_workers = self._runtime_pool_workers or default_workers - whitelist = tuple(self.tools_whitelist) if getattr(self, "tools_whitelist", None) else tuple() - self._runtime_pool = ProcessPoolExecutor( - max_workers=max_workers, - initializer=_initialize_mcp_worker, - initargs=( - self._config_path, - whitelist, - bool(self._use_cache), - int(self._read_timeout_seconds), - ), - ) - return self._runtime_pool - - @staticmethod - def _make_error_call_result(tool_name: str, message: str) -> CallToolResult: - return CallToolResult( - content=[TextContent(type="text", text=message)], - isError=True, - ) - - def _resolve_pool_future_sync(self, future, tool_name: str) -> CallToolResult: - try: - payload = future.result() - return CallToolResult.model_validate(payload) - except NoTool: - logger.exception(f"Tool {tool_name} not found in MCP client") - return self._make_error_call_result(tool_name, f"Tool {tool_name} not found") - except KeyError as error: - logger.exception(f"KeyError when executing MCP tool call: {error}") - return self._make_error_call_result( - tool_name, f"Error executing tool {tool_name}: KeyError {error}" - ) - except Exception as error: - logger.exception(f"Error executing MCP tool call: {error}") - return self._make_error_call_result( - tool_name, f"Error executing tool {tool_name}: {error}" - ) - - async def _resolve_pool_future_async(self, future, tool_name: str) -> CallToolResult: - try: - payload = await asyncio.wrap_future(future) - return CallToolResult.model_validate(payload) - except NoTool: - logger.exception(f"Tool {tool_name} not found in MCP client") - return self._make_error_call_result(tool_name, f"Tool {tool_name} not found") - except KeyError as error: - logger.exception(f"KeyError when executing MCP tool call: {error}") - return self._make_error_call_result( - tool_name, f"Error executing tool {tool_name}: KeyError {error}" - ) - except Exception as error: - logger.exception(f"Error executing MCP tool call: {error}") - return self._make_error_call_result( - tool_name, f"Error executing tool {tool_name}: {error}" - ) - - def _shutdown_runtime_pool(self) -> None: - if self._runtime_pool is not None: - self._runtime_pool.shutdown(wait=True) - self._runtime_pool = None - - def _execute_tool_via_pool_sync(self, action: ToolCallAction) -> ToolResult: - start = time.perf_counter() - future = 
self._ensure_runtime_pool().submit( - _call_tool_in_worker, - action.function.name, - action.function.arguments, - ) - call_result = self._resolve_pool_future_sync(future, action.function.name) - observation = ToolResult(tool_call_id=getattr(action, "id", ""), content=call_result) - observation.metadata.other["action_execution_time"] = time.perf_counter() - start - observation.metadata.other["action_kind"] = action.kind - return observation - - async def _execute_tool_via_pool_async(self, action: ToolCallAction) -> ToolResult: - start = time.perf_counter() - future = self._ensure_runtime_pool().submit( - _call_tool_in_worker, - action.function.name, - action.function.arguments, - ) - call_result = await self._resolve_pool_future_async(future, action.function.name) - observation = ToolResult(tool_call_id=getattr(action, "id", ""), content=call_result) - observation.metadata.other["action_execution_time"] = time.perf_counter() - start - observation.metadata.other["action_kind"] = action.kind - return observation - - def step(self, action: Action) -> Observation: - if not isinstance(action, ToolCallAction): - return super().step(action) - - outcome, message = self._precheck_tool_action(action) - if outcome == "math_answer": - return self._create_math_answer(action) - if outcome == "error": - return self._make_error_tool_result(action, message or "") - - try: - observation = self._execute_tool_call_sync(action) - except BaseException: - self._broken = True - raise - - return self._postprocess_after_tool(action, observation) - - async def astep(self, action: Action) -> Observation: - if not isinstance(action, ToolCallAction): - return await super().astep(action) - - outcome, message = self._precheck_tool_action(action) - if outcome == "math_answer": - return self._create_math_answer(action) - if outcome == "error": - return self._make_error_tool_result(action, message or "") - - try: - observation = await self._execute_tool_call_async(action) - except BaseException: - self._broken = True - raise - - return self._postprocess_after_tool(action, observation) - - def _precheck_tool_action(self, action: ToolCallAction) -> tuple[str, str | None]: - if action.function.name == "MathAnswer": - return "math_answer", None - if self._broken: - return "error", self._backend_unavailable_message() - if action.function.name == "run_python_code": - block_message = self._check_python_safety(action.function.arguments) - if block_message is not None: - return "error", block_message - return "ok", None - - def _execute_tool_call_sync(self, action: ToolCallAction) -> Observation: - if self._should_offload(action.function.name): - return self._execute_tool_via_pool_sync(action) - return super().step(action) - - async def _execute_tool_call_async(self, action: ToolCallAction) -> Observation: - if self._should_offload(action.function.name): - return await self._execute_tool_via_pool_async(action) - return await super().astep(action) - - def _postprocess_after_tool( - self, - action: ToolCallAction, - observation: Observation, - ) -> Observation: - if action.function.name != "MathAnswer": - return self._postprocess_tool_observation(action, observation) - return observation - - def _ensure_math_answer_tool(self) -> None: - if not any( - getattr(tool, "function", None) and tool.function.name == "MathAnswer" - for tool in self.tools - ): - self.tools.append(self._math_answer_spec) - - def _reset_health(self) -> None: - self._broken = False - self._last_failure_reason = None - self._runtime_guard_installed = False - - def 
_create_math_answer(self, action: ToolCallAction) -> MathAnswer: - answer_value = self._extract_answer(action.function.arguments) - math_answer = MathAnswer(answer=answer_value) - math_answer.metadata.other.update({ - "action_kind": "MathAnswer", - "tool_call_id": getattr(action, "id", ""), - "action_execution_time": 0.0, - }) - return math_answer - - def mark_healthy(self) -> None: - self._reset_health() - - def is_healthy(self) -> bool: - return not self._broken - - def close(self) -> None: - self._shutdown_runtime_pool() - super().close() - - async def aclose(self) -> None: - self._shutdown_runtime_pool() - await super().aclose() - - @staticmethod - def _guard_snippet() -> str: - """generate Python code that installs safety guards""" - return ( - "import builtins, sys, os, time, atexit\n" - "try:\n" - " _PIPELINERL_TIME_LIMIT = float(os.environ.get('PIPELINERL_PY_TIMEOUT', '30'))\n" - "except ValueError:\n" - " _PIPELINERL_TIME_LIMIT = 30.0\n" - "_PIPELINERL_START = time.perf_counter()\n" - "class _ExitBlocked(RuntimeError):\n" - " pass\n" - "def _blocked_exit(*_args, **_kwargs):\n" - " raise _ExitBlocked('exit() and os._exit() are disabled in this environment.')\n" - "for _target in (builtins, sys):\n" - " for _name in ('exit', 'quit'):\n" - " if hasattr(_target, _name):\n" - " setattr(_target, _name, _blocked_exit)\n" - "if hasattr(os, '_exit'):\n" - " os._exit = _blocked_exit\n" - "def _pipelinerl_trace(frame, event, arg):\n" - " if event == 'line' and (time.perf_counter() - _PIPELINERL_START) > _PIPELINERL_TIME_LIMIT:\n" - " sys.settrace(None)\n" - " raise RuntimeError(f'Python execution timed out after {_PIPELINERL_TIME_LIMIT} seconds.')\n" - " return _pipelinerl_trace\n" - "sys.settrace(_pipelinerl_trace)\n" - "atexit.register(lambda: sys.settrace(None))\n" - ) - - async def _install_runtime_guard(self) -> None: - """Install runtime safety guard in the Python environment.""" - if self._runtime_guard_installed or not getattr(self, "client", None): - return - try: - snippet = self._guard_snippet() - if self._should_offload("run_python_code"): - future = self._ensure_runtime_pool().submit( - _call_tool_in_worker, - "run_python_code", - {"python_code": snippet}, - ) - await self._resolve_pool_future_async(future, "run_python_code") - else: - await self.client.call_tool( - "run_python_code", - {"python_code": snippet}, - ) - self._runtime_guard_installed = True - logger.debug("Runtime guard installed successfully") - except Exception: - logger.warning("Failed to install runtime guard in MCP environment", exc_info=True) - - def _postprocess_tool_observation( - self, - action: ToolCallAction, - observation: Observation, - ) -> Observation: - if not isinstance(observation, ToolResult): - return observation - call_result = observation.content - if not isinstance(call_result, CallToolResult): - return observation - if not getattr(call_result, "isError", False): - return observation - error_text = self._extract_call_result_text(call_result) - if not self._is_connection_error_message(error_text): - return observation - logger.warning( - "MCP backend failure detected for tool %s: %s", - action.function.name, - error_text, - ) - return self._handle_connection_failure(action, observation, error_text) - - @staticmethod - def _extract_call_result_text(call_result: CallToolResult) -> str: - if not isinstance(call_result.content, list): - return "" - parts: list[str] = [] - for block in call_result.content: - if isinstance(block, TextContent) and isinstance(block.text, str): - 
parts.append(block.text) - return "\n".join(parts).strip() - - @staticmethod - def _is_connection_error_message(message: str) -> bool: - lowered = message.lower() - return any(pattern in lowered for pattern in _CONNECTION_ERROR_PATTERNS) - - def _handle_connection_failure( - self, - action: ToolCallAction, - observation: ToolResult, - error_text: str, - ) -> ToolResult: - """Mark environment as broken and update observation.""" - self._broken = True - failure_message = ( - "Python tool backend became unavailable (connection lost). " - "Environment will restart after this attempt; stop issuing additional tool calls." - ) - if error_text: - failure_message = f"{failure_message}\nOriginal error: {error_text}" - - observation.content = CallToolResult( - content=[TextContent(type="text", text=failure_message)], - isError=True, - ) - observation.metadata.other.setdefault("action_execution_time", observation.metadata.other.get("action_execution_time", 0.0)) - observation.metadata.other["connection_failure"] = True - observation.metadata.other["original_error"] = error_text - self._last_failure_reason = failure_message - return observation - - def _backend_unavailable_message(self) -> str: - """Get message for unavailable backend.""" - return self._last_failure_reason or ( - "Python tool backend is restarting after a connection failure. " - "Abort this attempt and wait for a fresh environment." - ) - - @staticmethod - def _extract_answer(arguments: dict | str | None) -> str: - """Extract answer string from arguments.""" - if arguments is None: - return "" - if isinstance(arguments, str): - try: - parsed = json.loads(arguments) - return str(parsed.get("answer", "")) if isinstance(parsed, dict) else str(parsed) - except json.JSONDecodeError: - return arguments - if isinstance(arguments, dict): - return str(arguments.get("answer", "")) - return str(arguments) - - def _check_python_safety(self, arguments: dict | str | None) -> str | None: - """check for Python code problems""" - code = self._extract_python_code(arguments) - if not code: - return None - for pattern, label in self._python_blocklist: - if pattern.search(code): - return ( - f"Python execution rejected: forbidden call detected ({label}). " - "Use pure computation without exiting the runtime." 
- ) - return None - - @staticmethod - def _extract_python_code(arguments: dict | str | None) -> str: - if arguments is None: - return "" - if isinstance(arguments, str): - try: - parsed = json.loads(arguments) - if isinstance(parsed, dict): - return str(parsed.get("python_code", parsed.get("code", ""))) - return str(parsed) - except json.JSONDecodeError: - return arguments - if isinstance(arguments, dict): - return str(arguments.get("python_code", arguments.get("code", ""))) - return str(arguments) - - def _make_error_tool_result(self, action: ToolCallAction, message: str) -> ToolResult: - result = CallToolResult( - content=[TextContent(type="text", text=message)], - isError=True, - ) - tool_result = ToolResult( - tool_call_id=getattr(action, "id", ""), - content=result, - ) - tool_result.metadata.other["action_execution_time"] = 0.0 - tool_result.metadata.other["action_kind"] = action.kind - return tool_result - - -class EmbeddedEnvironmentWorker: - def __init__(self, env_cfg: DictConfig, concurrency: int = 1): - # make repeated instantiations stable even if the caller changes its copy - self._env_cfg = OmegaConf.create(env_cfg) - self._cfg_signature = self._make_cfg_signature(self._env_cfg) - self._concurrency = max(1, concurrency) - self._init_lock = asyncio.Lock() - self._available: asyncio.Queue[_ProcessEnvironmentProxy] | None = None - self._all_envs: set[_ProcessEnvironmentProxy] = set() - - @staticmethod - def _make_cfg_signature(cfg: DictConfig) -> str: - try: - container = OmegaConf.to_container(cfg, resolve=True) - except Exception: - container = OmegaConf.to_container(cfg, resolve=False) - return json.dumps(container, sort_keys=True, default=str) - - @property - def concurrency(self) -> int: - return self._concurrency - - def matches(self, env_cfg: DictConfig) -> bool: - return self._cfg_signature == self._make_cfg_signature(env_cfg) - - def set_concurrency(self, concurrency: int) -> None: - self._concurrency = max(1, concurrency) - - async def _ensure_pool(self) -> None: - if self._available is None: - self._available = asyncio.Queue() - if len(self._all_envs) >= self._concurrency: - return - async with self._init_lock: - if len(self._all_envs) >= self._concurrency: - return - missing = self._concurrency - len(self._all_envs) - for _ in range(missing): - environment = _ProcessEnvironmentProxy(self._env_cfg) - try: - await self._init_and_reset(environment) - except Exception: - logger.exception("Failed to initialize embedded environment instance") - await self._close(environment) - raise - self._all_envs.add(environment) - await self._available.put(environment) - - @asynccontextmanager - async def alifecycle(self) -> AsyncIterator[Environment]: - """Context manager for environment lifecycle with automatic health checking.""" - await self._ensure_pool() - assert self._available is not None - - environment = await self._available.get() - try: - await self._reset(environment) - yield environment - finally: - try: - unhealthy = ( - hasattr(environment, "is_healthy") - and not environment.is_healthy() # type: ignore - ) - except Exception: - logger.warning("Failed to query embedded environment health; replacing", exc_info=True) - unhealthy = True - is_healthy = not unhealthy - - if is_healthy: - # try to reset and recycle healthy environment - try: - await self._reset(environment) - if hasattr(environment, "mark_healthy"): - environment.mark_healthy() # type: ignore - await self._available.put(environment) - except Exception: - logger.exception("Failed to recycle embedded 
environment; replacing") - await self._replace(environment) - else: - # environment is unhealthy, replace it - logger.warning("Embedded environment is unhealthy, replacing") - await self._replace(environment) - - async def _replace(self, environment: Environment) -> None: - """Replace a broken environment with a new one.""" - if environment in self._all_envs: - self._all_envs.remove(environment) - try: - await self._close(environment) - except Exception: - logger.exception("Failed to close environment during replacement") - # Refill the pool - await self._ensure_pool() - - async def _init_and_reset(self, env: Environment) -> None: - # init - if hasattr(env, "ainitialize") and inspect.iscoroutinefunction(env.ainitialize): - await env.ainitialize() # type: ignore - else: - loop = asyncio.get_running_loop() - await loop.run_in_executor(None, env.initialize) - - # reset - await self._reset(env) - - async def _reset(self, env: Environment) -> None: - if hasattr(env, "areset") and inspect.iscoroutinefunction(env.areset): - await env.areset() # type: ignore - else: - reset_fn = getattr(env, "reset", None) - if callable(reset_fn): - loop = asyncio.get_running_loop() - await loop.run_in_executor(None, reset_fn) - - async def _close(self, env: Environment) -> None: - loop = asyncio.get_running_loop() - - # try async close first - if hasattr(env, "aclose") and inspect.iscoroutinefunction(env.aclose): - try: - await env.aclose() # type: ignore - return - except Exception as e: - logger.debug(f"Async close failed: {e}, trying sync close") - - # fallback to sync close - try: - await loop.run_in_executor(None, env.close) - except Exception as e: - logger.debug(f"Sync close failed: {e}") diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index c602a9f1..d258b7bf 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -18,15 +18,12 @@ from pipelinerl.async_llm import make_training_text from pipelinerl.domains.math import RewardTable, get_reward, verify_answer -from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker from pipelinerl.domains.mcp.steps import MathAnswer from pipelinerl.rollouts import BaseMetrics, RolloutResult logger = logging.getLogger(__name__) -_embedded_worker: EmbeddedEnvironmentWorker | None = None - class FailedRollout(Exception): pass From 35e52723f03425ac1736397a4e29d52e5caa183e Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 14 Oct 2025 09:14:00 +0000 Subject: [PATCH 103/126] log number of failed rollouts to wandb for monitoring --- pipelinerl/actor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index d3384e98..8b17f49a 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -307,6 +307,7 @@ def __init__( self.smm: SharedMemoryManager | None = None self.problem_queue: SharedMemoryQueue | None = None self.result_queue: SharedMemoryQueue | None = None + self.rollout_errors = 0 logger.info(f"Initialized {'train' if self.is_training else 'test'} actor loop") def start_backend(self): @@ -529,6 +530,7 @@ def run(self, dataset: list[tuple[str, dict]]): "trainer_model_version": trainer_version_to_publish, "time_since_start": time.time() - loop_start_time, "groups_in_progress": in_progress, + "rollout_errors": self.rollout_errors, } trainer_version_to_publish = None else: @@ -742,6 +744,7 @@ def receive_finished_tasks(self): rollout_results, llm_url, problem_id, task_latencies, stop_ts = ray.get(finished_task) except Exception as e: 
logger.error(f"Error getting finished ray task: {e}") + self.rollout_errors += 1 continue self.ray_result_latencies.append(time.monotonic() - stop_ts) for rollout_result in rollout_results: From 3a935476d87f5bb05ff66ccff6b0fc93d1aa1d04 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 14 Oct 2025 13:41:27 +0000 Subject: [PATCH 104/126] better error msg --- pipelinerl/actor.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 8b17f49a..cefaceff 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -630,7 +630,7 @@ class ActorLoopRay(ActorLoop): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - assert self.cfg.attempts % self.cfg.actor.async_batch_size == 0, "attempts must be divisible by actor.async_batch_size" + assert self.cfg.attempts % self.cfg.actor.async_batch_size == 0, f"attempts {self.cfg.attempts} must be divisible by actor.async_batch_size {self.cfg.actor.async_batch_size}" self.cfg_dict = OmegaConf.to_container(self.cfg, resolve=True) self.unfinished_tasks = [] self.llms_by_url = {llm.get_base_url(): llm for llm in self.llms} @@ -916,9 +916,7 @@ def run_actor_loop(cfg: DictConfig): ): logger.info("Create test loop") test_loop.start_backend() - test_loop_run = test_loop.run( - dataset=test_dataset, - ) + test_loop_run = test_loop.run(dataset=test_dataset) train_loop.is_scheduling_paused = True current_eval = next_regular_eval From 1c56779a56f643b88da2a65936a3c46d5f5ed992 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 14 Oct 2025 13:41:57 +0000 Subject: [PATCH 105/126] use embedded envs by default --- conf/base.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.yaml b/conf/base.yaml index 274846f8..7008ab75 100644 --- a/conf/base.yaml +++ b/conf/base.yaml @@ -85,7 +85,7 @@ world: actor_group_port: 9000 environment_start_port: 7777 # Remote vs embedded environment execution strategy - environment_mode: remote + environment_mode: embedded # this will be autocreated based on the config jobs: [] From 555deb18df8eb7ae410f6c683f92d402d4dce227 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 14 Oct 2025 15:11:32 +0000 Subject: [PATCH 106/126] support sync mode with only one rollout per worker at a time --- pipelinerl/actor.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index cefaceff..1e20a54a 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -656,13 +656,16 @@ def start_backend(self): assert self.trainer_state.propagated_weight_version is not None rollout_policy: Callable[[DictConfig, TrainableLLM, dict], RolloutResult] = hydra.utils.get_method(self.cfg.actor.rollout_policy) - def rollout_wrapper(cfg_dict: dict, llm: TrainableLLM, problem: dict, problem_id: int, attempt_number: int) -> RolloutResult: + def rollout_wrapper(cfg_dict: dict, llm: TrainableLLM, problems: list[dict], problem_id: int) -> RolloutResult: + assert len(problems) == 1, "Sync mode should only be used with 1 problem at a time" + problem = problems[0] cfg = OmegaConf.create(cfg_dict) start_ts = time.monotonic() + logger.info(f"Running sync rollout for problem {problem['_task_id']}") rollout_result: RolloutResult = rollout_policy(cfg, llm, problem) - ts = time.monotonic() - logger.info(f"Problem {problem_id}_{attempt_number} finished in {ts - start_ts:.2f} seconds") - return rollout_result, llm.get_base_url(), problem_id, ts, start_ts + stop_ts = time.monotonic() + 
logger.info(f"Problem {problem['_task_id']} finished in {stop_ts - start_ts:.2f} seconds") + return [rollout_result], llm.get_base_url(), problem_id, [stop_ts - start_ts], stop_ts async def run_multiple_rollouts(cfg: DictConfig, llm: TrainableLLM, problems: list[dict], session: aiohttp.ClientSession) -> RolloutResult: # Run all rollouts in parallel using asyncio.gather @@ -694,7 +697,12 @@ def rollout_async_batch_wrapper(cfg_dict: dict, llm: TrainableLLM, problems: lis stop_ts = time.monotonic() return results, llm.get_base_url(), problem_id, task_latencies, stop_ts - self.ray_remote = ray.remote(rollout_async_batch_wrapper) + if self.cfg.actor.async_batch_size > 1: + logger.info("Using async mode") + self.ray_remote = ray.remote(rollout_async_batch_wrapper) + else: + logger.info("Using sync mode") + self.ray_remote = ray.remote(rollout_wrapper) self.start_time = time.time() def have_capacity(self) -> bool: From bae92d52948d17c0fa9dbb9b6fe4f91f8ac187d1 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 14 Oct 2025 15:12:16 +0000 Subject: [PATCH 107/126] sync rollout function for miniwob with embedded env --- pipelinerl/domains/miniwob/rollouts.py | 135 +++++++++++++++++++++++-- 1 file changed, 129 insertions(+), 6 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 5b590665..c42cbf97 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -5,6 +5,7 @@ import time import aiohttp +from examples.rl_webagent.environment import WebEnvironment from examples.rl_webagent.steps import WebTape from hydra.utils import instantiate from omegaconf import DictConfig @@ -12,7 +13,7 @@ from tapeagents.core import LLMCall, LLMOutputParsingFailureAction, Observation from tapeagents.io import save_json_tape from tapeagents.llms.trainable import TrainableLLM -from tapeagents.orchestrator import async_execute_agent +from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config from tapeagents.remote_environment import AsyncRemoteEnvironment from tapeagents.tools.simple_browser import PageObservation @@ -55,7 +56,129 @@ def tape_contains_an_error(tape: WebTape) -> bool: ) -async def generate_miniwob_rollout( +def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) -> RolloutResult: + # make agent and env + # set the llm + # run the agent + # get llm calls from tape + # compute rewards + # get training text from llm calls + + start_time = time.time() + + agent, env = get_agent_and_env_from_config(cfg) + environment: WebEnvironment = env + try: + agent.llms = {DEFAULT: llm} + logger.info(f"Agent and environment loaded, using llm {llm.model_name} at {llm.get_base_url()}") + start_attempts = cfg.start_attempts + t = time.perf_counter() + while True: + try: + tape, _ = environment.start_task(problem) + break + except Exception as e: + logger.exception(f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']}: {e}") + start_attempts -= 1 + if start_attempts <= 0: + raise Exception(f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']} after {cfg.start_attempts} attempts") + else: + logger.warning("retry after 1 seconds") + time.sleep(1) + logger.info( + f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds" + ) + logger.info(f"Running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']}") + tape = execute_agent(agent, tape, 
env, max_loops=cfg.agent_max_loops) + logger.info( + f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} sec, produced tape with {len(tape.steps)} steps" + ) + tape.metadata.result.update({"total_execution_time": time.perf_counter() - t}) + + # save the tape as we go + if cfg.save_tapes: + save_json_tape(tape, os.path.join(cfg.output_dir, "tapes"), tape.metadata.id) + + # (3) Compute rewards + obs_steps = [step for step in tape if isinstance(step, Observation)] + if obs_steps: + last_obs = obs_steps[-1] + # in Miniwob, the observation "reward" is defined as RAW_REWARD_GLOBAL > 0 + # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L188 + # Let's take directly the RAW_REWARD_GLOBAL from the metadata + # raw_reward = last_obs.metadata.other.get("reward", 0.0) + raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) + else: + raw_reward = -1.0 + + # get the number of LLMOutputParsingFailureAction in the tape + n_step_errors = len([step for step in tape.steps if isinstance(step, LLMOutputParsingFailureAction)]) + # get the number of PageObservation steps in the tape + n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) + + # reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 + # massimo's setup: + reward = float(raw_reward > 0) + if reward == 0.0: + reward = -1.0 + reward *= 0.98**n_page_observations + + # (3) Get LLM calls from Tape + llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] + n_llm_calls = len(llm_calls) + llm_calls: list[LLMCall] = [ + LLMCall(**step.metadata.other["llm_call"]) + if isinstance(step.metadata.other["llm_call"], dict) + else step.metadata.other["llm_call"] + for step in llm_calls + ] + + # (4) # For each LLM interaction in the tape, make a training example. + all_finished = 1 + prompt_tokens = [llm_call.prompt_length_tokens for llm_call in llm_calls] + output_tokens = [llm_call.output_length_tokens for llm_call in llm_calls] + training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] + for text in training_texts: + text.reward = reward + all_finished &= 1 if text.input_ids[-1] == llm.tokenizer.eos_token_id else 0 + + latency = time.time() - start_time + agent_time = tape.metadata.result.get("agent_execution_time", -1.0) + env_time = tape.metadata.result.get("environment_execution_time", -1.0) + n_observations = len( + [s for s in tape.steps if isinstance(s, Observation)] + ) # TODO: is this not the same n_page_observations?? 
+ n_other_steps = len(tape.steps) - n_observations + metrics = MiniwobMetrics( + reward=reward, + success=reward > 0.5, + no_error=not tape_contains_an_error(tape), + no_answer=reward < 0, + overflow=not all_finished, + n_llm_calls=n_llm_calls, + n_step_errors=n_step_errors, + n_page_observations=n_page_observations, + n_steps=len(tape.steps), + total_execution_time=tape.metadata.result.get("total_execution_time", -1.0), + agent_execution_time=agent_time, + environment_execution_time=env_time, + env_step_time=env_time / n_observations if env_time > 0 and n_observations > 0 else -1.0, + agent_step_time=agent_time / n_other_steps if agent_time > 0 and n_other_steps > 0 else -1.0, + ) + + return RolloutResult( + training_texts=training_texts, + metrics=metrics, + latency=latency, + dataset_name=problem["dataset"], + prompt_tokens=prompt_tokens, + output_tokens=output_tokens, + ) + finally: + environment.close() + + +async def generate_miniwob_rollout_async( cfg: DictConfig, llm: TrainableLLM, problem: dict, @@ -142,12 +265,12 @@ async def generate_miniwob_rollout( # get the number of PageObservation steps in the tape n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) - #reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 + # reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 # massimo's setup: - reward = float(raw_reward>0) + reward = float(raw_reward > 0) if reward == 0.0: reward = -1.0 - reward *= 0.98 ** n_page_observations + reward *= 0.98**n_page_observations # (3) Get LLM calls from Tape llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] @@ -171,7 +294,7 @@ async def generate_miniwob_rollout( latency = time.time() - start_time agent_time = tape.metadata.result.get("agent_execution_time", -1.0) env_time = tape.metadata.result.get("environment_execution_time", -1.0) - n_observations = len([s for s in tape.steps if isinstance(s, Observation)]) # TODO: is this not the same n_page_observations?? + n_observations = len([s for s in tape.steps if isinstance(s, Observation)]) n_other_steps = len(tape.steps) - n_observations metrics = MiniwobMetrics( reward=reward, From a39f8b4c9abb8f698bcd77bc24b560bcc384eb53 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 14 Oct 2025 15:12:43 +0000 Subject: [PATCH 108/126] miniwob config with ray actor, sync mode, embedded env --- conf/miniwob.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index a55dfd65..f73560df 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -15,6 +15,8 @@ save_tapes: False output_dir: results/miniwob/${now:%Y-%m-%d}/${now:%H-%M-%S} model_path: meta-llama/Llama-3.1-8B-Instruct +use_ray: true +attempts: 8 finetune: seq_length: 16384 # input + output tokens @@ -37,6 +39,9 @@ vllm_config: actor: rollout_policy: pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout + async_batch_size: 1 + llm_max_rollouts: 256 + problem_queue_size: 256 shared_memory_entry_size: 100000000 preprocess: @@ -123,12 +128,7 @@ agent: # ENVIRONMENT CONFIGURATION start_attempts: 3 # number of attempts to start each task environment: - _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer - miniwob_url: ??? - n_envs: 32 - host: "0.0.0.0" - env_call_timeout: 600 # timeout for each environment call (e.g. start_task, act, etc.) 
- web_env_target: examples.rl_webagent.environment.WebEnvironment + _target_: examples.rl_webagent.environment.WebEnvironment exp_path: ${output_dir}/env_server headless: true observation_format: html From 4aad548107aacdee45fdd9fcb1a9e10e7ed86ecd Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 15 Oct 2025 10:50:54 +0000 Subject: [PATCH 109/126] port changes from debug_miniwob branch --- conf/miniwob.yaml | 38 +++++++++++++++++------ conf/miniwob_grpo.yaml | 10 ++++++ conf/miniwob_massimo_grpo.yaml | 15 +++++++++ conf/miniwob_massimo_ppo.yaml | 15 +++++++++ pipelinerl/domains/miniwob/load_tasks.py | 5 +++ pipelinerl/domains/miniwob/rollouts.py | 39 +++++++++++++++++++----- pipelinerl/utils.py | 3 ++ 7 files changed, 108 insertions(+), 17 deletions(-) create mode 100644 conf/miniwob_grpo.yaml create mode 100644 conf/miniwob_massimo_grpo.yaml create mode 100644 conf/miniwob_massimo_ppo.yaml diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index f73560df..db147c2d 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -5,9 +5,9 @@ defaults: - _self_ world: - actor_fraction: 3 + actor_fraction: 2 preprocessor_fraction: 0 - finetune_fraction: 5 + finetune_fraction: 6 # debug: # mode: actor @@ -20,7 +20,11 @@ attempts: 8 finetune: seq_length: 16384 # input + output tokens - max_train_steps: 1000 + max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples + train_batch_size: 1 + gradient_accumulation_passes: 1024 + +eval_every_n_versions: 10240 # 1024 effective bs * 10 "optim steps" llm: parameters: @@ -45,10 +49,25 @@ actor: shared_memory_entry_size: 100000000 preprocess: - shared_memory_entry_size: 1000000000 + n_workers: 32 # Increase from 8 + chunk_n_groups: 8 # Increase from 2 for better throughput + # queue for loaded raw groups + raw_queue_size: 32 # Increase from 8 + # queue for processed chunks of multiple groups + input_queue_size: 64 # Increase from 32 + # queue for ready chunks for multiple groups + output_queue_size: 64 # Increase from 32 + # ring buffer to replace old samples with new ones when training is slow + ring_buffer_size: 1024 # Increase from 128 + # "virtual" sample queue per lead trainer + max_ready_samples_per_lead: 256 # Increase from 64 + shared_memory_entry_size: 1000000000 # Increase from 100M # AGENT CONFIGURATION agent_max_loops: 10 # max number of agent - environment interactions for each task +agent_attempts: 3 # number of attempts to run the agent (retry on errors) +rollout_timeout: 600 # overall timeout for entire rollout in seconds (10 minutes) +reward_computation: nico agent: _target_: tapeagents.agent.Agent name : web_agent @@ -66,8 +85,9 @@ agent: {allowed_steps} Do not reproduce schema when producing the steps, use it as a reference. json_format: | - Important! Respond with parsable JSON, do not include any text or code. - Do not output anything besides one JSON object. + Important! Respond with very simple parsable JSON! + Do not use any special characters or code. Do not use new lines, tabs, or any other formatting inside the JSON. + Do not output anything besides one simple JSON object. nodes: - _target_: examples.rl_webagent.agent.WebNode name: set_goal @@ -94,7 +114,7 @@ agent: - check if you are stuck with repeating the same action over and over again, if so, try something else and change the action. - check if you think the task is done, if not give a detailed list of actions to do next to accomplish the task. - finally, if the task is not done, describe the immediate next action to be performed and its expected effect on the page. 
- Produce only one reasoning_thought step! + Produce only one reasoning_thought step! Be brief and to the point. You can skip some details if they are not relevant for this step. ${agent.templates.json_format} steps_prompt: ${agent.templates.allowed_steps} steps: @@ -129,7 +149,7 @@ agent: start_attempts: 3 # number of attempts to start each task environment: _target_: examples.rl_webagent.environment.WebEnvironment - exp_path: ${output_dir}/env_server + exp_path: null headless: true observation_format: html @@ -141,4 +161,4 @@ dataset_loader_params: train_dataset_names: - train test_dataset_names: - - test + - test \ No newline at end of file diff --git a/conf/miniwob_grpo.yaml b/conf/miniwob_grpo.yaml new file mode 100644 index 00000000..6df78fbf --- /dev/null +++ b/conf/miniwob_grpo.yaml @@ -0,0 +1,10 @@ +defaults: + - miniwob + - override finetune: grpo + - _self_ + +finetune: + seq_length: 16384 # input + output tokens + max_train_steps: 5000 # 1000 optim steps = 1000 * bs samples + train_batch_size: 1 + gradient_accumulation_passes: 1024 diff --git a/conf/miniwob_massimo_grpo.yaml b/conf/miniwob_massimo_grpo.yaml new file mode 100644 index 00000000..cf4c2269 --- /dev/null +++ b/conf/miniwob_massimo_grpo.yaml @@ -0,0 +1,15 @@ +defaults: + - miniwob_grpo + - _self_ + +train_dataset_names: + - massimo_train +test_dataset_names: + - massimo_test + +reward_computation: massimo + +finetune: + gradient_accumulation_passes: 512 + +eval_every_n_versions: 5120 # 512 effective bs * 10 "optim steps" \ No newline at end of file diff --git a/conf/miniwob_massimo_ppo.yaml b/conf/miniwob_massimo_ppo.yaml new file mode 100644 index 00000000..8053493a --- /dev/null +++ b/conf/miniwob_massimo_ppo.yaml @@ -0,0 +1,15 @@ +defaults: + - miniwob + - _self_ + +train_dataset_names: + - massimo_train +test_dataset_names: + - massimo_test + +reward_computation: massimo + +finetune: + gradient_accumulation_passes: 512 + +eval_every_n_versions: 5120 # 512 effective bs * 10 "optim steps" \ No newline at end of file diff --git a/pipelinerl/domains/miniwob/load_tasks.py b/pipelinerl/domains/miniwob/load_tasks.py index a056a311..63cbbb8a 100644 --- a/pipelinerl/domains/miniwob/load_tasks.py +++ b/pipelinerl/domains/miniwob/load_tasks.py @@ -207,6 +207,11 @@ def load_tasks(dataset_names: list[str], train_split: float = 0.6, seeds: list[i {"dataset": task, "task": task, "seed": seed} for task in MASSIMO_TRAIN_SPLIT for seed in range(3,10) # seeds 0-2 are used for held out goals in Mass setup ]) + elif name == "massimo_train_heldout_goals": + tasks.extend([ + {"dataset": task, "task": task, "seed": seed} + for task in MASSIMO_TRAIN_SPLIT for seed in range(3) # seeds 0-2 are used for held out goals in Mass setup + ]) elif name == "massimo_test": tasks.extend([ {"dataset": task, "task": task, "seed": seed} diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index c42cbf97..0cd45c02 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -97,7 +97,10 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) # save the tape as we go if cfg.save_tapes: - save_json_tape(tape, os.path.join(cfg.output_dir, "tapes"), tape.metadata.id) + try: + save_json_tape(tape, os.path.join(cfg.output_dir, "tapes"), tape.metadata.id) + except Exception as e: + logger.error(f"Error saving tape: {e}") # (3) Compute rewards obs_steps = [step for step in tape if isinstance(step, Observation)] @@ -116,12 +119,32 @@ def 
generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) # get the number of PageObservation steps in the tape n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) - # reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 - # massimo's setup: - reward = float(raw_reward > 0) - if reward == 0.0: - reward = -1.0 - reward *= 0.98**n_page_observations + if obs_steps: + last_obs = obs_steps[-1] + # in Miniwob, the observation "reward" is defined as RAW_REWARD_GLOBAL > 0 + # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L188 + # Let's take directly the RAW_REWARD_GLOBAL from the metadata + # raw_reward = last_obs.metadata.other.get("reward", 0.0) + raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) + else: + raw_reward = -1.0 + + no_error = not tape_contains_an_error(tape) + # get the number of LLMOutputParsingFailureAction in the tape + n_step_errors = len([step for step in tape.steps if isinstance(step, LLMOutputParsingFailureAction)]) + # get the number of PageObservation steps in the tape + n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) + + if cfg.reward_computation == "nico": + reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 + elif cfg.reward_computation == "massimo": + reward = float(raw_reward>0) + if reward == 0.0: + reward = -1.0 + reward *= 0.98 ** n_page_observations + else: + raise ValueError(f"Invalid reward configuration: {cfg.reward_computation}") + # (3) Get LLM calls from Tape llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] @@ -152,7 +175,7 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) metrics = MiniwobMetrics( reward=reward, success=reward > 0.5, - no_error=not tape_contains_an_error(tape), + no_error=no_error, no_answer=reward < 0, overflow=not all_finished, n_llm_calls=n_llm_calls, diff --git a/pipelinerl/utils.py b/pipelinerl/utils.py index 6243c2c7..2378e2a2 100644 --- a/pipelinerl/utils.py +++ b/pipelinerl/utils.py @@ -238,6 +238,9 @@ def calculate_stats(stats: List | Dict[Any, Any]) -> Dict[str, float]: if not isinstance(stats, list): raise TypeError(f"Expected stats to be a list, got {type(stats)}") + if len(stats) == 0: + return {} + aggregated_stats = { "max": float(max(stats)), "min": float(min(stats)), From 6d129755b45cd051b938bd57df347081119a2ad3 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 15 Oct 2025 16:39:56 +0000 Subject: [PATCH 110/126] format and store ray worker logs --- pipelinerl/actor.py | 127 +++++++++++++++++++++++++++----------------- 1 file changed, 78 insertions(+), 49 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 1e20a54a..011fd65d 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -5,6 +5,7 @@ import os import queue import random +import sys import time from collections import defaultdict from multiprocessing.managers import SharedMemoryManager @@ -44,6 +45,7 @@ logger = logging.getLogger(__name__) + class SlidingWindowData(BaseModel): prompt_tokens_window: list[list[int]] = Field( default_factory=list, @@ -107,7 +109,6 @@ def get_stats(self): } - def make_stats_dict() -> dict: return defaultdict(lambda: defaultdict(list)) @@ -315,10 +316,16 @@ def start_backend(self): self.smm.start() # Use SharedMemoryQueue instead of separate 
problem_queue, result_queue, and io_buffer - self.problem_queue = SharedMemoryQueue(self.smm, self.cfg.actor.problem_queue_size, self.cfg.actor.shared_memory_entry_size) - self.result_queue = SharedMemoryQueue(self.smm, self.cfg.actor.result_queue_size, self.cfg.actor.shared_memory_entry_size) + self.problem_queue = SharedMemoryQueue( + self.smm, self.cfg.actor.problem_queue_size, self.cfg.actor.shared_memory_entry_size + ) + self.result_queue = SharedMemoryQueue( + self.smm, self.cfg.actor.result_queue_size, self.cfg.actor.shared_memory_entry_size + ) - logger.info(f"Problem queue size: {self.problem_queue.max_size}, result queue size: {self.result_queue.max_size}") + logger.info( + f"Problem queue size: {self.problem_queue.max_size}, result queue size: {self.result_queue.max_size}" + ) logger.info(f"Result queue buffer size: {self.result_queue.get_memory_size() / 2**30} Gb") # Create and start multiple rollout processes @@ -355,10 +362,10 @@ def init_stats(self): def compute_domain_agnostic_metrics(self, result: RolloutResult) -> Dict[str, float]: metrics = {} - metrics['overflow'] = all([not training_text.finished for training_text in result.training_texts ]) - metrics['num_turns'] = len(result.training_texts) - metrics['prompt_tokens'] = [training_text.prompt_tokens for training_text in result.training_texts] - metrics['output_tokens'] = [training_text.output_tokens for training_text in result.training_texts] + metrics["overflow"] = all([not training_text.finished for training_text in result.training_texts]) + metrics["num_turns"] = len(result.training_texts) + metrics["prompt_tokens"] = [training_text.prompt_tokens for training_text in result.training_texts] + metrics["output_tokens"] = [training_text.output_tokens for training_text in result.training_texts] return metrics @@ -382,16 +389,18 @@ def update_stats(self, rollout_results: List[RolloutResult]): else: raise ValueError(f"Unsupported metric type: {type(v)} for key {k}") - prompt_length_tokens = [training_text.prompt_tokens for result in rollout_results for training_text in result.training_texts] - output_length_tokens = [training_text.output_tokens for result in rollout_results for training_text in result.training_texts] + prompt_length_tokens = [ + training_text.prompt_tokens for result in rollout_results for training_text in result.training_texts + ] + output_length_tokens = [ + training_text.output_tokens for result in rollout_results for training_text in result.training_texts + ] self.sliding_aggregator.update(prompt_length_tokens, output_length_tokens) sliding_window_stats = self.sliding_aggregator.get_stats() if sliding_window_stats is not None: for k, v in sliding_window_stats.items(): self.sliding_stats[k].append(v) - - def run(self, dataset: list[tuple[str, dict]]): loop_start_time = time.time() self.init_stats() @@ -490,9 +499,7 @@ def run(self, dataset: list[tuple[str, dict]]): if len(rollout_results) == 0: continue assert isinstance(rollout_results[0], RolloutResult) - assert len(rollout_results) == attempts, ( - f"Expected {attempts} rollouts, got {len(rollout_results)}" - ) + assert len(rollout_results) == attempts, f"Expected {attempts} rollouts, got {len(rollout_results)}" group_samples = sum(len(r.training_texts) for r in rollout_results) published_samples += group_samples @@ -509,13 +516,11 @@ def run(self, dataset: list[tuple[str, dict]]): f" {in_progress} groups in progress" ) - self.update_stats(rollout_results=rollout_results) finished_groups += 1 time_to_publish_train_stats = ( - self.is_training - and 
trainer_version_to_publish is not None + self.is_training and trainer_version_to_publish is not None ) or self.debug_mode time_to_publish_test_stats = finished_groups == expected_rollouts @@ -534,16 +539,13 @@ def run(self, dataset: list[tuple[str, dict]]): } trainer_version_to_publish = None else: - loop_stats = { - "trainer_model_version": last_trainer_version - } + loop_stats = {"trainer_model_version": last_trainer_version} self.publish_stats( stats_writer=stats_writer, loop_stats=loop_stats, ) - if finished_groups == expected_rollouts: logger.info(f"Finished {expected_rollouts} rollouts, stopping actor loop") self.stop_tasks() @@ -562,18 +564,9 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict): stats[f"{dataset_name}/{metric_name}_{agg}"] = sub_stats stats |= ( - { - f"{split_name}{k}": v - for k, v in always_or_never_success_stats(self.stats["success"]).items() - } - | { - f"{split_name}latency_" + k: v - for k, v in calculate_stats(self.latency_list).items() - } - | { - f"{split_name}model_version_" + k: v - for k, v in calculate_stats(self.model_versions_list).items() - } + {f"{split_name}{k}": v for k, v in always_or_never_success_stats(self.stats["success"]).items()} + | {f"{split_name}latency_" + k: v for k, v in calculate_stats(self.latency_list).items()} + | {f"{split_name}model_version_" + k: v for k, v in calculate_stats(self.model_versions_list).items()} ) stats |= loop_stats @@ -626,11 +619,14 @@ class ActorLoopRay(ActorLoop): """ Loop that runs the ray tasks for n_jobs to perform rollouts in parallel """ + ray_ready: bool = False - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.cfg.attempts % self.cfg.actor.async_batch_size == 0, f"attempts {self.cfg.attempts} must be divisible by actor.async_batch_size {self.cfg.actor.async_batch_size}" + def __init__(self, cfg: DictConfig, *args, **kwargs): + assert cfg.attempts % cfg.actor.async_batch_size == 0, ( + f"attempts {cfg.attempts} must be divisible by actor.async_batch_size {cfg.actor.async_batch_size}" + ) + super().__init__(cfg, *args, **kwargs) self.cfg_dict = OmegaConf.to_container(self.cfg, resolve=True) self.unfinished_tasks = [] self.llms_by_url = {llm.get_base_url(): llm for llm in self.llms} @@ -638,36 +634,53 @@ def __init__(self, *args, **kwargs): self.scheduler_name = f"{'train' if self.is_training else 'test'} ray scheduler" self.problem_id = 0 self.attempts = self.cfg.attempts if self.is_training else 1 - self.unfinished_problems = defaultdict(list) # up to `attempts` rollout results for each problem + self.unfinished_problems = defaultdict(list) # up to `attempts` rollout results for each problem self.finished_problems = [] self.token_count = 0 self.finished_rollouts_count = 0 self.task_latencies = [] self.ray_result_latencies = [] + self.log_dir = Path(self.cfg.output_dir) / "actor" / "ray" def start_backend(self): if not self.ray_ready: logger.info(f"Initializing Ray with {self.cfg.actor.rollout_workers} workers..") - ray_context = ray.init(num_cpus=self.cfg.actor.rollout_workers, dashboard_host="0.0.0.0", include_dashboard=True) + self.log_dir.mkdir(parents=True, exist_ok=True) + ray_context = ray.init( + num_cpus=self.cfg.actor.rollout_workers, + dashboard_host="0.0.0.0", + include_dashboard=True, + log_to_driver=True, + ) logger.info(f"Ray initialized, dashboard at {ray_context.dashboard_url}") self.ray_ready = True else: logger.info("Ray already initialized") assert self.trainer_state.propagated_weight_version is not None - rollout_policy: 
Callable[[DictConfig, TrainableLLM, dict], RolloutResult] = hydra.utils.get_method(self.cfg.actor.rollout_policy) + rollout_policy: Callable[[DictConfig, TrainableLLM, dict], RolloutResult] = hydra.utils.get_method( + self.cfg.actor.rollout_policy + ) + def rollout_wrapper(cfg_dict: dict, llm: TrainableLLM, problems: list[dict], problem_id: int) -> RolloutResult: assert len(problems) == 1, "Sync mode should only be used with 1 problem at a time" - problem = problems[0] cfg = OmegaConf.create(cfg_dict) + problem = problems[0] + task_id = problem["_task_id"] + log_file = Path(cfg.output_dir) / "actor" / "ray" / f"{task_id}.log" + sys.stdout = open(log_file, "a", buffering=1) + sys.stderr = sys.stdout + logging.basicConfig(level=logging.INFO, stream=sys.stdout, force=True) start_ts = time.monotonic() - logger.info(f"Running sync rollout for problem {problem['_task_id']}") + logger.info(f"Running sync rollout for problem {task_id}") rollout_result: RolloutResult = rollout_policy(cfg, llm, problem) stop_ts = time.monotonic() logger.info(f"Problem {problem['_task_id']} finished in {stop_ts - start_ts:.2f} seconds") return [rollout_result], llm.get_base_url(), problem_id, [stop_ts - start_ts], stop_ts - async def run_multiple_rollouts(cfg: DictConfig, llm: TrainableLLM, problems: list[dict], session: aiohttp.ClientSession) -> RolloutResult: + async def run_multiple_rollouts( + cfg: DictConfig, llm: TrainableLLM, problems: list[dict], session: aiohttp.ClientSession + ) -> RolloutResult: # Run all rollouts in parallel using asyncio.gather async def run_rollout(problem): logger.info(f"Running async rollout loop for problem {problem['_task_id']}") @@ -684,14 +697,24 @@ async def run_rollout(problem): return rollout_results, task_latencies async def run_rollouts_with_session(cfg: DictConfig, llm: TrainableLLM, problems: list[dict]) -> RolloutResult: - connector = aiohttp.TCPConnector(limit=cfg.actor.async_batch_size, limit_per_host=cfg.actor.async_batch_size, keepalive_timeout=1.0) + connector = aiohttp.TCPConnector( + limit=cfg.actor.async_batch_size, limit_per_host=cfg.actor.async_batch_size, keepalive_timeout=1.0 + ) timeout = aiohttp.ClientTimeout(total=3600.0, connect=3600.0, sock_read=3600.0) async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: rollout_results, task_latencies = await run_multiple_rollouts(cfg, llm, problems, session) return rollout_results, task_latencies - def rollout_async_batch_wrapper(cfg_dict: dict, llm: TrainableLLM, problems: list[dict], problem_id: int) -> RolloutResult: + def rollout_async_batch_wrapper( + cfg_dict: dict, llm: TrainableLLM, problems: list[dict], problem_id: int + ) -> RolloutResult: cfg = OmegaConf.create(cfg_dict) + log_file = ( + Path(cfg.output_dir) / "actor" / "ray" / f"{problems[0]['_task_id']}_async_{len(problems)}_problems.log" + ) + sys.stdout = open(log_file, "a", buffering=1) + sys.stderr = sys.stdout + logging.basicConfig(level=logging.INFO, stream=sys.stdout, force=True) logger.info(f"Running async rollouts for {len(problems)} problems") results, task_latencies = asyncio.run(run_rollouts_with_session(cfg, llm, problems)) stop_ts = time.monotonic() @@ -707,10 +730,13 @@ def rollout_async_batch_wrapper(cfg_dict: dict, llm: TrainableLLM, problems: lis def have_capacity(self) -> bool: have_capacity = len(self.unfinished_tasks) < self.cfg.actor.problem_queue_size - have_llm_capacity = any(self.llms_utilization[llm_url] < (self.cfg.actor.llm_max_rollouts - self.attempts) for llm_url in self.llms_utilization) + 
have_llm_capacity = any( + self.llms_utilization[llm_url] < (self.cfg.actor.llm_max_rollouts - self.attempts) + for llm_url in self.llms_utilization + ) have_capacity = have_capacity and have_llm_capacity if not have_capacity: - time.sleep(0.1) # sleep for a while to avoid quick loops when no capacity + time.sleep(0.1) # sleep for a while to avoid quick loops when no capacity return have_capacity def submit_problem(self, problem: dict): @@ -721,7 +747,10 @@ def submit_problem(self, problem: dict): p["_task_id"] = f"problem_{self.problem_id}_attempt_{n}" problems.append(p) # Split problems into batches of up to cfg.async_batch_size - batches = [problems[i:i + self.cfg.actor.async_batch_size] for i in range(0, len(problems), self.cfg.actor.async_batch_size)] + batches = [ + problems[i : i + self.cfg.actor.async_batch_size] + for i in range(0, len(problems), self.cfg.actor.async_batch_size) + ] for batch_idx, problem_batch in enumerate(batches): llm_url, task_count = min(self.llms_utilization.items(), key=lambda x: x[1]) logger.info( @@ -832,7 +861,7 @@ def run_actor_loop(cfg: DictConfig): dataset_loader = hydra.utils.get_method(cfg.dataset_loader) # Get dataset loader parameters if they exist in config, otherwise use empty dict - dataset_loader_params = cfg.get('dataset_loader_params', {}) + dataset_loader_params = cfg.get("dataset_loader_params", {}) # Use **dataset_loader_params to pass parameters only if they exist train_dataset = dataset_loader(cfg.train_dataset_names, **dataset_loader_params) test_dataset = dataset_loader(cfg.test_dataset_names, **dataset_loader_params) From c38d9872babaf48b7e0924b570214cb4de98fe37 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 15 Oct 2025 16:40:15 +0000 Subject: [PATCH 111/126] catch tape saving errors --- pipelinerl/domains/miniwob/rollouts.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 0cd45c02..7a38a5de 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -98,9 +98,10 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) # save the tape as we go if cfg.save_tapes: try: - save_json_tape(tape, os.path.join(cfg.output_dir, "tapes"), tape.metadata.id) + tape_name = problem.get("_task_id", tape.metadata.id) + save_json_tape(tape, os.path.join(cfg.output_dir, "tapes"), tape_name) except Exception as e: - logger.error(f"Error saving tape: {e}") + logger.error(f"Error saving tape {tape_name}: {e}") # (3) Compute rewards obs_steps = [step for step in tape if isinstance(step, Observation)] From 9ffa38cd6a40d64db12d3e524844fe09d2bad2d2 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 15 Oct 2025 16:40:26 +0000 Subject: [PATCH 112/126] fix --- pipelinerl/launch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelinerl/launch.py b/pipelinerl/launch.py index c788bc7c..953e56c9 100644 --- a/pipelinerl/launch.py +++ b/pipelinerl/launch.py @@ -554,7 +554,8 @@ def main(cfg: DictConfig): group = str(exp_dir) root = cfg.wandb.wandb_workspace_root if root: - if not group.startswith(root + "/"): + check_root = (root + "/") if not root.endswith("/") else root + if not group.startswith(check_root): raise ValueError(f"run_dir {exp_dir} does not start with root {root}") cfg.wandb.wandb_group = group[len(root) + 1 :] if world_map.total_finetune_gpus: From 6b55c6b44634fbc3c7253d7b4d7e8b9b4e8e3aff Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: 
Wed, 15 Oct 2025 16:40:41 +0000 Subject: [PATCH 113/126] update miniwob conf --- conf/miniwob.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index db147c2d..253eb2d3 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -9,9 +9,7 @@ world: preprocessor_fraction: 0 finetune_fraction: 6 -# debug: -# mode: actor -save_tapes: False +save_tapes: true output_dir: results/miniwob/${now:%Y-%m-%d}/${now:%H-%M-%S} model_path: meta-llama/Llama-3.1-8B-Instruct @@ -43,9 +41,10 @@ vllm_config: actor: rollout_policy: pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout - async_batch_size: 1 llm_max_rollouts: 256 problem_queue_size: 256 + async_batch_size: 1 + rollout_workers: 32 shared_memory_entry_size: 100000000 preprocess: From 3fca6839ade64525b8c79e84eebb833375321432 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 27 Oct 2025 13:48:43 +0000 Subject: [PATCH 114/126] better logging --- pipelinerl/finetune/logging_.py | 2 +- pipelinerl/finetune_loop.py | 1 + pipelinerl/preprocess.py | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pipelinerl/finetune/logging_.py b/pipelinerl/finetune/logging_.py index 0b221e24..0624f765 100644 --- a/pipelinerl/finetune/logging_.py +++ b/pipelinerl/finetune/logging_.py @@ -25,7 +25,7 @@ def setup_logging(cfg: DictConfig, output_dir: Path, run: wandb_run.Run | None = debug_handler = logging.FileHandler(log_dir / f"info_{get_accelerator().process_index}.log") debug_handler.setLevel(logging.INFO) logging.basicConfig( - format="[finetune]: %(asctime)s.%(msecs)03d - %(levelname)s - %(name)s - %(message)s", + format="[finetune]: %(asctime)s.%(msecs)03d - %(levelname)s - %(name)s:%(lineno)d - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, handlers=[debug_handler, logging.StreamHandler()], diff --git a/pipelinerl/finetune_loop.py b/pipelinerl/finetune_loop.py index 7b616c43..5a529d44 100644 --- a/pipelinerl/finetune_loop.py +++ b/pipelinerl/finetune_loop.py @@ -440,6 +440,7 @@ def run_finetuning_loop( logger.info("Load the first version of the model into inference LLMs") weight_update_manager.send_weight_update(training_metrics.samples) else: + logger.info("send_weight_updates disabled, weight_update_manager is None") weight_update_manager = None batch_queue = Queue(maxsize=1) diff --git a/pipelinerl/preprocess.py b/pipelinerl/preprocess.py index 9a2af86a..200a0220 100644 --- a/pipelinerl/preprocess.py +++ b/pipelinerl/preprocess.py @@ -202,7 +202,7 @@ def run_dataset_loader( # This is a blocking call, but in most cases there will be space raw_chunk_queue.put(buffer) except Exception as e: - logger.error(f"Error in dataset loader: {e}") + logger.exception(f"Error in dataset loader: {e}") raw_chunk_queue.put(e) break @@ -389,8 +389,8 @@ def run_preprocessing_loop( # Initialize TrainerState trainer_state = TrainerState(exp_root_dir) - if cfg.debug.mode == "preprocessor": - logger.info("Debug mode: preprocessor") + if cfg.debug.mode == "preprocessor" or cfg.debug.mode == "actor+preprocessor": + logger.info(f"Debug mode: {cfg.debug.mode}") trainer_state.debug_mode_init() elif cfg.debug.mode == "finetune+preprocessor": logger.info("Debug mode: finetune+preprocessor") @@ -537,7 +537,7 @@ def run_preprocessing_loop( else: processed_entries_queue_popped_data += 1 if processed_entries_queue_popped_data % 100 == 0 and last_time_notice != processed_entries_queue_popped_data // 100: - logger.warning(f"Popped {processed_entries_queue_popped_data} old entries 
from processed entries queue") + logger.warning(f"Popped {processed_entries_queue_popped_data} old entries from processed entries queue of max size {processed_entries_queue.maxlen}") last_time_notice = processed_entries_queue_popped_data // 100 entry = buffer.popleft() processed_entries_queue.append(entry) # drop from the left if full From cd07c3e187f957cba232ca0db437338ea41c25b5 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 27 Oct 2025 13:50:02 +0000 Subject: [PATCH 115/126] fix ray reinit with flag --- pipelinerl/actor.py | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 011fd65d..268a59b6 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -519,6 +519,7 @@ def run(self, dataset: list[tuple[str, dict]]): self.update_stats(rollout_results=rollout_results) finished_groups += 1 + logger.info(f"Finished {'train' if self.is_training else 'test'} groups {finished_groups} out of {expected_rollouts}") time_to_publish_train_stats = ( self.is_training and trainer_version_to_publish is not None ) or self.debug_mode @@ -587,7 +588,6 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict): stats[f"{prefix}{new_suffix}"] = stats[key] break - logger.info(f"Publish actor stats to wandb: {stats}") if self.cfg.wandb.use_wandb: wandb.log({f"actor/{k}": v for k, v in stats.items()}) stats_writer.write(stats) @@ -620,8 +620,6 @@ class ActorLoopRay(ActorLoop): Loop that runs the ray tasks for n_jobs to perform rollouts in parallel """ - ray_ready: bool = False - def __init__(self, cfg: DictConfig, *args, **kwargs): assert cfg.attempts % cfg.actor.async_batch_size == 0, ( f"attempts {cfg.attempts} must be divisible by actor.async_batch_size {cfg.actor.async_batch_size}" @@ -643,19 +641,16 @@ def __init__(self, cfg: DictConfig, *args, **kwargs): self.log_dir = Path(self.cfg.output_dir) / "actor" / "ray" def start_backend(self): - if not self.ray_ready: - logger.info(f"Initializing Ray with {self.cfg.actor.rollout_workers} workers..") - self.log_dir.mkdir(parents=True, exist_ok=True) - ray_context = ray.init( - num_cpus=self.cfg.actor.rollout_workers, - dashboard_host="0.0.0.0", - include_dashboard=True, - log_to_driver=True, - ) - logger.info(f"Ray initialized, dashboard at {ray_context.dashboard_url}") - self.ray_ready = True - else: - logger.info("Ray already initialized") + logger.info(f"Initializing Ray with {self.cfg.actor.rollout_workers} workers..") + self.log_dir.mkdir(parents=True, exist_ok=True) + ray_context = ray.init( + num_cpus=self.cfg.actor.rollout_workers, + dashboard_host="0.0.0.0", + include_dashboard=True, + log_to_driver=True, + ignore_reinit_error=True, + ) + logger.info(f"Ray initialized, dashboard at {ray_context.dashboard_url}") assert self.trainer_state.propagated_weight_version is not None rollout_policy: Callable[[DictConfig, TrainableLLM, dict], RolloutResult] = hydra.utils.get_method( From 3a0b70956b222143d16af5018b7b861d51eb3e94 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 27 Oct 2025 14:56:58 +0000 Subject: [PATCH 116/126] fix stopping test loop --- pipelinerl/actor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 268a59b6..98c453f6 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -547,7 +547,7 @@ def run(self, dataset: list[tuple[str, dict]]): loop_stats=loop_stats, ) - if finished_groups == expected_rollouts: + if expected_rollouts >= 0 and 
finished_groups + self.rollout_errors >= expected_rollouts: logger.info(f"Finished {expected_rollouts} rollouts, stopping actor loop") self.stop_tasks() break @@ -799,7 +799,8 @@ def receive_finished_tasks(self): self.finished_rollouts_count += 1 self.unfinished_problems[problem_id].append(rollout_result) logger.info(f"Problem {problem_id} has {len(self.unfinished_problems[problem_id])} rollout results") - if len(self.unfinished_problems[problem_id]) == self.cfg.attempts: + attempts = self.cfg.attempts if self.is_training else 1 + if len(self.unfinished_problems[problem_id]) == attempts: logger.info(f"Problem {problem_id} group finished") group = self.unfinished_problems[problem_id] random.shuffle(group) From 8101531e53e3e8b04b9cb8af069016e232762900 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 27 Oct 2025 14:57:24 +0000 Subject: [PATCH 117/126] update miniwob vllm params --- conf/miniwob.yaml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 253eb2d3..9bfa553f 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -7,14 +7,13 @@ defaults: world: actor_fraction: 2 preprocessor_fraction: 0 - finetune_fraction: 6 + finetune_fraction: 4 save_tapes: true output_dir: results/miniwob/${now:%Y-%m-%d}/${now:%H-%M-%S} model_path: meta-llama/Llama-3.1-8B-Instruct use_ray: true -attempts: 8 finetune: seq_length: 16384 # input + output tokens @@ -25,6 +24,7 @@ finetune: eval_every_n_versions: 10240 # 1024 effective bs * 10 "optim steps" llm: + use_cache: false parameters: max_tokens: 4096 # output tokens temperature: 1.0 @@ -36,8 +36,12 @@ test_llm: top_k: 50 vllm_config: + use_v1: false vllm_kwargs: - max_model_len: 16384 # input + output tokens + max-num-seqs: 256 + max-num-batched-tokens: 32000 + max_model_len: 16384 + gpu-memory-utilization: 0.9 actor: rollout_policy: pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout @@ -150,7 +154,7 @@ environment: _target_: examples.rl_webagent.environment.WebEnvironment exp_path: null headless: true - observation_format: html + observation_format: axtree # DATASET CONFIGURATION dataset_loader: pipelinerl.domains.miniwob.load_tasks.load_tasks From 30e486fed60782d2a32d34dfb4381f67caff27b8 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 27 Oct 2025 15:56:59 +0000 Subject: [PATCH 118/126] more rollout timing metrics --- pipelinerl/domains/miniwob/rollouts.py | 247 ++++++++++++++----------- 1 file changed, 137 insertions(+), 110 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 7a38a5de..ff4318b0 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -35,10 +35,17 @@ class MiniwobMetrics(BaseMetrics): n_page_observations: int n_steps: int total_execution_time: float + env_start_time: float + env_close_time: float + env_agent_creation_time: float agent_execution_time: float environment_execution_time: float env_step_time: float agent_step_time: float + llm_call_time: float + env_call_time: float + total_llm_call_time: float + total_env_call_time: float def tape_contains_an_error(tape: WebTape) -> bool: @@ -64,9 +71,10 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) # compute rewards # get training text from llm calls - start_time = time.time() + start_time = time.perf_counter() agent, env = get_agent_and_env_from_config(cfg) + env_agent_creation_time = time.perf_counter() - start_time environment: WebEnvironment = env try: 
agent.llms = {DEFAULT: llm} @@ -85,121 +93,140 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) else: logger.warning("retry after 1 seconds") time.sleep(1) + env_start_time = time.perf_counter() - t logger.info( - f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds" + f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {env_start_time:.2f} seconds" ) logger.info(f"Running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']}") + ex_t = time.perf_counter() tape = execute_agent(agent, tape, env, max_loops=cfg.agent_max_loops) - logger.info( - f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} sec, produced tape with {len(tape.steps)} steps" - ) - tape.metadata.result.update({"total_execution_time": time.perf_counter() - t}) - - # save the tape as we go - if cfg.save_tapes: - try: - tape_name = problem.get("_task_id", tape.metadata.id) - save_json_tape(tape, os.path.join(cfg.output_dir, "tapes"), tape_name) - except Exception as e: - logger.error(f"Error saving tape {tape_name}: {e}") - - # (3) Compute rewards - obs_steps = [step for step in tape if isinstance(step, Observation)] - if obs_steps: - last_obs = obs_steps[-1] - # in Miniwob, the observation "reward" is defined as RAW_REWARD_GLOBAL > 0 - # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L188 - # Let's take directly the RAW_REWARD_GLOBAL from the metadata - # raw_reward = last_obs.metadata.other.get("reward", 0.0) - raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) - else: - raw_reward = -1.0 - - # get the number of LLMOutputParsingFailureAction in the tape - n_step_errors = len([step for step in tape.steps if isinstance(step, LLMOutputParsingFailureAction)]) - # get the number of PageObservation steps in the tape - n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) - - if obs_steps: - last_obs = obs_steps[-1] - # in Miniwob, the observation "reward" is defined as RAW_REWARD_GLOBAL > 0 - # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L188 - # Let's take directly the RAW_REWARD_GLOBAL from the metadata - # raw_reward = last_obs.metadata.other.get("reward", 0.0) - raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) - else: - raw_reward = -1.0 - - no_error = not tape_contains_an_error(tape) - # get the number of LLMOutputParsingFailureAction in the tape - n_step_errors = len([step for step in tape.steps if isinstance(step, LLMOutputParsingFailureAction)]) - # get the number of PageObservation steps in the tape - n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) - - if cfg.reward_computation == "nico": - reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 - elif cfg.reward_computation == "massimo": - reward = float(raw_reward>0) - if reward == 0.0: - reward = -1.0 - reward *= 0.98 ** n_page_observations - else: - raise ValueError(f"Invalid reward configuration: {cfg.reward_computation}") - - - # (3) Get LLM calls from Tape - llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] - n_llm_calls = len(llm_calls) - llm_calls: list[LLMCall] = [ - 
LLMCall(**step.metadata.other["llm_call"]) - if isinstance(step.metadata.other["llm_call"], dict) - else step.metadata.other["llm_call"] - for step in llm_calls - ] - - # (4) # For each LLM interaction in the tape, make a training example. - all_finished = 1 - prompt_tokens = [llm_call.prompt_length_tokens for llm_call in llm_calls] - output_tokens = [llm_call.output_length_tokens for llm_call in llm_calls] - training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] - for text in training_texts: - text.reward = reward - all_finished &= 1 if text.input_ids[-1] == llm.tokenizer.eos_token_id else 0 - - latency = time.time() - start_time - agent_time = tape.metadata.result.get("agent_execution_time", -1.0) - env_time = tape.metadata.result.get("environment_execution_time", -1.0) - n_observations = len( - [s for s in tape.steps if isinstance(s, Observation)] - ) # TODO: is this not the same n_page_observations?? - n_other_steps = len(tape.steps) - n_observations - metrics = MiniwobMetrics( - reward=reward, - success=reward > 0.5, - no_error=no_error, - no_answer=reward < 0, - overflow=not all_finished, - n_llm_calls=n_llm_calls, - n_step_errors=n_step_errors, - n_page_observations=n_page_observations, - n_steps=len(tape.steps), - total_execution_time=tape.metadata.result.get("total_execution_time", -1.0), - agent_execution_time=agent_time, - environment_execution_time=env_time, - env_step_time=env_time / n_observations if env_time > 0 and n_observations > 0 else -1.0, - agent_step_time=agent_time / n_other_steps if agent_time > 0 and n_other_steps > 0 else -1.0, - ) - - return RolloutResult( - training_texts=training_texts, - metrics=metrics, - latency=latency, - dataset_name=problem["dataset"], - prompt_tokens=prompt_tokens, - output_tokens=output_tokens, - ) + execution_time = time.perf_counter() - ex_t finally: + close_t = time.perf_counter() environment.close() + env_close_time = time.perf_counter() - close_t + logger.info( + f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']}, times: start {env_start_time:.2f} sec, exec {execution_time:.2f} sec, close {env_close_time:.2f} sec, produced tape with {len(tape.steps)} steps" + ) + total_execution_time = time.perf_counter() - t + tape.metadata.result.update({"total_execution_time": total_execution_time, "env_start_time": env_start_time, "env_agent_creation_time": env_agent_creation_time, "execution_time": execution_time, "env_close_time": env_close_time}) + + # save the tape as we go + if cfg.save_tapes: + try: + tape_name = problem.get("_task_id", tape.metadata.id) + save_json_tape(tape, os.path.join(cfg.output_dir, "tapes"), tape_name) + except Exception as e: + logger.error(f"Error saving tape {tape_name}: {e}") + + # (3) Compute rewards + obs_steps = [step for step in tape if isinstance(step, Observation)] + if obs_steps: + last_obs = obs_steps[-1] + # in Miniwob, the observation "reward" is defined as RAW_REWARD_GLOBAL > 0 + # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L188 + # Let's take directly the RAW_REWARD_GLOBAL from the metadata + # raw_reward = last_obs.metadata.other.get("reward", 0.0) + raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) + else: + raw_reward = -1.0 + + # get the number of LLMOutputParsingFailureAction in the tape + n_step_errors = len([step for step in tape.steps if isinstance(step, LLMOutputParsingFailureAction)]) + # get the number of 
PageObservation steps in the tape + n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) + + if obs_steps: + last_obs = obs_steps[-1] + # in Miniwob, the observation "reward" is defined as RAW_REWARD_GLOBAL > 0 + # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L188 + # Let's take directly the RAW_REWARD_GLOBAL from the metadata + # raw_reward = last_obs.metadata.other.get("reward", 0.0) + raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) + else: + raw_reward = -1.0 + + no_error = not tape_contains_an_error(tape) + # get the number of LLMOutputParsingFailureAction in the tape + n_step_errors = len([step for step in tape.steps if isinstance(step, LLMOutputParsingFailureAction)]) + # get the number of PageObservation steps in the tape + n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) + + if cfg.reward_computation == "nico": + reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 + elif cfg.reward_computation == "massimo": + reward = float(raw_reward>0) + if reward == 0.0: + reward = -1.0 + reward *= 0.98 ** n_page_observations + else: + raise ValueError(f"Invalid reward configuration: {cfg.reward_computation}") + + + # (3) Get LLM calls from Tape + llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] + n_llm_calls = len(llm_calls) + llm_calls: list[LLMCall] = [ + LLMCall(**step.metadata.other["llm_call"]) + if isinstance(step.metadata.other["llm_call"], dict) + else step.metadata.other["llm_call"] + for step in llm_calls + ] + llm_call_times = [step.metadata.other.get("llm_call_time") for step in tape.steps if"llm_call_time" in step.metadata.other] + env_call_times = [step.metadata.other.get("action_execution_time") for step in tape.steps if"action_execution_time" in step.metadata.other] + total_llm_call_time = sum(llm_call_times) + total_env_call_time = sum(env_call_times) + llm_call_time = total_llm_call_time / len(llm_call_times) if len(llm_call_times) > 0 else -1.0 + env_call_time = total_env_call_time / len(env_call_times) if len(env_call_times) > 0 else -1.0 + + # (4) # For each LLM interaction in the tape, make a training example. + all_finished = 1 + prompt_tokens = [llm_call.prompt_length_tokens for llm_call in llm_calls] + output_tokens = [llm_call.output_length_tokens for llm_call in llm_calls] + training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] + for text in training_texts: + text.reward = reward + all_finished &= 1 if text.input_ids[-1] == llm.tokenizer.eos_token_id else 0 + + latency = time.perf_counter() - start_time + agent_time = tape.metadata.result.get("agent_execution_time", -1.0) + env_time = tape.metadata.result.get("environment_execution_time", -1.0) + n_observations = len( + [s for s in tape.steps if isinstance(s, Observation)] + ) # TODO: is this not the same n_page_observations?? 
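# --- Editor's aside (illustration, not part of the patch series) -----------
# The hunk above folds per-step timing metadata ("llm_call_time",
# "action_execution_time") into totals and per-step means for MiniwobMetrics.
# A minimal, self-contained sketch of that aggregation is shown below;
# summarize_step_timings is a hypothetical helper, not a function from the repo.
def summarize_step_timings(step_metadata: list[dict]) -> dict[str, float]:
    """Aggregate per-step timings into totals and means (-1.0 when no samples)."""
    llm_times = [float(m["llm_call_time"]) for m in step_metadata if "llm_call_time" in m]
    env_times = [float(m["action_execution_time"]) for m in step_metadata if "action_execution_time" in m]
    return {
        "total_llm_call_time": sum(llm_times),
        "total_env_call_time": sum(env_times),
        "llm_call_time": sum(llm_times) / len(llm_times) if llm_times else -1.0,
        "env_call_time": sum(env_times) / len(env_times) if env_times else -1.0,
    }

# Example: two agent steps carry LLM timings, the observation step carries none.
assert summarize_step_timings(
    [{"llm_call_time": 1.5, "action_execution_time": 0.5}, {"llm_call_time": 2.5}, {}]
)["llm_call_time"] == 2.0
# ---------------------------------------------------------------------------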
+    n_other_steps = len(tape.steps) - n_observations
+    metrics = MiniwobMetrics(
+        reward=reward,
+        success=reward > 0.5,
+        no_error=no_error,
+        no_answer=reward < 0,
+        overflow=not all_finished,
+        n_llm_calls=n_llm_calls,
+        n_step_errors=n_step_errors,
+        n_page_observations=n_page_observations,
+        n_steps=len(tape.steps),
+        total_execution_time=total_execution_time,
+        env_start_time=env_start_time,
+        env_close_time=env_close_time,
+        env_agent_creation_time=env_agent_creation_time,
+        agent_execution_time=agent_time,
+        environment_execution_time=env_time,
+        env_step_time=env_time / n_observations if env_time > 0 and n_observations > 0 else -1.0,
+        agent_step_time=agent_time / n_other_steps if agent_time > 0 and n_other_steps > 0 else -1.0,
+        llm_call_time=llm_call_time,
+        env_call_time=env_call_time,
+        total_llm_call_time=total_llm_call_time,
+        total_env_call_time=total_env_call_time,
+    )
+
+    return RolloutResult(
+        training_texts=training_texts,
+        metrics=metrics,
+        latency=latency,
+        dataset_name=problem["dataset"],
+        prompt_tokens=prompt_tokens,
+        output_tokens=output_tokens,
+    )
 
 
 async def generate_miniwob_rollout_async(

From 2e3a218f687985aac26694f3a0804943dfc14ae0 Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko
Date: Thu, 30 Oct 2025 13:50:41 +0000
Subject: [PATCH 119/126] common actor loop for submitting and retrieving tasks

---
 pipelinerl/actor.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py
index 98c453f6..8478288f 100644
--- a/pipelinerl/actor.py
+++ b/pipelinerl/actor.py
@@ -451,6 +451,7 @@ def run(self, dataset: list[tuple[str, dict]]):
         can_submit_before_update = math.inf
 
         logger.info(f"Start {'train' if self.is_training else 'test'} actor loop")
+        rollouts_last_minute = []
         with (
             write_to_streams(self.data_stream, "a") as data_stream_writer,
             write_to_streams(self.stats_stream, "a") as stats_writer,
@@ -481,8 +482,7 @@ def run(self, dataset: list[tuple[str, dict]]):
                         assert False, "Problem queue was not full just a moment ago, but now it is full"
                 except StopIteration:
                     break
-                else:
-                    break
+                break
 
             # Second, try return a result
             try:
@@ -506,6 +506,7 @@ def run(self, dataset: list[tuple[str, dict]]):
                     samples_in_queue = self.results_ready_to_publish()
                     all_text_dumps = []
                     for r in rollout_results:
+                        rollouts_last_minute.append(time.perf_counter())
                         for text in r.training_texts:
                             all_text_dumps.append(text.model_dump())
                     data_stream_writer.write(all_text_dumps)
@@ -524,6 +525,10 @@ def run(self, dataset: list[tuple[str, dict]]):
                         self.is_training and trainer_version_to_publish is not None
                     ) or self.debug_mode
                     time_to_publish_test_stats = finished_groups == expected_rollouts
+                    time_to_publish_train_stats = True # TODO: remove this
+
+                    # leave only the rollouts that are in the last minute
+                    rollouts_last_minute = [t for t in rollouts_last_minute if t > time.perf_counter() - 60]
 
                     # Publish stats at every new model version or if all tapes are finished
                     if time_to_publish_train_stats or time_to_publish_test_stats:
@@ -537,6 +542,7 @@ def run(self, dataset: list[tuple[str, dict]]):
                             "time_since_start": time.time() - loop_start_time,
                             "groups_in_progress": in_progress,
                             "rollout_errors": self.rollout_errors,
+                            "rollouts_per_min": len(rollouts_last_minute),
                         }
                         trainer_version_to_publish = None
                     else:
@@ -720,7 +726,7 @@ def rollout_async_batch_wrapper(
             self.ray_remote = ray.remote(rollout_async_batch_wrapper)
         else:
             logger.info("Using sync mode")
-            self.ray_remote = 
ray.remote(num_cpus=0)(rollout_wrapper) self.start_time = time.time() def have_capacity(self) -> bool: @@ -730,8 +736,6 @@ def have_capacity(self) -> bool: for llm_url in self.llms_utilization ) have_capacity = have_capacity and have_llm_capacity - if not have_capacity: - time.sleep(0.1) # sleep for a while to avoid quick loops when no capacity return have_capacity def submit_problem(self, problem: dict): @@ -753,6 +757,7 @@ def submit_problem(self, problem: dict): ) llm = self.llms_by_url[llm_url] task_ref = self.ray_remote.remote(self.cfg_dict, llm, problem_batch, self.problem_id) + time.sleep(1.0) # TODO: remove this self.llms_utilization[llm_url] += len(problem_batch) self.unfinished_tasks.append(task_ref) self.problem_id += 1 @@ -886,7 +891,7 @@ def run_actor_loop(cfg: DictConfig): context_size=_context_size, parameters=cfg.llm.parameters, use_cache=False, - collect_logprobs=True, + collect_logprobs=cfg.actor.collect_logprobs, observe_llm_calls=False, ) for url in llm_urls @@ -899,7 +904,7 @@ def run_actor_loop(cfg: DictConfig): context_size=_context_size, parameters=cfg.test_llm.parameters, use_cache=False, - collect_logprobs=True, + collect_logprobs=cfg.actor.collect_logprobs, observe_llm_calls=False, ) for url in llm_urls From 65405b96c8b5c57760d0f2337e009695477a1f41 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 30 Oct 2025 14:07:41 +0000 Subject: [PATCH 120/126] fix --- conf/base.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/base.yaml b/conf/base.yaml index 7008ab75..bff0c19c 100644 --- a/conf/base.yaml +++ b/conf/base.yaml @@ -20,6 +20,7 @@ actor: throughput_window_size: 50 shared_memory_entry_size: 10000000 async_batch_size: 4 + collect_logprobs: true environment: null preprocess: From 32a3102b69ffaea10330f78144b8e542076b20f8 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 30 Oct 2025 17:33:30 +0000 Subject: [PATCH 121/126] remove train stats crutch --- pipelinerl/actor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 8478288f..a1f3dcca 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -525,7 +525,6 @@ def run(self, dataset: list[tuple[str, dict]]): self.is_training and trainer_version_to_publish is not None ) or self.debug_mode time_to_publish_test_stats = finished_groups == expected_rollouts - time_to_publish_train_stats = True # TODO: remove this # leave only the rollouts that are in the last minute rollouts_last_minute = [t for t in rollouts_last_minute if t > time.perf_counter() - 60] From 1397e8ab91551259cb451e5eeb5eb94d94db6c1d Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 28 Nov 2025 12:36:26 +0000 Subject: [PATCH 122/126] clean miniwob rollout --- pipelinerl/domains/miniwob/rollouts.py | 223 ++++++++++++++++--------- 1 file changed, 143 insertions(+), 80 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 320ba614..0779f801 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -1,5 +1,4 @@ import asyncio -import json import logging import os import random @@ -7,25 +6,22 @@ import traceback import aiohttp -from examples.rl_webagent.environment import WebEnvironment -from examples.rl_webagent.steps import WebTape from hydra.utils import instantiate from omegaconf import DictConfig from tapeagents.agent import DEFAULT, Agent -from tapeagents.core import LLMCall, LLMOutputParsingFailureAction, Observation +from tapeagents.core import LLMOutputParsingFailureAction, 
Observation from tapeagents.io import save_json_tape -from tapeagents.llms.trainable import TrainableLLM -from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config +from tapeagents.orchestrator import async_execute_agent, execute_agent from tapeagents.remote_environment import AsyncRemoteEnvironment from tapeagents.tools.simple_browser import PageObservation from pipelinerl.async_llm import make_training_text +from pipelinerl.domains.miniwob.environment import WebEnvironment +from pipelinerl.domains.miniwob.steps import WebTape from pipelinerl.llm import LLMCall, TrainableLLM from pipelinerl.rollouts import BaseMetrics, RolloutResult from pipelinerl.world import Job -from .steps import WebTape - logger = logging.getLogger(__name__) @@ -67,26 +63,34 @@ def tape_contains_an_error(tape: WebTape) -> bool: or (isinstance(tape.steps[-1], PageObservation) and bool(tape.steps[-1].error)) ) + async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession) -> dict: """Check environment server health via HTTP API.""" try: url = f"http://{env_job.hostname}:{env_job.port}/health" - async with session.get(url, timeout=5) as response: + async with session.get(url, timeout=aiohttp.ClientTimeout(total=5.0)) as response: if response.status == 200: health_data = await response.json() - return { - "healthy": True, - "health_data": health_data, - "last_check": time.time() - } + return {"healthy": True, "health_data": health_data, "last_check": time.time()} else: error_text = await response.text() - return {"healthy": False, "error_message": f"HTTP {response.status}: {error_text}", "last_check": time.time()} + return { + "healthy": False, + "error_message": f"HTTP {response.status}: {error_text}", + "last_check": time.time(), + } except Exception as e: exception_type = type(e).__name__ exception_message = str(e) if str(e) else "No message available" - logger.exception(f"Error checking environment server health: {exception_type}: {exception_message}", stack_info=True) - return {"healthy": False, "error_message": f"Exception: {exception_type}: {exception_message}", "last_check": time.time(), "error_stacktrace": traceback.format_exc()} + logger.exception( + f"Error checking environment server health: {exception_type}: {exception_message}", stack_info=True + ) + return { + "healthy": False, + "error_message": f"Exception: {exception_type}: {exception_message}", + "last_check": time.time(), + "error_stacktrace": traceback.format_exc(), + } def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) -> RolloutResult: @@ -98,13 +102,18 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) # get training text from llm calls start_time = time.perf_counter() - - agent, env = get_agent_and_env_from_config(cfg) + environment: WebEnvironment = instantiate(cfg.environment) + environment.initialize() + logger.info(f"Environment tools: {environment.tools_description()}") + agent: Agent = instantiate( + cfg.agent, + known_actions=environment.actions(), + tools_description=environment.tools_description(), + llms={DEFAULT: llm}, + ) + logger.info(f"Agent and environment loaded, using llm {llm.model_name} at {llm.get_base_url()}") env_agent_creation_time = time.perf_counter() - start_time - environment: WebEnvironment = env try: - agent.llms = {DEFAULT: llm} - logger.info(f"Agent and environment loaded, using llm {llm.model_name} at {llm.get_base_url()}") start_attempts = cfg.start_attempts t = time.perf_counter() while 
True: @@ -115,7 +124,9 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) logger.exception(f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']}: {e}") start_attempts -= 1 if start_attempts <= 0: - raise Exception(f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']} after {cfg.start_attempts} attempts") + raise Exception( + f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']} after {cfg.start_attempts} attempts" + ) else: logger.warning("retry after 1 seconds") time.sleep(1) @@ -125,7 +136,7 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) ) logger.info(f"Running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']}") ex_t = time.perf_counter() - tape = execute_agent(agent, tape, env, max_loops=cfg.agent_max_loops) + tape = execute_agent(agent, tape, environment, max_loops=cfg.agent_max_loops) execution_time = time.perf_counter() - ex_t finally: close_t = time.perf_counter() @@ -135,12 +146,20 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']}, times: start {env_start_time:.2f} sec, exec {execution_time:.2f} sec, close {env_close_time:.2f} sec, produced tape with {len(tape.steps)} steps" ) total_execution_time = time.perf_counter() - t - tape.metadata.result.update({"total_execution_time": total_execution_time, "env_start_time": env_start_time, "env_agent_creation_time": env_agent_creation_time, "execution_time": execution_time, "env_close_time": env_close_time}) + tape.metadata.result.update( + { + "total_execution_time": total_execution_time, + "env_start_time": env_start_time, + "env_agent_creation_time": env_agent_creation_time, + "execution_time": execution_time, + "env_close_time": env_close_time, + } + ) # save the tape as we go if cfg.save_tapes: + tape_name = problem.get("_task_id", tape.metadata.id) try: - tape_name = problem.get("_task_id", tape.metadata.id) save_json_tape(tape, os.path.join(cfg.output_dir, "tapes"), tape_name) except Exception as e: logger.error(f"Error saving tape {tape_name}: {e}") @@ -179,24 +198,23 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) if cfg.reward_computation == "uic": - reward = float(raw_reward>0) + reward = float(raw_reward > 0) if reward == 0.0: reward = -1.0 - reward *= 0.98 ** n_page_observations + reward *= 0.98**n_page_observations else: reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 # (3) Get LLM calls from Tape - llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] - n_llm_calls = len(llm_calls) llm_calls: list[LLMCall] = [ LLMCall(**step.metadata.other["llm_call"]) if isinstance(step.metadata.other["llm_call"], dict) else step.metadata.other["llm_call"] - for step in llm_calls + for step in tape.steps + if "llm_call" in step.metadata.other ] - llm_call_times = [step.metadata.other.get("llm_call_time") for step in tape.steps if"llm_call_time" in step.metadata.other] - env_call_times = [step.metadata.other.get("action_execution_time") for step in tape.steps if"action_execution_time" in step.metadata.other] + llm_call_times = [float(step.metadata.other.get("llm_call_time", 0.0)) for step in tape.steps] + env_call_times = 
[float(step.metadata.other.get("action_execution_time", 0.0)) for step in tape.steps] total_llm_call_time = sum(llm_call_times) total_env_call_time = sum(env_call_times) llm_call_time = total_llm_call_time / len(llm_call_times) if len(llm_call_times) > 0 else -1.0 @@ -204,8 +222,6 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) # (4) # For each LLM interaction in the tape, make a training example. all_finished = 1 - prompt_tokens = [llm_call.prompt_length_tokens for llm_call in llm_calls] - output_tokens = [llm_call.output_length_tokens for llm_call in llm_calls] training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] for text in training_texts: text.reward = reward @@ -214,9 +230,7 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) latency = time.perf_counter() - start_time agent_time = tape.metadata.result.get("agent_execution_time", -1.0) env_time = tape.metadata.result.get("environment_execution_time", -1.0) - n_observations = len( - [s for s in tape.steps if isinstance(s, Observation)] - ) # TODO: is this not the same n_page_observations?? + n_observations = len([s for s in tape.steps if isinstance(s, Observation)]) n_other_steps = len(tape.steps) - n_observations metrics = MiniwobMetrics( reward=reward, @@ -224,7 +238,7 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) no_error=no_error, no_answer=reward < 0, overflow=not all_finished, - n_llm_calls=n_llm_calls, + n_llm_calls=len(llm_calls), n_step_errors=n_step_errors, n_page_observations=n_page_observations, n_steps=len(tape.steps), @@ -247,8 +261,6 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) metrics=metrics, latency=latency, dataset_name=problem["dataset"], - prompt_tokens=prompt_tokens, - output_tokens=output_tokens, ) @@ -267,9 +279,9 @@ async def generate_miniwob_rollout_async( # get training text from llm calls start_time = time.time() - + # Overall timeout for the entire rollout to prevent hanging - rollout_timeout = getattr(cfg, 'rollout_timeout', 600) # 10 minutes default + rollout_timeout = getattr(cfg, "rollout_timeout", 600) # 10 minutes default env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] env_jobs_url_tried = [] @@ -277,7 +289,9 @@ async def generate_miniwob_rollout_async( # Try each environment server with health checks until one of them returns a rollout result for _ in range(len(env_jobs)): # Choose the next environment server to try randomly from the ones that have not been tried yet - env_job = random.choice([job for job in env_jobs if f"http://{job.hostname}:{job.port}" not in env_jobs_url_tried]) + env_job = random.choice( + [job for job in env_jobs if f"http://{job.hostname}:{job.port}" not in env_jobs_url_tried] + ) env_job_url = f"http://{env_job.hostname}:{env_job.port}" env_jobs_url_tried.append(env_job_url) @@ -295,24 +309,30 @@ async def generate_miniwob_rollout_async( # Execute the entire rollout with a timeout return await asyncio.wait_for( _execute_rollout_with_timeout(cfg, llm, problem, session, start_time, env_job_url), - timeout=rollout_timeout + timeout=rollout_timeout, ) except asyncio.TimeoutError: health = await check_env_server_health(env_job, session) if stack_trace := health.get("error_stacktrace"): logger.warning(f"Get health error stacktrace: {stack_trace}") logger.warning(f"Rollout timeout error stacktrace: {traceback.format_exc()}") - logger.warning(f"Rollout timed out after {rollout_timeout} seconds 
for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {health}. Trying next server.") + logger.warning( + f"Rollout timed out after {rollout_timeout} seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {health}. Trying next server." + ) continue except Exception as e: health = await check_env_server_health(env_job, session) if stack_trace := health.get("error_stacktrace"): logger.warning(f"Get health error stacktrace: {stack_trace}") logger.warning(f"Rollout failed error stacktrace: {traceback.format_exc()}") - logger.warning(f"Rollout failed for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {health}. Trying next server.") + logger.warning( + f"Rollout failed for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {health}. Trying next server." + ) continue # If all servers failed - logger.error(f"All environment servers failed for task {problem['dataset']}/{problem['task']}/{problem['seed']}. Returning a failed rollout result.") + logger.error( + f"All environment servers failed for task {problem['dataset']}/{problem['task']}/{problem['seed']}. Returning a failed rollout result." + ) return _create_failed_rollout_result(problem, start_time, "all environment servers failed") @@ -326,34 +346,41 @@ async def _execute_rollout_with_timeout( ) -> RolloutResult: # (2) Generate environment, TapeAgent, and run them to get a Tape no_error = True # track if there was an error in the tape + t = time.perf_counter() environment = AsyncRemoteEnvironment(server_url=env_job_url) # type: ignore async with environment.acontext(session, wait_for_env=True) as env: + env_agent_creation_time = time.perf_counter() - t start_attempts = cfg.start_attempts t = time.perf_counter() + tape_dict = {} while start_attempts > 0: try: tape_dict, info = await env.start_task(problem) if info.get("error"): - raise ValueError(info['error']) + raise ValueError(info["error"]) break except Exception as e: start_attempts -= 1 - logger.warning(f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']}. {start_attempts} attempts remaining. Error: {e}") + logger.warning( + f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']}. {start_attempts} attempts remaining. Error: {e}" + ) if start_attempts <= 0: logger.error(f"Failed to start task after all retry attempts: {e}") no_error = False - tape_dict = {} break else: logger.warning("Retry start task after 5 seconds.") await asyncio.sleep(5) + env_start_time = time.perf_counter() - t logger.info( - f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds. Worker ID: {env.worker_id}. Tape dict: {tape_dict}" + f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {env_start_time:.2f} seconds. Worker ID: {env.worker_id}. 
Tape dict: {tape_dict}" ) tape: WebTape = WebTape(**tape_dict) # convert http response dict to WebTape object t = time.perf_counter() if no_error: # only run the agent if the task started successfully - logger.info(f"Running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}") + logger.info( + f"Running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}" + ) agent_attempts = cfg.agent_attempts while agent_attempts > 0: # check if the worker is alive. @@ -361,43 +388,63 @@ async def _execute_rollout_with_timeout( # this will either raise RuntimeError if worker is not alive anymore, or return a dictionary with the worker status worker_status = await env.check_worker_alive() if worker_status.get("status") == "starting": - logger.warning(f"Worker {env.worker_id} for task {problem['dataset']}/{problem['task']}/{problem['seed']} and tape ID {tape.metadata.id} is starting, waiting 5 seconds for it to be fully started.") + logger.warning( + f"Worker {env.worker_id} for task {problem['dataset']}/{problem['task']}/{problem['seed']} and tape ID {tape.metadata.id} is starting, waiting 5 seconds for it to be fully started." + ) await asyncio.sleep(5) continue except Exception as e: # if worker is dead, no need to retry - logger.exception(f"Worker {env.worker_id} for task {problem['dataset']}/{problem['task']}/{problem['seed']} and tape ID {tape.metadata.id} is dead. Error: {e}", stack_info=True) + logger.exception( + f"Worker {env.worker_id} for task {problem['dataset']}/{problem['task']}/{problem['seed']} and tape ID {tape.metadata.id} is dead. Error: {e}", + stack_info=True, + ) no_error = False break # if worker is alive, run the agent try: + t = time.perf_counter() actions = await env.a_actions() tools_description = await env.a_tools_description() agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) - agent.llms = {DEFAULT: llm} + agent.llms = {DEFAULT: llm} # type: ignore + env_agent_creation_time += time.perf_counter() - t tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) # Check if the tape has an error from the orchestrator (e.g., SocketTimeoutError, RuntimeError: Worker is not alive, etc.) if tape.metadata.error: - logger.error(f"Agent execution for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id} returned a tape with error: {tape.metadata.error}") + logger.error( + f"Agent execution for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id} returned a tape with error: {tape.metadata.error}" + ) raise ValueError(tape.metadata.error) else: # Success - break out of retry loop - logger.info(f"Agent execution for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id} finished successfully") + logger.info( + f"Agent execution for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id} finished successfully" + ) break except Exception as e: agent_attempts -= 1 - logger.warning(f"Error occurred while running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}. 
{agent_attempts} attempts remaining. Error: {e}") + logger.warning( + f"Error occurred while running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}. {agent_attempts} attempts remaining. Error: {e}" + ) if agent_attempts <= 0: - logger.error(f"Agent execution failed after all retry attempts for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}: {e}") + logger.error( + f"Agent execution failed after all retry attempts for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}: {e}" + ) no_error = False break else: - logger.warning(f"Retry agent execution after 5 seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}.") + logger.warning( + f"Retry agent execution after 5 seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}." + ) await asyncio.sleep(5) logger.info( f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds with worker ID: {env.worker_id} and tape ID {tape.metadata.id}" ) tape.metadata.result.update({"total_execution_time": time.perf_counter() - t}) + t = time.perf_counter() + await env.aclose() + env_close_time = time.perf_counter() - t # save the tape as we go if cfg.save_tapes: @@ -422,26 +469,24 @@ async def _execute_rollout_with_timeout( n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) if cfg.reward_computation == "uic": - reward = float(raw_reward>0) + reward = float(raw_reward > 0) if reward == 0.0: reward = -1.0 - reward *= 0.98 ** n_page_observations + reward *= 0.98**n_page_observations else: reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 # (3) Get LLM calls from Tape - llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] - n_llm_calls = len(llm_calls) llm_calls: list[LLMCall] = [ - LLMCall(**step.metadata.other["llm_call"]) if isinstance(step.metadata.other["llm_call"], dict) + LLMCall(**step.metadata.other["llm_call"]) + if isinstance(step.metadata.other["llm_call"], dict) else step.metadata.other["llm_call"] - for step in llm_calls + for step in tape.steps + if "llm_call" in step.metadata.other ] # (4) # For each LLM interaction in the tape, make a training example. 
all_finished = 1 - prompt_tokens = [llm_call.prompt_length_tokens for llm_call in llm_calls] - output_tokens = [llm_call.output_length_tokens for llm_call in llm_calls] training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] for text in training_texts: text.reward = reward @@ -452,13 +497,21 @@ async def _execute_rollout_with_timeout( env_time = tape.metadata.result.get("environment_execution_time", -1.0) n_observations = len([s for s in tape.steps if isinstance(s, Observation)]) n_other_steps = len(tape.steps) - n_observations + + llm_call_times = [float(step.metadata.other.get("llm_call_time", 0.0)) for step in tape.steps] + env_call_times = [float(step.metadata.other.get("action_execution_time", 0.0)) for step in tape.steps] + total_llm_call_time = sum(llm_call_times) + total_env_call_time = sum(env_call_times) + llm_call_time = total_llm_call_time / len(llm_call_times) if len(llm_call_times) > 0 else -1.0 + env_call_time = total_env_call_time / len(env_call_times) if len(env_call_times) > 0 else -1.0 + metrics = MiniwobMetrics( reward=reward, success=reward > 0.5, no_error=no_error, no_answer=reward < 0, overflow=not all_finished, - n_llm_calls=n_llm_calls, + n_llm_calls=len(llm_calls), n_step_errors=n_step_errors, n_page_observations=n_page_observations, n_steps=len(tape.steps), @@ -467,6 +520,13 @@ async def _execute_rollout_with_timeout( environment_execution_time=env_time, env_step_time=env_time / n_observations if env_time > 0 and n_observations > 0 else -1.0, agent_step_time=agent_time / n_other_steps if agent_time > 0 and n_other_steps > 0 else -1.0, + llm_call_time=llm_call_time, + env_call_time=env_call_time, + total_llm_call_time=total_llm_call_time, + total_env_call_time=total_env_call_time, + env_start_time=env_start_time, + env_close_time=env_close_time, + env_agent_creation_time=env_agent_creation_time, ) return RolloutResult( @@ -474,15 +534,13 @@ async def _execute_rollout_with_timeout( metrics=metrics, latency=latency, dataset_name=problem["dataset"], - prompt_tokens=prompt_tokens, - output_tokens=output_tokens, ) def _create_failed_rollout_result(problem: dict, start_time: float, error_type: str) -> RolloutResult: """Create a failed rollout result for timeout or other errors.""" latency = time.time() - start_time - + # Create empty training texts and metrics for failed rollout metrics = MiniwobMetrics( reward=-1.0, @@ -495,17 +553,22 @@ def _create_failed_rollout_result(problem: dict, start_time: float, error_type: n_page_observations=0, n_steps=0, total_execution_time=latency, - agent_execution_time=-1.0, - environment_execution_time=-1.0, - env_step_time=-1.0, - agent_step_time=-1.0, + agent_execution_time=0.0, + environment_execution_time=0.0, + env_step_time=0.0, + agent_step_time=0.0, + llm_call_time=0.0, + env_call_time=0.0, + total_llm_call_time=0.0, + total_env_call_time=0.0, + env_start_time=0.0, + env_close_time=0.0, + env_agent_creation_time=0.0, ) - + return RolloutResult( training_texts=[], metrics=metrics, latency=latency, dataset_name=problem["dataset"], - prompt_tokens=[], - output_tokens=[], ) From 524fa83c6df4d51c70119b0158ce1dab91b59684 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 28 Nov 2025 13:41:11 +0000 Subject: [PATCH 123/126] remove duplicate training text definition --- pipelinerl/llm.py | 37 ++----------------------------------- 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/pipelinerl/llm.py b/pipelinerl/llm.py index cc099c15..96950231 100644 --- a/pipelinerl/llm.py +++ b/pipelinerl/llm.py 
@@ -21,45 +21,12 @@ from pydantic import BaseModel, Field, TypeAdapter from tenacity import retry, stop_after_attempt, wait_exponential +from pipelinerl.rollouts import TrainingText + logger = logging.getLogger(__name__) PIPELINERL_LLM_TOKEN = "PIPELINERL_LLM_TOKEN" -class TrainingText(BaseModel): - """ - Training text instance used to finetune a language model. - - Attributes: - text (str): The full text of the training instance. - n_predicted (int): The number of predicted characters in the text. - reward (float): The reward associated with the training instance. Defaults to 0.0. - logprobs (List[float]): A list of log probabilities of the completion tokens from the assistant model. - ref_logprobs (List[float]): A list of reference log probabilities of the completion tokens from the reference model. - input_ids (List[int]): The tokenized input ids of the text. - labels (List[int]): The tokenized labels of the text (i.e., masked token ids for the prompt and regular token ids for the prediction). - group_id (str, optional): ID of the group. It is used by the RL finetuning script to normalize rewards. - prompt_text (str): Portion of the text that serves as the prompt (i.e., the text excluding the predicted characters). - output_text (str): Portion of the text that represents the predicted output (i.e., the last n_predicted characters). - """ - - text: str - n_predicted: int - reward: float = 0.0 - logprobs: list[float] = Field(default_factory=list) - ref_logprobs: list[float] = Field(default_factory=list) - input_ids: list[int] = Field(default_factory=list) - labels: list[int] = Field(default_factory=list) - group_id: str | None = None - metadata: dict = Field(default_factory=dict) - - @property - def prompt_text(self) -> str: - return self.text[: -self.n_predicted] - - @property - def output_text(self) -> str: - return self.text[-self.n_predicted :] - class Prompt(BaseModel): """ From 6c0c9a606ec7e689c898112831325e7b360fd9a1 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 28 Nov 2025 13:41:30 +0000 Subject: [PATCH 124/126] refactor and clean up miniwob rollout --- pipelinerl/domains/miniwob/rollouts.py | 621 ++++++++++--------------- 1 file changed, 257 insertions(+), 364 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 0779f801..9392efe6 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -4,6 +4,7 @@ import random import time import traceback +from typing import Literal import aiohttp from hydra.utils import instantiate @@ -18,38 +19,41 @@ from pipelinerl.async_llm import make_training_text from pipelinerl.domains.miniwob.environment import WebEnvironment from pipelinerl.domains.miniwob.steps import WebTape -from pipelinerl.llm import LLMCall, TrainableLLM +from pipelinerl.llm import LLMCall, TrainableLLM, TrainingText from pipelinerl.rollouts import BaseMetrics, RolloutResult from pipelinerl.world import Job logger = logging.getLogger(__name__) +def task_id(problem: dict) -> str: + """Format task identifier for logging.""" + return f"{problem['dataset']}/{problem['task']}/{problem['seed']}" + + class MiniwobMetrics(BaseMetrics): - reward: float - success: bool - no_error: bool - no_answer: bool - overflow: bool - n_llm_calls: int - n_step_errors: int - n_page_observations: int - n_steps: int - total_execution_time: float - env_start_time: float - env_close_time: float - env_agent_creation_time: float - agent_execution_time: float - environment_execution_time: float - 
env_step_time: float - agent_step_time: float - llm_call_time: float - env_call_time: float - total_llm_call_time: float - total_env_call_time: float - - -def tape_contains_an_error(tape: WebTape) -> bool: + reward: float = -1.0 + success: bool = False + has_error: bool = False + no_answer: bool = True + overflow: bool = False + n_llm_calls: int = 0 + n_step_errors: int = 0 + n_observations: int = 0 + n_steps: int = 0 + env_creation_time: float = 0.0 + agent_creation_time: float = 0.0 + env_start_time: float = 0.0 + env_close_time: float = 0.0 + agent_execution_time: float = 0.0 + total_execution_time: float = 0.0 + llm_call_time: float = 0.0 + env_step_time: float = 0.0 + total_llm_call_time: float = 0.0 + total_env_call_time: float = 0.0 + + +def _tape_contains_an_error(tape: WebTape) -> bool: """ Returns true if the tape ends with an error, ie if one of the following is true: - the last step is an LLMOutputParsingFailureAction @@ -64,6 +68,118 @@ def tape_contains_an_error(tape: WebTape) -> bool: ) +def _compute_reward( + tape: WebTape, + reward_computation: Literal["uic", "default"] = "default", + has_error: bool = False, +) -> tuple[float, bool]: + """ + Compute reward from tape. + + Args: + tape: The execution tape + cfg: Configuration with reward_computation setting + has_error: If there were errors during execution + + Returns: + tuple of (reward, has_error) + """ + # Extract raw reward from last observation + obs_steps = [step for step in tape if isinstance(step, Observation)] + if obs_steps: + last_obs = obs_steps[-1] + raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) + else: + raw_reward = -1.0 + + # Count errors and page observations + n_step_errors = len([step for step in tape.steps if isinstance(step, LLMOutputParsingFailureAction)]) + n_observations = len([step for step in tape.steps if isinstance(step, Observation)]) + + # Determine if tape has errors + has_error = has_error or _tape_contains_an_error(tape) + + # Compute final reward based on configuration + if reward_computation == "uic": + reward = float(raw_reward > 0) + if reward == 0.0: + reward = -1.0 + reward *= 0.98**n_observations + else: + reward = raw_reward * 0.99**n_step_errors if not has_error and raw_reward >= 0 else -1.0 + + return reward, has_error + + +def _extract_llm_calls(tape: WebTape) -> list[LLMCall]: + """Extract LLM calls from tape steps.""" + return [ + LLMCall(**step.metadata.other["llm_call"]) + if isinstance(step.metadata.other["llm_call"], dict) + else step.metadata.other["llm_call"] + for step in tape.steps + if "llm_call" in step.metadata.other + ] + + +def _compute_metrics( + tape: WebTape, + training_texts: list[TrainingText], + reward: float, + has_error: bool, + n_llm_calls: int, +) -> MiniwobMetrics: + # Create training texts + has_overflow = False + for text in training_texts: + text.reward = reward + has_overflow |= not text.finished + + # Extract timing information + llm_call_times = [float(step.metadata.other.get("llm_call_time", 0.0)) for step in tape.steps] + env_call_times = [float(step.metadata.other.get("action_execution_time", 0.0)) for step in tape.steps] + total_llm_call_time = sum(llm_call_times) + total_env_call_time = sum(env_call_times) + llm_call_time = total_llm_call_time / len(llm_call_times) if llm_call_times else -1.0 + env_step_time = total_env_call_time / len(env_call_times) if env_call_times else -1.0 + env_start_time = tape.metadata.result.get("env_start_time", -1.0) + env_close_time = 
tape.metadata.result.get("env_close_time", -1.0) + env_creation_time = tape.metadata.result.get("env_creation_time", -1) + agent_creation_time = tape.metadata.result.get("agent_creation_time", -1) + agent_execution_time = tape.metadata.result.get("agent_execution_time", -1.0) + total_execution_time = tape.metadata.result.get("total_execution_time", -1.0) + + # Compute step counts + n_observations = len([s for s in tape.steps if isinstance(s, Observation)]) + n_step_errors = len([step for step in tape.steps if isinstance(step, LLMOutputParsingFailureAction)]) + + metrics = MiniwobMetrics( + reward=reward, + success=reward > 0.5, + has_error=has_error, + no_answer=reward < 0, + overflow=has_overflow, + n_llm_calls=n_llm_calls, + n_step_errors=n_step_errors, + n_steps=len(tape.steps), + n_observations=n_observations, + + env_creation_time=env_creation_time, + env_start_time=env_start_time, + env_close_time=env_close_time, + + agent_creation_time=agent_creation_time, + agent_execution_time=agent_execution_time, + + llm_call_time=llm_call_time, + env_step_time=env_step_time, + total_llm_call_time=total_llm_call_time, + total_env_call_time=total_env_call_time, + total_execution_time=total_execution_time, + ) + return metrics + + async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession) -> dict: """Check environment server health via HTTP API.""" try: @@ -74,11 +190,8 @@ async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession) return {"healthy": True, "health_data": health_data, "last_check": time.time()} else: error_text = await response.text() - return { - "healthy": False, - "error_message": f"HTTP {response.status}: {error_text}", - "last_check": time.time(), - } + health_data = f"HTTP {response.status}: {error_text}" + return {"healthy": False, "health_data": health_data, "last_check": time.time()} except Exception as e: exception_type = type(e).__name__ exception_message = str(e) if str(e) else "No message available" @@ -87,24 +200,36 @@ async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession) ) return { "healthy": False, - "error_message": f"Exception: {exception_type}: {exception_message}", + "health_data": f"Exception: {exception_type}: {exception_message}", "last_check": time.time(), "error_stacktrace": traceback.format_exc(), } def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) -> RolloutResult: - # make agent and env - # set the llm - # run the agent - # get llm calls from tape - # compute rewards - # get training text from llm calls - + """ + Generate a MiniWoB rollout. 
Steps: + - make agent and env + - set the llm + - run the agent + - get llm calls from tape + - compute rewards + - get training text from llm calls + + Args: + cfg: Configuration for the rollout + llm: The LLM to use + problem: The problem dict + Returns: + RolloutResult with training texts and metrics + """ + tid = task_id(problem) start_time = time.perf_counter() environment: WebEnvironment = instantiate(cfg.environment) environment.initialize() + env_creation_time = time.perf_counter() - start_time logger.info(f"Environment tools: {environment.tools_description()}") + t = time.perf_counter() agent: Agent = instantiate( cfg.agent, known_actions=environment.actions(), @@ -112,7 +237,7 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) llms={DEFAULT: llm}, ) logger.info(f"Agent and environment loaded, using llm {llm.model_name} at {llm.get_base_url()}") - env_agent_creation_time = time.perf_counter() - start_time + agent_creation_time = time.perf_counter() - t try: start_attempts = cfg.start_attempts t = time.perf_counter() @@ -121,141 +246,51 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) tape, _ = environment.start_task(problem) break except Exception as e: - logger.exception(f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']}: {e}") + logger.exception(f"Failed to start task {tid}: {e}") start_attempts -= 1 if start_attempts <= 0: - raise Exception( - f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']} after {cfg.start_attempts} attempts" - ) + raise Exception(f"Failed to start task {tid} after {cfg.start_attempts} attempts") else: - logger.warning("retry after 1 seconds") + logger.warning("Retrying after 1 second") time.sleep(1) env_start_time = time.perf_counter() - t - logger.info( - f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {env_start_time:.2f} seconds" - ) - logger.info(f"Running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']}") - ex_t = time.perf_counter() + logger.info(f"Task {tid} started in {env_start_time:.2f}s") + t = time.perf_counter() tape = execute_agent(agent, tape, environment, max_loops=cfg.agent_max_loops) - execution_time = time.perf_counter() - ex_t + agent_execution_time = time.perf_counter() - t finally: - close_t = time.perf_counter() + t = time.perf_counter() environment.close() - env_close_time = time.perf_counter() - close_t - logger.info( - f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']}, times: start {env_start_time:.2f} sec, exec {execution_time:.2f} sec, close {env_close_time:.2f} sec, produced tape with {len(tape.steps)} steps" - ) - total_execution_time = time.perf_counter() - t + env_close_time = time.perf_counter() - t + total_execution_time = time.perf_counter() - start_time + logger.info(f"Task {tid} finished in {total_execution_time:.2f}s") tape.metadata.result.update( { "total_execution_time": total_execution_time, + "env_creation_time": env_creation_time, "env_start_time": env_start_time, - "env_agent_creation_time": env_agent_creation_time, - "execution_time": execution_time, "env_close_time": env_close_time, + "agent_creation_time": agent_creation_time, + "agent_execution_time": agent_execution_time, } ) # save the tape as we go if cfg.save_tapes: - tape_name = problem.get("_task_id", tape.metadata.id) - try: - save_json_tape(tape, os.path.join(cfg.output_dir, "tapes"), tape_name) - except Exception as e: - logger.error(f"Error saving 
tape {tape_name}: {e}") - - # (3) Compute rewards - obs_steps = [step for step in tape if isinstance(step, Observation)] - if obs_steps: - last_obs = obs_steps[-1] - # in Miniwob, the observation "reward" is defined as RAW_REWARD_GLOBAL > 0 - # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L188 - # Let's take directly the RAW_REWARD_GLOBAL from the metadata - # raw_reward = last_obs.metadata.other.get("reward", 0.0) - raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) - else: - raw_reward = -1.0 - - # get the number of LLMOutputParsingFailureAction in the tape - n_step_errors = len([step for step in tape.steps if isinstance(step, LLMOutputParsingFailureAction)]) - # get the number of PageObservation steps in the tape - n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) + _save_tapes(cfg, problem, tape) - if obs_steps: - last_obs = obs_steps[-1] - # in Miniwob, the observation "reward" is defined as RAW_REWARD_GLOBAL > 0 - # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L188 - # Let's take directly the RAW_REWARD_GLOBAL from the metadata - # raw_reward = last_obs.metadata.other.get("reward", 0.0) - raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) - else: - raw_reward = -1.0 - - no_error = not tape_contains_an_error(tape) - # get the number of LLMOutputParsingFailureAction in the tape - n_step_errors = len([step for step in tape.steps if isinstance(step, LLMOutputParsingFailureAction)]) - # get the number of PageObservation steps in the tape - n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) - - if cfg.reward_computation == "uic": - reward = float(raw_reward > 0) - if reward == 0.0: - reward = -1.0 - reward *= 0.98**n_page_observations - else: - reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 - - # (3) Get LLM calls from Tape - llm_calls: list[LLMCall] = [ - LLMCall(**step.metadata.other["llm_call"]) - if isinstance(step.metadata.other["llm_call"], dict) - else step.metadata.other["llm_call"] - for step in tape.steps - if "llm_call" in step.metadata.other - ] - llm_call_times = [float(step.metadata.other.get("llm_call_time", 0.0)) for step in tape.steps] - env_call_times = [float(step.metadata.other.get("action_execution_time", 0.0)) for step in tape.steps] - total_llm_call_time = sum(llm_call_times) - total_env_call_time = sum(env_call_times) - llm_call_time = total_llm_call_time / len(llm_call_times) if len(llm_call_times) > 0 else -1.0 - env_call_time = total_env_call_time / len(env_call_times) if len(env_call_times) > 0 else -1.0 - - # (4) # For each LLM interaction in the tape, make a training example. 
- all_finished = 1 + # Compute reward and metrics + reward, has_error = _compute_reward(tape, cfg.reward_computation) + llm_calls = _extract_llm_calls(tape) training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] - for text in training_texts: - text.reward = reward - all_finished &= 1 if text.input_ids[-1] == llm.tokenizer.eos_token_id else 0 - - latency = time.perf_counter() - start_time - agent_time = tape.metadata.result.get("agent_execution_time", -1.0) - env_time = tape.metadata.result.get("environment_execution_time", -1.0) - n_observations = len([s for s in tape.steps if isinstance(s, Observation)]) - n_other_steps = len(tape.steps) - n_observations - metrics = MiniwobMetrics( - reward=reward, - success=reward > 0.5, - no_error=no_error, - no_answer=reward < 0, - overflow=not all_finished, - n_llm_calls=len(llm_calls), - n_step_errors=n_step_errors, - n_page_observations=n_page_observations, - n_steps=len(tape.steps), - total_execution_time=total_execution_time, - env_start_time=env_start_time, - env_close_time=env_close_time, - env_agent_creation_time=env_agent_creation_time, - agent_execution_time=agent_time, - environment_execution_time=env_time, - env_step_time=env_time / n_observations if env_time > 0 and n_observations > 0 else -1.0, - agent_step_time=agent_time / n_other_steps if agent_time > 0 and n_other_steps > 0 else -1.0, - llm_call_time=llm_call_time, - env_call_time=env_call_time, - total_llm_call_time=total_llm_call_time, - total_env_call_time=total_env_call_time, + metrics = _compute_metrics( + tape, + training_texts, + reward, + has_error, + len(llm_calls), ) - + latency = time.perf_counter() - start_time return RolloutResult( training_texts=training_texts, metrics=metrics, @@ -263,6 +298,13 @@ def generate_miniwob_rollout(cfg: DictConfig, llm: TrainableLLM, problem: dict) dataset_name=problem["dataset"], ) +def _save_tapes(cfg, problem, tape): + tape_name = problem.get("_task_id", tape.metadata.id) + try: + save_json_tape(tape, os.path.join(cfg.output_dir, "tapes"), tape_name) + except Exception as e: + logger.error(f"Error saving tape {tape_name}: {e}") + async def generate_miniwob_rollout_async( cfg: DictConfig, @@ -270,70 +312,46 @@ async def generate_miniwob_rollout_async( problem: dict, session: aiohttp.ClientSession, ) -> RolloutResult: - # choose a random environment server - # Generate environment - # Generate TapeAgent - # run the agent - # get llm calls from tape - # compute rewards - # get training text from llm calls - - start_time = time.time() - - # Overall timeout for the entire rollout to prevent hanging + start_time = time.perf_counter() + tid = task_id(problem) rollout_timeout = getattr(cfg, "rollout_timeout", 600) # 10 minutes default env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] env_jobs_url_tried = [] - # Try each environment server with health checks until one of them returns a rollout result for _ in range(len(env_jobs)): - # Choose the next environment server to try randomly from the ones that have not been tried yet env_job = random.choice( [job for job in env_jobs if f"http://{job.hostname}:{job.port}" not in env_jobs_url_tried] ) env_job_url = f"http://{env_job.hostname}:{env_job.port}" env_jobs_url_tried.append(env_job_url) - # Check server health before using health = await check_env_server_health(env_job, session) if not health["healthy"]: - logger.warning(f"Environment server {env_job_url} is unhealthy: {health}") - logger.warning(f"Get health error stacktrace: 
{health['error_stacktrace']}") + logger.warning(f"Env server {env_job_url} unhealthy: {health.get('health_data', 'unknown')}, skip to next one") continue - # Log health status for monitoring - if health["healthy"]: - logger.info(f"Using healthy environment server {env_job_url}: {health}") + logger.debug(f"Using env server {env_job_url}") try: - # Execute the entire rollout with a timeout return await asyncio.wait_for( _execute_rollout_with_timeout(cfg, llm, problem, session, start_time, env_job_url), timeout=rollout_timeout, ) except asyncio.TimeoutError: - health = await check_env_server_health(env_job, session) - if stack_trace := health.get("error_stacktrace"): - logger.warning(f"Get health error stacktrace: {stack_trace}") - logger.warning(f"Rollout timeout error stacktrace: {traceback.format_exc()}") - logger.warning( - f"Rollout timed out after {rollout_timeout} seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {health}. Trying next server." - ) + logger.warning(f"Task {tid} timed out after {rollout_timeout}s on {env_job_url}") continue except Exception as e: - health = await check_env_server_health(env_job, session) - if stack_trace := health.get("error_stacktrace"): - logger.warning(f"Get health error stacktrace: {stack_trace}") - logger.warning(f"Rollout failed error stacktrace: {traceback.format_exc()}") - logger.warning( - f"Rollout failed for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {health}. Trying next server." - ) + logger.warning(f"Task {tid} failed on {env_job_url}: {e}") continue - # If all servers failed - logger.error( - f"All environment servers failed for task {problem['dataset']}/{problem['task']}/{problem['seed']}. Returning a failed rollout result." + + logger.error(f"Task {tid}: all environment servers failed") + # Return a failed rollout result + return RolloutResult( + training_texts=[], + metrics=MiniwobMetrics(), + latency=time.perf_counter() - start_time, + dataset_name=problem["dataset"], ) - return _create_failed_rollout_result(problem, start_time, "all environment servers failed") async def _execute_rollout_with_timeout( @@ -344,12 +362,13 @@ async def _execute_rollout_with_timeout( start_time: float, env_job_url: str, ) -> RolloutResult: - # (2) Generate environment, TapeAgent, and run them to get a Tape - no_error = True # track if there was an error in the tape - t = time.perf_counter() + tid = task_id(problem) + has_error = False + start_time = time.perf_counter() environment = AsyncRemoteEnvironment(server_url=env_job_url) # type: ignore async with environment.acontext(session, wait_for_env=True) as env: - env_agent_creation_time = time.perf_counter() - t + env_creation_time = time.perf_counter() - start_time + agent_creation_time = 0.0 start_attempts = cfg.start_attempts t = time.perf_counter() tape_dict = {} @@ -361,213 +380,87 @@ async def _execute_rollout_with_timeout( break except Exception as e: start_attempts -= 1 - logger.warning( - f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']}. {start_attempts} attempts remaining. 
Error: {e}" - ) + logger.warning(f"Task {tid} start failed, {start_attempts} attempts left: {e}") if start_attempts <= 0: - logger.error(f"Failed to start task after all retry attempts: {e}") - no_error = False + logger.error(f"Task {tid} start failed after all retries: {e}") + has_error = True break else: - logger.warning("Retry start task after 5 seconds.") await asyncio.sleep(5) env_start_time = time.perf_counter() - t - logger.info( - f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {env_start_time:.2f} seconds. Worker ID: {env.worker_id}. Tape dict: {tape_dict}" - ) - tape: WebTape = WebTape(**tape_dict) # convert http response dict to WebTape object + logger.info(f"Task {tid} started in {env_start_time:.2f}s (worker={env.worker_id})") + tape: WebTape = WebTape(**tape_dict) t = time.perf_counter() - if no_error: # only run the agent if the task started successfully - logger.info( - f"Running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}" - ) + agent_execution_time = 0.0 + if not has_error: agent_attempts = cfg.agent_attempts while agent_attempts > 0: - # check if the worker is alive. try: - # this will either raise RuntimeError if worker is not alive anymore, or return a dictionary with the worker status worker_status = await env.check_worker_alive() if worker_status.get("status") == "starting": - logger.warning( - f"Worker {env.worker_id} for task {problem['dataset']}/{problem['task']}/{problem['seed']} and tape ID {tape.metadata.id} is starting, waiting 5 seconds for it to be fully started." - ) + logger.debug(f"Task {tid}: worker {env.worker_id} starting, waiting...") await asyncio.sleep(5) continue except Exception as e: - # if worker is dead, no need to retry - logger.exception( - f"Worker {env.worker_id} for task {problem['dataset']}/{problem['task']}/{problem['seed']} and tape ID {tape.metadata.id} is dead. Error: {e}", - stack_info=True, - ) - no_error = False + logger.exception(f"Task {tid}: worker {env.worker_id} dead: {e}") + has_error = True break - # if worker is alive, run the agent + try: t = time.perf_counter() actions = await env.a_actions() tools_description = await env.a_tools_description() agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) agent.llms = {DEFAULT: llm} # type: ignore - env_agent_creation_time += time.perf_counter() - t + agent_creation_time = time.perf_counter() - t + t = time.perf_counter() tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) - # Check if the tape has an error from the orchestrator (e.g., SocketTimeoutError, RuntimeError: Worker is not alive, etc.) 
+ agent_execution_time = time.perf_counter() - t if tape.metadata.error: - logger.error( - f"Agent execution for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id} returned a tape with error: {tape.metadata.error}" - ) + logger.error(f"Task {tid}: agent error: {tape.metadata.error}") raise ValueError(tape.metadata.error) - else: - # Success - break out of retry loop - logger.info( - f"Agent execution for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id} finished successfully" - ) - break + logger.info(f"Task {tid}: agent execution succeeded") + break except Exception as e: agent_attempts -= 1 - logger.warning( - f"Error occurred while running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}. {agent_attempts} attempts remaining. Error: {e}" - ) + logger.warning(f"Task {tid}: agent error, {agent_attempts} attempts left: {e}") if agent_attempts <= 0: - logger.error( - f"Agent execution failed after all retry attempts for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}: {e}" - ) - no_error = False + logger.error(f"Task {tid}: agent failed after all retries: {e}") + has_error = True break - else: - logger.warning( - f"Retry agent execution after 5 seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}." - ) - await asyncio.sleep(5) - logger.info( - f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds with worker ID: {env.worker_id} and tape ID {tape.metadata.id}" - ) - tape.metadata.result.update({"total_execution_time": time.perf_counter() - t}) + await asyncio.sleep(5) + + logger.info(f"Task {tid} finished in {time.perf_counter() - t:.2f}s (worker={env.worker_id})") t = time.perf_counter() await env.aclose() env_close_time = time.perf_counter() - t + total_execution_time=time.perf_counter() - start_time + tape.metadata.result.update({ + "total_execution_time": total_execution_time, + "env_creation_time": env_creation_time, + "env_start_time": env_start_time, + "env_close_time": env_close_time, + "agent_creation_time": agent_creation_time, + "agent_execution_time": agent_execution_time, + }) - # save the tape as we go if cfg.save_tapes: - save_json_tape(tape, os.path.join(cfg.output_dir, "tapes"), tape.metadata.id) - - # (3) Compute rewards - obs_steps = [step for step in tape if isinstance(step, Observation)] - if obs_steps: - last_obs = obs_steps[-1] - # in Miniwob, the observation "reward" is defined as RAW_REWARD_GLOBAL > 0 - # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L188 - # Let's take directly the RAW_REWARD_GLOBAL from the metadata - # raw_reward = last_obs.metadata.other.get("reward", 0.0) - raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) - else: - raw_reward = -1.0 + _save_tapes(cfg, problem, tape) - no_error = no_error and not tape_contains_an_error(tape) - # get the number of LLMOutputParsingFailureAction in the tape - n_step_errors = len([step for step in tape.steps if isinstance(step, LLMOutputParsingFailureAction)]) - # get the number of PageObservation steps in the tape - n_page_observations = len([step 
for step in tape.steps if isinstance(step, PageObservation)]) - - if cfg.reward_computation == "uic": - reward = float(raw_reward > 0) - if reward == 0.0: - reward = -1.0 - reward *= 0.98**n_page_observations - else: - reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 - - # (3) Get LLM calls from Tape - llm_calls: list[LLMCall] = [ - LLMCall(**step.metadata.other["llm_call"]) - if isinstance(step.metadata.other["llm_call"], dict) - else step.metadata.other["llm_call"] - for step in tape.steps - if "llm_call" in step.metadata.other - ] - - # (4) # For each LLM interaction in the tape, make a training example. - all_finished = 1 + # Compute reward and metrics + reward, has_error = _compute_reward(tape, cfg.reward_computation, has_error) + llm_calls = _extract_llm_calls(tape) training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] - for text in training_texts: - text.reward = reward - all_finished &= 1 if text.input_ids[-1] == llm.tokenizer.eos_token_id else 0 - - latency = time.time() - start_time - agent_time = tape.metadata.result.get("agent_execution_time", -1.0) - env_time = tape.metadata.result.get("environment_execution_time", -1.0) - n_observations = len([s for s in tape.steps if isinstance(s, Observation)]) - n_other_steps = len(tape.steps) - n_observations - - llm_call_times = [float(step.metadata.other.get("llm_call_time", 0.0)) for step in tape.steps] - env_call_times = [float(step.metadata.other.get("action_execution_time", 0.0)) for step in tape.steps] - total_llm_call_time = sum(llm_call_times) - total_env_call_time = sum(env_call_times) - llm_call_time = total_llm_call_time / len(llm_call_times) if len(llm_call_times) > 0 else -1.0 - env_call_time = total_env_call_time / len(env_call_times) if len(env_call_times) > 0 else -1.0 - - metrics = MiniwobMetrics( - reward=reward, - success=reward > 0.5, - no_error=no_error, - no_answer=reward < 0, - overflow=not all_finished, - n_llm_calls=len(llm_calls), - n_step_errors=n_step_errors, - n_page_observations=n_page_observations, - n_steps=len(tape.steps), - total_execution_time=tape.metadata.result.get("total_execution_time", -1.0), - agent_execution_time=agent_time, - environment_execution_time=env_time, - env_step_time=env_time / n_observations if env_time > 0 and n_observations > 0 else -1.0, - agent_step_time=agent_time / n_other_steps if agent_time > 0 and n_other_steps > 0 else -1.0, - llm_call_time=llm_call_time, - env_call_time=env_call_time, - total_llm_call_time=total_llm_call_time, - total_env_call_time=total_env_call_time, - env_start_time=env_start_time, - env_close_time=env_close_time, - env_agent_creation_time=env_agent_creation_time, + metrics = _compute_metrics( + tape, + training_texts, + reward, + has_error, + len(llm_calls), ) - - return RolloutResult( - training_texts=training_texts, - metrics=metrics, - latency=latency, - dataset_name=problem["dataset"], - ) - - -def _create_failed_rollout_result(problem: dict, start_time: float, error_type: str) -> RolloutResult: - """Create a failed rollout result for timeout or other errors.""" latency = time.time() - start_time - - # Create empty training texts and metrics for failed rollout - metrics = MiniwobMetrics( - reward=-1.0, - success=False, - no_error=False, - no_answer=True, - overflow=False, - n_llm_calls=0, - n_step_errors=0, - n_page_observations=0, - n_steps=0, - total_execution_time=latency, - agent_execution_time=0.0, - environment_execution_time=0.0, - env_step_time=0.0, - agent_step_time=0.0, - 
llm_call_time=0.0, - env_call_time=0.0, - total_llm_call_time=0.0, - total_env_call_time=0.0, - env_start_time=0.0, - env_close_time=0.0, - env_agent_creation_time=0.0, - ) - return RolloutResult( - training_texts=[], + training_texts=training_texts, metrics=metrics, latency=latency, dataset_name=problem["dataset"], From 9f26fa303156127b12e971e736c0cf64c5868e30 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 28 Nov 2025 15:08:36 +0100 Subject: [PATCH 125/126] remove domain changes to separate branch update_domain_rollouts --- conf/finetune/base.yaml | 2 +- conf/mcp.yaml | 155 --------- conf/mcp/python.json | 11 - conf/miniwob.yaml | 2 +- pipelinerl/domains/math/__init__.py | 2 +- pipelinerl/domains/math/load_datasets.py | 26 -- pipelinerl/domains/math/rollouts.py | 58 ++-- pipelinerl/domains/mcp/__init__.py | 1 - pipelinerl/domains/mcp/rollouts.py | 387 ----------------------- pipelinerl/domains/mcp/steps.py | 13 - pipelinerl/finetune/rl/__init__.py | 13 +- pipelinerl/finetune_loop.py | 1 - 12 files changed, 33 insertions(+), 638 deletions(-) delete mode 100644 conf/mcp.yaml delete mode 100644 conf/mcp/python.json delete mode 100644 pipelinerl/domains/mcp/__init__.py delete mode 100644 pipelinerl/domains/mcp/rollouts.py delete mode 100644 pipelinerl/domains/mcp/steps.py diff --git a/conf/finetune/base.yaml b/conf/finetune/base.yaml index 47a5e020..4998a8bf 100644 --- a/conf/finetune/base.yaml +++ b/conf/finetune/base.yaml @@ -36,7 +36,7 @@ learning_rate: 1e-6 # How much to clip the gradient (no clipping if null) gradient_clipping_threshold: 0.3 # Learning rate scheduler type (indexed by completed_steps). -lr_scheduler_type: constant # could be cosine, constant_with_warmup +lr_scheduler_type: cosine # could be cosine, constant_with_warmup # Number of warmup (completed) steps in the learning rate schedule. num_warmup_steps: 50 # Number of gradient accumulation steps. diff --git a/conf/mcp.yaml b/conf/mcp.yaml deleted file mode 100644 index b916a054..00000000 --- a/conf/mcp.yaml +++ /dev/null @@ -1,155 +0,0 @@ -defaults: - - base - - override finetune: grpo - - _self_ - -use_ray: true - -llm: - use_cache: false - parameters: - max_tokens: 8192 - -test_llm: - parameters: - max_tokens: 8192 - -rewards: - correct_answer_not_finished: 0.0 - buffer_tokens: 2000 - -actor: - rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout_async - system_prompt: Please reason step by step, and put your final answer within \boxed{{}}. 
- rollout_workers: 64 - llm_max_rollouts: 256 - problem_queue_size: 256 - task_template: |- - {task} - shared_memory_entry_size: 200000000 - -preprocess: - shared_memory_entry_size: 2000000000 - -finetune: - seq_length: 32000 - seq_parallel: 8 - -dataset_loader: pipelinerl.domains.math.load_datasets -train_dataset_names: -- open_reasoner_zero_57k -- open_reasoner_zero_extended_72k -test_dataset_names: - - aime_2025 - -vllm_config: - use_v1: true - vllm_kwargs: - enable-auto-tool-choice: "" - tool-call-parser: rl_tool - tool-parser-plugin: ${hydra:runtime.cwd}/pipelinerl/rl_tool_parser_plugin.py - max-num-seqs: 256 - max-num-batched-tokens: 32000 - max_model_len: 32000 - gpu-memory-utilization: 0.9 - -environment: - _target_: tapeagents.mcp.MCPEnvironment - config_path: ${hydra:runtime.cwd}/conf/mcp/python.json - tools_whitelist: - - run_python_code - read_timeout_seconds: 600 - use_cache: false - - -world: - env_replicas_per_actor: 8 - environment_mode: embedded - -agent_max_loops: 3 -agent: - _target_: tapeagents.agent.Agent - name : mcp_agent - max_iterations: 3 - store_llm_calls: true - templates: - system_prompt: | - You are a math-focused AI Agent. Solve problems by combining clear symbolic reasoning - with short, deterministic Python code. - Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. - Always present the final answer in LaTeX \boxed{{}}. - Do not express emotions or opinions about user questions. - - Workflow: - 1. Draft a brief plan in plain text. - 2. Execute one run_python_code call to compute or verify the result. - 3. Finalize by calling MathAnswer with the LaTeX-formatted answer. - - Python execution policy (run_python_code): - - Use Python strictly for pure computation to verify and validate the final answer. - - No network, file system, OS or environment access. - - Keep snippets minimal and self-contained; avoid large outputs and long-running loops; print only the final result. - - Validation: - - Cross-check results (alternative derivation, invariants, higher precision) before finalizing. - - If execution fails, propose the minimal fix and retry. - Keep replies direct and avoid unnecessary text. - allowed_tools: | - You can call the following tools: - {tools_description} - - run_python_code: deterministic math code; print only the final value. - - MathAnswer: return the LaTeX \boxed{{}} answer when the solution is verified. - Always verify with run_python_code before invoking MathAnswer. - thought_format: | - Important! Respond with the plain text, do not include any JSON or code. - Do not output anything besides what I asked in this message. - allowed_steps: | - Workflow summary: - - Plan briefly in plain text. - - Call run_python_code exactly once per loop to compute/verify. - - Finish with a single MathAnswer tool call carrying the \boxed{{}} result. - format: | - For finalization, reply with a single short sentence that ends in the \boxed{{}} answer, - immediately followed by the MathAnswer function call containing the same \boxed{{}} value. - Never emit unrelated JSON wrappers or duplicate the final thought. - - - nodes: - - _target_: tapeagents.nodes.StandardNode - name: plan - system_prompt: ${agent.templates.system_prompt} - guidance: | - Produce a concise math plan (formulas/checks). You will ALWAYS verify by executing Python code. 
- ${agent.templates.thought_format} - steps_prompt: ${agent.templates.allowed_tools} - trim_obs_except_last_n: 2 - - - _target_: tapeagents.nodes.StandardNode - name: code - system_prompt: ${agent.templates.system_prompt} - guidance: | - ALWAYS call run_python_code once to compute/verify the result. - Use exact, deterministic code; print only the final scalar or tuple. - If code fails, fix minimally and call run_python_code again after reviewing the error. - use_known_actions: true - use_function_calls: true - trim_obs_except_last_n: 2 - - - _target_: tapeagents.nodes.StandardNode - name: finalize - system_prompt: ${agent.templates.system_prompt} - guidance: | - Read the last Python stdout value. First, state the answer in one short sentence that ends with LaTeX \boxed{{}}. - Immediately after that sentence, call the MathAnswer tool exactly once with: - name: MathAnswer - arguments: {"answer": ""} - Do not add any extra text around the tool call. Once the sentence is emitted, return only the MathAnswer function call. - steps: - - pipelinerl.domains.mcp.steps.MathAnswer - use_known_actions: true - use_function_calls: true - trim_obs_except_last_n: 2 - next_node: code - -model_path: Qwen/Qwen3-8B -# model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft diff --git a/conf/mcp/python.json b/conf/mcp/python.json deleted file mode 100644 index fcbb4dcf..00000000 --- a/conf/mcp/python.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "mcpServers": { - "python_exec": { - "command": "bash", - "args": [ - "-c", - "deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio" - ] - } - } -} \ No newline at end of file diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 75acf2be..7c051a93 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -154,7 +154,7 @@ environment: _target_: examples.rl_webagent.environment.WebEnvironment exp_path: null headless: true - observation_format: axtree + observation_format: html # DATASET CONFIGURATION dataset_loader: pipelinerl.domains.miniwob.load_tasks.load_tasks diff --git a/pipelinerl/domains/math/__init__.py b/pipelinerl/domains/math/__init__.py index 7a9809b7..9aee0b8f 100644 --- a/pipelinerl/domains/math/__init__.py +++ b/pipelinerl/domains/math/__init__.py @@ -1,3 +1,3 @@ from .load_datasets import load_datasets -from .rollouts import generate_math_rollout, RewardTable, get_reward, length_penalty +from .rollouts import generate_math_rollout, RewardTable from .verifier_api import MathEnvironment, verify_answer, verify_answer_rpc \ No newline at end of file diff --git a/pipelinerl/domains/math/load_datasets.py b/pipelinerl/domains/math/load_datasets.py index 7cbf9c18..4b44dfb6 100644 --- a/pipelinerl/domains/math/load_datasets.py +++ b/pipelinerl/domains/math/load_datasets.py @@ -170,26 +170,6 @@ def _load_aime_dataset(year: int, upsample_factor: int = 0) -> list[dict]: return add_ids(samples) -def _load_aime_2025_opencompass(upsample_factor: int = 0) -> list[dict]: - configs = ["AIME2025-I", "AIME2025-II"] - dataset_name = "aime_2025" + ("" if upsample_factor > 0 else "_original") - - samples: list[dict] = [] - for config_name in configs: - ds = load_dataset("opencompass/AIME2025", config_name, split="test") - samples.extend([s for s in process_math(ds, dataset_name) if s is not None]) - - original_size = len(samples) - if upsample_factor > 0: - samples *= upsample_factor - - logger.info( - f"Loading aime 2025 (OpenCompass) dataset: {len(samples)} samples" - + (f" (upsampled from 
{original_size})" if upsample_factor > 0 else "") - ) - return add_ids(samples) - - def _load_amc_dataset(year: int, upsample_factor: int = 0) -> list[dict]: amc_dataset = load_dataset("AI-MO/aimo-validation-amc", split="train", trust_remote_code=True) amc_dataset = amc_dataset.filter(lambda x: str(year) in x["url"]) @@ -355,12 +335,6 @@ def load_datasets(dataset_names: List[str] | str | None, seed: int | None = None if "aime_2024_original" in dataset_names: datasets += _load_aime_dataset(2024) - if "aime_2025" in dataset_names: - datasets += _load_aime_2025_opencompass(upsample_factor=16) - - if "aime_2025_original" in dataset_names: - datasets += _load_aime_2025_opencompass() - if "amc_2022" in dataset_names: # TODO: AMC 2022 is 43 problems, is that to be expected? datasets += _load_amc_dataset(2022, upsample_factor=16) diff --git a/pipelinerl/domains/math/rollouts.py b/pipelinerl/domains/math/rollouts.py index 4a29227c..41a61021 100644 --- a/pipelinerl/domains/math/rollouts.py +++ b/pipelinerl/domains/math/rollouts.py @@ -1,18 +1,17 @@ -import random import time +import random import aiohttp from omegaconf import DictConfig from pydantic import BaseModel - -from pipelinerl.async_llm import llm_async_generate, make_training_text -from pipelinerl.llm import Prompt, TrainableLLM -from pipelinerl.rollouts import BaseMetrics, RolloutResult +from pipelinerl.rollouts import RolloutResult, BaseMetrics from pipelinerl.world import Job +from tapeagents.core import Prompt +from tapeagents.llms.trainable import TrainableLLM +from pipelinerl.async_llm import llm_async_generate, make_training_text from .verifier_api import verify_answer_rpc - class Metrics(BaseMetrics): penalty: float @@ -27,28 +26,6 @@ class RewardTable(BaseModel): correct_answer_finished: float buffer_tokens: int = 0 # 0 means no overlong reward shaping -def get_reward(answer_status: str, finished: bool, reward_table: RewardTable) -> float: - match (answer_status, finished): - case ("wrong", False): - return reward_table.wrong_answer_not_finished - case ("wrong", True): - return reward_table.wrong_answer_finished - case ("no_answer", False): - return reward_table.no_answer_not_finished - case ("no_answer", True): - return reward_table.no_answer_finished - case ("unparsable", False): - return reward_table.unparsable_not_finished - case ("unparsable", True): - return reward_table.unparsable_finished - case ("correct", False): - return reward_table.correct_answer_not_finished - case ("correct", True): - return reward_table.correct_answer_finished - case _: - raise ValueError(f"Invalid answer_status/finished combination: {answer_status}/{finished}") - - def length_penalty(max_length: int, sequence_length: int, buffer_tokens: int) -> float: """ Compute the overlong penalty @@ -74,7 +51,7 @@ async def generate_math_rollout( latency = time.time() - time_start assert llm_call.output.content is not None - reward_table = RewardTable(**dict(cfg.rewards)) + rewards = RewardTable(**dict(cfg.rewards)) discount_factor = cfg.actor.discount_factor # math_verify is a fast environment, no support for environment replicas for now @@ -93,11 +70,30 @@ async def generate_math_rollout( trace = make_training_text(llm, llm_call) # Determine reward based on answer status and finished state - reward = get_reward(answer_status, trace.finished, reward_table) + match (answer_status, trace.finished): + case ("wrong", False): + reward = rewards.wrong_answer_not_finished + case ("wrong", True): + reward = rewards.wrong_answer_finished + case ("no_answer", False): + 
reward = rewards.no_answer_not_finished + case ("no_answer", True): + reward = rewards.no_answer_finished + case ("unparsable", False): + reward = rewards.unparsable_not_finished + case ("unparsable", True): + reward = rewards.unparsable_finished + case ("correct", False): + reward = rewards.correct_answer_not_finished + case ("correct", True): + reward = rewards.correct_answer_finished + case _: + raise ValueError(f"Invalid answer_status/finished combination: {answer_status}/{trace.finished}") + # Apply discount factor based on output length reward *= discount_factor**llm_call.output_length_tokens overlong_penalty = 0 - if reward_table.buffer_tokens > 0: + if rewards.buffer_tokens > 0: overlong_penalty = length_penalty(llm.parameters['max_tokens'], llm_call.output_length_tokens, rewards.buffer_tokens) reward += overlong_penalty trace.reward = reward diff --git a/pipelinerl/domains/mcp/__init__.py b/pipelinerl/domains/mcp/__init__.py deleted file mode 100644 index cb1d3fc4..00000000 --- a/pipelinerl/domains/mcp/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .rollouts import generate_mcp_rollout, generate_mcp_rollout_async, generate_mcp_rollouts_shared_env diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py deleted file mode 100644 index d258b7bf..00000000 --- a/pipelinerl/domains/mcp/rollouts.py +++ /dev/null @@ -1,387 +0,0 @@ -import asyncio -import logging -import time -from collections import Counter -from pathlib import Path -from typing import Dict, List - -import aiohttp -from hydra.utils import instantiate -from omegaconf import DictConfig, OmegaConf -from tapeagents.agent import DEFAULT, Agent -from tapeagents.core import LLMCall, Tape -from tapeagents.dialog_tape import UserStep -from tapeagents.io import save_json_tape -from tapeagents.llms.trainable import TrainableLLM -from tapeagents.mcp import MCPEnvironment -from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config - -from pipelinerl.async_llm import make_training_text -from pipelinerl.domains.math import RewardTable, get_reward, verify_answer -from pipelinerl.domains.mcp.steps import MathAnswer -from pipelinerl.rollouts import BaseMetrics, RolloutResult - -logger = logging.getLogger(__name__) - - -class FailedRollout(Exception): - pass - -def count_tool_calls_by_category(llm_calls: List[LLMCall]) -> Dict[str, int]: - """ - Count the number of tool calls for each function name category. 
- - Args: - llm_calls: List of LLMCall objects - - Returns: - Dictionary mapping function names to their counts - """ - tool_call_names = [] - - for llm_call in llm_calls: - if llm_call.output.tool_calls: - for tool_call in llm_call.output.tool_calls: - tool_call_names.append(tool_call.function.name) - - return dict(Counter(tool_call_names)) - - -class Metrics(BaseMetrics): - num_python_calls: int = 0 - num_steps: int = 0 - n_llm_calls: int = 0 - total_execution_time: float = -1.0 - agent_execution_time: float = -1.0 - environment_execution_time: float = -1.0 - overflow: bool = False - - -def generate_mcp_rollout( - cfg: DictConfig | dict, - llm: TrainableLLM, - problem: dict, -) -> RolloutResult: - start = time.perf_counter() - if isinstance(cfg, dict): - cfg = OmegaConf.create(cfg) - tapes_dir = Path(cfg.output_dir) / "actor" / "tapes" - tapes_dir.mkdir(parents=True, exist_ok=True) - agent, _env = get_agent_and_env_from_config(cfg) - environment: MCPEnvironment = _env - logger.info(f"Agent and environment loaded, using llm {llm.model_name} at {llm.get_base_url()}") - try: - t_exec = time.perf_counter() - start_result = environment.start_task(problem) - logger.info("Task started") - tape_metadata = start_result if isinstance(start_result, dict) else {} - agent.llms = {DEFAULT: llm} - tape = Tape( - steps=[ - UserStep( - content=f"{problem['task']}. You have access to the following tools: {environment.tools_description()}" - ) - ] - ) - if tape_metadata: - tape.metadata.other.update(tape_metadata) - - logger.info("Running agent..") - tape = execute_agent(agent, tape, environment, max_loops=cfg.agent_max_loops) - logger.info("Agent finished") - tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) - reward_table = RewardTable(**dict(cfg.rewards)) - - llm_calls: list[LLMCall] = [ - LLMCall(**step.metadata.other["llm_call"]) - if isinstance(step.metadata.other["llm_call"], dict) - else step.metadata.other["llm_call"] - for step in tape.steps if step.metadata.other.get("llm_call") is not None - ] - assert len(llm_calls) > 0, "No LLM calls found" - tool_call_counts = count_tool_calls_by_category(llm_calls) - logger.info(f'Use {type(llm)} LLM to generate training texts') - training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] - n_llm_calls = len(llm_calls) - answer_status = verify_answer( - prediction=llm_calls[-1].output.content, # type: ignore - gold=problem["answer"], - strict=True, - ) - # Tape should finish with an answer - tape_finished = True if isinstance(tape.steps[-1], MathAnswer) else False - base_reward = get_reward(answer_status, tape_finished, reward_table) - - # Local reward shaping (configurable in conf/mcp.yaml) - total_shaping = 0.0 - - # Length shaping: discourage very long completions; award concise correct ones - length_cfg = getattr(cfg, "length_shaping", None) - if length_cfg is not None: - try: - # Prefer ratio-based target if provided; otherwise use absolute - if hasattr(length_cfg, "target_ratio"): - ratio = float(getattr(length_cfg, "target_ratio")) - max_gen = int(llm.parameters.get("max_tokens", 2048)) - target_tokens = int(max(1, ratio * max_gen)) - # Optional clamps - min_t = int(getattr(length_cfg, "min_target_tokens", 0)) - max_t = int(getattr(length_cfg, "max_target_tokens", 10**9)) - target_tokens = max(min_t, min(max_t, target_tokens)) - else: - target_tokens = int(getattr(length_cfg, "target_output_tokens", 512)) - slope = float(getattr(length_cfg, "slope", 0.0)) - max_penalty = float(getattr(length_cfg, 
"max_penalty", 0.0)) - bonus_short_correct = float(getattr(length_cfg, "bonus_on_short_correct", 0.0)) - except Exception: - target_tokens, slope, max_penalty, bonus_short_correct = 512, 0.0, 0.0, 0.0 - - # average output tokens across llm calls for this rollout - try: - avg_output_tokens = sum(t.output_tokens for t in training_texts) / max(1, len(training_texts)) - except Exception: - avg_output_tokens = 0.0 - - if slope > 0.0 and max_penalty > 0.0 and avg_output_tokens > target_tokens: - over_by = float(avg_output_tokens - target_tokens) - penalty = min(max_penalty, slope * over_by) - total_shaping -= penalty - - if bonus_short_correct > 0.0 and answer_status == "correct" and avg_output_tokens <= target_tokens: - total_shaping += bonus_short_correct - - reward = base_reward + total_shaping - - # Assign identical reward to all steps in the rollout (pipeline expects uniform rollout_reward) - for text in training_texts: - text.reward = reward - text.finished = tape_finished - - latency = time.perf_counter() - start - - agent_time = tape.metadata.result.get("agent_execution_time", -1.0) - env_time = tape.metadata.result.get("environment_execution_time", -1.0) - total_time = tape.metadata.result.get("total_execution_time", -1.0) - - tape_name = problem.get("_task_id", tape.metadata.id) - save_json_tape(tape, tapes_dir.as_posix(), tape_name) - - metrics = Metrics( - reward=reward, - success=answer_status == "correct", - no_error=answer_status != "unparsable", - no_answer=answer_status == "no_answer", - num_steps=len(tape.steps), - num_python_calls=tool_call_counts.get("run_python_code", 0), - n_llm_calls=n_llm_calls, - total_execution_time=total_time, - agent_execution_time=agent_time, - environment_execution_time=env_time, - overflow=not tape_finished, - ) - - return RolloutResult( - training_texts=training_texts, - metrics=metrics, - latency=latency, - dataset_name=problem["dataset"] - ) - except Exception as e: - err_msg = f"Error generating rollout: {e}" - logger.error(err_msg) - raise FailedRollout(err_msg) - finally: - try: - environment.close() - except Exception as e: - logger.error(f"Error closing environment: {e}") - - -async def generate_mcp_rollout_async( - cfg: DictConfig, - llm: TrainableLLM, - problem: dict, - session: aiohttp.ClientSession, -) -> RolloutResult: - environment: MCPEnvironment = instantiate(cfg.environment) - await environment.ainitialize() - logger.info(f"Environment tools: {environment.tools_description()}") - agent: Agent = instantiate( - cfg.agent, known_actions=environment.actions(), tools_description=environment.tools_description() - ) - logger.info(f"Agent and environment loaded, using llm {llm.model_name} at {llm.get_base_url()}") - try: - result = await generate_mcp_rollout_with_agent_and_env(cfg, environment, agent, llm, problem, session) - finally: - try: - await environment.aclose() - except Exception as e: - logger.error(f"Error closing environment: {e}") - return result - - -async def generate_mcp_rollout_with_agent_and_env( - cfg: DictConfig, - environment: MCPEnvironment, - agent: Agent, - llm: TrainableLLM, - problem: dict, - session: aiohttp.ClientSession, -) -> RolloutResult: - tapes_dir = Path(cfg.output_dir) / "actor" / "tapes" - tapes_dir.mkdir(parents=True, exist_ok=True) - try: - start_time = time.perf_counter() - start_result = environment.start_task(problem) - logger.info("Task started") - tape_metadata = start_result if isinstance(start_result, dict) else {} - agent.llms = {DEFAULT: llm} - tape = Tape( - steps=[ - UserStep( - 
content=f"{problem['task']}. You have access to the following tools: {environment.tools_description()}" - ) - ] - ) - if tape_metadata: - tape.metadata.other.update(tape_metadata) - - logger.info("Running agent..") - tape = await async_execute_agent(agent, tape, environment, session, max_loops=cfg.agent_max_loops) - logger.info("Agent finished") - tape.metadata.result.update({"total_execution_time": time.perf_counter() - start_time}) - reward_table = RewardTable(**dict(cfg.rewards)) - - llm_calls: list[LLMCall] = [ - LLMCall(**step.metadata.other["llm_call"]) - if isinstance(step.metadata.other["llm_call"], dict) - else step.metadata.other["llm_call"] - for step in tape.steps if step.metadata.other.get("llm_call") is not None - ] - assert len(llm_calls) > 0, "No LLM calls found" - tool_call_counts = count_tool_calls_by_category(llm_calls) - logger.info(f'Use {type(llm)} LLM to generate training texts') - training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] - n_llm_calls = len(llm_calls) - answer_status = verify_answer( - prediction=llm_calls[-1].output.content, # type: ignore - gold=problem["answer"], - strict=True, - ) - # Tape should finish with an answer - tape_finished = True if isinstance(tape.steps[-1], MathAnswer) else False - base_reward = get_reward(answer_status, tape_finished, reward_table) - - # Local reward shaping (configurable in conf/mcp.yaml) - total_shaping = 0.0 - - # Length shaping: discourage very long completions; award concise correct ones - length_cfg = getattr(cfg, "length_shaping", None) - if length_cfg is not None: - try: - # Prefer ratio-based target if provided; otherwise use absolute - if hasattr(length_cfg, "target_ratio"): - ratio = float(getattr(length_cfg, "target_ratio")) - max_gen = int(llm.parameters.get("max_tokens", 2048)) - target_tokens = int(max(1, ratio * max_gen)) - # Optional clamps - min_t = int(getattr(length_cfg, "min_target_tokens", 0)) - max_t = int(getattr(length_cfg, "max_target_tokens", 10**9)) - target_tokens = max(min_t, min(max_t, target_tokens)) - else: - target_tokens = int(getattr(length_cfg, "target_output_tokens", 512)) - slope = float(getattr(length_cfg, "slope", 0.0)) - max_penalty = float(getattr(length_cfg, "max_penalty", 0.0)) - bonus_short_correct = float(getattr(length_cfg, "bonus_on_short_correct", 0.0)) - except Exception: - target_tokens, slope, max_penalty, bonus_short_correct = 512, 0.0, 0.0, 0.0 - - # average output tokens across llm calls for this rollout - try: - avg_output_tokens = sum(t.output_tokens for t in training_texts) / max(1, len(training_texts)) - except Exception: - avg_output_tokens = 0.0 - - if slope > 0.0 and max_penalty > 0.0 and avg_output_tokens > target_tokens: - over_by = float(avg_output_tokens - target_tokens) - penalty = min(max_penalty, slope * over_by) - total_shaping -= penalty - - if bonus_short_correct > 0.0 and answer_status == "correct" and avg_output_tokens <= target_tokens: - total_shaping += bonus_short_correct - - reward = base_reward + total_shaping - - # Assign identical reward to all steps in the rollout (pipeline expects uniform rollout_reward) - for text in training_texts: - text.reward = reward - text.finished = tape_finished - - latency = time.perf_counter() - start_time - - agent_time = tape.metadata.result.get("agent_execution_time", -1.0) - env_time = tape.metadata.result.get("environment_execution_time", -1.0) - total_time = tape.metadata.result.get("total_execution_time", -1.0) - - tape_name = problem.get("_task_id", tape.metadata.id) - 
save_json_tape(tape, tapes_dir.as_posix(), tape_name) - - metrics = Metrics( - reward=reward, - success=answer_status == "correct", - no_error=answer_status != "unparsable", - no_answer=answer_status == "no_answer", - num_steps=len(tape.steps), - num_python_calls=tool_call_counts.get("run_python_code", 0), - n_llm_calls=n_llm_calls, - total_execution_time=total_time, - agent_execution_time=agent_time, - environment_execution_time=env_time, - overflow=not tape_finished, - ) - - return RolloutResult( - training_texts=training_texts, - metrics=metrics, - latency=latency, - dataset_name=problem["dataset"] - ) - except Exception as e: - err_msg = f"Error generating rollout: {e}" - logger.error(err_msg) - raise FailedRollout(err_msg) - - -async def generate_mcp_rollouts_shared_env( - cfg: DictConfig, - llm: TrainableLLM, - problems: list[dict], - session: aiohttp.ClientSession, -) -> RolloutResult: - """Caution: this function should be used only with stateless environment, as it shares it between multiple agents""" - environment: MCPEnvironment = instantiate(cfg.environment) - await environment.ainitialize() - logger.info(f"Shared environment loaded for {len(problems)} problems") - try: - async def run_rollout(problem): - logger.info(f"Running async rollout loop for problem {problem['_task_id']}") - start_ts = time.monotonic() - agent: Agent = instantiate( - cfg.agent, known_actions=environment.actions(), tools_description=environment.tools_description() - ) - logger.info(f"Agent created with llm {llm.model_name} at {llm.get_base_url()}") - rollout_result = await generate_mcp_rollout_with_agent_and_env(cfg, environment, agent, llm, problem, session) - stop_ts = time.monotonic() - latency = stop_ts - start_ts - return rollout_result, latency - - tasks = [run_rollout(problem) for problem in problems] - results_with_latencies = await asyncio.gather(*tasks) - finally: - try: - await environment.aclose() - except Exception as e: - logger.error(f"Error closing environment: {e}") - rollout_results = [res for res, _ in results_with_latencies] - task_latencies = [latency for _, latency in results_with_latencies] - return rollout_results, task_latencies diff --git a/pipelinerl/domains/mcp/steps.py b/pipelinerl/domains/mcp/steps.py deleted file mode 100644 index 9b29a717..00000000 --- a/pipelinerl/domains/mcp/steps.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Literal -from pydantic import Field -from tapeagents.core import FinalObservation - - -class MathAnswer(FinalObservation): - """ - Action that indicates the agent has finished solving a math problem. - The final answer must be contained within \\boxed{} format. 
- """ - - kind: Literal["math_answer_action"] = "math_answer_action" - answer: Any = Field(description="Final answer in \\boxed{} format") diff --git a/pipelinerl/finetune/rl/__init__.py b/pipelinerl/finetune/rl/__init__.py index 4e7d3faa..f2472388 100644 --- a/pipelinerl/finetune/rl/__init__.py +++ b/pipelinerl/finetune/rl/__init__.py @@ -7,18 +7,12 @@ import pandas as pd import torch import torch.nn.functional as F -from datasets import Dataset from pydantic import BaseModel, Field -from transformers import PreTrainedModel +from transformers.modeling_utils import PreTrainedModel from pipelinerl.finetune.rl.utils import per_segment_sums from pipelinerl.finetune.types import PipelineBatchEncoding - -from .utils import ( - mean_sum, - replace_dataset_column, - sum_sum, -) +from pipelinerl.finetune.rl.utils import sum_sum # FIXME: remove a warnings, but might be worth investigating os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -40,8 +34,7 @@ class RLConfig(BaseModel): policy_loss: str = Field( default="ppo", - description="Policy Loss to use for RL", - choices=["ppo", "reinforce", "gspo"], + description="Policy Loss to use for RL, one of ['ppo', 'reinforce', 'gspo']", ) use_advantages: bool = Field( default=True, diff --git a/pipelinerl/finetune_loop.py b/pipelinerl/finetune_loop.py index 02533738..71dac6ea 100644 --- a/pipelinerl/finetune_loop.py +++ b/pipelinerl/finetune_loop.py @@ -496,7 +496,6 @@ def run_finetuning_loop( weight_update_manager.shutdown() if actor_update_group: dist.destroy_process_group(actor_update_group) - raise RuntimeError("Finetuning loop finished, exiting worker thread") def rl_finetuning_worker( From 1247c6ff7288f0530399bdb52c9cbafb0cebaf56 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 28 Nov 2025 18:20:58 +0100 Subject: [PATCH 126/126] improve rollout wrappers --- conf/base.yaml | 3 +- pipelinerl/actor.py | 216 ++++++++++++++++++++--------------------- pipelinerl/llm.py | 6 +- pipelinerl/rollouts.py | 2 + 4 files changed, 111 insertions(+), 116 deletions(-) diff --git a/conf/base.yaml b/conf/base.yaml index f5fc4f15..80429dfb 100644 --- a/conf/base.yaml +++ b/conf/base.yaml @@ -19,7 +19,8 @@ actor: result_queue_size: 64 throughput_window_size: 50 shared_memory_entry_size: 10000000 - async_batch_size: 4 + async_batch_size: 1 # if ==1, rollout function will be called as synchronous + task_submission_delay_sec: 0.5 collect_logprobs: true environment: null diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index d2a66031..100aa529 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -11,7 +11,7 @@ from multiprocessing.managers import SharedMemoryManager from pathlib import Path from queue import Empty -from typing import Callable, Dict, List +from typing import Any, Awaitable, Callable, Dict, List import aiohttp import hydra @@ -152,10 +152,7 @@ async def schedule_rollouts( samples_target = final_steps * cfg.finetune.train_batch_size * cfg.finetune.gradient_accumulation_passes def is_trainer_finished() -> bool: - return ( - trainer_state.samples_processed is not None - and trainer_state.samples_processed >= samples_target - ) + return trainer_state.samples_processed is not None and trainer_state.samples_processed >= samples_target def handle_rollout_exception(exc: Exception): if isinstance(exc, aiohttp.ClientError) and is_trainer_finished(): @@ -481,8 +478,13 @@ def run(self, dataset: list[tuple[str, dict]]): yield final_steps = calculate_train_steps(self.cfg.finetune, self.cfg.finetune.interrupt_train_steps) - samples_target = final_steps 
* self.cfg.finetune.train_batch_size * self.cfg.finetune.gradient_accumulation_passes - if self.trainer_state.samples_processed is not None and self.trainer_state.samples_processed >= samples_target: + samples_target = ( + final_steps * self.cfg.finetune.train_batch_size * self.cfg.finetune.gradient_accumulation_passes + ) + if ( + self.trainer_state.samples_processed is not None + and self.trainer_state.samples_processed >= samples_target + ): logger.info("Trainer signalled completion; stopping actor loop") break @@ -546,7 +548,9 @@ def run(self, dataset: list[tuple[str, dict]]): self.update_stats(rollout_results=rollout_results) finished_groups += 1 - logger.info(f"Finished {'train' if self.is_training else 'test'} groups {finished_groups} out of {expected_rollouts}") + logger.info( + f"Finished {'train' if self.is_training else 'test'} groups {finished_groups} out of {expected_rollouts}" + ) time_to_publish_train_stats = ( self.is_training and trainer_version_to_publish is not None ) or self.debug_mode @@ -656,19 +660,17 @@ def __init__(self, cfg: DictConfig, *args, **kwargs): f"attempts {cfg.attempts} must be divisible by actor.async_batch_size {cfg.actor.async_batch_size}" ) super().__init__(cfg, *args, **kwargs) - self.cfg_dict = OmegaConf.to_container(self.cfg, resolve=True) + self.cfg_dict: dict = OmegaConf.to_container(self.cfg, resolve=True) # type: ignore self.unfinished_tasks = [] self.llms_by_url = {llm.get_base_url(): llm for llm in self.llms} self.llms_utilization = {llm.get_base_url(): 0 for llm in self.llms} self.scheduler_name = f"{'train' if self.is_training else 'test'} ray scheduler" self.problem_id = 0 self.attempts = self.cfg.attempts if self.is_training else 1 - self.unfinished_problems = defaultdict(list) # up to `attempts` rollout results for each problem - self.finished_problems = [] + self.unfinished_groups = defaultdict(list) # up to `attempts` rollout results for each problem + self.finished_groups = [] self.token_count = 0 self.finished_rollouts_count = 0 - self.task_latencies = [] - self.ray_result_latencies = [] self.log_dir = Path(self.cfg.output_dir) / "actor" / "ray" def start_backend(self): @@ -684,74 +686,71 @@ def start_backend(self): logger.info(f"Ray initialized, dashboard at {ray_context.dashboard_url}") assert self.trainer_state.propagated_weight_version is not None - rollout_policy: Callable[[DictConfig, TrainableLLM, dict], RolloutResult] = hydra.utils.get_method( - self.cfg.actor.rollout_policy - ) + rollout_policy = hydra.utils.get_method(self.cfg.actor.rollout_policy) - def rollout_wrapper(cfg_dict: dict, llm: TrainableLLM, problems: list[dict], problem_id: int) -> RolloutResult: + def rollout_wrapper(cfg_dict: dict, llm: TrainableLLM, problems: list[dict]) -> list[RolloutResult]: assert len(problems) == 1, "Sync mode should only be used with 1 problem at a time" cfg = OmegaConf.create(cfg_dict) problem = problems[0] - task_id = problem["_task_id"] - log_file = Path(cfg.output_dir) / "actor" / "ray" / f"{task_id}.log" + group_id = problem["_group_id"] + attempt = problem["_attempt"] + log_file = Path(cfg.output_dir) / "actor" / "ray" / f"{group_id}.log" sys.stdout = open(log_file, "a", buffering=1) sys.stderr = sys.stdout logging.basicConfig(level=logging.INFO, stream=sys.stdout, force=True) - start_ts = time.monotonic() - logger.info(f"Running sync rollout for problem {task_id}") + logger.info(f"Running sync rollout for task {group_id}_{attempt}") + start_ts = time.perf_counter() rollout_result: RolloutResult = rollout_policy(cfg, llm, 
problem) - stop_ts = time.monotonic() - logger.info(f"Problem {problem['_task_id']} finished in {stop_ts - start_ts:.2f} seconds") - return [rollout_result], llm.get_base_url(), problem_id, [stop_ts - start_ts], stop_ts - - async def run_multiple_rollouts( - cfg: DictConfig, llm: TrainableLLM, problems: list[dict], session: aiohttp.ClientSession - ) -> RolloutResult: - # Run all rollouts in parallel using asyncio.gather - async def run_rollout(problem): - logger.info(f"Running async rollout loop for problem {problem['_task_id']}") - start_ts = time.monotonic() - rollout_result = await rollout_policy(cfg, llm, problem, session) - stop_ts = time.monotonic() - latency = stop_ts - start_ts - return rollout_result, latency - - tasks = [run_rollout(problem) for problem in problems] - results_with_latencies = await asyncio.gather(*tasks) - rollout_results = [res for res, _ in results_with_latencies] - task_latencies = [latency for _, latency in results_with_latencies] - return rollout_results, task_latencies - - async def run_rollouts_with_session(cfg: DictConfig, llm: TrainableLLM, problems: list[dict]) -> RolloutResult: + rollout_result.latency = time.perf_counter() - start_ts + rollout_result.llm_url = llm.get_base_url() + rollout_result.group_id = group_id + rollout_result.attempt = attempt + logger.info(f"Task {group_id}_{attempt} finished in {rollout_result.latency:.2f} seconds") + return [rollout_result] + + async def run_rollouts_with_session( + cfg: DictConfig, llm: TrainableLLM, problems: list[dict] + ) -> list[RolloutResult]: connector = aiohttp.TCPConnector( limit=cfg.actor.async_batch_size, limit_per_host=cfg.actor.async_batch_size, keepalive_timeout=1.0 ) timeout = aiohttp.ClientTimeout(total=3600.0, connect=3600.0, sock_read=3600.0) async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: - rollout_results, task_latencies = await run_multiple_rollouts(cfg, llm, problems, session) - return rollout_results, task_latencies - - def rollout_async_batch_wrapper( - cfg_dict: dict, llm: TrainableLLM, problems: list[dict], problem_id: int - ) -> RolloutResult: + # Run all rollouts in parallel using asyncio.gather + async def run_rollout(problem) -> RolloutResult: + group_id = problem["_group_id"] + attempt = problem["_attempt"] + logger.info(f"Running async rollout loop for task {group_id}_{attempt}") + start_ts = time.perf_counter() + rollout_result = await rollout_policy(cfg, llm, problem, session) + rollout_result.latency = time.perf_counter() - start_ts + rollout_result.llm_url = llm.get_base_url() + rollout_result.group_id = group_id + rollout_result.attempt = attempt + logger.info(f"Task {group_id}_{attempt} finished in {rollout_result.latency:.2f} seconds") + return rollout_result + + tasks = [run_rollout(problem) for problem in problems] + rollout_results = await asyncio.gather(*tasks) + return rollout_results + + def rollout_async_batch_wrapper(cfg_dict: dict, llm: TrainableLLM, problems: list[dict]) -> list[RolloutResult]: cfg = OmegaConf.create(cfg_dict) - log_file = ( - Path(cfg.output_dir) / "actor" / "ray" / f"{problems[0]['_task_id']}_async_{len(problems)}_problems.log" - ) + group_id = problems[0]["_group_id"] + log_file = Path(cfg.output_dir) / "actor" / "ray" / f"{group_id}_async_{len(problems)}.log" sys.stdout = open(log_file, "a", buffering=1) sys.stderr = sys.stdout logging.basicConfig(level=logging.INFO, stream=sys.stdout, force=True) - logger.info(f"Running async rollouts for {len(problems)} problems") - results, task_latencies = 
asyncio.run(run_rollouts_with_session(cfg, llm, problems)) - stop_ts = time.monotonic() - return results, llm.get_base_url(), problem_id, task_latencies, stop_ts + logger.info(f"Running async rollouts for group {group_id} with {len(problems)} problems") + rollout_results = asyncio.run(run_rollouts_with_session(cfg, llm, problems)) + return rollout_results if self.cfg.actor.async_batch_size > 1: logger.info("Using async mode") - self.ray_remote = ray.remote(rollout_async_batch_wrapper) + self.ray_remote = ray.remote()(rollout_async_batch_wrapper) else: logger.info("Using sync mode") - self.ray_remote = ray.remote(num_cpus=0)(rollout_wrapper) + self.ray_remote = ray.remote()(rollout_wrapper) self.start_time = time.time() def have_capacity(self) -> bool: @@ -766,10 +765,12 @@ def have_capacity(self) -> bool: def submit_problem(self, problem: dict): # Make a list of cfg.attempts identical problems (deepcopies can be used if necessary) problems = [] - for n in range(self.attempts): + for attempt in range(self.attempts): p = problem.copy() - p["_task_id"] = f"problem_{self.problem_id}_attempt_{n}" + p["_group_id"] = f"{self.scheduler_name}_{self.problem_id}" + p["_attempt"] = attempt problems.append(p) + # Split problems into batches of up to cfg.async_batch_size batches = [ problems[i : i + self.cfg.actor.async_batch_size] @@ -781,8 +782,8 @@ def submit_problem(self, problem: dict): f"Submitting problem {self.problem_id} batch {batch_idx + 1}/{len(batches)} to the least busy LLM {llm_url} with {task_count} tasks" ) llm = self.llms_by_url[llm_url] - task_ref = self.ray_remote.remote(self.cfg_dict, llm, problem_batch, self.problem_id) - time.sleep(1.0) # TODO: remove this + task_ref = self.ray_remote.remote(self.cfg_dict, llm, problem_batch) + time.sleep(self.cfg.actor.task_submission_delay_sec) self.llms_utilization[llm_url] += len(problem_batch) self.unfinished_tasks.append(task_ref) self.problem_id += 1 @@ -791,7 +792,7 @@ def stop_tasks(self): ray.shutdown() def receive_finished_tasks(self): - num_returns = min(100, len(self.unfinished_tasks)) + num_returns = min(100, len(self.unfinished_tasks)) # query up to 100 tasks at a time try: finished_tasks, unfinished_tasks = ray.wait(self.unfinished_tasks, num_returns=num_returns, timeout=0.1) except Exception as e: @@ -801,67 +802,62 @@ def receive_finished_tasks(self): logger.info(f"Found {len(finished_tasks)} finished tasks, {len(unfinished_tasks)} unfinished tasks left") self.unfinished_tasks = unfinished_tasks dt = time.time() - self.start_time + rollout_results: list[RolloutResult] = [] for finished_task in finished_tasks: try: - rollout_results, llm_url, problem_id, task_latencies, stop_ts = ray.get(finished_task) + rollout_results += ray.get(finished_task) except Exception as e: logger.error(f"Error getting finished ray task: {e}") - self.rollout_errors += 1 + self.rollout_errors += self.cfg.actor.async_batch_size continue - self.ray_result_latencies.append(time.monotonic() - stop_ts) - for rollout_result in rollout_results: - rollout_result.model_version = self.trainer_state.propagated_weight_version - full_group_id = f"{self.scheduler_name}_{problem_id}" - rollout_result.group_id = full_group_id - rollout_index = len(self.unfinished_problems[problem_id]) - for step_index, sample in enumerate(rollout_result.training_texts): - # Downstream in the pipeline we'll need these fields in every sample - sample.metadata["model_version"] = rollout_result.model_version - sample.metadata["rollout_index"] = rollout_index - sample.metadata["step_index"] 
= step_index - sample.group_id = full_group_id - self.task_latencies += task_latencies - if self.llms_utilization[llm_url] > 0: - self.llms_utilization[llm_url] -= 1 - else: - logger.warning(f"LLM {llm_url} utilization is 0, but got a result") - self.token_count += get_number_of_tokens_in_result(rollout_result) - self.finished_rollouts_count += 1 - self.unfinished_problems[problem_id].append(rollout_result) - logger.info(f"Problem {problem_id} has {len(self.unfinished_problems[problem_id])} rollout results") - attempts = self.cfg.attempts if self.is_training else 1 - if len(self.unfinished_problems[problem_id]) == attempts: - logger.info(f"Problem {problem_id} group finished") - group = self.unfinished_problems[problem_id] + logger.info(f"Received {len(rollout_results)} rollout results from {len(finished_tasks)} finished tasks") + for rollout_result in rollout_results: + rollout_result.model_version = self.trainer_state.propagated_weight_version + rollout_index = len(self.unfinished_groups[rollout_result.group_id]) + for step_index, tr_text in enumerate(rollout_result.training_texts): + # Downstream in the pipeline we'll need these fields in every sample + tr_text.metadata["model_version"] = rollout_result.model_version + tr_text.metadata["rollout_index"] = rollout_index + tr_text.metadata["step_index"] = step_index + tr_text.group_id = rollout_result.group_id + if self.llms_utilization[rollout_result.llm_url] > 0: + self.llms_utilization[rollout_result.llm_url] -= 1 + else: + logger.warning(f"LLM {rollout_result.llm_url} utilization is 0, but got a result") # should not happen + self.token_count += get_number_of_tokens_in_result(rollout_result) + self.finished_rollouts_count += 1 + self.unfinished_groups[rollout_result.group_id].append(rollout_result) + + if len(self.unfinished_groups[rollout_result.group_id]) == self.attempts: + logger.info(f"Problem {rollout_result.group_id} group finished") + group = self.unfinished_groups[rollout_result.group_id] random.shuffle(group) - self.finished_problems.append(group) - del self.unfinished_problems[problem_id] - logger.info(f"{len(self.finished_problems)} finished problems ready to return") - logger.info( - f"Ray {'train' if self.is_training else 'test'} actor loop: " - f"rollouts in progress: {len(self.unfinished_tasks)}, " - f"problems in progress: {len(self.unfinished_problems)}, " - f"rollouts finished: {self.finished_rollouts_count}, " - f"total tokens: {self.token_count}, " - f"gen speed: {self.token_count / dt:.2f} tokens/sec, " - f"task latency: {np.mean(self.task_latencies[-10:]):.2f} sec, " - f"ray delay: {np.mean(self.ray_result_latencies[-10:]):.4f} sec," - f"time elapsed: {dt:.2f} sec,\n" - f"LLMs utilization: {self.llms_utilization}" - ) + self.finished_groups.append(group) + del self.unfinished_groups[rollout_result.group_id] + logger.info(f"{len(self.finished_groups)} finished groups ready to return") + logger.info( + f"Ray {'train' if self.is_training else 'test'} actor loop: " + f"rollouts in progress: {len(self.unfinished_tasks)}, " + f"groups in progress: {len(self.unfinished_groups)}, " + f"rollouts finished: {self.finished_rollouts_count}, " + f"total tokens: {self.token_count}, " + f"gen speed: {self.token_count / dt:.2f} tokens/sec, " + f"time elapsed: {dt:.2f} sec,\n" + f"LLMs utilization: {self.llms_utilization}" + ) def get_new_results(self) -> list[list[RolloutResult]]: self.receive_finished_tasks() - if len(self.finished_problems) > 0: - logger.info(f"have {len(self.finished_problems)} finished problems, pop one") - 
return self.finished_problems.pop(0) + if len(self.finished_groups) > 0: + logger.info(f"have {len(self.finished_groups)} finished problems, pop one") + return self.finished_groups.pop(0) return [] def problem_queue_size(self) -> int: return len(self.unfinished_tasks) def result_queue_size(self) -> int: - return len(self.finished_problems) + return len(self.finished_groups) def run_actor_loop(cfg: DictConfig): diff --git a/pipelinerl/llm.py b/pipelinerl/llm.py index 96950231..04dd93b7 100644 --- a/pipelinerl/llm.py +++ b/pipelinerl/llm.py @@ -359,10 +359,6 @@ class TrainableLLM(LLM): base_url (str): Base URL of the API endpoint api_token (str): Authentication token for API access """ - - # TODO: use OpenAI Python client when the certificate issue is resolved. - # TODO: consider using litellm - base_url: str = "https://api.openai.com" api_token: str = Field(default="", exclude=True) collect_logprobs: bool = False @@ -370,7 +366,7 @@ class TrainableLLM(LLM): max_parallel_requests: int = 32 max_retries: int = 5 base_delay: float = 0.5 - _semaphore: asyncio.Semaphore + _semaphore: asyncio.Semaphore = None # type: ignore def model_post_init(self, __context): super().model_post_init(__context) diff --git a/pipelinerl/rollouts.py b/pipelinerl/rollouts.py index dcb27f2d..c755cf31 100644 --- a/pipelinerl/rollouts.py +++ b/pipelinerl/rollouts.py @@ -64,3 +64,5 @@ class RolloutResult(BaseModel): model_version: int | None = None dataset_name: str | None = None group_id: str | None = None + llm_url: str = "" + attempt: int = 0 # number of attempt in the group of that problem