From 17b55e210d177dec03b3a46faa18d5acb5104cea Mon Sep 17 00:00:00 2001 From: atchudhansg Date: Sun, 16 Nov 2025 12:12:22 +0530 Subject: [PATCH] feat: Add custom task system for BrowserGym environment - Add custom task framework with base classes and models - Implement copy-paste tasks (single-tab and multi-tab) - Add comprehensive documentation in README - Include example script for custom tasks - Update .gitignore to exclude virtual environments --- .gitignore | 1 + examples/browsergym_custom_example.py | 134 +++++ src/envs/browsergym_env/README.md | 494 +++++++++++++++--- .../server/browsergym_environment.py | 224 ++++++-- .../browsergym_env/server/custom/__init__.py | 9 + .../server/custom/custom_base.py | 346 ++++++++++++ .../server/custom/custom_models.py | 107 ++++ .../server/custom/custom_tasks.py | 320 ++++++++++++ .../custom/tasks/copy-paste-source.html | 37 ++ .../custom/tasks/copy-paste-target.html | 94 ++++ .../server/custom/tasks/copy-paste.html | 93 ++++ 11 files changed, 1746 insertions(+), 113 deletions(-) create mode 100644 examples/browsergym_custom_example.py create mode 100644 src/envs/browsergym_env/server/custom/__init__.py create mode 100644 src/envs/browsergym_env/server/custom/custom_base.py create mode 100644 src/envs/browsergym_env/server/custom/custom_models.py create mode 100644 src/envs/browsergym_env/server/custom/custom_tasks.py create mode 100644 src/envs/browsergym_env/server/custom/tasks/copy-paste-source.html create mode 100644 src/envs/browsergym_env/server/custom/tasks/copy-paste-target.html create mode 100644 src/envs/browsergym_env/server/custom/tasks/copy-paste.html diff --git a/.gitignore b/.gitignore index 29ead01a..c561ab5e 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,7 @@ coverage.xml .env .venv venv/ +myenv/ ENV/ env.bak/ venv.bak/ diff --git a/examples/browsergym_custom_example.py b/examples/browsergym_custom_example.py new file mode 100644 index 00000000..c7a7a189 --- /dev/null +++ 
b/examples/browsergym_custom_example.py @@ -0,0 +1,134 @@ +"""Example usage of custom BrowserGym tasks. + +This script demonstrates how to create and use custom tasks with the +BrowserGym environment wrapper in OpenEnv. +""" + +import sys +import os +import time + +# Add src to path for imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', '..')) + +from envs.browsergym_env.server.browsergym_environment import BrowserGymEnvironment +from envs.browsergym_env.models import BrowserGymAction + + +def multi_tab_copy_paste_example(): + """Run the multi-tab copy-paste example.""" + + print("Multi-Tab Copy-Paste Task Example") + print("-" * 80) + + # Create environment + env = BrowserGymEnvironment( + benchmark="custom", + task_name="copy-paste-multitab", + headless=False, + viewport_width=1280, + viewport_height=720, + timeout=10000.0, + ) + + # Reset environment + obs = env.reset() + print(f"Goal: {obs.goal}\n") + + # Solve the multi-tab task- simulates user actions + steps = [ + ("Select source text", "click('#source-text')"), + ("Select all text", "press('Control+A')"), + ("Copy text", "press('Control+C')"), + ("Navigate to target page", "click('#open-target-btn')"), + ("Click target input field", "click('#target-text')"), + ("Paste text", "press('Control+V')"), + ("Submit form", "click('#submit-btn')"), + ] + + for i, (description, action_str) in enumerate(steps, 1): + print(f"Step {i}: {description}") + action = BrowserGymAction(action_str=action_str) + obs = env.step(action) + + # Show which page we're on + current_page = "unknown" + if obs.metadata and 'custom_data' in obs.metadata: + current_page = obs.metadata['custom_data'].get('current_page', 'unknown') + + print(f" Reward: {obs.reward}, Done: {obs.done}, Page: {current_page}") + + # Add delay to see the browser actions + time.sleep(1) + + if obs.done: + print(f"\nāœ“ Task completed! 
Total reward: {env.state.cum_reward}") + break + + env.close() + print("-" * 80) + +def single_tab_copy_paste_example(): + """Run the single-tab copy-paste example.""" + + print("Custom BrowserGym Task Example: Copy-Paste") + print("-" * 80) + + # Create environment + env = BrowserGymEnvironment( + benchmark="custom", + task_name="copy-paste", + headless=False, + viewport_width=1280, + viewport_height=720, + timeout=10000.0, + ) + + # Reset environment + obs = env.reset() + print(f"Goal: {obs.goal}\n") + + # Solve the task + steps = [ + ("Click source text field", "click('#source-text')"), + ("Select all text", "press('Control+A')"), + ("Copy text", "press('Control+C')"), + ("Click target field", "click('#target-text')"), + ("Paste text", "press('Control+V')"), + ("Click submit button", "click('#submit-btn')"), + ] + + for i, (description, action_str) in enumerate(steps, 1): + print(f"Step {i}: {description}") + action = BrowserGymAction(action_str=action_str) + obs = env.step(action) + print(f" Reward: {obs.reward}, Done: {obs.done}") + + # Add delay to see the browser actions + time.sleep(1) + + if obs.done: + print(f"\nāœ“ Task completed! Total reward: {env.state.cum_reward}") + break + + env.close() + print("-" * 80) + +def main(): + """Run the custom task example.""" + + # Run single-tab copy-paste example + print("Single-Tab Copy-Paste") + single_tab_copy_paste_example() + + time.sleep(3) + + # Run multi-tab copy-paste example + print("\nMulti-Tab Copy-Paste") + multi_tab_copy_paste_example() + + +if __name__ == "__main__": + # Run main example + main() + diff --git a/src/envs/browsergym_env/README.md b/src/envs/browsergym_env/README.md index d730ce37..ca3a531b 100644 --- a/src/envs/browsergym_env/README.md +++ b/src/envs/browsergym_env/README.md @@ -25,7 +25,7 @@ BrowserGym provides a complete pipeline for developing web agents: train on simp ## Quick Start - Training (MiniWoB) -### No Setup Required! šŸŽ‰ +### No Setup Required! 
```python from envs.browsergym_env import BrowserGymEnv, BrowserGymAction @@ -58,7 +58,350 @@ for episode in range(1000): env.close() ``` -### Available Tasks by Benchmark +## Custom Tasks - Create Your Own Benchmarks + +In addition to official BrowserGym benchmarks (MiniWoB, WebArena, etc.), you can create **custom tasks** for domain-specific training or prototyping. + +### Why Custom Tasks? + +**Official Benchmarks** (miniwob, webarena): +- Established, well-tested tasks +- Standardized evaluation +- Community benchmarks +- Fixed task set - can't add your own +- Requires BrowserGym package installation +- Must integrate with BrowserGym's registration system + +**Custom Tasks**: +- Create unlimited domain-specific tasks +- No BrowserGym package needed +- No registration complexity +- Full control over HTML, rewards, termination +- Rapid prototyping and iteration +- Not standardized (for research/training only) + +### Quick Start - Custom Tasks + +```python +from envs.browsergym_env import BrowserGymEnv, BrowserGymAction + +# Use a custom task (no BrowserGym installation needed!) 
+env = BrowserGymEnv.from_docker_image( + "ghcr.io/openenv/browsergym-env:latest", + environment={ + "BROWSERGYM_BENCHMARK": "custom", + "BROWSERGYM_TASK_NAME": "copy-paste", # or "copy-paste-multitab" + } +) + +# Train on your custom task +result = env.reset() +print(f"Goal: {result.observation.goal}") + +action = BrowserGymAction(action_str="click('#source-text')") +result = env.step(action) +print(f"Reward: {result.reward}") + +env.close() +``` + +### Available Custom Tasks + +| Task Name | Description | Difficulty | Multi-Page | +|-----------|-------------|------------|------------| +| `copy-paste` | Copy text from one field to another | Easy | No | +| `copy-paste-multitab` | Copy text across two pages | Medium | Yes | + +### Action Format Reference + +Custom tasks support BrowserGym-style action strings: + +- **Click**: `click('button')` or `click('#submit')` or `click('.classname')` +- **Fill**: `fill('input[name="username"]', 'john@example.com')` +- **Navigate**: `goto('https://example.com')` or `goto('file:///path/to/page.html')` +- **Press key**: `press('Enter')` or `press('Control+C')` +- **Scroll**: `scroll('down')` or `scroll('up')` +- **Custom JavaScript**: Any other string is executed as JavaScript in the browser context + +**Examples:** +```python +# Click actions +BrowserGymAction(action_str="click('#submit-btn')") +BrowserGymAction(action_str="click('button.primary')") + +# Fill forms +BrowserGymAction(action_str="fill('#email', 'user@example.com')") +BrowserGymAction(action_str="fill('input[name=\"password\"]', 'secret123')") + +# Keyboard +BrowserGymAction(action_str="press('Tab')") +BrowserGymAction(action_str="press('Control+A')") + +# Navigation +BrowserGymAction(action_str="goto('https://example.com')") + +# JavaScript (for complex interactions) +BrowserGymAction(action_str="document.querySelector('#dropdown').value = 'option2'") +``` + +### Creating Custom Tasks + +Custom tasks are defined in `server/custom/custom_tasks.py`. 
Each task needs: + +1. **Task HTML** - Minimal HTML page(s) with your UI +2. **Python Task Class** - Defines behavior, rewards, termination +3. **Registration** - Add to task registry + +**File Structure:** +``` +server/custom/ + custom_models.py # CustomGymAction, CustomGymObservation, CustomGymState + custom_base.py # Base class for custom environments + custom_tasks.py # Task registry and implementations + tasks/ # HTML files for tasks + copy-paste.html + copy-paste-source.html + copy-paste-target.html +``` + +**Design Philosophy** (Following Official Benchmarks): + + **DO:** +- Keep HTML minimal and functional (like MiniWoB) +- Let agents figure out what to do from task description +- Use simple, clean styling +- Focus on task logic, not visual appeal + + **DON'T:** +- Add step-by-step instructions in HTML +- Use fancy animations or gradients +- Add visual hints or progress indicators +- Use emojis or decorative elements + +**Example: Single-Page Task** + +```python +# In server/custom/custom_tasks.py +from custom_base import CustomBrowserGymEnvironment + +class MyCustomTask(CustomBrowserGymEnvironment): + def _get_task_url(self) -> str: + """Return path to your HTML file.""" + import os + task_html = os.path.join( + os.path.dirname(__file__), + "tasks", + "my-task.html" + ) + return f"file://{task_html}" + + def _get_goal_description(self) -> str: + """Return task instruction for the agent.""" + return "Click the submit button after filling the form" + + async def _extract_observation(self, page) -> dict: + """Extract state from the page.""" + content = await page.content() + form_valid = await page.evaluate( + "document.querySelector('form')?.checkValidity() || false" + ) + + return { + "text": content, # Full HTML for agent + "pruned_html": content[:1000], # Truncated version + "custom_data": { + "form_valid": form_valid, + } + } + + def _calculate_reward(self, page_data, action, error=None) -> float: + """Calculate reward based on page state.""" + if error: 
+ return -0.1 # Small penalty for errors + + custom_data = page_data.get("custom_data", {}) + if custom_data.get("form_valid"): + return 1.0 # Success! + + return 0.0 # No progress + + def _check_done(self, page_data) -> bool: + """Check if task is complete.""" + custom_data = page_data.get("custom_data", {}) + return custom_data.get("form_valid", False) + +# Register your task in server/custom/custom_tasks.py +register_custom_task("my-task", MyCustomTask) +``` + +**Step-by-Step Registration:** +1. Create your task class in `server/custom/custom_tasks.py` (or import it) +2. Call `register_custom_task("task-name", YourTaskClass)` at the bottom of the file +3. Create HTML file(s) in `server/custom/tasks/` directory if needed +4. Use with `BROWSERGYM_TASK_NAME="task-name"` + +**Example: Multi-Page Task** + +```python +class MyMultiPageTask(CustomBrowserGymEnvironment): + async def _extract_observation(self, page) -> dict: + content = await page.content() + current_url = page.url + + # Determine which page we're on + if "page1" in current_url: + data = await page.evaluate("getPage1Data()") + return { + "text": content, + "custom_data": {"current_page": "page1", **data} + } + elif "page2" in current_url: + data = await page.evaluate("getPage2Data()") + return { + "text": content, + "custom_data": {"current_page": "page2", **data} + } + + return {"text": content, "custom_data": {}} + + def _calculate_reward(self, page_data, action, error=None) -> float: + """Reward for navigation and completion.""" + custom_data = page_data.get("custom_data", {}) + current_page = custom_data.get("current_page") + + # Reward for successfully navigating to page2 + if current_page == "page2" and "goto" in action.lower(): + return 0.3 + + # Reward for task completion on page2 + if current_page == "page2" and custom_data.get("task_complete"): + return 1.0 + + return 0.0 + +register_custom_task("my-multitab-task", MyMultiPageTask) +``` + +### Custom Task HTML Guidelines + +Follow official 
benchmark style (MiniWoB, WebArena): + +```html + + + + My Task + + + + + + + + + + + + + + +``` + +**Key Principles:** +- No visual hints or progress bars +- No step-by-step instructions in HTML +- No emojis or decorative elements +- Simple, clean, functional UI +- Agent figures out task from goal description + +### Custom vs Official Benchmarks + +| Aspect | Official (miniwob, webarena) | Custom | +|--------|----------------------------|--------| +| **Installation** | Requires browsergym-{benchmark} | No packages needed | +| **Task Creation** | Fixed task set | Unlimited custom tasks | +| **Registration** | gym.make() system | Simple Python registry | +| **Browser Control** | BrowserGym internals | Playwright directly | +| **HTML Location** | BrowserGym package | Local server/custom/ directory | +| **Use Case** | Standardized evaluation | Rapid prototyping, domain-specific training | +| **Community** | Established benchmarks | Your own tasks | + +### When to Use Custom Tasks + + **Use Custom Tasks For:** +- Rapid prototyping of new task ideas +- Domain-specific training (e.g., corporate workflows, specialized forms) +- Testing new agent architectures quickly +- Educational purposes +- Tasks not covered by official benchmarks + + **Use Official Benchmarks For:** +- Publishing research results +- Comparing with other papers +- Standardized evaluation +- Established task benchmarks + +### Advanced: Custom Task Features + +**Dynamic Task Generation:** +```python +class DynamicFormTask(CustomBrowserGymEnvironment): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.num_fields = self.custom_params.get("num_fields", 3) + + def _get_task_url(self) -> str: + # Generate HTML dynamically + html = self._generate_form_html(self.num_fields) + # Use data: URL + return f"data:text/html,{html}" + +# Use with custom parameters +env = BrowserGymEnv(environment={ + "BROWSERGYM_BENCHMARK": "custom", + "BROWSERGYM_TASK_NAME": "dynamic-form", + "num_fields": "5" # 
Custom parameter +}) +``` + +**State Persistence Across Pages:** +```python +class MultiPageWithState(CustomBrowserGymEnvironment): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.task_state = {} # Persistent state + + def _calculate_reward(self, page_data, action, error=None) -> float: + # Access state from previous pages + if self.task_state.get("collected_item"): + return 1.0 + return 0.0 +``` + +**See also:** +- `server/custom/README.md` - Detailed custom task documentation +- `server/custom/custom_tasks.py` - Example implementations +- `examples/browsergym_custom_example.py` - Usage examples + +--- + +## Available Tasks by Benchmark #### MiniWoB++ Tasks (Training - 100+ tasks) @@ -67,72 +410,72 @@ MiniWoB tasks are organized by difficulty and type. Here are the main categories **Click Tasks** (Basic interaction) | Task Name | Description | Difficulty | |-----------|-------------|------------| -| `click-test` | Click a single button | ⭐ Easy | -| `click-button` | Click button with specific text | ⭐ Easy | -| `click-button-sequence` | Click buttons in order | ⭐⭐ Medium | -| `click-checkboxes` | Select specific checkboxes | ⭐⭐ Medium | -| `click-checkboxes-soft` | Select checkboxes (multiple valid) | ⭐⭐ Medium | -| `click-checkboxes-large` | Many checkboxes to select from | ⭐⭐ Medium | -| `click-checkboxes-transfer` | Transfer learning variation | ⭐⭐ Medium | -| `click-dialog` | Click correct button in dialog | ⭐ Easy | -| `click-dialog-2` | More complex dialog | ⭐⭐ Medium | -| `click-link` | Click on a link | ⭐ Easy | -| `click-option` | Select from dropdown | ⭐⭐ Medium | -| `click-pie` | Click on pie chart slice | ⭐⭐ Medium | -| `click-scroll-list` | Click item in scrollable list | ⭐⭐⭐ Hard | -| `click-shades` | Click on specific color shade | ⭐⭐ Medium | -| `click-shape` | Click on specific shape | ⭐⭐ Medium | -| `click-tab` | Switch between tabs | ⭐⭐ Medium | -| `click-tab-2` | More complex tab switching | ⭐⭐⭐ Hard | -| `click-widget` | Click on 
UI widget | ⭐⭐ Medium | +| `click-test` | Click a single button | Easy | +| `click-button` | Click button with specific text | Easy | +| `click-button-sequence` | Click buttons in order | Medium | +| `click-checkboxes` | Select specific checkboxes | Medium | +| `click-checkboxes-soft` | Select checkboxes (multiple valid) | Medium | +| `click-checkboxes-large` | Many checkboxes to select from | Medium | +| `click-checkboxes-transfer` | Transfer learning variation | Medium | +| `click-dialog` | Click correct button in dialog | Easy | +| `click-dialog-2` | More complex dialog | Medium | +| `click-link` | Click on a link | Easy | +| `click-option` | Select from dropdown | Medium | +| `click-pie` | Click on pie chart slice | Medium | +| `click-scroll-list` | Click item in scrollable list | Hard | +| `click-shades` | Click on specific color shade | Medium | +| `click-shape` | Click on specific shape | Medium | +| `click-tab` | Switch between tabs | Medium | +| `click-tab-2` | More complex tab switching | Hard | +| `click-widget` | Click on UI widget | Medium | **Text Entry Tasks** (Typing and forms) | Task Name | Description | Difficulty | |-----------|-------------|------------| -| `enter-text` | Type text into input field | ⭐ Easy | -| `enter-text-dynamic` | Dynamic text entry | ⭐⭐ Medium | -| `enter-text-2` | Multiple text fields | ⭐⭐ Medium | -| `enter-password` | Fill password field | ⭐ Easy | -| `enter-date` | Enter a date | ⭐⭐ Medium | -| `enter-time` | Enter a time | ⭐⭐ Medium | -| `login-user` | Complete login form | ⭐⭐ Medium | -| `login-user-popup` | Login via popup | ⭐⭐⭐ Hard | +| `enter-text` | Type text into input field | Easy | +| `enter-text-dynamic` | Dynamic text entry | Medium | +| `enter-text-2` | Multiple text fields | Medium | +| `enter-password` | Fill password field | Easy | +| `enter-date` | Enter a date | Medium | +| `enter-time` | Enter a time | Medium | +| `login-user` | Complete login form | Medium | +| `login-user-popup` | Login via popup | 
Hard | **Navigation Tasks** (Multi-step interaction) | Task Name | Description | Difficulty | |-----------|-------------|------------| -| `navigate-tree` | Navigate through tree structure | ⭐⭐⭐ Hard | -| `search-engine` | Use search interface | ⭐⭐ Medium | -| `use-autocomplete` | Interact with autocomplete | ⭐⭐⭐ Hard | -| `book-flight` | Book a flight (complex form) | ⭐⭐⭐⭐ Very Hard | -| `choose-date` | Pick date from calendar | ⭐⭐⭐ Hard | -| `choose-date-easy` | Simplified date picker | ⭐⭐ Medium | -| `choose-date-medium` | Medium difficulty date picker | ⭐⭐⭐ Hard | -| `choose-list` | Select from long list | ⭐⭐ Medium | +| `navigate-tree` | Navigate through tree structure | Hard | +| `search-engine` | Use search interface | Medium | +| `use-autocomplete` | Interact with autocomplete | Hard | +| `book-flight` | Book a flight (complex form) | Very Hard | +| `choose-date` | Pick date from calendar | Hard | +| `choose-date-easy` | Simplified date picker | Medium | +| `choose-date-medium` | Medium difficulty date picker | Hard | +| `choose-list` | Select from long list | Medium | **Visual/Spatial Tasks** (Requires visual understanding) | Task Name | Description | Difficulty | |-----------|-------------|------------| -| `count-sides` | Count sides of shape | ⭐⭐ Medium | -| `count-shape` | Count specific shapes | ⭐⭐ Medium | -| `find-word` | Find word in text | ⭐⭐ Medium | -| `focus-text` | Focus on text element | ⭐ Easy | -| `focus-text-2` | More complex focus task | ⭐⭐ Medium | -| `grid-coordinate` | Click grid coordinate | ⭐⭐ Medium | -| `guess-number` | Guess a number game | ⭐⭐⭐ Hard | -| `identify-shape` | Identify shape type | ⭐⭐ Medium | -| `read-table` | Extract info from table | ⭐⭐⭐ Hard | -| `read-table-2` | More complex table reading | ⭐⭐⭐ Hard | +| `count-sides` | Count sides of shape | Medium | +| `count-shape` | Count specific shapes | Medium | +| `find-word` | Find word in text | Medium | +| `focus-text` | Focus on text element | Easy | +| `focus-text-2` | 
More complex focus task | Medium | +| `grid-coordinate` | Click grid coordinate | Medium | +| `guess-number` | Guess a number game | Hard | +| `identify-shape` | Identify shape type | Medium | +| `read-table` | Extract info from table | Hard | +| `read-table-2` | More complex table reading | Hard | **Email/Social Tasks** (Realistic scenarios) | Task Name | Description | Difficulty | |-----------|-------------|------------| -| `email-inbox` | Manage email inbox | ⭐⭐⭐⭐ Very Hard | -| `email-inbox-forward` | Forward emails | ⭐⭐⭐⭐ Very Hard | -| `email-inbox-nl` | Natural language email task | ⭐⭐⭐⭐ Very Hard | -| `email-inbox-star-reply` | Star and reply to emails | ⭐⭐⭐⭐ Very Hard | -| `social-media` | Social media interaction | ⭐⭐⭐⭐ Very Hard | -| `social-media-some` | Partial social media task | ⭐⭐⭐ Hard | +| `email-inbox` | Manage email inbox | Very Hard | +| `email-inbox-forward` | Forward emails | Very Hard | +| `email-inbox-nl` | Natural language email task | Very Hard | +| `email-inbox-star-reply` | Star and reply to emails | Very Hard | +| `social-media` | Social media interaction | Very Hard | +| `social-media-some` | Partial social media task | Hard | **Total:** 100+ tasks across all categories @@ -416,7 +759,26 @@ Environment variables: ## Supported Benchmarks -### 1. MiniWoB++ (Training) ✅ Recommended for Training +### 1. Custom Tasks (Rapid Prototyping) For Development + +- **Unlimited tasks**: Create domain-specific tasks +- **No dependencies**: No BrowserGym package needed +- **Instant iteration**: Modify HTML and logic quickly +- **Full control**: Define rewards, termination, UI +- **Fast setup**: Just add Python class and HTML file + +**Use Case**: Rapid prototyping, domain-specific training, testing new ideas + +**Tasks**: `copy-paste`, `copy-paste-multitab`, *[your tasks here]* + +```python +env = BrowserGymEnv(environment={ + "BROWSERGYM_BENCHMARK": "custom", + "BROWSERGYM_TASK_NAME": "copy-paste" +}) +``` + +### 2. 
MiniWoB++ (Training) Recommended for Training - **100+ tasks** ranging from simple (click buttons) to complex (form filling, navigation) - **Fast**: Instant resets, quick episodes @@ -426,7 +788,7 @@ Environment variables: **Use Case**: Train agents on fundamental web navigation skills -### 2. WebArena (Evaluation) 📊 Benchmark +### 3. WebArena (Evaluation) Benchmark - **812 realistic tasks** across 6 websites - **Complex**: Multi-step reasoning, real web interfaces @@ -436,7 +798,7 @@ Environment variables: **Use Case**: Evaluate agents on realistic web tasks -### 3. VisualWebArena (Evaluation) 👁️ Visual Benchmark +### 4. VisualWebArena (Evaluation) Visual Benchmark - **910 tasks** requiring visual understanding - **Multimodal**: Both text and visual observations @@ -445,7 +807,7 @@ Environment variables: **Use Case**: Test visual web navigation capabilities -### 4. WorkArena (Evaluation) 💼 Enterprise Benchmark +### 5. WorkArena (Evaluation) Enterprise Benchmark - **Enterprise tasks**: CRM, project management, etc. 
- **Realistic workflows**: Real enterprise software @@ -517,16 +879,16 @@ python app.py ``` browsergym_env/ -├── __init__.py # Module exports -├── models.py # Action, Observation, State dataclasses -├── client.py # HTTPEnvClient implementation -├── README.md # This file -└── server/ - ├── __init__.py - ├── app.py # FastAPI application - ├── browsergym_environment.py # Environment implementation - ├── Dockerfile # Container specification - └── requirements.txt # Python dependencies + __init__.py # Module exports + models.py # Action, Observation, State dataclasses + client.py # HTTPEnvClient implementation + README.md # This file + server/ + __init__.py + app.py # FastAPI application + browsergym_environment.py # Environment implementation + Dockerfile # Container specification + requirements.txt # Python dependencies ``` ## References diff --git a/src/envs/browsergym_env/server/browsergym_environment.py b/src/envs/browsergym_env/server/browsergym_environment.py index 42f30a6f..da132bcf 100644 --- a/src/envs/browsergym_env/server/browsergym_environment.py +++ b/src/envs/browsergym_env/server/browsergym_environment.py @@ -9,7 +9,9 @@ """ import importlib -from typing import Any, Dict, Optional +import os +import sys +from typing import Any, Dict, Optional, TYPE_CHECKING from uuid import uuid4 import gymnasium as gym @@ -21,6 +23,39 @@ BrowserGymState, ) +# Add the server directory to sys.path to allow custom module imports +_SERVER_DIR = os.path.dirname(os.path.abspath(__file__)) +if _SERVER_DIR not in sys.path: + sys.path.insert(0, _SERVER_DIR) + +# Import custom models for custom benchmark +# Use TYPE_CHECKING to avoid runtime import issues with type hints +if TYPE_CHECKING: + from custom.custom_models import ( + CustomGymAction, + CustomGymObservation, + CustomGymState, + ) + +try: + from custom.custom_models import ( + CustomGymAction as _CustomGymAction, + CustomGymObservation as _CustomGymObservation, + 
CustomGymState as _CustomGymState, + ) + CUSTOM_AVAILABLE = True + CustomGymAction = _CustomGymAction + CustomGymObservation = _CustomGymObservation + CustomGymState = _CustomGymState + _CUSTOM_IMPORT_ERROR = None +except ImportError as e: + CUSTOM_AVAILABLE = False + CustomGymAction = None # type: ignore + CustomGymObservation = None # type: ignore + CustomGymState = None # type: ignore + _CUSTOM_IMPORT_ERROR = str(e) + + class BrowserGymEnvironment(Environment): """BrowserGym environment wrapper for OpenEnv. @@ -61,53 +96,99 @@ def __init__( self.timeout = timeout self.gym_kwargs = gym_kwargs - # Build environment ID - if task_name: - self.env_id = f"browsergym/{benchmark}.{task_name}" + # Check if this is a custom benchmark + self.is_custom = benchmark == "custom" + + if self.is_custom: + # Handle custom benchmark differently + if not CUSTOM_AVAILABLE: + raise ValueError( + f"Custom benchmark requested but custom models not available.\n" + f"Import error: {_CUSTOM_IMPORT_ERROR}\n" + f"Make sure custom/custom_models.py exists in {_SERVER_DIR}/custom/" + ) + + if not task_name: + raise ValueError("task_name is required for custom benchmark") + + # Import and instantiate custom environment + try: + from custom.custom_tasks import get_custom_task + self.custom_env = get_custom_task( + task_name=task_name, + headless=headless, + viewport_width=viewport_width, + viewport_height=viewport_height, + timeout=timeout, + **gym_kwargs + ) + except ImportError as e: + raise ValueError( + f"Failed to import custom task '{task_name}': {e}\n" + f"Make sure the task is registered in custom/custom_tasks.py" + ) from e + + self.gym_env = None + self.env_id = f"custom/{task_name}" + + # Use CustomGymState for custom benchmarks + self._state = CustomGymState( + episode_id=str(uuid4()), + step_count=0, + benchmark="custom", + task_name=task_name, + ) else: - self.env_id = f"browsergym/{benchmark}" - - # force import the benchmark module - benchmark_modules = { - "miniwob": 
"browsergym.envs.miniwob", - "webarena": "browsergym.envs.webarena", - "visualwebarena": "browsergym.envs.visualwebarena", - "workarena": "browsergym.envs.workarena", - } - module_path = benchmark_modules.get(benchmark) - try: - if module_path: - importlib.import_module(module_path) + # Original BrowserGym benchmark handling + # Build environment ID + if task_name: + self.env_id = f"browsergym/{benchmark}.{task_name}" else: - importlib.import_module("browsergym") - except ModuleNotFoundError as import_error: - raise ValueError( - f"Failed to import BrowserGym benchmark '{benchmark}': {import_error}\n" - f"Make sure the package browsergym-{benchmark} is installed." - ) from import_error - - # Create the BrowserGym environment - try: - self.gym_env = gym.make( - self.env_id, - headless=headless, - viewport={"width": viewport_width, "height": viewport_height}, - timeout=timeout, - **gym_kwargs, + self.env_id = f"browsergym/{benchmark}" + + # force import the benchmark module + benchmark_modules = { + "miniwob": "browsergym.envs.miniwob", + "webarena": "browsergym.envs.webarena", + "visualwebarena": "browsergym.envs.visualwebarena", + "workarena": "browsergym.envs.workarena", + } + module_path = benchmark_modules.get(benchmark) + try: + if module_path: + importlib.import_module(module_path) + else: + importlib.import_module("browsergym") + except ModuleNotFoundError as import_error: + raise ValueError( + f"Failed to import BrowserGym benchmark '{benchmark}': {import_error}\n" + f"Make sure the package browsergym-{benchmark} is installed." 
+ ) from import_error + + # Create the BrowserGym environment + try: + self.gym_env = gym.make( + self.env_id, + headless=headless, + viewport={"width": viewport_width, "height": viewport_height}, + timeout=timeout, + **gym_kwargs, + ) + except Exception as e: + raise ValueError( + f"Failed to create BrowserGym environment '{self.env_id}': {e}\n" + f"Make sure the benchmark is installed (e.g., pip install browsergym-{benchmark})" + ) from e + + # State tracking for standard benchmarks + self._state = BrowserGymState( + episode_id=str(uuid4()), + step_count=0, + benchmark=benchmark, + task_name=task_name or "", ) - except Exception as e: - raise ValueError( - f"Failed to create BrowserGym environment '{self.env_id}': {e}\n" - f"Make sure the benchmark is installed (e.g., pip install browsergym-{benchmark})" - ) - - # State tracking - self._state = BrowserGymState( - episode_id=str(uuid4()), - step_count=0, - benchmark=benchmark, - task_name=task_name or "", - ) + + self.custom_env = None self._last_obs: Optional[Dict[str, Any]] = None self._last_info: Optional[Dict[str, Any]] = None @@ -126,6 +207,14 @@ def reset( Returns: Initial observation for the task """ + if self.is_custom: + # Handle custom environment reset + obs = self.custom_env.reset(seed=seed) + self._state = self.custom_env.state + # Convert CustomGymObservation to BrowserGymObservation + return self._convert_custom_observation(obs) + + # Original BrowserGym handling # Generate new episode ID self._state = BrowserGymState( episode_id=str(uuid4()), @@ -157,6 +246,15 @@ def step(self, action: BrowserGymAction) -> BrowserGymObservation: Returns: Observation after executing the action """ + if self.is_custom: + # Convert BrowserGymAction to CustomGymAction + custom_action = CustomGymAction(action_str=action.action_str) + obs = self.custom_env.step(custom_action) + self._state = self.custom_env.state + # Convert CustomGymObservation to BrowserGymObservation + return self._convert_custom_observation(obs) + + 
# Original BrowserGym handling self._state.step_count += 1 # Execute action in gym environment @@ -260,6 +358,34 @@ def _create_observation( metadata=browsergym_metadata, ) + def _convert_custom_observation( + self, custom_obs: "CustomGymObservation" # type: ignore + ) -> BrowserGymObservation: + """Convert CustomGymObservation to BrowserGymObservation. + + Args: + custom_obs: Custom observation to convert + + Returns: + BrowserGymObservation + """ + return BrowserGymObservation( + text=custom_obs.text, + url=custom_obs.url, + screenshot=custom_obs.screenshot, + goal=custom_obs.goal, + axtree_txt=custom_obs.axtree_txt, + pruned_html=custom_obs.pruned_html, + error=custom_obs.error, + last_action_error=custom_obs.last_action_error, + done=custom_obs.done, + reward=custom_obs.reward, + metadata={ + "custom_data": custom_obs.custom_data, + **(custom_obs.metadata or {}), + }, + ) + @property def state(self) -> BrowserGymState: """Get the current environment state.""" @@ -267,5 +393,9 @@ def state(self) -> BrowserGymState: def close(self) -> None: """Clean up environment resources.""" - if hasattr(self, "gym_env"): - self.gym_env.close() + if self.is_custom: + if self.custom_env: + self.custom_env.close() + else: + if hasattr(self, "gym_env"): + self.gym_env.close() diff --git a/src/envs/browsergym_env/server/custom/__init__.py b/src/envs/browsergym_env/server/custom/__init__.py new file mode 100644 index 00000000..885973c4 --- /dev/null +++ b/src/envs/browsergym_env/server/custom/__init__.py @@ -0,0 +1,9 @@ +"""Custom BrowserGym tasks module. + +This module provides custom task functionality for BrowserGym environments. +Custom tasks are registered in custom_tasks.py and can be used by setting +benchmark="custom" in BrowserGymEnvironment. 
+""" + +# The custom tasks are registered in custom_tasks.py +# No need to import anything here - imports happen when needed diff --git a/src/envs/browsergym_env/server/custom/custom_base.py b/src/envs/browsergym_env/server/custom/custom_base.py new file mode 100644 index 00000000..043ffccd --- /dev/null +++ b/src/envs/browsergym_env/server/custom/custom_base.py @@ -0,0 +1,346 @@ +"""Base custom environment for BrowserGym custom tasks. + +This module provides a base class for creating custom BrowserGym tasks that +are not part of the official benchmarks. It simulates the BrowserGym gym +environment interface using Playwright directly. +""" + +import asyncio +import sys +import os +from abc import abstractmethod +from typing import Any, Dict, Optional +from uuid import uuid4 + +from playwright.async_api import async_playwright, Browser, Page, Playwright + +# Add current directory to path for relative imports +_CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) +if _CURRENT_DIR not in sys.path: + sys.path.insert(0, _CURRENT_DIR) + +from custom_models import ( + CustomGymAction, + CustomGymObservation, + CustomGymState, +) + + +class CustomBrowserGymEnvironment: + """Base class for custom BrowserGym environments. + + This class provides the basic Gym-like interface (reset, step, close) + but uses Playwright directly instead of going through BrowserGym's + registration system. + + To create a custom task: + 1. Subclass this class + 2. Implement _get_task_url() to return the starting URL + 3. Implement _extract_observation() to parse page state + 4. Implement _calculate_reward() to compute rewards + 5. Implement _check_done() to determine episode termination + """ + + def __init__( + self, + task_name: str, + headless: bool = True, + viewport_width: int = 1280, + viewport_height: int = 720, + timeout: float = 10000.0, + max_steps: int = 50, + **kwargs: Any, + ): + """Initialize the custom environment. 
+ + Args: + task_name: Name of your custom task + headless: Whether to run browser in headless mode + viewport_width: Browser viewport width + viewport_height: Browser viewport height + timeout: Action timeout in milliseconds + max_steps: Maximum steps per episode + **kwargs: Additional custom parameters + """ + self.task_name = task_name + self.headless = headless + self.viewport_width = viewport_width + self.viewport_height = viewport_height + self.timeout = timeout + self.max_steps = max_steps + self.custom_params = kwargs + + # Playwright objects (initialized in reset) + self._playwright: Optional[Playwright] = None + self._browser: Optional[Browser] = None + self._page: Optional[Page] = None + self._event_loop: Optional[asyncio.AbstractEventLoop] = None + + # State tracking + self._state = CustomGymState( + episode_id=str(uuid4()), + step_count=0, + benchmark="custom", + task_name=task_name, + max_steps=max_steps, + ) + + @abstractmethod + def _get_task_url(self) -> str: + """Get the starting URL for this task. + + Returns: + URL to navigate to when resetting the environment + """ + pass + + @abstractmethod + def _get_goal_description(self) -> str: + """Get the goal/instruction for this task. + + Returns: + Human-readable description of the task goal + """ + pass + + @abstractmethod + async def _extract_observation(self, page: Page) -> Dict[str, Any]: + """Extract observation data from the current page state. + + Args: + page: Playwright Page object + + Returns: + Dictionary with observation data (text, axtree_txt, etc.) + """ + pass + + @abstractmethod + def _calculate_reward( + self, + page_data: Dict[str, Any], + action: str, + error: Optional[str] = None + ) -> float: + """Calculate reward for the current step. 
+ + Args: + page_data: Data extracted from _extract_observation + action: Action that was executed + error: Error message if action failed + + Returns: + Reward value + """ + pass + + @abstractmethod + def _check_done(self, page_data: Dict[str, Any]) -> bool: + """Check if the episode should terminate. + + Args: + page_data: Data extracted from _extract_observation + + Returns: + True if episode should end, False otherwise + """ + pass + + def _get_or_create_event_loop(self) -> asyncio.AbstractEventLoop: + """Get or create an event loop for async operations.""" + try: + loop = asyncio.get_event_loop() + if loop.is_closed(): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop + + async def _async_reset(self, seed: Optional[int] = None) -> CustomGymObservation: + """Async implementation of reset.""" + # Generate new episode ID + self._state = CustomGymState( + episode_id=str(uuid4()), + step_count=0, + benchmark="custom", + task_name=self.task_name, + max_steps=self.max_steps, + ) + + # Initialize Playwright if needed + if self._playwright is None: + self._playwright = await async_playwright().start() + self._browser = await self._playwright.chromium.launch( + headless=self.headless + ) + + # Create new page + if self._page: + await self._page.close() + + self._page = await self._browser.new_page( + viewport={ + "width": self.viewport_width, + "height": self.viewport_height, + } + ) + + # Set timeout + self._page.set_default_timeout(self.timeout) + + # Navigate to task URL + task_url = self._get_task_url() + await self._page.goto(task_url) + + # Extract initial observation + page_data = await self._extract_observation(self._page) + goal = self._get_goal_description() + + self._state.current_url = self._page.url + self._state.goal = goal + + return CustomGymObservation( + text=page_data.get("text", ""), + url=self._page.url, + 
screenshot=page_data.get("screenshot"), + goal=goal, + axtree_txt=page_data.get("axtree_txt", ""), + pruned_html=page_data.get("pruned_html", ""), + error="", + last_action_error=False, + done=False, + reward=0.0, + custom_data=page_data.get("custom_data"), + ) + + async def _async_step(self, action_str: str) -> CustomGymObservation: + """Async implementation of step.""" + self._state.step_count += 1 + + error_msg = "" + last_action_error = False + + try: + # Execute the action + # BrowserGym actions are Python-like function calls + # We need to parse and execute them + await self._execute_action(action_str) + + except Exception as e: + error_msg = str(e) + last_action_error = True + + # Extract observation + page_data = await self._extract_observation(self._page) + + # Calculate reward + reward = self._calculate_reward(page_data, action_str, error_msg) + self._state.cum_reward += reward + + # Check if done + done = self._check_done(page_data) or self._state.step_count >= self.max_steps + + # Update state + self._state.current_url = self._page.url + + return CustomGymObservation( + text=page_data.get("text", ""), + url=self._page.url, + screenshot=page_data.get("screenshot"), + goal=self._state.goal, + axtree_txt=page_data.get("axtree_txt", ""), + pruned_html=page_data.get("pruned_html", ""), + error=error_msg, + last_action_error=last_action_error, + done=done, + reward=reward, + custom_data=page_data.get("custom_data"), + ) + + async def _execute_action(self, action_str: str) -> None: + """Execute a BrowserGym-style action string. 
+ + Args: + action_str: Action string like "click('button')" or "fill('input', 'text')" + """ + # Simple action parser - you can make this more sophisticated + action_str = action_str.strip() + + if action_str.startswith("click("): + # Extract selector from click('selector') + selector = action_str[6:-1].strip("'\"") + await self._page.click(selector) + + elif action_str.startswith("fill("): + # Extract selector and text from fill('selector', 'text') + parts = action_str[5:-1].split(",", 1) + selector = parts[0].strip().strip("'\"") + text = parts[1].strip().strip("'\"") if len(parts) > 1 else "" + await self._page.fill(selector, text) + + elif action_str.startswith("goto("): + # Extract URL from goto('url') + url = action_str[5:-1].strip("'\"") + await self._page.goto(url) + + elif action_str.startswith("press("): + # Extract key from press('key') + key = action_str[6:-1].strip("'\"") + await self._page.keyboard.press(key) + + elif action_str.startswith("scroll("): + # Extract direction from scroll('direction') + direction = action_str[7:-1].strip("'\"") + if direction == "down": + await self._page.mouse.wheel(0, 500) + elif direction == "up": + await self._page.mouse.wheel(0, -500) + + else: + # Try to execute as JavaScript if not recognized + await self._page.evaluate(action_str) + + def reset(self, seed: Optional[int] = None) -> CustomGymObservation: + """Reset the environment. + + Args: + seed: Random seed for reproducibility + + Returns: + Initial observation + """ + loop = self._get_or_create_event_loop() + return loop.run_until_complete(self._async_reset(seed)) + + def step(self, action: CustomGymAction) -> CustomGymObservation: + """Execute an action. 
def close(self) -> None:
    """Shut down the page, browser and Playwright driver, if started.

    Does nothing when Playwright was never started (or was already shut
    down). Every shutdown step is attempted even if an earlier one raises,
    and the stored handles are cleared afterwards so that a repeated
    ``close()`` is a no-op and a later ``reset()`` launches a fresh browser
    instead of reusing closed objects.
    """
    if not self._playwright:
        return

    async def _async_close() -> None:
        try:
            if self._page:
                await self._page.close()
        finally:
            try:
                if self._browser:
                    await self._browser.close()
            finally:
                # Always stop the driver so its process is not left behind.
                await self._playwright.stop()

    loop = self._get_or_create_event_loop()
    try:
        loop.run_until_complete(_async_close())
    finally:
        # The handles are unusable from here on, whether or not shutdown
        # completed cleanly.
        self._page = None
        self._browser = None
        self._playwright = None
+ + Example actions: + - "click('Submit button')" + - "fill('username', 'john@example.com')" + - "goto('https://example.com')" + - "scroll(down)" + - "send_keys('Enter')" + """ + + action_str: str + """Natural language action string (e.g., "click('Submit')")""" + + metadata: Optional[Dict[str, Any]] = None + """Optional metadata for custom task-specific data""" + + +@dataclass(kw_only=True) +class CustomGymObservation(Observation): + """Observation returned from a custom BrowserGym environment. + + Contains multiple observation modalities including text (accessibility tree + or DOM), visual (screenshot), and page metadata, plus custom fields. + """ + + text: str = "" + """Text representation of the page (accessibility tree or DOM)""" + + url: str = "" + """Current URL of the page""" + + screenshot: Optional[List[List[List[int]]]] = None + """Screenshot as numpy array [height, width, channels] (if visual observation enabled)""" + + goal: str = "" + """Task goal/instruction for the current episode""" + + axtree_txt: str = "" + """Full accessibility tree as text""" + + pruned_html: str = "" + """Pruned HTML content (interactive elements only)""" + + error: str = "" + """Error message if action execution failed""" + + last_action_error: bool = False + """Whether the last action resulted in an error""" + + custom_data: Optional[Dict[str, Any]] = None + """Optional custom data specific to your task""" + + +@dataclass +class CustomGymState(State): + """State of a custom BrowserGym environment. + + Tracks the current task, and progress through an episode, plus custom state fields. 
+ """ + + benchmark: str = "custom" + """Benchmark name (always 'custom' for custom tasks)""" + + task_name: str = "" + """Specific custom task name (e.g., 'copy-paste', 'data-entry')""" + + task_id: Optional[str] = None + """Task ID for custom task tracking""" + + goal: str = "" + """Task goal/instruction""" + + current_url: str = "" + """Current URL of the active page""" + + max_steps: Optional[int] = None + """Maximum steps allowed for this task""" + + cum_reward: float = 0.0 + """Cumulative reward for the current episode""" + + custom_state: Optional[Dict[str, Any]] = None + """Optional custom state data specific to your task""" diff --git a/src/envs/browsergym_env/server/custom/custom_tasks.py b/src/envs/browsergym_env/server/custom/custom_tasks.py new file mode 100644 index 00000000..59a9798c --- /dev/null +++ b/src/envs/browsergym_env/server/custom/custom_tasks.py @@ -0,0 +1,320 @@ +"""Registry for custom BrowserGym tasks. + +This module provides a central place to register and retrieve custom tasks. +Add your custom tasks here to make them available through the BrowserGym environment. +""" + +import sys +import os +from typing import Any, Dict + +# Add current directory to path for relative imports +_CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) +if _CURRENT_DIR not in sys.path: + sys.path.insert(0, _CURRENT_DIR) + +from custom_base import CustomBrowserGymEnvironment + + +# Registry of custom tasks +_CUSTOM_TASKS: Dict[str, type] = {} + + +def register_custom_task(name: str, task_class: type) -> None: + """Register a custom task. 
+ + Args: + name: Task name (e.g., 'copy-paste', 'data-entry') + task_class: Class that extends CustomBrowserGymEnvironment + """ + if not issubclass(task_class, CustomBrowserGymEnvironment): + raise ValueError( + f"Task class must extend CustomBrowserGymEnvironment, got {task_class}" + ) + _CUSTOM_TASKS[name] = task_class + + +def get_custom_task(task_name: str, **kwargs: Any) -> CustomBrowserGymEnvironment: + """Get a custom task instance. + + Args: + task_name: Name of the task to retrieve + **kwargs: Arguments to pass to the task constructor + + Returns: + Instance of the custom task + + Raises: + ValueError: If task is not registered + """ + if task_name not in _CUSTOM_TASKS: + available = ", ".join(_CUSTOM_TASKS.keys()) or "none" + raise ValueError( + f"Custom task '{task_name}' not found. " + f"Available tasks: {available}. " + f"Register your task using register_custom_task()." + ) + + task_class = _CUSTOM_TASKS[task_name] + return task_class(task_name=task_name, **kwargs) + + +def list_custom_tasks() -> list[str]: + """List all registered custom tasks. + + Returns: + List of task names + """ + return list(_CUSTOM_TASKS.keys()) + + +# ============================================================================ +# Copy-Paste in a single page HTML task +# ============================================================================ + +class CopyPasteTask(CustomBrowserGymEnvironment): + """Copy text from one field and paste into another.""" + + def _get_task_url(self) -> str: + """Get the URL for the copy-paste task.""" + # This should point to a local HTML file or a URL hosting the task + import os + task_html = os.path.join( + os.path.dirname(__file__), + "tasks", + "copy-paste.html" + ) + return f"file://{task_html}" + + def _get_goal_description(self) -> str: + """Get the goal description.""" + return "Copy the text from the source field and paste it into the target field, then click Submit." 
+ + async def _extract_observation(self, page) -> dict: + """Extract observation from the page.""" + # Get the accessibility tree or HTML + try: + # Try to get the page content + content = await page.content() + + # Get the current values of source and target fields + source_value = await page.evaluate( + "document.querySelector('#source-text')?.value || ''" + ) + target_value = await page.evaluate( + "document.querySelector('#target-text')?.value || ''" + ) + + # Get success message if visible + success_msg = await page.evaluate( + "document.querySelector('#success-message')?.textContent || ''" + ) + + return { + "text": content, + "pruned_html": content[:1000], # Truncate for observation + "custom_data": { + "source_value": source_value, + "target_value": target_value, + "success_message": success_msg, + } + } + except Exception as e: + return { + "text": f"Error extracting observation: {e}", + "custom_data": {"error": str(e)} + } + + def _calculate_reward( + self, + page_data: dict, + action: str, + error: str | None = None + ) -> float: + """Calculate reward based on page state.""" + if error: + return -0.1 # Small penalty for errors + + custom_data = page_data.get("custom_data", {}) + + # Check if task is completed successfully + if "Success!" in custom_data.get("success_message", ""): + return 1.0 + + # Partial reward if text is copied correctly + source = custom_data.get("source_value", "") + target = custom_data.get("target_value", "") + + if source and target and source == target: + return 0.5 + + return 0.0 + + def _check_done(self, page_data: dict) -> bool: + """Check if the task is complete.""" + custom_data = page_data.get("custom_data", {}) + # Task is done if success message is shown + return "Success!" 
in custom_data.get("success_message", "") + + +# Register the example task +register_custom_task("copy-paste", CopyPasteTask) + + +# ============================================================================ +# Multi-Tab Copy-Paste Task +# ============================================================================ + +class CopyPasteMultiTabTask(CustomBrowserGymEnvironment): + """Copy text from one tab and paste it into another tab. + + This task demonstrates handling multiple browser tabs/pages. + The agent needs to: + 1. Copy text from the source page (tab 1) + 2. Navigate/switch to the target page (tab 2) + 3. Paste the text into the target field + 4. Submit the form + """ + + def _get_task_url(self) -> str: + """Get the URL for the first tab (source page).""" + import os + task_html = os.path.join( + os.path.dirname(__file__), + "tasks", + "copy-paste-source.html" + ) + return f"file://{task_html}" + + def _get_goal_description(self) -> str: + """Get the goal description.""" + return ( + "Copy the text from the source page, then navigate to the target page " + "(click 'Open Target Page' button), paste the text into the input field, " + "and click Submit." 
+ ) + + async def _extract_observation(self, page) -> dict: + """Extract observation from the current page.""" + try: + content = await page.content() + current_url = page.url + + # Determine which page we're on + if "source" in current_url: + # On source page + source_value = await page.evaluate( + "document.querySelector('#source-text')?.textContent || ''" + ) + + return { + "text": content, + "pruned_html": content[:1000], + "custom_data": { + "current_page": "source", + "source_value": source_value, + "task_step": "copy_from_source", + } + } + + elif "target" in current_url: + # On target page + target_value = await page.evaluate( + "document.querySelector('#target-text')?.value || ''" + ) + success_msg = await page.evaluate( + "document.querySelector('#success-message')?.textContent || ''" + ) + + return { + "text": content, + "pruned_html": content[:1000], + "custom_data": { + "current_page": "target", + "target_value": target_value, + "success_message": success_msg, + "task_step": "paste_to_target", + } + } + + else: + # Unknown page + return { + "text": content, + "custom_data": { + "current_page": "unknown", + "error": "Not on source or target page" + } + } + + except Exception as e: + return { + "text": f"Error extracting observation: {e}", + "custom_data": {"error": str(e)} + } + + def _calculate_reward( + self, + page_data: dict, + action: str, + error: str | None = None + ) -> float: + """Calculate reward based on page state and action.""" + if error: + return -0.1 + + custom_data = page_data.get("custom_data", {}) + current_page = custom_data.get("current_page", "") + + # Big reward for completing the task + if "Success!" 
in custom_data.get("success_message", ""): + return 1.0 + + # Small reward for successfully navigating to target page + if current_page == "target" and "goto" in action.lower(): + return 0.3 + + # Medium reward if text is pasted correctly in target + if current_page == "target": + target_value = custom_data.get("target_value", "") + # The expected text from source page + if target_value and "Hello from the source page!" in target_value: + return 0.6 + + return 0.0 + + def _check_done(self, page_data: dict) -> bool: + """Check if the task is complete.""" + custom_data = page_data.get("custom_data", {}) + return "Success!" in custom_data.get("success_message", "") + + +# Register the multi-tab task +register_custom_task("copy-paste-multitab", CopyPasteMultiTabTask) + + +# ============================================================================ +# Add your own custom tasks below by: +# 1. Creating a class that extends CustomBrowserGymEnvironment +# 2. Implementing the required methods +# 3. 
Registering it with register_custom_task() +# ============================================================================ + +# Example: +# class MyCustomTask(CustomBrowserGymEnvironment): +# def _get_task_url(self) -> str: +# return "https://my-task-url.com" +# +# def _get_goal_description(self) -> str: +# return "Do something amazing" +# +# async def _extract_observation(self, page) -> dict: +# return {"text": await page.content()} +# +# def _calculate_reward(self, page_data, action, error=None) -> float: +# return 1.0 if some_condition else 0.0 +# +# def _check_done(self, page_data) -> bool: +# return some_completion_check +# +# register_custom_task("my-task", MyCustomTask) diff --git a/src/envs/browsergym_env/server/custom/tasks/copy-paste-source.html b/src/envs/browsergym_env/server/custom/tasks/copy-paste-source.html new file mode 100644 index 00000000..137b65fe --- /dev/null +++ b/src/envs/browsergym_env/server/custom/tasks/copy-paste-source.html @@ -0,0 +1,37 @@ + + + + + + Source Page + + + +

Source Page

+

Text to copy:

+
Hello from the source page!
+ + + + + diff --git a/src/envs/browsergym_env/server/custom/tasks/copy-paste-target.html b/src/envs/browsergym_env/server/custom/tasks/copy-paste-target.html new file mode 100644 index 00000000..a20c5267 --- /dev/null +++ b/src/envs/browsergym_env/server/custom/tasks/copy-paste-target.html @@ -0,0 +1,94 @@ + + + + + + Target Page + + + + Back to Source Page + +

Target Page

+ +
+ + +
+ + + +
Success! You've completed the multi-tab copy-paste task correctly!
+
Error: The text doesn't match. Please copy the correct text from the source page.
+ + + + diff --git a/src/envs/browsergym_env/server/custom/tasks/copy-paste.html b/src/envs/browsergym_env/server/custom/tasks/copy-paste.html new file mode 100644 index 00000000..18eb2c3b --- /dev/null +++ b/src/envs/browsergym_env/server/custom/tasks/copy-paste.html @@ -0,0 +1,93 @@ + + + + + + Copy-Paste Task + + + +
+ + +
+ +
+ + +
+ + + +
Success! You've completed the task correctly.
+
Error: The text doesn't match. Please try again.
+ + + + \ No newline at end of file