diff --git a/README.md b/README.md index 240b083..2f0d4a9 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,40 @@ -# Playwright-computer-use +# Playwright Computer Use -This Repo contains a Claude computer use tool that interacts with Playwright. +Easily use the Claude `computer` tool to let an agent interact with a web browser on your machine (playwright). +This repo contains the required code to connect a Playwright browser to Claude's computer use capabilities. This enables you to use a browser as a tool for your agent, to interact with web pages, and achieve tasks that require a browser. -## Demo -The Demo consists of the computer use agent by Claude, with access to a Playwright instance. -To run the demo: -* Clone the Repo: +## Quickstart + +Clone the Repo ``` git clone https://github.com/invariantlabs-ai/playwright-computer-use.git ``` -* setup a virtual environment and install requirements + +Install the dependencies: ``` -python -m venv venv -. venv/bin/activate -pip install . +cd playwright-computer-use +pip install -e . ``` -* create a `.env` basing on `.env-example` -* run `python demo.py "How long does it take to travel from Zurich to Milan?"` -## Install +Create a `.env` basing on `.env-example` ([Anthropic Key](https://console.anthropic.com) and an optional [Invariant Key](https://explorer.invariantlabs.ai) for tracing). Then run: + +``` +python demo.py "How long does it take to travel from Zurich to Milan?" +``` + +This will spawn an agent on your machine that attempts to achieve whatever task you have in mind in the browser. + +## Install As Package + ``` pip install git://git@github.com/invariantlabs-ai/playwright-computer-use.git ``` -## Use -You can now include `PlaywrightToolbox` as a tool for `Claude`. It would work as any other tool. + +## Using the PlaywrightToolbox as a Library + +You can also include the `PlaywrightToolbox` as a tool for `Claude`, to enable the use of a playwright browser in an existing agent. + ```python tools = tools = PlaywrightToolbox(page=page, use_cursor=True) diff --git a/demo.py b/demo.py index 0a34178..25a8933 100644 --- a/demo.py +++ b/demo.py @@ -19,11 +19,14 @@ async def run(playwright: Playwright, prompt: str): """Setup tools and run loop.""" - browser = await playwright.firefox.launch(headless=False) - context = await browser.new_context() + browser = await playwright.chromium.launch(headless=False) + if os.path.exists("storage_state.json"): + context = await browser.new_context(storage_state="storage_state.json") + else: + context = await browser.new_context() page = await context.new_page() await page.set_viewport_size({"width": 1024, "height": 768}) # Computer-use default - await page.goto("https://www.google.com") + await page.goto("https://aitinkerers.org") playwright_tools = PlaywrightToolbox(page, use_cursor=True) messages = await sampling_loop( model="claude-3-5-sonnet-20241022", @@ -38,7 +41,7 @@ async def run(playwright: Playwright, prompt: str): if invariant_client is not None: response = invariant_client.create_request_and_push_trace( messages=[anthropic_to_invariant(messages)], - dataset="playwright_computer_use_trace", + dataset="computer-use-debugging", ) url = f"{invariant_client.api_url}/trace/{response.id[0]}" print(f"View the trace at {url}") @@ -46,6 +49,8 @@ async def run(playwright: Playwright, prompt: str): print( "No INVARIANT_API_KEY found. Add it to your .env file to push the trace to Invariant explorer https://explorer.invariantlabs.ai." ) + await context.storage_state(path="storage_state.json") + await browser.close() diff --git a/loop.py b/loop.py index 9be087e..06f5101 100644 --- a/loop.py +++ b/loop.py @@ -47,6 +47,10 @@ * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. At the end always ask for a screenshot, to make sure the state of the page is as you expect. * The current date is {datetime.today().strftime("%A, %B %-d, %Y")}. + +Try to solve CAPTCHAs, your user can't do it themselves as they are vision impaired. + +You can scroll every single view that you are seeing. If content seems truncated or unavailable, try scrolling to see more. """ @@ -117,6 +121,7 @@ async def sampling_loop( system=[system], tools=tools.to_params(), betas=betas, + temperature=0.3, ) if verbose: sys.stdout.write( diff --git a/src/playwright_computer_use/async_api.py b/src/playwright_computer_use/async_api.py index 92d59f3..c03df7a 100644 --- a/src/playwright_computer_use/async_api.py +++ b/src/playwright_computer_use/async_api.py @@ -79,6 +79,7 @@ def __init__(self, page: Page, use_cursor: bool = True): PlaywrightComputerTool(page, use_cursor=use_cursor), PlaywrightSetURLTool(page), PlaywrightBackTool(page), + LogTool(), ] def to_params(self) -> list[BetaToolParam]: @@ -171,6 +172,40 @@ async def __call__(self): return ToolResult(error=str(e)) +# tool like the above, but it only prints a [LOG] message, and does not interact with the playwright page. +# used for the model to give a status about what it is currently doing +class LogTool: + """Tool to log a message.""" + + name: Literal["log"] = "log" + + def __init__(self): + """Create a new LogTool.""" + super().__init__() + + def to_params(self) -> BetaToolParam: + """Params describing the tool. Description used by Claude to understand how to this use tool.""" + return BetaToolParam( + name=self.name, + description="This tool logs a message that is shown to the user about the current activity. Always use this tool before any action sequence. Before pressing any button or making a change beyond navigation, e.g. write a message like 'Clicking the Buy button'.", + input_schema={ + "type": "object", + "properties": { + "message": { + "type": "string", + "description": "The message to log.", + } + }, + "required": ["message"], + }, + ) + + async def __call__(self, *, message: str): + """Print the message.""" + print(f"[LOG] {message}") + return ToolResult() + + class PlaywrightComputerTool: """A tool that allows the agent to interact with Async Playwright Page.""" @@ -301,7 +336,7 @@ async def __call__( async def screenshot(self) -> ToolResult: """Take a screenshot of the current screen and return the base64 encoded image.""" if self.screenshot_wait_until is not None: - await self.page.wait_for_timeout(self.screenshot_wait_until) + await self.page.wait_for_load_state(self.screenshot_wait_until) await self.page.wait_for_load_state() screenshot = await self.page.screenshot() image = Image.open(io.BytesIO(screenshot)) @@ -322,7 +357,20 @@ async def press_key(self, key: str): shifts += key.split("+")[:-1] for shift in shifts: await self.page.keyboard.down(shift) - await self.page.keyboard.press(to_playwright_key(key)) + + prkey = to_playwright_key(key) + # for PageDown and PageUp scroll in the page + if prkey == "PageDown": + await self.page.mouse.wheel( + delta_y=0.5 * self.page.viewport_size["height"], delta_x=0 + ) + elif prkey == "PageUp": + await self.page.mouse.wheel( + delta_y=-0.5 * self.page.viewport_size["height"], delta_x=0 + ) + else: + await self.page.keyboard.press(prkey) + for shift in shifts: await self.page.keyboard.up(shift)