diff --git a/pkg/templates/python/gemini-computer-use/README.md b/pkg/templates/python/gemini-computer-use/README.md index 6344a2f..065f949 100644 --- a/pkg/templates/python/gemini-computer-use/README.md +++ b/pkg/templates/python/gemini-computer-use/README.md @@ -53,6 +53,14 @@ The Gemini model can execute the following browser actions: | `drag_and_drop` | Drag from one point to another | | `wait_5_seconds` | Wait for 5 seconds | +## Known Limitations + +### URL Reporting + +The Gemini Computer Use API requires a URL in all function responses. However, the Kernel Computer Controls API doesn't provide a method to retrieve the current page URL. + +As a workaround, this template reports `about:blank` as the URL in all responses. This works because Gemini primarily uses the screenshot to understand page state; the URL is a required field but is not critical for functionality. + ## Resources - [Google Gemini Computer Use Documentation](https://ai.google.dev/gemini-api/docs/computer-use) diff --git a/pkg/templates/python/gemini-computer-use/loop.py b/pkg/templates/python/gemini-computer-use/loop.py index ada2130..a3453ba 100644 --- a/pkg/templates/python/gemini-computer-use/loop.py +++ b/pkg/templates/python/gemini-computer-use/loop.py @@ -177,7 +177,7 @@ async def sampling_loop( Part( function_response=FunctionResponse( name=fc.name, - response={"error": result.error}, + response={"error": result.error, "url": result.url or "about:blank"}, ) ) ) diff --git a/pkg/templates/python/gemini-computer-use/tools/computer.py b/pkg/templates/python/gemini-computer-use/tools/computer.py index de2297d..5cf309d 100644 --- a/pkg/templates/python/gemini-computer-use/tools/computer.py +++ b/pkg/templates/python/gemini-computer-use/tools/computer.py @@ -46,20 +46,12 @@ async def screenshot(self) -> ToolResult: response = self.kernel.browsers.computer.capture_screenshot(self.session_id) screenshot_bytes = response.read() - # Get current URL - url = "" - try: - state = 
self.kernel.browsers.computer.get_state(self.session_id) - url = state.url or "" - except Exception: - pass - return ToolResult( base64_image=base64.b64encode(screenshot_bytes).decode("utf-8"), - url=url, + url="about:blank", ) except Exception as e: - return ToolResult(error=f"Failed to take screenshot: {e}") + return ToolResult(error=f"Failed to take screenshot: {e}", url="about:blank") async def execute_action( self, action_name: str, args: GeminiFunctionArgs @@ -272,4 +264,4 @@ async def execute_action( return await self.screenshot() except Exception as e: - return ToolResult(error=f"Action failed: {e}") + return ToolResult(error=f"Action failed: {e}", url="about:blank") diff --git a/pkg/templates/typescript/gemini-computer-use/README.md b/pkg/templates/typescript/gemini-computer-use/README.md index 3fb7c32..0a62590 100644 --- a/pkg/templates/typescript/gemini-computer-use/README.md +++ b/pkg/templates/typescript/gemini-computer-use/README.md @@ -53,6 +53,14 @@ The Gemini model can execute the following browser actions: | `drag_and_drop` | Drag from one point to another | | `wait_5_seconds` | Wait for 5 seconds | +## Known Limitations + +### URL Reporting + +The Gemini Computer Use API requires a URL in all function responses. However, the Kernel Computer Controls API doesn't provide a method to retrieve the current page URL. + +As a workaround, this template reports `about:blank` as the URL in all responses. This works because Gemini primarily uses the screenshot to understand page state; the URL is a required field but is not critical for functionality. 
+ ## Resources - [Google Gemini Computer Use Documentation](https://ai.google.dev/gemini-api/docs/computer-use) diff --git a/pkg/templates/typescript/gemini-computer-use/loop.ts b/pkg/templates/typescript/gemini-computer-use/loop.ts index 284147b..ba3dc5c 100644 --- a/pkg/templates/typescript/gemini-computer-use/loop.ts +++ b/pkg/templates/typescript/gemini-computer-use/loop.ts @@ -173,7 +173,8 @@ export async function samplingLoop({ functionResponses.push({ functionResponse: { name: fc.name, - response: { error: result.error }, + // Always include URL (required by Gemini Computer Use API) + response: { error: result.error, url: result.url || 'about:blank' }, }, }); } else { diff --git a/pkg/templates/typescript/gemini-computer-use/tools/computer.ts b/pkg/templates/typescript/gemini-computer-use/tools/computer.ts index 82e98b6..f415f72 100644 --- a/pkg/templates/typescript/gemini-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/gemini-computer-use/tools/computer.ts @@ -48,22 +48,14 @@ export class ComputerTool { const arrayBuffer = await blob.arrayBuffer(); const buffer = Buffer.from(arrayBuffer); - // Get current URL - let url = ''; - try { - const state = await this.kernel.browsers.computer.getState(this.sessionId); - url = state.url || ''; - } catch { - // Ignore URL fetch errors - } - return { base64Image: buffer.toString('base64'), - url, + url: 'about:blank', }; } catch (error) { return { error: `Failed to take screenshot: ${error}`, + url: 'about:blank', }; } } @@ -294,7 +286,9 @@ export class ComputerTool { return await this.screenshot(); } catch (error) { - return { error: `Action failed: ${error}` }; + // Return about:blank as URL fallback (required by Gemini Computer Use API) + // Note: Computer Controls API doesn't provide a way to get current page URL + return { error: `Action failed: ${error}`, url: 'about:blank' }; } }