Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions pkg/templates/python/gemini-computer-use/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,14 @@ The Gemini model can execute the following browser actions:
| `drag_and_drop` | Drag from one point to another |
| `wait_5_seconds` | Wait for 5 seconds |

## Known Limitations

### URL Reporting

The Gemini Computer Use API requires a URL in all function responses. However, the Kernel Computer Controls API doesn't provide a method to retrieve the current page URL.

As a workaround, this template reports `about:blank` as the URL in all responses. This works because Gemini relies primarily on the screenshot to understand page state; the URL is a required field in the response, but it is not critical for functionality.

## Resources

- [Google Gemini Computer Use Documentation](https://ai.google.dev/gemini-api/docs/computer-use)
Expand Down
2 changes: 1 addition & 1 deletion pkg/templates/python/gemini-computer-use/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ async def sampling_loop(
Part(
function_response=FunctionResponse(
name=fc.name,
response={"error": result.error},
response={"error": result.error, "url": result.url or "about:blank"},
)
)
)
Expand Down
14 changes: 3 additions & 11 deletions pkg/templates/python/gemini-computer-use/tools/computer.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,20 +46,12 @@ async def screenshot(self) -> ToolResult:
response = self.kernel.browsers.computer.capture_screenshot(self.session_id)
screenshot_bytes = response.read()

# Get current URL
url = ""
try:
state = self.kernel.browsers.computer.get_state(self.session_id)
url = state.url or ""
except Exception:
pass

return ToolResult(
base64_image=base64.b64encode(screenshot_bytes).decode("utf-8"),
url=url,
url="about:blank",
)
except Exception as e:
return ToolResult(error=f"Failed to take screenshot: {e}")
return ToolResult(error=f"Failed to take screenshot: {e}", url="about:blank")

async def execute_action(
self, action_name: str, args: GeminiFunctionArgs
Expand Down Expand Up @@ -272,4 +264,4 @@ async def execute_action(
return await self.screenshot()

except Exception as e:
return ToolResult(error=f"Action failed: {e}")
return ToolResult(error=f"Action failed: {e}", url="about:blank")
8 changes: 8 additions & 0 deletions pkg/templates/typescript/gemini-computer-use/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,14 @@ The Gemini model can execute the following browser actions:
| `drag_and_drop` | Drag from one point to another |
| `wait_5_seconds` | Wait for 5 seconds |

## Known Limitations

### URL Reporting

The Gemini Computer Use API requires a URL in all function responses. However, the Kernel Computer Controls API doesn't provide a method to retrieve the current page URL.

As a workaround, this template reports `about:blank` as the URL in all responses. This works because Gemini relies primarily on the screenshot to understand page state; the URL is a required field in the response, but it is not critical for functionality.

## Resources

- [Google Gemini Computer Use Documentation](https://ai.google.dev/gemini-api/docs/computer-use)
Expand Down
3 changes: 2 additions & 1 deletion pkg/templates/typescript/gemini-computer-use/loop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,8 @@ export async function samplingLoop({
functionResponses.push({
functionResponse: {
name: fc.name,
response: { error: result.error },
// Always include URL (required by Gemini Computer Use API)
response: { error: result.error, url: result.url || 'about:blank' },
},
});
} else {
Expand Down
16 changes: 5 additions & 11 deletions pkg/templates/typescript/gemini-computer-use/tools/computer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,22 +48,14 @@ export class ComputerTool {
const arrayBuffer = await blob.arrayBuffer();
const buffer = Buffer.from(arrayBuffer);

// Get current URL
let url = '';
try {
const state = await this.kernel.browsers.computer.getState(this.sessionId);
url = state.url || '';
} catch {
// Ignore URL fetch errors
}

return {
base64Image: buffer.toString('base64'),
url,
url: 'about:blank',
};
} catch (error) {
return {
error: `Failed to take screenshot: ${error}`,
url: 'about:blank',
};
}
}
Expand Down Expand Up @@ -294,7 +286,9 @@ export class ComputerTool {
return await this.screenshot();

} catch (error) {
return { error: `Action failed: ${error}` };
// Return about:blank as URL fallback (required by Gemini Computer Use API)
// Note: Computer Controls API doesn't provide a way to get current page URL
return { error: `Action failed: ${error}`, url: 'about:blank' };
}
}

Expand Down