-
Notifications
You must be signed in to change notification settings - Fork 59
feat(android): add UIAutomator hierarchy dump, parsing, and agent tool #251
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,7 @@ | |
| UnknownAndroidDisplay, | ||
| ) | ||
| from askui.tools.android.android_agent_os_error import AndroidAgentOsError | ||
| from askui.tools.android.uiautomator_hierarchy import UIElementCollection | ||
| from askui.utils.annotated_image import AnnotatedImage | ||
|
|
||
|
|
||
|
|
@@ -34,6 +35,7 @@ class PpadbAgentOs(AndroidAgentOs): | |
| """ | ||
|
|
||
| _REPORTER_ROLE_NAME: str = "AndroidAgentOS" | ||
| _UIAUTOMATOR_DUMP_PATH: str = "/data/local/tmp/askui_window_dump.xml" | ||
|
|
||
| def __init__( | ||
| self, reporter: Reporter = NULL_REPORTER, device_identifier: str | int = 0 | ||
|
|
@@ -482,3 +484,33 @@ def pull(self, remote_path: str, local_path: str) -> None: | |
| self._REPORTER_ROLE_NAME, | ||
| f"pull(remote_path='{remote_path}', local_path='{local_path}')", | ||
| ) | ||
|
|
||
def get_ui_elements(self) -> UIElementCollection:
    """
    Return UI elements from a `uiautomator dump` of the current screen.

    The hierarchy is dumped to `_UIAUTOMATOR_DUMP_PATH` on the device, read back
    with `cat` and parsed into a `UIElementCollection`.

    Returns:
        UIElementCollection: Parsed hierarchy from the dump, or empty if the dump
            has no usable content.

    Raises:
        AndroidAgentOsError: When no device is selected, or when the dump command
            does not report success (often while animations are visible on
            screen). Nothing is left in a broken state by a failed dump, so the
            caller may simply call this method again.

    Notes:
        `uiautomator dump` is unreliable while the screen shows animation
        (transitions, loaders, pulsing highlights, etc.). Retry after motion has
        stopped and the UI has settled.
    """
    self._check_if_device_is_selected()
    # Explicit check instead of `assert`: assertions are stripped when Python
    # runs with -O, which would silently remove this guard.
    if self._device is None:
        msg = "No Android device is selected"
        raise AndroidAgentOsError(msg)

    dump_response = self.shell(f"uiautomator dump {self._UIAUTOMATOR_DUMP_PATH}")
    # On success the tool prints a confirmation of the form
    # "UI hierchary dumped to: <path>" (spelling as emitted by the tool); any
    # output without the word "dumped" is treated as a failed dump.
    if "dumped" not in dump_response.lower():
        msg = f"Failed to dump UI hierarchy: {dump_response}"
        raise AndroidAgentOsError(msg)

    raw = self.shell(f"cat {self._UIAUTOMATOR_DUMP_PATH}")
    # Missing or whitespace-only output means there is nothing to parse.
    if not (raw and raw.strip()):
        return UIElementCollection([])
    return UIElementCollection.build_from_xml_dump(raw)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,158 @@ | ||
| """ | ||
| Parse UIAutomator hierarchy dump XML from Android (normalized shell output). | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import re | ||
| import xml.etree.ElementTree as ET | ||
| from dataclasses import dataclass | ||
| from typing import TYPE_CHECKING | ||
|
|
||
| if TYPE_CHECKING: | ||
| from collections.abc import Iterator, Mapping | ||
|
|
||
# Match an "&" that is not the start of a valid XML entity reference.
_RE_INVALID_AMP = re.compile(r"&(?!(?:amp|lt|gt|apos|quot|#\d+|#x[0-9a-fA-F]+);)")  # noqa: E501
# UIAutomator `bounds` attribute: "[left,top][right,bottom]".
_RE_BOUNDS = re.compile(r"\[(\d+),(\d+)\]\[(\d+),(\d+)\]")

# Markers at which the XML document may begin inside raw shell output.
_XML_START_MARKERS = ("<?xml", "<hierarchy")


@dataclass
class UIElement:
    """Parsed UI element (one node) from a UIAutomator dump."""

    text: str
    resource_id: str
    content_desc: str
    class_name: str
    bounds: str
    clickable: bool
    enabled: bool
    package: str
    # Explicit center override (see `set_center`). When None, `center` is
    # derived from `bounds` on every access.
    _center: tuple[int, int] | None = None

    @property
    def center(self) -> tuple[int, int] | None:
        """
        Return the (x, y) center of the element.

        An override set through `set_center` (or the constructor) wins;
        otherwise the center is computed from `bounds`.

        Returns:
            tuple[int, int] | None: The center, or None if `bounds` does not
                start with the `[x1,y1][x2,y2]` pattern.
        """
        if self._center is not None:
            return self._center
        match = _RE_BOUNDS.match(self.bounds)
        if match is None:
            return None
        left, top, right, bottom = map(int, match.groups())
        # Deliberately not stored in `_center`: that field takes part in the
        # dataclass-generated `__eq__`, so caching here would make two equal
        # elements compare unequal as soon as one of them was read or printed.
        return ((left + right) // 2, (top + bottom) // 2)

    def __str__(self) -> str:
        """Short ` | `-separated description for list output."""
        parts: list[str] = [f"clickable={self.clickable}"]
        center = self.center
        if center is not None:
            parts.append(f"center=(x={center[0]}, y={center[1]})")
        if self.text:
            parts.append(f'text="{self.text}"')
        if self.resource_id:
            parts.append(f'resource-id="{self.resource_id}"')
        if self.content_desc:
            parts.append(f'content-desc="{self.content_desc}"')
        if self.class_name:
            # Only the last segment of the fully qualified class name is shown.
            parts.append(f"class={self.class_name.split('.')[-1]}")
        return " | ".join(parts)

    def set_center(self, center: tuple[int, int]) -> None:
        """Override the center that would otherwise be derived from `bounds`."""
        self._center = center

    @classmethod
    def _from_mapping(cls, data: Mapping[str, str], bounds: str) -> UIElement:
        """Build an element from dump-style keys, using the given `bounds`."""
        return cls(
            text=data.get("text", ""),
            resource_id=data.get("resource-id", ""),
            content_desc=data.get("content-desc", ""),
            class_name=data.get("class", ""),
            bounds=bounds,
            # Flags are the strings "true"/"false"; anything else is False.
            clickable=data.get("clickable", "false") == "true",
            enabled=data.get("enabled", "true") == "true",
            package=data.get("package", ""),
        )

    @classmethod
    def from_xml_attrib(cls, attrib: Mapping[str, str]) -> UIElement | None:
        """
        Build from XML node attributes.

        Returns:
            UIElement | None: The element, or None if the node has no (or only
                a blank) `bounds` attribute.
        """
        bounds = attrib.get("bounds", "").strip()
        if not bounds:
            return None
        return cls._from_mapping(attrib, bounds)

    @classmethod
    def from_json(cls, json_content: Mapping[str, str]) -> UIElement:
        """
        Build from a string-keyed mapping (e.g. JSON object).

        Unlike `from_xml_attrib`, missing bounds are kept as an empty string
        and an element is always returned. Being a classmethod, subclasses get
        instances of their own type.
        """
        return cls._from_mapping(json_content, json_content.get("bounds", ""))


class UIElementCollection:
    """Collection of UI elements."""

    def __init__(self, elements: list[UIElement]) -> None:
        # Defensive copy: later changes to the caller's list must not alter
        # this collection.
        self._elements = list(elements)

    def get_all(self) -> list[UIElement]:
        """Return a copy of all elements."""
        return list(self._elements)

    def __iter__(self) -> Iterator[UIElement]:
        return iter(self._elements)

    def __len__(self) -> int:
        return len(self._elements)

    def __str__(self) -> str:
        """Return one line per element, joined by newlines."""
        return "\n".join(map(str, self._elements))

    @staticmethod
    def _normalize_dump_string(raw: str) -> str:
        """
        Normalize raw shell output to valid XML before parsing.

        Strips surrounding whitespace and a BOM, cuts everything before the
        first XML start marker and after the last `</hierarchy>`, removes
        control characters other than newline and tab, and escapes every `&`
        that does not begin a valid entity reference.
        """
        raw = raw.strip().lstrip("\ufeff")
        starts = [pos for pos in map(raw.find, _XML_START_MARKERS) if pos >= 0]
        if starts:
            raw = raw[min(starts) :]
        end_tag = "</hierarchy>"
        end = raw.rfind(end_tag)
        if end >= 0:
            raw = raw[: end + len(end_tag)]
        raw = "".join(c for c in raw if c in "\n\t" or ord(c) >= 32)
        # A bare "&" is not well-formed XML; it has to become the entity
        # "&amp;" or the parser rejects the whole document.
        return _RE_INVALID_AMP.sub("&amp;", raw)

    @staticmethod
    def build_from_xml_dump(xml_content: str) -> UIElementCollection:
        """
        Build a UIElementCollection from a UIAutomator dump XML string.

        Returns:
            UIElementCollection: Every node that has bounds, in document order.
                Empty when the input is blank after normalization or is not
                parseable XML (best effort: parse errors are not raised).
        """
        normalized = UIElementCollection._normalize_dump_string(xml_content)
        if not normalized:
            return UIElementCollection([])
        try:
            root = ET.fromstring(normalized)
        except ET.ParseError:
            return UIElementCollection([])

        elements: list[UIElement] = []
        # `iter()` walks the root and all descendants in document order without
        # recursion, so deep hierarchies cannot hit the recursion limit.
        for node in root.iter():
            element = UIElement.from_xml_attrib(node.attrib)
            if element is not None:
                elements.append(element)
        return UIElementCollection(elements)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,74 @@ | ||
| from askui.models.shared import AndroidBaseTool, ToolTags | ||
| from askui.tools.android.agent_os_facade import AndroidAgentOsFacade | ||
|
|
||
|
|
||
class AndroidGetUIAutomatorHierarchyTool(AndroidBaseTool):
    """
    Tool that hands the model a flat, textual view of the Android accessibility
    hierarchy of the connected device, taken from a UIAutomator window dump.

    One line is produced per on-screen view. A line always carries `clickable`
    and, when the bounds can be parsed, a tap `center`; quoted `text`,
    `resource-id`, `content-desc` and the short view `class` name (last segment
    of the fully qualified class) are added only when non-empty. Views without
    bounds are left out.

    Reach for this tool instead of a screenshot when capturing fails or is
    unavailable, or when explicit structure (ids, descriptions, centers) is more
    useful than visual inference. The returned centers and labels should be
    preferred over guessed coordinates.

    Fields within a line are separated by ` | `, for example:
    `clickable=True | center=(x=120, y=340) | text="OK" | class=Button`.

    Args:
        agent_os (AndroidAgentOsFacade | None, optional): The Android agent OS facade.
            If omitted, the agent supplies the connected device implementation at
            runtime.

    Examples:
        ```python
        from askui import AndroidAgent
        from askui.tools.store.android import AndroidGetUIAutomatorHierarchyTool

        with AndroidAgent() as agent:
            agent.act(
                "List tappable elements on the screen using the accessibility tree",
                tools=[AndroidGetUIAutomatorHierarchyTool()],
            )
        ```

        ```python
        from askui import AndroidAgent
        from askui.tools.store.android import AndroidGetUIAutomatorHierarchyTool

        with AndroidAgent(act_tools=[AndroidGetUIAutomatorHierarchyTool()]) as agent:
            agent.act("What buttons and links are visible on this screen?")
        ```
    """

    def __init__(self, agent_os: AndroidAgentOsFacade | None = None) -> None:
        # Model-facing description; assembled first so the constructor call
        # below stays short.
        description = (
            "UIAutomator accessibility snapshot for the current Android screen "
            "(window dump). Returns one text line per view: clickable, tap center "
            "from bounds (`center=(x=..., y=...)`), and when set: text, resource-id, "
            "content-desc, short view class—fields joined by ` | `. Skips views "
            "without valid bounds. Use instead of screenshots when capture is "
            "unreliable or you need ids, descriptions, and tap centers for "
            "structured reasoning; avoid guessing raw coordinates."
        )
        super().__init__(
            name="get_uiautomator_hierarchy_tool",
            description=description,
            required_tags=[ToolTags.SCALED_AGENT_OS.value],
            agent_os=agent_os,
        )

    def __call__(self) -> str:
        """
        Fetch the accessibility hierarchy and render it as one string.

        Returns:
            str: The prefix `UIAutomator hierarchy was retrieved:` followed by the
                string form of the element collection returned by the agent OS.
        """
        elements = self.agent_os.get_ui_elements()
        # f-string interpolation already applies str() to the collection.
        return f"UIAutomator hierarchy was retrieved: {elements}"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What does the `from_agent` mean?