From 9a718300fa6b3ceaa242cca62fbc13dc0a06f681 Mon Sep 17 00:00:00 2001 From: Carlos Chinchilla Corbacho <188046461+cchinchilla-dev@users.noreply.github.com> Date: Fri, 8 May 2026 15:32:18 +0200 Subject: [PATCH 1/4] add native tool/function calling across providers --- src/agentloom/core/models.py | 20 + src/agentloom/observability/metrics.py | 43 ++ src/agentloom/observability/noop.py | 3 + src/agentloom/observability/observer.py | 38 ++ src/agentloom/observability/schema.py | 8 + src/agentloom/providers/anthropic.py | 17 + src/agentloom/providers/base.py | 89 +++ src/agentloom/providers/google.py | 38 +- src/agentloom/providers/ollama.py | 28 + src/agentloom/providers/openai.py | 30 +- src/agentloom/steps/_tools.py | 269 +++++++++ src/agentloom/steps/llm_call.py | 108 +++- tests/observability/test_metrics.py | 22 + tests/observability/test_noop.py | 9 + tests/observability/test_observer.py | 50 ++ tests/providers/test_tool_calling.py | 730 ++++++++++++++++++++++++ 16 files changed, 1494 insertions(+), 8 deletions(-) create mode 100644 src/agentloom/steps/_tools.py create mode 100644 tests/providers/test_tool_calling.py diff --git a/src/agentloom/core/models.py b/src/agentloom/core/models.py index dc689fa..9063e68 100644 --- a/src/agentloom/core/models.py +++ b/src/agentloom/core/models.py @@ -108,6 +108,20 @@ class ThinkingConfig(BaseModel): capture_reasoning: bool = True +class ToolDefinition(BaseModel): + """LLM-callable tool declared on an ``llm_call`` step. + + ``parameters`` is a JSON Schema object; provider adapters translate it + to each API's native shape. ``name`` resolves against the workflow's + ``tool_registry`` for dispatch. + """ + + name: str + description: str = "" + parameters: dict[str, Any] = Field(default_factory=dict) + + + class StepDefinition(BaseModel): """Definition of a single workflow step.""" @@ -155,6 +169,12 @@ class StepDefinition(BaseModel): # Reasoning / extended thinking thinking: ThinkingConfig | None = None + # Tool calling — LLM picks tools at runtime. + # ``tool_choice``: ``"auto"`` | ``"required"`` | ``"none"`` | ``{"name": "..."}``. + tools: list[ToolDefinition] = Field(default_factory=list) + tool_choice: Any = "auto" + max_tool_iterations: int = 5 + class SandboxConfig(BaseModel): """Sandbox configuration for built-in tools. diff --git a/src/agentloom/observability/metrics.py b/src/agentloom/observability/metrics.py index 0333470..6ccf450 100644 --- a/src/agentloom/observability/metrics.py +++ b/src/agentloom/observability/metrics.py @@ -64,6 +64,8 @@ def __init__( self._attachment_counter: Any = None self._stream_counter: Any = None self._ttft_histogram: Any = None + self._tool_call_counter: Any = None + self._tool_call_histogram: Any = None self._mock_replay_counter: Any = None self._recording_capture_counter: Any = None self._recording_latency_histogram: Any = None @@ -152,6 +154,18 @@ def _setup_otel(self, endpoint: str) -> None: "agentloom_stream_responses_total", description="Total streamed LLM responses", ) + # Tool calls dispatched by the model (#116). Tagged by tool name + + # status so dashboards can split successes from failures and spot + # tools that consistently fail or hang. 
+ self._tool_call_counter = meter.create_counter( + "agentloom_tool_calls_total", + description="Total model-dispatched tool calls", + ) + self._tool_call_histogram = meter.create_histogram( + "agentloom_tool_call_duration_seconds", + description="Tool-call execution duration", + unit="s", + ) # Canonical OTel GenAI metric — replaces the AgentLoom-prefixed # ``agentloom_time_to_first_token_seconds`` with the spec name. self._time_to_first_chunk_histogram = meter.create_histogram( @@ -270,6 +284,17 @@ def _setup_prom( "Total streamed LLM responses", ["provider", "model"], ) + self._prom_counters["tool_calls"] = prom.Counter( + "agentloom_tool_calls_total", + "Total model-dispatched tool calls", + ["tool_name", "status"], + ) + self._prom_histograms["tool_call_duration"] = prom.Histogram( + "agentloom_tool_call_duration_seconds", + "Tool-call execution duration", + ["tool_name"], + buckets=[0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 30, 60], + ) self._prom_histograms["time_to_first_chunk"] = prom.Histogram( "gen_ai_client_operation_time_to_first_chunk_seconds", "GenAI streaming time-to-first-chunk (per OTel GenAI conventions)", @@ -465,6 +490,24 @@ def record_attachments(self, step_type: str, count: int) -> None: else: # pragma: no cover — prom fallback self._prom_counters["attachments"].labels(step_type=step_type).inc(count) + def record_tool_call(self, tool_name: str, success: bool, duration_s: float) -> None: + """Record a model-dispatched tool call (#116). + + Counter is tagged ``status=success|failure`` so dashboards can + plot a per-tool failure rate; histogram tracks execution latency. + """ + if not self._enabled: + return + status = "success" if success else "failure" + if self._backend == "otel": + self._tool_call_counter.add(1, {"tool_name": tool_name, "status": status}) + self._tool_call_histogram.record(duration_s, {"tool_name": tool_name}) + else: # pragma: no cover — prom fallback + self._prom_counters["tool_calls"].labels(tool_name=tool_name, status=status).inc() + self._prom_histograms["tool_call_duration"].labels(tool_name=tool_name).observe( + duration_s + ) + def record_stream_response(self, provider: str, model: str) -> None: if not self._enabled: return diff --git a/src/agentloom/observability/noop.py b/src/agentloom/observability/noop.py index 54c4812..a377330 100644 --- a/src/agentloom/observability/noop.py +++ b/src/agentloom/observability/noop.py @@ -139,6 +139,9 @@ def on_provider_call_end( def on_provider_error(self, provider: str, error_type: str, **kwargs: Any) -> None: pass + def on_tool_call(self, **kwargs: Any) -> None: + pass + def on_stream_response(self, provider: str, model: str, ttft_s: float, **kwargs: Any) -> None: pass diff --git a/src/agentloom/observability/observer.py b/src/agentloom/observability/observer.py index 9c87a6b..aa41b85 100644 --- a/src/agentloom/observability/observer.py +++ b/src/agentloom/observability/observer.py @@ -304,6 +304,44 @@ def on_provider_call_end( else: span.end() + def on_tool_call( + self, + *, + step_id: str, + call_id: str, + tool_name: str, + args_hash: str, + result_hash: str, + duration_ms: float, + success: bool, + **kwargs: Any, + ) -> None: + """Record a model-dispatched tool call (#116). + + Emits a child span under the active step span carrying the canonical + ``execute_tool {name}`` name plus tool attrs, and records the + per-tool counter + histogram. Args / result are hashed (not raw) + so PII never lands on the trace. 
+ """ + if self._metrics: + self._metrics.record_tool_call(tool_name, success, duration_ms / 1000.0) + if self._tracing: + attrs: dict[str, Any] = { + SpanAttr.TOOL_CALL_ID: call_id, + SpanAttr.TOOL_NAME: tool_name, + SpanAttr.TOOL_ARGS_HASH: args_hash, + SpanAttr.TOOL_RESULT_HASH: result_hash, + SpanAttr.TOOL_DURATION_MS: duration_ms, + SpanAttr.TOOL_SUCCESS: success, + } + if self._run_id: + attrs[SpanAttr.WORKFLOW_RUN_ID] = self._run_id + span = self._tracing.start_span( + SpanName.GEN_AI_TOOL_CALL.format(tool_name=tool_name), + attributes=attrs, + ) + self._tracing.end_span(span) + def on_provider_error( self, provider: str, diff --git a/src/agentloom/observability/schema.py b/src/agentloom/observability/schema.py index ac9e703..16628ad 100644 --- a/src/agentloom/observability/schema.py +++ b/src/agentloom/observability/schema.py @@ -109,9 +109,11 @@ class SpanAttr: PROVIDER_ATTEMPT_OUTCOME = "agentloom.provider.attempt_outcome" # Tool calls + TOOL_CALL_ID = "tool.call_id" TOOL_NAME = "tool.name" TOOL_ARGS_HASH = "tool.args_hash" TOOL_RESULT_HASH = "tool.result_hash" + TOOL_DURATION_MS = "tool.duration_ms" TOOL_SUCCESS = "tool.success" # Prompt metadata (AgentLoom-specific, no full-prompt capture by default) @@ -181,6 +183,12 @@ class MetricName: # Streaming response counter (no OTel equivalent — AgentLoom-specific). STREAM_RESPONSES_TOTAL = "agentloom_stream_responses_total" + # Tool calls dispatched by the model (#116). Counter tagged by tool + # name + status (success / failure); histogram captures execution + # latency per tool. + TOOL_CALLS_TOTAL = "agentloom_tool_calls_total" + TOOL_CALL_DURATION_SECONDS = "agentloom_tool_call_duration_seconds" + # Resilience gauges CIRCUIT_BREAKER_STATE = "agentloom_circuit_breaker_state" BUDGET_REMAINING_USD = "agentloom_budget_remaining_usd" diff --git a/src/agentloom/providers/anthropic.py b/src/agentloom/providers/anthropic.py index 917269c..e989f7b 100644 --- a/src/agentloom/providers/anthropic.py +++ b/src/agentloom/providers/anthropic.py @@ -161,10 +161,22 @@ async def complete( max_tokens: int | None = None, **kwargs: Any, ) -> ProviderResponse: + agentloom_tools = kwargs.pop("agentloom_tools", None) + agentloom_tool_choice = kwargs.pop("agentloom_tool_choice", None) extras = validate_extra_kwargs( "anthropic", "complete", kwargs, _ANTHROPIC_EXTRA_PAYLOAD_KEYS ) capture_reasoning = _translate_thinking_config(extras) + if agentloom_tools: + from agentloom.steps._tools import ( + translate_tool_choice_for_anthropic, + translate_tools_for_anthropic, + ) + + extras["tools"] = translate_tools_for_anthropic(agentloom_tools) + mapped_choice = translate_tool_choice_for_anthropic(agentloom_tool_choice or "auto") + if mapped_choice is not None: + extras["tool_choice"] = mapped_choice system_prompt, filtered_messages = self._format_messages(messages) payload: dict[str, Any] = { @@ -216,6 +228,10 @@ async def complete( ) cost = calculate_cost(model, usage.prompt_tokens, usage.completion_tokens) + from agentloom.steps._tools import parse_tool_calls_from_anthropic + + tool_calls = parse_tool_calls_from_anthropic(content_blocks) + return ProviderResponse( content=content, model=data.get("model", model), @@ -225,6 +241,7 @@ async def complete( reasoning_content=( "".join(reasoning_parts) if reasoning_parts and capture_reasoning else None ), + tool_calls=tool_calls, raw_response=data, finish_reason=data.get("stop_reason"), ) diff --git a/src/agentloom/providers/base.py b/src/agentloom/providers/base.py index b247672..ec54322 100644 --- 
a/src/agentloom/providers/base.py +++ b/src/agentloom/providers/base.py @@ -12,6 +12,60 @@ from agentloom.exceptions import ProviderError +class ToolCall(BaseModel): + """A function-call decision returned by the model. + + ``id`` round-trips on the follow-up tool-result message (OpenAI: + ``tool_call_id``, Anthropic: ``tool_use_id``). ``arguments`` is + already JSON-decoded — adapters parse before constructing this. + """ + + id: str + name: str + arguments: dict[str, Any] = Field(default_factory=dict) + + +class StreamEvent(BaseModel): + """Base type for typed stream events surfaced via ``StreamResponse.events()``. + + Backwards-compat: ``async for chunk in sr`` keeps yielding plain text + strings (the deltas only). Callers that need to react to tool-call + decisions mid-stream use ``async for evt in sr.events()`` and switch + on the concrete subclass. + """ + + +class TextDelta(StreamEvent): + """A chunk of text content from the model.""" + + chunk: str + + +class ToolCallDelta(StreamEvent): + """A partial tool-call observation while the model assembles arguments. + + ``index`` matches across deltas of the same call; ``name`` arrives on + the first delta and is ``None`` on subsequent argument-fragment + deltas; ``arguments_chunk`` accumulates the JSON-encoded args. + """ + + index: int + name: str | None = None + arguments_chunk: str = "" + + +class ToolCallComplete(StreamEvent): + """A fully-assembled ``ToolCall`` ready to dispatch.""" + + tool_call: ToolCall + + +class StreamDone(StreamEvent): + """Emitted when the stream finishes; carries the provider's stop reason.""" + + finish_reason: str = "" + + class ProviderResponse(BaseModel): """Unified response from any LLM provider.""" @@ -30,6 +84,9 @@ class ProviderResponse(BaseModel): # Ollama returns ``message.thinking`` (or strips inline # ``...`` tags as a fallback). reasoning_content: str | None = None + # Empty when the model didn't pick a tool. When non-empty, ``content`` + # may be empty — the LLM step dispatches each call and re-prompts. + tool_calls: list[ToolCall] = Field(default_factory=list) class StreamResponse: @@ -56,13 +113,44 @@ def __init__(self, model: str, provider: str) -> None: # alongside the streamed answer (Gemini ``thought=true`` parts, # Anthropic ``thinking`` deltas, Ollama ``message.thinking``). self.reasoning_content: str | None = None + # Tool calls accumulated by the adapter while streaming. Adapters + # that emit ``ToolCallDelta`` events through the typed event API + # also populate this list with the assembled calls so non-event + # consumers can read them after the stream is exhausted. + self.tool_calls: list[ToolCall] = [] self._chunks: list[str] = [] self._accumulated_bytes: int = 0 self._iterator: AsyncIterator[str] | None = None + self._event_iterator: AsyncIterator[StreamEvent] | None = None def _set_iterator(self, iterator: AsyncIterator[str]) -> None: self._iterator = iterator + def _set_event_iterator(self, iterator: AsyncIterator[StreamEvent]) -> None: + """Adapters that emit typed events register a separate iterator here. + + When unset, ``events()`` synthesises ``TextDelta`` events from + the plain-string iterator plus a final ``StreamDone`` so callers + always get a working typed surface. + """ + self._event_iterator = iterator + + async def events(self) -> AsyncIterator[StreamEvent]: + """Iterate typed stream events (text + tool-call deltas + done). + + Adapters that wired ``_set_event_iterator`` get full fidelity + (per-provider tool-call streaming). 
The default fallback wraps + the plain-string iterator: each chunk → ``TextDelta``, then a + single ``StreamDone`` once the underlying iterator is exhausted. + """ + if self._event_iterator is not None: + async for event in self._event_iterator: + yield event + return + async for chunk in self: + yield TextDelta(chunk=chunk) + yield StreamDone(finish_reason=self.finish_reason or "") + def __aiter__(self) -> StreamResponse: return self @@ -94,6 +182,7 @@ def to_provider_response(self) -> ProviderResponse: cost_usd=self.cost_usd, finish_reason=self.finish_reason, reasoning_content=self.reasoning_content, + tool_calls=list(self.tool_calls), ) diff --git a/src/agentloom/providers/google.py b/src/agentloom/providers/google.py index b486274..42d908a 100644 --- a/src/agentloom/providers/google.py +++ b/src/agentloom/providers/google.py @@ -145,6 +145,14 @@ def _format_messages( content = msg.get("content", "") system_instruction = content if isinstance(content, str) else str(content) continue + # Tool-loop messages already in Gemini wire shape (role=``model`` + # with ``functionCall`` parts, role=``function`` with + # ``functionResponse`` parts) — pass them through verbatim so + # iteration 2+ preserves the call context. Identified by the + # presence of ``parts`` instead of ``content``. + if "parts" in msg: + contents.append(msg) + continue role = "user" if msg["role"] == "user" else "model" content = msg.get("content", "") if isinstance(content, str): @@ -180,8 +188,27 @@ async def complete( max_tokens: int | None = None, **kwargs: Any, ) -> ProviderResponse: + agentloom_tools = kwargs.pop("agentloom_tools", None) + agentloom_tool_choice = kwargs.pop("agentloom_tool_choice", None) extras = validate_extra_kwargs("google", "complete", kwargs, _GOOGLE_EXTRA_PAYLOAD_KEYS) thinking_payload = _build_thinking_config_payload(extras.pop("thinking_config", None)) + if agentloom_tools: + from agentloom.steps._tools import translate_tools_for_google + + extras["tools"] = translate_tools_for_google(agentloom_tools) + # ``{"name": "fn"}`` selects a specific function via Gemini's + # ANY mode + ``allowedFunctionNames``. Plain strings map to + # AUTO / ANY / NONE; anything else falls back to AUTO. 
+ choice = agentloom_tool_choice or "auto" + fn_config: dict[str, Any] + if isinstance(choice, dict) and "name" in choice: + fn_config = {"mode": "ANY", "allowedFunctionNames": [choice["name"]]} + else: + mode = {"auto": "AUTO", "required": "ANY", "none": "NONE"}.get( + choice if isinstance(choice, str) else "auto", "AUTO" + ) + fn_config = {"mode": mode} + extras["tool_config"] = {"functionCallingConfig": fn_config} system_instruction, contents = self._format_messages(messages) payload: dict[str, Any] = {"contents": contents} @@ -217,9 +244,10 @@ async def complete( candidates = data.get("candidates", []) content = "" reasoning_content: str | None = None + content_parts: list[dict[str, Any]] = [] if candidates: - parts = candidates[0].get("content", {}).get("parts", []) - content, reasoning_trace = _parse_gemini_content_parts(parts) + content_parts = candidates[0].get("content", {}).get("parts", []) or [] + content, reasoning_trace = _parse_gemini_content_parts(content_parts) if reasoning_trace: reasoning_content = reasoning_trace @@ -248,6 +276,11 @@ async def complete( if candidates: finish_reason = candidates[0].get("finishReason") + from agentloom.steps._tools import parse_tool_calls_from_google + + tool_calls = parse_tool_calls_from_google(content_parts) + + return ProviderResponse( content=content, model=model, @@ -257,6 +290,7 @@ async def complete( reasoning_content=reasoning_content, raw_response=data, finish_reason=finish_reason, + tool_calls=tool_calls, ) async def stream( diff --git a/src/agentloom/providers/ollama.py b/src/agentloom/providers/ollama.py index c5db3de..290e655 100644 --- a/src/agentloom/providers/ollama.py +++ b/src/agentloom/providers/ollama.py @@ -115,6 +115,13 @@ def _format_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]: """Convert internal content blocks to Ollama's images format.""" formatted: list[dict[str, Any]] = [] for msg in messages: + # Tool-loop messages (assistant with ``tool_calls``, role=``tool`` + # with ``tool_call_id``) ship in OpenAI-compatible wire shape — + # pass them through verbatim so iteration 2+ keeps the call + # context intact. + if "tool_calls" in msg or msg.get("role") == "tool": + formatted.append(msg) + continue content = msg.get("content", "") if isinstance(content, str): formatted.append({"role": msg["role"], "content": content}) @@ -154,8 +161,23 @@ async def complete( max_tokens: int | None = None, **kwargs: Any, ) -> ProviderResponse: + agentloom_tools = kwargs.pop("agentloom_tools", None) + agentloom_tool_choice = kwargs.pop("agentloom_tool_choice", None) extras = validate_extra_kwargs("ollama", "complete", kwargs, _OLLAMA_EXTRA_PAYLOAD_KEYS) think_param, capture_reasoning = _pop_thinking_config(extras) + if agentloom_tools: + from agentloom.steps._tools import translate_tools_for_ollama + + extras["tools"] = translate_tools_for_ollama(agentloom_tools) + # Ollama doesn't expose tool_choice; only model-side support + # decides whether the call is honored. Surface the silent drop + # via debug log so users debugging "why doesn't my tool fire" + # have a hint. 
+ if agentloom_tool_choice not in (None, "auto"): + logger.debug( + "Ollama ignores tool_choice=%r; only model-side support matters.", + agentloom_tool_choice, + ) payload: dict[str, Any] = { "model": model, "messages": self._format_messages(messages), @@ -206,6 +228,11 @@ async def complete( total_tokens=prompt_tokens + completion_tokens, ) + from agentloom.steps._tools import parse_tool_calls_from_openai + + tool_calls = parse_tool_calls_from_openai(data.get("message", {}) or {}) + + return ProviderResponse( content=content, model=data.get("model", model), @@ -215,6 +242,7 @@ async def complete( reasoning_content=reasoning_content, raw_response=data, finish_reason=data.get("done_reason"), + tool_calls=tool_calls, ) async def stream( diff --git a/src/agentloom/providers/openai.py b/src/agentloom/providers/openai.py index 5c2867a..caeaf19 100644 --- a/src/agentloom/providers/openai.py +++ b/src/agentloom/providers/openai.py @@ -80,6 +80,13 @@ def _format_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]: """Convert internal content blocks to OpenAI's vision/audio format.""" formatted: list[dict[str, Any]] = [] for msg in messages: + # Tool-loop messages (assistant with ``tool_calls``, role=``tool`` + # with ``tool_call_id``) ship in OpenAI wire shape directly — + # pass them through verbatim so iteration 2+ sees the prior + # decision and result. + if "tool_calls" in msg or msg.get("role") == "tool": + formatted.append(msg) + continue content = msg.get("content", "") if isinstance(content, str): formatted.append({"role": msg["role"], "content": content}) @@ -126,11 +133,25 @@ async def complete( max_tokens: int | None = None, **kwargs: Any, ) -> ProviderResponse: + # Tool definitions arrive as ``ToolDefinition`` Pydantic instances — + # translate to OpenAI's wire format before forwarding so callers + # don't have to know provider-specific shapes. + agentloom_tools = kwargs.pop("agentloom_tools", None) + agentloom_tool_choice = kwargs.pop("agentloom_tool_choice", None) extras = validate_extra_kwargs("openai", "complete", kwargs, _OPENAI_EXTRA_PAYLOAD_KEYS) # ``thinking_config`` is accepted at the step layer for YAML # uniformity but has no chat-completions equivalent — drop it # before splatting extras into the request body. extras.pop("thinking_config", None) + if agentloom_tools: + from agentloom.steps._tools import ( + translate_tool_choice_for_openai, + translate_tools_for_openai, + ) + + extras["tools"] = translate_tools_for_openai(agentloom_tools) + if agentloom_tool_choice is not None and agentloom_tool_choice != "none": + extras["tool_choice"] = translate_tool_choice_for_openai(agentloom_tool_choice) payload: dict[str, Any] = { "model": model, "messages": self._format_messages(messages), @@ -149,7 +170,8 @@ async def complete( raise_for_status("openai", response) data = response.json() - content = data["choices"][0]["message"]["content"] + message = data["choices"][0]["message"] + content = message.get("content") or "" usage_data = data.get("usage", {}) # o-series returns reasoning tokens under completion_tokens_details; # ordinary gpt-* responses omit the field and resolve to 0. 
@@ -168,6 +190,11 @@ async def complete( reasoning_tokens=usage.reasoning_tokens, ) + from agentloom.steps._tools import parse_tool_calls_from_openai + + tool_calls = parse_tool_calls_from_openai(message) + + return ProviderResponse( content=content, model=data.get("model", model), @@ -176,6 +203,7 @@ async def complete( cost_usd=cost, raw_response=data, finish_reason=data["choices"][0].get("finish_reason"), + tool_calls=tool_calls, ) async def stream( diff --git a/src/agentloom/steps/_tools.py b/src/agentloom/steps/_tools.py new file mode 100644 index 0000000..8e73bdf --- /dev/null +++ b/src/agentloom/steps/_tools.py @@ -0,0 +1,269 @@ +"""Tool-calling helpers — wire-format translation, dispatch, and message synthesis.""" + +from __future__ import annotations + +import hashlib +import json +import logging +import time +from typing import Any + +import anyio + +from agentloom.core.models import ToolDefinition +from agentloom.providers.base import ToolCall + + +def _hash_text(text: str) -> str: + return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16] + + +logger = logging.getLogger("agentloom.steps") + + +def translate_tools_for_openai(tools: list[ToolDefinition]) -> list[dict[str, Any]]: + """OpenAI shape: ``[{"type": "function", "function": {name, description, parameters}}]``.""" + return [ + { + "type": "function", + "function": { + "name": t.name, + "description": t.description, + "parameters": t.parameters or {"type": "object", "properties": {}}, + }, + } + for t in tools + ] + + +def translate_tools_for_anthropic(tools: list[ToolDefinition]) -> list[dict[str, Any]]: + """Anthropic shape: ``[{name, description, input_schema}]``.""" + return [ + { + "name": t.name, + "description": t.description, + "input_schema": t.parameters or {"type": "object", "properties": {}}, + } + for t in tools + ] + + +def translate_tools_for_google(tools: list[ToolDefinition]) -> list[dict[str, Any]]: + """Google shape: ``[{"function_declarations": [{name, description, parameters}, ...]}]``. + + Google groups all functions under one ``Tool`` object; we put them all + in a single declaration list since AgentLoom doesn't yet expose tool + grouping (everything is a function). 
+ """ + return [ + { + "function_declarations": [ + { + "name": t.name, + "description": t.description, + "parameters": t.parameters or {"type": "object", "properties": {}}, + } + for t in tools + ] + } + ] + + +def translate_tools_for_ollama(tools: list[ToolDefinition]) -> list[dict[str, Any]]: + """Ollama uses the OpenAI shape on supported models.""" + return translate_tools_for_openai(tools) + + +def translate_tool_choice_for_openai(choice: Any) -> Any: + """OpenAI ``tool_choice`` accepts ``"auto"`` / ``"required"`` / ``"none"`` + or ``{"type": "function", "function": {"name": "..."}}``.""" + if isinstance(choice, dict) and "name" in choice: + return {"type": "function", "function": {"name": choice["name"]}} + return choice + + +def translate_tool_choice_for_anthropic(choice: Any) -> Any: + """Anthropic uses ``{"type": "auto"}`` / ``"any"`` / ``"tool"``.""" + if choice == "auto": + return {"type": "auto"} + if choice == "required": + return {"type": "any"} + if choice == "none": + return None # omit the field entirely + if isinstance(choice, dict) and "name" in choice: + return {"type": "tool", "name": choice["name"]} + return {"type": "auto"} + + +def parse_tool_calls_from_openai(message: dict[str, Any]) -> list[ToolCall]: + """OpenAI returns ``message.tool_calls = [{id, type, function: {name, arguments}}]``.""" + raw_calls = message.get("tool_calls") or [] + calls: list[ToolCall] = [] + for entry in raw_calls: + if entry.get("type") != "function": + continue + fn = entry.get("function", {}) + try: + args = json.loads(fn.get("arguments") or "{}") + except json.JSONDecodeError: + args = {} + calls.append(ToolCall(id=entry.get("id", ""), name=fn.get("name", ""), arguments=args)) + return calls + + +def parse_tool_calls_from_anthropic(content_blocks: list[dict[str, Any]]) -> list[ToolCall]: + """Anthropic returns ``content`` blocks; ``type=tool_use`` carries calls.""" + calls: list[ToolCall] = [] + for block in content_blocks: + if block.get("type") != "tool_use": + continue + calls.append( + ToolCall( + id=block.get("id", ""), + name=block.get("name", ""), + arguments=block.get("input", {}) or {}, + ) + ) + return calls + + +def parse_tool_calls_from_google(content_parts: list[dict[str, Any]]) -> list[ToolCall]: + """Google returns parts with ``functionCall: {name, args}``. Ids aren't + provider-assigned so we synthesize them — the result message echoes + the same name (no id round-trip needed).""" + calls: list[ToolCall] = [] + for idx, part in enumerate(content_parts): + fc = part.get("functionCall") + if not fc: + continue + calls.append( + ToolCall( + id=f"google-{idx}", + name=fc.get("name", ""), + arguments=fc.get("args", {}) or {}, + ) + ) + return calls + + +async def dispatch_tool_calls( + calls: list[ToolCall], + tool_registry: Any, + *, + observer: Any | None = None, + step_id: str = "", +) -> list[tuple[ToolCall, str, bool]]: + """Execute each call via the registry in parallel; preserve order. + + Failed calls return ``(call, str(exc), False)`` so the model can + recover on the next turn rather than aborting the loop. When *observer* + is provided, fires ``on_tool_call`` per call with hashes of the args + and result for trace-level observability without leaking PII. 
+ """ + results: list[tuple[ToolCall, str, bool]] = [None] * len(calls) # type: ignore[list-item] + + async def _run(idx: int, call: ToolCall) -> None: + start = time.monotonic() + success = False + text: str + try: + tool = tool_registry.get(call.name) + except KeyError as e: + text = f"tool '{call.name}' not registered: {e}" + results[idx] = (call, text, False) + else: + try: + outcome = await tool.execute(**call.arguments) + text = outcome if isinstance(outcome, str) else json.dumps(outcome, default=str) + results[idx] = (call, text, True) + success = True + except Exception as e: # noqa: BLE001 — reported back to the model + logger.warning("Tool '%s' failed: %s", call.name, e) + text = f"tool execution failed: {e}" + results[idx] = (call, text, False) + if observer is not None: + hook = getattr(observer, "on_tool_call", None) + if callable(hook): + hook( + step_id=step_id, + call_id=call.id, + tool_name=call.name, + args_hash=_hash_text(json.dumps(call.arguments, sort_keys=True, default=str)), + result_hash=_hash_text(text), + duration_ms=(time.monotonic() - start) * 1000.0, + success=success, + ) + + async with anyio.create_task_group() as tg: + for idx, call in enumerate(calls): + tg.start_soon(_run, idx, call) + + return results + + +def build_assistant_message_with_tool_calls( + provider: str, content: str, calls: list[ToolCall] +) -> dict[str, Any]: + """Replay the assistant's tool-call decision so the model sees its prior turn.""" + if provider == "anthropic": + blocks: list[dict[str, Any]] = [] + if content: + blocks.append({"type": "text", "text": content}) + for c in calls: + blocks.append({"type": "tool_use", "id": c.id, "name": c.name, "input": c.arguments}) + return {"role": "assistant", "content": blocks} + if provider == "google": + return { + "role": "model", + "parts": [{"functionCall": {"name": c.name, "args": c.arguments}} for c in calls], + } + # OpenAI / Ollama: ``content`` must be a string (formatter rejects None); + # empty string is accepted alongside tool_calls. + return { + "role": "assistant", + "content": content or "", + "tool_calls": [ + { + "id": c.id, + "type": "function", + "function": {"name": c.name, "arguments": json.dumps(c.arguments)}, + } + for c in calls + ], + } + + +def build_tool_result_messages( + provider: str, results: list[tuple[ToolCall, str, bool]] +) -> list[dict[str, Any]]: + """Translate tool outcomes into the role/shape each provider expects.""" + if provider == "anthropic": + # Single user turn with one tool_result block per call. + blocks: list[dict[str, Any]] = [] + for call, text, success in results: + blocks.append( + { + "type": "tool_result", + "tool_use_id": call.id, + "content": text, + "is_error": not success, + } + ) + return [{"role": "user", "content": blocks}] + if provider == "google": + return [ + { + "role": "function", + "parts": [ + { + "functionResponse": { + "name": call.name, + "response": {"result": text} if success else {"error": text}, + } + } + for call, text, success in results + ], + } + ] + # OpenAI / Ollama: one tool message per call, keyed by tool_call_id. 
+ return [{"role": "tool", "tool_call_id": call.id, "content": text} for call, text, _ in results] diff --git a/src/agentloom/steps/llm_call.py b/src/agentloom/steps/llm_call.py index c9be355..a786d81 100644 --- a/src/agentloom/steps/llm_call.py +++ b/src/agentloom/steps/llm_call.py @@ -56,6 +56,97 @@ def _build_prompt_metadata( class LLMCallStep(BaseStep): """Executes an LLM call with prompt template rendering from state.""" + @staticmethod + async def _run_tool_loop( + *, + context: StepContext, + step: StepDefinition, + messages: list[dict[str, Any]], + model: str, + provider_kwargs: dict[str, Any], + ) -> Any: + """Iterate complete() → dispatch tools → re-prompt until done. + + Cost and tokens accumulate across iterations. ``max_tool_iterations`` + bounds the loop; collapses to a single call when ``tools`` is empty. + """ + from agentloom.core.results import TokenUsage + from agentloom.providers.base import ProviderResponse + from agentloom.steps._tools import ( + build_assistant_message_with_tool_calls, + build_tool_result_messages, + dispatch_tool_calls, + ) + + accumulated_prompt = 0 + accumulated_completion = 0 + accumulated_reasoning = 0 + accumulated_cost = 0.0 + + max_iterations = max(step.max_tool_iterations, 1) + gateway = context.provider_gateway + if gateway is None: + raise StepError(step.id, "No provider gateway configured") + for _ in range(max_iterations): + response = await gateway.complete( + messages=messages, + model=model, + temperature=step.temperature, + max_tokens=step.max_tokens, + step_id=step.id, + **provider_kwargs, + ) + accumulated_prompt += response.usage.prompt_tokens + accumulated_completion += response.usage.completion_tokens + accumulated_reasoning += response.usage.reasoning_tokens + accumulated_cost += response.cost_usd + + if not response.tool_calls or not step.tools: + # Replace the response usage with the accumulated totals so + # the caller sees the full conversation cost. + response.usage = TokenUsage( + prompt_tokens=accumulated_prompt, + completion_tokens=accumulated_completion, + total_tokens=( + accumulated_prompt + accumulated_completion + accumulated_reasoning + ), + reasoning_tokens=accumulated_reasoning, + ) + response.cost_usd = accumulated_cost + return response + + if context.tool_registry is None: + raise StepError( + step.id, + "Tool registry required for tools= declaration but not configured.", + ) + + results = await dispatch_tool_calls( + response.tool_calls, + context.tool_registry, + observer=context.observer, + step_id=step.id, + ) + messages.append( + build_assistant_message_with_tool_calls( + response.provider, response.content, response.tool_calls + ) + ) + messages.extend(build_tool_result_messages(response.provider, results)) + + # Loop exhausted; surface the last response with the cap noted as + # finish_reason so callers can detect it. + last_response: ProviderResponse = response # noqa: F821 — set in loop + last_response.usage = TokenUsage( + prompt_tokens=accumulated_prompt, + completion_tokens=accumulated_completion, + total_tokens=(accumulated_prompt + accumulated_completion + accumulated_reasoning), + reasoning_tokens=accumulated_reasoning, + ) + last_response.cost_usd = accumulated_cost + last_response.finish_reason = "max_tool_iterations" + return last_response + @staticmethod def _build_thinking_kwargs(step: StepDefinition) -> dict[str, Any]: """Forward ``StepDefinition.thinking`` to the gateway as a config object. 
@@ -148,14 +239,21 @@ async def execute(self, context: StepContext) -> StepResult: ) provider_kwargs = self._build_thinking_kwargs(step) + if step.tools: + provider_kwargs["agentloom_tools"] = step.tools + provider_kwargs["agentloom_tool_choice"] = step.tool_choice + + # Tool-call loop: re-prompt with tool results until the model + # stops requesting tools or we exhaust ``max_tool_iterations``. + # Costs and tokens accumulate across iterations; only the final + # response's content is exposed to the caller. try: - response = await context.provider_gateway.complete( + response = await self._run_tool_loop( + context=context, + step=step, messages=messages, model=model, - temperature=step.temperature, - max_tokens=step.max_tokens, - step_id=step.id, - **provider_kwargs, + provider_kwargs=provider_kwargs, ) except Exception as e: duration = (time.monotonic() - start) * 1000 diff --git a/tests/observability/test_metrics.py b/tests/observability/test_metrics.py index 0e6f85a..4af3b58 100644 --- a/tests/observability/test_metrics.py +++ b/tests/observability/test_metrics.py @@ -78,6 +78,28 @@ def test_record_tokens(self) -> None: if mm._backend == "otel": mm.shutdown() + def test_record_tool_call(self) -> None: + # Records both the per-tool counter (with status label) and the + # latency histogram. Status maps to ``"success"`` / ``"failure"`` + # so dashboards can derive a per-tool failure rate. + from unittest.mock import MagicMock + + mm = MetricsManager(enabled=True) + if mm._backend != "otel": + return + counter_spy = MagicMock() + histogram_spy = MagicMock() + mm._tool_call_counter = counter_spy + mm._tool_call_histogram = histogram_spy + mm.record_tool_call("add", success=True, duration_s=0.42) + mm.record_tool_call("add", success=False, duration_s=1.1) + + counter_spy.add.assert_any_call(1, {"tool_name": "add", "status": "success"}) + counter_spy.add.assert_any_call(1, {"tool_name": "add", "status": "failure"}) + histogram_spy.record.assert_any_call(0.42, {"tool_name": "add"}) + histogram_spy.record.assert_any_call(1.1, {"tool_name": "add"}) + mm.shutdown() + def test_reasoning_tokens_metric_emitted(self) -> None: # When ``reasoning_tokens`` is non-zero, ``record_tokens`` must # observe the histogram a third time with diff --git a/tests/observability/test_noop.py b/tests/observability/test_noop.py index ed29383..cda83e4 100644 --- a/tests/observability/test_noop.py +++ b/tests/observability/test_noop.py @@ -174,6 +174,15 @@ def test_every_hook_executes_no_op_body(self) -> None: obs.on_provider_call_start("s1", "openai", "gpt-4o-mini") obs.on_provider_call_end("s1", "openai", "gpt-4o-mini", 1.5) obs.on_provider_error("openai", "RateLimitError") + obs.on_tool_call( + step_id="s1", + call_id="c1", + tool_name="add", + args_hash="x", + result_hash="y", + duration_ms=1.0, + success=True, + ) obs.on_stream_response("openai", "gpt-4o-mini", 0.3) obs.on_tokens("openai", "gpt-4o-mini", 100, 50) obs.on_mock_replay("wf", "s1", "step_id") diff --git a/tests/observability/test_observer.py b/tests/observability/test_observer.py index ace227d..35f8008 100644 --- a/tests/observability/test_observer.py +++ b/tests/observability/test_observer.py @@ -260,6 +260,56 @@ def test_dangling_span_closed_without_tracing(self) -> None: span.end.assert_called_once() +class TestToolCallObservability: + """``on_tool_call`` records the per-tool counter / histogram via the + metrics manager and emits an ``execute_tool {name}`` child span carrying + the tool name, args/result hashes, duration, and success flag.""" + + def 
test_records_metric_and_emits_span(self) -> None: + tracing = MagicMock() + metrics = MagicMock() + observer = WorkflowObserver(tracing=tracing, metrics=metrics) + observer.on_workflow_start("wf", run_id="rid-7") + tracing.start_span.reset_mock() # ignore the workflow span + observer.on_tool_call( + step_id="s1", + call_id="c1", + tool_name="add", + args_hash="abcd1234", + result_hash="ef567890", + duration_ms=12.3, + success=True, + ) + # Metric — counter + histogram via record_tool_call. + metrics.record_tool_call.assert_called_once_with("add", True, 0.0123) + # Span — name follows the canonical ``execute_tool {tool_name}`` + # template; attrs include the run_id propagation + tool fields. + tracing.start_span.assert_called_once() + name = tracing.start_span.call_args.args[0] + attrs = tracing.start_span.call_args.kwargs["attributes"] + assert name == "execute_tool add" + assert attrs["tool.name"] == "add" + assert attrs["tool.call_id"] == "c1" + assert attrs["tool.args_hash"] == "abcd1234" + assert attrs["tool.result_hash"] == "ef567890" + assert attrs["tool.duration_ms"] == 12.3 + assert attrs["tool.success"] is True + assert attrs["workflow.run_id"] == "rid-7" + + def test_noop_when_no_tracing_or_metrics(self) -> None: + # Defensive: tool dispatch shouldn't fail when observability is off. + observer = WorkflowObserver() + observer.on_tool_call( + step_id="s1", + call_id="c1", + tool_name="x", + args_hash="a", + result_hash="b", + duration_ms=1.0, + success=False, + ) + + class TestStreamResponse: def test_records_ttft_and_stream_count(self) -> None: metrics = MagicMock() diff --git a/tests/providers/test_tool_calling.py b/tests/providers/test_tool_calling.py new file mode 100644 index 0000000..8cb461b --- /dev/null +++ b/tests/providers/test_tool_calling.py @@ -0,0 +1,730 @@ +"""Tests for native tool calling across providers.""" + +from __future__ import annotations + +import json +from typing import Any + +import httpx +import respx + +from agentloom.core.engine import WorkflowEngine +from agentloom.core.models import ( + StepDefinition, + StepType, + ToolDefinition, + WorkflowConfig, + WorkflowDefinition, +) +from agentloom.core.results import StepStatus, WorkflowStatus +from agentloom.providers.anthropic import AnthropicProvider +from agentloom.providers.base import ToolCall +from agentloom.providers.gateway import ProviderGateway +from agentloom.providers.openai import OpenAIProvider +from agentloom.steps._tools import ( + build_assistant_message_with_tool_calls, + build_tool_result_messages, + dispatch_tool_calls, + parse_tool_calls_from_anthropic, + parse_tool_calls_from_google, + parse_tool_calls_from_openai, + translate_tools_for_anthropic, + translate_tools_for_google, + translate_tools_for_openai, +) +from agentloom.tools.base import BaseTool +from agentloom.tools.registry import ToolRegistry + + +class _AddTool(BaseTool): + name = "add" + description = "Add two integers." + parameters_schema = { + "type": "object", + "properties": {"a": {"type": "integer"}, "b": {"type": "integer"}}, + "required": ["a", "b"], + } + + async def execute(self, **kwargs: Any) -> Any: + return kwargs["a"] + kwargs["b"] + + +class _LookupTool(BaseTool): + name = "lookup" + description = "Look up a value." 
+ parameters_schema = { + "type": "object", + "properties": {"key": {"type": "string"}}, + "required": ["key"], + } + + async def execute(self, **kwargs: Any) -> Any: + return f"value-for-{kwargs['key']}" + + +def _registry_with(*tools: BaseTool) -> ToolRegistry: + reg = ToolRegistry() + for t in tools: + reg.register(t) + return reg + + +class TestToolTranslation: + def test_openai_shape(self) -> None: + out = translate_tools_for_openai( + [ToolDefinition(name="add", description="d", parameters={"type": "object"})] + ) + assert out == [ + { + "type": "function", + "function": { + "name": "add", + "description": "d", + "parameters": {"type": "object"}, + }, + } + ] + + def test_anthropic_shape(self) -> None: + out = translate_tools_for_anthropic( + [ToolDefinition(name="add", description="d", parameters={"type": "object"})] + ) + assert out == [{"name": "add", "description": "d", "input_schema": {"type": "object"}}] + + def test_google_shape_groups_under_function_declarations(self) -> None: + out = translate_tools_for_google( + [ + ToolDefinition(name="add", description="d", parameters={"type": "object"}), + ToolDefinition(name="lookup", description="l", parameters={"type": "object"}), + ] + ) + assert len(out) == 1 + decls = out[0]["function_declarations"] + assert {d["name"] for d in decls} == {"add", "lookup"} + + +class TestToolParsing: + def test_openai_parses_tool_calls(self) -> None: + message = { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": {"name": "add", "arguments": '{"a": 2, "b": 3}'}, + } + ], + } + calls = parse_tool_calls_from_openai(message) + assert calls == [ToolCall(id="call_1", name="add", arguments={"a": 2, "b": 3})] + + def test_openai_skips_non_function_entries(self) -> None: + message = {"tool_calls": [{"id": "x", "type": "code_interpreter"}, {"type": "function"}]} + calls = parse_tool_calls_from_openai(message) + # Only the second entry is a function — its function dict is empty + # so name="" and args={}; we accept it (degenerate case). + assert len(calls) == 1 + assert calls[0].name == "" + + def test_anthropic_parses_tool_use_blocks(self) -> None: + blocks = [ + {"type": "text", "text": "I'll use a tool."}, + {"type": "tool_use", "id": "tu_1", "name": "lookup", "input": {"key": "x"}}, + ] + calls = parse_tool_calls_from_anthropic(blocks) + assert calls == [ToolCall(id="tu_1", name="lookup", arguments={"key": "x"})] + + def test_google_parses_function_call_parts(self) -> None: + parts = [ + {"text": "I'll call a function."}, + {"functionCall": {"name": "add", "args": {"a": 1, "b": 2}}}, + ] + calls = parse_tool_calls_from_google(parts) + assert calls == [ToolCall(id="google-1", name="add", arguments={"a": 1, "b": 2})] + + +class TestDispatchToolCalls: + async def test_dispatch_runs_in_parallel(self) -> None: + registry = _registry_with(_AddTool(), _LookupTool()) + calls = [ + ToolCall(id="c1", name="add", arguments={"a": 2, "b": 3}), + ToolCall(id="c2", name="lookup", arguments={"key": "alpha"}), + ] + results = await dispatch_tool_calls(calls, registry) + assert len(results) == 2 + # Order preserved. 
+ assert results[0][0].id == "c1" + assert results[0][1] == "5" + assert results[0][2] is True + assert results[1][1] == "value-for-alpha" + + async def test_unknown_tool_yields_failure(self) -> None: + registry = _registry_with(_AddTool()) + calls = [ToolCall(id="c1", name="missing", arguments={})] + results = await dispatch_tool_calls(calls, registry) + assert results[0][2] is False + assert "not registered" in results[0][1] + + async def test_tool_exception_yields_failure(self) -> None: + class _Bad(BaseTool): + name = "bad" + + async def execute(self, **kwargs: Any) -> Any: + raise RuntimeError("kaboom") + + registry = _registry_with(_Bad()) + calls = [ToolCall(id="c1", name="bad", arguments={})] + results = await dispatch_tool_calls(calls, registry) + assert results[0][2] is False + assert "kaboom" in results[0][1] + + +class TestResultMessageBuilders: + def test_openai_role_tool_per_call(self) -> None: + c = ToolCall(id="c1", name="add", arguments={}) + out = build_tool_result_messages("openai", [(c, "5", True)]) + assert out == [{"role": "tool", "tool_call_id": "c1", "content": "5"}] + + def test_anthropic_single_user_turn_with_tool_result_blocks(self) -> None: + c = ToolCall(id="tu_1", name="lookup", arguments={}) + out = build_tool_result_messages("anthropic", [(c, "ok", True)]) + assert len(out) == 1 + assert out[0]["role"] == "user" + block = out[0]["content"][0] + assert block["type"] == "tool_result" + assert block["tool_use_id"] == "tu_1" + assert block["content"] == "ok" + assert block["is_error"] is False + + def test_assistant_message_for_anthropic_includes_tool_use_blocks(self) -> None: + c = ToolCall(id="tu_1", name="lookup", arguments={"key": "x"}) + msg = build_assistant_message_with_tool_calls("anthropic", "Thinking...", [c]) + assert msg["role"] == "assistant" + assert msg["content"][0] == {"type": "text", "text": "Thinking..."} + assert msg["content"][1] == { + "type": "tool_use", + "id": "tu_1", + "name": "lookup", + "input": {"key": "x"}, + } + + +class TestOpenAIToolCallingWire: + @respx.mock + async def test_complete_returns_tool_calls_and_translates_tools(self) -> None: + route = respx.post("https://api.openai.com/v1/chat/completions").mock( + return_value=httpx.Response( + 200, + json={ + "model": "gpt-4o-mini", + "choices": [ + { + "message": { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "add", + "arguments": '{"a": 2, "b": 3}', + }, + } + ], + }, + "finish_reason": "tool_calls", + } + ], + "usage": {"prompt_tokens": 5, "completion_tokens": 9}, + }, + ) + ) + provider = OpenAIProvider(api_key="k") + tools = [ + ToolDefinition( + name="add", + description="add two ints", + parameters={ + "type": "object", + "properties": { + "a": {"type": "integer"}, + "b": {"type": "integer"}, + }, + }, + ) + ] + r = await provider.complete( + messages=[{"role": "user", "content": "what is 2+3?"}], + model="gpt-4o-mini", + agentloom_tools=tools, + ) + body = json.loads(route.calls[0].request.content) + assert body["tools"][0]["function"]["name"] == "add" + assert r.tool_calls == [ToolCall(id="c1", name="add", arguments={"a": 2, "b": 3})] + assert r.finish_reason == "tool_calls" + await provider.close() + + +class TestAnthropicToolCallingWire: + @respx.mock + async def test_translates_tools_and_parses_tool_use_blocks(self) -> None: + route = respx.post("https://api.anthropic.com/v1/messages").mock( + return_value=httpx.Response( + 200, + json={ + "content": [ + { + "type": "tool_use", + "id": 
"tu_1", + "name": "lookup", + "input": {"key": "alpha"}, + } + ], + "model": "claude-haiku-4-5-20251001", + "stop_reason": "tool_use", + "usage": {"input_tokens": 5, "output_tokens": 3}, + }, + ) + ) + provider = AnthropicProvider(api_key="k") + tools = [ToolDefinition(name="lookup", description="d", parameters={"type": "object"})] + r = await provider.complete( + messages=[{"role": "user", "content": "x"}], + model="claude-haiku-4-5-20251001", + agentloom_tools=tools, + agentloom_tool_choice="required", + ) + body = json.loads(route.calls[0].request.content) + assert body["tools"][0]["name"] == "lookup" + assert body["tool_choice"] == {"type": "any"} + assert r.tool_calls == [ToolCall(id="tu_1", name="lookup", arguments={"key": "alpha"})] + await provider.close() + + +class TestGoogleToolChoiceDictSelectsSpecificFunction: + """``tool_choice={"name": "fn"}`` must translate to Gemini's + ANY-mode + ``allowedFunctionNames`` rather than silently falling + through to AUTO.""" + + @respx.mock + async def test_specific_function_selection(self) -> None: + from agentloom.providers.google import GoogleProvider + + route = respx.post( + "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" + ).mock( + return_value=httpx.Response( + 200, + json={ + "model": "gemini-2.5-flash", + "candidates": [{"content": {"parts": [{"text": "ok"}]}}], + "usageMetadata": { + "promptTokenCount": 1, + "candidatesTokenCount": 1, + "totalTokenCount": 2, + }, + }, + ) + ) + provider = GoogleProvider(api_key="k") + await provider.complete( + messages=[{"role": "user", "content": "ping"}], + model="gemini-2.5-flash", + agentloom_tools=[ + ToolDefinition(name="lookup", description="d", parameters={"type": "object"}) + ], + agentloom_tool_choice={"name": "lookup"}, + ) + body = json.loads(route.calls[0].request.content) + cfg = body["toolConfig"]["functionCallingConfig"] + assert cfg["mode"] == "ANY" + assert cfg["allowedFunctionNames"] == ["lookup"] + await provider._client.aclose() + + +class TestDispatchObserverHook: + """``dispatch_tool_calls`` calls ``observer.on_tool_call`` per call so + each dispatch lands on the trace as a child span tagged by tool + + success status (#116 observability spec).""" + + async def test_observer_hook_fires_with_hashes_and_duration(self) -> None: + from unittest.mock import MagicMock + + observer = MagicMock() + registry = _registry_with(_AddTool()) + calls = [ToolCall(id="c1", name="add", arguments={"a": 2, "b": 3})] + await dispatch_tool_calls(calls, registry, observer=observer, step_id="s1") + + observer.on_tool_call.assert_called_once() + kwargs = observer.on_tool_call.call_args.kwargs + assert kwargs["step_id"] == "s1" + assert kwargs["call_id"] == "c1" + assert kwargs["tool_name"] == "add" + assert kwargs["success"] is True + # Hashes are non-empty hex strings — args/result never logged raw. + assert len(kwargs["args_hash"]) == 16 + assert len(kwargs["result_hash"]) == 16 + assert kwargs["duration_ms"] >= 0.0 + + async def test_observer_hook_fires_with_failure_on_unknown_tool(self) -> None: + from unittest.mock import MagicMock + + observer = MagicMock() + registry = _registry_with(_AddTool()) + calls = [ToolCall(id="c1", name="missing", arguments={})] + await dispatch_tool_calls(calls, registry, observer=observer, step_id="s1") + + observer.on_tool_call.assert_called_once() + assert observer.on_tool_call.call_args.kwargs["success"] is False + + +class TestStreamEvents: + """Typed stream events surface (issue #116). 
Adapters that haven't + wired the typed iterator fall back to wrapping plain text chunks as + ``TextDelta`` events, terminated with ``StreamDone``.""" + + async def test_event_iterator_when_wired_takes_precedence(self) -> None: + # Adapters that emit typed events natively register a separate + # iterator; the default text-wrapping path is bypassed entirely. + from agentloom.providers.base import ( + StreamDone, + StreamEvent, + StreamResponse, + ToolCallComplete, + ) + + async def _events() -> Any: + yield ToolCallComplete(tool_call=ToolCall(id="c1", name="add", arguments={"a": 1})) + yield StreamDone(finish_reason="tool_calls") + + sr = StreamResponse(model="m", provider="p") + sr._set_event_iterator(_events()) + + events: list[StreamEvent] = [evt async for evt in sr.events()] + assert len(events) == 2 + assert isinstance(events[0], ToolCallComplete) + assert events[0].tool_call.name == "add" + + async def test_default_events_wrap_text_chunks(self) -> None: + from agentloom.providers.base import ( + StreamDone, + StreamResponse, + TextDelta, + ) + + async def _chunks() -> Any: + yield "hello " + yield "world" + + sr = StreamResponse(model="m", provider="p") + sr._set_iterator(_chunks()) + sr.finish_reason = "stop" + + events = [evt async for evt in sr.events()] + # Two TextDelta events + one StreamDone — the default wrapper + # provides the surface even when the adapter doesn't emit typed + # events natively. + assert [type(e).__name__ for e in events] == ["TextDelta", "TextDelta", "StreamDone"] + assert isinstance(events[0], TextDelta) + assert events[0].chunk == "hello " + assert isinstance(events[-1], StreamDone) + assert events[-1].finish_reason == "stop" + + +class TestProviderFormatPreservesToolMessages: + """Regression for the critical bug where providers' ``_format_messages`` + silently dropped ``tool_calls`` / ``tool_call_id`` / ``parts`` keys on + tool-loop iteration 2+, breaking real (non-mock) tool calling.""" + + def test_openai_passes_through_assistant_with_tool_calls(self) -> None: + msgs = [ + {"role": "user", "content": "what is 2+3?"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": {"name": "add", "arguments": '{"a": 2, "b": 3}'}, + } + ], + }, + {"role": "tool", "tool_call_id": "c1", "content": "5"}, + ] + out = OpenAIProvider._format_messages(msgs) + # The two tool-loop messages must round-trip with their wire keys + # intact — without this, OpenAI 400s on iteration 2 because the + # assistant message has no tool_calls and the tool message has no + # tool_call_id to anchor. + assert out[1]["tool_calls"][0]["function"]["name"] == "add" + assert out[2] == {"role": "tool", "tool_call_id": "c1", "content": "5"} + + def test_ollama_passes_through_tool_loop_messages(self) -> None: + from agentloom.providers.ollama import OllamaProvider + + msgs = [ + {"role": "user", "content": "ping"}, + { + "role": "assistant", + "content": "", + "tool_calls": [{"id": "c1", "type": "function", "function": {"name": "x"}}], + }, + {"role": "tool", "tool_call_id": "c1", "content": "ok"}, + ] + out = OllamaProvider._format_messages(msgs) + assert "tool_calls" in out[1] + assert out[2]["tool_call_id"] == "c1" + + def test_google_passes_through_function_call_and_response_parts(self) -> None: + from agentloom.providers.google import GoogleProvider + + msgs = [ + {"role": "user", "content": "ping"}, + # Assistant tool-call decision (Gemini wire shape). 
+ { + "role": "model", + "parts": [{"functionCall": {"name": "add", "args": {"a": 1, "b": 2}}}], + }, + # Tool result (Gemini wire shape). + { + "role": "function", + "parts": [{"functionResponse": {"name": "add", "response": {"result": "3"}}}], + }, + ] + _system, contents = GoogleProvider._format_messages(msgs) + # Both ``parts``-based messages must round-trip unchanged so Gemini + # sees the prior call + result on iteration 2; previously the + # ``content``-based reformatter dropped ``parts`` entirely. + assert contents[1]["parts"][0]["functionCall"]["name"] == "add" + assert contents[2]["role"] == "function" + assert contents[2]["parts"][0]["functionResponse"]["response"] == {"result": "3"} + + +class TestLLMStepToolLoop: + @respx.mock + async def test_step_dispatches_and_iterates(self) -> None: + # Iteration 1: model asks to call `add`. + # Iteration 2: model responds with the final answer. + respx.post("https://api.openai.com/v1/chat/completions").mock( + side_effect=[ + httpx.Response( + 200, + json={ + "model": "gpt-4o-mini", + "choices": [ + { + "message": { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "add", + "arguments": '{"a": 2, "b": 3}', + }, + } + ], + }, + "finish_reason": "tool_calls", + } + ], + "usage": {"prompt_tokens": 5, "completion_tokens": 9}, + }, + ), + httpx.Response( + 200, + json={ + "model": "gpt-4o-mini", + "choices": [ + { + "message": {"role": "assistant", "content": "The answer is 5."}, + "finish_reason": "stop", + } + ], + "usage": {"prompt_tokens": 12, "completion_tokens": 6}, + }, + ), + ] + ) + workflow = WorkflowDefinition( + name="agent", + config=WorkflowConfig(provider="openai", model="gpt-4o-mini"), + state={}, + steps=[ + StepDefinition( + id="ask", + type=StepType.LLM_CALL, + prompt="What is 2+3?", + tools=[ + ToolDefinition( + name="add", + description="add two integers", + parameters={ + "type": "object", + "properties": { + "a": {"type": "integer"}, + "b": {"type": "integer"}, + }, + "required": ["a", "b"], + }, + ) + ], + output="answer", + ) + ], + ) + gateway = ProviderGateway() + gateway.register(OpenAIProvider(api_key="k")) + engine = WorkflowEngine( + workflow=workflow, + provider_gateway=gateway, + tool_registry=_registry_with(_AddTool()), + ) + result = await engine.run() + await gateway.close() + + assert result.status == WorkflowStatus.SUCCESS + sr = result.step_results["ask"] + assert sr.status == StepStatus.SUCCESS + # Cost / tokens accumulated across both iterations. + assert sr.token_usage.prompt_tokens == 5 + 12 + assert sr.token_usage.completion_tokens == 9 + 6 + assert result.final_state["answer"] == "The answer is 5." + + @respx.mock + async def test_step_respects_max_tool_iterations(self) -> None: + # Model never stops asking for the tool; loop must cap at 2. 
+ respx.post("https://api.openai.com/v1/chat/completions").mock( + return_value=httpx.Response( + 200, + json={ + "model": "gpt-4o-mini", + "choices": [ + { + "message": { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "add", + "arguments": '{"a": 1, "b": 1}', + }, + } + ], + }, + "finish_reason": "tool_calls", + } + ], + "usage": {"prompt_tokens": 3, "completion_tokens": 2}, + }, + ) + ) + workflow = WorkflowDefinition( + name="agent-loop", + config=WorkflowConfig(provider="openai", model="gpt-4o-mini"), + state={}, + steps=[ + StepDefinition( + id="ask", + type=StepType.LLM_CALL, + prompt="loop", + tools=[ + ToolDefinition( + name="add", + description="d", + parameters={"type": "object"}, + ) + ], + max_tool_iterations=2, + output="answer", + ) + ], + ) + gateway = ProviderGateway() + gateway.register(OpenAIProvider(api_key="k")) + engine = WorkflowEngine( + workflow=workflow, + provider_gateway=gateway, + tool_registry=_registry_with(_AddTool()), + ) + result = await engine.run() + await gateway.close() + + sr = result.step_results["ask"] + assert sr.status == StepStatus.SUCCESS + # Cap surfaced via finish_reason on the prompt metadata. + assert sr.prompt_metadata is not None + assert sr.prompt_metadata.finish_reason == "max_tool_iterations" + + async def test_tool_dispatch_without_registry_raises(self) -> None: + # When a step declares tools but no registry is attached, the loop + # raises StepError surfaced as FAILED. + with respx.mock: + respx.post("https://api.openai.com/v1/chat/completions").mock( + return_value=httpx.Response( + 200, + json={ + "model": "gpt-4o-mini", + "choices": [ + { + "message": { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": { + "name": "add", + "arguments": "{}", + }, + } + ], + }, + "finish_reason": "tool_calls", + } + ], + "usage": {"prompt_tokens": 1, "completion_tokens": 1}, + }, + ) + ) + workflow = WorkflowDefinition( + name="no-registry", + config=WorkflowConfig(provider="openai", model="gpt-4o-mini"), + state={}, + steps=[ + StepDefinition( + id="ask", + type=StepType.LLM_CALL, + prompt="hi", + tools=[ + ToolDefinition( + name="add", description="d", parameters={"type": "object"} + ) + ], + max_tool_iterations=1, + retry={"max_retries": 0}, + ) + ], + ) + gateway = ProviderGateway() + gateway.register(OpenAIProvider(api_key="k")) + engine = WorkflowEngine( + workflow=workflow, + provider_gateway=gateway, + # tool_registry intentionally omitted + ) + result = await engine.run() + await gateway.close() + + sr = result.step_results["ask"] + assert sr.status == StepStatus.FAILED + assert "tool registry" in (sr.error or "").lower() From f9f820c7c37c94105dedc59869008a2bfef4895d Mon Sep 17 00:00:00 2001 From: Carlos Chinchilla Corbacho <188046461+cchinchilla-dev@users.noreply.github.com> Date: Fri, 8 May 2026 17:48:42 +0200 Subject: [PATCH 2/4] add tool-calling example and replay support for tool-iteration loops --- examples/35_tool_calling.yaml | 51 ++++++++++++++++++++++++++++ recordings/tool_calling.json | 27 +++++++++++++++ src/agentloom/providers/anthropic.py | 7 ++++ src/agentloom/providers/mock.py | 40 ++++++++++++++++++++-- 4 files changed, 122 insertions(+), 3 deletions(-) create mode 100644 examples/35_tool_calling.yaml create mode 100644 recordings/tool_calling.json diff --git a/examples/35_tool_calling.yaml b/examples/35_tool_calling.yaml new file mode 100644 index 0000000..701e1d7 --- /dev/null +++ 
b/examples/35_tool_calling.yaml @@ -0,0 +1,51 @@ +name: tool-calling-agent +version: "1.0" +description: | + Demonstrates native tool/function calling. The model decides + at runtime to invoke ``http_request`` to fetch JSON from an API, + receives the result on the next turn, and emits a final answer. + + Runs offline against the committed recording fixture. With + ``--provider openai --model gpt-4o-mini`` it makes real calls and + the model picks the tool autonomously. + + Producing a fresh recording: + agentloom run examples/35_tool_calling.yaml --provider openai \\ + --model gpt-4o-mini --record recordings/tool_calling.json + +config: + provider: mock + model: gpt-4o-mini + responses_file: recordings/tool_calling.json + latency_model: constant + latency_ms: 0 + sandbox: + enabled: true + allow_network: true + allowed_domains: ["httpbin.org"] + allowed_schemes: ["https"] + +state: + question: "What HTTP method does the httpbin /get endpoint accept?" + +steps: + - id: ask + type: llm_call + prompt: "{state.question}" + tools: + - name: http_request + description: "Make an HTTP request to a given URL and return the response body." + parameters: + type: object + properties: + url: + type: string + description: "Full URL to GET." + method: + type: string + enum: ["GET", "POST"] + description: "HTTP method." + required: [url] + tool_choice: auto + max_tool_iterations: 3 + output: answer diff --git a/recordings/tool_calling.json b/recordings/tool_calling.json new file mode 100644 index 0000000..619a445 --- /dev/null +++ b/recordings/tool_calling.json @@ -0,0 +1,27 @@ +{ + "ask": [ + { + "content": "", + "provider": "openai", + "model": "gpt-4o-mini", + "tool_calls": [ + { + "id": "call_demo_1", + "name": "http_request", + "arguments": {"url": "https://httpbin.org/get", "method": "GET"} + } + ], + "usage": {"prompt_tokens": 35, "completion_tokens": 18, "total_tokens": 53}, + "cost_usd": 0.000018, + "finish_reason": "tool_calls" + }, + { + "content": "The /get endpoint of httpbin accepts the GET HTTP method, as confirmed by the successful response from https://httpbin.org/get.", + "provider": "openai", + "model": "gpt-4o-mini", + "usage": {"prompt_tokens": 320, "completion_tokens": 26, "total_tokens": 346}, + "cost_usd": 0.000061, + "finish_reason": "stop" + } + ] +} diff --git a/src/agentloom/providers/anthropic.py b/src/agentloom/providers/anthropic.py index e989f7b..94a089f 100644 --- a/src/agentloom/providers/anthropic.py +++ b/src/agentloom/providers/anthropic.py @@ -111,6 +111,13 @@ def _format_messages( else: parts: list[dict[str, Any]] = [] for block in content: + # Tool-calling messages build pure-dict wire-format + # blocks (``tool_use``, ``tool_result``); pass those + # through verbatim. Multimodal Pydantic blocks below need + # translation to Anthropic's specific keys. + if isinstance(block, dict): + parts.append(block) + continue if isinstance(block, TextBlock): parts.append({"type": "text", "text": block.text}) elif isinstance(block, ImageBlock): diff --git a/src/agentloom/providers/mock.py b/src/agentloom/providers/mock.py index ebf7cf7..814956e 100644 --- a/src/agentloom/providers/mock.py +++ b/src/agentloom/providers/mock.py @@ -109,7 +109,12 @@ def __init__( self._observer = observer self._workflow_name = workflow_name self.calls: list[dict[str, Any]] = [] - self._responses: dict[str, dict[str, Any]] = {} + # Each value is either a single response dict OR a list of turns + # to play in order (for tool-calling loops where one step issues + # multiple complete() calls). 
The cursor below tracks which turn + # the next call should emit per step_id. + self._responses: dict[str, Any] = {} + self._turn_cursor: dict[str, int] = {} if self.responses_file and self.responses_file.exists(): raw = json.loads(self.responses_file.read_text()) if not isinstance(raw, dict): @@ -126,7 +131,18 @@ def _lookup( extra: dict[str, Any] | None = None, ) -> dict[str, Any] | None: if step_id and step_id in self._responses: - return self._responses[step_id] + entry = self._responses[step_id] + # List form: pop the next turn (clamp at last so excess + # iterations replay the final response — saner than + # raising mid-loop). + if isinstance(entry, list): + if not entry: + return None + idx = self._turn_cursor.get(step_id, 0) + turn = entry[min(idx, len(entry) - 1)] + self._turn_cursor[step_id] = idx + 1 + return turn if isinstance(turn, dict) else None + return entry if isinstance(entry, dict) else None key = prompt_hash(messages, model, temperature, max_tokens, extra) return self._responses.get(key) @@ -185,10 +201,27 @@ async def complete( ) usage_data = entry.get("usage", {}) or {} + # Hydrate ``tool_calls`` from the recording so replay drives the + # tool-iteration loop. Each turn carries its own ``tool_calls``. + tool_calls: list[Any] = [] + recorded_tool_calls = entry.get("tool_calls") or [] + if recorded_tool_calls: + from agentloom.providers.base import ToolCall + + for tc in recorded_tool_calls: + tool_calls.append( + ToolCall( + id=str(tc.get("id", "")), + name=str(tc.get("name", "")), + arguments=tc.get("arguments", {}) or {}, + ) + ) + + return ProviderResponse( content=str(entry.get("content", "")), model=str(entry.get("model", model)), - provider=self.name, + provider=str(entry.get("provider", self.name)), usage=TokenUsage( prompt_tokens=int(usage_data.get("prompt_tokens", 0)), completion_tokens=int(usage_data.get("completion_tokens", 0)), @@ -196,6 +229,7 @@ async def complete( ), cost_usd=float(entry.get("cost_usd", 0.0)), finish_reason=entry.get("finish_reason", "stop"), + tool_calls=tool_calls, ) def supports_model(self, model: str) -> bool: From a58a156c3b7aa4b9336eef0a0bdb33110bf4802d Mon Sep 17 00:00:00 2001 From: Carlos Chinchilla Corbacho <188046461+cchinchilla-dev@users.noreply.github.com> Date: Sat, 9 May 2026 12:20:47 +0200 Subject: [PATCH 3/4] fix(providers): parse ollama-shaped tool_calls (no type, dict args) --- src/agentloom/core/models.py | 1 - src/agentloom/providers/google.py | 1 - src/agentloom/providers/mock.py | 1 - src/agentloom/providers/ollama.py | 1 - src/agentloom/providers/openai.py | 1 - src/agentloom/steps/_tools.py | 28 +++++++++++++++++++++++----- tests/providers/test_tool_calling.py | 24 ++++++++++++++++++++++++ 7 files changed, 47 insertions(+), 10 deletions(-) diff --git a/src/agentloom/core/models.py b/src/agentloom/core/models.py index 9063e68..11f3390 100644 --- a/src/agentloom/core/models.py +++ b/src/agentloom/core/models.py @@ -121,7 +121,6 @@ class ToolDefinition(BaseModel): parameters: dict[str, Any] = Field(default_factory=dict) - class StepDefinition(BaseModel): """Definition of a single workflow step.""" diff --git a/src/agentloom/providers/google.py b/src/agentloom/providers/google.py index 42d908a..6e6a0a5 100644 --- a/src/agentloom/providers/google.py +++ b/src/agentloom/providers/google.py @@ -280,7 +280,6 @@ async def complete( tool_calls = parse_tool_calls_from_google(content_parts) - return ProviderResponse( content=content, model=model, diff --git a/src/agentloom/providers/mock.py b/src/agentloom/providers/mock.py 
index 814956e..d4ff691 100644 --- a/src/agentloom/providers/mock.py +++ b/src/agentloom/providers/mock.py @@ -217,7 +217,6 @@ async def complete( ) ) - return ProviderResponse( content=str(entry.get("content", "")), model=str(entry.get("model", model)), diff --git a/src/agentloom/providers/ollama.py b/src/agentloom/providers/ollama.py index 290e655..445c04c 100644 --- a/src/agentloom/providers/ollama.py +++ b/src/agentloom/providers/ollama.py @@ -232,7 +232,6 @@ async def complete( tool_calls = parse_tool_calls_from_openai(data.get("message", {}) or {}) - return ProviderResponse( content=content, model=data.get("model", model), diff --git a/src/agentloom/providers/openai.py b/src/agentloom/providers/openai.py index caeaf19..74ac97e 100644 --- a/src/agentloom/providers/openai.py +++ b/src/agentloom/providers/openai.py @@ -194,7 +194,6 @@ async def complete( tool_calls = parse_tool_calls_from_openai(message) - return ProviderResponse( content=content, model=data.get("model", model), diff --git a/src/agentloom/steps/_tools.py b/src/agentloom/steps/_tools.py index 8e73bdf..51958e9 100644 --- a/src/agentloom/steps/_tools.py +++ b/src/agentloom/steps/_tools.py @@ -96,16 +96,34 @@ def translate_tool_choice_for_anthropic(choice: Any) -> Any: def parse_tool_calls_from_openai(message: dict[str, Any]) -> list[ToolCall]: - """OpenAI returns ``message.tool_calls = [{id, type, function: {name, arguments}}]``.""" + """Parse ``message.tool_calls`` from OpenAI-shaped responses. + + Handles both wire variants seen in the wild: + + * **OpenAI canonical**: ``[{id, type:"function", function:{name, arguments:""}}]`` + — ``arguments`` is a JSON-encoded string. + * **Ollama / OpenAI-compatible relays**: ``[{id, function:{name, arguments:{...}}}]`` + — ``type`` may be omitted entirely, and ``arguments`` may already be a + decoded dict. Treat the absence of ``type`` as ``"function"`` (the + only call kind we currently dispatch); a non-function explicit + ``type`` (e.g. ``"code_interpreter"``) is skipped. + """ raw_calls = message.get("tool_calls") or [] calls: list[ToolCall] = [] for entry in raw_calls: - if entry.get("type") != "function": + entry_type = entry.get("type", "function") + if entry_type != "function": continue fn = entry.get("function", {}) - try: - args = json.loads(fn.get("arguments") or "{}") - except json.JSONDecodeError: + raw_args = fn.get("arguments") + if isinstance(raw_args, dict): + args: dict[str, Any] = raw_args + elif isinstance(raw_args, str): + try: + args = json.loads(raw_args or "{}") + except json.JSONDecodeError: + args = {} + else: args = {} calls.append(ToolCall(id=entry.get("id", ""), name=fn.get("name", ""), arguments=args)) return calls diff --git a/tests/providers/test_tool_calling.py b/tests/providers/test_tool_calling.py index 8cb461b..9790d50 100644 --- a/tests/providers/test_tool_calling.py +++ b/tests/providers/test_tool_calling.py @@ -127,6 +127,30 @@ def test_openai_skips_non_function_entries(self) -> None: assert len(calls) == 1 assert calls[0].name == "" + def test_ollama_compat_response_with_dict_args_and_no_type(self) -> None: + # Real Ollama 0.x ``/api/chat`` returns tool_calls in OpenAI-compatible + # shape but (a) omits ``"type": "function"`` and (b) ships + # ``arguments`` as an already-decoded dict, not a JSON string. Without + # this regression, Ollama tool calling silently drops every call — + # the parser's strict ``type == "function"`` check skips entries + # missing the field, and ``json.loads()`` would TypeError. 
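+        # For contrast, the canonical OpenAI shape would be
+        #   {"type": "function", "function": {"name": "add",
+        #    "arguments": "{\"a\": 17, \"b\": 25}"}}
+        # i.e. an explicit type plus JSON-encoded string arguments; both
+        # shapes must yield identical ToolCall objects.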
+ # Captured from a live ``llama3.1:8b`` response on 2026-05-09. + message = { + "tool_calls": [ + { + "id": "call_eh7yrv0u", + "function": { + "index": 0, + "name": "add", + "arguments": {"a": 17, "b": 25}, # already a dict + }, + # NOTE: no "type" key + } + ] + } + calls = parse_tool_calls_from_openai(message) + assert calls == [ToolCall(id="call_eh7yrv0u", name="add", arguments={"a": 17, "b": 25})] + def test_anthropic_parses_tool_use_blocks(self) -> None: blocks = [ {"type": "text", "text": "I'll use a tool."}, From c2966e8b0cffa4fe58435963e5a0399fbff60ce5 Mon Sep 17 00:00:00 2001 From: Carlos Chinchilla Corbacho <188046461+cchinchilla-dev@users.noreply.github.com> Date: Sun, 10 May 2026 11:01:24 +0200 Subject: [PATCH 4/4] docs: surface tool calling in changelog, workflow-yaml, and examples --- CHANGELOG.md | 1 + docs/examples.md | 16 ++++++++++++++++ docs/workflow-yaml.md | 33 ++++++++++++++++++++++++++++++++- 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9992e71..d31c455 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Native tool / function calling across providers (#116). New `tools`, `tool_choice`, and `max_tool_iterations` fields on `llm_call`; the LLM step dispatches via the existing `ToolRegistry`, feeds results back, and re-prompts until the model stops asking (capped by `max_tool_iterations`, default 5). Each adapter translates the unified `ToolDefinition` list to its native shape (OpenAI / Ollama / Anthropic / Google). Parallel tool calls dispatch concurrently and preserve order; failures are reported back as text so the model can recover. Sandbox (#105), budget (#108), and per-step retry (#106) apply unchanged. `MockProvider` recordings accept a list of turns per step so offline replay drives the loop end-to-end; `examples/35_tool_calling.yaml` ships a ReAct-style example. The OpenAI-shaped parser also handles Ollama-compat responses (no `type` field, `arguments` as a decoded dict) — without this Ollama tool calling silently dropped every call. - Per-run experiment metadata logging (#77). Every workflow execution now writes a self-contained JSON record (`run_id`, ISO timestamp, AgentLoom version, Python version, workflow `sha256` hash, list of `provider/model` pairs used, status, total cost, total tokens, step count, duration) to `./agentloom_runs/.json`. Override the directory via the `runs_dir` constructor argument on `RunHistoryWriter` or the `AGENTLOOM_RUNS_DIR` env var. Disk I/O happens in a worker thread so the write does not block the event loop. Records carry a `_schema_version: 1` field; failures during the write are logged and swallowed so a broken history directory cannot prevent the engine from returning the result. New `agentloom history` CLI subcommand lists records most-recent-first and accepts `--workflow`, `--provider`, `--since YYYY-MM-DD`, `--until YYYY-MM-DD`, `--min-cost`, `--max-cost`, `--limit`, and `--json` filters — covering the full filter surface (date, workflow, cost, provider) called for in the original issue. - Quality annotations attachable to `WorkflowResult` (#59). New `WorkflowResult.annotate(target, quality_score=..., source=..., **metadata)` method appends a typed `QualityAnnotation` (`target`, `quality_score`, `source`, `metadata`) to the result so evaluators, human reviewers, or downstream scoring code can record output quality after the run completes. 
**The annotation is auto-emitted as an OTel span** the moment `annotate()` runs whenever the engine returned the result with a tracing context attached (the default for any workflow run with observability enabled) — `result.annotate("answer", quality_score=4.5, source="human_feedback")` becomes immediately visible in Jaeger with no additional plumbing on the caller side. Each annotation is published as a standalone `quality:` span (the workflow span has already closed, so retroactive attribute attachment is not viable). Quality spans carry `workflow.run_id` and `workflow.name` plus `agentloom.quality.score`, `agentloom.quality.source`, `agentloom.quality.target`, and free-form `agentloom.quality.metadata.*` attributes — Jaeger / Tempo can group quality spans with the original trace by run_id, and dashboards can filter for `agentloom.quality.score < threshold` to surface regressions. Offline / replay paths that build a `WorkflowResult` without a live tracer keep working — `annotate()` still records the data on the result, the OTel emission just no-ops. The `agentloom.observability.quality.emit_quality_annotation` / `emit_quality_annotations` helpers remain available for callers that build annotations outside the engine flow (e.g. batch evaluators reading historical results from disk). - OTel span and metric schema centralization with GenAI semantic conventions (#125). The schema is a clean break — no compatibility shims for pre-#125 attribute or metric names. New `agentloom.observability.schema` module is the single source of truth for span / attribute / metric names; downstream consumers (Grafana, AgentTest, Jaeger plugins) parse a stable contract instead of grepping for ad-hoc strings. **Metrics renamed and retyped** to match the OTel GenAI registry: `agentloom_tokens_total` (counter) → `gen_ai.client.token.usage` (histogram, `{token}` unit) with `gen_ai.token.type` attribute (`input` / `output` / `reasoning`); `agentloom_provider_latency_seconds` (histogram) → `gen_ai.client.operation.duration` (histogram, `s`) with `gen_ai.operation.name` + `gen_ai.provider.name` attributes; `agentloom_time_to_first_token_seconds` → `gen_ai.client.operation.time_to_first_chunk`. AgentLoom-specific metrics (`agentloom_workflow_*`, `agentloom_step_*`, `agentloom_provider_calls_total`, `agentloom_cost_usd_total`, `agentloom_circuit_breaker_state`, `agentloom_budget_remaining_usd`, HITL / mock / recording counters) keep their `agentloom_` prefix — they have no OTel equivalent. The bundled Grafana dashboard is updated to query the new metric / label names. The legacy `Observer.on_provider_call` hook (which duplicated the metric emission already done by `on_provider_call_end`) is removed; the engine no longer fires it. The `tokens: int` positional argument on `on_step_end` is removed — callers now pass `prompt_tokens` / `completion_tokens` as kwargs. Span attributes follow the **canonical OTel GenAI registry** as of the May 2026 spec — `gen_ai.provider.name` (the deprecated `gen_ai.system` is **not** emitted), `gen_ai.operation.name`, `gen_ai.request.model`, `gen_ai.request.temperature`, `gen_ai.request.max_tokens`, `gen_ai.request.stream`, `gen_ai.response.model`, `gen_ai.response.finish_reasons` (array of strings, per spec), `gen_ai.response.time_to_first_chunk`, `gen_ai.usage.input_tokens`, `gen_ai.usage.output_tokens`, `gen_ai.usage.reasoning.output_tokens`. 
Errored inference spans also emit the OTel general-conventions attribute `error.type` alongside the AgentLoom-specific `step.error` so OTel-aware consumers (Jaeger error filters, Tempo) light up. Inference spans use the canonical name template `"{operation_name} {model}"` (e.g. `"chat gpt-4o-mini"`); workflow / step orchestration spans keep the AgentLoom-specific `workflow:*` / `step:*` names. AgentLoom-specific fields stay under `workflow.*` / `step.*` / `tool.*` / `agentloom.*` namespaces. Provider names are translated from AgentLoom internal names to OTel registry values via `to_genai_provider_name` (e.g. `google` → `gcp.gemini`). Notable additions: diff --git a/docs/examples.md b/docs/examples.md index e763b99..0b9013a 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -295,3 +295,19 @@ fixtures and CI. # Depends on the recording captured from example 31 agentloom run examples/32_yaml_mock.yaml --lite ``` + +## Tool calling + +### 35 — Native tool/function calling + +ReAct-style agent: the model decides to invoke `http_request` against `httpbin.org/get`, receives the JSON, and emits a final natural-language answer. Sandbox is on with `allowed_domains: ["httpbin.org"]`, so the model-dispatched call goes through the same security policy as static `tool` steps (#105). + +**Demonstrates:** `tools` declaration on `llm_call`, `tool_choice: auto`, `max_tool_iterations`, model-driven dispatch via `ToolRegistry`, sandboxed tool execution, replay support for tool-iteration loops. + +```bash +# Mock-replay against the committed recording (no API calls) +agentloom run examples/35_tool_calling.yaml --lite + +# Real call: pass --provider + --model to drive a live model +agentloom run examples/35_tool_calling.yaml --provider openai --model gpt-4o-mini +``` diff --git a/docs/workflow-yaml.md b/docs/workflow-yaml.md index 1c54f89..1fc15c6 100644 --- a/docs/workflow-yaml.md +++ b/docs/workflow-yaml.md @@ -191,6 +191,37 @@ Per-provider translation: Reasoning tokens are billed at the output rate. `TokenUsage.reasoning_tokens` and `billable_completion_tokens` track the spend; `calculate_cost()` includes them automatically. See [Reasoning models](providers.md#reasoning-models) for per-provider details, including the Ollama caveat that `eval_count` is not split. +**Tool calling:** + +The model can pick tools at runtime. Declare them on the step; the engine dispatches via the workflow's `ToolRegistry`, feeds results back, and re-prompts until the model stops asking for tools. + +```yaml +- id: ask + type: llm_call + prompt: "What is the user's account balance?" + tools: + - name: lookup_account + description: "Retrieve account info by ID." + parameters: + type: object + properties: + account_id: { type: string } + required: [account_id] + tool_choice: auto # auto | required | none | {name: lookup_account} + max_tool_iterations: 5 # bound the loop; default 5 + output: answer +``` + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `tools` | `list[ToolDefinition]` | `[]` | Tool declarations the model can pick. `parameters` is JSON Schema. Names resolve against the registered `ToolRegistry`; an unknown name is reported back as a tool failure rather than aborting the loop. | +| `tool_choice` | `string \| dict` | `"auto"` | `"auto"` lets the model decide; `"required"` forces a call; `"none"` disables tools for this turn; `{"name": "..."}` pins to a specific tool. 
Anthropic ignores `"none"` (omits the field); Ollama ignores `tool_choice` entirely (model-side support decides). | +| `max_tool_iterations` | `int` | `5` | Cap on call→result→re-prompt loops. When hit, `finish_reason` becomes `"max_tool_iterations"` so callers can detect runaway behavior. | + +The dispatched tool runs through the existing sandbox (#105), so `http_request`, `shell_command`, `file_read`, `file_write` honor the workflow's `sandbox:` config. Multiple tool calls in one response are dispatched concurrently (anyio task group); results preserve order in the conversation. Cost and tokens accumulate across iterations on the surfaced `StepResult`. + +The legacy `tool` step (static DAG node, author chooses the tool) keeps working unchanged — `tools=` on `llm_call` is the new dynamic, model-driven path. + **Retry config:** | Field | Type | Default | Description | @@ -231,7 +262,7 @@ Evaluates conditions against state and activates a target step. Steps not activa ### `tool` -Executes a registered tool (shell command, HTTP request, etc.). +Executes a registered tool with author-chosen arguments — the workflow author decides which tool to call, not the model. For model-driven tool selection, use the `tools=` field on an `llm_call` step (see [tool calling](#llm_call) above). ```yaml - id: fetch