From 9a718300fa6b3ceaa242cca62fbc13dc0a06f681 Mon Sep 17 00:00:00 2001
From: Carlos Chinchilla Corbacho
<188046461+cchinchilla-dev@users.noreply.github.com>
Date: Fri, 8 May 2026 15:32:18 +0200
Subject: [PATCH 1/4] add native tool/function calling across providers
---
src/agentloom/core/models.py | 20 +
src/agentloom/observability/metrics.py | 43 ++
src/agentloom/observability/noop.py | 3 +
src/agentloom/observability/observer.py | 38 ++
src/agentloom/observability/schema.py | 8 +
src/agentloom/providers/anthropic.py | 17 +
src/agentloom/providers/base.py | 89 +++
src/agentloom/providers/google.py | 38 +-
src/agentloom/providers/ollama.py | 28 +
src/agentloom/providers/openai.py | 30 +-
src/agentloom/steps/_tools.py | 269 +++++++++
src/agentloom/steps/llm_call.py | 108 +++-
tests/observability/test_metrics.py | 22 +
tests/observability/test_noop.py | 9 +
tests/observability/test_observer.py | 50 ++
tests/providers/test_tool_calling.py | 730 ++++++++++++++++++++++++
16 files changed, 1494 insertions(+), 8 deletions(-)
create mode 100644 src/agentloom/steps/_tools.py
create mode 100644 tests/providers/test_tool_calling.py
diff --git a/src/agentloom/core/models.py b/src/agentloom/core/models.py
index dc689fa..9063e68 100644
--- a/src/agentloom/core/models.py
+++ b/src/agentloom/core/models.py
@@ -108,6 +108,20 @@ class ThinkingConfig(BaseModel):
capture_reasoning: bool = True
+class ToolDefinition(BaseModel):
+ """LLM-callable tool declared on an ``llm_call`` step.
+
+ ``parameters`` is a JSON Schema object; provider adapters translate it
+ to each API's native shape. ``name`` resolves against the workflow's
+ ``tool_registry`` for dispatch.
+ """
+
+ name: str
+ description: str = ""
+ parameters: dict[str, Any] = Field(default_factory=dict)
+
+
class StepDefinition(BaseModel):
"""Definition of a single workflow step."""
@@ -155,6 +169,12 @@ class StepDefinition(BaseModel):
# Reasoning / extended thinking
thinking: ThinkingConfig | None = None
+ # Tool calling — LLM picks tools at runtime.
+ # ``tool_choice``: ``"auto"`` | ``"required"`` | ``"none"`` | ``{"name": "..."}``.
+ tools: list[ToolDefinition] = Field(default_factory=list)
+ tool_choice: Any = "auto"
+ max_tool_iterations: int = 5
+
class SandboxConfig(BaseModel):
"""Sandbox configuration for built-in tools.
diff --git a/src/agentloom/observability/metrics.py b/src/agentloom/observability/metrics.py
index 0333470..6ccf450 100644
--- a/src/agentloom/observability/metrics.py
+++ b/src/agentloom/observability/metrics.py
@@ -64,6 +64,8 @@ def __init__(
self._attachment_counter: Any = None
self._stream_counter: Any = None
self._ttft_histogram: Any = None
+ self._tool_call_counter: Any = None
+ self._tool_call_histogram: Any = None
self._mock_replay_counter: Any = None
self._recording_capture_counter: Any = None
self._recording_latency_histogram: Any = None
@@ -152,6 +154,18 @@ def _setup_otel(self, endpoint: str) -> None:
"agentloom_stream_responses_total",
description="Total streamed LLM responses",
)
+ # Tool calls dispatched by the model (#116). Tagged by tool name +
+ # status so dashboards can split successes from failures and spot
+ # tools that consistently fail or hang.
+ self._tool_call_counter = meter.create_counter(
+ "agentloom_tool_calls_total",
+ description="Total model-dispatched tool calls",
+ )
+ self._tool_call_histogram = meter.create_histogram(
+ "agentloom_tool_call_duration_seconds",
+ description="Tool-call execution duration",
+ unit="s",
+ )
# Canonical OTel GenAI metric — replaces the AgentLoom-prefixed
# ``agentloom_time_to_first_token_seconds`` with the spec name.
self._time_to_first_chunk_histogram = meter.create_histogram(
@@ -270,6 +284,17 @@ def _setup_prom(
"Total streamed LLM responses",
["provider", "model"],
)
+ self._prom_counters["tool_calls"] = prom.Counter(
+ "agentloom_tool_calls_total",
+ "Total model-dispatched tool calls",
+ ["tool_name", "status"],
+ )
+ self._prom_histograms["tool_call_duration"] = prom.Histogram(
+ "agentloom_tool_call_duration_seconds",
+ "Tool-call execution duration",
+ ["tool_name"],
+ buckets=[0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 30, 60],
+ )
self._prom_histograms["time_to_first_chunk"] = prom.Histogram(
"gen_ai_client_operation_time_to_first_chunk_seconds",
"GenAI streaming time-to-first-chunk (per OTel GenAI conventions)",
@@ -465,6 +490,24 @@ def record_attachments(self, step_type: str, count: int) -> None:
else: # pragma: no cover — prom fallback
self._prom_counters["attachments"].labels(step_type=step_type).inc(count)
+ def record_tool_call(self, tool_name: str, success: bool, duration_s: float) -> None:
+ """Record a model-dispatched tool call (#116).
+
+ Counter is tagged ``status=success|failure`` so dashboards can
+ plot a per-tool failure rate; histogram tracks execution latency.
+ """
+ if not self._enabled:
+ return
+ status = "success" if success else "failure"
+ if self._backend == "otel":
+ self._tool_call_counter.add(1, {"tool_name": tool_name, "status": status})
+ self._tool_call_histogram.record(duration_s, {"tool_name": tool_name})
+ else: # pragma: no cover — prom fallback
+ self._prom_counters["tool_calls"].labels(tool_name=tool_name, status=status).inc()
+ self._prom_histograms["tool_call_duration"].labels(tool_name=tool_name).observe(
+ duration_s
+ )
+
def record_stream_response(self, provider: str, model: str) -> None:
if not self._enabled:
return
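Illustrative usage of the new hook (the ``MetricsManager(enabled=True)`` construction and the keyword form mirror the test added below; the timed function is a stand-in for a real tool execution):

import time

from agentloom.observability.metrics import MetricsManager

metrics = MetricsManager(enabled=True)


def add(a: int, b: int) -> int:
    # stand-in for a real tool execution
    return a + b


start = time.monotonic()
try:
    add(2, 3)
    ok = True
except Exception:
    ok = False
metrics.record_tool_call("add", success=ok, duration_s=time.monotonic() - start)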
diff --git a/src/agentloom/observability/noop.py b/src/agentloom/observability/noop.py
index 54c4812..a377330 100644
--- a/src/agentloom/observability/noop.py
+++ b/src/agentloom/observability/noop.py
@@ -139,6 +139,9 @@ def on_provider_call_end(
def on_provider_error(self, provider: str, error_type: str, **kwargs: Any) -> None:
pass
+ def on_tool_call(self, **kwargs: Any) -> None:
+ pass
+
def on_stream_response(self, provider: str, model: str, ttft_s: float, **kwargs: Any) -> None:
pass
diff --git a/src/agentloom/observability/observer.py b/src/agentloom/observability/observer.py
index 9c87a6b..aa41b85 100644
--- a/src/agentloom/observability/observer.py
+++ b/src/agentloom/observability/observer.py
@@ -304,6 +304,44 @@ def on_provider_call_end(
else:
span.end()
+ def on_tool_call(
+ self,
+ *,
+ step_id: str,
+ call_id: str,
+ tool_name: str,
+ args_hash: str,
+ result_hash: str,
+ duration_ms: float,
+ success: bool,
+ **kwargs: Any,
+ ) -> None:
+ """Record a model-dispatched tool call (#116).
+
+ Emits a child span under the active step span carrying the canonical
+ ``execute_tool {name}`` name plus tool attrs, and records the
+ per-tool counter + histogram. Args / result are hashed (not raw)
+ so PII never lands on the trace.
+ """
+ if self._metrics:
+ self._metrics.record_tool_call(tool_name, success, duration_ms / 1000.0)
+ if self._tracing:
+ attrs: dict[str, Any] = {
+ SpanAttr.TOOL_CALL_ID: call_id,
+ SpanAttr.TOOL_NAME: tool_name,
+ SpanAttr.TOOL_ARGS_HASH: args_hash,
+ SpanAttr.TOOL_RESULT_HASH: result_hash,
+ SpanAttr.TOOL_DURATION_MS: duration_ms,
+ SpanAttr.TOOL_SUCCESS: success,
+ }
+ if self._run_id:
+ attrs[SpanAttr.WORKFLOW_RUN_ID] = self._run_id
+ span = self._tracing.start_span(
+ SpanName.GEN_AI_TOOL_CALL.format(tool_name=tool_name),
+ attributes=attrs,
+ )
+ self._tracing.end_span(span)
+
def on_provider_error(
self,
provider: str,
diff --git a/src/agentloom/observability/schema.py b/src/agentloom/observability/schema.py
index ac9e703..16628ad 100644
--- a/src/agentloom/observability/schema.py
+++ b/src/agentloom/observability/schema.py
@@ -109,9 +109,11 @@ class SpanAttr:
PROVIDER_ATTEMPT_OUTCOME = "agentloom.provider.attempt_outcome"
# Tool calls
+ TOOL_CALL_ID = "tool.call_id"
TOOL_NAME = "tool.name"
TOOL_ARGS_HASH = "tool.args_hash"
TOOL_RESULT_HASH = "tool.result_hash"
+ TOOL_DURATION_MS = "tool.duration_ms"
TOOL_SUCCESS = "tool.success"
# Prompt metadata (AgentLoom-specific, no full-prompt capture by default)
@@ -181,6 +183,12 @@ class MetricName:
# Streaming response counter (no OTel equivalent — AgentLoom-specific).
STREAM_RESPONSES_TOTAL = "agentloom_stream_responses_total"
+ # Tool calls dispatched by the model (#116). Counter tagged by tool
+ # name + status (success / failure); histogram captures execution
+ # latency per tool.
+ TOOL_CALLS_TOTAL = "agentloom_tool_calls_total"
+ TOOL_CALL_DURATION_SECONDS = "agentloom_tool_call_duration_seconds"
+
# Resilience gauges
CIRCUIT_BREAKER_STATE = "agentloom_circuit_breaker_state"
BUDGET_REMAINING_USD = "agentloom_budget_remaining_usd"
diff --git a/src/agentloom/providers/anthropic.py b/src/agentloom/providers/anthropic.py
index 917269c..e989f7b 100644
--- a/src/agentloom/providers/anthropic.py
+++ b/src/agentloom/providers/anthropic.py
@@ -161,10 +161,22 @@ async def complete(
max_tokens: int | None = None,
**kwargs: Any,
) -> ProviderResponse:
+ agentloom_tools = kwargs.pop("agentloom_tools", None)
+ agentloom_tool_choice = kwargs.pop("agentloom_tool_choice", None)
extras = validate_extra_kwargs(
"anthropic", "complete", kwargs, _ANTHROPIC_EXTRA_PAYLOAD_KEYS
)
capture_reasoning = _translate_thinking_config(extras)
+ if agentloom_tools:
+ from agentloom.steps._tools import (
+ translate_tool_choice_for_anthropic,
+ translate_tools_for_anthropic,
+ )
+
+ extras["tools"] = translate_tools_for_anthropic(agentloom_tools)
+ mapped_choice = translate_tool_choice_for_anthropic(agentloom_tool_choice or "auto")
+ if mapped_choice is not None:
+ extras["tool_choice"] = mapped_choice
system_prompt, filtered_messages = self._format_messages(messages)
payload: dict[str, Any] = {
@@ -216,6 +228,10 @@ async def complete(
)
cost = calculate_cost(model, usage.prompt_tokens, usage.completion_tokens)
+ from agentloom.steps._tools import parse_tool_calls_from_anthropic
+
+ tool_calls = parse_tool_calls_from_anthropic(content_blocks)
+
return ProviderResponse(
content=content,
model=data.get("model", model),
@@ -225,6 +241,7 @@ async def complete(
reasoning_content=(
"".join(reasoning_parts) if reasoning_parts and capture_reasoning else None
),
+ tool_calls=tool_calls,
raw_response=data,
finish_reason=data.get("stop_reason"),
)
diff --git a/src/agentloom/providers/base.py b/src/agentloom/providers/base.py
index b247672..ec54322 100644
--- a/src/agentloom/providers/base.py
+++ b/src/agentloom/providers/base.py
@@ -12,6 +12,60 @@
from agentloom.exceptions import ProviderError
+class ToolCall(BaseModel):
+ """A function-call decision returned by the model.
+
+ ``id`` round-trips on the follow-up tool-result message (OpenAI:
+ ``tool_call_id``, Anthropic: ``tool_use_id``). ``arguments`` is
+ already JSON-decoded — adapters parse before constructing this.
+ """
+
+ id: str
+ name: str
+ arguments: dict[str, Any] = Field(default_factory=dict)
+
+
+class StreamEvent(BaseModel):
+ """Base type for typed stream events surfaced via ``StreamResponse.events()``.
+
+ Backwards-compat: ``async for chunk in sr`` keeps yielding plain text
+ strings (the deltas only). Callers that need to react to tool-call
+ decisions mid-stream use ``async for evt in sr.events()`` and switch
+ on the concrete subclass.
+ """
+
+
+class TextDelta(StreamEvent):
+ """A chunk of text content from the model."""
+
+ chunk: str
+
+
+class ToolCallDelta(StreamEvent):
+ """A partial tool-call observation while the model assembles arguments.
+
+ ``index`` matches across deltas of the same call; ``name`` arrives on
+ the first delta and is ``None`` on subsequent argument-fragment
+ deltas; ``arguments_chunk`` accumulates the JSON-encoded args.
+ """
+
+ index: int
+ name: str | None = None
+ arguments_chunk: str = ""
+
+
+class ToolCallComplete(StreamEvent):
+ """A fully-assembled ``ToolCall`` ready to dispatch."""
+
+ tool_call: ToolCall
+
+
+class StreamDone(StreamEvent):
+ """Emitted when the stream finishes; carries the provider's stop reason."""
+
+ finish_reason: str = ""
+
+
class ProviderResponse(BaseModel):
"""Unified response from any LLM provider."""
@@ -30,6 +84,9 @@ class ProviderResponse(BaseModel):
# Ollama returns ``message.thinking`` (or strips inline
# ``...`` tags as a fallback).
reasoning_content: str | None = None
+ # Empty when the model didn't pick a tool. When non-empty, ``content``
+ # may be empty — the LLM step dispatches each call and re-prompts.
+ tool_calls: list[ToolCall] = Field(default_factory=list)
class StreamResponse:
@@ -56,13 +113,44 @@ def __init__(self, model: str, provider: str) -> None:
# alongside the streamed answer (Gemini ``thought=true`` parts,
# Anthropic ``thinking`` deltas, Ollama ``message.thinking``).
self.reasoning_content: str | None = None
+ # Tool calls accumulated by the adapter while streaming. Adapters
+ # that emit ``ToolCallDelta`` events through the typed event API
+ # also populate this list with the assembled calls so non-event
+ # consumers can read them after the stream is exhausted.
+ self.tool_calls: list[ToolCall] = []
self._chunks: list[str] = []
self._accumulated_bytes: int = 0
self._iterator: AsyncIterator[str] | None = None
+ self._event_iterator: AsyncIterator[StreamEvent] | None = None
def _set_iterator(self, iterator: AsyncIterator[str]) -> None:
self._iterator = iterator
+ def _set_event_iterator(self, iterator: AsyncIterator[StreamEvent]) -> None:
+ """Adapters that emit typed events register a separate iterator here.
+
+ When unset, ``events()`` synthesises ``TextDelta`` events from
+ the plain-string iterator plus a final ``StreamDone`` so callers
+ always get a working typed surface.
+ """
+ self._event_iterator = iterator
+
+ async def events(self) -> AsyncIterator[StreamEvent]:
+ """Iterate typed stream events (text + tool-call deltas + done).
+
+ Adapters that wired ``_set_event_iterator`` get full fidelity
+ (per-provider tool-call streaming). The default fallback wraps
+ the plain-string iterator: each chunk → ``TextDelta``, then a
+ single ``StreamDone`` once the underlying iterator is exhausted.
+ """
+ if self._event_iterator is not None:
+ async for event in self._event_iterator:
+ yield event
+ return
+ async for chunk in self:
+ yield TextDelta(chunk=chunk)
+ yield StreamDone(finish_reason=self.finish_reason or "")
+
def __aiter__(self) -> StreamResponse:
return self
@@ -94,6 +182,7 @@ def to_provider_response(self) -> ProviderResponse:
cost_usd=self.cost_usd,
finish_reason=self.finish_reason,
reasoning_content=self.reasoning_content,
+ tool_calls=list(self.tool_calls),
)
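Consumer-side, the typed surface sketched above reads roughly like this (a sketch; ``ToolCallDelta`` handling is omitted since ``ToolCallComplete`` already carries the assembled call):

from agentloom.providers.base import (
    StreamDone,
    StreamResponse,
    TextDelta,
    ToolCallComplete,
)


async def consume(sr: StreamResponse) -> None:
    # Adapters with a wired event iterator stream tool-call events natively;
    # everything else falls back to one TextDelta per chunk plus a final StreamDone.
    async for evt in sr.events():
        if isinstance(evt, TextDelta):
            print(evt.chunk, end="")
        elif isinstance(evt, ToolCallComplete):
            print(f"\n[tool] {evt.tool_call.name} {evt.tool_call.arguments}")
        elif isinstance(evt, StreamDone):
            print(f"\n[done] {evt.finish_reason}")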
diff --git a/src/agentloom/providers/google.py b/src/agentloom/providers/google.py
index b486274..42d908a 100644
--- a/src/agentloom/providers/google.py
+++ b/src/agentloom/providers/google.py
@@ -145,6 +145,14 @@ def _format_messages(
content = msg.get("content", "")
system_instruction = content if isinstance(content, str) else str(content)
continue
+ # Tool-loop messages already in Gemini wire shape (role=``model``
+ # with ``functionCall`` parts, role=``function`` with
+ # ``functionResponse`` parts) — pass them through verbatim so
+ # iteration 2+ preserves the call context. Identified by the
+ # presence of ``parts`` instead of ``content``.
+ if "parts" in msg:
+ contents.append(msg)
+ continue
role = "user" if msg["role"] == "user" else "model"
content = msg.get("content", "")
if isinstance(content, str):
@@ -180,8 +188,27 @@ async def complete(
max_tokens: int | None = None,
**kwargs: Any,
) -> ProviderResponse:
+ agentloom_tools = kwargs.pop("agentloom_tools", None)
+ agentloom_tool_choice = kwargs.pop("agentloom_tool_choice", None)
extras = validate_extra_kwargs("google", "complete", kwargs, _GOOGLE_EXTRA_PAYLOAD_KEYS)
thinking_payload = _build_thinking_config_payload(extras.pop("thinking_config", None))
+ if agentloom_tools:
+ from agentloom.steps._tools import translate_tools_for_google
+
+ extras["tools"] = translate_tools_for_google(agentloom_tools)
+ # ``{"name": "fn"}`` selects a specific function via Gemini's
+ # ANY mode + ``allowedFunctionNames``. Plain strings map to
+ # AUTO / ANY / NONE; anything else falls back to AUTO.
+ choice = agentloom_tool_choice or "auto"
+ fn_config: dict[str, Any]
+ if isinstance(choice, dict) and "name" in choice:
+ fn_config = {"mode": "ANY", "allowedFunctionNames": [choice["name"]]}
+ else:
+ mode = {"auto": "AUTO", "required": "ANY", "none": "NONE"}.get(
+ choice if isinstance(choice, str) else "auto", "AUTO"
+ )
+ fn_config = {"mode": mode}
+ extras["tool_config"] = {"functionCallingConfig": fn_config}
system_instruction, contents = self._format_messages(messages)
payload: dict[str, Any] = {"contents": contents}
@@ -217,9 +244,10 @@ async def complete(
candidates = data.get("candidates", [])
content = ""
reasoning_content: str | None = None
+ content_parts: list[dict[str, Any]] = []
if candidates:
- parts = candidates[0].get("content", {}).get("parts", [])
- content, reasoning_trace = _parse_gemini_content_parts(parts)
+ content_parts = candidates[0].get("content", {}).get("parts", []) or []
+ content, reasoning_trace = _parse_gemini_content_parts(content_parts)
if reasoning_trace:
reasoning_content = reasoning_trace
@@ -248,6 +276,11 @@ async def complete(
if candidates:
finish_reason = candidates[0].get("finishReason")
+ from agentloom.steps._tools import parse_tool_calls_from_google
+
+ tool_calls = parse_tool_calls_from_google(content_parts)
+
return ProviderResponse(
content=content,
model=model,
@@ -257,6 +290,7 @@ async def complete(
reasoning_content=reasoning_content,
raw_response=data,
finish_reason=finish_reason,
+ tool_calls=tool_calls,
)
async def stream(
diff --git a/src/agentloom/providers/ollama.py b/src/agentloom/providers/ollama.py
index c5db3de..290e655 100644
--- a/src/agentloom/providers/ollama.py
+++ b/src/agentloom/providers/ollama.py
@@ -115,6 +115,13 @@ def _format_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""Convert internal content blocks to Ollama's images format."""
formatted: list[dict[str, Any]] = []
for msg in messages:
+ # Tool-loop messages (assistant with ``tool_calls``, role=``tool``
+ # with ``tool_call_id``) ship in OpenAI-compatible wire shape —
+ # pass them through verbatim so iteration 2+ keeps the call
+ # context intact.
+ if "tool_calls" in msg or msg.get("role") == "tool":
+ formatted.append(msg)
+ continue
content = msg.get("content", "")
if isinstance(content, str):
formatted.append({"role": msg["role"], "content": content})
@@ -154,8 +161,23 @@ async def complete(
max_tokens: int | None = None,
**kwargs: Any,
) -> ProviderResponse:
+ agentloom_tools = kwargs.pop("agentloom_tools", None)
+ agentloom_tool_choice = kwargs.pop("agentloom_tool_choice", None)
extras = validate_extra_kwargs("ollama", "complete", kwargs, _OLLAMA_EXTRA_PAYLOAD_KEYS)
think_param, capture_reasoning = _pop_thinking_config(extras)
+ if agentloom_tools:
+ from agentloom.steps._tools import translate_tools_for_ollama
+
+ extras["tools"] = translate_tools_for_ollama(agentloom_tools)
+            # Ollama's chat API has no ``tool_choice`` parameter; whether a
+            # tool actually fires depends entirely on model-side support.
+            # Log the dropped setting at debug level so users chasing
+            # "why doesn't my tool fire" get a hint instead of a silent no-op.
+ if agentloom_tool_choice not in (None, "auto"):
+ logger.debug(
+ "Ollama ignores tool_choice=%r; only model-side support matters.",
+ agentloom_tool_choice,
+ )
payload: dict[str, Any] = {
"model": model,
"messages": self._format_messages(messages),
@@ -206,6 +228,11 @@ async def complete(
total_tokens=prompt_tokens + completion_tokens,
)
+ from agentloom.steps._tools import parse_tool_calls_from_openai
+
+ tool_calls = parse_tool_calls_from_openai(data.get("message", {}) or {})
+
return ProviderResponse(
content=content,
model=data.get("model", model),
@@ -215,6 +242,7 @@ async def complete(
reasoning_content=reasoning_content,
raw_response=data,
finish_reason=data.get("done_reason"),
+ tool_calls=tool_calls,
)
async def stream(
diff --git a/src/agentloom/providers/openai.py b/src/agentloom/providers/openai.py
index 5c2867a..caeaf19 100644
--- a/src/agentloom/providers/openai.py
+++ b/src/agentloom/providers/openai.py
@@ -80,6 +80,13 @@ def _format_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""Convert internal content blocks to OpenAI's vision/audio format."""
formatted: list[dict[str, Any]] = []
for msg in messages:
+ # Tool-loop messages (assistant with ``tool_calls``, role=``tool``
+ # with ``tool_call_id``) ship in OpenAI wire shape directly —
+ # pass them through verbatim so iteration 2+ sees the prior
+ # decision and result.
+ if "tool_calls" in msg or msg.get("role") == "tool":
+ formatted.append(msg)
+ continue
content = msg.get("content", "")
if isinstance(content, str):
formatted.append({"role": msg["role"], "content": content})
@@ -126,11 +133,25 @@ async def complete(
max_tokens: int | None = None,
**kwargs: Any,
) -> ProviderResponse:
+ # Tool definitions arrive as ``ToolDefinition`` Pydantic instances —
+ # translate to OpenAI's wire format before forwarding so callers
+ # don't have to know provider-specific shapes.
+ agentloom_tools = kwargs.pop("agentloom_tools", None)
+ agentloom_tool_choice = kwargs.pop("agentloom_tool_choice", None)
extras = validate_extra_kwargs("openai", "complete", kwargs, _OPENAI_EXTRA_PAYLOAD_KEYS)
# ``thinking_config`` is accepted at the step layer for YAML
# uniformity but has no chat-completions equivalent — drop it
# before splatting extras into the request body.
extras.pop("thinking_config", None)
+ if agentloom_tools:
+ from agentloom.steps._tools import (
+ translate_tool_choice_for_openai,
+ translate_tools_for_openai,
+ )
+
+ extras["tools"] = translate_tools_for_openai(agentloom_tools)
+ if agentloom_tool_choice is not None and agentloom_tool_choice != "none":
+ extras["tool_choice"] = translate_tool_choice_for_openai(agentloom_tool_choice)
payload: dict[str, Any] = {
"model": model,
"messages": self._format_messages(messages),
@@ -149,7 +170,8 @@ async def complete(
raise_for_status("openai", response)
data = response.json()
- content = data["choices"][0]["message"]["content"]
+ message = data["choices"][0]["message"]
+ content = message.get("content") or ""
usage_data = data.get("usage", {})
# o-series returns reasoning tokens under completion_tokens_details;
# ordinary gpt-* responses omit the field and resolve to 0.
@@ -168,6 +190,11 @@ async def complete(
reasoning_tokens=usage.reasoning_tokens,
)
+ from agentloom.steps._tools import parse_tool_calls_from_openai
+
+ tool_calls = parse_tool_calls_from_openai(message)
+
return ProviderResponse(
content=content,
model=data.get("model", model),
@@ -176,6 +203,7 @@ async def complete(
cost_usd=cost,
raw_response=data,
finish_reason=data["choices"][0].get("finish_reason"),
+ tool_calls=tool_calls,
)
async def stream(
diff --git a/src/agentloom/steps/_tools.py b/src/agentloom/steps/_tools.py
new file mode 100644
index 0000000..8e73bdf
--- /dev/null
+++ b/src/agentloom/steps/_tools.py
@@ -0,0 +1,269 @@
+"""Tool-calling helpers — wire-format translation, dispatch, and message synthesis."""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import logging
+import time
+from typing import Any
+
+import anyio
+
+from agentloom.core.models import ToolDefinition
+from agentloom.providers.base import ToolCall
+
+
+def _hash_text(text: str) -> str:
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
+
+
+logger = logging.getLogger("agentloom.steps")
+
+
+def translate_tools_for_openai(tools: list[ToolDefinition]) -> list[dict[str, Any]]:
+ """OpenAI shape: ``[{"type": "function", "function": {name, description, parameters}}]``."""
+ return [
+ {
+ "type": "function",
+ "function": {
+ "name": t.name,
+ "description": t.description,
+ "parameters": t.parameters or {"type": "object", "properties": {}},
+ },
+ }
+ for t in tools
+ ]
+
+
+def translate_tools_for_anthropic(tools: list[ToolDefinition]) -> list[dict[str, Any]]:
+ """Anthropic shape: ``[{name, description, input_schema}]``."""
+ return [
+ {
+ "name": t.name,
+ "description": t.description,
+ "input_schema": t.parameters or {"type": "object", "properties": {}},
+ }
+ for t in tools
+ ]
+
+
+def translate_tools_for_google(tools: list[ToolDefinition]) -> list[dict[str, Any]]:
+ """Google shape: ``[{"function_declarations": [{name, description, parameters}, ...]}]``.
+
+ Google groups all functions under one ``Tool`` object; we put them all
+ in a single declaration list since AgentLoom doesn't yet expose tool
+ grouping (everything is a function).
+ """
+ return [
+ {
+ "function_declarations": [
+ {
+ "name": t.name,
+ "description": t.description,
+ "parameters": t.parameters or {"type": "object", "properties": {}},
+ }
+ for t in tools
+ ]
+ }
+ ]
+
+
+def translate_tools_for_ollama(tools: list[ToolDefinition]) -> list[dict[str, Any]]:
+ """Ollama uses the OpenAI shape on supported models."""
+ return translate_tools_for_openai(tools)
+
+
+def translate_tool_choice_for_openai(choice: Any) -> Any:
+ """OpenAI ``tool_choice`` accepts ``"auto"`` / ``"required"`` / ``"none"``
+ or ``{"type": "function", "function": {"name": "..."}}``."""
+ if isinstance(choice, dict) and "name" in choice:
+ return {"type": "function", "function": {"name": choice["name"]}}
+ return choice
+
+
+def translate_tool_choice_for_anthropic(choice: Any) -> Any:
+ """Anthropic uses ``{"type": "auto"}`` / ``"any"`` / ``"tool"``."""
+ if choice == "auto":
+ return {"type": "auto"}
+ if choice == "required":
+ return {"type": "any"}
+ if choice == "none":
+ return None # omit the field entirely
+ if isinstance(choice, dict) and "name" in choice:
+ return {"type": "tool", "name": choice["name"]}
+ return {"type": "auto"}
+
+
+def parse_tool_calls_from_openai(message: dict[str, Any]) -> list[ToolCall]:
+ """OpenAI returns ``message.tool_calls = [{id, type, function: {name, arguments}}]``."""
+ raw_calls = message.get("tool_calls") or []
+ calls: list[ToolCall] = []
+ for entry in raw_calls:
+ if entry.get("type") != "function":
+ continue
+ fn = entry.get("function", {})
+ try:
+ args = json.loads(fn.get("arguments") or "{}")
+ except json.JSONDecodeError:
+ args = {}
+ calls.append(ToolCall(id=entry.get("id", ""), name=fn.get("name", ""), arguments=args))
+ return calls
+
+
+def parse_tool_calls_from_anthropic(content_blocks: list[dict[str, Any]]) -> list[ToolCall]:
+ """Anthropic returns ``content`` blocks; ``type=tool_use`` carries calls."""
+ calls: list[ToolCall] = []
+ for block in content_blocks:
+ if block.get("type") != "tool_use":
+ continue
+ calls.append(
+ ToolCall(
+ id=block.get("id", ""),
+ name=block.get("name", ""),
+ arguments=block.get("input", {}) or {},
+ )
+ )
+ return calls
+
+
+def parse_tool_calls_from_google(content_parts: list[dict[str, Any]]) -> list[ToolCall]:
+ """Google returns parts with ``functionCall: {name, args}``. Ids aren't
+ provider-assigned so we synthesize them — the result message echoes
+ the same name (no id round-trip needed)."""
+ calls: list[ToolCall] = []
+ for idx, part in enumerate(content_parts):
+ fc = part.get("functionCall")
+ if not fc:
+ continue
+ calls.append(
+ ToolCall(
+ id=f"google-{idx}",
+ name=fc.get("name", ""),
+ arguments=fc.get("args", {}) or {},
+ )
+ )
+ return calls
+
+
+async def dispatch_tool_calls(
+ calls: list[ToolCall],
+ tool_registry: Any,
+ *,
+ observer: Any | None = None,
+ step_id: str = "",
+) -> list[tuple[ToolCall, str, bool]]:
+ """Execute each call via the registry in parallel; preserve order.
+
+    Failed calls return ``(call, error_message, False)`` so the model can
+ recover on the next turn rather than aborting the loop. When *observer*
+ is provided, fires ``on_tool_call`` per call with hashes of the args
+ and result for trace-level observability without leaking PII.
+ """
+ results: list[tuple[ToolCall, str, bool]] = [None] * len(calls) # type: ignore[list-item]
+
+ async def _run(idx: int, call: ToolCall) -> None:
+ start = time.monotonic()
+ success = False
+ text: str
+ try:
+ tool = tool_registry.get(call.name)
+ except KeyError as e:
+ text = f"tool '{call.name}' not registered: {e}"
+ results[idx] = (call, text, False)
+ else:
+ try:
+ outcome = await tool.execute(**call.arguments)
+ text = outcome if isinstance(outcome, str) else json.dumps(outcome, default=str)
+ results[idx] = (call, text, True)
+ success = True
+ except Exception as e: # noqa: BLE001 — reported back to the model
+ logger.warning("Tool '%s' failed: %s", call.name, e)
+ text = f"tool execution failed: {e}"
+ results[idx] = (call, text, False)
+ if observer is not None:
+ hook = getattr(observer, "on_tool_call", None)
+ if callable(hook):
+ hook(
+ step_id=step_id,
+ call_id=call.id,
+ tool_name=call.name,
+ args_hash=_hash_text(json.dumps(call.arguments, sort_keys=True, default=str)),
+ result_hash=_hash_text(text),
+ duration_ms=(time.monotonic() - start) * 1000.0,
+ success=success,
+ )
+
+ async with anyio.create_task_group() as tg:
+ for idx, call in enumerate(calls):
+ tg.start_soon(_run, idx, call)
+
+ return results
+
+
+def build_assistant_message_with_tool_calls(
+ provider: str, content: str, calls: list[ToolCall]
+) -> dict[str, Any]:
+ """Replay the assistant's tool-call decision so the model sees its prior turn."""
+ if provider == "anthropic":
+ blocks: list[dict[str, Any]] = []
+ if content:
+ blocks.append({"type": "text", "text": content})
+ for c in calls:
+ blocks.append({"type": "tool_use", "id": c.id, "name": c.name, "input": c.arguments})
+ return {"role": "assistant", "content": blocks}
+ if provider == "google":
+ return {
+ "role": "model",
+ "parts": [{"functionCall": {"name": c.name, "args": c.arguments}} for c in calls],
+ }
+ # OpenAI / Ollama: ``content`` must be a string (formatter rejects None);
+ # empty string is accepted alongside tool_calls.
+ return {
+ "role": "assistant",
+ "content": content or "",
+ "tool_calls": [
+ {
+ "id": c.id,
+ "type": "function",
+ "function": {"name": c.name, "arguments": json.dumps(c.arguments)},
+ }
+ for c in calls
+ ],
+ }
+
+
+def build_tool_result_messages(
+ provider: str, results: list[tuple[ToolCall, str, bool]]
+) -> list[dict[str, Any]]:
+ """Translate tool outcomes into the role/shape each provider expects."""
+ if provider == "anthropic":
+ # Single user turn with one tool_result block per call.
+ blocks: list[dict[str, Any]] = []
+ for call, text, success in results:
+ blocks.append(
+ {
+ "type": "tool_result",
+ "tool_use_id": call.id,
+ "content": text,
+ "is_error": not success,
+ }
+ )
+ return [{"role": "user", "content": blocks}]
+ if provider == "google":
+ return [
+ {
+ "role": "function",
+ "parts": [
+ {
+ "functionResponse": {
+ "name": call.name,
+ "response": {"result": text} if success else {"error": text},
+ }
+ }
+ for call, text, success in results
+ ],
+ }
+ ]
+ # OpenAI / Ollama: one tool message per call, keyed by tool_call_id.
+ return [{"role": "tool", "tool_call_id": call.id, "content": text} for call, text, _ in results]
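A compact sketch of the dispatch contract above: names resolve through the registry, failures come back as ``(call, message, False)`` tuples rather than raising, and input order is preserved. ``AddTool`` is a toy mirroring the fixtures in the new test file below:

from typing import Any

import anyio

from agentloom.providers.base import ToolCall
from agentloom.steps._tools import dispatch_tool_calls
from agentloom.tools.base import BaseTool
from agentloom.tools.registry import ToolRegistry


class AddTool(BaseTool):
    name = "add"
    description = "Add two integers."

    async def execute(self, **kwargs: Any) -> Any:
        return kwargs["a"] + kwargs["b"]


async def main() -> None:
    registry = ToolRegistry()
    registry.register(AddTool())
    results = await dispatch_tool_calls(
        [
            ToolCall(id="c1", name="add", arguments={"a": 2, "b": 3}),
            ToolCall(id="c2", name="missing", arguments={}),
        ],
        registry,
    )
    for call, text, ok in results:
        print(call.name, ok, text)
    # -> add True 5
    # -> missing False tool 'missing' not registered: ...


anyio.run(main)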
diff --git a/src/agentloom/steps/llm_call.py b/src/agentloom/steps/llm_call.py
index c9be355..a786d81 100644
--- a/src/agentloom/steps/llm_call.py
+++ b/src/agentloom/steps/llm_call.py
@@ -56,6 +56,97 @@ def _build_prompt_metadata(
class LLMCallStep(BaseStep):
"""Executes an LLM call with prompt template rendering from state."""
+ @staticmethod
+ async def _run_tool_loop(
+ *,
+ context: StepContext,
+ step: StepDefinition,
+ messages: list[dict[str, Any]],
+ model: str,
+ provider_kwargs: dict[str, Any],
+ ) -> Any:
+ """Iterate complete() → dispatch tools → re-prompt until done.
+
+ Cost and tokens accumulate across iterations. ``max_tool_iterations``
+ bounds the loop; collapses to a single call when ``tools`` is empty.
+ """
+ from agentloom.core.results import TokenUsage
+ from agentloom.providers.base import ProviderResponse
+ from agentloom.steps._tools import (
+ build_assistant_message_with_tool_calls,
+ build_tool_result_messages,
+ dispatch_tool_calls,
+ )
+
+ accumulated_prompt = 0
+ accumulated_completion = 0
+ accumulated_reasoning = 0
+ accumulated_cost = 0.0
+
+ max_iterations = max(step.max_tool_iterations, 1)
+ gateway = context.provider_gateway
+ if gateway is None:
+ raise StepError(step.id, "No provider gateway configured")
+ for _ in range(max_iterations):
+ response = await gateway.complete(
+ messages=messages,
+ model=model,
+ temperature=step.temperature,
+ max_tokens=step.max_tokens,
+ step_id=step.id,
+ **provider_kwargs,
+ )
+ accumulated_prompt += response.usage.prompt_tokens
+ accumulated_completion += response.usage.completion_tokens
+ accumulated_reasoning += response.usage.reasoning_tokens
+ accumulated_cost += response.cost_usd
+
+ if not response.tool_calls or not step.tools:
+ # Replace the response usage with the accumulated totals so
+ # the caller sees the full conversation cost.
+ response.usage = TokenUsage(
+ prompt_tokens=accumulated_prompt,
+ completion_tokens=accumulated_completion,
+ total_tokens=(
+ accumulated_prompt + accumulated_completion + accumulated_reasoning
+ ),
+ reasoning_tokens=accumulated_reasoning,
+ )
+ response.cost_usd = accumulated_cost
+ return response
+
+ if context.tool_registry is None:
+ raise StepError(
+ step.id,
+ "Tool registry required for tools= declaration but not configured.",
+ )
+
+ results = await dispatch_tool_calls(
+ response.tool_calls,
+ context.tool_registry,
+ observer=context.observer,
+ step_id=step.id,
+ )
+ messages.append(
+ build_assistant_message_with_tool_calls(
+ response.provider, response.content, response.tool_calls
+ )
+ )
+ messages.extend(build_tool_result_messages(response.provider, results))
+
+ # Loop exhausted; surface the last response with the cap noted as
+ # finish_reason so callers can detect it.
+ last_response: ProviderResponse = response # noqa: F821 — set in loop
+ last_response.usage = TokenUsage(
+ prompt_tokens=accumulated_prompt,
+ completion_tokens=accumulated_completion,
+ total_tokens=(accumulated_prompt + accumulated_completion + accumulated_reasoning),
+ reasoning_tokens=accumulated_reasoning,
+ )
+ last_response.cost_usd = accumulated_cost
+ last_response.finish_reason = "max_tool_iterations"
+ return last_response
+
@staticmethod
def _build_thinking_kwargs(step: StepDefinition) -> dict[str, Any]:
"""Forward ``StepDefinition.thinking`` to the gateway as a config object.
@@ -148,14 +239,21 @@ async def execute(self, context: StepContext) -> StepResult:
)
provider_kwargs = self._build_thinking_kwargs(step)
+ if step.tools:
+ provider_kwargs["agentloom_tools"] = step.tools
+ provider_kwargs["agentloom_tool_choice"] = step.tool_choice
+
+ # Tool-call loop: re-prompt with tool results until the model
+ # stops requesting tools or we exhaust ``max_tool_iterations``.
+ # Costs and tokens accumulate across iterations; only the final
+ # response's content is exposed to the caller.
try:
- response = await context.provider_gateway.complete(
+ response = await self._run_tool_loop(
+ context=context,
+ step=step,
messages=messages,
model=model,
- temperature=step.temperature,
- max_tokens=step.max_tokens,
- step_id=step.id,
- **provider_kwargs,
+ provider_kwargs=provider_kwargs,
)
except Exception as e:
duration = (time.monotonic() - start) * 1000
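Concretely, after iteration 1 against an OpenAI-compatible provider the re-prompted ``messages`` list carries the prior decision and its result in wire shape (the same fixture the pass-through tests below assert on); iteration 2 sends it straight back through ``gateway.complete()``:

messages = [
    {"role": "user", "content": "What is 2+3?"},
    {
        "role": "assistant",
        "content": "",
        "tool_calls": [
            {
                "id": "c1",
                "type": "function",
                "function": {"name": "add", "arguments": '{"a": 2, "b": 3}'},
            }
        ],
    },
    {"role": "tool", "tool_call_id": "c1", "content": "5"},
]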
diff --git a/tests/observability/test_metrics.py b/tests/observability/test_metrics.py
index 0e6f85a..4af3b58 100644
--- a/tests/observability/test_metrics.py
+++ b/tests/observability/test_metrics.py
@@ -78,6 +78,28 @@ def test_record_tokens(self) -> None:
if mm._backend == "otel":
mm.shutdown()
+ def test_record_tool_call(self) -> None:
+ # Records both the per-tool counter (with status label) and the
+ # latency histogram. Status maps to ``"success"`` / ``"failure"``
+ # so dashboards can derive a per-tool failure rate.
+ from unittest.mock import MagicMock
+
+ mm = MetricsManager(enabled=True)
+ if mm._backend != "otel":
+ return
+ counter_spy = MagicMock()
+ histogram_spy = MagicMock()
+ mm._tool_call_counter = counter_spy
+ mm._tool_call_histogram = histogram_spy
+ mm.record_tool_call("add", success=True, duration_s=0.42)
+ mm.record_tool_call("add", success=False, duration_s=1.1)
+
+ counter_spy.add.assert_any_call(1, {"tool_name": "add", "status": "success"})
+ counter_spy.add.assert_any_call(1, {"tool_name": "add", "status": "failure"})
+ histogram_spy.record.assert_any_call(0.42, {"tool_name": "add"})
+ histogram_spy.record.assert_any_call(1.1, {"tool_name": "add"})
+ mm.shutdown()
+
def test_reasoning_tokens_metric_emitted(self) -> None:
# When ``reasoning_tokens`` is non-zero, ``record_tokens`` must
# observe the histogram a third time with
diff --git a/tests/observability/test_noop.py b/tests/observability/test_noop.py
index ed29383..cda83e4 100644
--- a/tests/observability/test_noop.py
+++ b/tests/observability/test_noop.py
@@ -174,6 +174,15 @@ def test_every_hook_executes_no_op_body(self) -> None:
obs.on_provider_call_start("s1", "openai", "gpt-4o-mini")
obs.on_provider_call_end("s1", "openai", "gpt-4o-mini", 1.5)
obs.on_provider_error("openai", "RateLimitError")
+ obs.on_tool_call(
+ step_id="s1",
+ call_id="c1",
+ tool_name="add",
+ args_hash="x",
+ result_hash="y",
+ duration_ms=1.0,
+ success=True,
+ )
obs.on_stream_response("openai", "gpt-4o-mini", 0.3)
obs.on_tokens("openai", "gpt-4o-mini", 100, 50)
obs.on_mock_replay("wf", "s1", "step_id")
diff --git a/tests/observability/test_observer.py b/tests/observability/test_observer.py
index ace227d..35f8008 100644
--- a/tests/observability/test_observer.py
+++ b/tests/observability/test_observer.py
@@ -260,6 +260,56 @@ def test_dangling_span_closed_without_tracing(self) -> None:
span.end.assert_called_once()
+class TestToolCallObservability:
+ """``on_tool_call`` records the per-tool counter / histogram via the
+ metrics manager and emits an ``execute_tool {name}`` child span carrying
+ the tool name, args/result hashes, duration, and success flag."""
+
+ def test_records_metric_and_emits_span(self) -> None:
+ tracing = MagicMock()
+ metrics = MagicMock()
+ observer = WorkflowObserver(tracing=tracing, metrics=metrics)
+ observer.on_workflow_start("wf", run_id="rid-7")
+ tracing.start_span.reset_mock() # ignore the workflow span
+ observer.on_tool_call(
+ step_id="s1",
+ call_id="c1",
+ tool_name="add",
+ args_hash="abcd1234",
+ result_hash="ef567890",
+ duration_ms=12.3,
+ success=True,
+ )
+ # Metric — counter + histogram via record_tool_call.
+ metrics.record_tool_call.assert_called_once_with("add", True, 0.0123)
+ # Span — name follows the canonical ``execute_tool {tool_name}``
+ # template; attrs include the run_id propagation + tool fields.
+ tracing.start_span.assert_called_once()
+ name = tracing.start_span.call_args.args[0]
+ attrs = tracing.start_span.call_args.kwargs["attributes"]
+ assert name == "execute_tool add"
+ assert attrs["tool.name"] == "add"
+ assert attrs["tool.call_id"] == "c1"
+ assert attrs["tool.args_hash"] == "abcd1234"
+ assert attrs["tool.result_hash"] == "ef567890"
+ assert attrs["tool.duration_ms"] == 12.3
+ assert attrs["tool.success"] is True
+ assert attrs["workflow.run_id"] == "rid-7"
+
+ def test_noop_when_no_tracing_or_metrics(self) -> None:
+ # Defensive: tool dispatch shouldn't fail when observability is off.
+ observer = WorkflowObserver()
+ observer.on_tool_call(
+ step_id="s1",
+ call_id="c1",
+ tool_name="x",
+ args_hash="a",
+ result_hash="b",
+ duration_ms=1.0,
+ success=False,
+ )
+
+
class TestStreamResponse:
def test_records_ttft_and_stream_count(self) -> None:
metrics = MagicMock()
diff --git a/tests/providers/test_tool_calling.py b/tests/providers/test_tool_calling.py
new file mode 100644
index 0000000..8cb461b
--- /dev/null
+++ b/tests/providers/test_tool_calling.py
@@ -0,0 +1,730 @@
+"""Tests for native tool calling across providers."""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+import httpx
+import respx
+
+from agentloom.core.engine import WorkflowEngine
+from agentloom.core.models import (
+ StepDefinition,
+ StepType,
+ ToolDefinition,
+ WorkflowConfig,
+ WorkflowDefinition,
+)
+from agentloom.core.results import StepStatus, WorkflowStatus
+from agentloom.providers.anthropic import AnthropicProvider
+from agentloom.providers.base import ToolCall
+from agentloom.providers.gateway import ProviderGateway
+from agentloom.providers.openai import OpenAIProvider
+from agentloom.steps._tools import (
+ build_assistant_message_with_tool_calls,
+ build_tool_result_messages,
+ dispatch_tool_calls,
+ parse_tool_calls_from_anthropic,
+ parse_tool_calls_from_google,
+ parse_tool_calls_from_openai,
+ translate_tools_for_anthropic,
+ translate_tools_for_google,
+ translate_tools_for_openai,
+)
+from agentloom.tools.base import BaseTool
+from agentloom.tools.registry import ToolRegistry
+
+
+class _AddTool(BaseTool):
+ name = "add"
+ description = "Add two integers."
+ parameters_schema = {
+ "type": "object",
+ "properties": {"a": {"type": "integer"}, "b": {"type": "integer"}},
+ "required": ["a", "b"],
+ }
+
+ async def execute(self, **kwargs: Any) -> Any:
+ return kwargs["a"] + kwargs["b"]
+
+
+class _LookupTool(BaseTool):
+ name = "lookup"
+ description = "Look up a value."
+ parameters_schema = {
+ "type": "object",
+ "properties": {"key": {"type": "string"}},
+ "required": ["key"],
+ }
+
+ async def execute(self, **kwargs: Any) -> Any:
+ return f"value-for-{kwargs['key']}"
+
+
+def _registry_with(*tools: BaseTool) -> ToolRegistry:
+ reg = ToolRegistry()
+ for t in tools:
+ reg.register(t)
+ return reg
+
+
+class TestToolTranslation:
+ def test_openai_shape(self) -> None:
+ out = translate_tools_for_openai(
+ [ToolDefinition(name="add", description="d", parameters={"type": "object"})]
+ )
+ assert out == [
+ {
+ "type": "function",
+ "function": {
+ "name": "add",
+ "description": "d",
+ "parameters": {"type": "object"},
+ },
+ }
+ ]
+
+ def test_anthropic_shape(self) -> None:
+ out = translate_tools_for_anthropic(
+ [ToolDefinition(name="add", description="d", parameters={"type": "object"})]
+ )
+ assert out == [{"name": "add", "description": "d", "input_schema": {"type": "object"}}]
+
+ def test_google_shape_groups_under_function_declarations(self) -> None:
+ out = translate_tools_for_google(
+ [
+ ToolDefinition(name="add", description="d", parameters={"type": "object"}),
+ ToolDefinition(name="lookup", description="l", parameters={"type": "object"}),
+ ]
+ )
+ assert len(out) == 1
+ decls = out[0]["function_declarations"]
+ assert {d["name"] for d in decls} == {"add", "lookup"}
+
+
+class TestToolParsing:
+ def test_openai_parses_tool_calls(self) -> None:
+ message = {
+ "role": "assistant",
+ "content": None,
+ "tool_calls": [
+ {
+ "id": "call_1",
+ "type": "function",
+ "function": {"name": "add", "arguments": '{"a": 2, "b": 3}'},
+ }
+ ],
+ }
+ calls = parse_tool_calls_from_openai(message)
+ assert calls == [ToolCall(id="call_1", name="add", arguments={"a": 2, "b": 3})]
+
+ def test_openai_skips_non_function_entries(self) -> None:
+ message = {"tool_calls": [{"id": "x", "type": "code_interpreter"}, {"type": "function"}]}
+ calls = parse_tool_calls_from_openai(message)
+ # Only the second entry is a function — its function dict is empty
+ # so name="" and args={}; we accept it (degenerate case).
+ assert len(calls) == 1
+ assert calls[0].name == ""
+
+ def test_anthropic_parses_tool_use_blocks(self) -> None:
+ blocks = [
+ {"type": "text", "text": "I'll use a tool."},
+ {"type": "tool_use", "id": "tu_1", "name": "lookup", "input": {"key": "x"}},
+ ]
+ calls = parse_tool_calls_from_anthropic(blocks)
+ assert calls == [ToolCall(id="tu_1", name="lookup", arguments={"key": "x"})]
+
+ def test_google_parses_function_call_parts(self) -> None:
+ parts = [
+ {"text": "I'll call a function."},
+ {"functionCall": {"name": "add", "args": {"a": 1, "b": 2}}},
+ ]
+ calls = parse_tool_calls_from_google(parts)
+ assert calls == [ToolCall(id="google-1", name="add", arguments={"a": 1, "b": 2})]
+
+
+class TestDispatchToolCalls:
+ async def test_dispatch_runs_in_parallel(self) -> None:
+ registry = _registry_with(_AddTool(), _LookupTool())
+ calls = [
+ ToolCall(id="c1", name="add", arguments={"a": 2, "b": 3}),
+ ToolCall(id="c2", name="lookup", arguments={"key": "alpha"}),
+ ]
+ results = await dispatch_tool_calls(calls, registry)
+ assert len(results) == 2
+ # Order preserved.
+ assert results[0][0].id == "c1"
+ assert results[0][1] == "5"
+ assert results[0][2] is True
+ assert results[1][1] == "value-for-alpha"
+
+ async def test_unknown_tool_yields_failure(self) -> None:
+ registry = _registry_with(_AddTool())
+ calls = [ToolCall(id="c1", name="missing", arguments={})]
+ results = await dispatch_tool_calls(calls, registry)
+ assert results[0][2] is False
+ assert "not registered" in results[0][1]
+
+ async def test_tool_exception_yields_failure(self) -> None:
+ class _Bad(BaseTool):
+ name = "bad"
+
+ async def execute(self, **kwargs: Any) -> Any:
+ raise RuntimeError("kaboom")
+
+ registry = _registry_with(_Bad())
+ calls = [ToolCall(id="c1", name="bad", arguments={})]
+ results = await dispatch_tool_calls(calls, registry)
+ assert results[0][2] is False
+ assert "kaboom" in results[0][1]
+
+
+class TestResultMessageBuilders:
+ def test_openai_role_tool_per_call(self) -> None:
+ c = ToolCall(id="c1", name="add", arguments={})
+ out = build_tool_result_messages("openai", [(c, "5", True)])
+ assert out == [{"role": "tool", "tool_call_id": "c1", "content": "5"}]
+
+ def test_anthropic_single_user_turn_with_tool_result_blocks(self) -> None:
+ c = ToolCall(id="tu_1", name="lookup", arguments={})
+ out = build_tool_result_messages("anthropic", [(c, "ok", True)])
+ assert len(out) == 1
+ assert out[0]["role"] == "user"
+ block = out[0]["content"][0]
+ assert block["type"] == "tool_result"
+ assert block["tool_use_id"] == "tu_1"
+ assert block["content"] == "ok"
+ assert block["is_error"] is False
+
+ def test_assistant_message_for_anthropic_includes_tool_use_blocks(self) -> None:
+ c = ToolCall(id="tu_1", name="lookup", arguments={"key": "x"})
+ msg = build_assistant_message_with_tool_calls("anthropic", "Thinking...", [c])
+ assert msg["role"] == "assistant"
+ assert msg["content"][0] == {"type": "text", "text": "Thinking..."}
+ assert msg["content"][1] == {
+ "type": "tool_use",
+ "id": "tu_1",
+ "name": "lookup",
+ "input": {"key": "x"},
+ }
+
+
+class TestOpenAIToolCallingWire:
+ @respx.mock
+ async def test_complete_returns_tool_calls_and_translates_tools(self) -> None:
+ route = respx.post("https://api.openai.com/v1/chat/completions").mock(
+ return_value=httpx.Response(
+ 200,
+ json={
+ "model": "gpt-4o-mini",
+ "choices": [
+ {
+ "message": {
+ "role": "assistant",
+ "content": None,
+ "tool_calls": [
+ {
+ "id": "c1",
+ "type": "function",
+ "function": {
+ "name": "add",
+ "arguments": '{"a": 2, "b": 3}',
+ },
+ }
+ ],
+ },
+ "finish_reason": "tool_calls",
+ }
+ ],
+ "usage": {"prompt_tokens": 5, "completion_tokens": 9},
+ },
+ )
+ )
+ provider = OpenAIProvider(api_key="k")
+ tools = [
+ ToolDefinition(
+ name="add",
+ description="add two ints",
+ parameters={
+ "type": "object",
+ "properties": {
+ "a": {"type": "integer"},
+ "b": {"type": "integer"},
+ },
+ },
+ )
+ ]
+ r = await provider.complete(
+ messages=[{"role": "user", "content": "what is 2+3?"}],
+ model="gpt-4o-mini",
+ agentloom_tools=tools,
+ )
+ body = json.loads(route.calls[0].request.content)
+ assert body["tools"][0]["function"]["name"] == "add"
+ assert r.tool_calls == [ToolCall(id="c1", name="add", arguments={"a": 2, "b": 3})]
+ assert r.finish_reason == "tool_calls"
+ await provider.close()
+
+
+class TestAnthropicToolCallingWire:
+ @respx.mock
+ async def test_translates_tools_and_parses_tool_use_blocks(self) -> None:
+ route = respx.post("https://api.anthropic.com/v1/messages").mock(
+ return_value=httpx.Response(
+ 200,
+ json={
+ "content": [
+ {
+ "type": "tool_use",
+ "id": "tu_1",
+ "name": "lookup",
+ "input": {"key": "alpha"},
+ }
+ ],
+ "model": "claude-haiku-4-5-20251001",
+ "stop_reason": "tool_use",
+ "usage": {"input_tokens": 5, "output_tokens": 3},
+ },
+ )
+ )
+ provider = AnthropicProvider(api_key="k")
+ tools = [ToolDefinition(name="lookup", description="d", parameters={"type": "object"})]
+ r = await provider.complete(
+ messages=[{"role": "user", "content": "x"}],
+ model="claude-haiku-4-5-20251001",
+ agentloom_tools=tools,
+ agentloom_tool_choice="required",
+ )
+ body = json.loads(route.calls[0].request.content)
+ assert body["tools"][0]["name"] == "lookup"
+ assert body["tool_choice"] == {"type": "any"}
+ assert r.tool_calls == [ToolCall(id="tu_1", name="lookup", arguments={"key": "alpha"})]
+ await provider.close()
+
+
+class TestGoogleToolChoiceDictSelectsSpecificFunction:
+ """``tool_choice={"name": "fn"}`` must translate to Gemini's
+ ANY-mode + ``allowedFunctionNames`` rather than silently falling
+ through to AUTO."""
+
+ @respx.mock
+ async def test_specific_function_selection(self) -> None:
+ from agentloom.providers.google import GoogleProvider
+
+ route = respx.post(
+ "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"
+ ).mock(
+ return_value=httpx.Response(
+ 200,
+ json={
+ "model": "gemini-2.5-flash",
+ "candidates": [{"content": {"parts": [{"text": "ok"}]}}],
+ "usageMetadata": {
+ "promptTokenCount": 1,
+ "candidatesTokenCount": 1,
+ "totalTokenCount": 2,
+ },
+ },
+ )
+ )
+ provider = GoogleProvider(api_key="k")
+ await provider.complete(
+ messages=[{"role": "user", "content": "ping"}],
+ model="gemini-2.5-flash",
+ agentloom_tools=[
+ ToolDefinition(name="lookup", description="d", parameters={"type": "object"})
+ ],
+ agentloom_tool_choice={"name": "lookup"},
+ )
+ body = json.loads(route.calls[0].request.content)
+ cfg = body["toolConfig"]["functionCallingConfig"]
+ assert cfg["mode"] == "ANY"
+ assert cfg["allowedFunctionNames"] == ["lookup"]
+ await provider._client.aclose()
+
+
+class TestDispatchObserverHook:
+ """``dispatch_tool_calls`` calls ``observer.on_tool_call`` per call so
+ each dispatch lands on the trace as a child span tagged by tool +
+ success status (#116 observability spec)."""
+
+ async def test_observer_hook_fires_with_hashes_and_duration(self) -> None:
+ from unittest.mock import MagicMock
+
+ observer = MagicMock()
+ registry = _registry_with(_AddTool())
+ calls = [ToolCall(id="c1", name="add", arguments={"a": 2, "b": 3})]
+ await dispatch_tool_calls(calls, registry, observer=observer, step_id="s1")
+
+ observer.on_tool_call.assert_called_once()
+ kwargs = observer.on_tool_call.call_args.kwargs
+ assert kwargs["step_id"] == "s1"
+ assert kwargs["call_id"] == "c1"
+ assert kwargs["tool_name"] == "add"
+ assert kwargs["success"] is True
+ # Hashes are non-empty hex strings — args/result never logged raw.
+ assert len(kwargs["args_hash"]) == 16
+ assert len(kwargs["result_hash"]) == 16
+ assert kwargs["duration_ms"] >= 0.0
+
+ async def test_observer_hook_fires_with_failure_on_unknown_tool(self) -> None:
+ from unittest.mock import MagicMock
+
+ observer = MagicMock()
+ registry = _registry_with(_AddTool())
+ calls = [ToolCall(id="c1", name="missing", arguments={})]
+ await dispatch_tool_calls(calls, registry, observer=observer, step_id="s1")
+
+ observer.on_tool_call.assert_called_once()
+ assert observer.on_tool_call.call_args.kwargs["success"] is False
+
+
+class TestStreamEvents:
+ """Typed stream events surface (issue #116). Adapters that haven't
+ wired the typed iterator fall back to wrapping plain text chunks as
+ ``TextDelta`` events, terminated with ``StreamDone``."""
+
+ async def test_event_iterator_when_wired_takes_precedence(self) -> None:
+ # Adapters that emit typed events natively register a separate
+ # iterator; the default text-wrapping path is bypassed entirely.
+ from agentloom.providers.base import (
+ StreamDone,
+ StreamEvent,
+ StreamResponse,
+ ToolCallComplete,
+ )
+
+ async def _events() -> Any:
+ yield ToolCallComplete(tool_call=ToolCall(id="c1", name="add", arguments={"a": 1}))
+ yield StreamDone(finish_reason="tool_calls")
+
+ sr = StreamResponse(model="m", provider="p")
+ sr._set_event_iterator(_events())
+
+ events: list[StreamEvent] = [evt async for evt in sr.events()]
+ assert len(events) == 2
+ assert isinstance(events[0], ToolCallComplete)
+ assert events[0].tool_call.name == "add"
+
+ async def test_default_events_wrap_text_chunks(self) -> None:
+ from agentloom.providers.base import (
+ StreamDone,
+ StreamResponse,
+ TextDelta,
+ )
+
+ async def _chunks() -> Any:
+ yield "hello "
+ yield "world"
+
+ sr = StreamResponse(model="m", provider="p")
+ sr._set_iterator(_chunks())
+ sr.finish_reason = "stop"
+
+ events = [evt async for evt in sr.events()]
+ # Two TextDelta events + one StreamDone — the default wrapper
+ # provides the surface even when the adapter doesn't emit typed
+ # events natively.
+ assert [type(e).__name__ for e in events] == ["TextDelta", "TextDelta", "StreamDone"]
+ assert isinstance(events[0], TextDelta)
+ assert events[0].chunk == "hello "
+ assert isinstance(events[-1], StreamDone)
+ assert events[-1].finish_reason == "stop"
+
+
+class TestProviderFormatPreservesToolMessages:
+ """Regression for the critical bug where providers' ``_format_messages``
+ silently dropped ``tool_calls`` / ``tool_call_id`` / ``parts`` keys on
+ tool-loop iteration 2+, breaking real (non-mock) tool calling."""
+
+ def test_openai_passes_through_assistant_with_tool_calls(self) -> None:
+ msgs = [
+ {"role": "user", "content": "what is 2+3?"},
+ {
+ "role": "assistant",
+ "content": "",
+ "tool_calls": [
+ {
+ "id": "c1",
+ "type": "function",
+ "function": {"name": "add", "arguments": '{"a": 2, "b": 3}'},
+ }
+ ],
+ },
+ {"role": "tool", "tool_call_id": "c1", "content": "5"},
+ ]
+ out = OpenAIProvider._format_messages(msgs)
+ # The two tool-loop messages must round-trip with their wire keys
+ # intact — without this, OpenAI 400s on iteration 2 because the
+ # assistant message has no tool_calls and the tool message has no
+ # tool_call_id to anchor.
+ assert out[1]["tool_calls"][0]["function"]["name"] == "add"
+ assert out[2] == {"role": "tool", "tool_call_id": "c1", "content": "5"}
+
+ def test_ollama_passes_through_tool_loop_messages(self) -> None:
+ from agentloom.providers.ollama import OllamaProvider
+
+ msgs = [
+ {"role": "user", "content": "ping"},
+ {
+ "role": "assistant",
+ "content": "",
+ "tool_calls": [{"id": "c1", "type": "function", "function": {"name": "x"}}],
+ },
+ {"role": "tool", "tool_call_id": "c1", "content": "ok"},
+ ]
+ out = OllamaProvider._format_messages(msgs)
+ assert "tool_calls" in out[1]
+ assert out[2]["tool_call_id"] == "c1"
+
+ def test_google_passes_through_function_call_and_response_parts(self) -> None:
+ from agentloom.providers.google import GoogleProvider
+
+ msgs = [
+ {"role": "user", "content": "ping"},
+ # Assistant tool-call decision (Gemini wire shape).
+ {
+ "role": "model",
+ "parts": [{"functionCall": {"name": "add", "args": {"a": 1, "b": 2}}}],
+ },
+ # Tool result (Gemini wire shape).
+ {
+ "role": "function",
+ "parts": [{"functionResponse": {"name": "add", "response": {"result": "3"}}}],
+ },
+ ]
+ _system, contents = GoogleProvider._format_messages(msgs)
+ # Both ``parts``-based messages must round-trip unchanged so Gemini
+ # sees the prior call + result on iteration 2; previously the
+ # ``content``-based reformatter dropped ``parts`` entirely.
+ assert contents[1]["parts"][0]["functionCall"]["name"] == "add"
+ assert contents[2]["role"] == "function"
+ assert contents[2]["parts"][0]["functionResponse"]["response"] == {"result": "3"}
+
+
+class TestLLMStepToolLoop:
+ @respx.mock
+ async def test_step_dispatches_and_iterates(self) -> None:
+ # Iteration 1: model asks to call `add`.
+ # Iteration 2: model responds with the final answer.
+ respx.post("https://api.openai.com/v1/chat/completions").mock(
+ side_effect=[
+ httpx.Response(
+ 200,
+ json={
+ "model": "gpt-4o-mini",
+ "choices": [
+ {
+ "message": {
+ "role": "assistant",
+ "content": None,
+ "tool_calls": [
+ {
+ "id": "c1",
+ "type": "function",
+ "function": {
+ "name": "add",
+ "arguments": '{"a": 2, "b": 3}',
+ },
+ }
+ ],
+ },
+ "finish_reason": "tool_calls",
+ }
+ ],
+ "usage": {"prompt_tokens": 5, "completion_tokens": 9},
+ },
+ ),
+ httpx.Response(
+ 200,
+ json={
+ "model": "gpt-4o-mini",
+ "choices": [
+ {
+ "message": {"role": "assistant", "content": "The answer is 5."},
+ "finish_reason": "stop",
+ }
+ ],
+ "usage": {"prompt_tokens": 12, "completion_tokens": 6},
+ },
+ ),
+ ]
+ )
+ workflow = WorkflowDefinition(
+ name="agent",
+ config=WorkflowConfig(provider="openai", model="gpt-4o-mini"),
+ state={},
+ steps=[
+ StepDefinition(
+ id="ask",
+ type=StepType.LLM_CALL,
+ prompt="What is 2+3?",
+ tools=[
+ ToolDefinition(
+ name="add",
+ description="add two integers",
+ parameters={
+ "type": "object",
+ "properties": {
+ "a": {"type": "integer"},
+ "b": {"type": "integer"},
+ },
+ "required": ["a", "b"],
+ },
+ )
+ ],
+ output="answer",
+ )
+ ],
+ )
+ gateway = ProviderGateway()
+ gateway.register(OpenAIProvider(api_key="k"))
+ engine = WorkflowEngine(
+ workflow=workflow,
+ provider_gateway=gateway,
+ tool_registry=_registry_with(_AddTool()),
+ )
+ result = await engine.run()
+ await gateway.close()
+
+ assert result.status == WorkflowStatus.SUCCESS
+ sr = result.step_results["ask"]
+ assert sr.status == StepStatus.SUCCESS
+ # Cost / tokens accumulated across both iterations.
+ assert sr.token_usage.prompt_tokens == 5 + 12
+ assert sr.token_usage.completion_tokens == 9 + 6
+ assert result.final_state["answer"] == "The answer is 5."
+
+ @respx.mock
+ async def test_step_respects_max_tool_iterations(self) -> None:
+ # Model never stops asking for the tool; loop must cap at 2.
+ respx.post("https://api.openai.com/v1/chat/completions").mock(
+ return_value=httpx.Response(
+ 200,
+ json={
+ "model": "gpt-4o-mini",
+ "choices": [
+ {
+ "message": {
+ "role": "assistant",
+ "content": None,
+ "tool_calls": [
+ {
+ "id": "c1",
+ "type": "function",
+ "function": {
+ "name": "add",
+ "arguments": '{"a": 1, "b": 1}',
+ },
+ }
+ ],
+ },
+ "finish_reason": "tool_calls",
+ }
+ ],
+ "usage": {"prompt_tokens": 3, "completion_tokens": 2},
+ },
+ )
+ )
+ workflow = WorkflowDefinition(
+ name="agent-loop",
+ config=WorkflowConfig(provider="openai", model="gpt-4o-mini"),
+ state={},
+ steps=[
+ StepDefinition(
+ id="ask",
+ type=StepType.LLM_CALL,
+ prompt="loop",
+ tools=[
+ ToolDefinition(
+ name="add",
+ description="d",
+ parameters={"type": "object"},
+ )
+ ],
+ max_tool_iterations=2,
+ output="answer",
+ )
+ ],
+ )
+ gateway = ProviderGateway()
+ gateway.register(OpenAIProvider(api_key="k"))
+ engine = WorkflowEngine(
+ workflow=workflow,
+ provider_gateway=gateway,
+ tool_registry=_registry_with(_AddTool()),
+ )
+ result = await engine.run()
+ await gateway.close()
+
+ sr = result.step_results["ask"]
+ assert sr.status == StepStatus.SUCCESS
+ # Cap surfaced via finish_reason on the prompt metadata.
+ assert sr.prompt_metadata is not None
+ assert sr.prompt_metadata.finish_reason == "max_tool_iterations"
+
+ async def test_tool_dispatch_without_registry_raises(self) -> None:
+ # When a step declares tools but no registry is attached, the loop
+ # raises a StepError, which surfaces as a FAILED step result.
+ with respx.mock:
+ respx.post("https://api.openai.com/v1/chat/completions").mock(
+ return_value=httpx.Response(
+ 200,
+ json={
+ "model": "gpt-4o-mini",
+ "choices": [
+ {
+ "message": {
+ "role": "assistant",
+ "content": None,
+ "tool_calls": [
+ {
+ "id": "c1",
+ "type": "function",
+ "function": {
+ "name": "add",
+ "arguments": "{}",
+ },
+ }
+ ],
+ },
+ "finish_reason": "tool_calls",
+ }
+ ],
+ "usage": {"prompt_tokens": 1, "completion_tokens": 1},
+ },
+ )
+ )
+ workflow = WorkflowDefinition(
+ name="no-registry",
+ config=WorkflowConfig(provider="openai", model="gpt-4o-mini"),
+ state={},
+ steps=[
+ StepDefinition(
+ id="ask",
+ type=StepType.LLM_CALL,
+ prompt="hi",
+ tools=[
+ ToolDefinition(
+ name="add", description="d", parameters={"type": "object"}
+ )
+ ],
+ max_tool_iterations=1,
+ retry={"max_retries": 0},
+ )
+ ],
+ )
+ gateway = ProviderGateway()
+ gateway.register(OpenAIProvider(api_key="k"))
+ engine = WorkflowEngine(
+ workflow=workflow,
+ provider_gateway=gateway,
+ # tool_registry intentionally omitted
+ )
+ result = await engine.run()
+ await gateway.close()
+
+ sr = result.step_results["ask"]
+ assert sr.status == StepStatus.FAILED
+ assert "tool registry" in (sr.error or "").lower()
From f9f820c7c37c94105dedc59869008a2bfef4895d Mon Sep 17 00:00:00 2001
From: Carlos Chinchilla Corbacho
<188046461+cchinchilla-dev@users.noreply.github.com>
Date: Fri, 8 May 2026 17:48:42 +0200
Subject: [PATCH 2/4] add tool-calling example and replay support for
tool-iteration loops
---
examples/35_tool_calling.yaml | 51 ++++++++++++++++++++++++++++
recordings/tool_calling.json | 27 +++++++++++++++
src/agentloom/providers/anthropic.py | 7 ++++
src/agentloom/providers/mock.py | 40 ++++++++++++++++++++--
4 files changed, 122 insertions(+), 3 deletions(-)
create mode 100644 examples/35_tool_calling.yaml
create mode 100644 recordings/tool_calling.json
diff --git a/examples/35_tool_calling.yaml b/examples/35_tool_calling.yaml
new file mode 100644
index 0000000..701e1d7
--- /dev/null
+++ b/examples/35_tool_calling.yaml
@@ -0,0 +1,51 @@
+name: tool-calling-agent
+version: "1.0"
+description: |
+ Demonstrates native tool/function calling. The model decides
+ at runtime to invoke ``http_request`` to fetch JSON from an API,
+ receives the result on the next turn, and emits a final answer.
+
+ Runs offline against the committed recording fixture. With
+ ``--provider openai --model gpt-4o-mini`` it makes real calls and
+ the model picks the tool autonomously.
+
+ Producing a fresh recording:
+ agentloom run examples/35_tool_calling.yaml --provider openai \
+ --model gpt-4o-mini --record recordings/tool_calling.json
+
+config:
+ provider: mock
+ model: gpt-4o-mini
+ responses_file: recordings/tool_calling.json
+ latency_model: constant
+ latency_ms: 0
+ sandbox:
+ enabled: true
+ allow_network: true
+ allowed_domains: ["httpbin.org"]
+ allowed_schemes: ["https"]
+
+state:
+ question: "What HTTP method does the httpbin /get endpoint accept?"
+
+steps:
+ - id: ask
+ type: llm_call
+ prompt: "{state.question}"
+ tools:
+ - name: http_request
+ description: "Make an HTTP request to a given URL and return the response body."
+ parameters:
+ type: object
+ properties:
+ url:
+ type: string
+ description: "Full URL to GET."
+ method:
+ type: string
+ enum: ["GET", "POST"]
+ description: "HTTP method."
+ required: [url]
+ tool_choice: auto
+ max_tool_iterations: 3
+ output: answer
diff --git a/recordings/tool_calling.json b/recordings/tool_calling.json
new file mode 100644
index 0000000..619a445
--- /dev/null
+++ b/recordings/tool_calling.json
@@ -0,0 +1,27 @@
+{
+ "ask": [
+ {
+ "content": "",
+ "provider": "openai",
+ "model": "gpt-4o-mini",
+ "tool_calls": [
+ {
+ "id": "call_demo_1",
+ "name": "http_request",
+ "arguments": {"url": "https://httpbin.org/get", "method": "GET"}
+ }
+ ],
+ "usage": {"prompt_tokens": 35, "completion_tokens": 18, "total_tokens": 53},
+ "cost_usd": 0.000018,
+ "finish_reason": "tool_calls"
+ },
+ {
+ "content": "The /get endpoint of httpbin accepts the GET HTTP method, as confirmed by the successful response from https://httpbin.org/get.",
+ "provider": "openai",
+ "model": "gpt-4o-mini",
+ "usage": {"prompt_tokens": 320, "completion_tokens": 26, "total_tokens": 346},
+ "cost_usd": 0.000061,
+ "finish_reason": "stop"
+ }
+ ]
+}
diff --git a/src/agentloom/providers/anthropic.py b/src/agentloom/providers/anthropic.py
index e989f7b..94a089f 100644
--- a/src/agentloom/providers/anthropic.py
+++ b/src/agentloom/providers/anthropic.py
@@ -111,6 +111,13 @@ def _format_messages(
else:
parts: list[dict[str, Any]] = []
for block in content:
+ # Tool-calling messages build pure-dict wire-format
+ # blocks (``tool_use``, ``tool_result``); pass those
+ # through verbatim. Multimodal Pydantic blocks below need
+ # translation to Anthropic's specific keys.
+ if isinstance(block, dict):
+ parts.append(block)
+ continue
if isinstance(block, TextBlock):
parts.append({"type": "text", "text": block.text})
elif isinstance(block, ImageBlock):
diff --git a/src/agentloom/providers/mock.py b/src/agentloom/providers/mock.py
index ebf7cf7..814956e 100644
--- a/src/agentloom/providers/mock.py
+++ b/src/agentloom/providers/mock.py
@@ -109,7 +109,12 @@ def __init__(
self._observer = observer
self._workflow_name = workflow_name
self.calls: list[dict[str, Any]] = []
- self._responses: dict[str, dict[str, Any]] = {}
+ # Each value is either a single response dict OR a list of turns
+ # to play in order (for tool-calling loops where one step issues
+ # multiple complete() calls). The cursor below tracks which turn
+ # the next call should emit per step_id.
+ self._responses: dict[str, Any] = {}
+ self._turn_cursor: dict[str, int] = {}
if self.responses_file and self.responses_file.exists():
raw = json.loads(self.responses_file.read_text())
if not isinstance(raw, dict):
@@ -126,7 +131,18 @@ def _lookup(
extra: dict[str, Any] | None = None,
) -> dict[str, Any] | None:
if step_id and step_id in self._responses:
- return self._responses[step_id]
+ entry = self._responses[step_id]
+ # List form: play the next turn in order, clamping the cursor at
+ # the last entry so excess iterations replay the final response
+ # instead of raising mid-loop.
+ if isinstance(entry, list):
+ if not entry:
+ return None
+ idx = self._turn_cursor.get(step_id, 0)
+ turn = entry[min(idx, len(entry) - 1)]
+ self._turn_cursor[step_id] = idx + 1
+ return turn if isinstance(turn, dict) else None
+ return entry if isinstance(entry, dict) else None
key = prompt_hash(messages, model, temperature, max_tokens, extra)
return self._responses.get(key)
@@ -185,10 +201,27 @@ async def complete(
)
usage_data = entry.get("usage", {}) or {}
+ # Hydrate ``tool_calls`` from the recording so replay drives the
+ # tool-iteration loop. Each turn carries its own ``tool_calls``.
+ tool_calls: list[Any] = []
+ recorded_tool_calls = entry.get("tool_calls") or []
+ if recorded_tool_calls:
+ from agentloom.providers.base import ToolCall
+
+ for tc in recorded_tool_calls:
+ tool_calls.append(
+ ToolCall(
+ id=str(tc.get("id", "")),
+ name=str(tc.get("name", "")),
+ arguments=tc.get("arguments", {}) or {},
+ )
+ )
+
+
return ProviderResponse(
content=str(entry.get("content", "")),
model=str(entry.get("model", model)),
- provider=self.name,
+ provider=str(entry.get("provider", self.name)),
usage=TokenUsage(
prompt_tokens=int(usage_data.get("prompt_tokens", 0)),
completion_tokens=int(usage_data.get("completion_tokens", 0)),
@@ -196,6 +229,7 @@ async def complete(
),
cost_usd=float(entry.get("cost_usd", 0.0)),
finish_reason=entry.get("finish_reason", "stop"),
+ tool_calls=tool_calls,
)
def supports_model(self, model: str) -> bool:
From a58a156c3b7aa4b9336eef0a0bdb33110bf4802d Mon Sep 17 00:00:00 2001
From: Carlos Chinchilla Corbacho
<188046461+cchinchilla-dev@users.noreply.github.com>
Date: Sat, 9 May 2026 12:20:47 +0200
Subject: [PATCH 3/4] fix(providers): parse ollama-shaped tool_calls (no type,
dict args)
---
src/agentloom/core/models.py | 1 -
src/agentloom/providers/google.py | 1 -
src/agentloom/providers/mock.py | 1 -
src/agentloom/providers/ollama.py | 1 -
src/agentloom/providers/openai.py | 1 -
src/agentloom/steps/_tools.py | 28 +++++++++++++++++++++++-----
tests/providers/test_tool_calling.py | 24 ++++++++++++++++++++++++
7 files changed, 47 insertions(+), 10 deletions(-)
diff --git a/src/agentloom/core/models.py b/src/agentloom/core/models.py
index 9063e68..11f3390 100644
--- a/src/agentloom/core/models.py
+++ b/src/agentloom/core/models.py
@@ -121,7 +121,6 @@ class ToolDefinition(BaseModel):
parameters: dict[str, Any] = Field(default_factory=dict)
-
class StepDefinition(BaseModel):
"""Definition of a single workflow step."""
diff --git a/src/agentloom/providers/google.py b/src/agentloom/providers/google.py
index 42d908a..6e6a0a5 100644
--- a/src/agentloom/providers/google.py
+++ b/src/agentloom/providers/google.py
@@ -280,7 +280,6 @@ async def complete(
tool_calls = parse_tool_calls_from_google(content_parts)
-
return ProviderResponse(
content=content,
model=model,
diff --git a/src/agentloom/providers/mock.py b/src/agentloom/providers/mock.py
index 814956e..d4ff691 100644
--- a/src/agentloom/providers/mock.py
+++ b/src/agentloom/providers/mock.py
@@ -217,7 +217,6 @@ async def complete(
)
)
-
return ProviderResponse(
content=str(entry.get("content", "")),
model=str(entry.get("model", model)),
diff --git a/src/agentloom/providers/ollama.py b/src/agentloom/providers/ollama.py
index 290e655..445c04c 100644
--- a/src/agentloom/providers/ollama.py
+++ b/src/agentloom/providers/ollama.py
@@ -232,7 +232,6 @@ async def complete(
tool_calls = parse_tool_calls_from_openai(data.get("message", {}) or {})
-
return ProviderResponse(
content=content,
model=data.get("model", model),
diff --git a/src/agentloom/providers/openai.py b/src/agentloom/providers/openai.py
index caeaf19..74ac97e 100644
--- a/src/agentloom/providers/openai.py
+++ b/src/agentloom/providers/openai.py
@@ -194,7 +194,6 @@ async def complete(
tool_calls = parse_tool_calls_from_openai(message)
-
return ProviderResponse(
content=content,
model=data.get("model", model),
diff --git a/src/agentloom/steps/_tools.py b/src/agentloom/steps/_tools.py
index 8e73bdf..51958e9 100644
--- a/src/agentloom/steps/_tools.py
+++ b/src/agentloom/steps/_tools.py
@@ -96,16 +96,34 @@ def translate_tool_choice_for_anthropic(choice: Any) -> Any:
def parse_tool_calls_from_openai(message: dict[str, Any]) -> list[ToolCall]:
- """OpenAI returns ``message.tool_calls = [{id, type, function: {name, arguments}}]``."""
+ """Parse ``message.tool_calls`` from OpenAI-shaped responses.
+
+ Handles both wire variants seen in the wild:
+
+ * **OpenAI canonical**: ``[{id, type:"function", function:{name, arguments:"{...}"}}]``
+ — ``arguments`` is a JSON-encoded string.
+ * **Ollama / OpenAI-compatible relays**: ``[{id, function:{name, arguments:{...}}}]``
+ — ``type`` may be omitted entirely, and ``arguments`` may already be a
+ decoded dict. Treat the absence of ``type`` as ``"function"`` (the
+ only call kind we currently dispatch); a non-function explicit
+ ``type`` (e.g. ``"code_interpreter"``) is skipped.
+ """
raw_calls = message.get("tool_calls") or []
calls: list[ToolCall] = []
for entry in raw_calls:
- if entry.get("type") != "function":
+ entry_type = entry.get("type", "function")
+ if entry_type != "function":
continue
fn = entry.get("function", {})
- try:
- args = json.loads(fn.get("arguments") or "{}")
- except json.JSONDecodeError:
+ raw_args = fn.get("arguments")
+ if isinstance(raw_args, dict):
+ args: dict[str, Any] = raw_args
+ elif isinstance(raw_args, str):
+ try:
+ args = json.loads(raw_args or "{}")
+ except json.JSONDecodeError:
+ args = {}
+ else:
args = {}
calls.append(ToolCall(id=entry.get("id", ""), name=fn.get("name", ""), arguments=args))
return calls
diff --git a/tests/providers/test_tool_calling.py b/tests/providers/test_tool_calling.py
index 8cb461b..9790d50 100644
--- a/tests/providers/test_tool_calling.py
+++ b/tests/providers/test_tool_calling.py
@@ -127,6 +127,30 @@ def test_openai_skips_non_function_entries(self) -> None:
assert len(calls) == 1
assert calls[0].name == ""
+ def test_ollama_compat_response_with_dict_args_and_no_type(self) -> None:
+ # Real Ollama 0.x ``/api/chat`` returns tool_calls in OpenAI-compatible
+ # shape but (a) omits ``"type": "function"`` and (b) ships
+ # ``arguments`` as an already-decoded dict, not a JSON string. Before
+ # this fix, Ollama tool calling silently dropped every call: the
+ # parser's strict ``type == "function"`` check skipped entries missing
+ # the field, and ``json.loads()`` raised TypeError on a dict.
+ # Captured from a live ``llama3.1:8b`` response on 2026-05-09.
+ message = {
+ "tool_calls": [
+ {
+ "id": "call_eh7yrv0u",
+ "function": {
+ "index": 0,
+ "name": "add",
+ "arguments": {"a": 17, "b": 25}, # already a dict
+ },
+ # NOTE: no "type" key
+ }
+ ]
+ }
+ calls = parse_tool_calls_from_openai(message)
+ assert calls == [ToolCall(id="call_eh7yrv0u", name="add", arguments={"a": 17, "b": 25})]
+
def test_anthropic_parses_tool_use_blocks(self) -> None:
blocks = [
{"type": "text", "text": "I'll use a tool."},
From c2966e8b0cffa4fe58435963e5a0399fbff60ce5 Mon Sep 17 00:00:00 2001
From: Carlos Chinchilla Corbacho
<188046461+cchinchilla-dev@users.noreply.github.com>
Date: Sun, 10 May 2026 11:01:24 +0200
Subject: [PATCH 4/4] docs: surface tool calling in changelog, workflow-yaml,
and examples
---
CHANGELOG.md | 1 +
docs/examples.md | 16 ++++++++++++++++
docs/workflow-yaml.md | 33 ++++++++++++++++++++++++++++++++-
3 files changed, 49 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9992e71..d31c455 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
+- Native tool / function calling across providers (#116). New `tools`, `tool_choice`, and `max_tool_iterations` fields on `llm_call`; the LLM step dispatches via the existing `ToolRegistry`, feeds results back, and re-prompts until the model stops asking (capped by `max_tool_iterations`, default 5). Each adapter translates the unified `ToolDefinition` list to its native shape (OpenAI / Ollama / Anthropic / Google). Parallel tool calls dispatch concurrently and preserve order; failures are reported back as text so the model can recover. Sandbox (#105), budget (#108), and per-step retry (#106) apply unchanged. `MockProvider` recordings accept a list of turns per step so offline replay drives the loop end-to-end; `examples/35_tool_calling.yaml` ships a ReAct-style example. The OpenAI-shaped parser also handles Ollama-compat responses (no `type` field, `arguments` as a decoded dict) — without this Ollama tool calling silently dropped every call.
- Per-run experiment metadata logging (#77). Every workflow execution now writes a self-contained JSON record (`run_id`, ISO timestamp, AgentLoom version, Python version, workflow `sha256` hash, list of `provider/model` pairs used, status, total cost, total tokens, step count, duration) to `./agentloom_runs/.json`. Override the directory via the `runs_dir` constructor argument on `RunHistoryWriter` or the `AGENTLOOM_RUNS_DIR` env var. Disk I/O happens in a worker thread so the write does not block the event loop. Records carry a `_schema_version: 1` field; failures during the write are logged and swallowed so a broken history directory cannot prevent the engine from returning the result. New `agentloom history` CLI subcommand lists records most-recent-first and accepts `--workflow`, `--provider`, `--since YYYY-MM-DD`, `--until YYYY-MM-DD`, `--min-cost`, `--max-cost`, `--limit`, and `--json` filters — covering the full filter surface (date, workflow, cost, provider) called for in the original issue.
- Quality annotations attachable to `WorkflowResult` (#59). New `WorkflowResult.annotate(target, quality_score=..., source=..., **metadata)` method appends a typed `QualityAnnotation` (`target`, `quality_score`, `source`, `metadata`) to the result so evaluators, human reviewers, or downstream scoring code can record output quality after the run completes. **The annotation is auto-emitted as an OTel span** the moment `annotate()` runs whenever the engine returned the result with a tracing context attached (the default for any workflow run with observability enabled) — `result.annotate("answer", quality_score=4.5, source="human_feedback")` becomes immediately visible in Jaeger with no additional plumbing on the caller side. Each annotation is published as a standalone `quality:` span (the workflow span has already closed, so retroactive attribute attachment is not viable). Quality spans carry `workflow.run_id` and `workflow.name` plus `agentloom.quality.score`, `agentloom.quality.source`, `agentloom.quality.target`, and free-form `agentloom.quality.metadata.*` attributes — Jaeger / Tempo can group quality spans with the original trace by run_id, and dashboards can filter for `agentloom.quality.score < threshold` to surface regressions. Offline / replay paths that build a `WorkflowResult` without a live tracer keep working — `annotate()` still records the data on the result, the OTel emission just no-ops. The `agentloom.observability.quality.emit_quality_annotation` / `emit_quality_annotations` helpers remain available for callers that build annotations outside the engine flow (e.g. batch evaluators reading historical results from disk).
- OTel span and metric schema centralization with GenAI semantic conventions (#125). The schema is a clean break — no compatibility shims for pre-#125 attribute or metric names. New `agentloom.observability.schema` module is the single source of truth for span / attribute / metric names; downstream consumers (Grafana, AgentTest, Jaeger plugins) parse a stable contract instead of grepping for ad-hoc strings. **Metrics renamed and retyped** to match the OTel GenAI registry: `agentloom_tokens_total` (counter) → `gen_ai.client.token.usage` (histogram, `{token}` unit) with `gen_ai.token.type` attribute (`input` / `output` / `reasoning`); `agentloom_provider_latency_seconds` (histogram) → `gen_ai.client.operation.duration` (histogram, `s`) with `gen_ai.operation.name` + `gen_ai.provider.name` attributes; `agentloom_time_to_first_token_seconds` → `gen_ai.client.operation.time_to_first_chunk`. AgentLoom-specific metrics (`agentloom_workflow_*`, `agentloom_step_*`, `agentloom_provider_calls_total`, `agentloom_cost_usd_total`, `agentloom_circuit_breaker_state`, `agentloom_budget_remaining_usd`, HITL / mock / recording counters) keep their `agentloom_` prefix — they have no OTel equivalent. The bundled Grafana dashboard is updated to query the new metric / label names. The legacy `Observer.on_provider_call` hook (which duplicated the metric emission already done by `on_provider_call_end`) is removed; the engine no longer fires it. The `tokens: int` positional argument on `on_step_end` is removed — callers now pass `prompt_tokens` / `completion_tokens` as kwargs. Span attributes follow the **canonical OTel GenAI registry** as of the May 2026 spec — `gen_ai.provider.name` (the deprecated `gen_ai.system` is **not** emitted), `gen_ai.operation.name`, `gen_ai.request.model`, `gen_ai.request.temperature`, `gen_ai.request.max_tokens`, `gen_ai.request.stream`, `gen_ai.response.model`, `gen_ai.response.finish_reasons` (array of strings, per spec), `gen_ai.response.time_to_first_chunk`, `gen_ai.usage.input_tokens`, `gen_ai.usage.output_tokens`, `gen_ai.usage.reasoning.output_tokens`. Errored inference spans also emit the OTel general-conventions attribute `error.type` alongside the AgentLoom-specific `step.error` so OTel-aware consumers (Jaeger error filters, Tempo) light up. Inference spans use the canonical name template `"{operation_name} {model}"` (e.g. `"chat gpt-4o-mini"`); workflow / step orchestration spans keep the AgentLoom-specific `workflow:*` / `step:*` names. AgentLoom-specific fields stay under `workflow.*` / `step.*` / `tool.*` / `agentloom.*` namespaces. Provider names are translated from AgentLoom internal names to OTel registry values via `to_genai_provider_name` (e.g. `google` → `gcp.gemini`). Notable additions:
diff --git a/docs/examples.md b/docs/examples.md
index e763b99..0b9013a 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -295,3 +295,19 @@ fixtures and CI.
# Depends on the recording captured from example 31
agentloom run examples/32_yaml_mock.yaml --lite
```
+
+## Tool calling
+
+### 35 — Native tool/function calling
+
+ReAct-style agent: the model decides to invoke `http_request` against `httpbin.org/get`, receives the JSON, and emits a final natural-language answer. Sandbox is on with `allowed_domains: ["httpbin.org"]`, so the model-dispatched call goes through the same security policy as static `tool` steps (#105).
+
+**Demonstrates:** `tools` declaration on `llm_call`, `tool_choice: auto`, `max_tool_iterations`, model-driven dispatch via `ToolRegistry`, sandboxed tool execution, replay support for tool-iteration loops.
+
+```bash
+# Mock-replay against the committed recording (no API calls)
+agentloom run examples/35_tool_calling.yaml --lite
+
+# Real call: pass --provider + --model to drive a live model
+agentloom run examples/35_tool_calling.yaml --provider openai --model gpt-4o-mini
+```
diff --git a/docs/workflow-yaml.md b/docs/workflow-yaml.md
index 1c54f89..1fc15c6 100644
--- a/docs/workflow-yaml.md
+++ b/docs/workflow-yaml.md
@@ -191,6 +191,37 @@ Per-provider translation:
Reasoning tokens are billed at the output rate. `TokenUsage.reasoning_tokens` and `billable_completion_tokens` track the spend; `calculate_cost()` includes them automatically. See [Reasoning models](providers.md#reasoning-models) for per-provider details, including the Ollama caveat that `eval_count` is not split.
+**Tool calling:**
+
+The model can pick tools at runtime. Declare them on the step; the engine dispatches via the workflow's `ToolRegistry`, feeds results back, and re-prompts until the model stops asking for tools.
+
+```yaml
+- id: ask
+ type: llm_call
+ prompt: "What is the user's account balance?"
+ tools:
+ - name: lookup_account
+ description: "Retrieve account info by ID."
+ parameters:
+ type: object
+ properties:
+ account_id: { type: string }
+ required: [account_id]
+ tool_choice: auto # auto | required | none | {name: lookup_account}
+ max_tool_iterations: 5 # bound the loop; default 5
+ output: answer
+```
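+
+A single pass through the loop then looks roughly like the sketch below, shown on the OpenAI wire format. The call ID, argument values, and tool result are invented for illustration; the message shapes mirror the ones exercised in this patch's tests, and other providers use their own block formats.
+
+```python
+# Hypothetical conversation state after one tool iteration (OpenAI shape).
+messages = [
+    {"role": "user", "content": "What is the user's account balance?"},
+    {
+        # Iteration 1: the model asks to call the declared tool.
+        "role": "assistant",
+        "content": None,
+        "tool_calls": [
+            {
+                "id": "call_1",
+                "type": "function",
+                "function": {"name": "lookup_account", "arguments": '{"account_id": "a-42"}'},
+            }
+        ],
+    },
+    # The engine dispatches the call via the ToolRegistry and feeds the result back.
+    {"role": "tool", "tool_call_id": "call_1", "content": '{"balance": 125.0}'},
+    # Iteration 2: the model answers with plain content and the loop stops.
+]
+```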
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `tools` | `list[ToolDefinition]` | `[]` | Tool declarations the model can pick. `parameters` is JSON Schema. Names resolve against the registered `ToolRegistry`; an unknown name is reported back as a tool failure rather than aborting the loop. |
+| `tool_choice` | `string \| dict` | `"auto"` | `"auto"` lets the model decide; `"required"` forces a call; `"none"` disables tools for this turn; `{"name": "..."}` pins to a specific tool. Anthropic ignores `"none"` (omits the field); Ollama ignores `tool_choice` entirely (model-side support decides). |
+| `max_tool_iterations` | `int` | `5` | Cap on call→result→re-prompt loops. When hit, `finish_reason` becomes `"max_tool_iterations"` so callers can detect runaway behavior. |
+
+The dispatched tool runs through the existing sandbox (#105), so `http_request`, `shell_command`, `file_read`, `file_write` honor the workflow's `sandbox:` config. Multiple tool calls in one response are dispatched concurrently (anyio task group); results preserve order in the conversation. Cost and tokens accumulate across iterations on the surfaced `StepResult`.
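+
+A minimal sketch of that ordered-but-concurrent dispatch is shown below. It is not the actual `_tools.py` implementation: `dispatch_one` stands in for whatever resolves a call against the `ToolRegistry`, and error formatting is simplified. It only illustrates how an anyio task group can run every call at once while keeping results aligned with the order the model requested them.
+
+```python
+import anyio
+
+
+async def dispatch_all(tool_calls, dispatch_one):
+    # Pre-size the result list so each task writes into its own slot and the
+    # original request order is preserved regardless of completion order.
+    results = [None] * len(tool_calls)
+
+    async def run(idx, call):
+        try:
+            results[idx] = await dispatch_one(call)
+        except Exception as exc:
+            # Failures are reported back as text so the model can recover.
+            results[idx] = f"tool error: {exc}"
+
+    async with anyio.create_task_group() as tg:
+        for idx, call in enumerate(tool_calls):
+            tg.start_soon(run, idx, call)
+    return results
+```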
+
+The legacy `tool` step (static DAG node, author chooses the tool) keeps working unchanged — `tools=` on `llm_call` is the new dynamic, model-driven path.
+
**Retry config:**
| Field | Type | Default | Description |
@@ -231,7 +262,7 @@ Evaluates conditions against state and activates a target step. Steps not activa
### `tool`
-Executes a registered tool (shell command, HTTP request, etc.).
+Executes a registered tool with author-chosen arguments — the workflow author decides which tool to call, not the model. For model-driven tool selection, use the `tools=` field on an `llm_call` step (see [tool calling](#llm_call) above).
```yaml
- id: fetch