From 9a718300fa6b3ceaa242cca62fbc13dc0a06f681 Mon Sep 17 00:00:00 2001
From: Carlos Chinchilla Corbacho
<188046461+cchinchilla-dev@users.noreply.github.com>
Date: Fri, 8 May 2026 15:32:18 +0200
Subject: [PATCH 1/4] add native tool/function calling across providers
---
src/agentloom/core/models.py | 20 +
src/agentloom/observability/metrics.py | 43 ++
src/agentloom/observability/noop.py | 3 +
src/agentloom/observability/observer.py | 38 ++
src/agentloom/observability/schema.py | 8 +
src/agentloom/providers/anthropic.py | 17 +
src/agentloom/providers/base.py | 89 +++
src/agentloom/providers/google.py | 38 +-
src/agentloom/providers/ollama.py | 28 +
src/agentloom/providers/openai.py | 30 +-
src/agentloom/steps/_tools.py | 269 +++++++++
src/agentloom/steps/llm_call.py | 108 +++-
tests/observability/test_metrics.py | 22 +
tests/observability/test_noop.py | 9 +
tests/observability/test_observer.py | 50 ++
tests/providers/test_tool_calling.py | 730 ++++++++++++++++++++++++
16 files changed, 1494 insertions(+), 8 deletions(-)
create mode 100644 src/agentloom/steps/_tools.py
create mode 100644 tests/providers/test_tool_calling.py
diff --git a/src/agentloom/core/models.py b/src/agentloom/core/models.py
index dc689fa..9063e68 100644
--- a/src/agentloom/core/models.py
+++ b/src/agentloom/core/models.py
@@ -108,6 +108,20 @@ class ThinkingConfig(BaseModel):
capture_reasoning: bool = True
+class ToolDefinition(BaseModel):
+ """LLM-callable tool declared on an ``llm_call`` step.
+
+ ``parameters`` is a JSON Schema object; provider adapters translate it
+ to each API's native shape. ``name`` resolves against the workflow's
+ ``tool_registry`` for dispatch.
+ """
+
+ name: str
+ description: str = ""
+ parameters: dict[str, Any] = Field(default_factory=dict)
+
+
class StepDefinition(BaseModel):
"""Definition of a single workflow step."""
@@ -155,6 +169,12 @@ class StepDefinition(BaseModel):
# Reasoning / extended thinking
thinking: ThinkingConfig | None = None
+ # Tool calling — LLM picks tools at runtime.
+ # ``tool_choice``: ``"auto"`` | ``"required"`` | ``"none"`` | ``{"name": "..."}``.
+ tools: list[ToolDefinition] = Field(default_factory=list)
+ tool_choice: Any = "auto"
+ max_tool_iterations: int = 5
+
class SandboxConfig(BaseModel):
"""Sandbox configuration for built-in tools.
diff --git a/src/agentloom/observability/metrics.py b/src/agentloom/observability/metrics.py
index 0333470..6ccf450 100644
--- a/src/agentloom/observability/metrics.py
+++ b/src/agentloom/observability/metrics.py
@@ -64,6 +64,8 @@ def __init__(
self._attachment_counter: Any = None
self._stream_counter: Any = None
self._ttft_histogram: Any = None
+ self._tool_call_counter: Any = None
+ self._tool_call_histogram: Any = None
self._mock_replay_counter: Any = None
self._recording_capture_counter: Any = None
self._recording_latency_histogram: Any = None
@@ -152,6 +154,18 @@ def _setup_otel(self, endpoint: str) -> None:
"agentloom_stream_responses_total",
description="Total streamed LLM responses",
)
+ # Tool calls dispatched by the model (#116). Tagged by tool name +
+ # status so dashboards can split successes from failures and spot
+ # tools that consistently fail or hang.
+ self._tool_call_counter = meter.create_counter(
+ "agentloom_tool_calls_total",
+ description="Total model-dispatched tool calls",
+ )
+ self._tool_call_histogram = meter.create_histogram(
+ "agentloom_tool_call_duration_seconds",
+ description="Tool-call execution duration",
+ unit="s",
+ )
# Canonical OTel GenAI metric — replaces the AgentLoom-prefixed
# ``agentloom_time_to_first_token_seconds`` with the spec name.
self._time_to_first_chunk_histogram = meter.create_histogram(
@@ -270,6 +284,17 @@ def _setup_prom(
"Total streamed LLM responses",
["provider", "model"],
)
+ self._prom_counters["tool_calls"] = prom.Counter(
+ "agentloom_tool_calls_total",
+ "Total model-dispatched tool calls",
+ ["tool_name", "status"],
+ )
+ self._prom_histograms["tool_call_duration"] = prom.Histogram(
+ "agentloom_tool_call_duration_seconds",
+ "Tool-call execution duration",
+ ["tool_name"],
+ buckets=[0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 30, 60],
+ )
self._prom_histograms["time_to_first_chunk"] = prom.Histogram(
"gen_ai_client_operation_time_to_first_chunk_seconds",
"GenAI streaming time-to-first-chunk (per OTel GenAI conventions)",
@@ -465,6 +490,24 @@ def record_attachments(self, step_type: str, count: int) -> None:
else: # pragma: no cover — prom fallback
self._prom_counters["attachments"].labels(step_type=step_type).inc(count)
+ def record_tool_call(self, tool_name: str, success: bool, duration_s: float) -> None:
+ """Record a model-dispatched tool call (#116).
+
+ Counter is tagged ``status=success|failure`` so dashboards can
+ plot a per-tool failure rate; histogram tracks execution latency.
+ """
+ if not self._enabled:
+ return
+ status = "success" if success else "failure"
+ if self._backend == "otel":
+ self._tool_call_counter.add(1, {"tool_name": tool_name, "status": status})
+ self._tool_call_histogram.record(duration_s, {"tool_name": tool_name})
+ else: # pragma: no cover — prom fallback
+ self._prom_counters["tool_calls"].labels(tool_name=tool_name, status=status).inc()
+ self._prom_histograms["tool_call_duration"].labels(tool_name=tool_name).observe(
+ duration_s
+ )
+
def record_stream_response(self, provider: str, model: str) -> None:
if not self._enabled:
return
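Illustrative usage of the new hook (the ``MetricsManager(enabled=True)`` construction and the keyword form mirror the test added below; the timed function is a stand-in for a real tool execution):

import time

from agentloom.observability.metrics import MetricsManager

metrics = MetricsManager(enabled=True)


def add(a: int, b: int) -> int:
    # stand-in for a real tool execution
    return a + b


start = time.monotonic()
try:
    add(2, 3)
    ok = True
except Exception:
    ok = False
metrics.record_tool_call("add", success=ok, duration_s=time.monotonic() - start)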
diff --git a/src/agentloom/observability/noop.py b/src/agentloom/observability/noop.py
index 54c4812..a377330 100644
--- a/src/agentloom/observability/noop.py
+++ b/src/agentloom/observability/noop.py
@@ -139,6 +139,9 @@ def on_provider_call_end(
def on_provider_error(self, provider: str, error_type: str, **kwargs: Any) -> None:
pass
+ def on_tool_call(self, **kwargs: Any) -> None:
+ pass
+
def on_stream_response(self, provider: str, model: str, ttft_s: float, **kwargs: Any) -> None:
pass
diff --git a/src/agentloom/observability/observer.py b/src/agentloom/observability/observer.py
index 9c87a6b..aa41b85 100644
--- a/src/agentloom/observability/observer.py
+++ b/src/agentloom/observability/observer.py
@@ -304,6 +304,44 @@ def on_provider_call_end(
else:
span.end()
+ def on_tool_call(
+ self,
+ *,
+ step_id: str,
+ call_id: str,
+ tool_name: str,
+ args_hash: str,
+ result_hash: str,
+ duration_ms: float,
+ success: bool,
+ **kwargs: Any,
+ ) -> None:
+ """Record a model-dispatched tool call (#116).
+
+ Emits a child span under the active step span carrying the canonical
+ ``execute_tool {name}`` name plus tool attrs, and records the
+ per-tool counter + histogram. Args / result are hashed (not raw)
+ so PII never lands on the trace.
+ """
+ if self._metrics:
+ self._metrics.record_tool_call(tool_name, success, duration_ms / 1000.0)
+ if self._tracing:
+ attrs: dict[str, Any] = {
+ SpanAttr.TOOL_CALL_ID: call_id,
+ SpanAttr.TOOL_NAME: tool_name,
+ SpanAttr.TOOL_ARGS_HASH: args_hash,
+ SpanAttr.TOOL_RESULT_HASH: result_hash,
+ SpanAttr.TOOL_DURATION_MS: duration_ms,
+ SpanAttr.TOOL_SUCCESS: success,
+ }
+ if self._run_id:
+ attrs[SpanAttr.WORKFLOW_RUN_ID] = self._run_id
+ span = self._tracing.start_span(
+ SpanName.GEN_AI_TOOL_CALL.format(tool_name=tool_name),
+ attributes=attrs,
+ )
+ self._tracing.end_span(span)
+
def on_provider_error(
self,
provider: str,
diff --git a/src/agentloom/observability/schema.py b/src/agentloom/observability/schema.py
index ac9e703..16628ad 100644
--- a/src/agentloom/observability/schema.py
+++ b/src/agentloom/observability/schema.py
@@ -109,9 +109,11 @@ class SpanAttr:
PROVIDER_ATTEMPT_OUTCOME = "agentloom.provider.attempt_outcome"
# Tool calls
+ TOOL_CALL_ID = "tool.call_id"
TOOL_NAME = "tool.name"
TOOL_ARGS_HASH = "tool.args_hash"
TOOL_RESULT_HASH = "tool.result_hash"
+ TOOL_DURATION_MS = "tool.duration_ms"
TOOL_SUCCESS = "tool.success"
# Prompt metadata (AgentLoom-specific, no full-prompt capture by default)
@@ -181,6 +183,12 @@ class MetricName:
# Streaming response counter (no OTel equivalent — AgentLoom-specific).
STREAM_RESPONSES_TOTAL = "agentloom_stream_responses_total"
+ # Tool calls dispatched by the model (#116). Counter tagged by tool
+ # name + status (success / failure); histogram captures execution
+ # latency per tool.
+ TOOL_CALLS_TOTAL = "agentloom_tool_calls_total"
+ TOOL_CALL_DURATION_SECONDS = "agentloom_tool_call_duration_seconds"
+
# Resilience gauges
CIRCUIT_BREAKER_STATE = "agentloom_circuit_breaker_state"
BUDGET_REMAINING_USD = "agentloom_budget_remaining_usd"
diff --git a/src/agentloom/providers/anthropic.py b/src/agentloom/providers/anthropic.py
index 917269c..e989f7b 100644
--- a/src/agentloom/providers/anthropic.py
+++ b/src/agentloom/providers/anthropic.py
@@ -161,10 +161,22 @@ async def complete(
max_tokens: int | None = None,
**kwargs: Any,
) -> ProviderResponse:
+ agentloom_tools = kwargs.pop("agentloom_tools", None)
+ agentloom_tool_choice = kwargs.pop("agentloom_tool_choice", None)
extras = validate_extra_kwargs(
"anthropic", "complete", kwargs, _ANTHROPIC_EXTRA_PAYLOAD_KEYS
)
capture_reasoning = _translate_thinking_config(extras)
+ if agentloom_tools:
+ from agentloom.steps._tools import (
+ translate_tool_choice_for_anthropic,
+ translate_tools_for_anthropic,
+ )
+
+ extras["tools"] = translate_tools_for_anthropic(agentloom_tools)
+ mapped_choice = translate_tool_choice_for_anthropic(agentloom_tool_choice or "auto")
+ if mapped_choice is not None:
+ extras["tool_choice"] = mapped_choice
system_prompt, filtered_messages = self._format_messages(messages)
payload: dict[str, Any] = {
@@ -216,6 +228,10 @@ async def complete(
)
cost = calculate_cost(model, usage.prompt_tokens, usage.completion_tokens)
+ from agentloom.steps._tools import parse_tool_calls_from_anthropic
+
+ tool_calls = parse_tool_calls_from_anthropic(content_blocks)
+
return ProviderResponse(
content=content,
model=data.get("model", model),
@@ -225,6 +241,7 @@ async def complete(
reasoning_content=(
"".join(reasoning_parts) if reasoning_parts and capture_reasoning else None
),
+ tool_calls=tool_calls,
raw_response=data,
finish_reason=data.get("stop_reason"),
)
diff --git a/src/agentloom/providers/base.py b/src/agentloom/providers/base.py
index b247672..ec54322 100644
--- a/src/agentloom/providers/base.py
+++ b/src/agentloom/providers/base.py
@@ -12,6 +12,60 @@
from agentloom.exceptions import ProviderError
+class ToolCall(BaseModel):
+ """A function-call decision returned by the model.
+
+ ``id`` round-trips on the follow-up tool-result message (OpenAI:
+ ``tool_call_id``, Anthropic: ``tool_use_id``). ``arguments`` is
+ already JSON-decoded — adapters parse before constructing this.
+ """
+
+ id: str
+ name: str
+ arguments: dict[str, Any] = Field(default_factory=dict)
+
+
+class StreamEvent(BaseModel):
+ """Base type for typed stream events surfaced via ``StreamResponse.events()``.
+
+ Backwards-compat: ``async for chunk in sr`` keeps yielding plain text
+ strings (the deltas only). Callers that need to react to tool-call
+ decisions mid-stream use ``async for evt in sr.events()`` and switch
+ on the concrete subclass.
+ """
+
+
+class TextDelta(StreamEvent):
+ """A chunk of text content from the model."""
+
+ chunk: str
+
+
+class ToolCallDelta(StreamEvent):
+ """A partial tool-call observation while the model assembles arguments.
+
+ ``index`` matches across deltas of the same call; ``name`` arrives on
+ the first delta and is ``None`` on subsequent argument-fragment
+ deltas; ``arguments_chunk`` accumulates the JSON-encoded args.
+ """
+
+ index: int
+ name: str | None = None
+ arguments_chunk: str = ""
+
+
+class ToolCallComplete(StreamEvent):
+ """A fully-assembled ``ToolCall`` ready to dispatch."""
+
+ tool_call: ToolCall
+
+
+class StreamDone(StreamEvent):
+ """Emitted when the stream finishes; carries the provider's stop reason."""
+
+ finish_reason: str = ""
+
+
class ProviderResponse(BaseModel):
"""Unified response from any LLM provider."""
@@ -30,6 +84,9 @@ class ProviderResponse(BaseModel):
# Ollama returns ``message.thinking`` (or strips inline
# ``...`` tags as a fallback).
reasoning_content: str | None = None
+ # Empty when the model didn't pick a tool. When non-empty, ``content``
+ # may be empty — the LLM step dispatches each call and re-prompts.
+ tool_calls: list[ToolCall] = Field(default_factory=list)
class StreamResponse:
@@ -56,13 +113,44 @@ def __init__(self, model: str, provider: str) -> None:
# alongside the streamed answer (Gemini ``thought=true`` parts,
# Anthropic ``thinking`` deltas, Ollama ``message.thinking``).
self.reasoning_content: str | None = None
+ # Tool calls accumulated by the adapter while streaming. Adapters
+ # that emit ``ToolCallDelta`` events through the typed event API
+ # also populate this list with the assembled calls so non-event
+ # consumers can read them after the stream is exhausted.
+ self.tool_calls: list[ToolCall] = []
self._chunks: list[str] = []
self._accumulated_bytes: int = 0
self._iterator: AsyncIterator[str] | None = None
+ self._event_iterator: AsyncIterator[StreamEvent] | None = None
def _set_iterator(self, iterator: AsyncIterator[str]) -> None:
self._iterator = iterator
+ def _set_event_iterator(self, iterator: AsyncIterator[StreamEvent]) -> None:
+ """Adapters that emit typed events register a separate iterator here.
+
+ When unset, ``events()`` synthesises ``TextDelta`` events from
+ the plain-string iterator plus a final ``StreamDone`` so callers
+ always get a working typed surface.
+ """
+ self._event_iterator = iterator
+
+ async def events(self) -> AsyncIterator[StreamEvent]:
+ """Iterate typed stream events (text + tool-call deltas + done).
+
+ Adapters that wired ``_set_event_iterator`` get full fidelity
+ (per-provider tool-call streaming). The default fallback wraps
+ the plain-string iterator: each chunk → ``TextDelta``, then a
+ single ``StreamDone`` once the underlying iterator is exhausted.
+ """
+ if self._event_iterator is not None:
+ async for event in self._event_iterator:
+ yield event
+ return
+ async for chunk in self:
+ yield TextDelta(chunk=chunk)
+ yield StreamDone(finish_reason=self.finish_reason or "")
+
def __aiter__(self) -> StreamResponse:
return self
@@ -94,6 +182,7 @@ def to_provider_response(self) -> ProviderResponse:
cost_usd=self.cost_usd,
finish_reason=self.finish_reason,
reasoning_content=self.reasoning_content,
+ tool_calls=list(self.tool_calls),
)
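Consumer-side, the typed surface sketched above reads roughly like this (a sketch; ``ToolCallDelta`` handling is omitted since ``ToolCallComplete`` already carries the assembled call):

from agentloom.providers.base import (
    StreamDone,
    StreamResponse,
    TextDelta,
    ToolCallComplete,
)


async def consume(sr: StreamResponse) -> None:
    # Adapters with a wired event iterator stream tool-call events natively;
    # everything else falls back to one TextDelta per chunk plus a final StreamDone.
    async for evt in sr.events():
        if isinstance(evt, TextDelta):
            print(evt.chunk, end="")
        elif isinstance(evt, ToolCallComplete):
            print(f"\n[tool] {evt.tool_call.name} {evt.tool_call.arguments}")
        elif isinstance(evt, StreamDone):
            print(f"\n[done] {evt.finish_reason}")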
diff --git a/src/agentloom/providers/google.py b/src/agentloom/providers/google.py
index b486274..42d908a 100644
--- a/src/agentloom/providers/google.py
+++ b/src/agentloom/providers/google.py
@@ -145,6 +145,14 @@ def _format_messages(
content = msg.get("content", "")
system_instruction = content if isinstance(content, str) else str(content)
continue
+ # Tool-loop messages already in Gemini wire shape (role=``model``
+ # with ``functionCall`` parts, role=``function`` with
+ # ``functionResponse`` parts) — pass them through verbatim so
+ # iteration 2+ preserves the call context. Identified by the
+ # presence of ``parts`` instead of ``content``.
+ if "parts" in msg:
+ contents.append(msg)
+ continue
role = "user" if msg["role"] == "user" else "model"
content = msg.get("content", "")
if isinstance(content, str):
@@ -180,8 +188,27 @@ async def complete(
max_tokens: int | None = None,
**kwargs: Any,
) -> ProviderResponse:
+ agentloom_tools = kwargs.pop("agentloom_tools", None)
+ agentloom_tool_choice = kwargs.pop("agentloom_tool_choice", None)
extras = validate_extra_kwargs("google", "complete", kwargs, _GOOGLE_EXTRA_PAYLOAD_KEYS)
thinking_payload = _build_thinking_config_payload(extras.pop("thinking_config", None))
+ if agentloom_tools:
+ from agentloom.steps._tools import translate_tools_for_google
+
+ extras["tools"] = translate_tools_for_google(agentloom_tools)
+ # ``{"name": "fn"}`` selects a specific function via Gemini's
+ # ANY mode + ``allowedFunctionNames``. Plain strings map to
+ # AUTO / ANY / NONE; anything else falls back to AUTO.
+ choice = agentloom_tool_choice or "auto"
+ fn_config: dict[str, Any]
+ if isinstance(choice, dict) and "name" in choice:
+ fn_config = {"mode": "ANY", "allowedFunctionNames": [choice["name"]]}
+ else:
+ mode = {"auto": "AUTO", "required": "ANY", "none": "NONE"}.get(
+ choice if isinstance(choice, str) else "auto", "AUTO"
+ )
+ fn_config = {"mode": mode}
+ extras["tool_config"] = {"functionCallingConfig": fn_config}
system_instruction, contents = self._format_messages(messages)
payload: dict[str, Any] = {"contents": contents}
@@ -217,9 +244,10 @@ async def complete(
candidates = data.get("candidates", [])
content = ""
reasoning_content: str | None = None
+ content_parts: list[dict[str, Any]] = []
if candidates:
- parts = candidates[0].get("content", {}).get("parts", [])
- content, reasoning_trace = _parse_gemini_content_parts(parts)
+ content_parts = candidates[0].get("content", {}).get("parts", []) or []
+ content, reasoning_trace = _parse_gemini_content_parts(content_parts)
if reasoning_trace:
reasoning_content = reasoning_trace
@@ -248,6 +276,11 @@ async def complete(
if candidates:
finish_reason = candidates[0].get("finishReason")
+ from agentloom.steps._tools import parse_tool_calls_from_google
+
+ tool_calls = parse_tool_calls_from_google(content_parts)
+
return ProviderResponse(
content=content,
model=model,
@@ -257,6 +290,7 @@ async def complete(
reasoning_content=reasoning_content,
raw_response=data,
finish_reason=finish_reason,
+ tool_calls=tool_calls,
)
async def stream(
diff --git a/src/agentloom/providers/ollama.py b/src/agentloom/providers/ollama.py
index c5db3de..290e655 100644
--- a/src/agentloom/providers/ollama.py
+++ b/src/agentloom/providers/ollama.py
@@ -115,6 +115,13 @@ def _format_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""Convert internal content blocks to Ollama's images format."""
formatted: list[dict[str, Any]] = []
for msg in messages:
+ # Tool-loop messages (assistant with ``tool_calls``, role=``tool``
+ # with ``tool_call_id``) ship in OpenAI-compatible wire shape —
+ # pass them through verbatim so iteration 2+ keeps the call
+ # context intact.
+ if "tool_calls" in msg or msg.get("role") == "tool":
+ formatted.append(msg)
+ continue
content = msg.get("content", "")
if isinstance(content, str):
formatted.append({"role": msg["role"], "content": content})
@@ -154,8 +161,23 @@ async def complete(
max_tokens: int | None = None,
**kwargs: Any,
) -> ProviderResponse:
+ agentloom_tools = kwargs.pop("agentloom_tools", None)
+ agentloom_tool_choice = kwargs.pop("agentloom_tool_choice", None)
extras = validate_extra_kwargs("ollama", "complete", kwargs, _OLLAMA_EXTRA_PAYLOAD_KEYS)
think_param, capture_reasoning = _pop_thinking_config(extras)
+ if agentloom_tools:
+ from agentloom.steps._tools import translate_tools_for_ollama
+
+ extras["tools"] = translate_tools_for_ollama(agentloom_tools)
+            # Ollama's chat API has no ``tool_choice`` parameter; whether a
+            # tool actually fires depends entirely on model-side support.
+            # Log the dropped setting at debug level so users chasing
+            # "why doesn't my tool fire" get a hint instead of a silent no-op.
+ if agentloom_tool_choice not in (None, "auto"):
+ logger.debug(
+ "Ollama ignores tool_choice=%r; only model-side support matters.",
+ agentloom_tool_choice,
+ )
payload: dict[str, Any] = {
"model": model,
"messages": self._format_messages(messages),
@@ -206,6 +228,11 @@ async def complete(
total_tokens=prompt_tokens + completion_tokens,
)
+ from agentloom.steps._tools import parse_tool_calls_from_openai
+
+ tool_calls = parse_tool_calls_from_openai(data.get("message", {}) or {})
+
return ProviderResponse(
content=content,
model=data.get("model", model),
@@ -215,6 +242,7 @@ async def complete(
reasoning_content=reasoning_content,
raw_response=data,
finish_reason=data.get("done_reason"),
+ tool_calls=tool_calls,
)
async def stream(
diff --git a/src/agentloom/providers/openai.py b/src/agentloom/providers/openai.py
index 5c2867a..caeaf19 100644
--- a/src/agentloom/providers/openai.py
+++ b/src/agentloom/providers/openai.py
@@ -80,6 +80,13 @@ def _format_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""Convert internal content blocks to OpenAI's vision/audio format."""
formatted: list[dict[str, Any]] = []
for msg in messages:
+ # Tool-loop messages (assistant with ``tool_calls``, role=``tool``
+ # with ``tool_call_id``) ship in OpenAI wire shape directly —
+ # pass them through verbatim so iteration 2+ sees the prior
+ # decision and result.
+ if "tool_calls" in msg or msg.get("role") == "tool":
+ formatted.append(msg)
+ continue
content = msg.get("content", "")
if isinstance(content, str):
formatted.append({"role": msg["role"], "content": content})
@@ -126,11 +133,25 @@ async def complete(
max_tokens: int | None = None,
**kwargs: Any,
) -> ProviderResponse:
+ # Tool definitions arrive as ``ToolDefinition`` Pydantic instances —
+ # translate to OpenAI's wire format before forwarding so callers
+ # don't have to know provider-specific shapes.
+ agentloom_tools = kwargs.pop("agentloom_tools", None)
+ agentloom_tool_choice = kwargs.pop("agentloom_tool_choice", None)
extras = validate_extra_kwargs("openai", "complete", kwargs, _OPENAI_EXTRA_PAYLOAD_KEYS)
# ``thinking_config`` is accepted at the step layer for YAML
# uniformity but has no chat-completions equivalent — drop it
# before splatting extras into the request body.
extras.pop("thinking_config", None)
+ if agentloom_tools:
+ from agentloom.steps._tools import (
+ translate_tool_choice_for_openai,
+ translate_tools_for_openai,
+ )
+
+ extras["tools"] = translate_tools_for_openai(agentloom_tools)
+ if agentloom_tool_choice is not None and agentloom_tool_choice != "none":
+ extras["tool_choice"] = translate_tool_choice_for_openai(agentloom_tool_choice)
payload: dict[str, Any] = {
"model": model,
"messages": self._format_messages(messages),
@@ -149,7 +170,8 @@ async def complete(
raise_for_status("openai", response)
data = response.json()
- content = data["choices"][0]["message"]["content"]
+ message = data["choices"][0]["message"]
+ content = message.get("content") or ""
usage_data = data.get("usage", {})
# o-series returns reasoning tokens under completion_tokens_details;
# ordinary gpt-* responses omit the field and resolve to 0.
@@ -168,6 +190,11 @@ async def complete(
reasoning_tokens=usage.reasoning_tokens,
)
+ from agentloom.steps._tools import parse_tool_calls_from_openai
+
+ tool_calls = parse_tool_calls_from_openai(message)
+
return ProviderResponse(
content=content,
model=data.get("model", model),
@@ -176,6 +203,7 @@ async def complete(
cost_usd=cost,
raw_response=data,
finish_reason=data["choices"][0].get("finish_reason"),
+ tool_calls=tool_calls,
)
async def stream(
diff --git a/src/agentloom/steps/_tools.py b/src/agentloom/steps/_tools.py
new file mode 100644
index 0000000..8e73bdf
--- /dev/null
+++ b/src/agentloom/steps/_tools.py
@@ -0,0 +1,269 @@
+"""Tool-calling helpers — wire-format translation, dispatch, and message synthesis."""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import logging
+import time
+from typing import Any
+
+import anyio
+
+from agentloom.core.models import ToolDefinition
+from agentloom.providers.base import ToolCall
+
+
+def _hash_text(text: str) -> str:
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
+
+
+logger = logging.getLogger("agentloom.steps")
+
+
+def translate_tools_for_openai(tools: list[ToolDefinition]) -> list[dict[str, Any]]:
+ """OpenAI shape: ``[{"type": "function", "function": {name, description, parameters}}]``."""
+ return [
+ {
+ "type": "function",
+ "function": {
+ "name": t.name,
+ "description": t.description,
+ "parameters": t.parameters or {"type": "object", "properties": {}},
+ },
+ }
+ for t in tools
+ ]
+
+
+def translate_tools_for_anthropic(tools: list[ToolDefinition]) -> list[dict[str, Any]]:
+ """Anthropic shape: ``[{name, description, input_schema}]``."""
+ return [
+ {
+ "name": t.name,
+ "description": t.description,
+ "input_schema": t.parameters or {"type": "object", "properties": {}},
+ }
+ for t in tools
+ ]
+
+
+def translate_tools_for_google(tools: list[ToolDefinition]) -> list[dict[str, Any]]:
+ """Google shape: ``[{"function_declarations": [{name, description, parameters}, ...]}]``.
+
+ Google groups all functions under one ``Tool`` object; we put them all
+ in a single declaration list since AgentLoom doesn't yet expose tool
+ grouping (everything is a function).
+ """
+ return [
+ {
+ "function_declarations": [
+ {
+ "name": t.name,
+ "description": t.description,
+ "parameters": t.parameters or {"type": "object", "properties": {}},
+ }
+ for t in tools
+ ]
+ }
+ ]
+
+
+def translate_tools_for_ollama(tools: list[ToolDefinition]) -> list[dict[str, Any]]:
+ """Ollama uses the OpenAI shape on supported models."""
+ return translate_tools_for_openai(tools)
+
+
+def translate_tool_choice_for_openai(choice: Any) -> Any:
+ """OpenAI ``tool_choice`` accepts ``"auto"`` / ``"required"`` / ``"none"``
+ or ``{"type": "function", "function": {"name": "..."}}``."""
+ if isinstance(choice, dict) and "name" in choice:
+ return {"type": "function", "function": {"name": choice["name"]}}
+ return choice
+
+
+def translate_tool_choice_for_anthropic(choice: Any) -> Any:
+ """Anthropic uses ``{"type": "auto"}`` / ``"any"`` / ``"tool"``."""
+ if choice == "auto":
+ return {"type": "auto"}
+ if choice == "required":
+ return {"type": "any"}
+ if choice == "none":
+ return None # omit the field entirely
+ if isinstance(choice, dict) and "name" in choice:
+ return {"type": "tool", "name": choice["name"]}
+ return {"type": "auto"}
+
+
+def parse_tool_calls_from_openai(message: dict[str, Any]) -> list[ToolCall]:
+ """OpenAI returns ``message.tool_calls = [{id, type, function: {name, arguments}}]``."""
+ raw_calls = message.get("tool_calls") or []
+ calls: list[ToolCall] = []
+ for entry in raw_calls:
+ if entry.get("type") != "function":
+ continue
+ fn = entry.get("function", {})
+ try:
+ args = json.loads(fn.get("arguments") or "{}")
+ except json.JSONDecodeError:
+ args = {}
+ calls.append(ToolCall(id=entry.get("id", ""), name=fn.get("name", ""), arguments=args))
+ return calls
+
+
+def parse_tool_calls_from_anthropic(content_blocks: list[dict[str, Any]]) -> list[ToolCall]:
+ """Anthropic returns ``content`` blocks; ``type=tool_use`` carries calls."""
+ calls: list[ToolCall] = []
+ for block in content_blocks:
+ if block.get("type") != "tool_use":
+ continue
+ calls.append(
+ ToolCall(
+ id=block.get("id", ""),
+ name=block.get("name", ""),
+ arguments=block.get("input", {}) or {},
+ )
+ )
+ return calls
+
+
+def parse_tool_calls_from_google(content_parts: list[dict[str, Any]]) -> list[ToolCall]:
+ """Google returns parts with ``functionCall: {name, args}``. Ids aren't
+ provider-assigned so we synthesize them — the result message echoes
+ the same name (no id round-trip needed)."""
+ calls: list[ToolCall] = []
+ for idx, part in enumerate(content_parts):
+ fc = part.get("functionCall")
+ if not fc:
+ continue
+ calls.append(
+ ToolCall(
+ id=f"google-{idx}",
+ name=fc.get("name", ""),
+ arguments=fc.get("args", {}) or {},
+ )
+ )
+ return calls
+
+
+async def dispatch_tool_calls(
+ calls: list[ToolCall],
+ tool_registry: Any,
+ *,
+ observer: Any | None = None,
+ step_id: str = "",
+) -> list[tuple[ToolCall, str, bool]]:
+ """Execute each call via the registry in parallel; preserve order.
+
+    Failed calls return ``(call, error_message, False)`` so the model can
+ recover on the next turn rather than aborting the loop. When *observer*
+ is provided, fires ``on_tool_call`` per call with hashes of the args
+ and result for trace-level observability without leaking PII.
+ """
+ results: list[tuple[ToolCall, str, bool]] = [None] * len(calls) # type: ignore[list-item]
+
+ async def _run(idx: int, call: ToolCall) -> None:
+ start = time.monotonic()
+ success = False
+ text: str
+ try:
+ tool = tool_registry.get(call.name)
+ except KeyError as e:
+ text = f"tool '{call.name}' not registered: {e}"
+ results[idx] = (call, text, False)
+ else:
+ try:
+ outcome = await tool.execute(**call.arguments)
+ text = outcome if isinstance(outcome, str) else json.dumps(outcome, default=str)
+ results[idx] = (call, text, True)
+ success = True
+ except Exception as e: # noqa: BLE001 — reported back to the model
+ logger.warning("Tool '%s' failed: %s", call.name, e)
+ text = f"tool execution failed: {e}"
+ results[idx] = (call, text, False)
+ if observer is not None:
+ hook = getattr(observer, "on_tool_call", None)
+ if callable(hook):
+ hook(
+ step_id=step_id,
+ call_id=call.id,
+ tool_name=call.name,
+ args_hash=_hash_text(json.dumps(call.arguments, sort_keys=True, default=str)),
+ result_hash=_hash_text(text),
+ duration_ms=(time.monotonic() - start) * 1000.0,
+ success=success,
+ )
+
+ async with anyio.create_task_group() as tg:
+ for idx, call in enumerate(calls):
+ tg.start_soon(_run, idx, call)
+
+ return results
+
+
+def build_assistant_message_with_tool_calls(
+ provider: str, content: str, calls: list[ToolCall]
+) -> dict[str, Any]:
+ """Replay the assistant's tool-call decision so the model sees its prior turn."""
+ if provider == "anthropic":
+ blocks: list[dict[str, Any]] = []
+ if content:
+ blocks.append({"type": "text", "text": content})
+ for c in calls:
+ blocks.append({"type": "tool_use", "id": c.id, "name": c.name, "input": c.arguments})
+ return {"role": "assistant", "content": blocks}
+ if provider == "google":
+ return {
+ "role": "model",
+ "parts": [{"functionCall": {"name": c.name, "args": c.arguments}} for c in calls],
+ }
+ # OpenAI / Ollama: ``content`` must be a string (formatter rejects None);
+ # empty string is accepted alongside tool_calls.
+ return {
+ "role": "assistant",
+ "content": content or "",
+ "tool_calls": [
+ {
+ "id": c.id,
+ "type": "function",
+ "function": {"name": c.name, "arguments": json.dumps(c.arguments)},
+ }
+ for c in calls
+ ],
+ }
+
+
+def build_tool_result_messages(
+ provider: str, results: list[tuple[ToolCall, str, bool]]
+) -> list[dict[str, Any]]:
+ """Translate tool outcomes into the role/shape each provider expects."""
+ if provider == "anthropic":
+ # Single user turn with one tool_result block per call.
+ blocks: list[dict[str, Any]] = []
+ for call, text, success in results:
+ blocks.append(
+ {
+ "type": "tool_result",
+ "tool_use_id": call.id,
+ "content": text,
+ "is_error": not success,
+ }
+ )
+ return [{"role": "user", "content": blocks}]
+ if provider == "google":
+ return [
+ {
+ "role": "function",
+ "parts": [
+ {
+ "functionResponse": {
+ "name": call.name,
+ "response": {"result": text} if success else {"error": text},
+ }
+ }
+ for call, text, success in results
+ ],
+ }
+ ]
+ # OpenAI / Ollama: one tool message per call, keyed by tool_call_id.
+ return [{"role": "tool", "tool_call_id": call.id, "content": text} for call, text, _ in results]
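A compact sketch of the dispatch contract above: names resolve through the registry, failures come back as ``(call, message, False)`` tuples rather than raising, and input order is preserved. ``AddTool`` is a toy mirroring the fixtures in the new test file below:

from typing import Any

import anyio

from agentloom.providers.base import ToolCall
from agentloom.steps._tools import dispatch_tool_calls
from agentloom.tools.base import BaseTool
from agentloom.tools.registry import ToolRegistry


class AddTool(BaseTool):
    name = "add"
    description = "Add two integers."

    async def execute(self, **kwargs: Any) -> Any:
        return kwargs["a"] + kwargs["b"]


async def main() -> None:
    registry = ToolRegistry()
    registry.register(AddTool())
    results = await dispatch_tool_calls(
        [
            ToolCall(id="c1", name="add", arguments={"a": 2, "b": 3}),
            ToolCall(id="c2", name="missing", arguments={}),
        ],
        registry,
    )
    for call, text, ok in results:
        print(call.name, ok, text)
    # -> add True 5
    # -> missing False tool 'missing' not registered: ...


anyio.run(main)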
diff --git a/src/agentloom/steps/llm_call.py b/src/agentloom/steps/llm_call.py
index c9be355..a786d81 100644
--- a/src/agentloom/steps/llm_call.py
+++ b/src/agentloom/steps/llm_call.py
@@ -56,6 +56,97 @@ def _build_prompt_metadata(
class LLMCallStep(BaseStep):
"""Executes an LLM call with prompt template rendering from state."""
+ @staticmethod
+ async def _run_tool_loop(
+ *,
+ context: StepContext,
+ step: StepDefinition,
+ messages: list[dict[str, Any]],
+ model: str,
+ provider_kwargs: dict[str, Any],
+ ) -> Any:
+ """Iterate complete() → dispatch tools → re-prompt until done.
+
+ Cost and tokens accumulate across iterations. ``max_tool_iterations``
+ bounds the loop; collapses to a single call when ``tools`` is empty.
+ """
+ from agentloom.core.results import TokenUsage
+ from agentloom.providers.base import ProviderResponse
+ from agentloom.steps._tools import (
+ build_assistant_message_with_tool_calls,
+ build_tool_result_messages,
+ dispatch_tool_calls,
+ )
+
+ accumulated_prompt = 0
+ accumulated_completion = 0
+ accumulated_reasoning = 0
+ accumulated_cost = 0.0
+
+ max_iterations = max(step.max_tool_iterations, 1)
+ gateway = context.provider_gateway
+ if gateway is None:
+ raise StepError(step.id, "No provider gateway configured")
+ for _ in range(max_iterations):
+ response = await gateway.complete(
+ messages=messages,
+ model=model,
+ temperature=step.temperature,
+ max_tokens=step.max_tokens,
+ step_id=step.id,
+ **provider_kwargs,
+ )
+ accumulated_prompt += response.usage.prompt_tokens
+ accumulated_completion += response.usage.completion_tokens
+ accumulated_reasoning += response.usage.reasoning_tokens
+ accumulated_cost += response.cost_usd
+
+ if not response.tool_calls or not step.tools:
+ # Replace the response usage with the accumulated totals so
+ # the caller sees the full conversation cost.
+ response.usage = TokenUsage(
+ prompt_tokens=accumulated_prompt,
+ completion_tokens=accumulated_completion,
+ total_tokens=(
+ accumulated_prompt + accumulated_completion + accumulated_reasoning
+ ),
+ reasoning_tokens=accumulated_reasoning,
+ )
+ response.cost_usd = accumulated_cost
+ return response
+
+ if context.tool_registry is None:
+ raise StepError(
+ step.id,
+ "Tool registry required for tools= declaration but not configured.",
+ )
+
+ results = await dispatch_tool_calls(
+ response.tool_calls,
+ context.tool_registry,
+ observer=context.observer,
+ step_id=step.id,
+ )
+ messages.append(
+ build_assistant_message_with_tool_calls(
+ response.provider, response.content, response.tool_calls
+ )
+ )
+ messages.extend(build_tool_result_messages(response.provider, results))
+
+ # Loop exhausted; surface the last response with the cap noted as
+ # finish_reason so callers can detect it.
+ last_response: ProviderResponse = response # noqa: F821 — set in loop
+ last_response.usage = TokenUsage(
+ prompt_tokens=accumulated_prompt,
+ completion_tokens=accumulated_completion,
+ total_tokens=(accumulated_prompt + accumulated_completion + accumulated_reasoning),
+ reasoning_tokens=accumulated_reasoning,
+ )
+ last_response.cost_usd = accumulated_cost
+ last_response.finish_reason = "max_tool_iterations"
+ return last_response
+
@staticmethod
def _build_thinking_kwargs(step: StepDefinition) -> dict[str, Any]:
"""Forward ``StepDefinition.thinking`` to the gateway as a config object.
@@ -148,14 +239,21 @@ async def execute(self, context: StepContext) -> StepResult:
)
provider_kwargs = self._build_thinking_kwargs(step)
+ if step.tools:
+ provider_kwargs["agentloom_tools"] = step.tools
+ provider_kwargs["agentloom_tool_choice"] = step.tool_choice
+
+ # Tool-call loop: re-prompt with tool results until the model
+ # stops requesting tools or we exhaust ``max_tool_iterations``.
+ # Costs and tokens accumulate across iterations; only the final
+ # response's content is exposed to the caller.
try:
- response = await context.provider_gateway.complete(
+ response = await self._run_tool_loop(
+ context=context,
+ step=step,
messages=messages,
model=model,
- temperature=step.temperature,
- max_tokens=step.max_tokens,
- step_id=step.id,
- **provider_kwargs,
+ provider_kwargs=provider_kwargs,
)
except Exception as e:
duration = (time.monotonic() - start) * 1000
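Concretely, after iteration 1 against an OpenAI-compatible provider the re-prompted ``messages`` list carries the prior decision and its result in wire shape (the same fixture the pass-through tests below assert on); iteration 2 sends it straight back through ``gateway.complete()``:

messages = [
    {"role": "user", "content": "What is 2+3?"},
    {
        "role": "assistant",
        "content": "",
        "tool_calls": [
            {
                "id": "c1",
                "type": "function",
                "function": {"name": "add", "arguments": '{"a": 2, "b": 3}'},
            }
        ],
    },
    {"role": "tool", "tool_call_id": "c1", "content": "5"},
]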
diff --git a/tests/observability/test_metrics.py b/tests/observability/test_metrics.py
index 0e6f85a..4af3b58 100644
--- a/tests/observability/test_metrics.py
+++ b/tests/observability/test_metrics.py
@@ -78,6 +78,28 @@ def test_record_tokens(self) -> None:
if mm._backend == "otel":
mm.shutdown()
+ def test_record_tool_call(self) -> None:
+ # Records both the per-tool counter (with status label) and the
+ # latency histogram. Status maps to ``"success"`` / ``"failure"``
+ # so dashboards can derive a per-tool failure rate.
+ from unittest.mock import MagicMock
+
+ mm = MetricsManager(enabled=True)
+ if mm._backend != "otel":
+ return
+ counter_spy = MagicMock()
+ histogram_spy = MagicMock()
+ mm._tool_call_counter = counter_spy
+ mm._tool_call_histogram = histogram_spy
+ mm.record_tool_call("add", success=True, duration_s=0.42)
+ mm.record_tool_call("add", success=False, duration_s=1.1)
+
+ counter_spy.add.assert_any_call(1, {"tool_name": "add", "status": "success"})
+ counter_spy.add.assert_any_call(1, {"tool_name": "add", "status": "failure"})
+ histogram_spy.record.assert_any_call(0.42, {"tool_name": "add"})
+ histogram_spy.record.assert_any_call(1.1, {"tool_name": "add"})
+ mm.shutdown()
+
def test_reasoning_tokens_metric_emitted(self) -> None:
# When ``reasoning_tokens`` is non-zero, ``record_tokens`` must
# observe the histogram a third time with
diff --git a/tests/observability/test_noop.py b/tests/observability/test_noop.py
index ed29383..cda83e4 100644
--- a/tests/observability/test_noop.py
+++ b/tests/observability/test_noop.py
@@ -174,6 +174,15 @@ def test_every_hook_executes_no_op_body(self) -> None:
obs.on_provider_call_start("s1", "openai", "gpt-4o-mini")
obs.on_provider_call_end("s1", "openai", "gpt-4o-mini", 1.5)
obs.on_provider_error("openai", "RateLimitError")
+ obs.on_tool_call(
+ step_id="s1",
+ call_id="c1",
+ tool_name="add",
+ args_hash="x",
+ result_hash="y",
+ duration_ms=1.0,
+ success=True,
+ )
obs.on_stream_response("openai", "gpt-4o-mini", 0.3)
obs.on_tokens("openai", "gpt-4o-mini", 100, 50)
obs.on_mock_replay("wf", "s1", "step_id")
diff --git a/tests/observability/test_observer.py b/tests/observability/test_observer.py
index ace227d..35f8008 100644
--- a/tests/observability/test_observer.py
+++ b/tests/observability/test_observer.py
@@ -260,6 +260,56 @@ def test_dangling_span_closed_without_tracing(self) -> None:
span.end.assert_called_once()
+class TestToolCallObservability:
+ """``on_tool_call`` records the per-tool counter / histogram via the
+ metrics manager and emits an ``execute_tool {name}`` child span carrying
+ the tool name, args/result hashes, duration, and success flag."""
+
+ def test_records_metric_and_emits_span(self) -> None:
+ tracing = MagicMock()
+ metrics = MagicMock()
+ observer = WorkflowObserver(tracing=tracing, metrics=metrics)
+ observer.on_workflow_start("wf", run_id="rid-7")
+ tracing.start_span.reset_mock() # ignore the workflow span
+ observer.on_tool_call(
+ step_id="s1",
+ call_id="c1",
+ tool_name="add",
+ args_hash="abcd1234",
+ result_hash="ef567890",
+ duration_ms=12.3,
+ success=True,
+ )
+ # Metric — counter + histogram via record_tool_call.
+ metrics.record_tool_call.assert_called_once_with("add", True, 0.0123)
+ # Span — name follows the canonical ``execute_tool {tool_name}``
+ # template; attrs include the run_id propagation + tool fields.
+ tracing.start_span.assert_called_once()
+ name = tracing.start_span.call_args.args[0]
+ attrs = tracing.start_span.call_args.kwargs["attributes"]
+ assert name == "execute_tool add"
+ assert attrs["tool.name"] == "add"
+ assert attrs["tool.call_id"] == "c1"
+ assert attrs["tool.args_hash"] == "abcd1234"
+ assert attrs["tool.result_hash"] == "ef567890"
+ assert attrs["tool.duration_ms"] == 12.3
+ assert attrs["tool.success"] is True
+ assert attrs["workflow.run_id"] == "rid-7"
+
+ def test_noop_when_no_tracing_or_metrics(self) -> None:
+ # Defensive: tool dispatch shouldn't fail when observability is off.
+ observer = WorkflowObserver()
+ observer.on_tool_call(
+ step_id="s1",
+ call_id="c1",
+ tool_name="x",
+ args_hash="a",
+ result_hash="b",
+ duration_ms=1.0,
+ success=False,
+ )
+
+
class TestStreamResponse:
def test_records_ttft_and_stream_count(self) -> None:
metrics = MagicMock()
diff --git a/tests/providers/test_tool_calling.py b/tests/providers/test_tool_calling.py
new file mode 100644
index 0000000..8cb461b
--- /dev/null
+++ b/tests/providers/test_tool_calling.py
@@ -0,0 +1,730 @@
+"""Tests for native tool calling across providers."""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+import httpx
+import respx
+
+from agentloom.core.engine import WorkflowEngine
+from agentloom.core.models import (
+ StepDefinition,
+ StepType,
+ ToolDefinition,
+ WorkflowConfig,
+ WorkflowDefinition,
+)
+from agentloom.core.results import StepStatus, WorkflowStatus
+from agentloom.providers.anthropic import AnthropicProvider
+from agentloom.providers.base import ToolCall
+from agentloom.providers.gateway import ProviderGateway
+from agentloom.providers.openai import OpenAIProvider
+from agentloom.steps._tools import (
+ build_assistant_message_with_tool_calls,
+ build_tool_result_messages,
+ dispatch_tool_calls,
+ parse_tool_calls_from_anthropic,
+ parse_tool_calls_from_google,
+ parse_tool_calls_from_openai,
+ translate_tools_for_anthropic,
+ translate_tools_for_google,
+ translate_tools_for_openai,
+)
+from agentloom.tools.base import BaseTool
+from agentloom.tools.registry import ToolRegistry
+
+
+class _AddTool(BaseTool):
+ name = "add"
+ description = "Add two integers."
+ parameters_schema = {
+ "type": "object",
+ "properties": {"a": {"type": "integer"}, "b": {"type": "integer"}},
+ "required": ["a", "b"],
+ }
+
+ async def execute(self, **kwargs: Any) -> Any:
+ return kwargs["a"] + kwargs["b"]
+
+
+class _LookupTool(BaseTool):
+ name = "lookup"
+ description = "Look up a value."
+ parameters_schema = {
+ "type": "object",
+ "properties": {"key": {"type": "string"}},
+ "required": ["key"],
+ }
+
+ async def execute(self, **kwargs: Any) -> Any:
+ return f"value-for-{kwargs['key']}"
+
+
+def _registry_with(*tools: BaseTool) -> ToolRegistry:
+ reg = ToolRegistry()
+ for t in tools:
+ reg.register(t)
+ return reg
+
+
+class TestToolTranslation:
+ def test_openai_shape(self) -> None:
+ out = translate_tools_for_openai(
+ [ToolDefinition(name="add", description="d", parameters={"type": "object"})]
+ )
+ assert out == [
+ {
+ "type": "function",
+ "function": {
+ "name": "add",
+ "description": "d",
+ "parameters": {"type": "object"},
+ },
+ }
+ ]
+
+ def test_anthropic_shape(self) -> None:
+ out = translate_tools_for_anthropic(
+ [ToolDefinition(name="add", description="d", parameters={"type": "object"})]
+ )
+ assert out == [{"name": "add", "description": "d", "input_schema": {"type": "object"}}]
+
+ def test_google_shape_groups_under_function_declarations(self) -> None:
+ out = translate_tools_for_google(
+ [
+ ToolDefinition(name="add", description="d", parameters={"type": "object"}),
+ ToolDefinition(name="lookup", description="l", parameters={"type": "object"}),
+ ]
+ )
+ assert len(out) == 1
+ decls = out[0]["function_declarations"]
+ assert {d["name"] for d in decls} == {"add", "lookup"}
+
+
+class TestToolParsing:
+ def test_openai_parses_tool_calls(self) -> None:
+ message = {
+ "role": "assistant",
+ "content": None,
+ "tool_calls": [
+ {
+ "id": "call_1",
+ "type": "function",
+ "function": {"name": "add", "arguments": '{"a": 2, "b": 3}'},
+ }
+ ],
+ }
+ calls = parse_tool_calls_from_openai(message)
+ assert calls == [ToolCall(id="call_1", name="add", arguments={"a": 2, "b": 3})]
+
+ def test_openai_skips_non_function_entries(self) -> None:
+ message = {"tool_calls": [{"id": "x", "type": "code_interpreter"}, {"type": "function"}]}
+ calls = parse_tool_calls_from_openai(message)
+ # Only the second entry is a function — its function dict is empty
+ # so name="" and args={}; we accept it (degenerate case).
+ assert len(calls) == 1
+ assert calls[0].name == ""
+
+ def test_anthropic_parses_tool_use_blocks(self) -> None:
+ blocks = [
+ {"type": "text", "text": "I'll use a tool."},
+ {"type": "tool_use", "id": "tu_1", "name": "lookup", "input": {"key": "x"}},
+ ]
+ calls = parse_tool_calls_from_anthropic(blocks)
+ assert calls == [ToolCall(id="tu_1", name="lookup", arguments={"key": "x"})]
+
+ def test_google_parses_function_call_parts(self) -> None:
+ parts = [
+ {"text": "I'll call a function."},
+ {"functionCall": {"name": "add", "args": {"a": 1, "b": 2}}},
+ ]
+ calls = parse_tool_calls_from_google(parts)
+ assert calls == [ToolCall(id="google-1", name="add", arguments={"a": 1, "b": 2})]
+
+
+class TestDispatchToolCalls:
+ async def test_dispatch_runs_in_parallel(self) -> None:
+ registry = _registry_with(_AddTool(), _LookupTool())
+ calls = [
+ ToolCall(id="c1", name="add", arguments={"a": 2, "b": 3}),
+ ToolCall(id="c2", name="lookup", arguments={"key": "alpha"}),
+ ]
+ results = await dispatch_tool_calls(calls, registry)
+ assert len(results) == 2
+ # Order preserved.
+ assert results[0][0].id == "c1"
+ assert results[0][1] == "5"
+ assert results[0][2] is True
+ assert results[1][1] == "value-for-alpha"
+
+ async def test_unknown_tool_yields_failure(self) -> None:
+ registry = _registry_with(_AddTool())
+ calls = [ToolCall(id="c1", name="missing", arguments={})]
+ results = await dispatch_tool_calls(calls, registry)
+ assert results[0][2] is False
+ assert "not registered" in results[0][1]
+
+ async def test_tool_exception_yields_failure(self) -> None:
+ class _Bad(BaseTool):
+ name = "bad"
+
+ async def execute(self, **kwargs: Any) -> Any:
+ raise RuntimeError("kaboom")
+
+ registry = _registry_with(_Bad())
+ calls = [ToolCall(id="c1", name="bad", arguments={})]
+ results = await dispatch_tool_calls(calls, registry)
+ assert results[0][2] is False
+ assert "kaboom" in results[0][1]
+
+
+class TestResultMessageBuilders:
+ def test_openai_role_tool_per_call(self) -> None:
+ c = ToolCall(id="c1", name="add", arguments={})
+ out = build_tool_result_messages("openai", [(c, "5", True)])
+ assert out == [{"role": "tool", "tool_call_id": "c1", "content": "5"}]
+
+ def test_anthropic_single_user_turn_with_tool_result_blocks(self) -> None:
+ c = ToolCall(id="tu_1", name="lookup", arguments={})
+ out = build_tool_result_messages("anthropic", [(c, "ok", True)])
+ assert len(out) == 1
+ assert out[0]["role"] == "user"
+ block = out[0]["content"][0]
+ assert block["type"] == "tool_result"
+ assert block["tool_use_id"] == "tu_1"
+ assert block["content"] == "ok"
+ assert block["is_error"] is False
+
+ def test_assistant_message_for_anthropic_includes_tool_use_blocks(self) -> None:
+ c = ToolCall(id="tu_1", name="lookup", arguments={"key": "x"})
+ msg = build_assistant_message_with_tool_calls("anthropic", "Thinking...", [c])
+ assert msg["role"] == "assistant"
+ assert msg["content"][0] == {"type": "text", "text": "Thinking..."}
+ assert msg["content"][1] == {
+ "type": "tool_use",
+ "id": "tu_1",
+ "name": "lookup",
+ "input": {"key": "x"},
+ }
+
+
+class TestOpenAIToolCallingWire:
+ @respx.mock
+ async def test_complete_returns_tool_calls_and_translates_tools(self) -> None:
+ route = respx.post("https://api.openai.com/v1/chat/completions").mock(
+ return_value=httpx.Response(
+ 200,
+ json={
+ "model": "gpt-4o-mini",
+ "choices": [
+ {
+ "message": {
+ "role": "assistant",
+ "content": None,
+ "tool_calls": [
+ {
+ "id": "c1",
+ "type": "function",
+ "function": {
+ "name": "add",
+ "arguments": '{"a": 2, "b": 3}',
+ },
+ }
+ ],
+ },
+ "finish_reason": "tool_calls",
+ }
+ ],
+ "usage": {"prompt_tokens": 5, "completion_tokens": 9},
+ },
+ )
+ )
+ provider = OpenAIProvider(api_key="k")
+ tools = [
+ ToolDefinition(
+ name="add",
+ description="add two ints",
+ parameters={
+ "type": "object",
+ "properties": {
+ "a": {"type": "integer"},
+ "b": {"type": "integer"},
+ },
+ },
+ )
+ ]
+ r = await provider.complete(
+ messages=[{"role": "user", "content": "what is 2+3?"}],
+ model="gpt-4o-mini",
+ agentloom_tools=tools,
+ )
+ body = json.loads(route.calls[0].request.content)
+ assert body["tools"][0]["function"]["name"] == "add"
+ assert r.tool_calls == [ToolCall(id="c1", name="add", arguments={"a": 2, "b": 3})]
+ assert r.finish_reason == "tool_calls"
+ await provider.close()
+
+
+class TestAnthropicToolCallingWire:
+ @respx.mock
+ async def test_translates_tools_and_parses_tool_use_blocks(self) -> None:
+ route = respx.post("https://api.anthropic.com/v1/messages").mock(
+ return_value=httpx.Response(
+ 200,
+ json={
+ "content": [
+ {
+ "type": "tool_use",
+ "id": "tu_1",
+ "name": "lookup",
+ "input": {"key": "alpha"},
+ }
+ ],
+ "model": "claude-haiku-4-5-20251001",
+ "stop_reason": "tool_use",
+ "usage": {"input_tokens": 5, "output_tokens": 3},
+ },
+ )
+ )
+ provider = AnthropicProvider(api_key="k")
+ tools = [ToolDefinition(name="lookup", description="d", parameters={"type": "object"})]
+ r = await provider.complete(
+ messages=[{"role": "user", "content": "x"}],
+ model="claude-haiku-4-5-20251001",
+ agentloom_tools=tools,
+ agentloom_tool_choice="required",
+ )
+ body = json.loads(route.calls[0].request.content)
+ assert body["tools"][0]["name"] == "lookup"
+ assert body["tool_choice"] == {"type": "any"}
+ assert r.tool_calls == [ToolCall(id="tu_1", name="lookup", arguments={"key": "alpha"})]
+ await provider.close()
+
+
+class TestGoogleToolChoiceDictSelectsSpecificFunction:
+ """``tool_choice={"name": "fn"}`` must translate to Gemini's
+ ANY-mode + ``allowedFunctionNames`` rather than silently falling
+ through to AUTO."""
+
+ @respx.mock
+ async def test_specific_function_selection(self) -> None:
+ from agentloom.providers.google import GoogleProvider
+
+ route = respx.post(
+ "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"
+ ).mock(
+ return_value=httpx.Response(
+ 200,
+ json={
+ "model": "gemini-2.5-flash",
+ "candidates": [{"content": {"parts": [{"text": "ok"}]}}],
+ "usageMetadata": {
+ "promptTokenCount": 1,
+ "candidatesTokenCount": 1,
+ "totalTokenCount": 2,
+ },
+ },
+ )
+ )
+ provider = GoogleProvider(api_key="k")
+ await provider.complete(
+ messages=[{"role": "user", "content": "ping"}],
+ model="gemini-2.5-flash",
+ agentloom_tools=[
+ ToolDefinition(name="lookup", description="d", parameters={"type": "object"})
+ ],
+ agentloom_tool_choice={"name": "lookup"},
+ )
+ body = json.loads(route.calls[0].request.content)
+ cfg = body["toolConfig"]["functionCallingConfig"]
+ assert cfg["mode"] == "ANY"
+ assert cfg["allowedFunctionNames"] == ["lookup"]
+ await provider._client.aclose()
+
+
+class TestDispatchObserverHook:
+ """``dispatch_tool_calls`` calls ``observer.on_tool_call`` per call so
+ each dispatch lands on the trace as a child span tagged by tool +
+ success status (#116 observability spec)."""
+
+ async def test_observer_hook_fires_with_hashes_and_duration(self) -> None:
+ from unittest.mock import MagicMock
+
+ observer = MagicMock()
+ registry = _registry_with(_AddTool())
+ calls = [ToolCall(id="c1", name="add", arguments={"a": 2, "b": 3})]
+ await dispatch_tool_calls(calls, registry, observer=observer, step_id="s1")
+
+ observer.on_tool_call.assert_called_once()
+ kwargs = observer.on_tool_call.call_args.kwargs
+ assert kwargs["step_id"] == "s1"
+ assert kwargs["call_id"] == "c1"
+ assert kwargs["tool_name"] == "add"
+ assert kwargs["success"] is True
+ # Hashes are non-empty hex strings — args/result never logged raw.
+ assert len(kwargs["args_hash"]) == 16
+ assert len(kwargs["result_hash"]) == 16
+ assert kwargs["duration_ms"] >= 0.0
+
+ async def test_observer_hook_fires_with_failure_on_unknown_tool(self) -> None:
+ from unittest.mock import MagicMock
+
+ observer = MagicMock()
+ registry = _registry_with(_AddTool())
+ calls = [ToolCall(id="c1", name="missing", arguments={})]
+ await dispatch_tool_calls(calls, registry, observer=observer, step_id="s1")
+
+ observer.on_tool_call.assert_called_once()
+ assert observer.on_tool_call.call_args.kwargs["success"] is False
+
+
+class TestStreamEvents:
+ """Typed stream events surface (issue #116). Adapters that haven't
+ wired the typed iterator fall back to wrapping plain text chunks as
+ ``TextDelta`` events, terminated with ``StreamDone``."""
+
+ async def test_event_iterator_when_wired_takes_precedence(self) -> None:
+ # Adapters that emit typed events natively register a separate
+ # iterator; the default text-wrapping path is bypassed entirely.
+ from agentloom.providers.base import (
+ StreamDone,
+ StreamEvent,
+ StreamResponse,
+ ToolCallComplete,
+ )
+
+ async def _events() -> Any:
+ yield ToolCallComplete(tool_call=ToolCall(id="c1", name="add", arguments={"a": 1}))
+ yield StreamDone(finish_reason="tool_calls")
+
+ sr = StreamResponse(model="m", provider="p")
+ sr._set_event_iterator(_events())
+
+ events: list[StreamEvent] = [evt async for evt in sr.events()]
+ assert len(events) == 2
+ assert isinstance(events[0], ToolCallComplete)
+ assert events[0].tool_call.name == "add"
+
+ async def test_default_events_wrap_text_chunks(self) -> None:
+ from agentloom.providers.base import (
+ StreamDone,
+ StreamResponse,
+ TextDelta,
+ )
+
+ async def _chunks() -> Any:
+ yield "hello "
+ yield "world"
+
+ sr = StreamResponse(model="m", provider="p")
+ sr._set_iterator(_chunks())
+ sr.finish_reason = "stop"
+
+ events = [evt async for evt in sr.events()]
+ # Two TextDelta events + one StreamDone — the default wrapper
+ # provides the surface even when the adapter doesn't emit typed
+ # events natively.
+ assert [type(e).__name__ for e in events] == ["TextDelta", "TextDelta", "StreamDone"]
+ assert isinstance(events[0], TextDelta)
+ assert events[0].chunk == "hello "
+ assert isinstance(events[-1], StreamDone)
+ assert events[-1].finish_reason == "stop"
+
+
+class TestProviderFormatPreservesToolMessages:
+ """Regression for the critical bug where providers' ``_format_messages``
+ silently dropped ``tool_calls`` / ``tool_call_id`` / ``parts`` keys on
+ tool-loop iteration 2+, breaking real (non-mock) tool calling."""
+
+ def test_openai_passes_through_assistant_with_tool_calls(self) -> None:
+ msgs = [
+ {"role": "user", "content": "what is 2+3?"},
+ {
+ "role": "assistant",
+ "content": "",
+ "tool_calls": [
+ {
+ "id": "c1",
+ "type": "function",
+ "function": {"name": "add", "arguments": '{"a": 2, "b": 3}'},
+ }
+ ],
+ },
+ {"role": "tool", "tool_call_id": "c1", "content": "5"},
+ ]
+ out = OpenAIProvider._format_messages(msgs)
+ # The two tool-loop messages must round-trip with their wire keys
+ # intact — without this, OpenAI 400s on iteration 2 because the
+ # assistant message has no tool_calls and the tool message has no
+ # tool_call_id to anchor.
+ assert out[1]["tool_calls"][0]["function"]["name"] == "add"
+ assert out[2] == {"role": "tool", "tool_call_id": "c1", "content": "5"}
+
+ def test_ollama_passes_through_tool_loop_messages(self) -> None:
+ from agentloom.providers.ollama import OllamaProvider
+
+ msgs = [
+ {"role": "user", "content": "ping"},
+ {
+ "role": "assistant",
+ "content": "",
+ "tool_calls": [{"id": "c1", "type": "function", "function": {"name": "x"}}],
+ },
+ {"role": "tool", "tool_call_id": "c1", "content": "ok"},
+ ]
+ out = OllamaProvider._format_messages(msgs)
+ assert "tool_calls" in out[1]
+ assert out[2]["tool_call_id"] == "c1"
+
+ def test_google_passes_through_function_call_and_response_parts(self) -> None:
+ from agentloom.providers.google import GoogleProvider
+
+ msgs = [
+ {"role": "user", "content": "ping"},
+ # Assistant tool-call decision (Gemini wire shape).
+ {
+ "role": "model",
+ "parts": [{"functionCall": {"name": "add", "args": {"a": 1, "b": 2}}}],
+ },
+ # Tool result (Gemini wire shape).
+ {
+ "role": "function",
+ "parts": [{"functionResponse": {"name": "add", "response": {"result": "3"}}}],
+ },
+ ]
+ _system, contents = GoogleProvider._format_messages(msgs)
+ # Both ``parts``-based messages must round-trip unchanged so Gemini
+ # sees the prior call + result on iteration 2; previously the
+ # ``content``-based reformatter dropped ``parts`` entirely.
+ assert contents[1]["parts"][0]["functionCall"]["name"] == "add"
+ assert contents[2]["role"] == "function"
+ assert contents[2]["parts"][0]["functionResponse"]["response"] == {"result": "3"}
+
+
+class TestLLMStepToolLoop:
+ @respx.mock
+ async def test_step_dispatches_and_iterates(self) -> None:
+ # Iteration 1: model asks to call `add`.
+ # Iteration 2: model responds with the final answer.
+ respx.post("https://api.openai.com/v1/chat/completions").mock(
+ side_effect=[
+ httpx.Response(
+ 200,
+ json={
+ "model": "gpt-4o-mini",
+ "choices": [
+ {
+ "message": {
+ "role": "assistant",
+ "content": None,
+ "tool_calls": [
+ {
+ "id": "c1",
+ "type": "function",
+ "function": {
+ "name": "add",
+ "arguments": '{"a": 2, "b": 3}',
+ },
+ }
+ ],
+ },
+ "finish_reason": "tool_calls",
+ }
+ ],
+ "usage": {"prompt_tokens": 5, "completion_tokens": 9},
+ },
+ ),
+ httpx.Response(
+ 200,
+ json={
+ "model": "gpt-4o-mini",
+ "choices": [
+ {
+ "message": {"role": "assistant", "content": "The answer is 5."},
+ "finish_reason": "stop",
+ }
+ ],
+ "usage": {"prompt_tokens": 12, "completion_tokens": 6},
+ },
+ ),
+ ]
+ )
+ workflow = WorkflowDefinition(
+ name="agent",
+ config=WorkflowConfig(provider="openai", model="gpt-4o-mini"),
+ state={},
+ steps=[
+ StepDefinition(
+ id="ask",
+ type=StepType.LLM_CALL,
+ prompt="What is 2+3?",
+ tools=[
+ ToolDefinition(
+ name="add",
+ description="add two integers",
+ parameters={
+ "type": "object",
+ "properties": {
+ "a": {"type": "integer"},
+ "b": {"type": "integer"},
+ },
+ "required": ["a", "b"],
+ },
+ )
+ ],
+ output="answer",
+ )
+ ],
+ )
+ gateway = ProviderGateway()
+ gateway.register(OpenAIProvider(api_key="k"))
+ engine = WorkflowEngine(
+ workflow=workflow,
+ provider_gateway=gateway,
+ tool_registry=_registry_with(_AddTool()),
+ )
+ result = await engine.run()
+ await gateway.close()
+
+ assert result.status == WorkflowStatus.SUCCESS
+ sr = result.step_results["ask"]
+ assert sr.status == StepStatus.SUCCESS
+ # Cost / tokens accumulated across both iterations.
+ assert sr.token_usage.prompt_tokens == 5 + 12
+ assert sr.token_usage.completion_tokens == 9 + 6
+ assert result.final_state["answer"] == "The answer is 5."
+
+ @respx.mock
+ async def test_step_respects_max_tool_iterations(self) -> None:
+ # Model never stops asking for the tool; loop must cap at 2.
+ respx.post("https://api.openai.com/v1/chat/completions").mock(
+ return_value=httpx.Response(
+ 200,
+ json={
+ "model": "gpt-4o-mini",
+ "choices": [
+ {
+ "message": {
+ "role": "assistant",
+ "content": None,
+ "tool_calls": [
+ {
+ "id": "c1",
+ "type": "function",
+ "function": {
+ "name": "add",
+ "arguments": '{"a": 1, "b": 1}',
+ },
+ }
+ ],
+ },
+ "finish_reason": "tool_calls",
+ }
+ ],
+ "usage": {"prompt_tokens": 3, "completion_tokens": 2},
+ },
+ )
+ )
+ workflow = WorkflowDefinition(
+ name="agent-loop",
+ config=WorkflowConfig(provider="openai", model="gpt-4o-mini"),
+ state={},
+ steps=[
+ StepDefinition(
+ id="ask",
+ type=StepType.LLM_CALL,
+ prompt="loop",
+ tools=[
+ ToolDefinition(
+ name="add",
+ description="d",
+ parameters={"type": "object"},
+ )
+ ],
+ max_tool_iterations=2,
+ output="answer",
+ )
+ ],
+ )
+ gateway = ProviderGateway()
+ gateway.register(OpenAIProvider(api_key="k"))
+ engine = WorkflowEngine(
+ workflow=workflow,
+ provider_gateway=gateway,
+ tool_registry=_registry_with(_AddTool()),
+ )
+ result = await engine.run()
+ await gateway.close()
+
+ sr = result.step_results["ask"]
+ assert sr.status == StepStatus.SUCCESS
+ # Cap surfaced via finish_reason on the prompt metadata.
+ assert sr.prompt_metadata is not None
+ assert sr.prompt_metadata.finish_reason == "max_tool_iterations"
+
+ async def test_tool_dispatch_without_registry_raises(self) -> None:
+ # When a step declares tools but no registry is attached, the loop
+ # raises a StepError, which surfaces as a FAILED step result.
+ with respx.mock:
+ respx.post("https://api.openai.com/v1/chat/completions").mock(
+ return_value=httpx.Response(
+ 200,
+ json={
+ "model": "gpt-4o-mini",
+ "choices": [
+ {
+ "message": {
+ "role": "assistant",
+ "content": None,
+ "tool_calls": [
+ {
+ "id": "c1",
+ "type": "function",
+ "function": {
+ "name": "add",
+ "arguments": "{}",
+ },
+ }
+ ],
+ },
+ "finish_reason": "tool_calls",
+ }
+ ],
+ "usage": {"prompt_tokens": 1, "completion_tokens": 1},
+ },
+ )
+ )
+ workflow = WorkflowDefinition(
+ name="no-registry",
+ config=WorkflowConfig(provider="openai", model="gpt-4o-mini"),
+ state={},
+ steps=[
+ StepDefinition(
+ id="ask",
+ type=StepType.LLM_CALL,
+ prompt="hi",
+ tools=[
+ ToolDefinition(
+ name="add", description="d", parameters={"type": "object"}
+ )
+ ],
+ max_tool_iterations=1,
+ retry={"max_retries": 0},
+ )
+ ],
+ )
+ gateway = ProviderGateway()
+ gateway.register(OpenAIProvider(api_key="k"))
+ engine = WorkflowEngine(
+ workflow=workflow,
+ provider_gateway=gateway,
+ # tool_registry intentionally omitted
+ )
+ result = await engine.run()
+ await gateway.close()
+
+ sr = result.step_results["ask"]
+ assert sr.status == StepStatus.FAILED
+ assert "tool registry" in (sr.error or "").lower()
From f9f820c7c37c94105dedc59869008a2bfef4895d Mon Sep 17 00:00:00 2001
From: Carlos Chinchilla Corbacho
<188046461+cchinchilla-dev@users.noreply.github.com>
Date: Fri, 8 May 2026 17:48:42 +0200
Subject: [PATCH 2/4] add tool-calling example and replay support for
tool-iteration loops
---
examples/35_tool_calling.yaml | 51 ++++++++++++++++++++++++++++
recordings/tool_calling.json | 27 +++++++++++++++
src/agentloom/providers/anthropic.py | 7 ++++
src/agentloom/providers/mock.py | 40 ++++++++++++++++++++--
4 files changed, 122 insertions(+), 3 deletions(-)
create mode 100644 examples/35_tool_calling.yaml
create mode 100644 recordings/tool_calling.json
diff --git a/examples/35_tool_calling.yaml b/examples/35_tool_calling.yaml
new file mode 100644
index 0000000..701e1d7
--- /dev/null
+++ b/examples/35_tool_calling.yaml
@@ -0,0 +1,51 @@
+name: tool-calling-agent
+version: "1.0"
+description: |
+ Demonstrates native tool/function calling. The model decides
+ at runtime to invoke ``http_request`` to fetch JSON from an API,
+ receives the result on the next turn, and emits a final answer.
+
+ Runs offline against the committed recording fixture. With
+ ``--provider openai --model gpt-4o-mini`` it makes real calls and
+ the model picks the tool autonomously.
+
+ Producing a fresh recording:
+ agentloom run examples/35_tool_calling.yaml --provider openai \
+ --model gpt-4o-mini --record recordings/tool_calling.json
+
+config:
+ provider: mock
+ model: gpt-4o-mini
+ responses_file: recordings/tool_calling.json
+ latency_model: constant
+ latency_ms: 0
+ sandbox:
+ enabled: true
+ allow_network: true
+ allowed_domains: ["httpbin.org"]
+ allowed_schemes: ["https"]
+
+state:
+ question: "What HTTP method does the httpbin /get endpoint accept?"
+
+steps:
+ - id: ask
+ type: llm_call
+ prompt: "{state.question}"
+ tools:
+ - name: http_request
+ description: "Make an HTTP request to a given URL and return the response body."
+ parameters:
+ type: object
+ properties:
+ url:
+ type: string
+ description: "Full URL to GET."
+ method:
+ type: string
+ enum: ["GET", "POST"]
+ description: "HTTP method."
+ required: [url]
+ tool_choice: auto
+ max_tool_iterations: 3
+ output: answer
diff --git a/recordings/tool_calling.json b/recordings/tool_calling.json
new file mode 100644
index 0000000..619a445
--- /dev/null
+++ b/recordings/tool_calling.json
@@ -0,0 +1,27 @@
+{
+ "ask": [
+ {
+ "content": "",
+ "provider": "openai",
+ "model": "gpt-4o-mini",
+ "tool_calls": [
+ {
+ "id": "call_demo_1",
+ "name": "http_request",
+ "arguments": {"url": "https://httpbin.org/get", "method": "GET"}
+ }
+ ],
+ "usage": {"prompt_tokens": 35, "completion_tokens": 18, "total_tokens": 53},
+ "cost_usd": 0.000018,
+ "finish_reason": "tool_calls"
+ },
+ {
+ "content": "The /get endpoint of httpbin accepts the GET HTTP method, as confirmed by the successful response from https://httpbin.org/get.",
+ "provider": "openai",
+ "model": "gpt-4o-mini",
+ "usage": {"prompt_tokens": 320, "completion_tokens": 26, "total_tokens": 346},
+ "cost_usd": 0.000061,
+ "finish_reason": "stop"
+ }
+ ]
+}
diff --git a/src/agentloom/providers/anthropic.py b/src/agentloom/providers/anthropic.py
index e989f7b..94a089f 100644
--- a/src/agentloom/providers/anthropic.py
+++ b/src/agentloom/providers/anthropic.py
@@ -111,6 +111,13 @@ def _format_messages(
else:
parts: list[dict[str, Any]] = []
for block in content:
+ # Tool-calling messages build pure-dict wire-format
+ # blocks (``tool_use``, ``tool_result``); pass those
+ # through verbatim. Multimodal Pydantic blocks below need
+ # translation to Anthropic's specific keys.
+ if isinstance(block, dict):
+ parts.append(block)
+ continue
if isinstance(block, TextBlock):
parts.append({"type": "text", "text": block.text})
elif isinstance(block, ImageBlock):
diff --git a/src/agentloom/providers/mock.py b/src/agentloom/providers/mock.py
index ebf7cf7..814956e 100644
--- a/src/agentloom/providers/mock.py
+++ b/src/agentloom/providers/mock.py
@@ -109,7 +109,12 @@ def __init__(
self._observer = observer
self._workflow_name = workflow_name
self.calls: list[dict[str, Any]] = []
- self._responses: dict[str, dict[str, Any]] = {}
+ # Each value is either a single response dict OR a list of turns
+ # to play in order (for tool-calling loops where one step issues
+ # multiple complete() calls). The cursor below tracks which turn
+ # the next call should emit per step_id.
+ self._responses: dict[str, Any] = {}
+ self._turn_cursor: dict[str, int] = {}
if self.responses_file and self.responses_file.exists():
raw = json.loads(self.responses_file.read_text())
if not isinstance(raw, dict):
@@ -126,7 +131,18 @@ def _lookup(
extra: dict[str, Any] | None = None,
) -> dict[str, Any] | None:
if step_id and step_id in self._responses:
- return self._responses[step_id]
+ entry = self._responses[step_id]
+ # List form: play the next turn in order, clamping the cursor at
+ # the last entry so excess iterations replay the final response
+ # instead of raising mid-loop.
+ if isinstance(entry, list):
+ if not entry:
+ return None
+ idx = self._turn_cursor.get(step_id, 0)
+ turn = entry[min(idx, len(entry) - 1)]
+ self._turn_cursor[step_id] = idx + 1
+ return turn if isinstance(turn, dict) else None
+ return entry if isinstance(entry, dict) else None
key = prompt_hash(messages, model, temperature, max_tokens, extra)
return self._responses.get(key)
@@ -185,10 +201,27 @@ async def complete(
)
usage_data = entry.get("usage", {}) or {}
+ # Hydrate ``tool_calls`` from the recording so replay drives the
+ # tool-iteration loop. Each turn carries its own ``tool_calls``.
+ tool_calls: list[Any] = []
+ recorded_tool_calls = entry.get("tool_calls") or []
+ if recorded_tool_calls:
+ from agentloom.providers.base import ToolCall
+
+ for tc in recorded_tool_calls:
+ tool_calls.append(
+ ToolCall(
+ id=str(tc.get("id", "")),
+ name=str(tc.get("name", "")),
+ arguments=tc.get("arguments", {}) or {},
+ )
+ )
+
+
return ProviderResponse(
content=str(entry.get("content", "")),
model=str(entry.get("model", model)),
- provider=self.name,
+ provider=str(entry.get("provider", self.name)),
usage=TokenUsage(
prompt_tokens=int(usage_data.get("prompt_tokens", 0)),
completion_tokens=int(usage_data.get("completion_tokens", 0)),
@@ -196,6 +229,7 @@ async def complete(
),
cost_usd=float(entry.get("cost_usd", 0.0)),
finish_reason=entry.get("finish_reason", "stop"),
+ tool_calls=tool_calls,
)
def supports_model(self, model: str) -> bool:
From a58a156c3b7aa4b9336eef0a0bdb33110bf4802d Mon Sep 17 00:00:00 2001
From: Carlos Chinchilla Corbacho
<188046461+cchinchilla-dev@users.noreply.github.com>
Date: Sat, 9 May 2026 12:20:47 +0200
Subject: [PATCH 3/4] fix(providers): parse ollama-shaped tool_calls (no type,
dict args)
---
src/agentloom/core/models.py | 1 -
src/agentloom/providers/google.py | 1 -
src/agentloom/providers/mock.py | 1 -
src/agentloom/providers/ollama.py | 1 -
src/agentloom/providers/openai.py | 1 -
src/agentloom/steps/_tools.py | 28 +++++++++++++++++++++++-----
tests/providers/test_tool_calling.py | 24 ++++++++++++++++++++++++
7 files changed, 47 insertions(+), 10 deletions(-)
diff --git a/src/agentloom/core/models.py b/src/agentloom/core/models.py
index 9063e68..11f3390 100644
--- a/src/agentloom/core/models.py
+++ b/src/agentloom/core/models.py
@@ -121,7 +121,6 @@ class ToolDefinition(BaseModel):
parameters: dict[str, Any] = Field(default_factory=dict)
-
class StepDefinition(BaseModel):
"""Definition of a single workflow step."""
diff --git a/src/agentloom/providers/google.py b/src/agentloom/providers/google.py
index 42d908a..6e6a0a5 100644
--- a/src/agentloom/providers/google.py
+++ b/src/agentloom/providers/google.py
@@ -280,7 +280,6 @@ async def complete(
tool_calls = parse_tool_calls_from_google(content_parts)
-
return ProviderResponse(
content=content,
model=model,
diff --git a/src/agentloom/providers/mock.py b/src/agentloom/providers/mock.py
index 814956e..d4ff691 100644
--- a/src/agentloom/providers/mock.py
+++ b/src/agentloom/providers/mock.py
@@ -217,7 +217,6 @@ async def complete(
)
)
-
return ProviderResponse(
content=str(entry.get("content", "")),
model=str(entry.get("model", model)),
diff --git a/src/agentloom/providers/ollama.py b/src/agentloom/providers/ollama.py
index 290e655..445c04c 100644
--- a/src/agentloom/providers/ollama.py
+++ b/src/agentloom/providers/ollama.py
@@ -232,7 +232,6 @@ async def complete(
tool_calls = parse_tool_calls_from_openai(data.get("message", {}) or {})
-
return ProviderResponse(
content=content,
model=data.get("model", model),
diff --git a/src/agentloom/providers/openai.py b/src/agentloom/providers/openai.py
index caeaf19..74ac97e 100644
--- a/src/agentloom/providers/openai.py
+++ b/src/agentloom/providers/openai.py
@@ -194,7 +194,6 @@ async def complete(
tool_calls = parse_tool_calls_from_openai(message)
-
return ProviderResponse(
content=content,
model=data.get("model", model),
diff --git a/src/agentloom/steps/_tools.py b/src/agentloom/steps/_tools.py
index 8e73bdf..51958e9 100644
--- a/src/agentloom/steps/_tools.py
+++ b/src/agentloom/steps/_tools.py
@@ -96,16 +96,34 @@ def translate_tool_choice_for_anthropic(choice: Any) -> Any:
def parse_tool_calls_from_openai(message: dict[str, Any]) -> list[ToolCall]:
- """OpenAI returns ``message.tool_calls = [{id, type, function: {name, arguments}}]``."""
+ """Parse ``message.tool_calls`` from OpenAI-shaped responses.
+
+ Handles both wire variants seen in the wild:
+
+ * **OpenAI canonical**: ``[{id, type:"function", function:{name, arguments:"{...}"}}]``
+ — ``arguments`` is a JSON-encoded string.
+ * **Ollama / OpenAI-compatible relays**: ``[{id, function:{name, arguments:{...}}}]``
+ — ``type`` may be omitted entirely, and ``arguments`` may already be a
+ decoded dict. Treat the absence of ``type`` as ``"function"`` (the
+ only call kind we currently dispatch); a non-function explicit
+ ``type`` (e.g. ``"code_interpreter"``) is skipped.
+ """
raw_calls = message.get("tool_calls") or []
calls: list[ToolCall] = []
for entry in raw_calls:
- if entry.get("type") != "function":
+ entry_type = entry.get("type", "function")
+ if entry_type != "function":
continue
fn = entry.get("function", {})
- try:
- args = json.loads(fn.get("arguments") or "{}")
- except json.JSONDecodeError:
+ raw_args = fn.get("arguments")
+ if isinstance(raw_args, dict):
+ args: dict[str, Any] = raw_args
+ elif isinstance(raw_args, str):
+ try:
+ args = json.loads(raw_args or "{}")
+ except json.JSONDecodeError:
+ args = {}
+ else:
args = {}
calls.append(ToolCall(id=entry.get("id", ""), name=fn.get("name", ""), arguments=args))
return calls
diff --git a/tests/providers/test_tool_calling.py b/tests/providers/test_tool_calling.py
index 8cb461b..9790d50 100644
--- a/tests/providers/test_tool_calling.py
+++ b/tests/providers/test_tool_calling.py
@@ -127,6 +127,30 @@ def test_openai_skips_non_function_entries(self) -> None:
assert len(calls) == 1
assert calls[0].name == ""
+ def test_ollama_compat_response_with_dict_args_and_no_type(self) -> None:
+ # Real Ollama 0.x ``/api/chat`` returns tool_calls in OpenAI-compatible
+ # shape but (a) omits ``"type": "function"`` and (b) ships
+ # ``arguments`` as an already-decoded dict, not a JSON string. Before
+ # this fix, Ollama tool calling silently dropped every call: the
+ # parser's strict ``type == "function"`` check skipped entries missing
+ # the field, and ``json.loads()`` raised TypeError on a dict.
+ # Captured from a live ``llama3.1:8b`` response on 2026-05-09.
+ message = {
+ "tool_calls": [
+ {
+ "id": "call_eh7yrv0u",
+ "function": {
+ "index": 0,
+ "name": "add",
+ "arguments": {"a": 17, "b": 25}, # already a dict
+ },
+ # NOTE: no "type" key
+ }
+ ]
+ }
+ calls = parse_tool_calls_from_openai(message)
+ assert calls == [ToolCall(id="call_eh7yrv0u", name="add", arguments={"a": 17, "b": 25})]
+
def test_anthropic_parses_tool_use_blocks(self) -> None:
blocks = [
{"type": "text", "text": "I'll use a tool."},
From c2966e8b0cffa4fe58435963e5a0399fbff60ce5 Mon Sep 17 00:00:00 2001
From: Carlos Chinchilla Corbacho
<188046461+cchinchilla-dev@users.noreply.github.com>
Date: Sun, 10 May 2026 11:01:24 +0200
Subject: [PATCH 4/4] docs: surface tool calling in changelog, workflow-yaml,
and examples
---
CHANGELOG.md | 1 +
docs/examples.md | 16 ++++++++++++++++
docs/workflow-yaml.md | 33 ++++++++++++++++++++++++++++++++-
3 files changed, 49 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9992e71..d31c455 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
+- Native tool / function calling across providers (#116). New `tools`, `tool_choice`, and `max_tool_iterations` fields on `llm_call`; the LLM step dispatches via the existing `ToolRegistry`, feeds results back, and re-prompts until the model stops asking (capped by `max_tool_iterations`, default 5). Each adapter translates the unified `ToolDefinition` list to its native shape (OpenAI / Ollama / Anthropic / Google). Parallel tool calls dispatch concurrently and preserve order; failures are reported back as text so the model can recover. Sandbox (#105), budget (#108), and per-step retry (#106) apply unchanged. `MockProvider` recordings accept a list of turns per step so offline replay drives the loop end-to-end; `examples/35_tool_calling.yaml` ships a ReAct-style example. The OpenAI-shaped parser also handles Ollama-compat responses (no `type` field, `arguments` as a decoded dict) — without this Ollama tool calling silently dropped every call.
- Per-run experiment metadata logging (#77). Every workflow execution now writes a self-contained JSON record (`run_id`, ISO timestamp, AgentLoom version, Python version, workflow `sha256` hash, list of `provider/model` pairs used, status, total cost, total tokens, step count, duration) to `./agentloom_runs/.json`. Override the directory via the `runs_dir` constructor argument on `RunHistoryWriter` or the `AGENTLOOM_RUNS_DIR` env var. Disk I/O happens in a worker thread so the write does not block the event loop. Records carry a `_schema_version: 1` field; failures during the write are logged and swallowed so a broken history directory cannot prevent the engine from returning the result. New `agentloom history` CLI subcommand lists records most-recent-first and accepts `--workflow`, `--provider`, `--since YYYY-MM-DD`, `--until YYYY-MM-DD`, `--min-cost`, `--max-cost`, `--limit`, and `--json` filters — covering the full filter surface (date, workflow, cost, provider) called for in the original issue.
- Quality annotations attachable to `WorkflowResult` (#59). New `WorkflowResult.annotate(target, quality_score=..., source=..., **metadata)` method appends a typed `QualityAnnotation` (`target`, `quality_score`, `source`, `metadata`) to the result so evaluators, human reviewers, or downstream scoring code can record output quality after the run completes. **The annotation is auto-emitted as an OTel span** the moment `annotate()` runs whenever the engine returned the result with a tracing context attached (the default for any workflow run with observability enabled) — `result.annotate("answer", quality_score=4.5, source="human_feedback")` becomes immediately visible in Jaeger with no additional plumbing on the caller side. Each annotation is published as a standalone `quality:` span (the workflow span has already closed, so retroactive attribute attachment is not viable). Quality spans carry `workflow.run_id` and `workflow.name` plus `agentloom.quality.score`, `agentloom.quality.source`, `agentloom.quality.target`, and free-form `agentloom.quality.metadata.*` attributes — Jaeger / Tempo can group quality spans with the original trace by run_id, and dashboards can filter for `agentloom.quality.score < threshold` to surface regressions. Offline / replay paths that build a `WorkflowResult` without a live tracer keep working — `annotate()` still records the data on the result, the OTel emission just no-ops. The `agentloom.observability.quality.emit_quality_annotation` / `emit_quality_annotations` helpers remain available for callers that build annotations outside the engine flow (e.g. batch evaluators reading historical results from disk).
- OTel span and metric schema centralization with GenAI semantic conventions (#125). The schema is a clean break — no compatibility shims for pre-#125 attribute or metric names. New `agentloom.observability.schema` module is the single source of truth for span / attribute / metric names; downstream consumers (Grafana, AgentTest, Jaeger plugins) parse a stable contract instead of grepping for ad-hoc strings. **Metrics renamed and retyped** to match the OTel GenAI registry: `agentloom_tokens_total` (counter) → `gen_ai.client.token.usage` (histogram, `{token}` unit) with `gen_ai.token.type` attribute (`input` / `output` / `reasoning`); `agentloom_provider_latency_seconds` (histogram) → `gen_ai.client.operation.duration` (histogram, `s`) with `gen_ai.operation.name` + `gen_ai.provider.name` attributes; `agentloom_time_to_first_token_seconds` → `gen_ai.client.operation.time_to_first_chunk`. AgentLoom-specific metrics (`agentloom_workflow_*`, `agentloom_step_*`, `agentloom_provider_calls_total`, `agentloom_cost_usd_total`, `agentloom_circuit_breaker_state`, `agentloom_budget_remaining_usd`, HITL / mock / recording counters) keep their `agentloom_` prefix — they have no OTel equivalent. The bundled Grafana dashboard is updated to query the new metric / label names. The legacy `Observer.on_provider_call` hook (which duplicated the metric emission already done by `on_provider_call_end`) is removed; the engine no longer fires it. The `tokens: int` positional argument on `on_step_end` is removed — callers now pass `prompt_tokens` / `completion_tokens` as kwargs. Span attributes follow the **canonical OTel GenAI registry** as of the May 2026 spec — `gen_ai.provider.name` (the deprecated `gen_ai.system` is **not** emitted), `gen_ai.operation.name`, `gen_ai.request.model`, `gen_ai.request.temperature`, `gen_ai.request.max_tokens`, `gen_ai.request.stream`, `gen_ai.response.model`, `gen_ai.response.finish_reasons` (array of strings, per spec), `gen_ai.response.time_to_first_chunk`, `gen_ai.usage.input_tokens`, `gen_ai.usage.output_tokens`, `gen_ai.usage.reasoning.output_tokens`. Errored inference spans also emit the OTel general-conventions attribute `error.type` alongside the AgentLoom-specific `step.error` so OTel-aware consumers (Jaeger error filters, Tempo) light up. Inference spans use the canonical name template `"{operation_name} {model}"` (e.g. `"chat gpt-4o-mini"`); workflow / step orchestration spans keep the AgentLoom-specific `workflow:*` / `step:*` names. AgentLoom-specific fields stay under `workflow.*` / `step.*` / `tool.*` / `agentloom.*` namespaces. Provider names are translated from AgentLoom internal names to OTel registry values via `to_genai_provider_name` (e.g. `google` → `gcp.gemini`). Notable additions:
diff --git a/docs/examples.md b/docs/examples.md
index e763b99..0b9013a 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -295,3 +295,19 @@ fixtures and CI.
# Depends on the recording captured from example 31
agentloom run examples/32_yaml_mock.yaml --lite
```
+
+## Tool calling
+
+### 35 — Native tool/function calling
+
+ReAct-style agent: the model decides to invoke `http_request` against `httpbin.org/get`, receives the JSON, and emits a final natural-language answer. Sandbox is on with `allowed_domains: ["httpbin.org"]`, so the model-dispatched call goes through the same security policy as static `tool` steps (#105).
+
+**Demonstrates:** `tools` declaration on `llm_call`, `tool_choice: auto`, `max_tool_iterations`, model-driven dispatch via `ToolRegistry`, sandboxed tool execution, replay support for tool-iteration loops.
+
+```bash
+# Mock-replay against the committed recording (no API calls)
+agentloom run examples/35_tool_calling.yaml --lite
+
+# Real call: pass --provider + --model to drive a live model
+agentloom run examples/35_tool_calling.yaml --provider openai --model gpt-4o-mini
+```
diff --git a/docs/workflow-yaml.md b/docs/workflow-yaml.md
index 1c54f89..1fc15c6 100644
--- a/docs/workflow-yaml.md
+++ b/docs/workflow-yaml.md
@@ -191,6 +191,37 @@ Per-provider translation:
Reasoning tokens are billed at the output rate. `TokenUsage.reasoning_tokens` and `billable_completion_tokens` track the spend; `calculate_cost()` includes them automatically. See [Reasoning models](providers.md#reasoning-models) for per-provider details, including the Ollama caveat that `eval_count` is not split.
+**Tool calling:**
+
+The model can pick tools at runtime. Declare them on the step; the engine dispatches via the workflow's `ToolRegistry`, feeds results back, and re-prompts until the model stops asking for tools.
+
+```yaml
+- id: ask
+ type: llm_call
+ prompt: "What is the user's account balance?"
+ tools:
+ - name: lookup_account
+ description: "Retrieve account info by ID."
+ parameters:
+ type: object
+ properties:
+ account_id: { type: string }
+ required: [account_id]
+ tool_choice: auto # auto | required | none | {name: lookup_account}
+ max_tool_iterations: 5 # bound the loop; default 5
+ output: answer
+```
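+
+A single pass through the loop then looks roughly like the sketch below, shown on the OpenAI wire format. The call ID, argument values, and tool result are invented for illustration; the message shapes mirror the ones exercised in this patch's tests, and other providers use their own block formats.
+
+```python
+# Hypothetical conversation state after one tool iteration (OpenAI shape).
+messages = [
+    {"role": "user", "content": "What is the user's account balance?"},
+    {
+        # Iteration 1: the model asks to call the declared tool.
+        "role": "assistant",
+        "content": None,
+        "tool_calls": [
+            {
+                "id": "call_1",
+                "type": "function",
+                "function": {"name": "lookup_account", "arguments": '{"account_id": "a-42"}'},
+            }
+        ],
+    },
+    # The engine dispatches the call via the ToolRegistry and feeds the result back.
+    {"role": "tool", "tool_call_id": "call_1", "content": '{"balance": 125.0}'},
+    # Iteration 2: the model answers with plain content and the loop stops.
+]
+```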
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `tools` | `list[ToolDefinition]` | `[]` | Tool declarations the model can pick. `parameters` is JSON Schema. Names resolve against the registered `ToolRegistry`; an unknown name is reported back as a tool failure rather than aborting the loop. |
+| `tool_choice` | `string \| dict` | `"auto"` | `"auto"` lets the model decide; `"required"` forces a call; `"none"` disables tools for this turn; `{"name": "..."}` pins to a specific tool. Anthropic ignores `"none"` (omits the field); Ollama ignores `tool_choice` entirely (model-side support decides). |
+| `max_tool_iterations` | `int` | `5` | Cap on call→result→re-prompt loops. When hit, `finish_reason` becomes `"max_tool_iterations"` so callers can detect runaway behavior. |
+
+The dispatched tool runs through the existing sandbox (#105), so `http_request`, `shell_command`, `file_read`, `file_write` honor the workflow's `sandbox:` config. Multiple tool calls in one response are dispatched concurrently (anyio task group); results preserve order in the conversation. Cost and tokens accumulate across iterations on the surfaced `StepResult`.
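+
+A minimal sketch of that ordered-but-concurrent dispatch is shown below. It is not the actual `_tools.py` implementation: `dispatch_one` stands in for whatever resolves a call against the `ToolRegistry`, and error formatting is simplified. It only illustrates how an anyio task group can run every call at once while keeping results aligned with the order the model requested them.
+
+```python
+import anyio
+
+
+async def dispatch_all(tool_calls, dispatch_one):
+    # Pre-size the result list so each task writes into its own slot and the
+    # original request order is preserved regardless of completion order.
+    results = [None] * len(tool_calls)
+
+    async def run(idx, call):
+        try:
+            results[idx] = await dispatch_one(call)
+        except Exception as exc:
+            # Failures are reported back as text so the model can recover.
+            results[idx] = f"tool error: {exc}"
+
+    async with anyio.create_task_group() as tg:
+        for idx, call in enumerate(tool_calls):
+            tg.start_soon(run, idx, call)
+    return results
+```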
+
+The legacy `tool` step (static DAG node, author chooses the tool) keeps working unchanged — `tools=` on `llm_call` is the new dynamic, model-driven path.
+
**Retry config:**
| Field | Type | Default | Description |
@@ -231,7 +262,7 @@ Evaluates conditions against state and activates a target step. Steps not activa
### `tool`
-Executes a registered tool (shell command, HTTP request, etc.).
+Executes a registered tool with author-chosen arguments — the workflow author decides which tool to call, not the model. For model-driven tool selection, use the `tools=` field on an `llm_call` step (see [tool calling](#llm_call) above).
```yaml
- id: fetch