From 91bf184ea9877dde22ece7b7f9297fe8c9240404 Mon Sep 17 00:00:00 2001 From: Workshop Participant Date: Wed, 9 Jul 2025 09:21:48 +0000 Subject: [PATCH 01/12] Updates tests with cache tokens for the supported models --- tests-integ/test_bedrock_cache_point.py | 16 ++---- tests/strands/event_loop/test_streaming.py | 60 +++++++++++++++++++--- tests/strands/models/test_anthropic.py | 9 +++- tests/strands/models/test_bedrock.py | 56 +++++++++++++++++--- tests/strands/models/test_litellm.py | 10 +++- tests/strands/models/test_llamaapi.py | 2 + tests/strands/models/test_mistral.py | 4 ++ tests/strands/models/test_ollama.py | 2 + tests/strands/telemetry/test_metrics.py | 16 +++--- tests/strands/telemetry/test_tracer.py | 15 +++++- tests/strands/types/models/test_openai.py | 12 ++++- 11 files changed, 164 insertions(+), 38 deletions(-) diff --git a/tests-integ/test_bedrock_cache_point.py b/tests-integ/test_bedrock_cache_point.py index 82bca22a2..8d6d3c16a 100644 --- a/tests-integ/test_bedrock_cache_point.py +++ b/tests-integ/test_bedrock_cache_point.py @@ -16,16 +16,8 @@ def test_bedrock_cache_point(): {"role": "assistant", "content": [{"text": "Blue!"}]}, ] - cache_point_usage = 0 + agent = Agent(messages=messages, load_tools_from_directory=False) + response = agent("What is favorite color?") - def cache_point_callback_handler(**kwargs): - nonlocal cache_point_usage - if "event" in kwargs and kwargs["event"] and "metadata" in kwargs["event"] and kwargs["event"]["metadata"]: - metadata = kwargs["event"]["metadata"] - if "usage" in metadata and metadata["usage"]: - if "cacheReadInputTokens" in metadata["usage"] or "cacheWriteInputTokens" in metadata["usage"]: - cache_point_usage += 1 - - agent = Agent(messages=messages, callback_handler=cache_point_callback_handler, load_tools_from_directory=False) - agent("What is favorite color?") - assert cache_point_usage > 0 + usage = response.metrics.accumulated_usage + assert usage["cacheReadInputTokens"] >= 0 or usage["cacheWriteInputTokens"] > 0 # At least one should have tokens diff --git a/tests/strands/event_loop/test_streaming.py b/tests/strands/event_loop/test_streaming.py index 7b64264e3..5199ea3fe 100644 --- a/tests/strands/event_loop/test_streaming.py +++ b/tests/strands/event_loop/test_streaming.py @@ -250,7 +250,13 @@ def test_handle_message_stop(): def test_extract_usage_metrics(): event = { - "usage": {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0}, + "usage": { + "inputTokens": 0, + "outputTokens": 0, + "totalTokens": 0, + "cacheReadInputTokens": 0, + "cacheWriteInputTokens": 0, + }, "metrics": {"latencyMs": 0}, } @@ -279,7 +285,13 @@ def test_extract_usage_metrics(): }, { "metadata": { - "usage": {"inputTokens": 1, "outputTokens": 1, "totalTokens": 1}, + "usage": { + "inputTokens": 1, + "outputTokens": 1, + "totalTokens": 1, + "cacheReadInputTokens": 1, + "cacheWriteInputTokens": 1, + }, "metrics": {"latencyMs": 1}, } }, @@ -364,6 +376,8 @@ def test_extract_usage_metrics(): "inputTokens": 1, "outputTokens": 1, "totalTokens": 1, + "cacheReadInputTokens": 1, + "cacheWriteInputTokens": 1, }, }, }, @@ -376,7 +390,13 @@ def test_extract_usage_metrics(): "role": "assistant", "content": [{"toolUse": {"toolUseId": "123", "name": "test", "input": {"key": "value"}}}], }, - {"inputTokens": 1, "outputTokens": 1, "totalTokens": 1}, + { + "inputTokens": 1, + "outputTokens": 1, + "totalTokens": 1, + "cacheReadInputTokens": 1, + "cacheWriteInputTokens": 1, + }, {"latencyMs": 1}, ) }, @@ -398,7 +418,13 @@ def test_extract_usage_metrics(): "role": 
"assistant", "content": [], }, - {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0}, + { + "inputTokens": 0, + "outputTokens": 0, + "totalTokens": 0, + "cacheReadInputTokens": 0, + "cacheWriteInputTokens": 0, + }, {"latencyMs": 0}, ), }, @@ -426,7 +452,13 @@ def test_extract_usage_metrics(): }, { "metadata": { - "usage": {"inputTokens": 1, "outputTokens": 1, "totalTokens": 1}, + "usage": { + "inputTokens": 1, + "outputTokens": 1, + "totalTokens": 1, + "cacheReadInputTokens": 1, + "cacheWriteInputTokens": 1, + }, "metrics": {"latencyMs": 1}, } }, @@ -506,6 +538,8 @@ def test_extract_usage_metrics(): "inputTokens": 1, "outputTokens": 1, "totalTokens": 1, + "cacheReadInputTokens": 1, + "cacheWriteInputTokens": 1, }, }, }, @@ -518,7 +552,13 @@ def test_extract_usage_metrics(): "role": "assistant", "content": [{"text": "REDACTED."}], }, - {"inputTokens": 1, "outputTokens": 1, "totalTokens": 1}, + { + "inputTokens": 1, + "outputTokens": 1, + "totalTokens": 1, + "cacheReadInputTokens": 1, + "cacheWriteInputTokens": 1, + }, {"latencyMs": 1}, ), }, @@ -584,7 +624,13 @@ async def test_stream_messages(agenerator, alist): "stop": ( "end_turn", {"role": "assistant", "content": [{"text": "test"}]}, - {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0}, + { + "inputTokens": 0, + "outputTokens": 0, + "totalTokens": 0, + "cacheReadInputTokens": 0, + "cacheWriteInputTokens": 0, + }, {"latencyMs": 0}, ) }, diff --git a/tests/strands/models/test_anthropic.py b/tests/strands/models/test_anthropic.py index 66046b7a8..6b47dc2b0 100644 --- a/tests/strands/models/test_anthropic.py +++ b/tests/strands/models/test_anthropic.py @@ -597,7 +597,12 @@ def test_format_chunk_message_stop(model): def test_format_chunk_metadata(model): event = { "type": "metadata", - "usage": {"input_tokens": 1, "output_tokens": 2}, + "usage": { + "input_tokens": 1, + "output_tokens": 2, + "cache_read_input_tokens": 4, + "cache_creation_input_tokens": 5, + }, } tru_chunk = model.format_chunk(event) @@ -607,6 +612,8 @@ def test_format_chunk_metadata(model): "inputTokens": 1, "outputTokens": 2, "totalTokens": 3, + "cacheReadInputTokens": 4, + "cacheWriteInputTokens": 5, }, "metrics": { "latencyMs": 0, diff --git a/tests/strands/models/test_bedrock.py b/tests/strands/models/test_bedrock.py index e9fd9f34a..fb059479e 100644 --- a/tests/strands/models/test_bedrock.py +++ b/tests/strands/models/test_bedrock.py @@ -497,7 +497,13 @@ async def test_converse_stream_input_guardrails( ): metadata_event = { "metadata": { - "usage": {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0}, + "usage": { + "inputTokens": 0, + "outputTokens": 0, + "totalTokens": 0, + "cacheReadInputTokens": 0, + "cacheWriteInputTokens": 0, + }, "metrics": {"latencyMs": 245}, "trace": { "guardrail": { @@ -552,7 +558,13 @@ async def test_converse_stream_output_guardrails( model.update_config(guardrail_redact_input=False, guardrail_redact_output=True) metadata_event = { "metadata": { - "usage": {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0}, + "usage": { + "inputTokens": 0, + "outputTokens": 0, + "totalTokens": 0, + "cacheReadInputTokens": 0, + "cacheWriteInputTokens": 0, + }, "metrics": {"latencyMs": 245}, "trace": { "guardrail": { @@ -609,7 +621,13 @@ async def test_converse_output_guardrails_redacts_input_and_output( model.update_config(guardrail_redact_output=True) metadata_event = { "metadata": { - "usage": {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0}, + "usage": { + "inputTokens": 0, + "outputTokens": 0, + "totalTokens": 0, + 
"cacheReadInputTokens": 0, + "cacheWriteInputTokens": 0, + }, "metrics": {"latencyMs": 245}, "trace": { "guardrail": { @@ -666,7 +684,13 @@ async def test_converse_output_no_blocked_guardrails_doesnt_redact( ): metadata_event = { "metadata": { - "usage": {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0}, + "usage": { + "inputTokens": 0, + "outputTokens": 0, + "totalTokens": 0, + "cacheReadInputTokens": 0, + "cacheWriteInputTokens": 0, + }, "metrics": {"latencyMs": 245}, "trace": { "guardrail": { @@ -719,7 +743,13 @@ async def test_converse_output_no_guardrail_redact( ): metadata_event = { "metadata": { - "usage": {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0}, + "usage": { + "inputTokens": 0, + "outputTokens": 0, + "totalTokens": 0, + "cacheReadInputTokens": 0, + "cacheWriteInputTokens": 0, + }, "metrics": {"latencyMs": 245}, "trace": { "guardrail": { @@ -909,7 +939,13 @@ async def test_stream_with_streaming_false_with_metrics_and_usage(bedrock_client """Test stream method with streaming=False.""" bedrock_client.converse.return_value = { "output": {"message": {"role": "assistant", "content": [{"text": "test"}]}}, - "usage": {"inputTokens": 1234, "outputTokens": 1234, "totalTokens": 2468}, + "usage": { + "inputTokens": 1234, + "outputTokens": 1234, + "totalTokens": 2468, + "cacheReadInputTokens": 128, + "cacheWriteInputTokens": 512, + }, "metrics": {"latencyMs": 1234}, "stopReason": "tool_use", } @@ -927,7 +963,13 @@ async def test_stream_with_streaming_false_with_metrics_and_usage(bedrock_client {"messageStop": {"stopReason": "tool_use", "additionalModelResponseFields": None}}, { "metadata": { - "usage": {"inputTokens": 1234, "outputTokens": 1234, "totalTokens": 2468}, + "usage": { + "inputTokens": 1234, + "outputTokens": 1234, + "totalTokens": 2468, + "cacheReadInputTokens": 128, + "cacheWriteInputTokens": 512, + }, "metrics": {"latencyMs": 1234}, } }, diff --git a/tests/strands/models/test_litellm.py b/tests/strands/models/test_litellm.py index 8f4a9e341..1e496e814 100644 --- a/tests/strands/models/test_litellm.py +++ b/tests/strands/models/test_litellm.py @@ -146,7 +146,15 @@ async def test_stream(litellm_client, model, alist): mock_event_3 = unittest.mock.Mock(choices=[unittest.mock.Mock(finish_reason=None, delta=mock_delta_3)]) mock_event_4 = unittest.mock.Mock(choices=[unittest.mock.Mock(finish_reason=None, delta=mock_delta_4)]) mock_event_5 = unittest.mock.Mock(choices=[unittest.mock.Mock(finish_reason="tool_calls", delta=mock_delta_5)]) - mock_event_6 = unittest.mock.Mock() + mock_event_6 = unittest.mock.Mock( + usage=unittest.mock.Mock( + prompt_tokens_details=unittest.mock.Mock( + audio_tokens=None, cached_tokens=0, text_tokens=None, image_tokens=None + ), + cache_creation_input_tokens=0, + cache_read_input_tokens=0, + ) + ) litellm_client.chat.completions.create.return_value = iter( [mock_event_1, mock_event_2, mock_event_3, mock_event_4, mock_event_5, mock_event_6] diff --git a/tests/strands/models/test_llamaapi.py b/tests/strands/models/test_llamaapi.py index 309dac2e9..ee8449e51 100644 --- a/tests/strands/models/test_llamaapi.py +++ b/tests/strands/models/test_llamaapi.py @@ -346,6 +346,8 @@ def test_format_chunk_metadata(model): "inputTokens": 100, "outputTokens": 50, "totalTokens": 150, + "cacheReadInputTokens": 0, + "cacheWriteInputTokens": 0, }, "metrics": { "latencyMs": 0, diff --git a/tests/strands/models/test_mistral.py b/tests/strands/models/test_mistral.py index 786ba25b3..566feffe9 100644 --- a/tests/strands/models/test_mistral.py +++ 
b/tests/strands/models/test_mistral.py @@ -391,6 +391,8 @@ def test_format_chunk_metadata(model): "inputTokens": 100, "outputTokens": 50, "totalTokens": 150, + "cacheReadInputTokens": 0, + "cacheWriteInputTokens": 0, }, "metrics": { "latencyMs": 250, @@ -419,6 +421,8 @@ def test_format_chunk_metadata_no_latency(model): "inputTokens": 100, "outputTokens": 50, "totalTokens": 150, + "cacheReadInputTokens": 0, + "cacheWriteInputTokens": 0, }, "metrics": { "latencyMs": 0, diff --git a/tests/strands/models/test_ollama.py b/tests/strands/models/test_ollama.py index c718a602c..2f765e820 100644 --- a/tests/strands/models/test_ollama.py +++ b/tests/strands/models/test_ollama.py @@ -398,6 +398,8 @@ def test_format_chunk_metadata(model): "inputTokens": 100, "outputTokens": 50, "totalTokens": 150, + "cacheReadInputTokens": 0, + "cacheWriteInputTokens": 0, }, "metrics": { "latencyMs": 1.0, diff --git a/tests/strands/telemetry/test_metrics.py b/tests/strands/telemetry/test_metrics.py index 215e1efde..f1615e79a 100644 --- a/tests/strands/telemetry/test_metrics.py +++ b/tests/strands/telemetry/test_metrics.py @@ -90,6 +90,8 @@ def usage(request): "inputTokens": 1, "outputTokens": 2, "totalTokens": 3, + "cacheReadInputTokens": 4, + "cacheWriteInputTokens": 5, } if hasattr(request, "param"): params.update(request.param) @@ -315,17 +317,15 @@ def test_event_loop_metrics_update_usage(usage, event_loop_metrics, mock_get_met event_loop_metrics.update_usage(usage) tru_usage = event_loop_metrics.accumulated_usage - exp_usage = Usage( - inputTokens=3, - outputTokens=6, - totalTokens=9, - ) + exp_usage = Usage(inputTokens=3, outputTokens=6, totalTokens=9, cacheReadInputTokens=12, cacheWriteInputTokens=15) assert tru_usage == exp_usage mock_get_meter_provider.return_value.get_meter.assert_called() metrics_client = event_loop_metrics._metrics_client metrics_client.event_loop_input_tokens.record.assert_called() metrics_client.event_loop_output_tokens.record.assert_called() + metrics_client.event_loop_input_tokens_cache_read.record.assert_called() + metrics_client.event_loop_input_tokens_cache_write.record.assert_called() def test_event_loop_metrics_update_metrics(metrics, event_loop_metrics, mock_get_meter_provider): @@ -358,6 +358,8 @@ def test_event_loop_metrics_get_summary(trace, tool, event_loop_metrics, mock_ge "inputTokens": 0, "outputTokens": 0, "totalTokens": 0, + "cacheReadInputTokens": 0, + "cacheWriteInputTokens": 0, }, "average_cycle_time": 0, "tool_usage": { @@ -394,7 +396,7 @@ def test_event_loop_metrics_get_summary(trace, tool, event_loop_metrics, mock_ge {}, "Event Loop Metrics Summary:\n" "├─ Cycles: total=0, avg_time=0.000s, total_time=0.000s\n" - "├─ Tokens: in=0, out=0, total=0\n" + "├─ Tokens: in=0 (cache_write=0), out=0, total=0 (cache_read=0)\n" "├─ Bedrock Latency: 0ms\n" "├─ Tool Usage:\n" " └─ tool1:\n" @@ -412,7 +414,7 @@ def test_event_loop_metrics_get_summary(trace, tool, event_loop_metrics, mock_ge {}, "Event Loop Metrics Summary:\n" "├─ Cycles: total=0, avg_time=0.000s, total_time=0.000s\n" - "├─ Tokens: in=0, out=0, total=0\n" + "├─ Tokens: in=0 (cache_write=0), out=0, total=0 (cache_read=0)\n" "├─ Bedrock Latency: 0ms\n" "├─ Tool Usage:\n" " └─ tool1:\n" diff --git a/tests/strands/telemetry/test_tracer.py b/tests/strands/telemetry/test_tracer.py index 2fcd98c39..8d13ebd16 100644 --- a/tests/strands/telemetry/test_tracer.py +++ b/tests/strands/telemetry/test_tracer.py @@ -166,7 +166,7 @@ def test_end_model_invoke_span(mock_span): """Test ending a model invoke span.""" tracer = Tracer() 
message = {"role": "assistant", "content": [{"text": "Response"}]} - usage = Usage(inputTokens=10, outputTokens=20, totalTokens=30) + usage = Usage(inputTokens=10, outputTokens=20, totalTokens=30, cacheReadInputTokens=4, cacheWriteInputTokens=25) stop_reason: StopReason = "end_turn" tracer.end_model_invoke_span(mock_span, message, usage, stop_reason) @@ -176,6 +176,9 @@ def test_end_model_invoke_span(mock_span): mock_span.set_attribute.assert_any_call("gen_ai.usage.completion_tokens", 20) mock_span.set_attribute.assert_any_call("gen_ai.usage.output_tokens", 20) mock_span.set_attribute.assert_any_call("gen_ai.usage.total_tokens", 30) + mock_span.set_attribute.assert_any_call("gen_ai.usage.cache_read_input_tokens", 4) + mock_span.set_attribute.assert_any_call("gen_ai.usage.cache_write_input_tokens", 25) + mock_span.add_event.assert_called_with( "gen_ai.choice", attributes={"message": json.dumps(message["content"]), "finish_reason": "end_turn"}, @@ -305,7 +308,13 @@ def test_end_agent_span(mock_span): # Mock AgentResult with metrics mock_metrics = mock.MagicMock() - mock_metrics.accumulated_usage = {"inputTokens": 50, "outputTokens": 100, "totalTokens": 150} + mock_metrics.accumulated_usage = { + "inputTokens": 50, + "outputTokens": 100, + "totalTokens": 150, + "cacheReadInputTokens": 60, + "cacheWriteInputTokens": 100, + } mock_response = mock.MagicMock() mock_response.metrics = mock_metrics @@ -319,6 +328,8 @@ def test_end_agent_span(mock_span): mock_span.set_attribute.assert_any_call("gen_ai.usage.completion_tokens", 100) mock_span.set_attribute.assert_any_call("gen_ai.usage.output_tokens", 100) mock_span.set_attribute.assert_any_call("gen_ai.usage.total_tokens", 150) + mock_span.set_attribute.assert_any_call("gen_ai.usage.cache_read_input_tokens", 60) + mock_span.set_attribute.assert_any_call("gen_ai.usage.cache_write_input_tokens", 100) mock_span.add_event.assert_any_call( "gen_ai.choice", attributes={"message": "Agent response", "finish_reason": "end_turn"}, diff --git a/tests/strands/types/models/test_openai.py b/tests/strands/types/models/test_openai.py index 5baa7e709..04a8330b0 100644 --- a/tests/strands/types/models/test_openai.py +++ b/tests/strands/types/models/test_openai.py @@ -322,7 +322,15 @@ def test_format_request(model, messages, tool_specs, system_prompt): ( { "chunk_type": "metadata", - "data": unittest.mock.Mock(prompt_tokens=100, completion_tokens=50, total_tokens=150), + "data": unittest.mock.Mock( + spec_set=["prompt_tokens", "completion_tokens", "total_tokens", "prompt_tokens_details"], + prompt_tokens=100, + completion_tokens=50, + total_tokens=150, + prompt_tokens_details=unittest.mock.Mock( + spec_set=["cached_tokens", "audio_tokens"], cached_tokens=42, audio_tokens=None + ), + ), }, { "metadata": { @@ -330,6 +338,8 @@ def test_format_request(model, messages, tool_specs, system_prompt): "inputTokens": 100, "outputTokens": 50, "totalTokens": 150, + "cacheReadInputTokens": 42, + "cacheWriteInputTokens": 0, }, "metrics": { "latencyMs": 0, From 28c461be6c9da459cd9df9903792b00182520fe8 Mon Sep 17 00:00:00 2001 From: Workshop Participant Date: Wed, 9 Jul 2025 09:22:37 +0000 Subject: [PATCH 02/12] Adds cache tokens to usage type --- src/strands/types/event_loop.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/strands/types/event_loop.py b/src/strands/types/event_loop.py index 7be33b6fd..99f154800 100644 --- a/src/strands/types/event_loop.py +++ b/src/strands/types/event_loop.py @@ -5,18 +5,22 @@ from typing_extensions import TypedDict -class 
Usage(TypedDict):
+class Usage(TypedDict, total=False):
     """Token usage information for model interactions.
 
     Attributes:
         inputTokens: Number of tokens sent in the request to the model..
         outputTokens: Number of tokens that the model generated for the request.
         totalTokens: Total number of tokens (input + output).
+        cacheReadInputTokens: Number of tokens read from cache.
+        cacheWriteInputTokens: Number of tokens written to cache.
     """
 
     inputTokens: int
     outputTokens: int
     totalTokens: int
+    cacheReadInputTokens: int
+    cacheWriteInputTokens: int
 
 
 class Metrics(TypedDict):

From 9ca9b72978642f3bbfbe3b7fff2f20ca0b6ab517 Mon Sep 17 00:00:00 2001
From: Workshop Participant
Date: Wed, 9 Jul 2025 09:23:33 +0000
Subject: [PATCH 03/12] Inits cache token usage

---
 src/strands/event_loop/streaming.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/strands/event_loop/streaming.py b/src/strands/event_loop/streaming.py
index 6ecc3e270..6fa2a18d8 100644
--- a/src/strands/event_loop/streaming.py
+++ b/src/strands/event_loop/streaming.py
@@ -275,7 +275,7 @@ async def process_stream(
         }
         state["content"] = state["message"]["content"]
 
-    usage: Usage = Usage(inputTokens=0, outputTokens=0, totalTokens=0)
+    usage: Usage = Usage(inputTokens=0, outputTokens=0, totalTokens=0, cacheReadInputTokens=0, cacheWriteInputTokens=0)
     metrics: Metrics = Metrics(latencyMs=0)
 
     async for chunk in chunks:

From 345a1479642d769b33c4cb57c2b935954734d785 Mon Sep 17 00:00:00 2001
From: Workshop Participant
Date: Wed, 9 Jul 2025 09:24:19 +0000
Subject: [PATCH 04/12] Reads cache tokens from anthropic response

---
 src/strands/models/anthropic.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/strands/models/anthropic.py b/src/strands/models/anthropic.py
index 02c3d9089..94058ffa8 100644
--- a/src/strands/models/anthropic.py
+++ b/src/strands/models/anthropic.py
@@ -333,6 +333,8 @@ def format_chunk(self, event: dict[str, Any]) -> StreamEvent:
                         "inputTokens": usage["input_tokens"],
                         "outputTokens": usage["output_tokens"],
                         "totalTokens": usage["input_tokens"] + usage["output_tokens"],
+                        "cacheReadInputTokens": usage.get("cache_read_input_tokens", 0),
+                        "cacheWriteInputTokens": usage.get("cache_creation_input_tokens", 0),
                     },
                     "metrics": {
                         "latencyMs": 0,  # TODO

From a42d4f7a05fc7d6f54dee7b70b302d26ca0fa131 Mon Sep 17 00:00:00 2001
From: Workshop Participant
Date: Wed, 9 Jul 2025 09:24:46 +0000
Subject: [PATCH 05/12] Reads cache tokens from openai response

---
 src/strands/types/models/openai.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/strands/types/models/openai.py b/src/strands/types/models/openai.py
index 09d24bd80..ab8927392 100644
--- a/src/strands/types/models/openai.py
+++ b/src/strands/types/models/openai.py
@@ -260,6 +260,10 @@ def format_chunk(self, event: dict[str, Any]) -> StreamEvent:
                         "inputTokens": event["data"].prompt_tokens,
                         "outputTokens": event["data"].completion_tokens,
                         "totalTokens": event["data"].total_tokens,
+                        "cacheReadInputTokens": event["data"].prompt_tokens_details.cached_tokens,
+                        "cacheWriteInputTokens": getattr(
+                            event["data"], "cache_creation_input_tokens", 0
+                        ),  # litellm
                     },
                     "metrics": {
                         "latencyMs": 0,  # TODO

From 0efb76e68885af83f4c0524d66c78c06f8180ebd Mon Sep 17 00:00:00 2001
From: Workshop Participant
Date: Wed, 9 Jul 2025 09:25:19 +0000
Subject: [PATCH 06/12] Reads cache tokens from litellm response (openai, anthropic, bedrock)

---
 src/strands/models/litellm.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/strands/models/litellm.py b/src/strands/models/litellm.py
index 1536fc4d6..c1062c4ae 100644
--- a/src/strands/models/litellm.py
+++ b/src/strands/models/litellm.py
@@ -158,7 +158,15 @@ async def stream(self, request: dict[str, Any]) -> AsyncGenerator[dict[str, Any]
         for event in response:
             _ = event
 
-        yield {"chunk_type": "metadata", "data": event.usage}
+        usage = event.usage
+        cache_read = max(
+            getattr(usage, "cache_read_input_tokens", 0),
+            getattr(getattr(usage, "prompt_tokens_details", {}), "cached_tokens", 0),
+        )
+
+        usage.prompt_tokens_details.cached_tokens = cache_read
+
+        yield {"chunk_type": "metadata", "data": usage}
 
     @override
     async def structured_output(

From e5908c04484728ebc8af95a0b94d4c1a63bcdf09 Mon Sep 17 00:00:00 2001
From: Workshop Participant
Date: Wed, 9 Jul 2025 09:26:04 +0000
Subject: [PATCH 07/12] Fallback for model providers that do not support / expose cache token usage

---
 src/strands/models/llamaapi.py | 3 +++
 src/strands/models/mistral.py  | 3 +++
 src/strands/models/ollama.py   | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/src/strands/models/llamaapi.py b/src/strands/models/llamaapi.py
index 2b585439c..6d80af793 100644
--- a/src/strands/models/llamaapi.py
+++ b/src/strands/models/llamaapi.py
@@ -310,6 +310,9 @@ def format_chunk(self, event: dict[str, Any]) -> StreamEvent:
                 inputTokens=usage["inputTokens"],
                 outputTokens=usage["outputTokens"],
                 totalTokens=usage["totalTokens"],
+                # TODO does not seem to support caching as of July 2025
+                cacheWriteInputTokens=0,
+                cacheReadInputTokens=0,
             )
             return {
                 "metadata": {
diff --git a/src/strands/models/mistral.py b/src/strands/models/mistral.py
index 6f8492b79..7b52f0aa8 100644
--- a/src/strands/models/mistral.py
+++ b/src/strands/models/mistral.py
@@ -342,6 +342,9 @@ def format_chunk(self, event: dict[str, Any]) -> StreamEvent:
                     "inputTokens": usage.prompt_tokens,
                     "outputTokens": usage.completion_tokens,
                     "totalTokens": usage.total_tokens,
+                    # TODO does not seem to support caching as of July 2025
+                    "cacheWriteInputTokens": 0,
+                    "cacheReadInputTokens": 0,
                 },
                 "metrics": {
                     "latencyMs": event.get("latency_ms", 0),
diff --git a/src/strands/models/ollama.py b/src/strands/models/ollama.py
index 707672498..0a302a19d 100644
--- a/src/strands/models/ollama.py
+++ b/src/strands/models/ollama.py
@@ -272,6 +272,9 @@ def format_chunk(self, event: dict[str, Any]) -> StreamEvent:
                     "inputTokens": event["data"].eval_count,
                     "outputTokens": event["data"].prompt_eval_count,
                     "totalTokens": event["data"].eval_count + event["data"].prompt_eval_count,
+                    # TODO add cache metrics
+                    "cacheWriteInputTokens": 0,
+                    "cacheReadInputTokens": 0,
                 },
                 "metrics": {
                     "latencyMs": event["data"].total_duration / 1e6,

From 0a46646b0fc7bc52984e406b4d0ee56a1ce24228 Mon Sep 17 00:00:00 2001
From: Workshop Participant
Date: Wed, 9 Jul 2025 09:26:26 +0000
Subject: [PATCH 08/12] Reports cache token usage as metric

---
 src/strands/telemetry/metrics.py           | 25 ++++++++++++++++++----
 src/strands/telemetry/metrics_constants.py |  2 ++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/src/strands/telemetry/metrics.py b/src/strands/telemetry/metrics.py
index 332ab2ae3..3a2567083 100644
--- a/src/strands/telemetry/metrics.py
+++ b/src/strands/telemetry/metrics.py
@@ -168,7 +168,11 @@ class EventLoopMetrics:
     tool_metrics: Dict[str, ToolMetrics] = field(default_factory=dict)
     cycle_durations: List[float] = field(default_factory=list)
     traces: List[Trace] = field(default_factory=list)
-    accumulated_usage: Usage = field(default_factory=lambda: Usage(inputTokens=0, outputTokens=0, totalTokens=0))
+    accumulated_usage: Usage = field(
+        default_factory=lambda: Usage(
+            inputTokens=0, outputTokens=0, totalTokens=0, cacheReadInputTokens=0, cacheWriteInputTokens=0
+        )
+    )
     accumulated_metrics: Metrics = field(default_factory=lambda: Metrics(latencyMs=0))
 
     @property
@@ -263,6 +267,8 @@ def update_usage(self, usage: Usage) -> None:
         self.accumulated_usage["inputTokens"] += usage["inputTokens"]
         self.accumulated_usage["outputTokens"] += usage["outputTokens"]
         self.accumulated_usage["totalTokens"] += usage["totalTokens"]
+        self.accumulated_usage["cacheReadInputTokens"] += usage.get("cacheReadInputTokens", 0)
+        self.accumulated_usage["cacheWriteInputTokens"] += usage.get("cacheWriteInputTokens", 0)
 
     def update_metrics(self, metrics: Metrics) -> None:
         """Update the accumulated performance metrics with new metrics data.
@@ -320,15 +326,18 @@ def _metrics_summary_to_lines(event_loop_metrics: EventLoopMetrics, allowed_name
         An iterable of formatted text lines representing the metrics.
     """
     summary = event_loop_metrics.get_summary()
+    accumulated_usage = summary["accumulated_usage"]
     yield "Event Loop Metrics Summary:"
     yield (
         f"├─ Cycles: total={summary['total_cycles']}, avg_time={summary['average_cycle_time']:.3f}s, "
         f"total_time={summary['total_duration']:.3f}s"
     )
     yield (
-        f"├─ Tokens: in={summary['accumulated_usage']['inputTokens']}, "
-        f"out={summary['accumulated_usage']['outputTokens']}, "
-        f"total={summary['accumulated_usage']['totalTokens']}"
+        f"├─ Tokens: in={accumulated_usage['inputTokens']}"
+        f" (cache_write={accumulated_usage.get('cacheWriteInputTokens', 0)}), "
+        f"out={accumulated_usage['outputTokens']}, "
+        f"total={accumulated_usage['totalTokens']}"
+        f" (cache_read={accumulated_usage.get('cacheReadInputTokens', 0)})"
     )
     yield f"├─ Bedrock Latency: {summary['accumulated_metrics']['latencyMs']}ms"
@@ -421,6 +430,8 @@ class MetricsClient:
     event_loop_latency: Histogram
     event_loop_input_tokens: Histogram
    event_loop_output_tokens: Histogram
+    event_loop_input_tokens_cache_read: Histogram
+    event_loop_input_tokens_cache_write: Histogram
 
     tool_call_count: Counter
     tool_success_count: Counter
@@ -474,3 +485,9 @@ def create_instruments(self) -> None:
         self.event_loop_output_tokens = self.meter.create_histogram(
             name=constants.STRANDS_EVENT_LOOP_OUTPUT_TOKENS, unit="token"
         )
+        self.event_loop_input_tokens_cache_read = self.meter.create_histogram(
+            name=constants.STRANDS_EVENT_LOOP_INPUT_TOKEN_CACHE_READ, unit="token"
+        )
+        self.event_loop_input_tokens_cache_write = self.meter.create_histogram(
+            name=constants.STRANDS_EVENT_LOOP_INPUT_TOKENS_CACHE_WRITE, unit="token"
+        )
diff --git a/src/strands/telemetry/metrics_constants.py b/src/strands/telemetry/metrics_constants.py
index b622eebff..caae05098 100644
--- a/src/strands/telemetry/metrics_constants.py
+++ b/src/strands/telemetry/metrics_constants.py
@@ -13,3 +13,5 @@
 STRANDS_EVENT_LOOP_CYCLE_DURATION = "strands.event_loop.cycle_duration"
 STRANDS_EVENT_LOOP_INPUT_TOKENS = "strands.event_loop.input.tokens"
 STRANDS_EVENT_LOOP_OUTPUT_TOKENS = "strands.event_loop.output.tokens"
+STRANDS_EVENT_LOOP_INPUT_TOKEN_CACHE_READ = "strands.event_loop.input.tokens.cache.read"
+STRANDS_EVENT_LOOP_INPUT_TOKENS_CACHE_WRITE = "strands.event_loop.input.tokens.cache.write"

From 30d9d5da2f7a345907acad8ae9e5e08ccaa8152f Mon Sep 17 00:00:00 2001
From: Workshop Participant
Date: Wed, 9 Jul 2025 09:26:47 +0000
Subject: [PATCH 09/12] Prints cache token usage in tracing

---
 src/strands/telemetry/tracer.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git
a/src/strands/telemetry/tracer.py b/src/strands/telemetry/tracer.py
index 7f8abb1e6..6ed9ee937 100644
--- a/src/strands/telemetry/tracer.py
+++ b/src/strands/telemetry/tracer.py
@@ -257,6 +257,8 @@ def end_model_invoke_span(
         attributes: Dict[str, AttributeValue] = {
             "gen_ai.usage.prompt_tokens": usage["inputTokens"],
             "gen_ai.usage.input_tokens": usage["inputTokens"],
+            "gen_ai.usage.cache_read_input_tokens": usage.get("cacheReadInputTokens", 0),
+            "gen_ai.usage.cache_write_input_tokens": usage.get("cacheWriteInputTokens", 0),
             "gen_ai.usage.completion_tokens": usage["outputTokens"],
             "gen_ai.usage.output_tokens": usage["outputTokens"],
             "gen_ai.usage.total_tokens": usage["totalTokens"],
@@ -490,6 +492,8 @@ def end_agent_span(
                 "gen_ai.usage.input_tokens": accumulated_usage["inputTokens"],
                 "gen_ai.usage.output_tokens": accumulated_usage["outputTokens"],
                 "gen_ai.usage.total_tokens": accumulated_usage["totalTokens"],
+                "gen_ai.usage.cache_read_input_tokens": accumulated_usage["cacheReadInputTokens"],
+                "gen_ai.usage.cache_write_input_tokens": accumulated_usage["cacheWriteInputTokens"],
             }
         )
 

From 5a961e48d565af1128b68f0015ca37c1be95fb48 Mon Sep 17 00:00:00 2001
From: Workshop Participant
Date: Wed, 9 Jul 2025 09:43:09 +0000
Subject: [PATCH 10/12] Makes cache token metrics optional

---
 src/strands/types/event_loop.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/strands/types/event_loop.py b/src/strands/types/event_loop.py
index 99f154800..86d188c37 100644
--- a/src/strands/types/event_loop.py
+++ b/src/strands/types/event_loop.py
@@ -1,6 +1,6 @@
 """Event loop-related type definitions for the SDK."""
 
-from typing import Literal
+from typing import Literal, Optional
 
 from typing_extensions import TypedDict
 
@@ -19,8 +19,8 @@ class Usage(TypedDict, total=False):
     inputTokens: int
     outputTokens: int
     totalTokens: int
-    cacheReadInputTokens: int
-    cacheWriteInputTokens: int
+    cacheReadInputTokens: Optional[int]
+    cacheWriteInputTokens: Optional[int]
 
 
 class Metrics(TypedDict):

From 219b2276b9c2111a3a86fcf5a9b935612cf363ca Mon Sep 17 00:00:00 2001
From: Workshop Participant
Date: Wed, 9 Jul 2025 10:00:53 +0000
Subject: [PATCH 11/12] Removes optional cache metric

---
 src/strands/telemetry/tracer.py | 4 ++--
 src/strands/types/event_loop.py | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/strands/telemetry/tracer.py b/src/strands/telemetry/tracer.py
index 6ed9ee937..bf10291a6 100644
--- a/src/strands/telemetry/tracer.py
+++ b/src/strands/telemetry/tracer.py
@@ -492,8 +492,8 @@ def end_agent_span(
                 "gen_ai.usage.input_tokens": accumulated_usage["inputTokens"],
                 "gen_ai.usage.output_tokens": accumulated_usage["outputTokens"],
                 "gen_ai.usage.total_tokens": accumulated_usage["totalTokens"],
-                "gen_ai.usage.cache_read_input_tokens": accumulated_usage["cacheReadInputTokens"],
-                "gen_ai.usage.cache_write_input_tokens": accumulated_usage["cacheWriteInputTokens"],
+                "gen_ai.usage.cache_read_input_tokens": accumulated_usage.get("cacheReadInputTokens", 0),
+                "gen_ai.usage.cache_write_input_tokens": accumulated_usage.get("cacheWriteInputTokens", 0),
             }
         )
 
diff --git a/src/strands/types/event_loop.py b/src/strands/types/event_loop.py
index 86d188c37..99f154800 100644
--- a/src/strands/types/event_loop.py
+++ b/src/strands/types/event_loop.py
@@ -1,6 +1,6 @@
 """Event loop-related type definitions for the SDK."""
 
-from typing import Literal, Optional
+from typing import Literal
 
 from typing_extensions import TypedDict
 
@@ -19,8 +19,8 @@ class Usage(TypedDict, total=False):
     inputTokens: int
     outputTokens: int
     totalTokens: int
-    cacheReadInputTokens: Optional[int]
-    cacheWriteInputTokens: Optional[int]
+    cacheReadInputTokens: int
+    cacheWriteInputTokens: int
 
 
 class Metrics(TypedDict):

From 3a36705716ffba7f051703402b2e95a439757b04 Mon Sep 17 00:00:00 2001
From: Workshop Participant
Date: Mon, 14 Jul 2025 09:19:25 +0000
Subject: [PATCH 12/12] Fix: openai cache tokens metric after format_chunk changes

---
 src/strands/models/openai.py        | 2 ++
 tests/strands/models/test_openai.py | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/strands/models/openai.py b/src/strands/models/openai.py
index 6374590b9..28edc532b 100644
--- a/src/strands/models/openai.py
+++ b/src/strands/models/openai.py
@@ -310,6 +310,8 @@ def format_chunk(self, event: dict[str, Any]) -> StreamEvent:
                         "inputTokens": event["data"].prompt_tokens,
                         "outputTokens": event["data"].completion_tokens,
                         "totalTokens": event["data"].total_tokens,
+                        "cacheReadInputTokens": event["data"].prompt_tokens_details.cached_tokens,
+                        "cacheWriteInputTokens": 0,  # OpenAI does not return cache write information
                     },
                     "metrics": {
                         "latencyMs": 0,  # TODO
diff --git a/tests/strands/models/test_openai.py b/tests/strands/models/test_openai.py
index 0a095ab9d..2ce1f66d6 100644
--- a/tests/strands/models/test_openai.py
+++ b/tests/strands/models/test_openai.py
@@ -352,7 +352,7 @@ def test_format_request(model, messages, tool_specs, system_prompt):
         (
             {
                 "chunk_type": "metadata",
-                "data": unittest.mock.Mock(prompt_tokens=100, completion_tokens=50, total_tokens=150),
+                "data": unittest.mock.Mock(prompt_tokens=100, completion_tokens=50, total_tokens=150, prompt_tokens_details=unittest.mock.Mock(cached_tokens=40)),
             },
             {
                 "metadata": {
                     "usage": {
                         "inputTokens": 100,
                         "outputTokens": 50,
                         "totalTokens": 150,
+                        "cacheReadInputTokens": 40,
+                        "cacheWriteInputTokens": 0
                     },
                     "metrics": {
                         "latencyMs": 0,