From 0521ec24d712a0ef3615b47767d807a5291dc36d Mon Sep 17 00:00:00 2001
From: Sanny Sanoff
Date: Mon, 29 Dec 2025 20:51:20 +0100
Subject: [PATCH 1/2] feat: Add token processing and generation speed to usage
 report
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add --show-speed flag to enable speed display (disabled by default)
- Track LLM response time (llm_elapsed) after streaming completes
- Track time to first token (TtFT) for streaming responses
- Calculate and display prompt processing speed (tokens/sec)
- Calculate and display token generation speed (output tokens/sec)

Output with --show-speed:

  LLM elapsed time: X.XX seconds (TtFT: X.XXs)
  Speed: XXX prompt tokens/sec, XXX output tokens/sec

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
 aider/args.py              |  6 +++
 aider/coders/base_coder.py | 76 +++++++++++++++++++++++++++++++++++---
 2 files changed, 76 insertions(+), 6 deletions(-)

diff --git a/aider/args.py b/aider/args.py
index 41d883249ad..2625406839c 100644
--- a/aider/args.py
+++ b/aider/args.py
@@ -486,6 +486,12 @@ def get_parser(default_config_files, git_root):
         default="#0088ff",
         help="Set the color for assistant output (default: #0088ff)",
     )
+    group.add_argument(
+        "--show-speed",
+        action="store_true",
+        help="Show token processing and generation speed in usage report (default: False)",
+        default=False,
+    )
     group.add_argument(
         "--completion-menu-color",
         metavar="COLOR",
diff --git a/aider/coders/base_coder.py b/aider/coders/base_coder.py
index fa0a3f302bc..28500581949 100755
--- a/aider/coders/base_coder.py
+++ b/aider/coders/base_coder.py
@@ -2969,6 +2969,7 @@ async def send(self, messages, model=None, functions=None, tools=None):
         self.partial_response_function_call = dict()

         completion = None
+        llm_start_time = time.time()

         try:
             hash_object, completion = await model.send_completion(
@@ -2982,19 +2983,21 @@ async def send(self, messages, model=None, functions=None, tools=None):
             self.chat_completion_call_hashes.append(hash_object.hexdigest())

             if not isinstance(completion, ModelResponse):
-                async for chunk in self.show_send_output_stream(completion):
+                async for chunk in self.show_send_output_stream(completion, llm_start_time):
                     yield chunk
             else:
                 self.show_send_output(completion)

-            # Calculate costs for successful responses
-            self.calculate_and_show_tokens_and_cost(messages, completion)
+            # Calculate costs after streaming completes (llm_elapsed set by show_send_output_stream)
+            llm_elapsed = getattr(self, 'llm_elapsed', time.time() - llm_start_time)
+            self.calculate_and_show_tokens_and_cost(messages, completion, llm_elapsed)
         except LiteLLMExceptions().exceptions_tuple() as err:
             ex_info = LiteLLMExceptions().get_ex_info(err)
             if ex_info.name == "ContextWindowExceededError":
                 # Still calculate costs for context window errors
-                self.calculate_and_show_tokens_and_cost(messages, completion)
+                llm_elapsed = time.time() - llm_start_time
+                self.calculate_and_show_tokens_and_cost(messages, completion, llm_elapsed)
             raise
         except KeyboardInterrupt as kbi:
             self.keyboard_interrupt()
@@ -3056,8 +3059,10 @@ def show_send_output(self, completion):
         ):
             raise FinishReasonLength()

-    async def show_send_output_stream(self, completion):
+    async def show_send_output_stream(self, completion, llm_start_time=None):
         received_content = False
+        first_token_time = None
+        start_time = llm_start_time or time.time()

         async for chunk in completion:
             if self.args.debug:
@@ -3084,6 +3089,8 @@ async def show_send_output_stream(self, completion):
             try:
                 if chunk.choices[0].delta.tool_calls:
                     received_content = True
+                    if first_token_time is None:
+                        first_token_time = time.time()

                     for tool_call_chunk in chunk.choices[0].delta.tool_calls:
                         self.tool_reflection = True
@@ -3111,6 +3118,8 @@ async def show_send_output_stream(self, completion):
                             self.io.update_spinner_suffix(v)

                 received_content = True
+                if first_token_time is None:
+                    first_token_time = time.time()
             except AttributeError:
                 pass
@@ -3130,6 +3139,8 @@ async def show_send_output_stream(self, completion):
                     text += reasoning_content
                     self.got_reasoning_content = True
                     received_content = True
+                    if first_token_time is None:
+                        first_token_time = time.time()
                     self.io.update_spinner_suffix(reasoning_content)

                     self.partial_response_reasoning_content += reasoning_content
@@ -3142,6 +3153,8 @@ async def show_send_output_stream(self, completion):
                     text += content
                     received_content = True
+                    if first_token_time is None:
+                        first_token_time = time.time()
                     self.io.update_spinner_suffix(content)
             except AttributeError:
                 pass
@@ -3173,6 +3186,11 @@ async def show_send_output_stream(self, completion):
         if not received_content and len(self.partial_response_tool_calls) == 0:
             self.io.tool_warning("Empty response received from LLM. Check your provider account?")

+        # Set timing info for reporting (after streaming completes)
+        self.llm_elapsed = time.time() - start_time
+        if first_token_time is not None:
+            self.first_token_time = first_token_time - start_time
+
     def consolidate_chunks(self):
         response = (
             self.partial_response_chunks[0]
@@ -3333,7 +3351,7 @@ def remove_reasoning_content(self):
             self.reasoning_tag_name,
         )

-    def calculate_and_show_tokens_and_cost(self, messages, completion=None):
+    def calculate_and_show_tokens_and_cost(self, messages, completion=None, llm_elapsed=None):
         prompt_tokens = 0
         completion_tokens = 0
         cache_hit_tokens = 0
@@ -3372,6 +3390,8 @@ def calculate_and_show_tokens_and_cost(self, messages, completion=None):

         if not self.main_model.info.get("input_cost_per_token"):
             self.usage_report = tokens_report
+            # Still add speed info even without cost
+            self._add_speed_info(llm_elapsed)
             return

         try:
@@ -3400,6 +3420,50 @@ def calculate_and_show_tokens_and_cost(self, messages, completion=None):

         self.usage_report = tokens_report + sep + cost_report

+        # Add LLM elapsed time and speed information
+        self._add_speed_info(llm_elapsed)
+
+    def _add_speed_info(self, llm_elapsed):
+        """Add LLM elapsed time and speed information to usage report."""
+        if llm_elapsed is None or self.usage_report is None:
+            return
+
+        # Check if speed display is enabled
+        show_speed = getattr(self.args, 'show_speed', False) if self.args else False
+
+        if not show_speed:
+            return
+
+        time_report = f"\nLLM elapsed time: {llm_elapsed:.2f} seconds"
+        # Add time to first token if available
+        if hasattr(self, 'first_token_time'):
+            time_report += f" (TtFT: {self.first_token_time:.2f}s)"
+
+        # Add processing and generation speeds if we have the data
+        if hasattr(self, 'message_tokens_sent') and hasattr(self, 'message_tokens_received'):
+            sent_tokens = self.message_tokens_sent
+            received_tokens = self.message_tokens_received
+            if sent_tokens > 0 and received_tokens > 0:
+                # Calculate prompt processing speed (tokens/sec) based on time to first token
+                if hasattr(self, 'first_token_time') and self.first_token_time > 0:
+                    prompt_processing_speed = sent_tokens / self.first_token_time
+                    time_report += f"\nSpeed: {prompt_processing_speed:.0f} prompt tokens/sec"
+
+                # Calculate token generation speed based on time after first token
+                if hasattr(self, 'first_token_time') and self.first_token_time > 0:
+                    generation_time = llm_elapsed - self.first_token_time
+                    if generation_time > 0:
+                        token_generation_speed = received_tokens / generation_time
+                        time_report += f", {token_generation_speed:.0f} output tokens/sec"
+                else:
+                    token_generation_speed = received_tokens / llm_elapsed
+                    if hasattr(self, 'first_token_time') and self.first_token_time > 0:
+                        time_report += f", {token_generation_speed:.0f} output tokens/sec"
+                    else:
+                        time_report += f"\nSpeed: {token_generation_speed:.0f} output tokens/sec"
+
+        self.usage_report += time_report
+
     def format_cost(self, value):
         if value == 0:
             return "0.00"

From f606d04f59babab15d4c20394354f594cadb3c34 Mon Sep 17 00:00:00 2001
From: Sanny Sanoff
Date: Wed, 31 Dec 2025 16:11:24 +0100
Subject: [PATCH 2/2] minimize the complexity of coder changes.

---
 aider/coders/base_coder.py |  85 ++++----------------
 aider/helpers/profiler.py  | 162 +++++++++++++++++++++++++++++++++++++
 2 files changed, 177 insertions(+), 70 deletions(-)
 create mode 100644 aider/helpers/profiler.py

diff --git a/aider/coders/base_coder.py b/aider/coders/base_coder.py
index 28500581949..fbc41456cf3 100755
--- a/aider/coders/base_coder.py
+++ b/aider/coders/base_coder.py
@@ -39,6 +39,7 @@
 from aider.commands import Commands, SwitchCoder
 from aider.exceptions import LiteLLMExceptions
 from aider.helpers import coroutines
+from aider.helpers.profiler import TokenProfiler
 from aider.history import ChatSummary
 from aider.io import ConfirmGroup, InputOutput
 from aider.linter import Linter
@@ -365,6 +366,7 @@ def __init__(

         self.message_tokens_sent = 0
         self.message_tokens_received = 0
+        self.profiler = TokenProfiler(enable_printing=getattr(args, 'show_speed', False) if args else False)
         self.verbose = verbose
         self.abs_fnames = set()
         self.abs_read_only_fnames = set()
@@ -2969,7 +2971,7 @@ async def send(self, messages, model=None, functions=None, tools=None):
         self.partial_response_function_call = dict()

         completion = None
-        llm_start_time = time.time()
+        self.profiler.start()

         try:
             hash_object, completion = await model.send_completion(
@@ -2983,21 +2985,20 @@ async def send(self, messages, model=None, functions=None, tools=None):
             self.chat_completion_call_hashes.append(hash_object.hexdigest())

             if not isinstance(completion, ModelResponse):
-                async for chunk in self.show_send_output_stream(completion, llm_start_time):
+                async for chunk in self.show_send_output_stream(completion):
                     yield chunk
             else:
                 self.show_send_output(completion)

-            # Calculate costs after streaming completes (llm_elapsed set by show_send_output_stream)
-            llm_elapsed = getattr(self, 'llm_elapsed', time.time() - llm_start_time)
-            self.calculate_and_show_tokens_and_cost(messages, completion, llm_elapsed)
+            # Calculate costs for successful responses
+            self.calculate_and_show_tokens_and_cost(messages, completion)
         except LiteLLMExceptions().exceptions_tuple() as err:
             ex_info = LiteLLMExceptions().get_ex_info(err)
             if ex_info.name == "ContextWindowExceededError":
                 # Still calculate costs for context window errors
-                llm_elapsed = time.time() - llm_start_time
-                self.calculate_and_show_tokens_and_cost(messages, completion, llm_elapsed)
+                self.profiler.on_error()
+                self.calculate_and_show_tokens_and_cost(messages, completion)
             raise
         except KeyboardInterrupt as kbi:
             self.keyboard_interrupt()
@@ -3059,10 +3060,8 @@ def show_send_output(self, completion):
         ):
             raise FinishReasonLength()

-    async def show_send_output_stream(self, completion, llm_start_time=None):
+    async def show_send_output_stream(self, completion):
         received_content = False
-        first_token_time = None
-        start_time = llm_start_time or time.time()

         async for chunk in completion:
             if self.args.debug:
@@ -3089,8 +3088,7 @@ async def show_send_output_stream(self, completion, llm_start_time=None):
             try:
                 if chunk.choices[0].delta.tool_calls:
                     received_content = True
-                    if first_token_time is None:
-                        first_token_time = time.time()
+                    self.profiler.on_token()

                     for tool_call_chunk in chunk.choices[0].delta.tool_calls:
                         self.tool_reflection = True
@@ -3118,8 +3116,7 @@ async def show_send_output_stream(self, completion, llm_start_time=None):
                             self.io.update_spinner_suffix(v)

                 received_content = True
-                if first_token_time is None:
-                    first_token_time = time.time()
+                self.profiler.on_token()
             except AttributeError:
                 pass
@@ -3139,8 +3136,7 @@ async def show_send_output_stream(self, completion, llm_start_time=None):
                     text += reasoning_content
                     self.got_reasoning_content = True
                     received_content = True
-                    if first_token_time is None:
-                        first_token_time = time.time()
+                    self.profiler.on_token()
                     self.io.update_spinner_suffix(reasoning_content)

                     self.partial_response_reasoning_content += reasoning_content
@@ -3153,8 +3149,7 @@ async def show_send_output_stream(self, completion, llm_start_time=None):
                     text += content
                     received_content = True
-                    if first_token_time is None:
-                        first_token_time = time.time()
+                    self.profiler.on_token()
                     self.io.update_spinner_suffix(content)
             except AttributeError:
                 pass
@@ -3186,11 +3181,6 @@ async def show_send_output_stream(self, completion, llm_start_time=None):
         if not received_content and len(self.partial_response_tool_calls) == 0:
             self.io.tool_warning("Empty response received from LLM. Check your provider account?")

-        # Set timing info for reporting (after streaming completes)
-        self.llm_elapsed = time.time() - start_time
-        if first_token_time is not None:
-            self.first_token_time = first_token_time - start_time
-
     def consolidate_chunks(self):
         response = (
             self.partial_response_chunks[0]
@@ -3351,7 +3341,7 @@ def remove_reasoning_content(self):
             self.reasoning_tag_name,
         )

-    def calculate_and_show_tokens_and_cost(self, messages, completion=None, llm_elapsed=None):
+    def calculate_and_show_tokens_and_cost(self, messages, completion=None):
         prompt_tokens = 0
         completion_tokens = 0
         cache_hit_tokens = 0
@@ -3387,11 +3377,10 @@ def calculate_and_show_tokens_and_cost(self, messages, completion=None, llm_elap
         if cache_hit_tokens:
             tokens_report += f", {format_tokens(cache_hit_tokens)} cache hit"
         tokens_report += f", {format_tokens(self.message_tokens_received)} received."
+        tokens_report = self.profiler.add_to_usage_report(tokens_report, self.message_tokens_sent, self.message_tokens_received)

         if not self.main_model.info.get("input_cost_per_token"):
             self.usage_report = tokens_report
-            # Still add speed info even without cost
-            self._add_speed_info(llm_elapsed)
             return

         try:
@@ -3420,50 +3409,6 @@ def calculate_and_show_tokens_and_cost(self, messages, completion=None, llm_elap

         self.usage_report = tokens_report + sep + cost_report

-        # Add LLM elapsed time and speed information
-        self._add_speed_info(llm_elapsed)
-
-    def _add_speed_info(self, llm_elapsed):
-        """Add LLM elapsed time and speed information to usage report."""
-        if llm_elapsed is None or self.usage_report is None:
-            return
-
-        # Check if speed display is enabled
-        show_speed = getattr(self.args, 'show_speed', False) if self.args else False
-
-        if not show_speed:
-            return
-
-        time_report = f"\nLLM elapsed time: {llm_elapsed:.2f} seconds"
-        # Add time to first token if available
-        if hasattr(self, 'first_token_time'):
-            time_report += f" (TtFT: {self.first_token_time:.2f}s)"
-
-        # Add processing and generation speeds if we have the data
-        if hasattr(self, 'message_tokens_sent') and hasattr(self, 'message_tokens_received'):
-            sent_tokens = self.message_tokens_sent
-            received_tokens = self.message_tokens_received
-            if sent_tokens > 0 and received_tokens > 0:
-                # Calculate prompt processing speed (tokens/sec) based on time to first token
-                if hasattr(self, 'first_token_time') and self.first_token_time > 0:
-                    prompt_processing_speed = sent_tokens / self.first_token_time
-                    time_report += f"\nSpeed: {prompt_processing_speed:.0f} prompt tokens/sec"
-
-                # Calculate token generation speed based on time after first token
-                if hasattr(self, 'first_token_time') and self.first_token_time > 0:
-                    generation_time = llm_elapsed - self.first_token_time
-                    if generation_time > 0:
-                        token_generation_speed = received_tokens / generation_time
-                        time_report += f", {token_generation_speed:.0f} output tokens/sec"
-                else:
-                    token_generation_speed = received_tokens / llm_elapsed
-                    if hasattr(self, 'first_token_time') and self.first_token_time > 0:
-                        time_report += f", {token_generation_speed:.0f} output tokens/sec"
-                    else:
-                        time_report += f"\nSpeed: {token_generation_speed:.0f} output tokens/sec"
-
-        self.usage_report += time_report
-
     def format_cost(self, value):
         if value == 0:
             return "0.00"
diff --git a/aider/helpers/profiler.py b/aider/helpers/profiler.py
new file mode 100644
index 00000000000..ddd62f8e574
--- /dev/null
+++ b/aider/helpers/profiler.py
@@ -0,0 +1,162 @@
+"""Token profiler for tracking and reporting LLM token timing metrics."""
+
+import time
+from typing import Optional
+
+
+class TokenProfiler:
+    """
+    A profiler for tracking LLM token timing metrics with minimal interface.
+
+    Handles all timing logic internally - just need to:
+    1. Create with enable_printing flag
+    2. Call start() when starting LLM request
+    3. Call on_token() for each token received (auto-detects first token)
+    4. Call set_token_counts() with input/output token counts
+    5. Call get_report() to get formatted report (only if enabled)
+    6. Call on_error() for error cases
+    """
+
+    def __init__(self, enable_printing: bool = False):
+        """
+        Initialize the token profiler.
+
+        Args:
+            enable_printing: If True, generate reports when get_report() is called
+        """
+        self._enabled = enable_printing
+        self._start_time: Optional[float] = None
+        self._first_token_time: Optional[float] = None
+        self._end_time: Optional[float] = None
+        self._input_tokens: int = 0
+        self._output_tokens: int = 0
+        self._has_seen_first_token: bool = False
+
+    def start(self) -> None:
+        """Start timing an LLM request."""
+        self._start_time = time.time()
+        self._first_token_time = None
+        self._end_time = None
+        self._has_seen_first_token = False
+        self._input_tokens = 0
+        self._output_tokens = 0
+
+    def on_token(self) -> None:
+        """
+        Record that a token was received.
+        Auto-detects if this is the first token.
+        """
+        if not self._enabled or not self._start_time:
+            return
+
+        if not self._has_seen_first_token:
+            self._first_token_time = time.time()
+            self._has_seen_first_token = True
+
+    def set_token_counts(self, input_tokens: int, output_tokens: int) -> None:
+        """
+        Set the token counts for the request.
+
+        Args:
+            input_tokens: Number of input/prompt tokens
+            output_tokens: Number of output/generated tokens
+        """
+        if not self._enabled:
+            return
+
+        self._input_tokens = input_tokens
+        self._output_tokens = output_tokens
+
+    def on_error(self) -> None:
+        """Handle error case - finalize timing."""
+        if not self._enabled or not self._start_time:
+            return
+
+        if self._end_time is None:
+            self._end_time = time.time()
+
+    def get_report(self) -> Optional[str]:
+        """
+        Get the formatted speed report (only if enabled).
+
+        Returns:
+            Formatted report string, or None if disabled or no data
+        """
+        if not self._enabled or not self._start_time:
+            return None
+
+        # Calculate elapsed time
+        if self._end_time is not None:
+            elapsed = self._end_time - self._start_time
+        else:
+            elapsed = time.time() - self._start_time
+
+        # Build the time report
+        report = f"\nLLM elapsed time: {elapsed:.2f} seconds"
+
+        # Time to first token; stays None if no token was ever recorded
+        ttft = self._first_token_time - self._start_time if self._first_token_time is not None else None
+        if ttft is not None:
+            report += f" (TtFT: {ttft:.2f}s)"
+
+        # Add speed information if we have token data and saw a first token
+        if self._input_tokens > 0 and self._output_tokens > 0 and ttft is not None:
+            speed_parts = []
+
+            # Prompt processing speed (based on time to first token)
+            if ttft > 0:
+                prompt_speed = self._input_tokens / ttft
+                speed_parts.append(f"{prompt_speed:.0f} prompt tokens/sec")
+
+            # Token generation speed (based on time after first token)
+            generation_time = elapsed - ttft
+            if generation_time > 0:
+                generation_speed = self._output_tokens / generation_time
+                speed_parts.append(f"{generation_speed:.0f} output tokens/sec")
+
+            if speed_parts:
+                report += "\nSpeed: " + ", ".join(speed_parts)
+
+        return report
+
+    def get_elapsed(self) -> Optional[float]:
+        """
+        Get the elapsed time for the current request.
+
+        Returns:
+            Elapsed time in seconds, or None if not started
+        """
+        if not self._start_time:
+            return None
+
+        if self._end_time is not None:
+            return self._end_time - self._start_time
+
+        return time.time() - self._start_time
+
+    def add_to_usage_report(self, usage_report: Optional[str], input_tokens: int = 0, output_tokens: int = 0) -> Optional[str]:
+        """
+        Add speed report to usage_report and return the combined string.
+
+        Args:
+            usage_report: The existing usage report string
+            input_tokens: Number of input/prompt tokens (optional, updates if provided)
+            output_tokens: Number of output/generated tokens (optional, updates if provided)
+
+        Returns:
+            The usage report with speed info appended (if enabled), or original if disabled
+        """
+        if not usage_report:
+            return usage_report
+
+        # Update token counts if provided
+        if input_tokens > 0 or output_tokens > 0:
+            self.set_token_counts(input_tokens, output_tokens)
+
+        speed_report = self.get_report()
+        if speed_report:
+            return usage_report + speed_report
+
+        return usage_report
+
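
Usage sketch (not part of the patches): how the TokenProfiler added in
PATCH 2/2 is meant to be driven, following the six-step protocol in its
class docstring. The chunk loop and token counts below are illustrative
stand-ins, not aider code, and the example assumes the patch is applied so
that aider.helpers.profiler is importable:

    import asyncio

    from aider.helpers.profiler import TokenProfiler


    async def demo():
        profiler = TokenProfiler(enable_printing=True)  # step 1: enable reporting
        profiler.start()                                # step 2: LLM request begins

        for _ in range(3):                              # stand-in for streamed chunks
            await asyncio.sleep(0.1)
            profiler.on_token()                         # step 3: first call records TtFT

        # step 4: counts normally come from the provider's usage data
        profiler.set_token_counts(input_tokens=1200, output_tokens=300)

        print(profiler.get_report())                    # step 5: formatted speed report


    asyncio.run(demo())

The same report can also be appended to an existing usage line via
add_to_usage_report(), which is how base_coder.py wires it into
calculate_and_show_tokens_and_cost().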