From b87a1cd583049e5b0696e43f2ebaffc10b3a3c7f Mon Sep 17 00:00:00 2001 From: Pavloveuge Date: Sun, 4 May 2025 05:10:23 +0300 Subject: [PATCH 01/12] add new metrics reporting --- src/utils/metrics.py | 171 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) diff --git a/src/utils/metrics.py b/src/utils/metrics.py index ecb044d4..a593f70a 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -29,6 +29,7 @@ from typing import Dict, List, Union import triton_python_backend_utils as pb_utils + from vllm.config import VllmConfig from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase from vllm.engine.metrics import Stats as VllmStats @@ -49,6 +50,16 @@ def __init__(self, labels: List[str], max_model_len: int): description="Number of generation tokens processed.", kind=pb_utils.MetricFamily.COUNTER, ) + self.counter_preemption_tokens_family = pb_utils.MetricFamily( + name="vllm:num_preemptions_total", + description="Number of preemption tokens processed.", + kind=pb_utils.MetricFamily.COUNTER, + ) + self.histogram_iteration_tokens_total_family = pb_utils.MetricFamily( + name="vllm:iteration_tokens_total", + description="Histogram of number of tokens per engine_step.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) self.histogram_time_to_first_token_family = pb_utils.MetricFamily( name="vllm:time_to_first_token_seconds", description="Histogram of time to first token in seconds.", @@ -66,6 +77,26 @@ def __init__(self, labels: List[str], max_model_len: int): description="Histogram of end to end request latency in seconds.", kind=pb_utils.MetricFamily.HISTOGRAM, ) + self.histogram_queue_time_request_family = pb_utils.MetricFamily( + name="vllm:request_queue_time_seconds", + description="Histogram of time spent in WAITING phase for request.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + self.histogram_inference_time_request_family = pb_utils.MetricFamily( + name="vllm:request_inference_time_seconds", + description="Histogram of time spent in RUNNING phase for request.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + self.histogram_prefill_time_request_family = pb_utils.MetricFamily( + name="vllm:request_prefill_time_seconds", + description="Histogram of time spent in PREFILL phase for request.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + self.histogram_decode_time_request_family = pb_utils.MetricFamily( + name="vllm:request_decode_time_seconds", + description="Histogram of time spent in DECODE phase for request.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) # Metadata self.histogram_num_prompt_tokens_request_family = pb_utils.MetricFamily( name="vllm:request_prompt_tokens", @@ -82,6 +113,45 @@ def __init__(self, labels: List[str], max_model_len: int): description="Histogram of the n request parameter.", kind=pb_utils.MetricFamily.HISTOGRAM, ) + # System stats + # Scheduler State + self.gauge_scheduler_running_family = pb_utils.MetricFamily( + name="vllm:num_requests_running", + description="Number of requests currently running on GPU.", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.gauge_scheduler_waiting_family = pb_utils.MetricFamily( + name="vllm:num_requests_waiting", + description="Number of requests waiting to be processed.", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.gauge_scheduler_swapped_family = pb_utils.MetricFamily( + name="vllm:num_requests_swapped", + description="Number of requests swapped to CPU.", + kind=pb_utils.MetricFamily.GAUGE, + ) + # KV Cache Usage in % + self.gauge_gpu_cache_usage_family = pb_utils.MetricFamily( + 
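+            # Fed from vLLM's per-iteration gpu_cache_usage_sys stat; reported as a fraction in [0, 1].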
name="vllm:gpu_cache_usage_perc", + description="GPU KV-cache usage. 1 means 100 percent usage.", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.gauge_cpu_cache_usage_family = pb_utils.MetricFamily( + name="vllm:cpu_cache_usage_perc", + description="CPU KV-cache usage. 1 means 100 percent usage.", + kind=pb_utils.MetricFamily.GAUGE, + ) + # Prefix caching block hit rate + self.gauge_cpu_prefix_cache_hit_rate_family = pb_utils.MetricFamily( + name="vllm:cpu_prefix_cache_hit_rate", + description="CPU prefix cache block hit rate.", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.gauge_gpu_prefix_cache_hit_rate_family = pb_utils.MetricFamily( + name="vllm:gpu_prefix_cache_hit_rate", + description="GPU prefix cache block hit rate.", + kind=pb_utils.MetricFamily.GAUGE, + ) # Initialize metrics # Iteration stats @@ -91,8 +161,19 @@ def __init__(self, labels: List[str], max_model_len: int): self.counter_generation_tokens = self.counter_generation_tokens_family.Metric( labels=labels ) + self.counter_preemption_tokens = self.counter_preemption_tokens_family.Metric( + labels=labels + ) + # Use the same bucket boundaries from vLLM sample metrics as an example. # https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96 + self.histogram_iteration_tokens_total = ( + self.histogram_iteration_tokens_total_family.Metric( + labels=labels, + buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096], + ) + ) + self.histogram_time_to_first_token = ( self.histogram_time_to_first_token_family.Metric( labels=labels, @@ -142,6 +223,30 @@ def __init__(self, labels: List[str], max_model_len: int): labels=labels, buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], ) + self.histogram_prefill_time_request = ( + self.histogram_prefill_time_request_family.Metric( + labels=labels, + buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], + ) + ) + self.histogram_decode_time_request = ( + self.histogram_decode_time_request_family.Metric( + labels=labels, + buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], + ) + ) + self.histogram_inference_time_request = ( + self.histogram_inference_time_request_family.Metric( + labels=labels, + buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], + ) + ) + self.histogram_queue_time_request = ( + self.histogram_queue_time_request_family.Metric( + labels=labels, + buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], + ) + ) # Metadata self.histogram_num_prompt_tokens_request = ( self.histogram_num_prompt_tokens_request_family.Metric( @@ -159,6 +264,31 @@ def __init__(self, labels: List[str], max_model_len: int): labels=labels, buckets=[1, 2, 5, 10, 20], ) + # System stats + # Scheduler State + self.gauge_num_requests_running = self.gauge_scheduler_running_family.Metric( + labels=labels + ) + self.gauge_num_requests_waiting = self.gauge_scheduler_waiting_family.Metric( + labels=labels + ) + self.gauge_num_requests_swapped = self.gauge_scheduler_swapped_family.Metric( + labels=labels + ) + # KV Cache Usage in % + self.gauge_gpu_cache_usage = self.gauge_gpu_cache_usage_family.Metric( + labels=labels + ) + self.gauge_cpu_cache_usage = self.gauge_cpu_cache_usage_family.Metric( + labels=labels + ) + # Prefix caching block hit rate + self.gauge_cpu_prefix_cache_hit_rate = ( + self.gauge_cpu_prefix_cache_hit_rate_family.Metric(labels=labels) + ) + self.gauge_gpu_prefix_cache_hit_rate = ( + self.gauge_gpu_prefix_cache_hit_rate_family.Metric(labels=labels) + ) class 
VllmStatLogger(VllmStatLoggerBase): @@ -182,6 +312,18 @@ def __init__(self, labels: Dict, vllm_config: VllmConfig, log_logger) -> None: def info(self, type: str, obj: SupportsMetricsInfo) -> None: pass + def _log_gauge(self, gauge, data: Union[int, float]) -> None: + """Convenience function for logging to gauge. + + Args: + gauge: A gauge metric instance. + data: An int or float to set as the current gauge value. + + Returns: + None + """ + self._logger_queue.put_nowait((gauge, "set", data)) + def _log_counter(self, counter, data: Union[int, float]) -> None: """Convenience function for logging to counter. @@ -222,8 +364,10 @@ def log(self, stats: VllmStats) -> None: counter_metrics = [ (self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter), (self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter), + (self.metrics.counter_preemption_tokens, stats.num_preemption_iter), ] histogram_metrics = [ + (self.metrics.histogram_iteration_tokens_total, [stats.num_tokens_iter]), ( self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter, @@ -233,6 +377,13 @@ def log(self, stats: VllmStats) -> None: stats.time_per_output_tokens_iter, ), (self.metrics.histogram_e2e_time_request, stats.time_e2e_requests), + (self.metrics.histogram_prefill_time_request, stats.time_prefill_requests), + (self.metrics.histogram_decode_time_request, stats.time_decode_requests), + ( + self.metrics.histogram_inference_time_request, + stats.time_inference_requests, + ), + (self.metrics.histogram_queue_time_request, stats.time_queue_requests), ( self.metrics.histogram_num_prompt_tokens_request, stats.num_prompt_tokens_requests, @@ -243,10 +394,27 @@ def log(self, stats: VllmStats) -> None: ), (self.metrics.histogram_n_request, stats.n_requests), ] + gauge_metrics = [ + (self.metrics.gauge_num_requests_running, stats.num_running_sys), + (self.metrics.gauge_num_requests_waiting, stats.num_waiting_sys), + (self.metrics.gauge_num_requests_swapped, stats.num_swapped_sys), + (self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys), + (self.metrics.gauge_cpu_cache_usage, stats.cpu_cache_usage_sys), + ( + self.metrics.gauge_cpu_prefix_cache_hit_rate, + stats.cpu_prefix_cache_hit_rate, + ), + ( + self.metrics.gauge_gpu_prefix_cache_hit_rate, + stats.gpu_prefix_cache_hit_rate, + ), + ] for metric, data in counter_metrics: self._log_counter(metric, data) for metric, data in histogram_metrics: self._log_histogram(metric, data) + for metric, data in gauge_metrics: + self._log_gauge(metric, data) def logger_loop(self): while True: @@ -259,6 +427,8 @@ def logger_loop(self): metric.increment(data) elif command == "observe": metric.observe(data) + elif command == "set": + metric.set(data) else: self.log_logger.log_error(f"Undefined command name: {command}") @@ -268,3 +438,4 @@ def finalize(self): if self._logger_thread is not None: self._logger_thread.join() self._logger_thread = None + self._logger_thread = None From 1a1b66e71812b2b9a09f49dd057dc55067de2adb Mon Sep 17 00:00:00 2001 From: Pavloveuge Date: Sun, 4 May 2025 05:24:31 +0300 Subject: [PATCH 02/12] update readme --- README.md | 85 ++++++++++++++++++++++++++++++++++++++++++++ src/utils/metrics.py | 2 -- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b88f9ec9..6be683b2 100644 --- a/README.md +++ b/README.md @@ -227,12 +227,24 @@ VLLM stats are reported by the metrics endpoint in fields that are prefixed with counter_prompt_tokens # Number of generation tokens processed. 
counter_generation_tokens +# Number of preemption tokens processed. +counter_preemption_tokens +# Histogram of number of tokens per engine_step. +histogram_iteration_tokens # Histogram of time to first token in seconds. histogram_time_to_first_token # Histogram of time per output token in seconds. histogram_time_per_output_token # Histogram of end to end request latency in seconds. histogram_e2e_time_request +# Histogram of time spent in WAITING phase for request. +histogram_queue_time_request +# Histogram of time spent in RUNNING phase for request. +histogram_inference_time_request +# Histogram of time spent in PREFILL phase for request. +histogram_prefill_time_request +# Histogram of time spent in DECODE phase for request. +histogram_decode_time_request # Number of prefill tokens processed. histogram_num_prompt_tokens_request # Number of generation tokens processed. @@ -241,6 +253,20 @@ histogram_num_generation_tokens_request histogram_best_of_request # Histogram of the n request parameter. histogram_n_request +# Number of requests currently running on GPU. +gauge_scheduler_running +# Number of requests waiting to be processed. +gauge_scheduler_waiting +# Number of requests swapped to CPU. +gauge_scheduler_swapped +# GPU KV-cache usage. 1 means 100 percent usage. +gauge_gpu_cache_usage +# CPU KV-cache usage. 1 means 100 percent usage. +gauge_cpu_cache_usage +# CPU prefix cache block hit rate. +gauge_cpu_prefix_cache_hit_rate +# GPU prefix cache block hit rate. +gauge_gpu_prefix_cache_hit_rate ``` Your output for these fields should look similar to the following: ```bash @@ -250,6 +276,37 @@ vllm:prompt_tokens_total{model="vllm_model",version="1"} 10 # HELP vllm:generation_tokens_total Number of generation tokens processed. # TYPE vllm:generation_tokens_total counter vllm:generation_tokens_total{model="vllm_model",version="1"} 16 +# HELP vllm:num_preemptions_total Number of preemption tokens processed. +# TYPE vllm:num_preemptions_total counter +vllm:num_preemptions_total{model="vllm_model",version="1"} 0 +# HELP vllm:num_requests_running Number of requests currently running on GPU. +# TYPE vllm:num_requests_running gauge +vllm:num_requests_running{model="vllm_model",version="1"} 0 +# HELP vllm:num_requests_waiting Number of requests waiting to be processed. +# TYPE vllm:num_requests_waiting gauge +vllm:num_requests_waiting{model="vllm_model",version="1"} 0 +# HELP vllm:num_requests_swapped Number of requests swapped to CPU. +# TYPE vllm:num_requests_swapped gauge +vllm:num_requests_swapped{model="vllm_model",version="1"} 0 +# HELP vllm:gpu_cache_usage_perc Gauga of gpu cache usage. 1 means 100 percent usage. +# TYPE vllm:gpu_cache_usage_perc gauge +vllm:gpu_cache_usage_perc{model="vllm_model",version="1"} 0 +# HELP vllm:cpu_cache_usage_perc Gauga of cpu cache usage. 1 means 100 percent usage. +# TYPE vllm:cpu_cache_usage_perc gauge +vllm:cpu_cache_usage_perc{model="vllm_model",version="1"} 0 +# HELP vllm:cpu_prefix_cache_hit_rate CPU prefix cache block hit rate. +# TYPE vllm:cpu_prefix_cache_hit_rate gauge +vllm:cpu_prefix_cache_hit_rate{model="vllm_model",version="1"} -1 +# HELP vllm:gpu_prefix_cache_hit_rate GPU prefix cache block hit rate. +# TYPE vllm:gpu_prefix_cache_hit_rate gauge +vllm:gpu_prefix_cache_hit_rate{model="vllm_model",version="1"} -1 +# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step. 
+# TYPE vllm:iteration_tokens_total histogram +vllm:iteration_tokens_total_count{model="vllm_model",version="1"} 10 +vllm:iteration_tokens_total_sum{model="vllm_model",version="1"} 12 +vllm:iteration_tokens_total_bucket{model="vllm_model",version="1",le="1"} 9 +... +vllm:iteration_tokens_total_bucket{model="vllm_model",version="1",le="+Inf"} 10 # HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds. # TYPE vllm:time_to_first_token_seconds histogram vllm:time_to_first_token_seconds_count{model="vllm_model",version="1"} 1 @@ -271,6 +328,34 @@ vllm:e2e_request_latency_seconds_sum{model="vllm_model",version="1"} 0.086861848 vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="1"} 1 ... vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1 +# HELP vllm:request_queue_time_seconds Histogram of time spent in WAITING phase for request. +# TYPE vllm:request_queue_time_seconds histogram +vllm:request_queue_time_seconds_count{model="vllm_model",version="1"} 1 +vllm:request_queue_time_seconds_sum{model="vllm_model",version="1"} 0.0045166015625 +vllm:request_queue_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1 +... +vllm:request_queue_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1 +# HELP vllm:request_inference_time_seconds Histogram of time spent in RUNNING phase for request +# TYPE vllm:request_inference_time_seconds histogram +vllm:request_inference_time_seconds_count{model="vllm_model",version="1"} 1 +vllm:request_inference_time_seconds_sum{model="vllm_model",version="1"} 0.1418392658233643 +vllm:request_inference_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1 +... +vllm:request_inference_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1 +# HELP vllm:request_prefill_time_seconds Histogram of time spent in PREFILL phase for request. +# TYPE vllm:request_prefill_time_seconds histogram +vllm:request_prefill_time_seconds_count{model="vllm_model",version="1"} 1 +vllm:request_prefill_time_seconds_sum{model="vllm_model",version="1"} 0.05302977561950684 +vllm:request_prefill_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1 +... +vllm:request_prefill_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1 +# HELP vllm:request_decode_time_seconds Histogram of time spent in DECODE phase for request. +# TYPE vllm:request_decode_time_seconds histogram +vllm:request_decode_time_seconds_count{model="vllm_model",version="1"} 1 +vllm:request_decode_time_seconds_sum{model="vllm_model",version="1"} 0.08880949020385742 +vllm:request_decode_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1 +... +vllm:request_decode_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1 # HELP vllm:request_prompt_tokens Number of prefill tokens processed. 
# TYPE vllm:request_prompt_tokens histogram vllm:request_prompt_tokens_count{model="vllm_model",version="1"} 1 diff --git a/src/utils/metrics.py b/src/utils/metrics.py index a593f70a..d4a74a93 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -29,7 +29,6 @@ from typing import Dict, List, Union import triton_python_backend_utils as pb_utils - from vllm.config import VllmConfig from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase from vllm.engine.metrics import Stats as VllmStats @@ -438,4 +437,3 @@ def finalize(self): if self._logger_thread is not None: self._logger_thread.join() self._logger_thread = None - self._logger_thread = None From 7a0096ef3378bd5f596783ced8f7c8f4afa767e3 Mon Sep 17 00:00:00 2001 From: Pavloveuge Date: Tue, 13 May 2025 18:06:18 +0300 Subject: [PATCH 03/12] update test --- .../metrics_test/vllm_metrics_test.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index b8ddeb49..393d75ab 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -131,6 +131,8 @@ def test_vllm_metrics(self): # (5, 812, 9, 5, 1515, 3497, 4, 50118, 50118, 133, 812, 9, 1470, 16, 5, 812) # (11, 5, 1420, 9, 5, 82, 4, 50118, 50118, 133, 499, 9, 4687, 16, 11, 5) self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48) + # vllm:num_preemptions_total + self.assertEqual(metrics_dict["vllm:num_preemptions_total"], 0) # vllm:time_to_first_token_seconds self.assertEqual( metrics_dict["vllm:time_to_first_token_seconds_count"], total_prompts @@ -139,6 +141,10 @@ def test_vllm_metrics(self): self.assertEqual( metrics_dict["vllm:time_to_first_token_seconds_bucket"], total_prompts ) + # vllm:iteration_tokens_total + self.assertEqual(metrics_dict["vllm:iteration_tokens_total_count"], 18) + self.assertGreater(metrics_dict["vllm:iteration_tokens_total_sum"], 0) + self.assertEqual(metrics_dict["vllm:iteration_tokens_total_bucket"], 18) # vllm:time_per_output_token_seconds self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45) self.assertGreater(metrics_dict["vllm:time_per_output_token_seconds_sum"], 0) @@ -151,6 +157,16 @@ def test_vllm_metrics(self): self.assertEqual( metrics_dict["vllm:e2e_request_latency_seconds_bucket"], total_prompts ) + request_time_metrics_list = [ + "request_queue_time_seconds", + "request_inference_time_seconds", + "request_prefill_time_seconds", + "request_decode_time_seconds", + ] + for metric_name in request_time_metrics_list: + self.assertEqual(metrics_dict[f"{metric_name}_count"], total_prompts) + self.assertGreater(metrics_dict[f"{metric_name}_sum"], 0) + self.assertEqual(metrics_dict[f"{metric_name}_bucket"], total_prompts) # vllm:request_prompt_tokens self.assertEqual( metrics_dict["vllm:request_prompt_tokens_count"], total_prompts From 41a39198a614b00895e71e8caa9dd0208e09f67c Mon Sep 17 00:00:00 2001 From: Eugene Pavlov <49618087+Pavloveuge@users.noreply.github.com> Date: Thu, 24 Jul 2025 20:46:49 +0300 Subject: [PATCH 04/12] Update src/utils/metrics.py Co-authored-by: Yingge He <157551214+yinggeh@users.noreply.github.com> --- src/utils/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/metrics.py b/src/utils/metrics.py index d4a74a93..3e48113f 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -366,7 +366,7 @@ def log(self, stats: VllmStats) -> None: 
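             # vLLM's num_preemption_iter counts preemption events in the engine step, not preempted tokens.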
(self.metrics.counter_preemption_tokens, stats.num_preemption_iter), ] histogram_metrics = [ - (self.metrics.histogram_iteration_tokens_total, [stats.num_tokens_iter]), + (self.metrics.histogram_iteration_tokens, [stats.num_tokens_iter]), ( self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter, From c5f97514788fb9e1da83bf8b491f7f92d1d5d309 Mon Sep 17 00:00:00 2001 From: Eugene Pavlov <49618087+Pavloveuge@users.noreply.github.com> Date: Thu, 24 Jul 2025 20:47:01 +0300 Subject: [PATCH 05/12] Update src/utils/metrics.py Co-authored-by: Yingge He <157551214+yinggeh@users.noreply.github.com> --- src/utils/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/metrics.py b/src/utils/metrics.py index 3e48113f..c455e43d 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -363,7 +363,7 @@ def log(self, stats: VllmStats) -> None: counter_metrics = [ (self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter), (self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter), - (self.metrics.counter_preemption_tokens, stats.num_preemption_iter), + (self.metrics.counter_num_preemption, stats.num_preemption_iter), ] histogram_metrics = [ (self.metrics.histogram_iteration_tokens, [stats.num_tokens_iter]), From 6ac61084ae24d212582ecbb06b56426fd8b94174 Mon Sep 17 00:00:00 2001 From: Eugene Pavlov <49618087+Pavloveuge@users.noreply.github.com> Date: Thu, 24 Jul 2025 20:47:20 +0300 Subject: [PATCH 06/12] Update README.md Co-authored-by: Yingge He <157551214+yinggeh@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6be683b2..31d72839 100644 --- a/README.md +++ b/README.md @@ -227,7 +227,7 @@ VLLM stats are reported by the metrics endpoint in fields that are prefixed with counter_prompt_tokens # Number of generation tokens processed. counter_generation_tokens -# Number of preemption tokens processed. +# Cumulative number of preemption from the engine. counter_preemption_tokens # Histogram of number of tokens per engine_step. 
histogram_iteration_tokens From 221a1c12c6f48cd7e14e8357cfe5c4993fe26714 Mon Sep 17 00:00:00 2001 From: Eugene Pavlov <49618087+Pavloveuge@users.noreply.github.com> Date: Thu, 24 Jul 2025 20:48:15 +0300 Subject: [PATCH 07/12] Update ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py Co-authored-by: Yingge He <157551214+yinggeh@users.noreply.github.com> --- ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index 393d75ab..95b4becd 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -132,7 +132,7 @@ def test_vllm_metrics(self): # (11, 5, 1420, 9, 5, 82, 4, 50118, 50118, 133, 499, 9, 4687, 16, 11, 5) self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48) # vllm:num_preemptions_total - self.assertEqual(metrics_dict["vllm:num_preemptions_total"], 0) + self.assertEqual(metrics_dict["vllm:prompt_tokens_total"], 0) # vllm:time_to_first_token_seconds self.assertEqual( metrics_dict["vllm:time_to_first_token_seconds_count"], total_prompts From f4196484fe8e35e33757a71973d66e3657d97fd3 Mon Sep 17 00:00:00 2001 From: Eugene Pavlov <49618087+Pavloveuge@users.noreply.github.com> Date: Thu, 24 Jul 2025 20:48:24 +0300 Subject: [PATCH 08/12] Update README.md Co-authored-by: Yingge He <157551214+yinggeh@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 31d72839..bee66a5b 100644 --- a/README.md +++ b/README.md @@ -228,7 +228,7 @@ counter_prompt_tokens # Number of generation tokens processed. counter_generation_tokens # Cumulative number of preemption from the engine. -counter_preemption_tokens +counter_num_preemption # Histogram of number of tokens per engine_step. histogram_iteration_tokens # Histogram of time to first token in seconds. From 6896178c48696f2ede8ed6b4c09eead7601887d4 Mon Sep 17 00:00:00 2001 From: Eugene Pavlov <49618087+Pavloveuge@users.noreply.github.com> Date: Thu, 24 Jul 2025 20:48:37 +0300 Subject: [PATCH 09/12] Update src/utils/metrics.py Co-authored-by: Yingge He <157551214+yinggeh@users.noreply.github.com> --- src/utils/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/metrics.py b/src/utils/metrics.py index c455e43d..22d9bf9b 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -169,7 +169,7 @@ def __init__(self, labels: List[str], max_model_len: int): self.histogram_iteration_tokens_total = ( self.histogram_iteration_tokens_total_family.Metric( labels=labels, - buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096], + buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], ) ) From 93d88955a56aae1351d6f3fde1585a455d90d828 Mon Sep 17 00:00:00 2001 From: "ea.pav" Date: Thu, 24 Jul 2025 17:49:44 +0000 Subject: [PATCH 10/12] move gauges before counters --- README.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index bee66a5b..449a93b4 100644 --- a/README.md +++ b/README.md @@ -223,6 +223,20 @@ curl localhost:8002/metrics VLLM stats are reported by the metrics endpoint in fields that are prefixed with `vllm:`. Triton currently supports reporting of the following metrics from vLLM. ```bash +# Number of requests currently running on GPU. 
+gauge_scheduler_running +# Number of requests waiting to be processed. +gauge_scheduler_waiting +# Number of requests swapped to CPU. +gauge_scheduler_swapped +# GPU KV-cache usage. 1 means 100 percent usage. +gauge_gpu_cache_usage +# CPU KV-cache usage. 1 means 100 percent usage. +gauge_cpu_cache_usage +# CPU prefix cache block hit rate. +gauge_cpu_prefix_cache_hit_rate +# GPU prefix cache block hit rate. +gauge_gpu_prefix_cache_hit_rate # Number of prefill tokens processed. counter_prompt_tokens # Number of generation tokens processed. @@ -253,20 +267,6 @@ histogram_num_generation_tokens_request histogram_best_of_request # Histogram of the n request parameter. histogram_n_request -# Number of requests currently running on GPU. -gauge_scheduler_running -# Number of requests waiting to be processed. -gauge_scheduler_waiting -# Number of requests swapped to CPU. -gauge_scheduler_swapped -# GPU KV-cache usage. 1 means 100 percent usage. -gauge_gpu_cache_usage -# CPU KV-cache usage. 1 means 100 percent usage. -gauge_cpu_cache_usage -# CPU prefix cache block hit rate. -gauge_cpu_prefix_cache_hit_rate -# GPU prefix cache block hit rate. -gauge_gpu_prefix_cache_hit_rate ``` Your output for these fields should look similar to the following: ```bash From 6be53bdafbe44c3c3af031b73c1a417733218dc7 Mon Sep 17 00:00:00 2001 From: "ea.pav" Date: Thu, 24 Jul 2025 18:08:47 +0000 Subject: [PATCH 11/12] revert suggested change --- ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index 95b4becd..393d75ab 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -132,7 +132,7 @@ def test_vllm_metrics(self): # (11, 5, 1420, 9, 5, 82, 4, 50118, 50118, 133, 499, 9, 4687, 16, 11, 5) self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48) # vllm:num_preemptions_total - self.assertEqual(metrics_dict["vllm:prompt_tokens_total"], 0) + self.assertEqual(metrics_dict["vllm:num_preemptions_total"], 0) # vllm:time_to_first_token_seconds self.assertEqual( metrics_dict["vllm:time_to_first_token_seconds_count"], total_prompts From 9b96279029abaf46fe38238998ec591b7857e331 Mon Sep 17 00:00:00 2001 From: "ea.pav" Date: Thu, 24 Jul 2025 18:15:27 +0000 Subject: [PATCH 12/12] remove deprecated metrics, fix namings --- README.md | 20 --------- src/utils/metrics.py | 99 +++++++++++++++++--------------------------- 2 files changed, 38 insertions(+), 81 deletions(-) diff --git a/README.md b/README.md index 449a93b4..2a9d77f0 100644 --- a/README.md +++ b/README.md @@ -227,16 +227,8 @@ VLLM stats are reported by the metrics endpoint in fields that are prefixed with gauge_scheduler_running # Number of requests waiting to be processed. gauge_scheduler_waiting -# Number of requests swapped to CPU. -gauge_scheduler_swapped # GPU KV-cache usage. 1 means 100 percent usage. gauge_gpu_cache_usage -# CPU KV-cache usage. 1 means 100 percent usage. -gauge_cpu_cache_usage -# CPU prefix cache block hit rate. -gauge_cpu_prefix_cache_hit_rate -# GPU prefix cache block hit rate. -gauge_gpu_prefix_cache_hit_rate # Number of prefill tokens processed. counter_prompt_tokens # Number of generation tokens processed. @@ -285,21 +277,9 @@ vllm:num_requests_running{model="vllm_model",version="1"} 0 # HELP vllm:num_requests_waiting Number of requests waiting to be processed. 
# TYPE vllm:num_requests_waiting gauge vllm:num_requests_waiting{model="vllm_model",version="1"} 0 -# HELP vllm:num_requests_swapped Number of requests swapped to CPU. -# TYPE vllm:num_requests_swapped gauge -vllm:num_requests_swapped{model="vllm_model",version="1"} 0 # HELP vllm:gpu_cache_usage_perc Gauga of gpu cache usage. 1 means 100 percent usage. # TYPE vllm:gpu_cache_usage_perc gauge vllm:gpu_cache_usage_perc{model="vllm_model",version="1"} 0 -# HELP vllm:cpu_cache_usage_perc Gauga of cpu cache usage. 1 means 100 percent usage. -# TYPE vllm:cpu_cache_usage_perc gauge -vllm:cpu_cache_usage_perc{model="vllm_model",version="1"} 0 -# HELP vllm:cpu_prefix_cache_hit_rate CPU prefix cache block hit rate. -# TYPE vllm:cpu_prefix_cache_hit_rate gauge -vllm:cpu_prefix_cache_hit_rate{model="vllm_model",version="1"} -1 -# HELP vllm:gpu_prefix_cache_hit_rate GPU prefix cache block hit rate. -# TYPE vllm:gpu_prefix_cache_hit_rate gauge -vllm:gpu_prefix_cache_hit_rate{model="vllm_model",version="1"} -1 # HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step. # TYPE vllm:iteration_tokens_total histogram vllm:iteration_tokens_total_count{model="vllm_model",version="1"} 10 diff --git a/src/utils/metrics.py b/src/utils/metrics.py index 22d9bf9b..6fae5299 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -49,12 +49,12 @@ def __init__(self, labels: List[str], max_model_len: int): description="Number of generation tokens processed.", kind=pb_utils.MetricFamily.COUNTER, ) - self.counter_preemption_tokens_family = pb_utils.MetricFamily( + self.counter_num_preemption_family = pb_utils.MetricFamily( name="vllm:num_preemptions_total", description="Number of preemption tokens processed.", kind=pb_utils.MetricFamily.COUNTER, ) - self.histogram_iteration_tokens_total_family = pb_utils.MetricFamily( + self.histogram_iteration_tokens_family = pb_utils.MetricFamily( name="vllm:iteration_tokens_total", description="Histogram of number of tokens per engine_step.", kind=pb_utils.MetricFamily.HISTOGRAM, @@ -124,33 +124,12 @@ def __init__(self, labels: List[str], max_model_len: int): description="Number of requests waiting to be processed.", kind=pb_utils.MetricFamily.GAUGE, ) - self.gauge_scheduler_swapped_family = pb_utils.MetricFamily( - name="vllm:num_requests_swapped", - description="Number of requests swapped to CPU.", - kind=pb_utils.MetricFamily.GAUGE, - ) # KV Cache Usage in % self.gauge_gpu_cache_usage_family = pb_utils.MetricFamily( name="vllm:gpu_cache_usage_perc", description="GPU KV-cache usage. 1 means 100 percent usage.", kind=pb_utils.MetricFamily.GAUGE, ) - self.gauge_cpu_cache_usage_family = pb_utils.MetricFamily( - name="vllm:cpu_cache_usage_perc", - description="CPU KV-cache usage. 
1 means 100 percent usage.", - kind=pb_utils.MetricFamily.GAUGE, - ) - # Prefix caching block hit rate - self.gauge_cpu_prefix_cache_hit_rate_family = pb_utils.MetricFamily( - name="vllm:cpu_prefix_cache_hit_rate", - description="CPU prefix cache block hit rate.", - kind=pb_utils.MetricFamily.GAUGE, - ) - self.gauge_gpu_prefix_cache_hit_rate_family = pb_utils.MetricFamily( - name="vllm:gpu_prefix_cache_hit_rate", - description="GPU prefix cache block hit rate.", - kind=pb_utils.MetricFamily.GAUGE, - ) # Initialize metrics # Iteration stats @@ -160,17 +139,15 @@ def __init__(self, labels: List[str], max_model_len: int): self.counter_generation_tokens = self.counter_generation_tokens_family.Metric( labels=labels ) - self.counter_preemption_tokens = self.counter_preemption_tokens_family.Metric( + self.counter_num_preemption = self.counter_num_preemption_family.Metric( labels=labels ) # Use the same bucket boundaries from vLLM sample metrics as an example. # https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96 - self.histogram_iteration_tokens_total = ( - self.histogram_iteration_tokens_total_family.Metric( - labels=labels, - buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], - ) + self.histogram_iteration_tokens = self.histogram_iteration_tokens_family.Metric( + labels=labels, + buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], ) self.histogram_time_to_first_token = ( @@ -218,32 +195,55 @@ def __init__(self, labels: List[str], max_model_len: int): ) # Request stats # Latency + request_latency_buckets = [ + 0.3, + 0.5, + 0.8, + 1.0, + 1.5, + 2.0, + 2.5, + 5.0, + 10.0, + 15.0, + 20.0, + 30.0, + 40.0, + 50.0, + 60.0, + 120.0, + 240.0, + 480.0, + 960.0, + 1920.0, + 7680.0, + ] self.histogram_e2e_time_request = self.histogram_e2e_time_request_family.Metric( labels=labels, - buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], + buckets=request_latency_buckets, ) self.histogram_prefill_time_request = ( self.histogram_prefill_time_request_family.Metric( labels=labels, - buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], + buckets=request_latency_buckets, ) ) self.histogram_decode_time_request = ( self.histogram_decode_time_request_family.Metric( labels=labels, - buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], + buckets=request_latency_buckets, ) ) self.histogram_inference_time_request = ( self.histogram_inference_time_request_family.Metric( labels=labels, - buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], + buckets=request_latency_buckets, ) ) self.histogram_queue_time_request = ( self.histogram_queue_time_request_family.Metric( labels=labels, - buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], + buckets=request_latency_buckets, ) ) # Metadata @@ -265,29 +265,16 @@ def __init__(self, labels: List[str], max_model_len: int): ) # System stats # Scheduler State - self.gauge_num_requests_running = self.gauge_scheduler_running_family.Metric( - labels=labels - ) - self.gauge_num_requests_waiting = self.gauge_scheduler_waiting_family.Metric( + self.gauge_scheduler_running = self.gauge_scheduler_running_family.Metric( labels=labels ) - self.gauge_num_requests_swapped = self.gauge_scheduler_swapped_family.Metric( + self.gauge_scheduler_waiting = self.gauge_scheduler_waiting_family.Metric( labels=labels ) # KV Cache Usage in % self.gauge_gpu_cache_usage = self.gauge_gpu_cache_usage_family.Metric( labels=labels ) - 
self.gauge_cpu_cache_usage = self.gauge_cpu_cache_usage_family.Metric( - labels=labels - ) - # Prefix caching block hit rate - self.gauge_cpu_prefix_cache_hit_rate = ( - self.gauge_cpu_prefix_cache_hit_rate_family.Metric(labels=labels) - ) - self.gauge_gpu_prefix_cache_hit_rate = ( - self.gauge_gpu_prefix_cache_hit_rate_family.Metric(labels=labels) - ) class VllmStatLogger(VllmStatLoggerBase): @@ -394,19 +381,9 @@ def log(self, stats: VllmStats) -> None: (self.metrics.histogram_n_request, stats.n_requests), ] gauge_metrics = [ - (self.metrics.gauge_num_requests_running, stats.num_running_sys), - (self.metrics.gauge_num_requests_waiting, stats.num_waiting_sys), - (self.metrics.gauge_num_requests_swapped, stats.num_swapped_sys), + (self.metrics.gauge_scheduler_running, stats.num_running_sys), + (self.metrics.gauge_scheduler_waiting, stats.num_waiting_sys), (self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys), - (self.metrics.gauge_cpu_cache_usage, stats.cpu_cache_usage_sys), - ( - self.metrics.gauge_cpu_prefix_cache_hit_rate, - stats.cpu_prefix_cache_hit_rate, - ), - ( - self.metrics.gauge_gpu_prefix_cache_hit_rate, - stats.gpu_prefix_cache_hit_rate, - ), ] for metric, data in counter_metrics: self._log_counter(metric, data)
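
A quick way to sanity-check the metrics added by this series outside the CI harness is to scrape Triton's Prometheus endpoint directly. The sketch below assumes a running Triton server with the vLLM backend exposing metrics on localhost:8002 (the port used in the README); `parse_metrics` is a hypothetical helper written only for this example and is not part of this patch series or of `tritonclient`.

```python
# Minimal sketch for manually spot-checking the metrics added in this series.
# Assumes a Triton server with the vLLM backend is already running and serving
# Prometheus metrics on localhost:8002, as in the README example above.
# parse_metrics() is a throwaway helper for this example, not backend code.
import requests

NEW_METRICS = [
    "vllm:num_preemptions_total",
    "vllm:num_requests_running",
    "vllm:num_requests_waiting",
    "vllm:gpu_cache_usage_perc",
    "vllm:iteration_tokens_total_count",
    "vllm:iteration_tokens_total_sum",
    "vllm:request_queue_time_seconds_sum",
    "vllm:request_prefill_time_seconds_sum",
    "vllm:request_decode_time_seconds_sum",
    "vllm:request_inference_time_seconds_sum",
]


def parse_metrics(text: str) -> dict:
    """Naively parse Prometheus text exposition into {metric_name: value}, ignoring labels."""
    values = {}
    for line in text.splitlines():
        if not line or line.startswith("#"):
            continue  # skip HELP/TYPE comments and blank lines
        name_and_labels, _, value = line.rpartition(" ")
        name = name_and_labels.split("{", 1)[0]
        values[name] = float(value)
    return values


if __name__ == "__main__":
    metrics = parse_metrics(requests.get("http://localhost:8002/metrics").text)
    for name in NEW_METRICS:
        print(f"{name} = {metrics.get(name, 'not reported')}")
```

The same metric names are what the extended assertions in `ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py` read out of `metrics_dict`, so the output can be compared against the expectations in that test.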