65 changes: 65 additions & 0 deletions README.md
@@ -223,16 +223,34 @@ curl localhost:8002/metrics
vLLM stats are reported by the metrics endpoint in fields prefixed with
`vllm:`. Triton currently supports reporting the following metrics from vLLM.
```bash
# Number of requests currently running on GPU.
gauge_scheduler_running
# Number of requests waiting to be processed.
gauge_scheduler_waiting
# GPU KV-cache usage. 1 means 100 percent usage.
gauge_gpu_cache_usage
# Number of prefill tokens processed.
counter_prompt_tokens
# Number of generation tokens processed.
counter_generation_tokens
# Cumulative number of preemptions from the engine.
counter_num_preemption
# Histogram of number of tokens per engine_step.
histogram_iteration_tokens
# Histogram of time to first token in seconds.
histogram_time_to_first_token
# Histogram of time per output token in seconds.
histogram_time_per_output_token
# Histogram of end to end request latency in seconds.
histogram_e2e_time_request
# Histogram of time spent in WAITING phase for request.
histogram_queue_time_request
# Histogram of time spent in RUNNING phase for request.
histogram_inference_time_request
# Histogram of time spent in PREFILL phase for request.
histogram_prefill_time_request
# Histogram of time spent in DECODE phase for request.
histogram_decode_time_request
# Number of prefill tokens processed.
histogram_num_prompt_tokens_request
# Number of generation tokens processed.
@@ -250,6 +268,25 @@ vllm:prompt_tokens_total{model="vllm_model",version="1"} 10
# HELP vllm:generation_tokens_total Number of generation tokens processed.
# TYPE vllm:generation_tokens_total counter
vllm:generation_tokens_total{model="vllm_model",version="1"} 16
# HELP vllm:num_preemptions_total Cumulative number of preemptions from the engine.
# TYPE vllm:num_preemptions_total counter
vllm:num_preemptions_total{model="vllm_model",version="1"} 0
# HELP vllm:num_requests_running Number of requests currently running on GPU.
# TYPE vllm:num_requests_running gauge
vllm:num_requests_running{model="vllm_model",version="1"} 0
# HELP vllm:num_requests_waiting Number of requests waiting to be processed.
# TYPE vllm:num_requests_waiting gauge
vllm:num_requests_waiting{model="vllm_model",version="1"} 0
# HELP vllm:gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
# TYPE vllm:gpu_cache_usage_perc gauge
vllm:gpu_cache_usage_perc{model="vllm_model",version="1"} 0
# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step.
# TYPE vllm:iteration_tokens_total histogram
vllm:iteration_tokens_total_count{model="vllm_model",version="1"} 10
vllm:iteration_tokens_total_sum{model="vllm_model",version="1"} 12
vllm:iteration_tokens_total_bucket{model="vllm_model",version="1",le="1"} 9
...
vllm:iteration_tokens_total_bucket{model="vllm_model",version="1",le="+Inf"} 10
# HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds.
# TYPE vllm:time_to_first_token_seconds histogram
vllm:time_to_first_token_seconds_count{model="vllm_model",version="1"} 1
@@ -271,6 +308,34 @@ vllm:e2e_request_latency_seconds_sum{model="vllm_model",version="1"} 0.086861848
vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="1"} 1
...
vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
# HELP vllm:request_queue_time_seconds Histogram of time spent in WAITING phase for request.
# TYPE vllm:request_queue_time_seconds histogram
vllm:request_queue_time_seconds_count{model="vllm_model",version="1"} 1
vllm:request_queue_time_seconds_sum{model="vllm_model",version="1"} 0.0045166015625
vllm:request_queue_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1
...
vllm:request_queue_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
# HELP vllm:request_inference_time_seconds Histogram of time spent in RUNNING phase for request.
# TYPE vllm:request_inference_time_seconds histogram
vllm:request_inference_time_seconds_count{model="vllm_model",version="1"} 1
vllm:request_inference_time_seconds_sum{model="vllm_model",version="1"} 0.1418392658233643
vllm:request_inference_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1
...
vllm:request_inference_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
# HELP vllm:request_prefill_time_seconds Histogram of time spent in PREFILL phase for request.
# TYPE vllm:request_prefill_time_seconds histogram
vllm:request_prefill_time_seconds_count{model="vllm_model",version="1"} 1
vllm:request_prefill_time_seconds_sum{model="vllm_model",version="1"} 0.05302977561950684
vllm:request_prefill_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1
...
vllm:request_prefill_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
# HELP vllm:request_decode_time_seconds Histogram of time spent in DECODE phase for request.
# TYPE vllm:request_decode_time_seconds histogram
vllm:request_decode_time_seconds_count{model="vllm_model",version="1"} 1
vllm:request_decode_time_seconds_sum{model="vllm_model",version="1"} 0.08880949020385742
vllm:request_decode_time_seconds_bucket{model="vllm_model",version="1",le="1"} 1
...
vllm:request_decode_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
# HELP vllm:request_prompt_tokens Number of prefill tokens processed.
# TYPE vllm:request_prompt_tokens histogram
vllm:request_prompt_tokens_count{model="vllm_model",version="1"} 1
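The metric names documented above map one-to-one onto the Prometheus samples in the example output. As a rough illustration of how a client might consume them (not part of this PR; it assumes the `localhost:8002/metrics` endpoint from the README and the `requests` package), the sketch below collects the `vllm:*` samples into a flat dict keyed by sample name, which is the same shape the CI test below asserts against:

```python
# Hypothetical scraper sketch -- not part of this PR.
# Collects vllm:* samples from Triton's Prometheus endpoint into a dict,
# e.g. {"vllm:num_requests_waiting": 0.0, "vllm:prompt_tokens_total": 10.0, ...}
import re

import requests  # assumed available; any HTTP client works


def scrape_vllm_metrics(url="http://localhost:8002/metrics"):
    metrics = {}
    for line in requests.get(url).text.splitlines():
        # Skip HELP/TYPE comments and non-vLLM families.
        if line.startswith("#") or not line.startswith("vllm:"):
            continue
        # e.g. vllm:request_queue_time_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
        match = re.match(r'([^{\s]+)(?:\{[^}]*\})?\s+(\S+)$', line)
        if match:
            name, value = match.groups()
            # Bucket samples repeat per "le" label; the last one (+Inf) wins,
            # and it equals the histogram's total observation count.
            metrics[name] = float(value)
    return metrics
```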
16 changes: 16 additions & 0 deletions ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
@@ -131,6 +131,8 @@ def test_vllm_metrics(self):
# (5, 812, 9, 5, 1515, 3497, 4, 50118, 50118, 133, 812, 9, 1470, 16, 5, 812)
# (11, 5, 1420, 9, 5, 82, 4, 50118, 50118, 133, 499, 9, 4687, 16, 11, 5)
self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48)
# vllm:num_preemptions_total
self.assertEqual(metrics_dict["vllm:num_preemptions_total"], 0)
# vllm:time_to_first_token_seconds
self.assertEqual(
metrics_dict["vllm:time_to_first_token_seconds_count"], total_prompts
@@ -139,6 +141,10 @@ def test_vllm_metrics(self):
self.assertEqual(
metrics_dict["vllm:time_to_first_token_seconds_bucket"], total_prompts
)
# vllm:iteration_tokens_total
self.assertEqual(metrics_dict["vllm:iteration_tokens_total_count"], 18)
self.assertGreater(metrics_dict["vllm:iteration_tokens_total_sum"], 0)
self.assertEqual(metrics_dict["vllm:iteration_tokens_total_bucket"], 18)
# vllm:time_per_output_token_seconds
self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45)
self.assertGreater(metrics_dict["vllm:time_per_output_token_seconds_sum"], 0)
@@ -151,6 +157,16 @@ def test_vllm_metrics(self):
self.assertEqual(
metrics_dict["vllm:e2e_request_latency_seconds_bucket"], total_prompts
)
request_time_metrics_list = [
"request_queue_time_seconds",
"request_inference_time_seconds",
"request_prefill_time_seconds",
"request_decode_time_seconds",
]
for metric_name in request_time_metrics_list:
self.assertEqual(metrics_dict[f"{metric_name}_count"], total_prompts)
self.assertGreater(metrics_dict[f"{metric_name}_sum"], 0)
self.assertEqual(metrics_dict[f"{metric_name}_bucket"], total_prompts)
# vllm:request_prompt_tokens
self.assertEqual(
metrics_dict["vllm:request_prompt_tokens_count"], total_prompts
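For each request-time histogram the test relies on the Prometheus invariant that the `_count` sample and the final (`+Inf`) `_bucket` sample both equal the number of observed requests, while `_sum` is strictly positive. A tiny standalone illustration of that invariant (illustrative values only, using a subset of the `request_latency_buckets` defined in the metrics code below):

```python
# Illustrative only: Prometheus histogram buckets are cumulative, so the
# "+Inf" bucket always equals _count, and _sum grows with every observation.
observations = [0.004, 0.053, 0.142]          # per-request times in seconds
buckets = [0.3, 0.5, 0.8, 1.0, float("inf")]  # subset of request_latency_buckets

cumulative = [sum(1 for o in observations if o <= b) for b in buckets]

assert cumulative[-1] == len(observations)    # _bucket(+Inf) == _count
assert sum(observations) > 0                  # _sum > 0
print(len(observations), sum(observations), cumulative)
```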
148 changes: 147 additions & 1 deletion src/utils/metrics.py
@@ -49,6 +49,16 @@ def __init__(self, labels: List[str], max_model_len: int):
description="Number of generation tokens processed.",
kind=pb_utils.MetricFamily.COUNTER,
)
self.counter_num_preemption_family = pb_utils.MetricFamily(
name="vllm:num_preemptions_total",
description="Number of preemption tokens processed.",
kind=pb_utils.MetricFamily.COUNTER,
)
self.histogram_iteration_tokens_family = pb_utils.MetricFamily(
name="vllm:iteration_tokens_total",
description="Histogram of number of tokens per engine_step.",
kind=pb_utils.MetricFamily.HISTOGRAM,
)
self.histogram_time_to_first_token_family = pb_utils.MetricFamily(
name="vllm:time_to_first_token_seconds",
description="Histogram of time to first token in seconds.",
@@ -66,6 +76,26 @@ def __init__(self, labels: List[str], max_model_len: int):
description="Histogram of end to end request latency in seconds.",
kind=pb_utils.MetricFamily.HISTOGRAM,
)
self.histogram_queue_time_request_family = pb_utils.MetricFamily(
name="vllm:request_queue_time_seconds",
description="Histogram of time spent in WAITING phase for request.",
kind=pb_utils.MetricFamily.HISTOGRAM,
)
self.histogram_inference_time_request_family = pb_utils.MetricFamily(
name="vllm:request_inference_time_seconds",
description="Histogram of time spent in RUNNING phase for request.",
kind=pb_utils.MetricFamily.HISTOGRAM,
)
self.histogram_prefill_time_request_family = pb_utils.MetricFamily(
name="vllm:request_prefill_time_seconds",
description="Histogram of time spent in PREFILL phase for request.",
kind=pb_utils.MetricFamily.HISTOGRAM,
)
self.histogram_decode_time_request_family = pb_utils.MetricFamily(
name="vllm:request_decode_time_seconds",
description="Histogram of time spent in DECODE phase for request.",
kind=pb_utils.MetricFamily.HISTOGRAM,
)
# Metadata
self.histogram_num_prompt_tokens_request_family = pb_utils.MetricFamily(
name="vllm:request_prompt_tokens",
@@ -82,6 +112,24 @@ def __init__(self, labels: List[str], max_model_len: int):
description="Histogram of the n request parameter.",
kind=pb_utils.MetricFamily.HISTOGRAM,
)
# System stats
# Scheduler State
self.gauge_scheduler_running_family = pb_utils.MetricFamily(
name="vllm:num_requests_running",
description="Number of requests currently running on GPU.",
kind=pb_utils.MetricFamily.GAUGE,
)
self.gauge_scheduler_waiting_family = pb_utils.MetricFamily(
name="vllm:num_requests_waiting",
description="Number of requests waiting to be processed.",
kind=pb_utils.MetricFamily.GAUGE,
)
# KV Cache Usage in %
self.gauge_gpu_cache_usage_family = pb_utils.MetricFamily(
name="vllm:gpu_cache_usage_perc",
description="GPU KV-cache usage. 1 means 100 percent usage.",
kind=pb_utils.MetricFamily.GAUGE,
)

# Initialize metrics
# Iteration stats
@@ -91,8 +139,17 @@ def __init__(self, labels: List[str], max_model_len: int):
self.counter_generation_tokens = self.counter_generation_tokens_family.Metric(
labels=labels
)
self.counter_num_preemption = self.counter_num_preemption_family.Metric(
labels=labels
)

# Use the same bucket boundaries from vLLM sample metrics as an example.
# https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96
self.histogram_iteration_tokens = self.histogram_iteration_tokens_family.Metric(
labels=labels,
buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
)

self.histogram_time_to_first_token = (
self.histogram_time_to_first_token_family.Metric(
labels=labels,
@@ -138,9 +195,56 @@ def __init__(self, labels: List[str], max_model_len: int):
)
# Request stats
# Latency
request_latency_buckets = [
0.3,
0.5,
0.8,
1.0,
1.5,
2.0,
2.5,
5.0,
10.0,
15.0,
20.0,
30.0,
40.0,
50.0,
60.0,
120.0,
240.0,
480.0,
960.0,
1920.0,
7680.0,
]
self.histogram_e2e_time_request = self.histogram_e2e_time_request_family.Metric(
labels=labels,
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
buckets=request_latency_buckets,
)
self.histogram_prefill_time_request = (
self.histogram_prefill_time_request_family.Metric(
labels=labels,
buckets=request_latency_buckets,
)
)
self.histogram_decode_time_request = (
self.histogram_decode_time_request_family.Metric(
labels=labels,
buckets=request_latency_buckets,
)
)
self.histogram_inference_time_request = (
self.histogram_inference_time_request_family.Metric(
labels=labels,
buckets=request_latency_buckets,
)
)
self.histogram_queue_time_request = (
self.histogram_queue_time_request_family.Metric(
labels=labels,
buckets=request_latency_buckets,
)
)
# Metadata
self.histogram_num_prompt_tokens_request = (
@@ -159,6 +263,18 @@ def __init__(self, labels: List[str], max_model_len: int):
labels=labels,
buckets=[1, 2, 5, 10, 20],
)
# System stats
# Scheduler State
self.gauge_scheduler_running = self.gauge_scheduler_running_family.Metric(
labels=labels
)
self.gauge_scheduler_waiting = self.gauge_scheduler_waiting_family.Metric(
labels=labels
)
# KV Cache Usage in %
self.gauge_gpu_cache_usage = self.gauge_gpu_cache_usage_family.Metric(
labels=labels
)


class VllmStatLogger(VllmStatLoggerBase):
@@ -182,6 +298,18 @@ def __init__(self, labels: Dict, vllm_config: VllmConfig, log_logger) -> None:
def info(self, type: str, obj: SupportsMetricsInfo) -> None:
pass

def _log_gauge(self, gauge, data: Union[int, float]) -> None:
"""Convenience function for logging to gauge.

Args:
gauge: A gauge metric instance.
data: An int or float to set as the current gauge value.

Returns:
None
"""
self._logger_queue.put_nowait((gauge, "set", data))

def _log_counter(self, counter, data: Union[int, float]) -> None:
"""Convenience function for logging to counter.

@@ -222,8 +350,10 @@ def log(self, stats: VllmStats) -> None:
counter_metrics = [
(self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter),
(self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter),
(self.metrics.counter_num_preemption, stats.num_preemption_iter),
]
histogram_metrics = [
(self.metrics.histogram_iteration_tokens, [stats.num_tokens_iter]),
(
self.metrics.histogram_time_to_first_token,
stats.time_to_first_tokens_iter,
Expand All @@ -233,6 +363,13 @@ def log(self, stats: VllmStats) -> None:
stats.time_per_output_tokens_iter,
),
(self.metrics.histogram_e2e_time_request, stats.time_e2e_requests),
(self.metrics.histogram_prefill_time_request, stats.time_prefill_requests),
(self.metrics.histogram_decode_time_request, stats.time_decode_requests),
(
self.metrics.histogram_inference_time_request,
stats.time_inference_requests,
),
(self.metrics.histogram_queue_time_request, stats.time_queue_requests),
(
self.metrics.histogram_num_prompt_tokens_request,
stats.num_prompt_tokens_requests,
Expand All @@ -243,10 +380,17 @@ def log(self, stats: VllmStats) -> None:
),
(self.metrics.histogram_n_request, stats.n_requests),
]
gauge_metrics = [
(self.metrics.gauge_scheduler_running, stats.num_running_sys),
(self.metrics.gauge_scheduler_waiting, stats.num_waiting_sys),
(self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys),
]
for metric, data in counter_metrics:
self._log_counter(metric, data)
for metric, data in histogram_metrics:
self._log_histogram(metric, data)
for metric, data in gauge_metrics:
self._log_gauge(metric, data)

def logger_loop(self):
while True:
@@ -259,6 +403,8 @@ def logger_loop(self):
metric.increment(data)
elif command == "observe":
metric.observe(data)
elif command == "set":
metric.set(data)
else:
self.log_logger.log_error(f"Undefined command name: {command}")

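All three `_log_*` helpers funnel updates through the same queue that `logger_loop` drains on a separate thread, so metric updates never block the vLLM engine callback. The sketch below reproduces that producer/consumer pattern in isolation; `FakeCounter` and `FakeGauge` are stand-ins for illustration, not the Triton `pb_utils` metrics API:

```python
# Standalone sketch of the queue-based dispatch used by VllmStatLogger
# (illustrative stand-ins only, not the Triton pb_utils metrics API).
# The engine thread enqueues (metric, command, value) tuples; a worker
# thread applies them, so metric updates never block the engine.
import queue
import threading


class FakeCounter:
    def __init__(self):
        self.value = 0.0

    def increment(self, data):
        self.value += data


class FakeGauge:
    def __init__(self):
        self.value = 0.0

    def set(self, data):
        self.value = data


def logger_loop(q):
    while True:
        item = q.get()
        if item is None:  # shutdown sentinel
            break
        metric, command, data = item
        if command == "increment":
            metric.increment(data)
        elif command == "set":
            metric.set(data)
        else:  # "observe" for histograms would be dispatched the same way
            print(f"Undefined command name: {command}")


q = queue.Queue()
prompt_tokens, running = FakeCounter(), FakeGauge()
worker = threading.Thread(target=logger_loop, args=(q,), daemon=True)
worker.start()

q.put_nowait((prompt_tokens, "increment", 10))  # mirrors _log_counter
q.put_nowait((running, "set", 2))               # mirrors _log_gauge
q.put(None)
worker.join()
print(prompt_tokens.value, running.value)       # 10.0 2
```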