@@ -49,12 +49,12 @@ def __init__(self, labels: List[str], max_model_len: int):
             description="Number of generation tokens processed.",
             kind=pb_utils.MetricFamily.COUNTER,
         )
-        self.counter_preemption_tokens_family = pb_utils.MetricFamily(
+        self.counter_num_preemption_family = pb_utils.MetricFamily(
             name="vllm:num_preemptions_total",
             description="Number of preemption tokens processed.",
             kind=pb_utils.MetricFamily.COUNTER,
         )
-        self.histogram_iteration_tokens_total_family = pb_utils.MetricFamily(
+        self.histogram_iteration_tokens_family = pb_utils.MetricFamily(
             name="vllm:iteration_tokens_total",
             description="Histogram of number of tokens per engine_step.",
             kind=pb_utils.MetricFamily.HISTOGRAM,
@@ -124,33 +124,12 @@ def __init__(self, labels: List[str], max_model_len: int):
             description="Number of requests waiting to be processed.",
             kind=pb_utils.MetricFamily.GAUGE,
         )
-        self.gauge_scheduler_swapped_family = pb_utils.MetricFamily(
-            name="vllm:num_requests_swapped",
-            description="Number of requests swapped to CPU.",
-            kind=pb_utils.MetricFamily.GAUGE,
-        )
         #   KV Cache Usage in %
         self.gauge_gpu_cache_usage_family = pb_utils.MetricFamily(
             name="vllm:gpu_cache_usage_perc",
             description="GPU KV-cache usage. 1 means 100 percent usage.",
             kind=pb_utils.MetricFamily.GAUGE,
         )
-        self.gauge_cpu_cache_usage_family = pb_utils.MetricFamily(
-            name="vllm:cpu_cache_usage_perc",
-            description="CPU KV-cache usage. 1 means 100 percent usage.",
-            kind=pb_utils.MetricFamily.GAUGE,
-        )
-        #   Prefix caching block hit rate
-        self.gauge_cpu_prefix_cache_hit_rate_family = pb_utils.MetricFamily(
-            name="vllm:cpu_prefix_cache_hit_rate",
-            description="CPU prefix cache block hit rate.",
-            kind=pb_utils.MetricFamily.GAUGE,
-        )
-        self.gauge_gpu_prefix_cache_hit_rate_family = pb_utils.MetricFamily(
-            name="vllm:gpu_prefix_cache_hit_rate",
-            description="GPU prefix cache block hit rate.",
-            kind=pb_utils.MetricFamily.GAUGE,
-        )

         # Initialize metrics
         # Iteration stats
@@ -160,14 +139,14 @@ def __init__(self, labels: List[str], max_model_len: int):
         self.counter_generation_tokens = self.counter_generation_tokens_family.Metric(
             labels=labels
         )
-        self.counter_preemption_tokens = self.counter_preemption_tokens_family.Metric(
+        self.counter_num_preemption = self.counter_num_preemption_family.Metric(
             labels=labels
         )

         # Use the same bucket boundaries from vLLM sample metrics as an example.
         # https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96
-        self.histogram_iteration_tokens_total = (
-            self.histogram_iteration_tokens_total_family.Metric(
+        self.histogram_iteration_tokens = (
+            self.histogram_iteration_tokens_family.Metric(
                 labels=labels,
                 buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
             )
@@ -218,32 +197,36 @@ def __init__(self, labels: List[str], max_model_len: int):
         )
         # Request stats
         #   Latency
+        request_latency_buckets = [
+            0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
+            40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
+        ]
         self.histogram_e2e_time_request = self.histogram_e2e_time_request_family.Metric(
             labels=labels,
-            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
+            buckets=request_latency_buckets,
         )
         self.histogram_prefill_time_request = (
             self.histogram_prefill_time_request_family.Metric(
                 labels=labels,
-                buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
+                buckets=request_latency_buckets,
             )
         )
         self.histogram_decode_time_request = (
             self.histogram_decode_time_request_family.Metric(
                 labels=labels,
-                buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
+                buckets=request_latency_buckets,
             )
         )
         self.histogram_inference_time_request = (
             self.histogram_inference_time_request_family.Metric(
                 labels=labels,
-                buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
+                buckets=request_latency_buckets,
             )
         )
         self.histogram_queue_time_request = (
             self.histogram_queue_time_request_family.Metric(
                 labels=labels,
-                buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
+                buckets=request_latency_buckets,
             )
         )
         #   Metadata
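A quick aside on the shared `request_latency_buckets` list added above: extending the upper bound from 60 s to 7680 s lets slow requests land in a finite bucket instead of only the `+Inf` catch-all. The sketch below is illustrative only; the label values and the 75-second sample are made up, and the metric name and description are assumed (they are not shown in this diff), while `MetricFamily`, `Metric(labels=..., buckets=...)`, and `observe()` mirror the pb_utils calls already used in this file.

```python
import triton_python_backend_utils as pb_utils

# Hypothetical standalone usage of the new bucket boundaries.
family = pb_utils.MetricFamily(
    name="vllm:e2e_request_latency_seconds",  # assumed name, not part of this diff
    description="Histogram of end to end request latency in seconds.",
    kind=pb_utils.MetricFamily.HISTOGRAM,
)
request_latency_buckets = [
    0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
    40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0,
]
metric = family.Metric(
    labels={"model": "vllm_model", "version": "1"},  # illustrative labels
    buckets=request_latency_buckets,
)
# A 75 s request is now counted under le=120.0; the old 60s-max buckets
# could only place it in the +Inf bucket.
metric.observe(75.0)
```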
@@ -265,29 +248,16 @@ def __init__(self, labels: List[str], max_model_len: int):
         )
         # System stats
         #   Scheduler State
-        self.gauge_num_requests_running = self.gauge_scheduler_running_family.Metric(
+        self.gauge_scheduler_running = self.gauge_scheduler_running_family.Metric(
             labels=labels
         )
-        self.gauge_num_requests_waiting = self.gauge_scheduler_waiting_family.Metric(
-            labels=labels
-        )
-        self.gauge_num_requests_swapped = self.gauge_scheduler_swapped_family.Metric(
+        self.gauge_scheduler_waiting = self.gauge_scheduler_waiting_family.Metric(
             labels=labels
         )
         #   KV Cache Usage in %
         self.gauge_gpu_cache_usage = self.gauge_gpu_cache_usage_family.Metric(
             labels=labels
         )
-        self.gauge_cpu_cache_usage = self.gauge_cpu_cache_usage_family.Metric(
-            labels=labels
-        )
-        #   Prefix caching block hit rate
-        self.gauge_cpu_prefix_cache_hit_rate = (
-            self.gauge_cpu_prefix_cache_hit_rate_family.Metric(labels=labels)
-        )
-        self.gauge_gpu_prefix_cache_hit_rate = (
-            self.gauge_gpu_prefix_cache_hit_rate_family.Metric(labels=labels)
-        )


 class VllmStatLogger(VllmStatLoggerBase):
@@ -394,19 +364,9 @@ def log(self, stats: VllmStats) -> None:
             (self.metrics.histogram_n_request, stats.n_requests),
         ]
         gauge_metrics = [
-            (self.metrics.gauge_num_requests_running, stats.num_running_sys),
-            (self.metrics.gauge_num_requests_waiting, stats.num_waiting_sys),
-            (self.metrics.gauge_num_requests_swapped, stats.num_swapped_sys),
+            (self.metrics.gauge_scheduler_running, stats.num_running_sys),
+            (self.metrics.gauge_scheduler_waiting, stats.num_waiting_sys),
             (self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys),
-            (self.metrics.gauge_cpu_cache_usage, stats.cpu_cache_usage_sys),
-            (
-                self.metrics.gauge_cpu_prefix_cache_hit_rate,
-                stats.cpu_prefix_cache_hit_rate,
-            ),
-            (
-                self.metrics.gauge_gpu_prefix_cache_hit_rate,
-                stats.gpu_prefix_cache_hit_rate,
-            ),
         ]
         for metric, data in counter_metrics:
             self._log_counter(metric, data)
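For context, here is a minimal sketch of the logging helpers these metric lists feed. Only `self._log_counter` is visible in this hunk; `_log_gauge` and `_log_histogram` are assumed counterparts for the `gauge_metrics` and histogram lists, and `increment()`, `set()`, and `observe()` are the Triton python-backend custom-metric calls this file relies on.

```python
def _log_counter(self, counter, data):
    # Counters are monotonic; skip zero deltas to avoid pointless updates.
    if data != 0:
        counter.increment(data)

def _log_gauge(self, gauge, data):
    # Gauges (e.g. num_running_sys) are overwritten with the latest sample.
    gauge.set(data)

def _log_histogram(self, histogram, data):
    # Iteration/request stats arrive as lists of samples; observe each one.
    for datum in data:
        histogram.observe(datum)
```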