
Commit f9ffada

feat(metrics): add generation_tokens_total and prompt_tokens_total metrics
Signed-off-by: CYJiang <googs1025@gmail.com>
1 parent 96d3f72 commit f9ffada

File tree

README.md
pkg/common/config.go
pkg/llm-d-inference-sim/metrics.go
pkg/llm-d-inference-sim/metrics_test.go
pkg/llm-d-inference-sim/simulator.go

5 files changed (+253 additions, -1 deletion)


README.md

Lines changed: 2 additions & 0 deletions
@@ -35,9 +35,11 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar
 | vllm:time_per_output_token_seconds | Histogram of time per output token in seconds |
 | vllm:inter_token_latency_seconds | Histogram of inter-token latency in seconds |
 | vllm:request_generation_tokens | Number of generation tokens processed |
+| vllm:generation_tokens_total | Total number of generated tokens. |
 | vllm:max_num_generation_tokens | Maximum number of requested generation tokens. Currently same as `vllm:request_generation_tokens` since always only one choice is returned |
 | vllm:request_params_max_tokens | Histogram of the max_tokens request parameter |
 | vllm:request_prompt_tokens | Number of prefill tokens processed |
+| vllm:prompt_tokens_total | Total number of prompt tokens processed |
 | vllm:request_success_total | Count of successfully processed requests |

 The simulated inference has no connection with the model and LoRA adapters specified in the command line parameters or via the /v1/load_lora_adapter HTTP REST endpoint. The /v1/models endpoint returns simulated results based on those same command line parameters and those loaded via the /v1/load_lora_adapter HTTP REST endpoint.
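Since both new metrics are plain Prometheus counters labeled by model name, a scrape of the simulator's /metrics endpoint exposes them in the standard counter format, and a rate() query turns them into token throughput. The lines below are only an illustration of what to expect; the model name and sample values are made up, not taken from this commit:

vllm:prompt_tokens_total{model_name="my-model"} 1234
vllm:generation_tokens_total{model_name="my-model"} 5678

# approximate generated tokens per second over the last 5 minutes (PromQL)
rate(vllm:generation_tokens_total{model_name="my-model"}[5m])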

pkg/common/config.go

Lines changed: 12 additions & 1 deletion
@@ -262,14 +262,25 @@ type Metrics struct {
 	// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf
 	TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
 	// RequestPromptTokens RequestGenerationTokens RequestParamsMaxTokens Histogram fake-observation arrays for init.
-	// Each value will be passed to Observe() once at start-up.
+	// Each value in these arrays is passed to Observe() exactly once at startup.
+	// By default:
+	// - The sum of RequestPromptTokens initializes the metric vllm:prompt_tokens_total.
+	// - The sum of RequestGenerationTokens initializes the metric vllm:generation_tokens_total.
+	//
+	// If TotalPromptTokens or TotalGenerationTokens are explicitly provided,
+	// they override the above sums and are used directly as the initial total token counts.
 	RequestPromptTokens []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"` // prompt-length samples
 	RequestGenerationTokens []int `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
 	RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
 	RequestMaxGenerationTokens []int `yaml:"request-max-generation-tokens" json:"request-max-generation-tokens"` // request_max_num_generation_tokens samples
 	// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
 	RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`

+	// TotalPromptTokens is the total number of prompt tokens processed
+	TotalPromptTokens *int64 `json:"total-prompt-tokens,omitempty"`
+	// TotalGenerationTokens is the total number of generated tokens
+	TotalGenerationTokens *int64 `json:"total-generation-tokens,omitempty"`
+
 	// Latency histograms - have same buckets upper boundaries in seconds are:
 	// 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
 	// 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf
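For illustration, since the two new fields carry only JSON tags, the natural way to set them is through the --fake-metrics JSON blob, exactly as the new test below does; the numbers here are arbitrary:

--fake-metrics '{"request-prompt-tokens":[100,200],"request-generation-tokens":[50,150],"total-prompt-tokens":12345,"total-generation-tokens":67890}'

With the explicit totals present, vllm:prompt_tokens_total starts at 12345 and vllm:generation_tokens_total at 67890; if they are omitted, the counters are instead seeded from the bucketed samples via estimateTokenTotal (added in metrics.go below).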

pkg/llm-d-inference-sim/metrics.go

Lines changed: 93 additions & 0 deletions
@@ -45,6 +45,8 @@ const (
 	generationTokensMetricName = "vllm:request_generation_tokens"
 	paramMaxTokensMetricName = "vllm:request_params_max_tokens"
 	promptTokensMetricName = "vllm:request_prompt_tokens"
+	generationTokensTotalMetricName = "vllm:generation_tokens_total"
+	promptTokensTotalMetricName = "vllm:prompt_tokens_total"
 	successTotalMetricName = "vllm:request_success_total"
 	loraRequestsMetricName = "vllm:lora_requests_info"
 	reqRunningMetricName = "vllm:num_requests_running"
@@ -292,6 +294,34 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}

+	s.metrics.promptTokensTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name: promptTokensTotalMetricName,
+			Help: "Total number of prompt tokens processed.",
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+
+	if err := s.metrics.registry.Register(s.metrics.promptTokensTotal); err != nil {
+		s.logger.Error(err, "prometheus prompt_tokens_total counter register failed")
+		return err
+	}
+
+	s.metrics.generationTokensTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name: generationTokensTotalMetricName,
+			Help: "Total number of generated tokens.",
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+
+	if err := s.metrics.registry.Register(s.metrics.generationTokensTotal); err != nil {
+		s.logger.Error(err, "prometheus generation_tokens_total counter register failed")
+		return err
+	}
+
 	s.metrics.requestSuccessTotal = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Subsystem: "",
@@ -343,9 +373,23 @@ func (s *VllmSimulator) setInitialPrometheusMetrics(cacheConfig *prometheus.Gaug
 	buckets := build125Buckets(s.config.MaxModelLen)
 	if s.config.FakeMetrics.RequestPromptTokens != nil {
 		s.initFakeHistogram(s.metrics.requestPromptTokens, buckets, s.config.FakeMetrics.RequestPromptTokens)
+		var promptTotal int64
+		if s.config.FakeMetrics.TotalPromptTokens != nil {
+			promptTotal = *s.config.FakeMetrics.TotalPromptTokens
+		} else {
+			promptTotal = estimateTokenTotal(s.config.FakeMetrics.RequestPromptTokens, buckets)
+		}
+		s.metrics.promptTokensTotal.WithLabelValues(modelName).Add(float64(promptTotal))
 	}
 	if s.config.FakeMetrics.RequestGenerationTokens != nil {
 		s.initFakeHistogram(s.metrics.requestParamsMaxTokens, buckets, s.config.FakeMetrics.RequestGenerationTokens)
+		var genTotal int64
+		if s.config.FakeMetrics.TotalGenerationTokens != nil {
+			genTotal = *s.config.FakeMetrics.TotalGenerationTokens
+		} else {
+			genTotal = estimateTokenTotal(s.config.FakeMetrics.RequestGenerationTokens, buckets)
+		}
+		s.metrics.generationTokensTotal.WithLabelValues(modelName).Add(float64(genTotal))
 	}
 	if s.config.FakeMetrics.RequestParamsMaxTokens != nil {
 		s.initFakeHistogram(s.metrics.requestGenerationTokens, buckets, s.config.FakeMetrics.RequestParamsMaxTokens)
@@ -727,6 +771,8 @@ func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
 	modelName := s.getDisplayedModelName(s.config.Model)
 	s.metrics.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
 	s.metrics.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
+	s.metrics.promptTokensTotal.WithLabelValues(modelName).Add(float64(promptTokens))
+	s.metrics.generationTokensTotal.WithLabelValues(modelName).Add(float64(generationTokens))
 	if maxTokens != nil {
 		s.metrics.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
 	}
@@ -764,3 +810,50 @@ func build125Buckets(maxValue int) []float64 {
 	}
 	return buckets
 }
+
+// estimateTokenTotal estimates the total number of tokens based on histogram bucket boundaries
+// and the number of requests in each bucket. It assumes that requests in a bucket have token
+// lengths uniformly distributed between the bucket's lower and upper bounds, and uses the
+// midpoint as a representative value for estimation.
+//
+// The last bucket is treated as [buckets[len(buckets)-1], +Inf), so its upper bound is approximated
+// as twice the lower bound for midpoint calculation.
+func estimateTokenTotal(counts []int, buckets []float64) int64 {
+	if len(counts) == 0 || len(buckets) == 0 {
+		return 0
+	}
+
+	nCounts := len(counts)
+	nBuckets := len(buckets)
+
+	var total int64
+	lower := 0.0
+
+	for i := 0; i < nCounts; i++ {
+		count := counts[i]
+		if count == 0 {
+			// Advance lower bound even if count is zero, to stay aligned with buckets
+			if i < nBuckets {
+				lower = buckets[i]
+			}
+			continue
+		}
+
+		var upper float64
+		if i < nBuckets {
+			// Bucket i corresponds to (lower, buckets[i]]
+			upper = buckets[i]
+		} else {
+			// Last bucket: (buckets[nBuckets-1], +Inf) → approximate upper = 2 * lower
+			upper = lower * 2.0
+		}
+
+		mid := (lower + upper) / 2.0
+		total += int64(float64(count) * mid)
+
+		// Update lower for next iteration
+		lower = upper
+	}
+
+	return total
+}
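A quick worked check of the midpoint estimate, using the "collaborator example" from the new unit test below: with counts [10, 20, 30] and the 1-2-5 bucket boundaries [1, 2, 5, ...], the per-bucket contributions are int64(10*0.5) = 5, int64(20*1.5) = 30 and int64(30*3.5) = 105, so estimateTokenTotal returns 140, which is the same value the fake-metrics test above asserts for vllm:prompt_tokens_total and vllm:generation_tokens_total.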

pkg/llm-d-inference-sim/metrics_test.go

Lines changed: 142 additions & 0 deletions
@@ -162,6 +162,7 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			}
 			Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), 1)))
 			Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), 1)))
+			Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 25`))

 			// request_generation_tokens
 			// We do not verify the distribution of the number of tokens generated per request,
@@ -710,12 +711,46 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, math.Inf(1), expectedCount)))
 			Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), expectedCount)))
 			Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), expectedCount)))
+			Expect(metrics).To(MatchRegexp(`vllm:generation_tokens_total{model_name="testmodel"} 140`))
+			Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 140`))

 			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="testmodel"} 0`))
 			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="testmodel"} 0`))
 			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="stop",model_name="testmodel"} 20`))
 			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="tool_calls",model_name="testmodel"} 0`))
 		})
+		It("Should use TotalPromptTokens and TotalGenerationTokens if provided", func() {
+			ctx := context.TODO()
+			args := []string{
+				"cmd", "--model", testModel, "--mode", common.ModeRandom,
+				"--fake-metrics",
+				`{` +
+					`"running-requests":5,` +
+					`"waiting-requests":2,` +
+					`"kv-cache-usage":0.1,` +
+					`"request-prompt-tokens":[100,200],` +
+					`"request-generation-tokens":[50,150],` +
+					`"total-prompt-tokens":12345,` + // explicit total
+					`"total-generation-tokens":67890,` + // explicit total
+					`"request-success-total":{"stop":10}` +
+					`}`,
+			}
+
+			client, err := startServerWithArgs(ctx, args)
+			Expect(err).NotTo(HaveOccurred())
+
+			resp, err := client.Get(metricsUrl)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(resp.StatusCode).To(Equal(http.StatusOK))
+
+			data, err := io.ReadAll(resp.Body)
+			Expect(err).NotTo(HaveOccurred())
+			metrics := string(data)
+
+			// Verify that the explicit totals are used
+			Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 12345`))
+			Expect(metrics).To(MatchRegexp(`vllm:generation_tokens_total{model_name="testmodel"} 67890`))
+		})
 	})

 	Context("fake ttft metrics", func() {
@@ -946,3 +981,110 @@ var _ = Describe("build125Buckets", Ordered, func() {
 		}
 	})
 })
+
+var _ = Describe("estimateTokenTotal", func() {
+	It("should correctly estimate total tokens from bucket counts and boundaries", func() {
+		tests := []struct {
+			name string
+			counts []int
+			buckets []float64
+			expected int64
+		}{
+			{
+				name: "empty counts",
+				counts: []int{},
+				buckets: []float64{1, 2, 5},
+				expected: 0,
+			},
+			{
+				name: "empty buckets",
+				counts: []int{10, 20},
+				buckets: []float64{},
+				expected: 0,
+			},
+			{
+				name: "only first bucket has requests: [0,10]",
+				counts: []int{1},
+				buckets: []float64{10},
+				expected: 5,
+				// bucket0: [0,10] → mid=5 → 1*5 = 5
+				// total = 5
+			},
+			{
+				name: "first two buckets: [0,10], (10,20]",
+				counts: []int{2, 3},
+				buckets: []float64{10, 20},
+				expected: 55,
+				// bucket0: [0,10] → mid=5 → 2*5 = 10
+				// bucket1: (10,20] → mid=15 → 3*15 = 45
+				// total = 10 + 45 = 55
+			},
+			{
+				name: "three finite buckets + last (+Inf) bucket",
+				counts: []int{1, 1, 1, 1},
+				buckets: []float64{10, 20, 50},
+				expected: 130,
+				// bucket0: [0,10] → mid=5 → 1*5 = 5
+				// bucket1: (10,20] → mid=15 → 1*15 = 15
+				// bucket2: (20,50] → mid=35 → 1*35 = 35
+				// bucket3: (50,+Inf) → upper=100, mid=75 → 1*75 = 75
+				// total = 5 + 15 + 35 + 75 = 130
+			},
+			{
+				name: "zero counts in some buckets",
+				counts: []int{0, 5, 0, 2},
+				buckets: []float64{1, 10, 100},
+				expected: 327,
+				// bucket1: (1,10] → mid=5.5 → 5*5.5 = 27.5 → truncated to 27
+				// bucket3: (100,+Inf) → upper=200, mid=150 → 2*150 = 300
+				// total = 27 + 300 = 327
+			},
+			{
+				name: "only last bucket has requests",
+				counts: []int{0, 0, 0, 4},
+				buckets: []float64{10, 100, 1000},
+				expected: 6000,
+				// bucket3: (1000,+Inf) → upper=2000, mid=1500 → 4*1500 = 6000
+				// total = 4*1500 = 6000
+			},
+			{
+				name: "non-integer midpoints truncated by int64 cast",
+				counts: []int{1},
+				buckets: []float64{1},
+				expected: 0,
+				// bucket0: [0,1] → mid=0.5 → 1*0.5 = 0.5 → truncated to 0
+			},
+			{
+				name: "collaborator example: [10,20,30] with long buckets",
+				counts: []int{10, 20, 30},
+				buckets: []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000},
+				expected: 140,
+				// bucket0: [0,1] → mid=0.5 → 10*0.5 = 5
+				// bucket1: (1,2] → mid=1.5 → 20*1.5 = 30
+				// bucket2: (2,5] → mid=3.5 → 30*3.5 = 105
+				// total = 5 + 30 + 105 = 140
+			},
+			{
+				name: "counts shorter than buckets (trailing zeros omitted)",
+				counts: []int{1, 1},
+				buckets: []float64{10, 100, 1000, 10000},
+				expected: 60,
+				// bucket0: [0,10] → mid=5 → 1*5 = 5
+				// bucket1: (10,100] → mid=55 → 1*55 = 55
+				// total = 5 + 55 = 60
+			},
+			{
+				name: "all zero counts",
+				counts: []int{0, 0, 0},
+				buckets: []float64{1, 10, 100},
+				expected: 0,
+				// all buckets have zero requests
+			},
+		}
+
+		for _, test := range tests {
+			result := estimateTokenTotal(test.counts, test.buckets)
+			Expect(result).To(Equal(test.expected), "test case: %s", test.name)
+		}
+	})
+})

pkg/llm-d-inference-sim/simulator.go

Lines changed: 4 additions & 0 deletions
@@ -136,6 +136,10 @@ type metricsData struct {
 	requestPromptTokens *prometheus.HistogramVec
 	// requestGenerationTokens is prometheus histogram for number of generated tokens in request
 	requestGenerationTokens *prometheus.HistogramVec
+	// promptTokensTotal is prometheus counter for total number of input (prompt) tokens
+	promptTokensTotal *prometheus.CounterVec
+	// generationTokensTotal is prometheus counter for total number of generated tokens
+	generationTokensTotal *prometheus.CounterVec
 	// maxNumGenerationTokens is prometheus histogram for maximum number of generated tokens in request
 	maxNumGenerationTokens *prometheus.HistogramVec
 	// requestParamsMaxTokens is prometheus histogram for 'max_tokens' parameter in request

0 commit comments
