
Commit f9ffada

feat(metrics): add generation_tokens_total and prompt_tokens_total metrics
Signed-off-by: CYJiang <googs1025@gmail.com>
1 parent 96d3f72 commit f9ffada

File tree

README.md
pkg/common/config.go
pkg/llm-d-inference-sim/metrics.go
pkg/llm-d-inference-sim/metrics_test.go
pkg/llm-d-inference-sim/simulator.go

5 files changed (+253 additions, -1 deletion)


README.md

Lines changed: 2 additions & 0 deletions
@@ -35,9 +35,11 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar
 | vllm:time_per_output_token_seconds | Histogram of time per output token in seconds |
 | vllm:inter_token_latency_seconds | Histogram of inter-token latency in seconds |
 | vllm:request_generation_tokens | Number of generation tokens processed |
+| vllm:generation_tokens_total | Total number of generated tokens. |
 | vllm:max_num_generation_tokens | Maximum number of requested generation tokens. Currently same as `vllm:request_generation_tokens` since always only one choice is returned |
 | vllm:request_params_max_tokens | Histogram of the max_tokens request parameter |
 | vllm:request_prompt_tokens | Number of prefill tokens processed |
+| vllm:prompt_tokens_total | Total number of prompt tokens processed |
 | vllm:request_success_total | Count of successfully processed requests |

 The simulated inference has no connection with the model and LoRA adapters specified in the command line parameters or via the /v1/load_lora_adapter HTTP REST endpoint. The /v1/models endpoint returns simulated results based on those same command line parameters and those loaded via the /v1/load_lora_adapter HTTP REST endpoint.
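Since both new metrics are plain Prometheus counters labeled by model name, a scrape of the simulator's /metrics endpoint exposes them in the standard counter format, and a rate() query turns them into token throughput. The lines below are only an illustration of what to expect; the model name and sample values are made up, not taken from this commit:

vllm:prompt_tokens_total{model_name="my-model"} 1234
vllm:generation_tokens_total{model_name="my-model"} 5678

# approximate generated tokens per second over the last 5 minutes (PromQL)
rate(vllm:generation_tokens_total{model_name="my-model"}[5m])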

pkg/common/config.go

Lines changed: 12 additions & 1 deletion
@@ -262,14 +262,25 @@ type Metrics struct {
 	// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf
 	TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
 	// RequestPromptTokens RequestGenerationTokens RequestParamsMaxTokens Histogram fake-observation arrays for init.
-	// Each value will be passed to Observe() once at start-up.
+	// Each value in these arrays is passed to Observe() exactly once at startup.
+	// By default:
+	// - The sum of RequestPromptTokens initializes the metric vllm:prompt_tokens_total.
+	// - The sum of RequestGenerationTokens initializes the metric vllm:generation_tokens_total.
+	//
+	// If TotalPromptTokens or TotalGenerationTokens are explicitly provided,
+	// they override the above sums and are used directly as the initial total token counts.
 	RequestPromptTokens []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"` // prompt-length samples
 	RequestGenerationTokens []int `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
 	RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
 	RequestMaxGenerationTokens []int `yaml:"request-max-generation-tokens" json:"request-max-generation-tokens"` // request_max_num_generation_tokens samples
 	// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
 	RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`

+	// TotalPromptTokens is the total number of prompt tokens processed
+	TotalPromptTokens *int64 `json:"total-prompt-tokens,omitempty"`
+	// TotalGenerationTokens is the total number of generated tokens
+	TotalGenerationTokens *int64 `json:"total-generation-tokens,omitempty"`
+
 	// Latency histograms - have same buckets upper boundaries in seconds are:
 	// 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
 	// 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf
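For illustration, since the two new fields carry only JSON tags, the natural way to set them is through the --fake-metrics JSON blob, exactly as the new test below does; the numbers here are arbitrary:

--fake-metrics '{"request-prompt-tokens":[100,200],"request-generation-tokens":[50,150],"total-prompt-tokens":12345,"total-generation-tokens":67890}'

With the explicit totals present, vllm:prompt_tokens_total starts at 12345 and vllm:generation_tokens_total at 67890; if they are omitted, the counters are instead seeded from the bucketed samples via estimateTokenTotal (added in metrics.go below).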

pkg/llm-d-inference-sim/metrics.go

Lines changed: 93 additions & 0 deletions
@@ -45,6 +45,8 @@ const (
 	generationTokensMetricName = "vllm:request_generation_tokens"
 	paramMaxTokensMetricName = "vllm:request_params_max_tokens"
 	promptTokensMetricName = "vllm:request_prompt_tokens"
+	generationTokensTotalMetricName = "vllm:generation_tokens_total"
+	promptTokensTotalMetricName = "vllm:prompt_tokens_total"
 	successTotalMetricName = "vllm:request_success_total"
 	loraRequestsMetricName = "vllm:lora_requests_info"
 	reqRunningMetricName = "vllm:num_requests_running"
@@ -292,6 +294,34 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}

+	s.metrics.promptTokensTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name: promptTokensTotalMetricName,
+			Help: "Total number of prompt tokens processed.",
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+
+	if err := s.metrics.registry.Register(s.metrics.promptTokensTotal); err != nil {
+		s.logger.Error(err, "prometheus prompt_tokens_total counter register failed")
+		return err
+	}
+
+	s.metrics.generationTokensTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name: generationTokensTotalMetricName,
+			Help: "Total number of generated tokens.",
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+
+	if err := s.metrics.registry.Register(s.metrics.generationTokensTotal); err != nil {
+		s.logger.Error(err, "prometheus generation_tokens_total counter register failed")
+		return err
+	}
+
 	s.metrics.requestSuccessTotal = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Subsystem: "",
@@ -343,9 +373,23 @@ func (s *VllmSimulator) setInitialPrometheusMetrics(cacheConfig *prometheus.Gaug
 	buckets := build125Buckets(s.config.MaxModelLen)
 	if s.config.FakeMetrics.RequestPromptTokens != nil {
 		s.initFakeHistogram(s.metrics.requestPromptTokens, buckets, s.config.FakeMetrics.RequestPromptTokens)
+		var promptTotal int64
+		if s.config.FakeMetrics.TotalPromptTokens != nil {
+			promptTotal = *s.config.FakeMetrics.TotalPromptTokens
+		} else {
+			promptTotal = estimateTokenTotal(s.config.FakeMetrics.RequestPromptTokens, buckets)
+		}
+		s.metrics.promptTokensTotal.WithLabelValues(modelName).Add(float64(promptTotal))
 	}
 	if s.config.FakeMetrics.RequestGenerationTokens != nil {
 		s.initFakeHistogram(s.metrics.requestParamsMaxTokens, buckets, s.config.FakeMetrics.RequestGenerationTokens)
+		var genTotal int64
+		if s.config.FakeMetrics.TotalGenerationTokens != nil {
+			genTotal = *s.config.FakeMetrics.TotalGenerationTokens
+		} else {
+			genTotal = estimateTokenTotal(s.config.FakeMetrics.RequestGenerationTokens, buckets)
+		}
+		s.metrics.generationTokensTotal.WithLabelValues(modelName).Add(float64(genTotal))
 	}
 	if s.config.FakeMetrics.RequestParamsMaxTokens != nil {
 		s.initFakeHistogram(s.metrics.requestGenerationTokens, buckets, s.config.FakeMetrics.RequestParamsMaxTokens)
@@ -727,6 +771,8 @@ func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
 	modelName := s.getDisplayedModelName(s.config.Model)
 	s.metrics.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
 	s.metrics.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
+	s.metrics.promptTokensTotal.WithLabelValues(modelName).Add(float64(promptTokens))
+	s.metrics.generationTokensTotal.WithLabelValues(modelName).Add(float64(generationTokens))
 	if maxTokens != nil {
 		s.metrics.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
 	}
@@ -764,3 +810,50 @@ func build125Buckets(maxValue int) []float64 {
 	}
 	return buckets
 }
+
+// estimateTokenTotal estimates the total number of tokens based on histogram bucket boundaries
+// and the number of requests in each bucket. It assumes that requests in a bucket have token
+// lengths uniformly distributed between the bucket's lower and upper bounds, and uses the
+// midpoint as a representative value for estimation.
+//
+// The last bucket is treated as [buckets[len(buckets)-1], +Inf), so its upper bound is approximated
+// as twice the lower bound for midpoint calculation.
+func estimateTokenTotal(counts []int, buckets []float64) int64 {
+	if len(counts) == 0 || len(buckets) == 0 {
+		return 0
+	}
+
+	nCounts := len(counts)
+	nBuckets := len(buckets)
+
+	var total int64
+	lower := 0.0
+
+	for i := 0; i < nCounts; i++ {
+		count := counts[i]
+		if count == 0 {
+			// Advance lower bound even if count is zero, to stay aligned with buckets
+			if i < nBuckets {
+				lower = buckets[i]
+			}
+			continue
+		}
+
+		var upper float64
+		if i < nBuckets {
+			// Bucket i corresponds to (lower, buckets[i]]
+			upper = buckets[i]
+		} else {
+			// Last bucket: (buckets[nBuckets-1], +Inf) → approximate upper = 2 * lower
+			upper = lower * 2.0
+		}
+
+		mid := (lower + upper) / 2.0
+		total += int64(float64(count) * mid)
+
+		// Update lower for next iteration
+		lower = upper
+	}
+
+	return total
+}
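A quick worked check of the midpoint estimate, using the "collaborator example" from the new unit test below: with counts [10, 20, 30] and the 1-2-5 bucket boundaries [1, 2, 5, ...], the per-bucket contributions are int64(10*0.5) = 5, int64(20*1.5) = 30 and int64(30*3.5) = 105, so estimateTokenTotal returns 140, which is the same value the fake-metrics test above asserts for vllm:prompt_tokens_total and vllm:generation_tokens_total.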

pkg/llm-d-inference-sim/metrics_test.go

Lines changed: 142 additions & 0 deletions
@@ -162,6 +162,7 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			}
 			Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), 1)))
 			Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), 1)))
+			Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 25`))

 			// request_generation_tokens
 			// We do not verify the distribution of the number of tokens generated per request,
@@ -710,12 +711,46 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, math.Inf(1), expectedCount)))
 			Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), expectedCount)))
 			Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), expectedCount)))
+			Expect(metrics).To(MatchRegexp(`vllm:generation_tokens_total{model_name="testmodel"} 140`))
+			Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 140`))

 			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="testmodel"} 0`))
 			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="testmodel"} 0`))
 			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="stop",model_name="testmodel"} 20`))
 			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="tool_calls",model_name="testmodel"} 0`))
 		})
+		It("Should use TotalPromptTokens and TotalGenerationTokens if provided", func() {
+			ctx := context.TODO()
+			args := []string{
+				"cmd", "--model", testModel, "--mode", common.ModeRandom,
+				"--fake-metrics",
+				`{` +
+					`"running-requests":5,` +
+					`"waiting-requests":2,` +
+					`"kv-cache-usage":0.1,` +
+					`"request-prompt-tokens":[100,200],` +
+					`"request-generation-tokens":[50,150],` +
+					`"total-prompt-tokens":12345,` + // explicit total
+					`"total-generation-tokens":67890,` + // explicit total
+					`"request-success-total":{"stop":10}` +
+					`}`,
+			}
+
+			client, err := startServerWithArgs(ctx, args)
+			Expect(err).NotTo(HaveOccurred())
+
+			resp, err := client.Get(metricsUrl)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(resp.StatusCode).To(Equal(http.StatusOK))
+
+			data, err := io.ReadAll(resp.Body)
+			Expect(err).NotTo(HaveOccurred())
+			metrics := string(data)
+
+			// Verify that the explicit totals are used
+			Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 12345`))
+			Expect(metrics).To(MatchRegexp(`vllm:generation_tokens_total{model_name="testmodel"} 67890`))
+		})
 	})

 	Context("fake ttft metrics", func() {
@@ -946,3 +981,110 @@ var _ = Describe("build125Buckets", Ordered, func() {
 		}
 	})
 })
+
+var _ = Describe("estimateTokenTotal", func() {
+	It("should correctly estimate total tokens from bucket counts and boundaries", func() {
+		tests := []struct {
+			name string
+			counts []int
+			buckets []float64
+			expected int64
+		}{
+			{
+				name: "empty counts",
+				counts: []int{},
+				buckets: []float64{1, 2, 5},
+				expected: 0,
+			},
+			{
+				name: "empty buckets",
+				counts: []int{10, 20},
+				buckets: []float64{},
+				expected: 0,
+			},
+			{
+				name: "only first bucket has requests: [0,10]",
+				counts: []int{1},
+				buckets: []float64{10},
+				expected: 5,
+				// bucket0: [0,10] → mid=5 → 1*5 = 5
+				// total = 5
+			},
+			{
+				name: "first two buckets: [0,10], (10,20]",
+				counts: []int{2, 3},
+				buckets: []float64{10, 20},
+				expected: 55,
+				// bucket0: [0,10] → mid=5 → 2*5 = 10
+				// bucket1: (10,20] → mid=15 → 3*15 = 45
+				// total = 10 + 45 = 55
+			},
+			{
+				name: "three finite buckets + last (+Inf) bucket",
+				counts: []int{1, 1, 1, 1},
+				buckets: []float64{10, 20, 50},
+				expected: 130,
+				// bucket0: [0,10] → mid=5 → 1*5 = 5
+				// bucket1: (10,20] → mid=15 → 1*15 = 15
+				// bucket2: (20,50] → mid=35 → 1*35 = 35
+				// bucket3: (50,+Inf) → upper=100, mid=75 → 1*75 = 75
+				// total = 5 + 15 + 35 + 75 = 130
+			},
+			{
+				name: "zero counts in some buckets",
+				counts: []int{0, 5, 0, 2},
+				buckets: []float64{1, 10, 100},
+				expected: 327,
+				// bucket1: (1,10] → mid=5.5 → 5*5.5 = 27.5 → truncated to 27
+				// bucket3: (100,+Inf) → upper=200, mid=150 → 2*150 = 300
+				// total = 27 + 300 = 327
+			},
+			{
+				name: "only last bucket has requests",
+				counts: []int{0, 0, 0, 4},
+				buckets: []float64{10, 100, 1000},
+				expected: 6000,
+				// bucket3: (1000,+Inf) → upper=2000, mid=1500 → 4*1500 = 6000
+				// total = 4*1500 = 6000
+			},
+			{
+				name: "non-integer midpoints truncated by int64 cast",
+				counts: []int{1},
+				buckets: []float64{1},
+				expected: 0,
+				// bucket0: [0,1] → mid=0.5 → 1*0.5 = 0.5 → truncated to 0
+			},
+			{
+				name: "collaborator example: [10,20,30] with long buckets",
+				counts: []int{10, 20, 30},
+				buckets: []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000},
+				expected: 140,
+				// bucket0: [0,1] → mid=0.5 → 10*0.5 = 5
+				// bucket1: (1,2] → mid=1.5 → 20*1.5 = 30
+				// bucket2: (2,5] → mid=3.5 → 30*3.5 = 105
+				// total = 5 + 30 + 105 = 140
+			},
+			{
+				name: "counts shorter than buckets (trailing zeros omitted)",
+				counts: []int{1, 1},
+				buckets: []float64{10, 100, 1000, 10000},
+				expected: 60,
+				// bucket0: [0,10] → mid=5 → 1*5 = 5
+				// bucket1: (10,100] → mid=55 → 1*55 = 55
+				// total = 5 + 55 = 60
+			},
+			{
+				name: "all zero counts",
+				counts: []int{0, 0, 0},
+				buckets: []float64{1, 10, 100},
+				expected: 0,
+				// all buckets have zero requests
+			},
+		}
+
+		for _, test := range tests {
+			result := estimateTokenTotal(test.counts, test.buckets)
+			Expect(result).To(Equal(test.expected), "test case: %s", test.name)
+		}
+	})
+})

pkg/llm-d-inference-sim/simulator.go

Lines changed: 4 additions & 0 deletions
@@ -136,6 +136,10 @@ type metricsData struct {
 	requestPromptTokens *prometheus.HistogramVec
 	// requestGenerationTokens is prometheus histogram for number of generated tokens in request
 	requestGenerationTokens *prometheus.HistogramVec
+	// promptTokensTotal is prometheus counter for total number of input (prompt) tokens
+	promptTokensTotal *prometheus.CounterVec
+	// generationTokensTotal is prometheus counter for total number of generated tokens
+	generationTokensTotal *prometheus.CounterVec
 	// maxNumGenerationTokens is prometheus histogram for maximum number of generated tokens in request
 	maxNumGenerationTokens *prometheus.HistogramVec
 	// requestParamsMaxTokens is prometheus histogram for 'max_tokens' parameter in request

0 commit comments
