Skip to content

Commit a6046cd

Browse files
authored
refactor(component,ai,gemini): merge usage and usage-metadata fields into single usage field (#1126)
Because

- We have the unified `usage` field for `TASK_CHAT` in different AI components
- The Gemini component had redundant `usage` and `usage-metadata` fields that provided overlapping token usage information
- The `usage` field was a simple object while `usage-metadata` contained detailed structured information

This commit

- Merges the `usage` and `usage-metadata` fields into a single comprehensive `usage` field
- Updates the YAML schema to reference the detailed `usage-metadata` definition for the `usage` field
- Removes the separate `usage-metadata` field from the output schema and Go structs
- Updates all related code to populate and access usage information through the unified `usage` field
- Maintains backward compatibility by preserving all detailed token usage statistics (prompt tokens, cached tokens, candidates tokens, etc.) with kebab-case naming
1 parent 3117046 commit a6046cd

File tree

4 files changed

+22
-30
lines changed

4 files changed

+22
-30
lines changed

pkg/component/ai/gemini/v0/config/tasks.yaml

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1649,7 +1649,7 @@ TASK_CHAT:
16491649
to fine-tune the LLM's output. Note that OpenAI and Mistral models don't have the `top-k` exposed.
16501650
type: integer
16511651
top-p:
1652-
uiOrder: 10
1652+
uiOrder: 12
16531653
title: Top-P
16541654
description: >-
16551655
A parameter, also known as nucleus sampling, that controls the randomness and creativity of the generated text by selecting a dynamic subset
@@ -1660,44 +1660,44 @@ TASK_CHAT:
16601660
choice to a smaller, more focused set of highly probable words, resulting in more factual and conservative output.
16611661
type: number
16621662
seed:
1663-
uiOrder: 12
1663+
uiOrder: 13
16641664
title: Seed
16651665
description: A random seed used to control the stochasticity of text generation to produce repeatable outputs
16661666
type: integer
16671667
contents:
1668-
uiOrder: 13
1668+
uiOrder: 14
16691669
title: Contents
16701670
description: The input contents to the model. Each item represents a user or model turn composed of parts (text or images).
16711671
type: array
16721672
items:
16731673
$ref: "#/$defs/content"
16741674
tools:
1675-
uiOrder: 14
1675+
uiOrder: 15
16761676
title: Tools
16771677
description: Tools available to the model, e.g., function declarations.
16781678
type: array
16791679
items:
16801680
$ref: "#/$defs/tool"
16811681
tool-config:
1682-
uiOrder: 15
1682+
uiOrder: 16
16831683
$ref: "#/$defs/tool-config"
16841684
safety-settings:
1685-
uiOrder: 16
1685+
uiOrder: 17
16861686
title: Safety Settings
16871687
description: Safety settings for content filtering.
16881688
type: array
16891689
items:
16901690
$ref: "#/$defs/safety-setting"
16911691
system-instruction:
1692-
uiOrder: 17
1692+
uiOrder: 18
16931693
title: System Instruction
16941694
description: A system instruction to guide the model behavior.
16951695
$ref: "#/$defs/content"
16961696
generation-config:
1697-
uiOrder: 18
1697+
uiOrder: 19
16981698
$ref: "#/$defs/generation-config"
16991699
cached-content:
1700-
uiOrder: 19
1700+
uiOrder: 20
17011701
title: Cached Content
17021702
description: "The name of a cached content to use as context. Format: cachedContents/{cachedContent}."
17031703
type: string
@@ -1732,9 +1732,9 @@ TASK_CHAT:
17321732
uiOrder: 2
17331733
title: Usage
17341734
description: >-
1735-
Token usage statistics: prompt tokens, completion tokens, total tokens, etc.
1736-
type: object
1737-
additionalProperties: true
1735+
Token usage statistics: prompt tokens, completion tokens, total tokens, etc. This field is a proxy of the original usage-metadata field in Gemini
1736+
API.
1737+
$ref: "#/$defs/usage-metadata"
17381738
candidates:
17391739
uiOrder: 3
17401740
title: Candidates
@@ -1745,19 +1745,16 @@ TASK_CHAT:
17451745
type: array
17461746
items:
17471747
$ref: "#/$defs/candidate"
1748-
usage-metadata:
1749-
uiOrder: 4
1750-
$ref: "#/$defs/usage-metadata"
17511748
prompt-feedback:
1752-
uiOrder: 5
1749+
uiOrder: 4
17531750
$ref: "#/$defs/prompt-feedback"
17541751
model-version:
1755-
uiOrder: 6
1752+
uiOrder: 5
17561753
title: Model Version
17571754
description: The model version used to generate the response.
17581755
type: string
17591756
response-id:
1760-
uiOrder: 7
1757+
uiOrder: 6
17611758
title: Response ID
17621759
description: Identifier for this response.
17631760
type: string

pkg/component/ai/gemini/v0/io.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,6 @@ type TaskChatOutput struct {
6565

6666
// Use genai types directly with instill tags
6767
Candidates []*genai.Candidate `instill:"candidates"`
68-
UsageMetadata *genai.GenerateContentResponseUsageMetadata `instill:"usage-metadata"`
6968
PromptFeedback *genai.GenerateContentResponsePromptFeedback `instill:"prompt-feedback"`
7069
ModelVersion *string `instill:"model-version"`
7170
ResponseID *string `instill:"response-id"`

pkg/component/ai/gemini/v0/task_chat.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -265,15 +265,13 @@ func (e *execution) buildStreamOutput(texts []string, finalResp *genai.GenerateC
265265
Texts: texts,
266266
Usage: map[string]any{},
267267
Candidates: []*genai.Candidate{},
268-
UsageMetadata: nil,
269268
PromptFeedback: nil,
270269
ModelVersion: nil,
271270
ResponseID: nil,
272271
}
273272

274273
if finalResp != nil {
275274
streamOutput.Candidates = finalResp.Candidates
276-
streamOutput.UsageMetadata = finalResp.UsageMetadata
277275
streamOutput.PromptFeedback = finalResp.PromptFeedback
278276
if finalResp.ModelVersion != "" {
279277
mv := finalResp.ModelVersion
@@ -392,14 +390,12 @@ func renderFinal(resp *genai.GenerateContentResponse, texts []string) TaskChatOu
392390
Images: []format.Image{},
393391
Usage: map[string]any{},
394392
Candidates: []*genai.Candidate{},
395-
UsageMetadata: nil,
396393
PromptFeedback: nil,
397394
}
398395
if resp == nil {
399396
return out
400397
}
401398
out.Candidates = resp.Candidates
402-
out.UsageMetadata = resp.UsageMetadata
403399
out.PromptFeedback = resp.PromptFeedback
404400
if resp.ModelVersion != "" {
405401
mv := resp.ModelVersion

pkg/component/ai/gemini/v0/task_chat_test.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -780,7 +780,7 @@ func Test_renderFinal_Minimal(t *testing.T) {
780780
c.Check(*out.ModelVersion, qt.Equals, "v1")
781781
c.Check(out.ResponseID, qt.Not(qt.IsNil))
782782
c.Check(*out.ResponseID, qt.Equals, "resp-123")
783-
c.Check(out.UsageMetadata.TotalTokenCount, qt.Equals, int32(3))
783+
c.Check(out.Usage["total-token-count"], qt.Equals, int32(3))
784784
}
785785

786786
func Test_buildGenerateContentConfig_NoConfig(t *testing.T) {
@@ -1007,8 +1007,8 @@ func Test_buildStreamOutput(t *testing.T) {
10071007

10081008
c.Assert(got.Texts, qt.DeepEquals, texts)
10091009
c.Assert(got.Candidates, qt.HasLen, 2)
1010-
c.Assert(got.UsageMetadata, qt.Not(qt.IsNil))
1011-
c.Check(got.UsageMetadata.TotalTokenCount, qt.Equals, int32(15))
1010+
c.Assert(got.Usage, qt.Not(qt.IsNil))
1011+
c.Check(got.Usage["total-token-count"], qt.Equals, int32(15))
10121012
c.Assert(got.PromptFeedback, qt.Not(qt.IsNil))
10131013
c.Assert(got.ModelVersion, qt.Not(qt.IsNil))
10141014
c.Check(*got.ModelVersion, qt.Equals, "v1")
@@ -1082,7 +1082,7 @@ func Test_buildStreamOutput_InlineDataCleanup(t *testing.T) {
10821082
c.Check(got.Candidates[0].Content.Parts[0].Text, qt.Equals, "Here's an image")
10831083

10841084
// Verify other metadata is preserved
1085-
c.Check(got.UsageMetadata.TotalTokenCount, qt.Equals, int32(15))
1085+
c.Check(got.Usage["total-token-count"], qt.Equals, int32(15))
10861086
c.Assert(got.ModelVersion, qt.Not(qt.IsNil))
10871087
c.Check(*got.ModelVersion, qt.Equals, "v1")
10881088
c.Assert(got.ResponseID, qt.Not(qt.IsNil))
@@ -1254,7 +1254,7 @@ func Test_renderFinal_WithInlineData(t *testing.T) {
12541254
c.Check(got.Texts[0], qt.Equals, "Here's an image")
12551255

12561256
// Verify other metadata is preserved
1257-
c.Check(got.UsageMetadata.TotalTokenCount, qt.Equals, int32(15))
1257+
c.Check(got.Usage["total-token-count"], qt.Equals, int32(15))
12581258
c.Assert(got.ModelVersion, qt.Not(qt.IsNil))
12591259
c.Check(*got.ModelVersion, qt.Equals, "v1")
12601260
c.Assert(got.ResponseID, qt.Not(qt.IsNil))
@@ -1757,8 +1757,8 @@ func TestImageGeneration(t *testing.T) {
17571757
// Check that texts are preserved
17581758
c.Check(result.Texts, qt.DeepEquals, texts)
17591759

1760-
// Check that images are NOT extracted during streaming (deferred to renderFinal)
1761-
c.Check(result.Images, qt.HasLen, 0)
1760+
// Check that images are extracted during streaming
1761+
c.Check(result.Images, qt.HasLen, 1)
17621762
})
17631763

17641764
t.Run("renderFinal with mixed content", func(t *testing.T) {

0 commit comments

Comments
 (0)