From 822f8511e38c899f32a21857442959ffc0fa130e Mon Sep 17 00:00:00 2001 From: Ayoub Abidi Date: Sun, 8 Feb 2026 12:35:46 +0100 Subject: [PATCH 1/3] feat: add sparrow_instance_info metric for ownership metadata (#354) --- README.md | 45 +++++++- chart/values.yaml | 11 ++ cmd/run.go | 5 + docs/dev.md | 24 ++++ docs/ownership-metadata-design.md | 37 ++++++ docs/sparrow_run.md | 3 + pkg/config/config.go | 17 +++ pkg/sparrow/metrics/instance_info.go | 29 +++++ pkg/sparrow/metrics/instance_info_test.go | 133 ++++++++++++++++++++++ pkg/sparrow/run.go | 8 ++ 10 files changed, 310 insertions(+), 2 deletions(-) create mode 100644 docs/ownership-metadata-design.md create mode 100644 pkg/sparrow/metrics/instance_info.go create mode 100644 pkg/sparrow/metrics/instance_info_test.go diff --git a/README.md b/README.md index 1c55566f..dfd16530 100644 --- a/README.md +++ b/README.md @@ -124,8 +124,8 @@ ConfigMap. Use the following configuration values to use a runtime configuration by the `http` loader: ```yaml -startupConfig: - ... +sparrowConfig: + name: sparrow.example.com loader: type: http interval: 30s @@ -199,12 +199,32 @@ export SPARROW_ANY_OTHER_OPTION="Some value" Just write out the path to the attribute, delimited by `_`. +#### Instance metadata (ownership) + +You can optionally configure ownership and platform metadata so that operators can identify which team owns a Sparrow instance and route alerts correctly. This is exposed as a single Prometheus info-style metric, `sparrow_instance_info`, emitted once per instance at startup. + +| Field | Type | Description | +| ----- | ---- | ----------- | +| `metadata.team.name` | string | Team name owning this instance | +| `metadata.team.email` | string | Team contact email (e.g. for alert routing) | +| `metadata.platform` | string | Platform identifier (e.g. `k8s-prod-eu`, `aws-eu-west-1`) | + +All metadata fields are optional. Omitted fields appear as empty labels. Example env vars: `SPARROW_METADATA_TEAM_NAME`, `SPARROW_METADATA_TEAM_EMAIL`, `SPARROW_METADATA_PLATFORM`. + #### Example Startup Configuration ```yaml # DNS sparrow is exposed on name: sparrow.example.com +# Optional: ownership and platform metadata (exposed as sparrow_instance_info Prometheus metric) +# Used for alert routing and correlating metrics across multi-team deployments. +# metadata: +# team: +# name: platform-team +# email: platform@example.com +# platform: k8s-prod-eu + # Selects and configures a loader to continuously fetch the checks' configuration at runtime loader: # Defines which loader to use. Options: "file | http" @@ -642,6 +662,27 @@ at `/v1/metrics/{check-name}`. The API's definition is available at `/openapi`. The `sparrow` provides a `/metrics` endpoint to expose application metrics. In addition to runtime information, the sparrow provides specific metrics for each check. Refer to the [Checks](#checks) section for more detailed information. +#### Instance info metric + +- `sparrow_instance_info` + - Type: Gauge (info-style, value always 1) + - Description: Ownership and platform metadata for this Sparrow instance. Emitted once per instance at startup. + - Labels: `team_name`, `team_email`, `platform`, `instance_name` + - Use for: Alert routing, identifying instance owners, correlating metrics across multiple Sparrow deployments. + +Example PromQL for multi-team dashboards: + +```promql +# All Sparrow instances with their owner and platform +sparrow_instance_info + +# Instances by team +sparrow_instance_info{team_name="platform-team"} + +# Join check metrics with ownership (e.g. health by team) +sparrow_health_up * on(instance) group_left(team_name, team_email, platform) sparrow_instance_info +``` + ### Prometheus Integration The `sparrow` metrics API is designed to be compatible with Prometheus. To integrate `sparrow` with Prometheus, add the following scrape configuration to your Prometheus configuration file: diff --git a/chart/values.yaml b/chart/values.yaml index db2fe6fd..dac98cfb 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -148,6 +148,12 @@ envFromSecrets: # see: https://github.com/telekom/sparrow/blob/main/docs/sparrow_run.md sparrowConfig: name: sparrow.com + # Optional ownership metadata (exposed as sparrow_instance_info Prometheus metric) + # metadata: + # team: + # name: platform-team + # email: platform@example.com + # platform: k8s-prod-eu loader: type: file interval: 30s @@ -168,6 +174,11 @@ sparrowConfig: # count: 5 # file: # path: /config/checks.yaml +# metadata: +# team: +# name: string +# email: string +# platform: string # targetManager: # type: gitlab # checkInterval: 300s diff --git a/cmd/run.go b/cmd/run.go index 85bd41d3..2017c34c 100644 --- a/cmd/run.go +++ b/cmd/run.go @@ -47,6 +47,11 @@ func NewCmdRun() *cobra.Command { NewFlag("loader.http.retry.delay", "loaderHttpRetryDelay").Duration().Bind(cmd, defaultHttpRetryDelay, "http loader: The initial delay between retries in seconds") NewFlag("loader.file.path", "loaderFilePath").String().Bind(cmd, "config.yaml", "file loader: The path to the file to read the runtime config from") + // Optional ownership metadata (exposed as sparrow_instance_info metric) + NewFlag("metadata.team.name", "metadataTeamName").String().Bind(cmd, "", "metadata: Team name owning this instance (for alert routing)") + NewFlag("metadata.team.email", "metadataTeamEmail").String().Bind(cmd, "", "metadata: Team contact email") + NewFlag("metadata.platform", "metadataPlatform").String().Bind(cmd, "", "metadata: Platform identifier (e.g. k8s-prod-eu)") + return cmd } diff --git a/docs/dev.md b/docs/dev.md index 94fc8a97..45db73d9 100644 --- a/docs/dev.md +++ b/docs/dev.md @@ -6,6 +6,7 @@ SPDX-License-Identifier: CC-BY-4.0 # Developer Documentation +- [Running tests](#running-tests) - [Using `reuse` for license handling](#using-reuse-for-license-handling) - [Install](#install) - [CLI Usage](#cli-usage) @@ -44,3 +45,26 @@ Use the `CC-BY-4.0` SPDX license header for documentation (eg. `.md`) files. In case a license header is not suitable for a file or directory (eg. auto-generated files) the `REUSE.toml` configuration file can be used. Add the file path to the `REUSE.toml` or create a new `[[annotations]]` section (see `./REUSE.toml` for an example). + +## Running tests + +Unit tests are run with the Go toolchain and modules specified in `go.mod`. + +**First run:** If the Go version in `go.mod` is not yet installed, the first `go test` (or any `go` command) will download the toolchain and dependencies. This can take **2–5 minutes** depending on the network. If your IDE or test runner uses a short timeout (e.g. 30–60 seconds), the first run may time out; use a longer timeout for the first run or run from a terminal: + +```bash +go test ./... +``` + +**Run only metrics (e.g. instance_info) tests:** + +```bash +go test ./pkg/sparrow/metrics/ -run 'InstanceInfo' -v -count=1 +``` + +**Run all tests with race detector and coverage (as in CI):** + +```bash +go mod download +go test --race --count=1 --coverprofile cover.out -v ./... +``` diff --git a/docs/ownership-metadata-design.md b/docs/ownership-metadata-design.md new file mode 100644 index 00000000..95b86a83 --- /dev/null +++ b/docs/ownership-metadata-design.md @@ -0,0 +1,37 @@ + + +# Design: Sparrow Instance Ownership Metadata (Issue #354) + +## Summary + +Sparrow exposes optional ownership and platform metadata via a dedicated Prometheus **info metric** (`sparrow_instance_info`), so operators can identify which team owns each instance, route alerts correctly, and correlate metrics across multiple Sparrow deployments. + +## Why Option 1 (Dedicated Info Metric) + +- **Prometheus best practice:** Info-style metrics (gauge with value 1 and descriptive labels) are the standard way to expose static attributes (e.g. `kube_pod_info`, `node_uname_info`). They avoid polluting every time series with extra labels and keep cardinality under control. +- **No impact on existing metrics:** We do **not** add metadata labels to check metrics (health, latency, DNS, traceroute). That would multiply cardinality and complicate existing dashboards. Joining with `sparrow_instance_info` in PromQL when needed is explicit and flexible. +- **Works without target manager:** The metric is registered at startup from startup config only. It does not depend on the target manager or any runtime component. +- **Single emission per instance:** The metric is registered once during `sparrow.New()` and emits one time series per instance. No periodic updates or lifecycle complexity. + +## Implementation Choices + +1. **Config shape:** `metadata.team.name`, `metadata.team.email`, `metadata.platform` under startup config, with Viper binding for flags/env/file. All fields optional; omitted fields appear as empty labels so the metric is always present with at least `instance_name`. +2. **Registration point:** Instance info is registered in `sparrow.New()` after the metrics provider is created. Registration failure is logged but non-fatal so the process still starts. +3. **Metrics package:** A small `RegisterInstanceInfo(registry, instanceName, teamName, teamEmail, platform)` in `pkg/sparrow/metrics` keeps the metrics package independent of `pkg/config` and makes the behaviour easy to test. +4. **Helm:** Metadata is optional under `sparrowConfig` in values; backward compatibility is preserved when metadata is not provided. + +## Prometheus Usage + +- **Alert routing:** Alertmanager or routing rules can use `sparrow_instance_info` to add team/email/platform to alerts. +- **Dashboards:** `group_left(team_name, team_email, platform) sparrow_instance_info` joins ownership onto any Sparrow metric by scrape `instance`. +- **Multi-team views:** Filter or group by `team_name` or `platform` without changing existing metric names or labels. + +## Deliverables + +- **Code:** `pkg/config` (Metadata, TeamMetadata), `cmd/run` (flags), `pkg/sparrow/metrics` (RegisterInstanceInfo + test), `pkg/sparrow` (registration in New). +- **Helm:** `chart/values.yaml` extended with commented metadata example; config is merged into existing sparrowConfig. +- **Docs:** README (metadata config table, instance info metric, PromQL examples), `docs/sparrow_run.md` (new flags), this design summary. diff --git a/docs/sparrow_run.md b/docs/sparrow_run.md index ae454c0d..a7c6cfad 100644 --- a/docs/sparrow_run.md +++ b/docs/sparrow_run.md @@ -15,6 +15,9 @@ sparrow run [flags] ``` --apiAddress string api: The address the server is listening on (default ":8080") -h, --help help for run + --metadataPlatform string metadata: Platform identifier (e.g. k8s-prod-eu) + --metadataTeamEmail string metadata: Team contact email + --metadataTeamName string metadata: Team name owning this instance (for alert routing) --loaderFilePath string file loader: The path to the file to read the runtime config from (default "config.yaml") --loaderHttpRetryCount int http loader: Amount of retries trying to load the configuration (default 3) --loaderHttpRetryDelay duration http loader: The initial delay between retries in seconds (default 1s) diff --git a/pkg/config/config.go b/pkg/config/config.go index 7f8bd2a9..71849fad 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -14,9 +14,26 @@ import ( "github.com/telekom/sparrow/pkg/api" ) +// Metadata holds optional ownership and platform information for the Sparrow instance. +// Exposed via the sparrow_instance_info Prometheus metric for alert routing and multi-team operability. +type Metadata struct { + // Team holds team ownership information + Team TeamMetadata `yaml:"team" mapstructure:"team"` + // Platform identifies the deployment platform (e.g. k8s-prod-eu, aws-eu-west-1) + Platform string `yaml:"platform" mapstructure:"platform"` +} + +// TeamMetadata holds team name and contact for ownership +type TeamMetadata struct { + Name string `yaml:"name" mapstructure:"name"` + Email string `yaml:"email" mapstructure:"email"` +} + type Config struct { // SparrowName is the DNS name of the sparrow SparrowName string `yaml:"name" mapstructure:"name"` + // Metadata is optional ownership and platform metadata (exposed as sparrow_instance_info) + Metadata Metadata `yaml:"metadata" mapstructure:"metadata"` // Loader is the configuration for the loader Loader LoaderConfig `yaml:"loader" mapstructure:"loader"` // Api is the configuration for the api server diff --git a/pkg/sparrow/metrics/instance_info.go b/pkg/sparrow/metrics/instance_info.go new file mode 100644 index 00000000..64740638 --- /dev/null +++ b/pkg/sparrow/metrics/instance_info.go @@ -0,0 +1,29 @@ +// SPDX-FileCopyrightText: 2025 Deutsche Telekom IT GmbH +// +// SPDX-License-Identifier: Apache-2.0 + +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +const ( + instanceInfoMetricName = "sparrow_instance_info" + instanceInfoHelp = "Ownership and platform metadata for this Sparrow instance. Emitted once per instance for alert routing and multi-team correlation." +) + +// RegisterInstanceInfo registers the sparrow_instance_info info-style metric on the given registry. +// It sets the gauge to 1 with labels team_name, team_email, platform, and instance_name. +// Empty strings are allowed for optional metadata; instanceName should be the Sparrow DNS name. +func RegisterInstanceInfo(registry *prometheus.Registry, instanceName, teamName, teamEmail, platform string) error { + info := prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: instanceInfoMetricName, + Help: instanceInfoHelp, + }, + []string{"team_name", "team_email", "platform", "instance_name"}, + ) + info.WithLabelValues(teamName, teamEmail, platform, instanceName).Set(1) + return registry.Register(info) +} diff --git a/pkg/sparrow/metrics/instance_info_test.go b/pkg/sparrow/metrics/instance_info_test.go new file mode 100644 index 00000000..65de3fcd --- /dev/null +++ b/pkg/sparrow/metrics/instance_info_test.go @@ -0,0 +1,133 @@ +// SPDX-FileCopyrightText: 2025 Deutsche Telekom IT GmbH +// +// SPDX-License-Identifier: Apache-2.0 + +package metrics + +import ( + "errors" + "testing" + + "github.com/prometheus/client_golang/prometheus" +) + +func TestRegisterInstanceInfo(t *testing.T) { + registry := prometheus.NewRegistry() + + err := RegisterInstanceInfo(registry, "sparrow.example.com", "platform-team", "platform@example.com", "k8s-prod-eu") + if err != nil { + t.Fatalf("RegisterInstanceInfo() error = %v", err) + } + + metrics, err := registry.Gather() + if err != nil { + t.Fatalf("Gather() error = %v", err) + } + + var found bool + for _, mf := range metrics { + if mf.GetName() == instanceInfoMetricName { + found = true + if len(mf.GetMetric()) != 1 { + t.Errorf("expected 1 metric, got %d", len(mf.GetMetric())) + } + for _, m := range mf.GetMetric() { + if m.GetGauge().GetValue() != 1 { + t.Errorf("expected value 1, got %v", m.GetGauge().GetValue()) + } + labels := make(map[string]string) + for _, lp := range m.GetLabel() { + labels[lp.GetName()] = lp.GetValue() + } + if labels["instance_name"] != "sparrow.example.com" || labels["team_name"] != "platform-team" || + labels["team_email"] != "platform@example.com" || labels["platform"] != "k8s-prod-eu" { + t.Errorf("unexpected labels: %v", labels) + } + } + break + } + } + if !found { + t.Error("sparrow_instance_info metric not found in registry") + } +} + +func TestRegisterInstanceInfo_emptyMetadata(t *testing.T) { + registry := prometheus.NewRegistry() + + err := RegisterInstanceInfo(registry, "sparrow.example.com", "", "", "") + if err != nil { + t.Fatalf("RegisterInstanceInfo() with empty metadata error = %v", err) + } + + metrics, err := registry.Gather() + if err != nil { + t.Fatalf("Gather() error = %v", err) + } + + for _, mf := range metrics { + if mf.GetName() == instanceInfoMetricName { + for _, m := range mf.GetMetric() { + labels := make(map[string]string) + for _, lp := range m.GetLabel() { + labels[lp.GetName()] = lp.GetValue() + } + if labels["instance_name"] != "sparrow.example.com" { + t.Errorf("expected instance_name=sparrow.example.com, got %v", labels) + } + } + return + } + } + t.Error("sparrow_instance_info metric not found") +} + +func TestRegisterInstanceInfo_doubleRegistration(t *testing.T) { + registry := prometheus.NewRegistry() + + err := RegisterInstanceInfo(registry, "sparrow.example.com", "team-a", "team-a@example.com", "k8s-prod") + if err != nil { + t.Fatalf("first RegisterInstanceInfo() error = %v", err) + } + + err2 := RegisterInstanceInfo(registry, "other.example.com", "team-b", "team-b@example.com", "k8s-staging") + if err2 == nil { + t.Fatal("expected second RegisterInstanceInfo to return an error (duplicate collector)") + } + + var alreadyErr prometheus.AlreadyRegisteredError + if !errors.As(err2, &alreadyErr) { + t.Errorf("expected AlreadyRegisteredError, got %T: %v", err2, err2) + } +} + +func TestRegisterInstanceInfo_partialMetadata(t *testing.T) { + registry := prometheus.NewRegistry() + + err := RegisterInstanceInfo(registry, "sparrow.example.com", "platform-team", "", "") + if err != nil { + t.Fatalf("RegisterInstanceInfo() with partial metadata error = %v", err) + } + + metrics, err := registry.Gather() + if err != nil { + t.Fatalf("Gather() error = %v", err) + } + + for _, mf := range metrics { + if mf.GetName() == instanceInfoMetricName { + for _, m := range mf.GetMetric() { + labels := make(map[string]string) + for _, lp := range m.GetLabel() { + labels[lp.GetName()] = lp.GetValue() + } + if labels["instance_name"] != "sparrow.example.com" || labels["team_name"] != "platform-team" || + labels["team_email"] != "" || labels["platform"] != "" { + t.Errorf("unexpected labels (expected partial metadata): %v", labels) + } + } + return + } + } + t.Error("sparrow_instance_info metric not found") +} diff --git a/pkg/sparrow/run.go b/pkg/sparrow/run.go index 17fcc44d..d0496dd1 100644 --- a/pkg/sparrow/run.go +++ b/pkg/sparrow/run.go @@ -80,6 +80,14 @@ func New(cfg *config.Config) *Sparrow { } sparrow.loader = config.NewLoader(cfg, sparrow.cRuntime) + // Register ownership metadata as Prometheus info metric (once per instance) + if err := metrics.RegisterInstanceInfo(m.GetRegistry(), cfg.SparrowName, cfg.Metadata.Team.Name, cfg.Metadata.Team.Email, cfg.Metadata.Platform); err != nil { + // Non-fatal: instance can run without the info metric + // Logging requires context; use background with logger for startup + log := logger.FromContext(context.Background()) + log.Error("Failed to register sparrow_instance_info metric", "error", err) + } + return sparrow } From b232c9292172a6eceefa8ebb16134c495e122238 Mon Sep 17 00:00:00 2001 From: Ayoub Abidi Date: Sat, 28 Feb 2026 08:21:01 +0100 Subject: [PATCH 2/3] refactor: update instance metadata handling in sparrow_instance_info metric --- README.md | 28 ++++++-------- chart/values.yaml | 14 +++---- cmd/run.go | 5 --- docs/ownership-metadata-design.md | 16 ++++---- docs/sparrow_run.md | 3 -- pkg/config/config.go | 20 +++------- pkg/sparrow/metrics/instance_info.go | 47 ++++++++++++++++++++--- pkg/sparrow/metrics/instance_info_test.go | 27 +++++++++---- pkg/sparrow/run.go | 4 +- 9 files changed, 95 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index dfd16530..bdc794b7 100644 --- a/README.md +++ b/README.md @@ -199,17 +199,11 @@ export SPARROW_ANY_OTHER_OPTION="Some value" Just write out the path to the attribute, delimited by `_`. -#### Instance metadata (ownership) +#### Instance metadata (optional) -You can optionally configure ownership and platform metadata so that operators can identify which team owns a Sparrow instance and route alerts correctly. This is exposed as a single Prometheus info-style metric, `sparrow_instance_info`, emitted once per instance at startup. +You can optionally configure instance metadata so operators can identify owners, route alerts, and correlate metrics across deployments. This metadata is exposed as a single Prometheus info-style metric, `sparrow_instance_info`, emitted once per instance at startup. -| Field | Type | Description | -| ----- | ---- | ----------- | -| `metadata.team.name` | string | Team name owning this instance | -| `metadata.team.email` | string | Team contact email (e.g. for alert routing) | -| `metadata.platform` | string | Platform identifier (e.g. `k8s-prod-eu`, `aws-eu-west-1`) | - -All metadata fields are optional. Omitted fields appear as empty labels. Example env vars: `SPARROW_METADATA_TEAM_NAME`, `SPARROW_METADATA_TEAM_EMAIL`, `SPARROW_METADATA_PLATFORM`. +`metadata` is a map of arbitrary key-value pairs. Keys must be valid Prometheus label names (e.g. `team_name`, `platform`, `region`, `environment`). The key `instance_name` is reserved and set automatically from the Sparrow DNS name. #### Example Startup Configuration @@ -217,13 +211,13 @@ All metadata fields are optional. Omitted fields appear as empty labels. Example # DNS sparrow is exposed on name: sparrow.example.com -# Optional: ownership and platform metadata (exposed as sparrow_instance_info Prometheus metric) -# Used for alert routing and correlating metrics across multi-team deployments. +# Optional: instance metadata (exposed as sparrow_instance_info Prometheus metric) +# Used for alert routing and correlating metrics across deployments. # metadata: -# team: -# name: platform-team -# email: platform@example.com +# team_name: platform-team +# team_email: platform@example.com # platform: k8s-prod-eu +# region: eu-west-1 # Selects and configures a loader to continuously fetch the checks' configuration at runtime loader: @@ -662,12 +656,12 @@ at `/v1/metrics/{check-name}`. The API's definition is available at `/openapi`. The `sparrow` provides a `/metrics` endpoint to expose application metrics. In addition to runtime information, the sparrow provides specific metrics for each check. Refer to the [Checks](#checks) section for more detailed information. -#### Instance info metric +### Instance info metric - `sparrow_instance_info` - Type: Gauge (info-style, value always 1) - - Description: Ownership and platform metadata for this Sparrow instance. Emitted once per instance at startup. - - Labels: `team_name`, `team_email`, `platform`, `instance_name` + - Description: Instance metadata for this Sparrow instance. Emitted once per instance at startup. + - Labels: `instance_name` plus any user-defined metadata keys - Use for: Alert routing, identifying instance owners, correlating metrics across multiple Sparrow deployments. Example PromQL for multi-team dashboards: diff --git a/chart/values.yaml b/chart/values.yaml index dac98cfb..4d35ce7f 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -148,12 +148,12 @@ envFromSecrets: # see: https://github.com/telekom/sparrow/blob/main/docs/sparrow_run.md sparrowConfig: name: sparrow.com - # Optional ownership metadata (exposed as sparrow_instance_info Prometheus metric) + # Optional instance metadata (exposed as sparrow_instance_info Prometheus metric) # metadata: - # team: - # name: platform-team - # email: platform@example.com + # team_name: platform-team + # team_email: platform@example.com # platform: k8s-prod-eu + # region: eu-west-1 loader: type: file interval: 30s @@ -175,10 +175,10 @@ sparrowConfig: # file: # path: /config/checks.yaml # metadata: -# team: -# name: string -# email: string +# team_name: string +# team_email: string # platform: string +# region: string # targetManager: # type: gitlab # checkInterval: 300s diff --git a/cmd/run.go b/cmd/run.go index 2017c34c..85bd41d3 100644 --- a/cmd/run.go +++ b/cmd/run.go @@ -47,11 +47,6 @@ func NewCmdRun() *cobra.Command { NewFlag("loader.http.retry.delay", "loaderHttpRetryDelay").Duration().Bind(cmd, defaultHttpRetryDelay, "http loader: The initial delay between retries in seconds") NewFlag("loader.file.path", "loaderFilePath").String().Bind(cmd, "config.yaml", "file loader: The path to the file to read the runtime config from") - // Optional ownership metadata (exposed as sparrow_instance_info metric) - NewFlag("metadata.team.name", "metadataTeamName").String().Bind(cmd, "", "metadata: Team name owning this instance (for alert routing)") - NewFlag("metadata.team.email", "metadataTeamEmail").String().Bind(cmd, "", "metadata: Team contact email") - NewFlag("metadata.platform", "metadataPlatform").String().Bind(cmd, "", "metadata: Platform identifier (e.g. k8s-prod-eu)") - return cmd } diff --git a/docs/ownership-metadata-design.md b/docs/ownership-metadata-design.md index 95b86a83..176e607c 100644 --- a/docs/ownership-metadata-design.md +++ b/docs/ownership-metadata-design.md @@ -8,7 +8,7 @@ SPDX-License-Identifier: CC-BY-4.0 ## Summary -Sparrow exposes optional ownership and platform metadata via a dedicated Prometheus **info metric** (`sparrow_instance_info`), so operators can identify which team owns each instance, route alerts correctly, and correlate metrics across multiple Sparrow deployments. +Sparrow exposes optional instance metadata via a dedicated Prometheus **info metric** (`sparrow_instance_info`), so operators can identify owners, route alerts correctly, and correlate metrics across multiple Sparrow deployments. ## Why Option 1 (Dedicated Info Metric) @@ -19,19 +19,19 @@ Sparrow exposes optional ownership and platform metadata via a dedicated Prometh ## Implementation Choices -1. **Config shape:** `metadata.team.name`, `metadata.team.email`, `metadata.platform` under startup config, with Viper binding for flags/env/file. All fields optional; omitted fields appear as empty labels so the metric is always present with at least `instance_name`. +1. **Config shape:** `metadata` is a map of arbitrary key-value pairs under startup config (e.g. `team_name`, `platform`, `region`). All fields optional; omitted keys are not emitted as labels. The key `instance_name` is reserved and set from the Sparrow DNS name. 2. **Registration point:** Instance info is registered in `sparrow.New()` after the metrics provider is created. Registration failure is logged but non-fatal so the process still starts. -3. **Metrics package:** A small `RegisterInstanceInfo(registry, instanceName, teamName, teamEmail, platform)` in `pkg/sparrow/metrics` keeps the metrics package independent of `pkg/config` and makes the behaviour easy to test. +3. **Metrics package:** A small `RegisterInstanceInfo(registry, instanceName, metadata)` in `pkg/sparrow/metrics` keeps the metrics package independent of `pkg/config` and makes the behaviour easy to test. 4. **Helm:** Metadata is optional under `sparrowConfig` in values; backward compatibility is preserved when metadata is not provided. ## Prometheus Usage -- **Alert routing:** Alertmanager or routing rules can use `sparrow_instance_info` to add team/email/platform to alerts. -- **Dashboards:** `group_left(team_name, team_email, platform) sparrow_instance_info` joins ownership onto any Sparrow metric by scrape `instance`. -- **Multi-team views:** Filter or group by `team_name` or `platform` without changing existing metric names or labels. +- **Alert routing:** Alertmanager or routing rules can use `sparrow_instance_info` to add ownership metadata to alerts. +- **Dashboards:** `group_left(...) sparrow_instance_info` joins metadata onto any Sparrow metric by scrape `instance`. +- **Multi-team views:** Filter or group by `team_name`, `platform`, `region`, or any other configured labels without changing existing metric names or labels. ## Deliverables -- **Code:** `pkg/config` (Metadata, TeamMetadata), `cmd/run` (flags), `pkg/sparrow/metrics` (RegisterInstanceInfo + test), `pkg/sparrow` (registration in New). +- **Code:** `pkg/config` (Metadata map), `pkg/sparrow/metrics` (RegisterInstanceInfo + test), `pkg/sparrow` (registration in New). - **Helm:** `chart/values.yaml` extended with commented metadata example; config is merged into existing sparrowConfig. -- **Docs:** README (metadata config table, instance info metric, PromQL examples), `docs/sparrow_run.md` (new flags), this design summary. +- **Docs:** README (metadata config, instance info metric, PromQL examples), this design summary. diff --git a/docs/sparrow_run.md b/docs/sparrow_run.md index a7c6cfad..ae454c0d 100644 --- a/docs/sparrow_run.md +++ b/docs/sparrow_run.md @@ -15,9 +15,6 @@ sparrow run [flags] ``` --apiAddress string api: The address the server is listening on (default ":8080") -h, --help help for run - --metadataPlatform string metadata: Platform identifier (e.g. k8s-prod-eu) - --metadataTeamEmail string metadata: Team contact email - --metadataTeamName string metadata: Team name owning this instance (for alert routing) --loaderFilePath string file loader: The path to the file to read the runtime config from (default "config.yaml") --loaderHttpRetryCount int http loader: Amount of retries trying to load the configuration (default 3) --loaderHttpRetryDelay duration http loader: The initial delay between retries in seconds (default 1s) diff --git a/pkg/config/config.go b/pkg/config/config.go index 71849fad..a3cd286b 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -14,25 +14,15 @@ import ( "github.com/telekom/sparrow/pkg/api" ) -// Metadata holds optional ownership and platform information for the Sparrow instance. -// Exposed via the sparrow_instance_info Prometheus metric for alert routing and multi-team operability. -type Metadata struct { - // Team holds team ownership information - Team TeamMetadata `yaml:"team" mapstructure:"team"` - // Platform identifies the deployment platform (e.g. k8s-prod-eu, aws-eu-west-1) - Platform string `yaml:"platform" mapstructure:"platform"` -} - -// TeamMetadata holds team name and contact for ownership -type TeamMetadata struct { - Name string `yaml:"name" mapstructure:"name"` - Email string `yaml:"email" mapstructure:"email"` -} +// Metadata holds arbitrary key-value metadata for the Sparrow instance. +// Exposed via the sparrow_instance_info Prometheus metric labels. +// Keys must be valid Prometheus label names. +type Metadata map[string]string type Config struct { // SparrowName is the DNS name of the sparrow SparrowName string `yaml:"name" mapstructure:"name"` - // Metadata is optional ownership and platform metadata (exposed as sparrow_instance_info) + // Metadata is optional instance metadata (exposed as sparrow_instance_info) Metadata Metadata `yaml:"metadata" mapstructure:"metadata"` // Loader is the configuration for the loader Loader LoaderConfig `yaml:"loader" mapstructure:"loader"` diff --git a/pkg/sparrow/metrics/instance_info.go b/pkg/sparrow/metrics/instance_info.go index 64740638..d95b4e2c 100644 --- a/pkg/sparrow/metrics/instance_info.go +++ b/pkg/sparrow/metrics/instance_info.go @@ -5,6 +5,10 @@ package metrics import ( + "fmt" + "regexp" + "sort" + "github.com/prometheus/client_golang/prometheus" ) @@ -14,16 +18,49 @@ const ( ) // RegisterInstanceInfo registers the sparrow_instance_info info-style metric on the given registry. -// It sets the gauge to 1 with labels team_name, team_email, platform, and instance_name. -// Empty strings are allowed for optional metadata; instanceName should be the Sparrow DNS name. -func RegisterInstanceInfo(registry *prometheus.Registry, instanceName, teamName, teamEmail, platform string) error { +// It sets the gauge to 1 with labels instance_name and any user-defined metadata keys. +// Empty strings are allowed for metadata values; instanceName should be the Sparrow DNS name. +// Metadata keys must be valid Prometheus label names and must not include "instance_name". +func RegisterInstanceInfo(registry *prometheus.Registry, instanceName string, metadata map[string]string) error { + if metadata == nil { + metadata = map[string]string{} + } + + keys := make([]string, 0, len(metadata)) + for k := range metadata { + keys = append(keys, k) + } + sort.Strings(keys) + + labels := make([]string, 0, len(keys)+1) + values := make([]string, 0, len(keys)+1) + labels = append(labels, "instance_name") + values = append(values, instanceName) + + for _, k := range keys { + if k == "instance_name" { + return fmt.Errorf("metadata key %q is reserved", k) + } + if !isValidLabelName(k) { + return fmt.Errorf("metadata key %q is not a valid Prometheus label name", k) + } + labels = append(labels, k) + values = append(values, metadata[k]) + } + info := prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: instanceInfoMetricName, Help: instanceInfoHelp, }, - []string{"team_name", "team_email", "platform", "instance_name"}, + labels, ) - info.WithLabelValues(teamName, teamEmail, platform, instanceName).Set(1) + info.WithLabelValues(values...).Set(1) return registry.Register(info) } + +var labelNamePattern = regexp.MustCompile(`^[a-zA-Z_][a-zA-Z0-9_]*$`) + +func isValidLabelName(name string) bool { + return labelNamePattern.MatchString(name) +} diff --git a/pkg/sparrow/metrics/instance_info_test.go b/pkg/sparrow/metrics/instance_info_test.go index 65de3fcd..d6bfe75e 100644 --- a/pkg/sparrow/metrics/instance_info_test.go +++ b/pkg/sparrow/metrics/instance_info_test.go @@ -14,7 +14,11 @@ import ( func TestRegisterInstanceInfo(t *testing.T) { registry := prometheus.NewRegistry() - err := RegisterInstanceInfo(registry, "sparrow.example.com", "platform-team", "platform@example.com", "k8s-prod-eu") + err := RegisterInstanceInfo(registry, "sparrow.example.com", map[string]string{ + "team_name": "platform-team", + "team_email": "platform@example.com", + "platform": "k8s-prod-eu", + }) if err != nil { t.Fatalf("RegisterInstanceInfo() error = %v", err) } @@ -55,7 +59,7 @@ func TestRegisterInstanceInfo(t *testing.T) { func TestRegisterInstanceInfo_emptyMetadata(t *testing.T) { registry := prometheus.NewRegistry() - err := RegisterInstanceInfo(registry, "sparrow.example.com", "", "", "") + err := RegisterInstanceInfo(registry, "sparrow.example.com", nil) if err != nil { t.Fatalf("RegisterInstanceInfo() with empty metadata error = %v", err) } @@ -85,12 +89,20 @@ func TestRegisterInstanceInfo_emptyMetadata(t *testing.T) { func TestRegisterInstanceInfo_doubleRegistration(t *testing.T) { registry := prometheus.NewRegistry() - err := RegisterInstanceInfo(registry, "sparrow.example.com", "team-a", "team-a@example.com", "k8s-prod") + err := RegisterInstanceInfo(registry, "sparrow.example.com", map[string]string{ + "team_name": "team-a", + "team_email": "team-a@example.com", + "platform": "k8s-prod", + }) if err != nil { t.Fatalf("first RegisterInstanceInfo() error = %v", err) } - err2 := RegisterInstanceInfo(registry, "other.example.com", "team-b", "team-b@example.com", "k8s-staging") + err2 := RegisterInstanceInfo(registry, "other.example.com", map[string]string{ + "team_name": "team-b", + "team_email": "team-b@example.com", + "platform": "k8s-staging", + }) if err2 == nil { t.Fatal("expected second RegisterInstanceInfo to return an error (duplicate collector)") } @@ -104,7 +116,9 @@ func TestRegisterInstanceInfo_doubleRegistration(t *testing.T) { func TestRegisterInstanceInfo_partialMetadata(t *testing.T) { registry := prometheus.NewRegistry() - err := RegisterInstanceInfo(registry, "sparrow.example.com", "platform-team", "", "") + err := RegisterInstanceInfo(registry, "sparrow.example.com", map[string]string{ + "team_name": "platform-team", + }) if err != nil { t.Fatalf("RegisterInstanceInfo() with partial metadata error = %v", err) } @@ -121,8 +135,7 @@ func TestRegisterInstanceInfo_partialMetadata(t *testing.T) { for _, lp := range m.GetLabel() { labels[lp.GetName()] = lp.GetValue() } - if labels["instance_name"] != "sparrow.example.com" || labels["team_name"] != "platform-team" || - labels["team_email"] != "" || labels["platform"] != "" { + if labels["instance_name"] != "sparrow.example.com" || labels["team_name"] != "platform-team" { t.Errorf("unexpected labels (expected partial metadata): %v", labels) } } diff --git a/pkg/sparrow/run.go b/pkg/sparrow/run.go index d0496dd1..e0cb841c 100644 --- a/pkg/sparrow/run.go +++ b/pkg/sparrow/run.go @@ -80,8 +80,8 @@ func New(cfg *config.Config) *Sparrow { } sparrow.loader = config.NewLoader(cfg, sparrow.cRuntime) - // Register ownership metadata as Prometheus info metric (once per instance) - if err := metrics.RegisterInstanceInfo(m.GetRegistry(), cfg.SparrowName, cfg.Metadata.Team.Name, cfg.Metadata.Team.Email, cfg.Metadata.Platform); err != nil { + // Register instance metadata as Prometheus info metric (once per instance) + if err := metrics.RegisterInstanceInfo(m.GetRegistry(), cfg.SparrowName, cfg.Metadata); err != nil { // Non-fatal: instance can run without the info metric // Logging requires context; use background with logger for startup log := logger.FromContext(context.Background()) From 9fe31fa860b03b666ec7075892c8c2bef6710874 Mon Sep 17 00:00:00 2001 From: Ayoub Abidi Date: Sat, 7 Mar 2026 09:55:28 +0100 Subject: [PATCH 3/3] chore: update Prometheus common dependency and enhance instance metadata documentation --- README.md | 3 +- docs/ownership-metadata-design.md | 2 +- go.mod | 2 +- pkg/sparrow/metrics/instance_info.go | 10 +- pkg/sparrow/metrics/instance_info_test.go | 120 ++++++++++++++-------- 5 files changed, 86 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index bdc794b7..3465be0d 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ SPDX-License-Identifier: CC-BY-4.0 - [Image](#image) - [Configuration](#configuration) - [Startup](#startup) + - [Instance metadata (optional)](#instance-metadata-optional) - [Example Startup Configuration](#example-startup-configuration) - [Loader](#loader) - [Logging Configuration](#logging-configuration) @@ -203,7 +204,7 @@ Just write out the path to the attribute, delimited by `_`. You can optionally configure instance metadata so operators can identify owners, route alerts, and correlate metrics across deployments. This metadata is exposed as a single Prometheus info-style metric, `sparrow_instance_info`, emitted once per instance at startup. -`metadata` is a map of arbitrary key-value pairs. Keys must be valid Prometheus label names (e.g. `team_name`, `platform`, `region`, `environment`). The key `instance_name` is reserved and set automatically from the Sparrow DNS name. +`metadata` is a map of arbitrary key-value pairs. Keys must be valid Prometheus label names (e.g. `team_name`, `platform`, `region`, `environment`). The key `instance_name` is reserved and automatically set to the sparrow's name. #### Example Startup Configuration diff --git a/docs/ownership-metadata-design.md b/docs/ownership-metadata-design.md index 176e607c..89c68ecc 100644 --- a/docs/ownership-metadata-design.md +++ b/docs/ownership-metadata-design.md @@ -10,7 +10,7 @@ SPDX-License-Identifier: CC-BY-4.0 Sparrow exposes optional instance metadata via a dedicated Prometheus **info metric** (`sparrow_instance_info`), so operators can identify owners, route alerts correctly, and correlate metrics across multiple Sparrow deployments. -## Why Option 1 (Dedicated Info Metric) +## Why a Dedicated Info Metric - **Prometheus best practice:** Info-style metrics (gauge with value 1 and descriptive labels) are the standard way to expose static attributes (e.g. `kube_pod_info`, `node_uname_info`). They avoid polluting every time series with extra labels and keep cardinality under control. - **No impact on existing metrics:** We do **not** add metadata labels to check metrics (health, latency, DNS, traceroute). That would multiply cardinality and complicate existing dashboards. Joining with `sparrow_instance_info` in PromQL when needed is explicit and flexible. diff --git a/go.mod b/go.mod index 6d569608..55e7fea3 100644 --- a/go.mod +++ b/go.mod @@ -14,6 +14,7 @@ require ( github.com/google/go-cmp v0.7.0 github.com/jarcoal/httpmock v1.4.1 github.com/prometheus/client_golang v1.23.2 + github.com/prometheus/common v0.66.1 github.com/spf13/cobra v1.10.2 github.com/spf13/viper v1.21.0 github.com/stretchr/testify v1.11.1 @@ -55,7 +56,6 @@ require ( github.com/perimeterx/marshmallow v1.1.5 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/sagikazarmark/locafero v0.11.0 // indirect diff --git a/pkg/sparrow/metrics/instance_info.go b/pkg/sparrow/metrics/instance_info.go index d95b4e2c..6e01aa20 100644 --- a/pkg/sparrow/metrics/instance_info.go +++ b/pkg/sparrow/metrics/instance_info.go @@ -6,10 +6,10 @@ package metrics import ( "fmt" - "regexp" "sort" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/model" ) const ( @@ -41,7 +41,7 @@ func RegisterInstanceInfo(registry *prometheus.Registry, instanceName string, me if k == "instance_name" { return fmt.Errorf("metadata key %q is reserved", k) } - if !isValidLabelName(k) { + if !model.UTF8Validation.IsValidLabelName(k) { return fmt.Errorf("metadata key %q is not a valid Prometheus label name", k) } labels = append(labels, k) @@ -58,9 +58,3 @@ func RegisterInstanceInfo(registry *prometheus.Registry, instanceName string, me info.WithLabelValues(values...).Set(1) return registry.Register(info) } - -var labelNamePattern = regexp.MustCompile(`^[a-zA-Z_][a-zA-Z0-9_]*$`) - -func isValidLabelName(name string) bool { - return labelNamePattern.MatchString(name) -} diff --git a/pkg/sparrow/metrics/instance_info_test.go b/pkg/sparrow/metrics/instance_info_test.go index d6bfe75e..dba1a6e1 100644 --- a/pkg/sparrow/metrics/instance_info_test.go +++ b/pkg/sparrow/metrics/instance_info_test.go @@ -6,6 +6,7 @@ package metrics import ( "errors" + "maps" "testing" "github.com/prometheus/client_golang/prometheus" @@ -28,27 +29,37 @@ func TestRegisterInstanceInfo(t *testing.T) { t.Fatalf("Gather() error = %v", err) } - var found bool + expectedLabels := map[string]string{ + "instance_name": "sparrow.example.com", + "team_name": "platform-team", + "team_email": "platform@example.com", + "platform": "k8s-prod-eu", + } + + found := false for _, mf := range metrics { - if mf.GetName() == instanceInfoMetricName { - found = true - if len(mf.GetMetric()) != 1 { - t.Errorf("expected 1 metric, got %d", len(mf.GetMetric())) + if mf.GetName() != instanceInfoMetricName { + continue + } + found = true + + if len(mf.GetMetric()) != 1 { + t.Errorf("expected 1 metric, got %d", len(mf.GetMetric())) + } + + const expectedValue = 1 + for _, m := range mf.GetMetric() { + if m.GetGauge().GetValue() != expectedValue { + t.Errorf("%q metric value expected %d, got %f", instanceInfoMetricName, expectedValue, m.GetGauge().GetValue()) + } + + labels := make(map[string]string) + for _, lp := range m.GetLabel() { + labels[lp.GetName()] = lp.GetValue() } - for _, m := range mf.GetMetric() { - if m.GetGauge().GetValue() != 1 { - t.Errorf("expected value 1, got %v", m.GetGauge().GetValue()) - } - labels := make(map[string]string) - for _, lp := range m.GetLabel() { - labels[lp.GetName()] = lp.GetValue() - } - if labels["instance_name"] != "sparrow.example.com" || labels["team_name"] != "platform-team" || - labels["team_email"] != "platform@example.com" || labels["platform"] != "k8s-prod-eu" { - t.Errorf("unexpected labels: %v", labels) - } + if !maps.Equal(expectedLabels, labels) { + t.Errorf("expected labels %v, got %v", expectedLabels, labels) } - break } } if !found { @@ -69,21 +80,35 @@ func TestRegisterInstanceInfo_emptyMetadata(t *testing.T) { t.Fatalf("Gather() error = %v", err) } + expectedLabels := map[string]string{ + "instance_name": "sparrow.example.com", + } + + found := false for _, mf := range metrics { - if mf.GetName() == instanceInfoMetricName { - for _, m := range mf.GetMetric() { - labels := make(map[string]string) - for _, lp := range m.GetLabel() { - labels[lp.GetName()] = lp.GetValue() - } - if labels["instance_name"] != "sparrow.example.com" { - t.Errorf("expected instance_name=sparrow.example.com, got %v", labels) - } + if mf.GetName() != instanceInfoMetricName { + continue + } + found = true + + const expectedValue = 1 + for _, m := range mf.GetMetric() { + if m.GetGauge().GetValue() != expectedValue { + t.Errorf("%q metric value expected %d, got %f", instanceInfoMetricName, expectedValue, m.GetGauge().GetValue()) + } + + labels := make(map[string]string) + for _, lp := range m.GetLabel() { + labels[lp.GetName()] = lp.GetValue() + } + if !maps.Equal(expectedLabels, labels) { + t.Errorf("expected labels %v, got %v", expectedLabels, labels) } - return } } - t.Error("sparrow_instance_info metric not found") + if !found { + t.Error("sparrow_instance_info metric not found in registry") + } } func TestRegisterInstanceInfo_doubleRegistration(t *testing.T) { @@ -128,19 +153,34 @@ func TestRegisterInstanceInfo_partialMetadata(t *testing.T) { t.Fatalf("Gather() error = %v", err) } + expectedLabels := map[string]string{ + "instance_name": "sparrow.example.com", + "team_name": "platform-team", + } + + found := false for _, mf := range metrics { - if mf.GetName() == instanceInfoMetricName { - for _, m := range mf.GetMetric() { - labels := make(map[string]string) - for _, lp := range m.GetLabel() { - labels[lp.GetName()] = lp.GetValue() - } - if labels["instance_name"] != "sparrow.example.com" || labels["team_name"] != "platform-team" { - t.Errorf("unexpected labels (expected partial metadata): %v", labels) - } + if mf.GetName() != instanceInfoMetricName { + continue + } + found = true + + const expectedValue = 1 + for _, m := range mf.GetMetric() { + if m.GetGauge().GetValue() != expectedValue { + t.Errorf("%q metric value expected %d, got %f", instanceInfoMetricName, expectedValue, m.GetGauge().GetValue()) + } + + labels := make(map[string]string) + for _, lp := range m.GetLabel() { + labels[lp.GetName()] = lp.GetValue() + } + if !maps.Equal(expectedLabels, labels) { + t.Errorf("expected labels %v, got %v", expectedLabels, labels) } - return } } - t.Error("sparrow_instance_info metric not found") + if !found { + t.Error("sparrow_instance_info metric not found in registry") + } }