diff --git a/README.md b/README.md index 1c55566f..3465be0d 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ SPDX-License-Identifier: CC-BY-4.0 - [Image](#image) - [Configuration](#configuration) - [Startup](#startup) + - [Instance metadata (optional)](#instance-metadata-optional) - [Example Startup Configuration](#example-startup-configuration) - [Loader](#loader) - [Logging Configuration](#logging-configuration) @@ -124,8 +125,8 @@ ConfigMap. Use the following configuration values to use a runtime configuration by the `http` loader: ```yaml -startupConfig: - ... +sparrowConfig: + name: sparrow.example.com loader: type: http interval: 30s @@ -199,12 +200,26 @@ export SPARROW_ANY_OTHER_OPTION="Some value" Just write out the path to the attribute, delimited by `_`. +#### Instance metadata (optional) + +You can configure instance metadata so operators can identify owners, route alerts, and correlate metrics across deployments. This metadata is exposed as a single Prometheus info-style metric, `sparrow_instance_info`, emitted once per instance at startup. + +`metadata` is a map of arbitrary key-value pairs. Keys must be valid Prometheus label names (e.g. `team_name`, `platform`, `region`, `environment`). The key `instance_name` is reserved and automatically set to the sparrow's name; do not set it yourself. If a key is invalid or reserved, `sparrow_instance_info` is not registered: Sparrow logs an error and still starts, but the metric is missing. + #### Example Startup Configuration ```yaml # DNS sparrow is exposed on name: sparrow.example.com +# Optional: instance metadata (exposed as sparrow_instance_info Prometheus metric) +# Used for alert routing and correlating metrics across deployments. +# metadata: +# team_name: platform-team +# team_email: platform@example.com +# platform: k8s-prod-eu +# region: eu-west-1 + # Selects and configures a loader to continuously fetch the checks' configuration at runtime loader: # Defines which loader to use. Options: "file | http" @@ -642,6 +657,27 @@ at `/v1/metrics/{check-name}`. The API's definition is available at `/openapi`. 
The `sparrow` provides a `/metrics` endpoint to expose application metrics. In addition to runtime information, the sparrow provides specific metrics for each check. Refer to the [Checks](#checks) section for more detailed information. +### Instance info metric + +- `sparrow_instance_info` + - Type: Gauge (info-style, value always 1) + - Description: Instance metadata for this Sparrow instance. Emitted once per instance at startup. + - Labels: `instance_name` plus any user-defined metadata keys + - Use for: Alert routing, identifying instance owners, correlating metrics across multiple Sparrow deployments. + +Example PromQL for multi-team dashboards: + +```promql +# All Sparrow instances with their owner and platform +sparrow_instance_info + +# Instances by team +sparrow_instance_info{team_name="platform-team"} + +# Join check metrics with ownership (e.g. health by team) +sparrow_health_up * on(instance) group_left(team_name, team_email, platform) sparrow_instance_info +``` + ### Prometheus Integration The `sparrow` metrics API is designed to be compatible with Prometheus. 
To integrate `sparrow` with Prometheus, add the following scrape configuration to your Prometheus configuration file: diff --git a/chart/values.yaml b/chart/values.yaml index db2fe6fd..4d35ce7f 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -148,6 +148,12 @@ envFromSecrets: # see: https://github.com/telekom/sparrow/blob/main/docs/sparrow_run.md sparrowConfig: name: sparrow.com + # Optional instance metadata (exposed as sparrow_instance_info Prometheus metric) + # metadata: + # team_name: platform-team + # team_email: platform@example.com + # platform: k8s-prod-eu + # region: eu-west-1 loader: type: file interval: 30s @@ -168,6 +174,11 @@ sparrowConfig: # count: 5 # file: # path: /config/checks.yaml +# metadata: +# team_name: string +# team_email: string +# platform: string +# region: string # targetManager: # type: gitlab # checkInterval: 300s diff --git a/docs/dev.md b/docs/dev.md index 94fc8a97..45db73d9 100644 --- a/docs/dev.md +++ b/docs/dev.md @@ -6,6 +6,7 @@ SPDX-License-Identifier: CC-BY-4.0 # Developer Documentation +- [Running tests](#running-tests) - [Using `reuse` for license handling](#using-reuse-for-license-handling) - [Install](#install) - [CLI Usage](#cli-usage) @@ -44,3 +45,26 @@ Use the `CC-BY-4.0` SPDX license header for documentation (eg. `.md`) files. In case a license header is not suitable for a file or directory (eg. auto-generated files) the `REUSE.toml` configuration file can be used. Add the file path to the `REUSE.toml` or create a new `[[annotations]]` section (see `./REUSE.toml` for an example). + +## Running tests + +Unit tests are run with the Go toolchain and modules specified in `go.mod`. + +**First run:** If the Go version in `go.mod` is not yet installed, the first `go test` (or any `go` command) will download the toolchain and dependencies. This can take **2–5 minutes** depending on the network. If your IDE or test runner uses a short timeout (e.g. 
30–60 seconds), the first run may time out; use a longer timeout for the first run or run from a terminal: + +```bash +go test ./... +``` + +**Run only the metrics package tests (e.g. the instance info tests):** + +```bash +go test ./pkg/sparrow/metrics/ -run 'InstanceInfo' -v -count=1 +``` + +**Run all tests with race detector and coverage (as in CI):** + +```bash +go mod download +go test --race --count=1 --coverprofile cover.out -v ./... +``` diff --git a/docs/ownership-metadata-design.md b/docs/ownership-metadata-design.md new file mode 100644 index 00000000..89c68ecc --- /dev/null +++ b/docs/ownership-metadata-design.md @@ -0,0 +1,37 @@ + + +# Design: Sparrow Instance Ownership Metadata (Issue #354) + +## Summary + +Sparrow exposes optional instance metadata via a dedicated Prometheus **info metric** (`sparrow_instance_info`), so operators can identify owners, route alerts correctly, and correlate metrics across multiple Sparrow deployments. + +## Why a Dedicated Info Metric + +- **Prometheus best practice:** Info-style metrics (gauge with value 1 and descriptive labels) are the standard way to expose static attributes (e.g. `kube_pod_info`, `node_uname_info`). They avoid polluting every time series with extra labels and keep cardinality under control. +- **No impact on existing metrics:** We do **not** add metadata labels to check metrics (health, latency, DNS, traceroute). That would multiply cardinality and complicate existing dashboards. Joining with `sparrow_instance_info` in PromQL when needed is explicit and flexible. +- **Works without target manager:** The metric is registered at startup from startup config only. It does not depend on the target manager or any runtime component. +- **Single emission per instance:** The metric is registered once during `sparrow.New()` and emits one time series per instance. No periodic updates or lifecycle complexity. + +## Implementation Choices + +1. 
**Config shape:** `metadata` is a map of arbitrary key-value pairs under startup config (e.g. `team_name`, `platform`, `region`). All keys are optional; omitted keys are not emitted as labels. The key `instance_name` is reserved and set from the Sparrow DNS name; supplying it (or any invalid label name) in `metadata` makes registration fail. +2. **Registration point:** Instance info is registered in `sparrow.New()` after the metrics provider is created. Registration failure is logged but non-fatal so the process still starts. +3. **Metrics package:** A small `RegisterInstanceInfo(registry, instanceName, metadata)` in `pkg/sparrow/metrics` keeps the metrics package independent of `pkg/config` and makes the behaviour easy to test. +4. **Helm:** Metadata is optional under `sparrowConfig` in values; backward compatibility is preserved when metadata is not provided. + +## Prometheus Usage + +- **Alert routing:** Prometheus alerting rules can join `sparrow_instance_info` to attach ownership labels to alerts, which Alertmanager can then use for routing. +- **Dashboards:** `* on(instance) group_left(...) sparrow_instance_info` joins metadata onto any Sparrow metric via the scrape-assigned `instance` label. +- **Multi-team views:** Filter or group by `team_name`, `platform`, `region`, or any other configured labels without changing existing metric names or labels. + +## Deliverables + +- **Code:** `pkg/config` (Metadata map), `pkg/sparrow/metrics` (RegisterInstanceInfo + test), `pkg/sparrow` (registration in New). +- **Helm:** `chart/values.yaml` extended with commented metadata example; config is merged into existing sparrowConfig. +- **Docs:** README (metadata config, instance info metric, PromQL examples), this design summary. 
diff --git a/go.mod b/go.mod index 6d569608..55e7fea3 100644 --- a/go.mod +++ b/go.mod @@ -14,6 +14,7 @@ require ( github.com/google/go-cmp v0.7.0 github.com/jarcoal/httpmock v1.4.1 github.com/prometheus/client_golang v1.23.2 + github.com/prometheus/common v0.66.1 github.com/spf13/cobra v1.10.2 github.com/spf13/viper v1.21.0 github.com/stretchr/testify v1.11.1 @@ -55,7 +56,6 @@ require ( github.com/perimeterx/marshmallow v1.1.5 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/sagikazarmark/locafero v0.11.0 // indirect diff --git a/pkg/config/config.go b/pkg/config/config.go index 7f8bd2a9..a3cd286b 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -14,9 +14,16 @@ import ( "github.com/telekom/sparrow/pkg/api" ) +// Metadata holds arbitrary key-value metadata for the Sparrow instance. +// Exposed via the sparrow_instance_info Prometheus metric labels. +// Keys must be valid Prometheus label names. 
+type Metadata map[string]string + type Config struct { // SparrowName is the DNS name of the sparrow SparrowName string `yaml:"name" mapstructure:"name"` + // Metadata is optional instance metadata (exposed as sparrow_instance_info) + Metadata Metadata `yaml:"metadata" mapstructure:"metadata"` // Loader is the configuration for the loader Loader LoaderConfig `yaml:"loader" mapstructure:"loader"` // Api is the configuration for the api server diff --git a/pkg/sparrow/metrics/instance_info.go b/pkg/sparrow/metrics/instance_info.go new file mode 100644 index 00000000..6e01aa20 --- /dev/null +++ b/pkg/sparrow/metrics/instance_info.go @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: 2025 Deutsche Telekom IT GmbH +// +// SPDX-License-Identifier: Apache-2.0 + +package metrics + +import ( + "fmt" + "sort" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/model" +) + +const ( + instanceInfoMetricName = "sparrow_instance_info" + instanceInfoHelp = "Ownership and platform metadata for this Sparrow instance. Emitted once per instance for alert routing and multi-team correlation." +) + +// RegisterInstanceInfo registers the sparrow_instance_info info-style metric on the given registry. +// It sets the gauge to 1 with labels instance_name and any user-defined metadata keys. +// Empty strings are allowed for metadata values; instanceName should be the Sparrow DNS name. +// Metadata keys must be valid Prometheus label names and must not include "instance_name". 
+func RegisterInstanceInfo(registry *prometheus.Registry, instanceName string, metadata map[string]string) error { + if metadata == nil { + metadata = map[string]string{} + } + + keys := make([]string, 0, len(metadata)) + for k := range metadata { + keys = append(keys, k) + } + sort.Strings(keys) + + labels := make([]string, 0, len(keys)+1) + values := make([]string, 0, len(keys)+1) + labels = append(labels, "instance_name") + values = append(values, instanceName) + + for _, k := range keys { + if k == "instance_name" { + return fmt.Errorf("metadata key %q is reserved", k) + } + if !model.UTF8Validation.IsValidLabelName(k) { + return fmt.Errorf("metadata key %q is not a valid Prometheus label name", k) + } + labels = append(labels, k) + values = append(values, metadata[k]) + } + + info := prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: instanceInfoMetricName, + Help: instanceInfoHelp, + }, + labels, + ) + info.WithLabelValues(values...).Set(1) + return registry.Register(info) +} diff --git a/pkg/sparrow/metrics/instance_info_test.go b/pkg/sparrow/metrics/instance_info_test.go new file mode 100644 index 00000000..dba1a6e1 --- /dev/null +++ b/pkg/sparrow/metrics/instance_info_test.go @@ -0,0 +1,186 @@ +// SPDX-FileCopyrightText: 2025 Deutsche Telekom IT GmbH +// +// SPDX-License-Identifier: Apache-2.0 + +package metrics + +import ( + "errors" + "maps" + "testing" + + "github.com/prometheus/client_golang/prometheus" +) + +func TestRegisterInstanceInfo(t *testing.T) { + registry := prometheus.NewRegistry() + + err := RegisterInstanceInfo(registry, "sparrow.example.com", map[string]string{ + "team_name": "platform-team", + "team_email": "platform@example.com", + "platform": "k8s-prod-eu", + }) + if err != nil { + t.Fatalf("RegisterInstanceInfo() error = %v", err) + } + + metrics, err := registry.Gather() + if err != nil { + t.Fatalf("Gather() error = %v", err) + } + + expectedLabels := map[string]string{ + "instance_name": "sparrow.example.com", + "team_name": 
"platform-team", + "team_email": "platform@example.com", + "platform": "k8s-prod-eu", + } + + found := false + for _, mf := range metrics { + if mf.GetName() != instanceInfoMetricName { + continue + } + found = true + + if len(mf.GetMetric()) != 1 { + t.Errorf("expected 1 metric, got %d", len(mf.GetMetric())) + } + + const expectedValue = 1 + for _, m := range mf.GetMetric() { + if m.GetGauge().GetValue() != expectedValue { + t.Errorf("%q metric value expected %d, got %f", instanceInfoMetricName, expectedValue, m.GetGauge().GetValue()) + } + + labels := make(map[string]string) + for _, lp := range m.GetLabel() { + labels[lp.GetName()] = lp.GetValue() + } + if !maps.Equal(expectedLabels, labels) { + t.Errorf("expected labels %v, got %v", expectedLabels, labels) + } + } + } + if !found { + t.Error("sparrow_instance_info metric not found in registry") + } +} + +func TestRegisterInstanceInfo_emptyMetadata(t *testing.T) { + registry := prometheus.NewRegistry() + + err := RegisterInstanceInfo(registry, "sparrow.example.com", nil) + if err != nil { + t.Fatalf("RegisterInstanceInfo() with empty metadata error = %v", err) + } + + metrics, err := registry.Gather() + if err != nil { + t.Fatalf("Gather() error = %v", err) + } + + expectedLabels := map[string]string{ + "instance_name": "sparrow.example.com", + } + + found := false + for _, mf := range metrics { + if mf.GetName() != instanceInfoMetricName { + continue + } + found = true + + const expectedValue = 1 + for _, m := range mf.GetMetric() { + if m.GetGauge().GetValue() != expectedValue { + t.Errorf("%q metric value expected %d, got %f", instanceInfoMetricName, expectedValue, m.GetGauge().GetValue()) + } + + labels := make(map[string]string) + for _, lp := range m.GetLabel() { + labels[lp.GetName()] = lp.GetValue() + } + if !maps.Equal(expectedLabels, labels) { + t.Errorf("expected labels %v, got %v", expectedLabels, labels) + } + } + } + if !found { + t.Error("sparrow_instance_info metric not found in registry") + } +} 
+ +func TestRegisterInstanceInfo_doubleRegistration(t *testing.T) { + registry := prometheus.NewRegistry() + + err := RegisterInstanceInfo(registry, "sparrow.example.com", map[string]string{ + "team_name": "team-a", + "team_email": "team-a@example.com", + "platform": "k8s-prod", + }) + if err != nil { + t.Fatalf("first RegisterInstanceInfo() error = %v", err) + } + + err2 := RegisterInstanceInfo(registry, "other.example.com", map[string]string{ + "team_name": "team-b", + "team_email": "team-b@example.com", + "platform": "k8s-staging", + }) + if err2 == nil { + t.Fatal("expected second RegisterInstanceInfo to return an error (duplicate collector)") + } + + var alreadyErr prometheus.AlreadyRegisteredError + if !errors.As(err2, &alreadyErr) { + t.Errorf("expected AlreadyRegisteredError, got %T: %v", err2, err2) + } +} + +func TestRegisterInstanceInfo_partialMetadata(t *testing.T) { + registry := prometheus.NewRegistry() + + err := RegisterInstanceInfo(registry, "sparrow.example.com", map[string]string{ + "team_name": "platform-team", + }) + if err != nil { + t.Fatalf("RegisterInstanceInfo() with partial metadata error = %v", err) + } + + metrics, err := registry.Gather() + if err != nil { + t.Fatalf("Gather() error = %v", err) + } + + expectedLabels := map[string]string{ + "instance_name": "sparrow.example.com", + "team_name": "platform-team", + } + + found := false + for _, mf := range metrics { + if mf.GetName() != instanceInfoMetricName { + continue + } + found = true + + const expectedValue = 1 + for _, m := range mf.GetMetric() { + if m.GetGauge().GetValue() != expectedValue { + t.Errorf("%q metric value expected %d, got %f", instanceInfoMetricName, expectedValue, m.GetGauge().GetValue()) + } + + labels := make(map[string]string) + for _, lp := range m.GetLabel() { + labels[lp.GetName()] = lp.GetValue() + } + if !maps.Equal(expectedLabels, labels) { + t.Errorf("expected labels %v, got %v", expectedLabels, labels) + } + } + } + if !found { + 
t.Error("sparrow_instance_info metric not found in registry") + } +} diff --git a/pkg/sparrow/run.go b/pkg/sparrow/run.go index 17fcc44d..e0cb841c 100644 --- a/pkg/sparrow/run.go +++ b/pkg/sparrow/run.go @@ -80,6 +80,14 @@ func New(cfg *config.Config) *Sparrow { } sparrow.loader = config.NewLoader(cfg, sparrow.cRuntime) + // Register instance metadata as Prometheus info metric (once per instance) + if err := metrics.RegisterInstanceInfo(m.GetRegistry(), cfg.SparrowName, cfg.Metadata); err != nil { + // Non-fatal: instance can run without the info metric + // Logging requires context; use background with logger for startup + log := logger.FromContext(context.Background()) + log.Error("Failed to register sparrow_instance_info metric", "error", err) + } + return sparrow }