From 23f91e46cae463c7d74b5eeb172d5d008808e370 Mon Sep 17 00:00:00 2001 From: dnj Date: Fri, 19 Sep 2025 16:41:33 +0800 Subject: [PATCH 01/18] =?UTF-8?q?feat(observability):=20=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?HTTP=E8=AF=B7=E6=B1=82=E6=97=B6=E5=BB=B6=E6=8C=87=E6=A0=87?= =?UTF-8?q?=E6=94=B6=E9=9B=86=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 实现HTTP中间件记录请求时延并导出到Prometheus指标 添加服务信息到指标收集器 统一代码格式和修复缩进问题 添加Prometheus Adapter API文档 --- docs/prometheus_adapter/API.md | 271 ++++++++++++++++++ .../internal/handler/mock_error_handler.go | 26 +- .../internal/service/mock_error_service.go | 42 +-- mock/s3/shared/interfaces/error_injector.go | 2 +- .../error_injection/error_injection.go | 54 ++-- mock/s3/shared/models/error.go | 14 +- mock/s3/shared/observability/metrics.go | 58 +++- mock/s3/shared/observability/middleware.go | 18 +- mock/s3/shared/observability/observability.go | 3 + mock/s3/shared/observability/providers.go | 30 +- mock/s3/shared/server/service_bootstrap.go | 164 +++++------ mock/s3/shared/utils/instance.go | 10 +- 12 files changed, 519 insertions(+), 173 deletions(-) create mode 100644 docs/prometheus_adapter/API.md diff --git a/docs/prometheus_adapter/API.md b/docs/prometheus_adapter/API.md new file mode 100644 index 0000000..4cfe228 --- /dev/null +++ b/docs/prometheus_adapter/API.md @@ -0,0 +1,271 @@ +# Prometheus Adapter API 文档 + +## 概述 + +Prometheus Adapter 提供从 Prometheus 获取服务 QPS 和平均时延指标的 RESTful API 接口。支持按服务名称和版本进行查询。 + +> **当前状态**: +> - QPS 指标:已实现,使用 `system_network_qps` 指标(基于网络包统计) +> - 时延指标:已实现,使用 `http.server.request.duration_seconds` 指标(HTTP 请求真实时延) + +## API + +### 1. 
获取服务 QPS 指标 + +**GET** `/v1/metrics/:service/qps` + +获取指定服务的 QPS(每秒请求数)指标数据。 + +#### 路径参数 +- `service` (string, required): 服务名称 + +#### 查询参数 +- `version` (string, optional): 服务版本,不指定则返回所有版本 +- `start` (string, optional): 开始时间 (RFC3339 格式,如: 2024-01-01T00:00:00Z) +- `end` (string, optional): 结束时间 (RFC3339 格式,如: 2024-01-01T01:00:00Z) +- `step` (string, optional): 时间步长 (如: 1m, 5m, 1h),默认 1m + +#### 请求示例 +```bash +GET /v1/metrics/metadata-service/qps?version=1.0.0&start=2024-01-01T00:00:00Z&end=2024-01-01T01:00:00Z&step=1m +``` + +#### 响应示例 +```json +{ + "service": "metadata-service", + "version": "1.0.0", + "metric_type": "qps", + "data": [ + { + "timestamp": "2024-01-01T00:00:00Z", + "value": 150.5 + }, + { + "timestamp": "2024-01-01T00:01:00Z", + "value": 148.2 + } + ], + "summary": { + "min": 120.1, + "max": 180.3, + "avg": 152.8, + "total_points": 60 + } +} +``` + +### 2. 获取服务平均时延指标 + +**GET** `/v1/metrics/:service/latency` + +获取指定服务的平均响应时延指标数据(单位:秒)。 + +#### 路径参数 +- `service` (string, required): 服务名称 + +#### 查询参数 +- `version` (string, optional): 服务版本,不指定则返回所有版本 +- `start` (string, optional): 开始时间 (RFC3339 格式) +- `end` (string, optional): 结束时间 (RFC3339 格式) +- `step` (string, optional): 时间步长,默认 1m +- `percentile` (string, optional): 百分位数 (p50, p95, p99),默认 p50 + +#### 请求示例 +```bash +GET /v1/metrics/storage-service/latency?version=1.0.0&percentile=p95&start=2024-01-01T00:00:00Z&end=2024-01-01T01:00:00Z +``` + +#### 响应示例 +```json +{ + "service": "storage-service", + "version": "1.0.0", + "metric_type": "latency", + "percentile": "p95", + "data": [ + { + "timestamp": "2024-01-01T00:00:00Z", + "value": 125.8 + }, + { + "timestamp": "2024-01-01T00:01:00Z", + "value": 132.1 + } + ], + "summary": { + "min": 98.5, + "max": 201.2, + "avg": 128.9, + "total_points": 60 + } +} +``` + +### 3. 
获取服务综合指标 + +**GET** `/v1/metrics/:service/overview` + +同时获取指定服务的 QPS 和时延指标概览。 + +#### 路径参数 +- `service` (string, required): 服务名称 + +#### 查询参数 +- `version` (string, optional): 服务版本 +- `start` (string, optional): 开始时间 (RFC3339 格式) +- `end` (string, optional): 结束时间 (RFC3339 格式) + +#### 响应示例 +```json +{ + "service": "queue-service", + "version": "1.0.0", + "time_range": { + "start": "2024-01-01T00:00:00Z", + "end": "2024-01-01T01:00:00Z" + }, + "metrics": { + "qps": { + "current": 152.8, + "avg": 148.5, + "max": 180.3, + "min": 120.1 + }, + "latency": { + "p50": 85.2, + "p95": 128.9, + "p99": 201.2 + } + } +} +``` + +### 4. 获取可用服务列表 + +**GET** `/v1/services` + +获取 Prometheus 中可监控的服务列表。 + +#### 查询参数 +- `prefix` (string, optional): 服务名前缀过滤 + +#### 响应示例 +```json +{ + "services": [ + { + "name": "metadata-service", + "versions": ["1.0.0"], + "active_versions": ["1.0.0"], + "last_updated": "2024-01-01T01:00:00Z" + }, + { + "name": "storage-service", + "versions": ["1.0.0"], + "active_versions": ["1.0.0"], + "last_updated": "2024-01-01T00:45:00Z" + }, + { + "name": "queue-service", + "versions": ["1.0.0"], + "active_versions": ["1.0.0"], + "last_updated": "2024-01-01T00:30:00Z" + }, + { + "name": "third-party-service", + "versions": ["1.0.0"], + "active_versions": ["1.0.0"], + "last_updated": "2024-01-01T00:20:00Z" + }, + { + "name": "mock-error-service", + "versions": ["1.0.0"], + "active_versions": ["1.0.0"], + "last_updated": "2024-01-01T00:15:00Z" + } + ], + "total": 5 +} +``` + +## 错误响应 + +所有 API 在出错时返回统一的错误格式: + +```json +{ + "error": "error_code", + "message": "详细错误描述", + "details": { + "field": "具体错误字段" + } +} +``` + +### 常见错误码 + +- `400 Bad Request`: 请求参数错误 +- `404 Not Found`: 服务或版本不存在 +- `500 Internal Server Error`: 内部服务器错误 +- `503 Service Unavailable`: Prometheus 连接失败 + +## 实现说明 + +### Prometheus 查询语法 + +API 内部使用的 Prometheus 查询示例: + +#### QPS 查询 +```promql +# 网络包 QPS(当前实现) +system_network_qps{exported_job="metadata-service",service_version="1.0.0"} + +# 计算5分钟平均 
QPS +avg_over_time(system_network_qps{exported_job="metadata-service",service_version="1.0.0"}[5m]) +``` + +#### 平均时延查询 +```promql +# P95 时延（95分位数） +histogram_quantile(0.95, rate(http_server_request_duration_seconds_bucket{exported_job="metadata-service",service_version="1.0.0"}[5m])) + +# P50 时延（中位数） +histogram_quantile(0.50, rate(http_server_request_duration_seconds_bucket{exported_job="metadata-service",service_version="1.0.0"}[5m])) + +# P99 时延（99分位数） +histogram_quantile(0.99, rate(http_server_request_duration_seconds_bucket{exported_job="metadata-service",service_version="1.0.0"}[5m])) + +# 平均时延 +rate(http_server_request_duration_seconds_sum{exported_job="metadata-service",service_version="1.0.0"}[5m]) +/ +rate(http_server_request_duration_seconds_count{exported_job="metadata-service",service_version="1.0.0"}[5m]) +``` + +### 配置要求 + +需要在配置文件中指定： +- Prometheus 服务器地址：`http://10.210.10.33:9090` +- 查询超时时间：30秒 +- 默认时间范围：最近1小时 +- 服务标签映射： + - 服务名：`exported_job`（在指标中作为标签） + - 版本号：`service_version`（在指标中作为标签） + - 实例标识：通过 OpenTelemetry 的 `service.instance.id` 属性设置 + +### 支持的服务列表 + +当前 mock/s3 环境中支持的服务： +- `metadata-service` - 元数据管理服务（版本：1.0.0） +- `storage-service` - 存储服务（版本：1.0.0） +- `queue-service` - 消息队列服务（版本：1.0.0） +- `third-party-service` - 第三方集成服务（版本：1.0.0） +- `mock-error-service` - 错误模拟服务（版本：1.0.0） + +所有服务的版本信息通过 `service_version` 标签暴露。 + +### 缓存策略 + +- 指标数据缓存时间：30秒 +- 服务列表缓存时间：5分钟 +- 支持 ETag 缓存验证 \ No newline at end of file diff --git a/mock/s3/services/mock-error/internal/handler/mock_error_handler.go b/mock/s3/services/mock-error/internal/handler/mock_error_handler.go index 18be6b6..dfd1f33 100644 --- a/mock/s3/services/mock-error/internal/handler/mock_error_handler.go +++ b/mock/s3/services/mock-error/internal/handler/mock_error_handler.go @@ -85,13 +85,13 @@ func (h *MockErrorHandler) deleteMetricAnomaly(c *gin.Context) { // checkMetricInjection 检查是否应该注入指标异常 func (h *MockErrorHandler) checkMetricInjection(c *gin.Context) { - ctx := c.Request.Context() + ctx :=
c.Request.Context() - var request struct { - Service string `json:"service" binding:"required"` - MetricName string `json:"metric_name" binding:"required"` - Instance string `json:"instance"` - } + var request struct { + Service string `json:"service" binding:"required"` + MetricName string `json:"metric_name" binding:"required"` + Instance string `json:"instance"` + } if err := c.ShouldBindJSON(&request); err != nil { h.logger.Error(ctx, "Failed to bind metric injection check request", observability.Error(err)) @@ -99,14 +99,14 @@ func (h *MockErrorHandler) checkMetricInjection(c *gin.Context) { return } - anomaly, shouldInject := h.errorService.ShouldInjectError(ctx, request.Service, request.MetricName, request.Instance) + anomaly, shouldInject := h.errorService.ShouldInjectError(ctx, request.Service, request.MetricName, request.Instance) - response := gin.H{ - "should_inject": shouldInject, - "service": request.Service, - "metric_name": request.MetricName, - "instance": request.Instance, - } + response := gin.H{ + "should_inject": shouldInject, + "service": request.Service, + "metric_name": request.MetricName, + "instance": request.Instance, + } if shouldInject { response["anomaly"] = anomaly diff --git a/mock/s3/services/mock-error/internal/service/mock_error_service.go b/mock/s3/services/mock-error/internal/service/mock_error_service.go index cdaf0c4..b445c82 100644 --- a/mock/s3/services/mock-error/internal/service/mock_error_service.go +++ b/mock/s3/services/mock-error/internal/service/mock_error_service.go @@ -118,19 +118,19 @@ func (s *MockErrorService) ShouldInjectError(ctx context.Context, service, metri s.stats.TotalRequests++ s.stats.LastUpdated = time.Now() - for _, rule := range s.rules { - if !rule.Enabled { - continue - } - - // 检查服务匹配 - if rule.Service != "" && rule.Service != service { - continue - } - // 检查实例匹配(如果指定了实例,则必须匹配) - if rule.Instance != "" && rule.Instance != instance { - continue - } + for _, rule := range s.rules { + if 
!rule.Enabled { + continue + } + + // 检查服务匹配 + if rule.Service != "" && rule.Service != service { + continue + } + // 检查实例匹配(如果指定了实例,则必须匹配) + if rule.Instance != "" && rule.Instance != instance { + continue + } // 检查指标名称匹配 if rule.MetricName != "" && rule.MetricName != metricName { @@ -167,14 +167,14 @@ func (s *MockErrorService) ShouldInjectError(ctx context.Context, service, metri "rule_id": rule.ID, } - s.logger.Info(ctx, "Metric anomaly injected", - observability.String("rule_id", rule.ID), - observability.String("service", service), - observability.String("instance", instance), - observability.String("metric_name", metricName), - observability.String("anomaly_type", rule.AnomalyType), - observability.Float64("target_value", rule.TargetValue), - observability.Int("triggered_count", rule.Triggered)) + s.logger.Info(ctx, "Metric anomaly injected", + observability.String("rule_id", rule.ID), + observability.String("service", service), + observability.String("instance", instance), + observability.String("metric_name", metricName), + observability.String("anomaly_type", rule.AnomalyType), + observability.Float64("target_value", rule.TargetValue), + observability.Int("triggered_count", rule.Triggered)) return anomaly, true } diff --git a/mock/s3/shared/interfaces/error_injector.go b/mock/s3/shared/interfaces/error_injector.go index 4feb187..c59894a 100644 --- a/mock/s3/shared/interfaces/error_injector.go +++ b/mock/s3/shared/interfaces/error_injector.go @@ -15,7 +15,7 @@ type MetricAnomalyService interface { ListRules(ctx context.Context) ([]*models.MetricAnomalyRule, error) // 指标异常注入核心功能 - ShouldInjectError(ctx context.Context, service, metricName, instance string) (map[string]any, bool) + ShouldInjectError(ctx context.Context, service, metricName, instance string) (map[string]any, bool) } // MetricInjector HTTP指标异常注入器接口 diff --git a/mock/s3/shared/middleware/error_injection/error_injection.go b/mock/s3/shared/middleware/error_injection/error_injection.go index 
9eb98a1..dec2c6c 100644 --- a/mock/s3/shared/middleware/error_injection/error_injection.go +++ b/mock/s3/shared/middleware/error_injection/error_injection.go @@ -1,16 +1,16 @@ package error_injection import ( - "context" - "fmt" - "mocks3/shared/client" - "mocks3/shared/models" - "mocks3/shared/observability" - "mocks3/shared/utils" - "net/http" - "strconv" - "sync" - "time" + "context" + "fmt" + "mocks3/shared/client" + "mocks3/shared/models" + "mocks3/shared/observability" + "mocks3/shared/utils" + "net/http" + "strconv" + "sync" + "time" ) // MetricInjectorConfig 指标异常注入器配置 @@ -125,11 +125,11 @@ func NewMetricInjectorWithDefaults(mockErrorServiceURL string, serviceName strin // InjectMetricAnomaly 检查并注入指标异常 func (mi *MetricInjector) InjectMetricAnomaly(ctx context.Context, metricName string, originalValue float64) float64 { - // 计算实例标识,用于实例级注入与缓存 - instanceID := utils.GetInstanceID(mi.serviceName) + // 计算实例标识,用于实例级注入与缓存 + instanceID := utils.GetInstanceID(mi.serviceName) - // 检查缓存(加入实例维度) - cacheKey := mi.serviceName + ":" + instanceID + ":" + metricName + // 检查缓存(加入实例维度) + cacheKey := mi.serviceName + ":" + instanceID + ":" + metricName mi.cacheMu.RLock() if cached, exists := mi.cache[cacheKey]; exists && time.Now().Before(cached.ExpiresAt) { mi.cacheMu.RUnlock() @@ -141,19 +141,19 @@ func (mi *MetricInjector) InjectMetricAnomaly(ctx context.Context, metricName st mi.cacheMu.RUnlock() // 查询Mock Error Service获取异常规则 - request := map[string]string{ - "service": mi.serviceName, - "metric_name": metricName, - "instance": instanceID, - } - - var response struct { - ShouldInject bool `json:"should_inject"` - Service string `json:"service"` - MetricName string `json:"metric_name"` - Instance string `json:"instance"` - Anomaly map[string]any `json:"anomaly,omitempty"` - } + request := map[string]string{ + "service": mi.serviceName, + "metric_name": metricName, + "instance": instanceID, + } + + var response struct { + ShouldInject bool `json:"should_inject"` + Service 
string `json:"service"` + MetricName string `json:"metric_name"` + Instance string `json:"instance"` + Anomaly map[string]any `json:"anomaly,omitempty"` + } // 使用较短的超时时间避免影响正常指标收集 opts := client.RequestOptions{ diff --git a/mock/s3/shared/models/error.go b/mock/s3/shared/models/error.go index 7ebd5b2..2643054 100644 --- a/mock/s3/shared/models/error.go +++ b/mock/s3/shared/models/error.go @@ -6,13 +6,13 @@ import ( // MetricAnomalyRule 指标异常注入规则 type MetricAnomalyRule struct { - ID string `json:"id"` - Name string `json:"name"` - Service string `json:"service"` // 目标服务 - Instance string `json:"instance,omitempty"` // 目标实例,可选 - MetricName string `json:"metric_name"` // 目标指标名称 - AnomalyType string `json:"anomaly_type"` - Enabled bool `json:"enabled"` + ID string `json:"id"` + Name string `json:"name"` + Service string `json:"service"` // 目标服务 + Instance string `json:"instance,omitempty"` // 目标实例,可选 + MetricName string `json:"metric_name"` // 目标指标名称 + AnomalyType string `json:"anomaly_type"` + Enabled bool `json:"enabled"` // 异常参数 TargetValue float64 `json:"target_value"` // 目标异常值 diff --git a/mock/s3/shared/observability/metrics.go b/mock/s3/shared/observability/metrics.go index 3c5bdeb..755161e 100644 --- a/mock/s3/shared/observability/metrics.go +++ b/mock/s3/shared/observability/metrics.go @@ -12,6 +12,7 @@ import ( "time" "github.com/prometheus/procfs" + "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" ) @@ -47,6 +48,9 @@ type MetricCollector struct { networkQPS metric.Float64Gauge machineOnlineStatus metric.Int64Gauge + // HTTP 请求指标 + httpRequestDuration metric.Float64Histogram + // 统计状态 cpuStats *CPUStats networkStats *NetworkStats @@ -55,6 +59,10 @@ type MetricCollector struct { // 错误注入器 metricInjector MetricInjector + + // 服务属性 + serviceName string + serviceVersion string } // NewMetricCollector 创建指标收集器 @@ -83,6 +91,12 @@ func NewMetricCollector(meter metric.Meter, logger *Logger) (*MetricCollector, e return collector, nil } +// 
SetServiceInfo 设置服务信息 +func (c *MetricCollector) SetServiceInfo(serviceName, serviceVersion string) { + c.serviceName = serviceName + c.serviceVersion = serviceVersion +} + // SetMetricInjector 设置错误注入器 func (c *MetricCollector) SetMetricInjector(injector MetricInjector) { c.metricInjector = injector @@ -139,6 +153,18 @@ func (c *MetricCollector) initMetrics() error { return err } + // HTTP 请求时延 (使用 Prometheus 兼容的命名) + if c.httpRequestDuration, err = c.meter.Float64Histogram( + "http.server.request.duration_seconds", + metric.WithDescription("HTTP server request duration in seconds"), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries( + 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5, 7.5, 10, + ), + ); err != nil { + return err + } + return nil } @@ -413,7 +439,16 @@ func (c *MetricCollector) collectNetworkMetrics(ctx context.Context) { finalValue = c.metricInjector.InjectMetricAnomaly(ctx, "system_network_qps", qps) } - c.networkQPS.Record(ctx, finalValue) + // 添加服务属性作为标签 + attrs := []attribute.KeyValue{} + if c.serviceName != "" { + attrs = append(attrs, attribute.String("exported_job", c.serviceName)) + } + if c.serviceVersion != "" { + attrs = append(attrs, attribute.String("service_version", c.serviceVersion)) + } + + c.networkQPS.Record(ctx, finalValue, metric.WithAttributes(attrs...)) } c.networkStats.lastUpdate = now } @@ -486,3 +521,24 @@ func (c *MetricCollector) updateMachineStatus(ctx context.Context) { c.machineOnlineStatus.Record(ctx, int64(finalValue)) } + +// RecordHTTPRequestDuration 记录 HTTP 请求时延 +func (c *MetricCollector) RecordHTTPRequestDuration(ctx context.Context, duration float64, method, path string, statusCode int) { + // 构建属性标签 + attrs := []attribute.KeyValue{ + attribute.String("http.method", method), + attribute.String("http.route", path), + attribute.Int("http.status_code", statusCode), + } + + // 添加服务属性 + if c.serviceName != "" { + attrs = append(attrs, attribute.String("exported_job", c.serviceName)) + } + 
if c.serviceVersion != "" { + attrs = append(attrs, attribute.String("service_version", c.serviceVersion)) + } + + // 记录时延(以秒为单位) + c.httpRequestDuration.Record(ctx, duration, metric.WithAttributes(attrs...)) +} diff --git a/mock/s3/shared/observability/middleware.go b/mock/s3/shared/observability/middleware.go index 265bc3e..a07e8b8 100644 --- a/mock/s3/shared/observability/middleware.go +++ b/mock/s3/shared/observability/middleware.go @@ -29,10 +29,26 @@ func (m *HTTPMiddleware) GinMetricsMiddleware() gin.HandlerFunc { // 处理请求 c.Next() - // 计算基本信息用于日志记录 + // 计算请求时延 duration := time.Since(start) statusCode := c.Writer.Status() + // 记录 HTTP 请求时延指标(以秒为单位) + if m.collector != nil { + durationSeconds := duration.Seconds() + path := c.FullPath() + if path == "" { + path = c.Request.URL.Path // 如果没有匹配的路由,使用原始路径 + } + m.collector.RecordHTTPRequestDuration( + c.Request.Context(), + durationSeconds, + c.Request.Method, + path, + statusCode, + ) + } + // 只记录错误请求的日志 if statusCode >= 400 { m.logger.Warn(c.Request.Context(), "HTTP request completed with error", diff --git a/mock/s3/shared/observability/observability.go b/mock/s3/shared/observability/observability.go index a6efafa..7d238ce 100644 --- a/mock/s3/shared/observability/observability.go +++ b/mock/s3/shared/observability/observability.go @@ -30,6 +30,9 @@ func Setup(serviceName string, configPath string) (*Providers, *MetricCollector, return nil, nil, nil, fmt.Errorf("failed to create metric collector: %w", err) } + // 设置服务信息到指标收集器 + collector.SetServiceInfo(config.ServiceName, config.ServiceVersion) + // 创建HTTP中间件 httpMiddleware := NewHTTPMiddleware(collector, providers.Logger) diff --git a/mock/s3/shared/observability/providers.go b/mock/s3/shared/observability/providers.go index 6f93882..4ca00ba 100644 --- a/mock/s3/shared/observability/providers.go +++ b/mock/s3/shared/observability/providers.go @@ -1,12 +1,12 @@ package observability import ( - "context" - "fmt" - "mocks3/shared/observability/config" - 
"mocks3/shared/utils" + "context" + "fmt" + "mocks3/shared/observability/config" + "mocks3/shared/utils" - "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" @@ -183,14 +183,14 @@ func (p *Providers) Shutdown(ctx context.Context) error { // createResource 创建OTEL资源 func createResource(config *config.ObservabilityConfig) (*resource.Resource, error) { - // 使用统一的实例ID生成器 - instanceID := utils.GetInstanceID(config.ServiceName) - return resource.New(context.Background(), - resource.WithAttributes( - semconv.ServiceName(config.ServiceName), - semconv.ServiceVersion(config.ServiceVersion), - semconv.DeploymentEnvironment(config.Environment), - semconv.ServiceInstanceID(instanceID), - ), - ) + // 使用统一的实例ID生成器 + instanceID := utils.GetInstanceID(config.ServiceName) + return resource.New(context.Background(), + resource.WithAttributes( + semconv.ServiceName(config.ServiceName), + semconv.ServiceVersion(config.ServiceVersion), + semconv.DeploymentEnvironment(config.Environment), + semconv.ServiceInstanceID(instanceID), + ), + ) } diff --git a/mock/s3/shared/server/service_bootstrap.go b/mock/s3/shared/server/service_bootstrap.go index aa7d986..5e48b74 100644 --- a/mock/s3/shared/server/service_bootstrap.go +++ b/mock/s3/shared/server/service_bootstrap.go @@ -1,15 +1,15 @@ package server import ( - "context" - "fmt" - "mocks3/shared/observability" - "net/http" - "net" - "os" - "os/signal" - "syscall" - "time" + "context" + "fmt" + "mocks3/shared/observability" + "net" + "net/http" + "os" + "os/signal" + "syscall" + "time" "github.com/gin-gonic/gin" "mocks3/shared/middleware/consul" @@ -174,7 +174,7 @@ func (sb *ServiceBootstrap) setupObservability() error { // setupConsulRegistration 设置Consul服务注册 func (sb *ServiceBootstrap) setupConsulRegistration() error { - ctx := 
context.Background() + ctx := context.Background() // 检查配置是否支持Consul consulConfig, ok := sb.Config.(ConsulServiceConfig) @@ -191,23 +191,23 @@ func (sb *ServiceBootstrap) setupConsulRegistration() error { sb.ConsulClient = consulClient - // 注册服务到Consul - // 优先使用可达的容器/主机实例IP地址进行注册,确保多实例下目标唯一 - var registerAddress string - if sb.Config.GetHost() == "0.0.0.0" { - // 允许通过环境变量覆盖对外公布地址 - if envAddr := os.Getenv("ADVERTISE_ADDR"); envAddr != "" { - registerAddress = envAddr - } else { - ip, err := detectAdvertiseAddr() - if err != nil { - return fmt.Errorf("failed to detect advertise address: %w", err) - } - registerAddress = ip - } - } else { - registerAddress = sb.Config.GetHost() - } + // 注册服务到Consul + // 优先使用可达的容器/主机实例IP地址进行注册,确保多实例下目标唯一 + var registerAddress string + if sb.Config.GetHost() == "0.0.0.0" { + // 允许通过环境变量覆盖对外公布地址 + if envAddr := os.Getenv("ADVERTISE_ADDR"); envAddr != "" { + registerAddress = envAddr + } else { + ip, err := detectAdvertiseAddr() + if err != nil { + return fmt.Errorf("failed to detect advertise address: %w", err) + } + registerAddress = ip + } + } else { + registerAddress = sb.Config.GetHost() + } err = consul.RegisterService(ctx, consulClient, sb.Config.GetServiceName(), @@ -217,69 +217,69 @@ func (sb *ServiceBootstrap) setupConsulRegistration() error { return fmt.Errorf("failed to register service with Consul: %w", err) } - sb.Logger.Info(ctx, "Service registered with Consul successfully", - observability.String("consul_addr", consulConfig.GetConsulAddress()), - observability.String("service_name", sb.Config.GetServiceName()), - observability.String("register_address", registerAddress)) + sb.Logger.Info(ctx, "Service registered with Consul successfully", + observability.String("consul_addr", consulConfig.GetConsulAddress()), + observability.String("service_name", sb.Config.GetServiceName()), + observability.String("register_address", registerAddress)) - return nil + return nil } // detectAdvertiseAddr 自动探测一个非回环的IPv4地址,优先选择常见容器网卡 func 
detectAdvertiseAddr() (string, error) { - // 优先尝试常见的容器网卡名称 - preferredIfaces := []string{"eth0", "ens3", "ens4", "en0"} - for _, name := range preferredIfaces { - ifi, err := net.InterfaceByName(name) - if err == nil && (ifi.Flags&net.FlagUp) != 0 { - addrs, err := ifi.Addrs() - if err == nil { - if ip := firstIPv4(addrs); ip != "" { - return ip, nil - } - } - } - } - - // 回退:遍历所有网卡,取第一个非回环且Up的IPv4 - ifaces, err := net.Interfaces() - if err != nil { - return "", err - } - for _, ifi := range ifaces { - if (ifi.Flags&net.FlagUp) == 0 || (ifi.Flags&net.FlagLoopback) != 0 { - continue - } - addrs, err := ifi.Addrs() - if err != nil { - continue - } - if ip := firstIPv4(addrs); ip != "" { - return ip, nil - } - } - return "", fmt.Errorf("no non-loopback IPv4 address found") + // 优先尝试常见的容器网卡名称 + preferredIfaces := []string{"eth0", "ens3", "ens4", "en0"} + for _, name := range preferredIfaces { + ifi, err := net.InterfaceByName(name) + if err == nil && (ifi.Flags&net.FlagUp) != 0 { + addrs, err := ifi.Addrs() + if err == nil { + if ip := firstIPv4(addrs); ip != "" { + return ip, nil + } + } + } + } + + // 回退:遍历所有网卡,取第一个非回环且Up的IPv4 + ifaces, err := net.Interfaces() + if err != nil { + return "", err + } + for _, ifi := range ifaces { + if (ifi.Flags&net.FlagUp) == 0 || (ifi.Flags&net.FlagLoopback) != 0 { + continue + } + addrs, err := ifi.Addrs() + if err != nil { + continue + } + if ip := firstIPv4(addrs); ip != "" { + return ip, nil + } + } + return "", fmt.Errorf("no non-loopback IPv4 address found") } func firstIPv4(addrs []net.Addr) string { - for _, a := range addrs { - var ip net.IP - switch v := a.(type) { - case *net.IPNet: - ip = v.IP - case *net.IPAddr: - ip = v.IP - } - if ip == nil { - continue - } - ip4 := ip.To4() - if ip4 == nil || ip4.IsLoopback() { - continue - } - return ip4.String() - } - return "" + for _, a := range addrs { + var ip net.IP + switch v := a.(type) { + case *net.IPNet: + ip = v.IP + case *net.IPAddr: + ip = v.IP + } + if ip == nil { + 
continue + } + ip4 := ip.To4() + if ip4 == nil || ip4.IsLoopback() { + continue + } + return ip4.String() + } + return "" } // setupErrorInjection 设置错误注入中间件 diff --git a/mock/s3/shared/utils/instance.go b/mock/s3/shared/utils/instance.go index 8f8e519..79ad013 100644 --- a/mock/s3/shared/utils/instance.go +++ b/mock/s3/shared/utils/instance.go @@ -62,7 +62,7 @@ func GetInstanceID(serviceName string) string { func generateInstanceID(serviceName string) string { // 清理服务名:移除常见后缀,转换为小写 cleanServiceName := cleanServiceName(serviceName) - + // 生成8位短UUID shortUUID := generateShortUUID() if shortUUID == "" { @@ -75,7 +75,7 @@ func generateInstanceID(serviceName string) string { // cleanServiceName 清理服务名 func cleanServiceName(serviceName string) string { name := strings.ToLower(serviceName) - + // 移除常见后缀 suffixes := []string{"-service", "_service", "service"} for _, suffix := range suffixes { @@ -84,11 +84,11 @@ func cleanServiceName(serviceName string) string { break } } - + // 替换特殊字符为连字符 name = strings.ReplaceAll(name, "_", "-") name = strings.ReplaceAll(name, " ", "-") - + return name } @@ -106,4 +106,4 @@ func ResetInstanceID() { instanceIDMutex.Lock() defer instanceIDMutex.Unlock() cachedInstanceID = "" -} \ No newline at end of file +} From d5dc444a9ef6ffa4937b99c0061a3140d8f1b148 Mon Sep 17 00:00:00 2001 From: dnj Date: Fri, 19 Sep 2025 17:23:12 +0800 Subject: [PATCH 02/18] =?UTF-8?q?feat(observability):=20=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?HTTP=E5=BB=B6=E8=BF=9F=E6=B3=A8=E5=85=A5=E5=8A=9F=E8=83=BD?= =?UTF-8?q?=E5=B9=B6=E4=BC=98=E5=8C=96=E6=8C=87=E6=A0=87=E6=94=B6=E9=9B=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 重构指标注入器以支持服务版本维度 移除冗余的exported_job标签和实例ID生成 新增HTTP延迟注入器并与中间件集成 --- .../error_injection/error_injection.go | 21 +- .../error_injection/http_latency_injector.go | 183 ++++++++++++++++++ mock/s3/shared/observability/metrics.go | 12 +- mock/s3/shared/observability/middleware.go | 60 ++++-- 
mock/s3/shared/observability/providers.go | 4 - mock/s3/shared/server/service_bootstrap.go | 35 +++- 6 files changed, 275 insertions(+), 40 deletions(-) create mode 100644 mock/s3/shared/middleware/error_injection/http_latency_injector.go diff --git a/mock/s3/shared/middleware/error_injection/error_injection.go b/mock/s3/shared/middleware/error_injection/error_injection.go index dec2c6c..42dd606 100644 --- a/mock/s3/shared/middleware/error_injection/error_injection.go +++ b/mock/s3/shared/middleware/error_injection/error_injection.go @@ -34,6 +34,7 @@ type CacheConfig struct { type MetricInjector struct { mockErrorClient *client.BaseHTTPClient serviceName string + serviceVersion string // 添加服务版本字段 logger *observability.Logger // 缓存 @@ -56,7 +57,7 @@ type CachedAnomaly struct { } // NewMetricInjector 从YAML配置创建指标异常注入器 -func NewMetricInjector(configPath string, serviceName string, logger *observability.Logger) (*MetricInjector, error) { +func NewMetricInjector(configPath string, serviceName string, serviceVersion string, logger *observability.Logger) (*MetricInjector, error) { // 加载配置文件 var config MetricInjectorConfig if err := utils.LoadConfig(configPath, &config); err != nil { @@ -84,6 +85,7 @@ func NewMetricInjector(configPath string, serviceName string, logger *observabil injector := &MetricInjector{ mockErrorClient: client, serviceName: serviceName, + serviceVersion: serviceVersion, logger: logger, cache: make(map[string]*CachedAnomaly), cacheTTL: config.Cache.TTL, @@ -102,12 +104,13 @@ func NewMetricInjector(configPath string, serviceName string, logger *observabil } // NewMetricInjectorWithDefaults 使用默认配置创建指标异常注入器 -func NewMetricInjectorWithDefaults(mockErrorServiceURL string, serviceName string, logger *observability.Logger) *MetricInjector { +func NewMetricInjectorWithDefaults(mockErrorServiceURL string, serviceName string, serviceVersion string, logger *observability.Logger) *MetricInjector { client := client.NewBaseHTTPClient(mockErrorServiceURL, 
5*time.Second, "metric-injector", logger) injector := &MetricInjector{ mockErrorClient: client, serviceName: serviceName, + serviceVersion: serviceVersion, logger: logger, cache: make(map[string]*CachedAnomaly), cacheTTL: 30 * time.Second, @@ -125,11 +128,9 @@ func NewMetricInjectorWithDefaults(mockErrorServiceURL string, serviceName strin // InjectMetricAnomaly 检查并注入指标异常 func (mi *MetricInjector) InjectMetricAnomaly(ctx context.Context, metricName string, originalValue float64) float64 { - // 计算实例标识,用于实例级注入与缓存 - instanceID := utils.GetInstanceID(mi.serviceName) - - // 检查缓存(加入实例维度) - cacheKey := mi.serviceName + ":" + instanceID + ":" + metricName + // 使用服务版本作为注入维度,同一版本的所有实例共享相同的异常注入 + // 检查缓存(基于服务版本) + cacheKey := mi.serviceName + ":" + mi.serviceVersion + ":" + metricName mi.cacheMu.RLock() if cached, exists := mi.cache[cacheKey]; exists && time.Now().Before(cached.ExpiresAt) { mi.cacheMu.RUnlock() @@ -140,18 +141,18 @@ func (mi *MetricInjector) InjectMetricAnomaly(ctx context.Context, metricName st } mi.cacheMu.RUnlock() - // 查询Mock Error Service获取异常规则 + // 查询Mock Error Service获取异常规则(基于版本) request := map[string]string{ "service": mi.serviceName, + "version": mi.serviceVersion, "metric_name": metricName, - "instance": instanceID, } var response struct { ShouldInject bool `json:"should_inject"` Service string `json:"service"` + Version string `json:"version"` MetricName string `json:"metric_name"` - Instance string `json:"instance"` Anomaly map[string]any `json:"anomaly,omitempty"` } diff --git a/mock/s3/shared/middleware/error_injection/http_latency_injector.go b/mock/s3/shared/middleware/error_injection/http_latency_injector.go new file mode 100644 index 0000000..f26de3c --- /dev/null +++ b/mock/s3/shared/middleware/error_injection/http_latency_injector.go @@ -0,0 +1,183 @@ +package error_injection + +import ( + "context" + "mocks3/shared/client" + "mocks3/shared/observability" + "sync" + "time" +) + +// HTTPLatencyInjector HTTP请求延迟注入器 +type HTTPLatencyInjector 
struct { + mockErrorClient *client.BaseHTTPClient + serviceName string + serviceVersion string + logger *observability.Logger + + // 缓存 + cache map[string]*CachedLatencyConfig + cacheMu sync.RWMutex + cacheTTL time.Duration +} + +// CachedLatencyConfig 缓存的延迟配置 +type CachedLatencyConfig struct { + Config *LatencyConfig + ExpiresAt time.Time +} + +// LatencyConfig 延迟配置 +type LatencyConfig struct { + ShouldInject bool `json:"should_inject"` + Latency time.Duration `json:"latency"` // 注入的延迟时间 + Probability float64 `json:"probability"` // 注入概率 (0-1) + Pattern string `json:"pattern"` // 路径匹配模式(可选) +} + +// NewHTTPLatencyInjector 创建HTTP延迟注入器 +func NewHTTPLatencyInjector(mockErrorServiceURL string, serviceName, serviceVersion string, logger *observability.Logger) *HTTPLatencyInjector { + client := client.NewBaseHTTPClient(mockErrorServiceURL, 5*time.Second, "latency-injector", logger) + + injector := &HTTPLatencyInjector{ + mockErrorClient: client, + serviceName: serviceName, + serviceVersion: serviceVersion, + logger: logger, + cache: make(map[string]*CachedLatencyConfig), + cacheTTL: 30 * time.Second, + } + + // 启动缓存清理 + go injector.cleanupCache() + + return injector +} + +// GetLatencyConfig 获取延迟配置 +func (h *HTTPLatencyInjector) GetLatencyConfig(ctx context.Context, path string) (*LatencyConfig, error) { + // 构建缓存键(基于版本) + cacheKey := h.serviceName + ":" + h.serviceVersion + ":" + path + + // 检查缓存 + h.cacheMu.RLock() + if cached, exists := h.cache[cacheKey]; exists && time.Now().Before(cached.ExpiresAt) { + h.cacheMu.RUnlock() + return cached.Config, nil + } + h.cacheMu.RUnlock() + + // 查询Mock Error Service获取延迟配置 + request := map[string]string{ + "service": h.serviceName, + "version": h.serviceVersion, + "path": path, + "type": "http_latency", + } + + var response struct { + ShouldInject bool `json:"should_inject"` + Latency int64 `json:"latency_ms"` // 毫秒 + Probability float64 `json:"probability"` + Pattern string `json:"pattern"` + } + + opts := client.RequestOptions{ 
+ Method: "POST", + Path: "/api/v1/latency-inject/check", + Body: request, + } + + err := h.mockErrorClient.DoRequestWithJSON(ctx, opts, &response) + if err != nil { + h.logger.Debug(ctx, "Failed to check latency injection", + observability.Error(err), + observability.String("path", path)) + // 失败时缓存空结果 + h.updateCache(cacheKey, nil) + return nil, nil + } + + // 构建配置 + var config *LatencyConfig + if response.ShouldInject { + config = &LatencyConfig{ + ShouldInject: true, + Latency: time.Duration(response.Latency) * time.Millisecond, + Probability: response.Probability, + Pattern: response.Pattern, + } + } + + // 更新缓存 + h.updateCache(cacheKey, config) + + return config, nil +} + +// InjectLatency 注入延迟(如果需要) +func (h *HTTPLatencyInjector) InjectLatency(ctx context.Context, path string) time.Duration { + config, err := h.GetLatencyConfig(ctx, path) + if err != nil || config == nil || !config.ShouldInject { + return 0 + } + + // 基于概率决定是否注入 + if config.Probability < 1.0 { + // 简单的概率实现(生产环境应使用更好的随机数) + if time.Now().UnixNano()%100 >= int64(config.Probability*100) { + return 0 + } + } + + // 执行真实的延迟 + if config.Latency > 0 { + h.logger.Info(ctx, "Injecting HTTP latency", + observability.String("service", h.serviceName), + observability.String("version", h.serviceVersion), + observability.String("path", path), + observability.Duration("latency", config.Latency)) + + // 真实的延迟注入 + time.Sleep(config.Latency) + + return config.Latency + } + + return 0 +} + +// updateCache 更新缓存 +func (h *HTTPLatencyInjector) updateCache(key string, config *LatencyConfig) { + h.cacheMu.Lock() + defer h.cacheMu.Unlock() + + h.cache[key] = &CachedLatencyConfig{ + Config: config, + ExpiresAt: time.Now().Add(h.cacheTTL), + } +} + +// cleanupCache 定期清理过期缓存 +func (h *HTTPLatencyInjector) cleanupCache() { + ticker := time.NewTicker(1 * time.Minute) + defer ticker.Stop() + + for range ticker.C { + h.cacheMu.Lock() + now := time.Now() + for key, cached := range h.cache { + if now.After(cached.ExpiresAt) 
{ + delete(h.cache, key) + } + } + h.cacheMu.Unlock() + } +} + +// Cleanup 清理资源 +func (h *HTTPLatencyInjector) Cleanup() { + h.cacheMu.Lock() + defer h.cacheMu.Unlock() + h.cache = make(map[string]*CachedLatencyConfig) +} diff --git a/mock/s3/shared/observability/metrics.go b/mock/s3/shared/observability/metrics.go index 755161e..4f00649 100644 --- a/mock/s3/shared/observability/metrics.go +++ b/mock/s3/shared/observability/metrics.go @@ -439,11 +439,8 @@ func (c *MetricCollector) collectNetworkMetrics(ctx context.Context) { finalValue = c.metricInjector.InjectMetricAnomaly(ctx, "system_network_qps", qps) } - // 添加服务属性作为标签 + // 添加服务版本标签(exported_job 冗余,已通过 service_name 资源属性暴露) attrs := []attribute.KeyValue{} - if c.serviceName != "" { - attrs = append(attrs, attribute.String("exported_job", c.serviceName)) - } if c.serviceVersion != "" { attrs = append(attrs, attribute.String("service_version", c.serviceVersion)) } @@ -524,17 +521,14 @@ func (c *MetricCollector) updateMachineStatus(ctx context.Context) { // RecordHTTPRequestDuration 记录 HTTP 请求时延 func (c *MetricCollector) RecordHTTPRequestDuration(ctx context.Context, duration float64, method, path string, statusCode int) { - // 构建属性标签 + // 构建属性标签(移除 exported_job,保留 service_version) attrs := []attribute.KeyValue{ attribute.String("http.method", method), attribute.String("http.route", path), attribute.Int("http.status_code", statusCode), } - // 添加服务属性 - if c.serviceName != "" { - attrs = append(attrs, attribute.String("exported_job", c.serviceName)) - } + // 添加服务版本(必要标签,用于版本区分) if c.serviceVersion != "" { attrs = append(attrs, attribute.String("service_version", c.serviceVersion)) } diff --git a/mock/s3/shared/observability/middleware.go b/mock/s3/shared/observability/middleware.go index a07e8b8..5b22f85 100644 --- a/mock/s3/shared/observability/middleware.go +++ b/mock/s3/shared/observability/middleware.go @@ -1,16 +1,23 @@ package observability import ( + "context" "time" "github.com/gin-gonic/gin" 
"go.opentelemetry.io/contrib/instrumentation/github.com/gin-gonic/gin/otelgin" ) +// LatencyInjector 延迟注入器接口 +type LatencyInjector interface { + InjectLatency(ctx context.Context, path string) time.Duration +} + // HTTPMiddleware HTTP监控中间件 type HTTPMiddleware struct { - collector *MetricCollector - logger *Logger + collector *MetricCollector + logger *Logger + latencyInjector LatencyInjector } // NewHTTPMiddleware 创建HTTP中间件 @@ -21,25 +28,38 @@ func NewHTTPMiddleware(collector *MetricCollector, logger *Logger) *HTTPMiddlewa } } +// SetLatencyInjector 设置延迟注入器 +func (m *HTTPMiddleware) SetLatencyInjector(injector LatencyInjector) { + m.latencyInjector = injector +} + // GinMetricsMiddleware Gin指标中间件 func (m *HTTPMiddleware) GinMetricsMiddleware() gin.HandlerFunc { return func(c *gin.Context) { + // 获取请求路径 + path := c.FullPath() + if path == "" { + path = c.Request.URL.Path + } + + // 在请求处理前注入延迟(如果配置了延迟注入器) + var injectedLatency time.Duration + if m.latencyInjector != nil { + injectedLatency = m.latencyInjector.InjectLatency(c.Request.Context(), path) + } + start := time.Now() // 处理请求 c.Next() - // 计算请求时延 + // 计算请求时延(包含注入的延迟) duration := time.Since(start) statusCode := c.Writer.Status() // 记录 HTTP 请求时延指标(以秒为单位) if m.collector != nil { durationSeconds := duration.Seconds() - path := c.FullPath() - if path == "" { - path = c.Request.URL.Path // 如果没有匹配的路由,使用原始路径 - } m.collector.RecordHTTPRequestDuration( c.Request.Context(), durationSeconds, @@ -53,18 +73,30 @@ func (m *HTTPMiddleware) GinMetricsMiddleware() gin.HandlerFunc { if statusCode >= 400 { m.logger.Warn(c.Request.Context(), "HTTP request completed with error", String("method", c.Request.Method), - String("path", c.FullPath()), + String("path", path), Int("status", statusCode), Duration("duration", duration), + Duration("injected_latency", injectedLatency), ) } - m.logger.Info(c.Request.Context(), "HTTP request completed", - String("method", c.Request.Method), - String("path", c.FullPath()), - Int("status", 
statusCode), - Duration("duration", duration), - ) + // 记录请求信息(如果有注入延迟,记录在日志中) + if injectedLatency > 0 { + m.logger.Info(c.Request.Context(), "HTTP request completed with injected latency", + String("method", c.Request.Method), + String("path", path), + Int("status", statusCode), + Duration("duration", duration), + Duration("injected_latency", injectedLatency), + ) + } else { + m.logger.Info(c.Request.Context(), "HTTP request completed", + String("method", c.Request.Method), + String("path", path), + Int("status", statusCode), + Duration("duration", duration), + ) + } } } diff --git a/mock/s3/shared/observability/providers.go b/mock/s3/shared/observability/providers.go index 4ca00ba..0f28ba5 100644 --- a/mock/s3/shared/observability/providers.go +++ b/mock/s3/shared/observability/providers.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "mocks3/shared/observability/config" - "mocks3/shared/utils" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp" @@ -183,14 +182,11 @@ func (p *Providers) Shutdown(ctx context.Context) error { // createResource 创建OTEL资源 func createResource(config *config.ObservabilityConfig) (*resource.Resource, error) { - // 使用统一的实例ID生成器 - instanceID := utils.GetInstanceID(config.ServiceName) return resource.New(context.Background(), resource.WithAttributes( semconv.ServiceName(config.ServiceName), semconv.ServiceVersion(config.ServiceVersion), semconv.DeploymentEnvironment(config.Environment), - semconv.ServiceInstanceID(instanceID), ), ) } diff --git a/mock/s3/shared/server/service_bootstrap.go b/mock/s3/shared/server/service_bootstrap.go index 5e48b74..6be5825 100644 --- a/mock/s3/shared/server/service_bootstrap.go +++ b/mock/s3/shared/server/service_bootstrap.go @@ -47,7 +47,8 @@ type ServiceBootstrap struct { HTTPMiddleware *observability.HTTPMiddleware // 错误注入 - MetricInjector *error_injection.MetricInjector + MetricInjector *error_injection.MetricInjector + LatencyInjector 
*error_injection.HTTPLatencyInjector // Consul客户端 ConsulClient consul.ConsulClient @@ -286,10 +287,14 @@ func firstIPv4(addrs []net.Addr) string { func (sb *ServiceBootstrap) setupErrorInjection() error { ctx := context.Background() - // 尝试从配置文件加载 + // 获取服务版本(默认为 1.0.0) + serviceVersion := "1.0.0" + + // 尝试从配置文件加载指标注入器 metricInjector, err := error_injection.NewMetricInjector( sb.MetricInjectorConfigPath, sb.Config.GetServiceName(), + serviceVersion, sb.Logger, ) @@ -300,6 +305,7 @@ func (sb *ServiceBootstrap) setupErrorInjection() error { sb.MetricInjector = error_injection.NewMetricInjectorWithDefaults( "http://mock-error-service:8085", sb.Config.GetServiceName(), + serviceVersion, sb.Logger, ) } else { @@ -307,7 +313,24 @@ func (sb *ServiceBootstrap) setupErrorInjection() error { } if sb.MetricInjector != nil { - sb.Logger.Info(ctx, "Metric injector initialized successfully") + sb.Logger.Info(ctx, "Metric injector initialized successfully", + observability.String("service_version", serviceVersion)) + } + + // 创建HTTP延迟注入器 + sb.LatencyInjector = error_injection.NewHTTPLatencyInjector( + "http://mock-error-service:8085", + sb.Config.GetServiceName(), + serviceVersion, + sb.Logger, + ) + + // 将延迟注入器连接到HTTP中间件 + if sb.HTTPMiddleware != nil && sb.LatencyInjector != nil { + sb.HTTPMiddleware.SetLatencyInjector(sb.LatencyInjector) + sb.Logger.Info(ctx, "HTTP latency injector connected to middleware", + observability.String("service", sb.Config.GetServiceName()), + observability.String("version", serviceVersion)) } return nil @@ -407,6 +430,12 @@ func (sb *ServiceBootstrap) waitForShutdown(server *http.Server) { sb.Logger.Info(ctx, "Metric injector cleaned up") } + // 清理延迟注入器资源 + if sb.LatencyInjector != nil { + sb.LatencyInjector.Cleanup() + sb.Logger.Info(ctx, "Latency injector cleaned up") + } + // 关闭HTTP服务器 shutdownCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() From c8ba0993bde138c8bc7c451c2fef96f67a80b67c Mon Sep 17 00:00:00 
2001 From: dnj Date: Fri, 19 Sep 2025 17:35:17 +0800 Subject: [PATCH 03/18] =?UTF-8?q?refactor(metrics):=20=E7=AE=80=E5=8C=96HT?= =?UTF-8?q?TP=E6=97=B6=E5=BB=B6=E6=8C=87=E6=A0=87=E5=90=8D=E7=A7=B0?= =?UTF-8?q?=E5=B9=B6=E6=9B=B4=E6=96=B0=E5=81=A5=E5=BA=B7=E6=A3=80=E6=9F=A5?= =?UTF-8?q?=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 更新HTTP时延指标名称使其更简洁,同时改进健康检查脚本以支持批量检查多个端口 --- mock/s3/DEPLOYMENT.md | 11 ++++++----- mock/s3/shared/observability/metrics.go | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/mock/s3/DEPLOYMENT.md b/mock/s3/DEPLOYMENT.md index 6e69125..8a93c65 100644 --- a/mock/s3/DEPLOYMENT.md +++ b/mock/s3/DEPLOYMENT.md @@ -55,11 +55,12 @@ sudo supervisorctl start zeroops_* ps aux | grep zeroops_ # 健康检查 -curl http://localhost:8181/health # metadata-service -curl http://localhost:8191/health # storage-service -curl http://localhost:8201/health # queue-service -curl http://localhost:8211/health # third-party-service -curl http://localhost:8221/health # mock-error-service +``` +for port in 8182 8183 8191 8192 8201 8202 8211 8221; do + echo "Checking port $port:" + curl -s -o /dev/null -w "HTTP Status: %{http_code}\n" http://localhost:$port/metrics ||echo "Failed" +done +``` # 查看日志 tail -f /home/qboxserver/zeroops_metadata_1/logs/service.log diff --git a/mock/s3/shared/observability/metrics.go b/mock/s3/shared/observability/metrics.go index 4f00649..d6a8343 100644 --- a/mock/s3/shared/observability/metrics.go +++ b/mock/s3/shared/observability/metrics.go @@ -153,9 +153,9 @@ func (c *MetricCollector) initMetrics() error { return err } - // HTTP 请求时延 (使用 Prometheus 兼容的命名) + // HTTP 请求时延 if c.httpRequestDuration, err = c.meter.Float64Histogram( - "http.server.request.duration_seconds", + "http_latency", metric.WithDescription("HTTP server request duration in seconds"), metric.WithUnit("s"), metric.WithExplicitBucketBoundaries( From 334a81d70c9d9a70c9147386a9bb41f0368649ff Mon Sep 
17 00:00:00 2001 From: Ding Date: Sat, 20 Sep 2025 20:09:14 +0800 Subject: [PATCH 04/18] =?UTF-8?q?feat(prometheus=5Fadapter):=20=E6=96=B0?= =?UTF-8?q?=E5=A2=9Eprometheus=5Fadapter=E5=AE=9E=E7=8E=B0=E6=8C=87?= =?UTF-8?q?=E6=A0=87=E6=9F=A5=E8=AF=A2=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 实现Prometheus适配器模块,包括以下主要功能: - 添加Prometheus客户端封装,支持指标查询和范围查询 - 实现指标服务层,提供指标列表获取和指标数据查询 - 添加API路由和控制器,提供RESTful接口 - 定义模型结构体和错误处理机制 - 更新依赖添加Prometheus客户端库 - 编写详细API文档说明接口使用方式 --- docs/prometheus_adapter/API.md | 271 ------------------ docs/prometheus_adapter/README.md | 140 +++++++++ go.mod | 18 +- go.sum | 48 +++- internal/prometheus_adapter/api/api.go | 29 ++ internal/prometheus_adapter/api/metric_api.go | 175 +++++++++++ .../client/prometheus_client.go | 144 ++++++++++ internal/prometheus_adapter/model/api.go | 24 ++ .../prometheus_adapter/model/constants.go | 10 + internal/prometheus_adapter/model/error.go | 49 ++++ internal/prometheus_adapter/server.go | 65 +++++ .../service/metric_service.go | 80 ++++++ 12 files changed, 761 insertions(+), 292 deletions(-) delete mode 100644 docs/prometheus_adapter/API.md create mode 100644 docs/prometheus_adapter/README.md create mode 100644 internal/prometheus_adapter/api/api.go create mode 100644 internal/prometheus_adapter/api/metric_api.go create mode 100644 internal/prometheus_adapter/client/prometheus_client.go create mode 100644 internal/prometheus_adapter/model/api.go create mode 100644 internal/prometheus_adapter/model/constants.go create mode 100644 internal/prometheus_adapter/model/error.go create mode 100644 internal/prometheus_adapter/server.go create mode 100644 internal/prometheus_adapter/service/metric_service.go diff --git a/docs/prometheus_adapter/API.md b/docs/prometheus_adapter/API.md deleted file mode 100644 index 4cfe228..0000000 --- a/docs/prometheus_adapter/API.md +++ /dev/null @@ -1,271 +0,0 @@ -# Prometheus Adapter API 文档 - -## 概述 - -Prometheus 
Adapter 提供从 Prometheus 获取服务 QPS 和平均时延指标的 RESTful API 接口。支持按服务名称和版本进行查询。 - -> **当前状态**: -> - QPS 指标:已实现,使用 `system_network_qps` 指标(基于网络包统计) -> - 时延指标:已实现,使用 `http.server.request.duration_seconds` 指标(HTTP 请求真实时延) - -## API - -### 1. 获取服务 QPS 指标 - -**GET** `/v1/metrics/:service/qps` - -获取指定服务的 QPS(每秒请求数)指标数据。 - -#### 路径参数 -- `service` (string, required): 服务名称 - -#### 查询参数 -- `version` (string, optional): 服务版本,不指定则返回所有版本 -- `start` (string, optional): 开始时间 (RFC3339 格式,如: 2024-01-01T00:00:00Z) -- `end` (string, optional): 结束时间 (RFC3339 格式,如: 2024-01-01T01:00:00Z) -- `step` (string, optional): 时间步长 (如: 1m, 5m, 1h),默认 1m - -#### 请求示例 -```bash -GET /v1/metrics/metadata-service/qps?version=1.0.0&start=2024-01-01T00:00:00Z&end=2024-01-01T01:00:00Z&step=1m -``` - -#### 响应示例 -```json -{ - "service": "metadata-service", - "version": "1.0.0", - "metric_type": "qps", - "data": [ - { - "timestamp": "2024-01-01T00:00:00Z", - "value": 150.5 - }, - { - "timestamp": "2024-01-01T00:01:00Z", - "value": 148.2 - } - ], - "summary": { - "min": 120.1, - "max": 180.3, - "avg": 152.8, - "total_points": 60 - } -} -``` - -### 2. 
获取服务平均时延指标 - -**GET** `/v1/metrics/:service/latency` - -获取指定服务的平均响应时延指标数据(单位:秒)。 - -#### 路径参数 -- `service` (string, required): 服务名称 - -#### 查询参数 -- `version` (string, optional): 服务版本,不指定则返回所有版本 -- `start` (string, optional): 开始时间 (RFC3339 格式) -- `end` (string, optional): 结束时间 (RFC3339 格式) -- `step` (string, optional): 时间步长,默认 1m -- `percentile` (string, optional): 百分位数 (p50, p95, p99),默认 p50 - -#### 请求示例 -```bash -GET /v1/metrics/storage-service/latency?version=1.0.0&percentile=p95&start=2024-01-01T00:00:00Z&end=2024-01-01T01:00:00Z -``` - -#### 响应示例 -```json -{ - "service": "storage-service", - "version": "1.0.0", - "metric_type": "latency", - "percentile": "p95", - "data": [ - { - "timestamp": "2024-01-01T00:00:00Z", - "value": 125.8 - }, - { - "timestamp": "2024-01-01T00:01:00Z", - "value": 132.1 - } - ], - "summary": { - "min": 98.5, - "max": 201.2, - "avg": 128.9, - "total_points": 60 - } -} -``` - -### 3. 获取服务综合指标 - -**GET** `/v1/metrics/:service/overview` - -同时获取指定服务的 QPS 和时延指标概览。 - -#### 路径参数 -- `service` (string, required): 服务名称 - -#### 查询参数 -- `version` (string, optional): 服务版本 -- `start` (string, optional): 开始时间 (RFC3339 格式) -- `end` (string, optional): 结束时间 (RFC3339 格式) - -#### 响应示例 -```json -{ - "service": "queue-service", - "version": "1.0.0", - "time_range": { - "start": "2024-01-01T00:00:00Z", - "end": "2024-01-01T01:00:00Z" - }, - "metrics": { - "qps": { - "current": 152.8, - "avg": 148.5, - "max": 180.3, - "min": 120.1 - }, - "latency": { - "p50": 85.2, - "p95": 128.9, - "p99": 201.2 - } - } -} -``` - -### 4. 
获取可用服务列表 - -**GET** `/v1/services` - -获取 Prometheus 中可监控的服务列表。 - -#### 查询参数 -- `prefix` (string, optional): 服务名前缀过滤 - -#### 响应示例 -```json -{ - "services": [ - { - "name": "metadata-service", - "versions": ["1.0.0"], - "active_versions": ["1.0.0"], - "last_updated": "2024-01-01T01:00:00Z" - }, - { - "name": "storage-service", - "versions": ["1.0.0"], - "active_versions": ["1.0.0"], - "last_updated": "2024-01-01T00:45:00Z" - }, - { - "name": "queue-service", - "versions": ["1.0.0"], - "active_versions": ["1.0.0"], - "last_updated": "2024-01-01T00:30:00Z" - }, - { - "name": "third-party-service", - "versions": ["1.0.0"], - "active_versions": ["1.0.0"], - "last_updated": "2024-01-01T00:20:00Z" - }, - { - "name": "mock-error-service", - "versions": ["1.0.0"], - "active_versions": ["1.0.0"], - "last_updated": "2024-01-01T00:15:00Z" - } - ], - "total": 5 -} -``` - -## 错误响应 - -所有 API 在出错时返回统一的错误格式: - -```json -{ - "error": "error_code", - "message": "详细错误描述", - "details": { - "field": "具体错误字段" - } -} -``` - -### 常见错误码 - -- `400 Bad Request`: 请求参数错误 -- `404 Not Found`: 服务或版本不存在 -- `500 Internal Server Error`: 内部服务器错误 -- `503 Service Unavailable`: Prometheus 连接失败 - -## 实现说明 - -### Prometheus 查询语法 - -API 内部使用的 Prometheus 查询示例: - -#### QPS 查询 -```promql -# 网络包 QPS(当前实现) -system_network_qps{exported_job="metadata-service",service_version="1.0.0"} - -# 计算5分钟平均 QPS -rate(system_network_qps{exported_job="metadata-service",service_version="1.0.0"}[5m]) -``` - -#### 平均时延查询 -```promql -# P95 时延(95分位数) -histogram_quantile(0.95, rate(http.server.request.duration_seconds_bucket{exported_job="metadata-service",service_version="1.0.0"}[5m])) - -# P50 时延(中位数) -histogram_quantile(0.50, rate(http.server.request.duration_seconds_bucket{exported_job="metadata-service",service_version="1.0.0"}[5m])) - -# P99 时延(99分位数) -histogram_quantile(0.99, rate(http.server.request.duration_seconds_bucket{exported_job="metadata-service",service_version="1.0.0"}[5m])) - -# 平均时延 
-rate(http.server.request.duration_seconds_sum{exported_job="metadata-service",service_version="1.0.0"}[5m]) -/ -rate(http.server.request.duration_seconds_count{exported_job="metadata-service",service_version="1.0.0"}[5m]) -``` - -### 配置要求 - -需要在配置文件中指定: -- Prometheus 服务器地址:`http://10.210.10.33:9090` -- 查询超时时间:30秒 -- 默认时间范围:最近1小时 -- 服务标签映射: - - 服务名:`exported_job`(在指标中作为标签) - - 版本号:`service_version`(在指标中作为标签) - - 实例标识:通过 OpenTelemetry 的 `service.instance.id` 属性设置 - -### 支持的服务列表 - -当前 mock/s3 环境中支持的服务: -- `metadata-service` - 元数据管理服务(版本:1.0.0) -- `storage-service` - 存储服务(版本:1.0.0) -- `queue-service` - 消息队列服务(版本:1.0.0) -- `third-party-service` - 第三方集成服务(版本:1.0.0) -- `mock-error-service` - 错误模拟服务(版本:1.0.0) - -所有服务的版本信息通过 `service_version` 标签暴露。 - -### 缓存策略 - -- 指标数据缓存时间:30秒 -- 服务列表缓存时间:5分钟 -- 支持 ETag 缓存验证 \ No newline at end of file diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md new file mode 100644 index 0000000..30ca4be --- /dev/null +++ b/docs/prometheus_adapter/README.md @@ -0,0 +1,140 @@ +# Prometheus Adapter API 文档 + +## 概述 + +Prometheus Adapter 提供从 Prometheus 获取服务指标的 RESTful API 接口。支持按服务名称和版本进行查询。 + +## API + +### 1. 获取可用指标列表 + +**GET** `/v1/metrics` + +获取所有可用的指标列表。 + +#### 请求示例 +```bash +GET /v1/metrics +``` + +#### 响应示例 +```json +{ + "metrics": [ + "system_cpu_usage_percent", + "system_memory_usage_percent", + "system_disk_usage_percent", + "system_network_qps", + "system_machine_online_status", + "http_latency" + ] +} +``` + +### 2. 通用指标查询接口 + +**GET** `/v1/metrics/:service/:metric` + +获取指定服务的任意指标时间序列数据。指标不存在则返回错误。 + +#### 路径参数 +- `service` (string, required): 服务名称 +- `metric` (string, required): 指标名称(必须是 Prometheus 中实际存在的指标) + +#### 查询参数 +- `version` (string, optional): 服务版本,不指定则返回所有版本 +- `start` (string, optional): 开始时间 (RFC3339 格式,如: 2024-01-01T00:00:00Z) +- `end` (string, optional): 结束时间 (RFC3339 格式,如: 2024-01-01T01:00:00Z) +- `step` (string, optional): 时间步长 (如: 1m, 5m, 1h),默认 1m + +#### 请求示例 + +1. 
**查询 CPU 使用率:** +```bash +GET /v1/metrics/metadata-service/system_cpu_usage_percent?version=1.0.0 +``` + +2. **查询内存使用率:** +```bash +GET /v1/metrics/storage-service/system_memory_usage_percent?version=1.0.0 +``` + +3. **查询 HTTP 请求延迟:** +```bash +GET /v1/metrics/storage-service/http_latency?version=1.0.0 +``` + +4. **查询网络 QPS:** +```bash +GET /v1/metrics/storage-service/system_network_qps?version=1.0.0 +``` + +#### 成功响应示例 + +**HTTP 200 OK** +```json +{ + "service": "metadata-service", + "version": "1.0.0", + "metric": "system_cpu_usage_percent", + "data": [ + { + "timestamp": "2024-01-01T00:00:00Z", + "value": 45.2 + }, + { + "timestamp": "2024-01-01T00:01:00Z", + "value": 48.5 + } + ] +} +``` + +#### 错误响应示例 + +**指标不存在时 - HTTP 404 Not Found** +```json +{ + "error": { + "code": "METRIC_NOT_FOUND", + "message": "指标 'invalid_metric' 不存在", + "metric": "invalid_metric" + } +} +``` + +**服务不存在时 - HTTP 404 Not Found** +```json +{ + "error": { + "code": "SERVICE_NOT_FOUND", + "message": "服务 'invalid-service' 不存在", + "service": "invalid-service" + } +} +``` + +**参数错误时 - HTTP 400 Bad Request** +```json +{ + "error": { + "code": "INVALID_PARAMETER", + "message": "参数 'start' 格式错误: invalid-time", + "parameter": "start", + "value": "invalid-time" + } +} +``` + +## 实现说明 + +### 支持的服务列表 + +当前 mock/s3 环境中支持的服务: +- `metadata-service` - 元数据管理服务 +- `storage-service` - 存储服务 +- `queue-service` - 消息队列服务 +- `third-party-service` - 第三方集成服务 +- `mock-error-service` - 错误模拟服务 + +所有服务的版本信息通过 `service_version` 标签暴露。 \ No newline at end of file diff --git a/go.mod b/go.mod index 6094f9c..8015824 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,8 @@ require ( github.com/google/uuid v1.6.0 github.com/jackc/pgx/v5 v5.5.5 github.com/lib/pq v1.10.9 + github.com/prometheus/client_golang v1.23.2 + github.com/prometheus/common v0.66.1 github.com/redis/go-redis/v9 v9.5.1 github.com/rs/zerolog v1.34.0 ) @@ -14,7 +16,7 @@ require ( require ( github.com/bytedance/sonic v1.13.3 // indirect 
github.com/bytedance/sonic/loader v0.2.4 // indirect - github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cloudwego/base64x v0.1.5 // indirect github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/gabriel-vasile/mimetype v1.4.9 // indirect @@ -39,14 +41,16 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/natefinch/lumberjack v2.0.0+incompatible // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect + github.com/prometheus/client_model v0.6.2 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.3.0 // indirect + go.yaml.in/yaml/v2 v2.4.2 // indirect golang.org/x/arch v0.18.0 // indirect - golang.org/x/crypto v0.39.0 // indirect - golang.org/x/net v0.41.0 // indirect - golang.org/x/sync v0.15.0 // indirect - golang.org/x/sys v0.33.0 // indirect - golang.org/x/text v0.26.0 // indirect - google.golang.org/protobuf v1.36.6 // indirect + golang.org/x/crypto v0.41.0 // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/sync v0.16.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/text v0.28.0 // indirect + google.golang.org/protobuf v1.36.8 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 04e9b56..2c92f4d 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,7 @@ github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak= github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= @@ -9,8 
+11,8 @@ github.com/bytedance/sonic v1.13.3/go.mod h1:o68xyaF9u2gvVBuGHPlUVCy+ZfmNNO5ETf1 github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU= github.com/bytedance/sonic/loader v0.2.4 h1:ZWCw4stuXUsn1/+zQDqeE7JKP+QO47tz7QCNan80NzY= github.com/bytedance/sonic/loader v0.2.4/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI= -github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= -github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cloudwego/base64x v0.1.5 h1:XPciSp1xaq2VCSt6lF0phncD4koWyULpl5bUxbfCyP4= github.com/cloudwego/base64x v0.1.5/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w= github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY= @@ -56,6 +58,8 @@ github.com/jackc/pgx/v5 v5.5.5 h1:amBjrZVmksIdNjxGW/IiIMzxMKZFelXbUoPNb+8sjQw= github.com/jackc/pgx/v5 v5.5.5/go.mod h1:ez9gk+OAat140fv9ErkZDYFWmXLfV+++K0uAOiwgm1A= github.com/jackc/puddle/v2 v2.2.1 h1:RhxXJtFG022u4ibrCSMSiu5aOq1i77R3OHKNJj77OAk= github.com/jackc/puddle/v2 v2.2.1/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= +github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= +github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= @@ -84,6 +88,10 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod 
h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/natefinch/lumberjack v2.0.0+incompatible h1:4QJd3OLAMgj7ph+yZTuX13Ld4UpgHp07nNdFX7mqFfM= github.com/natefinch/lumberjack v2.0.0+incompatible/go.mod h1:Wi9p2TTF5DG5oU+6YfsmYQpsTIOm0B1VNzQg9Mw6nPk= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= @@ -92,6 +100,14 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= 
+github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/redis/go-redis/v9 v9.5.1 h1:H1X4D3yHPaYrkL5X06Wh6xNVM/pX0Ft4RV0vMGvLBh8= github.com/redis/go-redis/v9 v9.5.1/go.mod h1:hdY0cQFCN4fnSYT6TkisLufl/4W5UIXyv0b/CLO2V2M= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= @@ -113,23 +129,27 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08= github.com/ugorji/go/codec v1.3.0 h1:Qd2W2sQawAfG8XSvzwhBeoGq71zXOC/Q1E9y/wUcsUA= github.com/ugorji/go/codec v1.3.0/go.mod h1:pRBVtBSKl77K30Bv8R2P+cLSGaTtex6fsA2Wjqmfxj4= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= golang.org/x/arch v0.18.0 h1:WN9poc33zL4AzGxqf8VtpKUnGvMi8O9lhNyBMF/85qc= golang.org/x/arch v0.18.0/go.mod h1:bdwinDaKcfZUGpH09BB7ZmOfhalA8lQdzl62l8gGWsk= -golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= -golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= -golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= -golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= 
+golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= -golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= diff --git a/internal/prometheus_adapter/api/api.go b/internal/prometheus_adapter/api/api.go new file mode 100644 index 0000000..4de0c96 --- /dev/null +++ b/internal/prometheus_adapter/api/api.go @@ 
-0,0 +1,29 @@ +package api + +import ( + "github.com/fox-gonic/fox" + "github.com/qiniu/zeroops/internal/prometheus_adapter/service" +) + +// Api Prometheus Adapter API +type Api struct { + metricService *service.MetricService + router *fox.Engine +} + +// NewApi 创建新的 API +func NewApi(metricService *service.MetricService, router *fox.Engine) (*Api, error) { + api := &Api{ + metricService: metricService, + router: router, + } + + api.setupRouters(router) + return api, nil +} + +// setupRouters 设置路由 +func (api *Api) setupRouters(router *fox.Engine) { + // 指标相关路由 + api.setupMetricRouters(router) +} diff --git a/internal/prometheus_adapter/api/metric_api.go b/internal/prometheus_adapter/api/metric_api.go new file mode 100644 index 0000000..431bfdf --- /dev/null +++ b/internal/prometheus_adapter/api/metric_api.go @@ -0,0 +1,175 @@ +package api + +import ( + "errors" + "fmt" + "net/http" + "time" + + "github.com/fox-gonic/fox" + "github.com/qiniu/zeroops/internal/prometheus_adapter/model" + "github.com/rs/zerolog/log" +) + +// setupMetricRouters 设置指标相关路由 +func (api *Api) setupMetricRouters(router *fox.Engine) { + router.GET("/v1/metrics", api.GetMetrics) + router.GET("/v1/metrics/:service/:metric", api.QueryMetric) +} + +// GetMetrics 获取可用指标列表(GET /v1/metrics) +func (api *Api) GetMetrics(c *fox.Context) { + ctx := c.Request.Context() + + response, err := api.metricService.GetAvailableMetrics(ctx) + if err != nil { + log.Error().Err(err).Msg("failed to get available metrics") + api.sendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, "获取指标列表失败", nil) + return + } + + c.JSON(http.StatusOK, response) +} + +// QueryMetric 查询指标数据(GET /v1/metrics/:service/:metric) +func (api *Api) QueryMetric(c *fox.Context) { + ctx := c.Request.Context() + + // 获取路径参数 + serviceName := c.Param("service") + metricName := c.Param("metric") + + // 获取查询参数 + version := c.Query("version") + startStr := c.Query("start") + endStr := c.Query("end") + stepStr := 
c.Query("step") + + // 解析时间参数 + start, end, err := api.parseTimeRange(startStr, endStr) + if err != nil { + log.Error().Err(err).Msg("invalid time parameters") + api.sendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + fmt.Sprintf("参数 'start/end' 格式错误: %s", err.Error()), nil) + return + } + + // 解析步长参数 + step := api.parseStep(stepStr) + + // 查询指标 + response, err := api.metricService.QueryMetric(ctx, serviceName, metricName, version, start, end, step) + if err != nil { + api.handleQueryError(c, err, serviceName, metricName) + return + } + + c.JSON(http.StatusOK, response) +} + +// parseTimeRange 解析时间范围参数 +func (api *Api) parseTimeRange(startStr, endStr string) (time.Time, time.Time, error) { + var start, end time.Time + var err error + + // 如果没有指定开始时间,默认为1小时前 + if startStr == "" { + start = time.Now().Add(-1 * time.Hour) + } else { + start, err = time.Parse(time.RFC3339, startStr) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("invalid start time format: %w", err) + } + } + + // 如果没有指定结束时间,默认为当前时间 + if endStr == "" { + end = time.Now() + } else { + end, err = time.Parse(time.RFC3339, endStr) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("invalid end time format: %w", err) + } + } + + // 验证时间范围的合理性 + if end.Before(start) { + return time.Time{}, time.Time{}, fmt.Errorf("end time must be after start time") + } + + return start, end, nil +} + +// parseStep 解析步长参数 +func (api *Api) parseStep(stepStr string) time.Duration { + if stepStr == "" { + return time.Minute // 默认1分钟 + } + + duration, err := time.ParseDuration(stepStr) + if err != nil { + log.Warn().Str("step", stepStr).Msg("invalid step format, using default") + return time.Minute + } + + return duration +} + +// handleQueryError 处理查询错误 +func (api *Api) handleQueryError(c *fox.Context, err error, service, metric string) { + var serviceNotFound *model.ServiceNotFoundError + var metricNotFound *model.MetricNotFoundError + var prometheusError 
*model.PrometheusError + + switch { + case errors.As(err, &serviceNotFound): + log.Error().Err(err).Str("service", service).Msg("service not found") + api.sendErrorResponse(c, http.StatusNotFound, model.ErrorCodeServiceNotFound, + err.Error(), map[string]string{"service": service}) + + case errors.As(err, &metricNotFound): + log.Error().Err(err).Str("metric", metric).Msg("metric not found") + api.sendErrorResponse(c, http.StatusNotFound, model.ErrorCodeMetricNotFound, + err.Error(), map[string]string{"metric": metric}) + + case errors.As(err, &prometheusError): + log.Error().Err(err).Msg("prometheus query error") + api.sendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodePrometheusError, + "Prometheus 查询失败", nil) + + default: + log.Error().Err(err).Msg("unexpected error during metric query") + api.sendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, + "内部服务器错误", nil) + } +} + +// sendErrorResponse 发送错误响应 +func (api *Api) sendErrorResponse(c *fox.Context, statusCode int, errorCode, message string, extras map[string]string) { + errorDetail := model.ErrorDetail{ + Code: errorCode, + Message: message, + } + + // 添加额外的字段 + if extras != nil { + if service, ok := extras["service"]; ok { + errorDetail.Service = service + } + if metric, ok := extras["metric"]; ok { + errorDetail.Metric = metric + } + if parameter, ok := extras["parameter"]; ok { + errorDetail.Parameter = parameter + } + if value, ok := extras["value"]; ok { + errorDetail.Value = value + } + } + + response := model.ErrorResponse{ + Error: errorDetail, + } + + c.JSON(statusCode, response) +} diff --git a/internal/prometheus_adapter/client/prometheus_client.go b/internal/prometheus_adapter/client/prometheus_client.go new file mode 100644 index 0000000..7bf0a3a --- /dev/null +++ b/internal/prometheus_adapter/client/prometheus_client.go @@ -0,0 +1,144 @@ +package client + +import ( + "context" + "fmt" + "time" + + "github.com/prometheus/client_golang/api" + v1 
"github.com/prometheus/client_golang/api/prometheus/v1" + promModel "github.com/prometheus/common/model" + "github.com/qiniu/zeroops/internal/prometheus_adapter/model" +) + +// PrometheusClient Prometheus 客户端 +type PrometheusClient struct { + api v1.API +} + +// NewPrometheusClient 创建新的 Prometheus 客户端 +func NewPrometheusClient(address string) (*PrometheusClient, error) { + client, err := api.NewClient(api.Config{ + Address: address, + }) + if err != nil { + return nil, fmt.Errorf("failed to create prometheus client: %w", err) + } + + return &PrometheusClient{ + api: v1.NewAPI(client), + }, nil +} + +// QueryRange 执行范围查询 +func (c *PrometheusClient) QueryRange(ctx context.Context, query string, start, end time.Time, step time.Duration) ([]model.MetricDataPoint, error) { + r := v1.Range{ + Start: start, + End: end, + Step: step, + } + + result, warnings, err := c.api.QueryRange(ctx, query, r) + if err != nil { + return nil, fmt.Errorf("failed to query prometheus: %w", err) + } + + if len(warnings) > 0 { + // 记录警告但不返回错误 + fmt.Printf("Prometheus warnings: %v\n", warnings) + } + + // 转换结果为我们的数据格式 + matrix, ok := result.(promModel.Matrix) + if !ok { + return nil, fmt.Errorf("unexpected result type: %T", result) + } + + var dataPoints []model.MetricDataPoint + for _, sample := range matrix { + for _, pair := range sample.Values { + dataPoints = append(dataPoints, model.MetricDataPoint{ + Timestamp: pair.Timestamp.Time(), + Value: float64(pair.Value), + }) + } + } + + return dataPoints, nil +} + +// GetAvailableMetrics 获取所有可用的指标名称 +func (c *PrometheusClient) GetAvailableMetrics(ctx context.Context) ([]string, error) { + // 查询所有指标名称 + result, warnings, err := c.api.LabelValues(ctx, "__name__", nil, time.Now().Add(-time.Hour), time.Now()) + if err != nil { + return nil, fmt.Errorf("failed to get metrics: %w", err) + } + + if len(warnings) > 0 { + fmt.Printf("Prometheus warnings: %v\n", warnings) + } + + // 转换为字符串数组,过滤相关的指标 + metrics := make([]string, 0) + for _, m := range 
result { + metricName := string(m) + metrics = append(metrics, metricName) + } + + return metrics, nil +} + +// CheckMetricExists 检查指标是否存在 +func (c *PrometheusClient) CheckMetricExists(ctx context.Context, metric string) (bool, error) { + // 查询指标是否存在 + query := fmt.Sprintf(`{__name__="%s"}`, metric) + result, _, err := c.api.Query(ctx, query, time.Now()) + if err != nil { + return false, fmt.Errorf("failed to check metric existence: %w", err) + } + + // 如果有结果,说明指标存在 + switch v := result.(type) { + case promModel.Vector: + return len(v) > 0, nil + case promModel.Matrix: + return len(v) > 0, nil + default: + return false, nil + } +} + +// CheckServiceExists 检查服务是否存在 +func (c *PrometheusClient) CheckServiceExists(ctx context.Context, service string) (bool, error) { + // 查询服务是否存在 + query := fmt.Sprintf(`{service_name="%s"}`, service) + result, _, err := c.api.Query(ctx, query, time.Now()) + if err != nil { + return false, fmt.Errorf("failed to check service existence: %w", err) + } + + // 如果有结果,说明服务存在 + switch v := result.(type) { + case promModel.Vector: + return len(v) > 0, nil + case promModel.Matrix: + return len(v) > 0, nil + default: + return false, nil + } +} + +// BuildQuery 构建 PromQL 查询 +func BuildQuery(service, metric, version string) string { + // 基础查询 + query := fmt.Sprintf(`%s{service_name="%s"`, metric, service) + + // 如果指定了版本,添加版本过滤 + if version != "" { + query += fmt.Sprintf(`,service_version="%s"`, version) + } + + query += "}" + return query +} diff --git a/internal/prometheus_adapter/model/api.go b/internal/prometheus_adapter/model/api.go new file mode 100644 index 0000000..efef1d2 --- /dev/null +++ b/internal/prometheus_adapter/model/api.go @@ -0,0 +1,24 @@ +package model + +import "time" + +// ===== API 响应结构体 ===== + +// MetricListResponse 指标列表响应(对应 GET /v1/metrics) +type MetricListResponse struct { + Metrics []string `json:"metrics"` +} + +// MetricQueryResponse 指标查询响应(对应 GET /v1/metrics/:service/:metric) +type MetricQueryResponse struct { + 
Service string `json:"service"` + Version string `json:"version,omitempty"` + Metric string `json:"metric"` + Data []MetricDataPoint `json:"data"` +} + +// MetricDataPoint 指标数据点 +type MetricDataPoint struct { + Timestamp time.Time `json:"timestamp"` + Value float64 `json:"value"` +} diff --git a/internal/prometheus_adapter/model/constants.go b/internal/prometheus_adapter/model/constants.go new file mode 100644 index 0000000..3e8727b --- /dev/null +++ b/internal/prometheus_adapter/model/constants.go @@ -0,0 +1,10 @@ +package model + +// 错误码常量 +const ( + ErrorCodeMetricNotFound = "METRIC_NOT_FOUND" + ErrorCodeServiceNotFound = "SERVICE_NOT_FOUND" + ErrorCodeInvalidParameter = "INVALID_PARAMETER" + ErrorCodePrometheusError = "PROMETHEUS_ERROR" + ErrorCodeInternalError = "INTERNAL_ERROR" +) diff --git a/internal/prometheus_adapter/model/error.go b/internal/prometheus_adapter/model/error.go new file mode 100644 index 0000000..aacbb63 --- /dev/null +++ b/internal/prometheus_adapter/model/error.go @@ -0,0 +1,49 @@ +package model + +import "fmt" + +// ===== 错误响应结构体 ===== + +// ErrorResponse 错误响应 +type ErrorResponse struct { + Error ErrorDetail `json:"error"` +} + +// ErrorDetail 错误详情 +type ErrorDetail struct { + Code string `json:"code"` + Message string `json:"message"` + Service string `json:"service,omitempty"` + Metric string `json:"metric,omitempty"` + Parameter string `json:"parameter,omitempty"` + Value string `json:"value,omitempty"` +} + +// ===== 自定义错误类型 ===== + +// ServiceNotFoundError 服务不存在错误 +type ServiceNotFoundError struct { + Service string +} + +func (e *ServiceNotFoundError) Error() string { + return fmt.Sprintf("服务 '%s' 不存在", e.Service) +} + +// MetricNotFoundError 指标不存在错误 +type MetricNotFoundError struct { + Metric string +} + +func (e *MetricNotFoundError) Error() string { + return fmt.Sprintf("指标 '%s' 不存在", e.Metric) +} + +// PrometheusError Prometheus 查询错误 +type PrometheusError struct { + Message string +} + +func (e *PrometheusError) Error() string 
{ + return fmt.Sprintf("Prometheus 查询错误: %s", e.Message) +} diff --git a/internal/prometheus_adapter/server.go b/internal/prometheus_adapter/server.go new file mode 100644 index 0000000..1940839 --- /dev/null +++ b/internal/prometheus_adapter/server.go @@ -0,0 +1,65 @@ +package prometheusadapter + +import ( + "fmt" + "os" + + "github.com/fox-gonic/fox" + "github.com/qiniu/zeroops/internal/config" + "github.com/qiniu/zeroops/internal/prometheus_adapter/api" + "github.com/qiniu/zeroops/internal/prometheus_adapter/client" + "github.com/qiniu/zeroops/internal/prometheus_adapter/service" + "github.com/rs/zerolog/log" +) + +// PrometheusAdapterServer Prometheus Adapter 服务器 +type PrometheusAdapterServer struct { + config *config.Config + promClient *client.PrometheusClient + metricService *service.MetricService + api *api.Api +} + +// NewPrometheusAdapterServer 创建新的 Prometheus Adapter 服务器 +func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, error) { + // 使用环境变量或默认值获取 Prometheus 地址 + prometheusAddr := os.Getenv("PROMETHEUS_ADDRESS") + if prometheusAddr == "" { + prometheusAddr = "http://localhost:9090" + } + + // 创建 Prometheus 客户端 + promClient, err := client.NewPrometheusClient(prometheusAddr) + if err != nil { + return nil, fmt.Errorf("failed to create prometheus client: %w", err) + } + + // 创建指标服务 + metricService := service.NewMetricService(promClient) + + server := &PrometheusAdapterServer{ + config: cfg, + promClient: promClient, + metricService: metricService, + } + + log.Info().Str("prometheus_address", prometheusAddr).Msg("Prometheus Adapter initialized successfully") + return server, nil +} + +// UseApi 设置 API 路由 +func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error { + var err error + s.api, err = api.NewApi(s.metricService, router) + if err != nil { + return fmt.Errorf("failed to initialize API: %w", err) + } + return nil +} + +// Close 关闭服务器 +func (s *PrometheusAdapterServer) Close() error { + // 当前没有需要关闭的资源 + 
log.Info().Msg("Prometheus Adapter server closed") + return nil +} diff --git a/internal/prometheus_adapter/service/metric_service.go b/internal/prometheus_adapter/service/metric_service.go new file mode 100644 index 0000000..fff69c2 --- /dev/null +++ b/internal/prometheus_adapter/service/metric_service.go @@ -0,0 +1,80 @@ +package service + +import ( + "context" + "time" + + "github.com/qiniu/zeroops/internal/prometheus_adapter/client" + "github.com/qiniu/zeroops/internal/prometheus_adapter/model" + "github.com/rs/zerolog/log" +) + +// MetricService 指标服务 +type MetricService struct { + promClient *client.PrometheusClient +} + +// NewMetricService 创建指标服务 +func NewMetricService(promClient *client.PrometheusClient) *MetricService { + return &MetricService{ + promClient: promClient, + } +} + +// GetAvailableMetrics 获取可用的指标列表 +func (s *MetricService) GetAvailableMetrics(ctx context.Context) (*model.MetricListResponse, error) { + // 从 Prometheus 动态获取指标列表 + metrics, err := s.promClient.GetAvailableMetrics(ctx) + if err != nil { + log.Error().Err(err).Msg("failed to get available metrics from prometheus") + return nil, &model.PrometheusError{Message: err.Error()} + } + + return &model.MetricListResponse{ + Metrics: metrics, + }, nil +} + +// QueryMetric 查询指标数据 +func (s *MetricService) QueryMetric(ctx context.Context, service, metric, version string, start, end time.Time, step time.Duration) (*model.MetricQueryResponse, error) { + // 动态验证服务是否存在 + serviceExists, err := s.promClient.CheckServiceExists(ctx, service) + if err != nil { + log.Error().Err(err).Str("service", service).Msg("failed to check service existence") + return nil, &model.PrometheusError{Message: err.Error()} + } + if !serviceExists { + return nil, &model.ServiceNotFoundError{Service: service} + } + + // 动态验证指标是否存在 + metricExists, err := s.promClient.CheckMetricExists(ctx, metric) + if err != nil { + log.Error().Err(err).Str("metric", metric).Msg("failed to check metric existence") + return nil, 
&model.PrometheusError{Message: err.Error()} + } + if !metricExists { + return nil, &model.MetricNotFoundError{Metric: metric} + } + + // 构建 PromQL 查询 + query := client.BuildQuery(service, metric, version) + log.Debug().Str("query", query).Msg("executing prometheus query") + + // 执行查询 + dataPoints, err := s.promClient.QueryRange(ctx, query, start, end, step) + if err != nil { + log.Error().Err(err).Str("query", query).Msg("failed to query prometheus") + return nil, &model.PrometheusError{Message: err.Error()} + } + + // 构建响应 + response := &model.MetricQueryResponse{ + Service: service, + Version: version, + Metric: metric, + Data: dataPoints, + } + + return response, nil +} From 48b34b0cb7dbc5519e8a139165b8289a965512ac Mon Sep 17 00:00:00 2001 From: dnj Date: Tue, 23 Sep 2025 11:52:29 +0800 Subject: [PATCH 05/18] =?UTF-8?q?fix(prometheus=5Fadapter):=20=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=E9=BB=98=E8=AE=A4Prometheus=E5=9C=B0=E5=9D=80?= =?UTF-8?q?=E5=B9=B6=E5=AE=8C=E5=96=84=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/prometheus_adapter/README.md | 75 ++++++++++++++++++++++++++- internal/prometheus_adapter/server.go | 2 +- 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md index 30ca4be..7c1d400 100644 --- a/docs/prometheus_adapter/README.md +++ b/docs/prometheus_adapter/README.md @@ -1,9 +1,82 @@ -# Prometheus Adapter API 文档 +# Prometheus Adapter 模块文档 ## 概述 Prometheus Adapter 提供从 Prometheus 获取服务指标的 RESTful API 接口。支持按服务名称和版本进行查询。 +## 架构设计 + +### 模块结构 + +``` +internal/prometheus_adapter/ +├── server.go # 服务器主入口,负责初始化和生命周期管理 +├── api/ # API 层,处理 HTTP 请求 +│ ├── api.go # API 基础结构和初始化 +│ └── metric_api.go # 指标相关的 API 处理器 +├── service/ # 业务逻辑层 +│ └── metric_service.go # 指标查询服务实现 +├── client/ # Prometheus 客户端 +│ └── prometheus_client.go # 封装 Prometheus API 调用 +└── model/ # 数据模型 + ├── api.go # API 请求响应模型 + ├── constants.go # 
常量定义(错误码等) + └── error.go # 错误类型定义 +``` + +### 层次设计 + +1. **API 层** (`api/`) + - 处理 HTTP 请求和响应 + - 参数验证和解析 + - 错误响应格式化 + +2. **Service 层** (`service/`) + - 业务逻辑处理 + - 指标和服务存在性验证 + - 数据转换和组装 + +3. **Client 层** (`client/`) + - 与 Prometheus API 交互 + - PromQL 查询构建 + - 结果数据转换 + +4. **Model 层** (`model/`) + - 统一的数据模型定义 + - 错误类型和错误码 + - 请求响应结构体 + +### 核心组件 + +#### PrometheusAdapterServer +主服务器组件,负责: +- 初始化 Prometheus 客户端 +- 创建服务实例 +- 设置 API 路由 +- 管理生命周期 + +#### PrometheusClient +Prometheus 客户端封装,提供: +- `QueryRange`: 执行时间范围查询 +- `GetAvailableMetrics`: 获取所有可用指标 +- `CheckMetricExists`: 检查指标是否存在 +- `CheckServiceExists`: 检查服务是否存在 +- `BuildQuery`: 构建 PromQL 查询语句 + +#### MetricService +业务逻辑服务,实现: +- 动态指标发现 +- 查询参数验证 +- 错误处理和转换 + +## 配置说明 + +### 环境变量 + +| 变量名 | 说明 | 默认值 | +|--------|------|--------| +| PROMETHEUS_ADDRESS | Prometheus 服务器地址 | http://10.210.10.33:9090 | + ## API ### 1. 获取可用指标列表 diff --git a/internal/prometheus_adapter/server.go b/internal/prometheus_adapter/server.go index 1940839..e921668 100644 --- a/internal/prometheus_adapter/server.go +++ b/internal/prometheus_adapter/server.go @@ -25,7 +25,7 @@ func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, e // 使用环境变量或默认值获取 Prometheus 地址 prometheusAddr := os.Getenv("PROMETHEUS_ADDRESS") if prometheusAddr == "" { - prometheusAddr = "http://localhost:9090" + prometheusAddr = "http://10.210.10.33:9090/" } // 创建 Prometheus 客户端 From 855c51451c47cc7bd5f1b34853411f030ea9948e Mon Sep 17 00:00:00 2001 From: dnj Date: Tue, 23 Sep 2025 17:24:57 +0800 Subject: [PATCH 06/18] =?UTF-8?q?feat(=E5=91=8A=E8=AD=A6):=20=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=E5=91=8A=E8=AD=A6=E8=A7=84=E5=88=99=E5=90=8C=E6=AD=A5?= =?UTF-8?q?=E5=8A=9F=E8=83=BD=E5=B9=B6=E9=87=8D=E6=9E=84API=E5=B1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 新增告警规则同步API及服务实现,支持将规则同步到Prometheus并触发重载 重构API层提取公共错误处理和工具方法到通用模块 添加相关模型定义和文档更新 --- docs/prometheus_adapter/README.md | 298 +++++++++--------- 
internal/prometheus_adapter/api/alert_api.go | 36 +++ internal/prometheus_adapter/api/api.go | 91 +++++- internal/prometheus_adapter/api/metric_api.go | 95 +----- internal/prometheus_adapter/model/alert.go | 20 ++ internal/prometheus_adapter/model/api.go | 57 +++- .../prometheus_adapter/model/constants.go | 1 + .../model/prometheus_rule.go | 21 ++ internal/prometheus_adapter/server.go | 8 +- .../service/alert_service.go | 282 +++++++++++++++++ 10 files changed, 672 insertions(+), 237 deletions(-) create mode 100644 internal/prometheus_adapter/api/alert_api.go create mode 100644 internal/prometheus_adapter/model/alert.go create mode 100644 internal/prometheus_adapter/model/prometheus_rule.go create mode 100644 internal/prometheus_adapter/service/alert_service.go diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md index 7c1d400..81aa9f5 100644 --- a/docs/prometheus_adapter/README.md +++ b/docs/prometheus_adapter/README.md @@ -1,97 +1,70 @@ -# Prometheus Adapter 模块文档 +# Prometheus Adapter + +基于 Prometheus 的指标查询与告警规则同步适配层,提供统一的 REST API: +- 按服务与版本查询任意 Prometheus 指标 +- 同步告警规则到 Prometheus 并触发重载 + +目录 +- 概述 +- 快速开始 +- 架构设计 +- API 参考 + - 指标查询 + - 告警规则同步 +- Alertmanager 集成 +- 支持的服务 +- 错误码 ## 概述 -Prometheus Adapter 提供从 Prometheus 获取服务指标的 RESTful API 接口。支持按服务名称和版本进行查询。 +Prometheus Adapter 作为内部系统与 Prometheus 之间的适配层: +- 向上暴露简洁、统一的 HTTP API +- 向下负责 PromQL 查询与 Prometheus 规则文件管理 ## 架构设计 -### 模块结构 +- 分层设计 + - API 层(`internal/prometheus_adapter/api`):HTTP 请求处理、参数校验、错误格式化 + - Service 层(`internal/prometheus_adapter/service`):业务逻辑、指标与服务存在性校验、数据装配 + - Client 层(`internal/prometheus_adapter/client`):与 Prometheus API 交互、PromQL 构建、结果转换 + - Model 层(`internal/prometheus_adapter/model`):统一数据模型、错误类型、常量 +- 目录结构 ``` internal/prometheus_adapter/ -├── server.go # 服务器主入口,负责初始化和生命周期管理 -├── api/ # API 层,处理 HTTP 请求 -│ ├── api.go # API 基础结构和初始化 -│ └── metric_api.go # 指标相关的 API 处理器 -├── service/ # 业务逻辑层 -│ └── metric_service.go # 指标查询服务实现 -├── client/ # Prometheus 客户端 +├── 
server.go # 服务器主入口,负责初始化和生命周期管理 +├── api/ # API 层,处理 HTTP 请求 +│ ├── api.go # API 基础结构和初始化 +│ ├── metric_api.go # 指标相关的 API 处理器 +│ └── alert_api.go # 告警规则同步 API 处理器 +├── service/ # 业务逻辑层 +│ ├── metric_service.go # 指标查询服务实现 +│ └── alert_service.go # 告警规则同步服务实现 +├── client/ # Prometheus 客户端 │ └── prometheus_client.go # 封装 Prometheus API 调用 -└── model/ # 数据模型 - ├── api.go # API 请求响应模型 - ├── constants.go # 常量定义(错误码等) - └── error.go # 错误类型定义 +└── model/ # 数据模型 + ├── api.go # API 请求响应模型 + ├── alert.go # 告警规则模型 + ├── constants.go # 常量定义(错误码等) + ├── error.go # 错误类型定义 + └── prometheus_rule.go # Prometheus 规则文件模型 ``` -### 层次设计 +- 核心组件 + - PrometheusAdapterServer:初始化客户端与路由,管理服务生命周期 + - PrometheusClient:`QueryRange`、`GetAvailableMetrics`、`CheckMetricExists`、`CheckServiceExists`、`BuildQuery` + - MetricService:参数校验、动态指标发现、错误转换 + - AlertService:告警规则同步、Prometheus 规则文件生成、配置重载 -1. **API 层** (`api/`) - - 处理 HTTP 请求和响应 - - 参数验证和解析 - - 错误响应格式化 +## API -2. **Service 层** (`service/`) - - 业务逻辑处理 - - 指标和服务存在性验证 - - 数据转换和组装 +### 指标查询 -3. **Client 层** (`client/`) - - 与 Prometheus API 交互 - - PromQL 查询构建 - - 结果数据转换 - -4. **Model 层** (`model/`) - - 统一的数据模型定义 - - 错误类型和错误码 - - 请求响应结构体 - -### 核心组件 - -#### PrometheusAdapterServer -主服务器组件,负责: -- 初始化 Prometheus 客户端 -- 创建服务实例 -- 设置 API 路由 -- 管理生命周期 - -#### PrometheusClient -Prometheus 客户端封装,提供: -- `QueryRange`: 执行时间范围查询 -- `GetAvailableMetrics`: 获取所有可用指标 -- `CheckMetricExists`: 检查指标是否存在 -- `CheckServiceExists`: 检查服务是否存在 -- `BuildQuery`: 构建 PromQL 查询语句 - -#### MetricService -业务逻辑服务,实现: -- 动态指标发现 -- 查询参数验证 -- 错误处理和转换 - -## 配置说明 - -### 环境变量 - -| 变量名 | 说明 | 默认值 | -|--------|------|--------| -| PROMETHEUS_ADDRESS | Prometheus 服务器地址 | http://10.210.10.33:9090 | - -## API - -### 1. 获取可用指标列表 - -**GET** `/v1/metrics` - -获取所有可用的指标列表。 - -#### 请求示例 -```bash -GET /v1/metrics +1) 获取可用指标列表 +- 方法与路径:`GET /v1/metrics` +- 用途:列出当前可查询的所有指标名称 +- 响应示例: ``` - -#### 响应示例 -```json { "metrics": [ "system_cpu_usage_percent", @@ -104,69 +77,36 @@ GET /v1/metrics } ``` -### 2. 
通用指标查询接口 - -**GET** `/v1/metrics/:service/:metric` - -获取指定服务的任意指标时间序列数据。指标不存在则返回错误。 - -#### 路径参数 -- `service` (string, required): 服务名称 -- `metric` (string, required): 指标名称(必须是 Prometheus 中实际存在的指标) - -#### 查询参数 -- `version` (string, optional): 服务版本,不指定则返回所有版本 -- `start` (string, optional): 开始时间 (RFC3339 格式,如: 2024-01-01T00:00:00Z) -- `end` (string, optional): 结束时间 (RFC3339 格式,如: 2024-01-01T01:00:00Z) -- `step` (string, optional): 时间步长 (如: 1m, 5m, 1h),默认 1m - -#### 请求示例 - -1. **查询 CPU 使用率:** -```bash -GET /v1/metrics/metadata-service/system_cpu_usage_percent?version=1.0.0 -``` - -2. **查询内存使用率:** -```bash -GET /v1/metrics/storage-service/system_memory_usage_percent?version=1.0.0 -``` - -3. **查询 HTTP 请求延迟:** -```bash -GET /v1/metrics/storage-service/http_latency?version=1.0.0 +2) 查询指定服务的指标时间序列 +- 方法与路径:`GET /v1/metrics/{service}/{metric}` +- 路径参数: + - `service`:服务名(必填) + - `metric`:指标名(必填,需为 Prometheus 中存在的指标) +- 查询参数: + - `version`:服务版本(选填;不传则返回所有版本) + - `start`:开始时间(选填,RFC3339) + - `end`:结束时间(选填,RFC3339) + - `step`:步长(选填,如 `1m`、`5m`、`1h`;默认 `1m`) +- 请求示例: + - `GET /v1/metrics/metadata-service/system_cpu_usage_percent?version=1.0.0` + - `GET /v1/metrics/storage-service/system_memory_usage_percent?version=1.0.0` + - `GET /v1/metrics/storage-service/http_latency?version=1.0.0` + - `GET /v1/metrics/storage-service/system_network_qps?version=1.0.0` +- 成功响应示例: ``` - -4. 
**查询网络 QPS:** -```bash -GET /v1/metrics/storage-service/system_network_qps?version=1.0.0 -``` - -#### 成功响应示例 - -**HTTP 200 OK** -```json { "service": "metadata-service", "version": "1.0.0", "metric": "system_cpu_usage_percent", "data": [ - { - "timestamp": "2024-01-01T00:00:00Z", - "value": 45.2 - }, - { - "timestamp": "2024-01-01T00:01:00Z", - "value": 48.5 - } + { "timestamp": "2024-01-01T00:00:00Z", "value": 45.2 }, + { "timestamp": "2024-01-01T00:01:00Z", "value": 48.5 } ] } ``` - -#### 错误响应示例 - -**指标不存在时 - HTTP 404 Not Found** -```json +- 错误响应示例: + - 指标不存在(404): +``` { "error": { "code": "METRIC_NOT_FOUND", @@ -175,9 +115,8 @@ GET /v1/metrics/storage-service/system_network_qps?version=1.0.0 } } ``` - -**服务不存在时 - HTTP 404 Not Found** -```json + - 服务不存在(404): +``` { "error": { "code": "SERVICE_NOT_FOUND", @@ -186,9 +125,8 @@ GET /v1/metrics/storage-service/system_network_qps?version=1.0.0 } } ``` - -**参数错误时 - HTTP 400 Bad Request** -```json + - 参数错误(400): +``` { "error": { "code": "INVALID_PARAMETER", @@ -199,15 +137,81 @@ GET /v1/metrics/storage-service/system_network_qps?version=1.0.0 } ``` -## 实现说明 +### 告警规则同步 + +- 方法与路径:`POST /v1/alert-rules/sync` +- 功能:接收监控告警模块发送的完整规则列表,生成 Prometheus 规则文件并触发重载(全量同步) +- 请求体示例: +``` +{ + "rules": [ + { + "name": "high_cpu_usage", + "description": "CPU使用率过高告警", + "expr": "system_cpu_usage_percent", + "op": ">", + "severity": "warning" + } + ], + "rule_metas": [ + { + "alert_name": "high_cpu_usage_storage_v1", + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}", + "threshold": 90, + "watch_time": 300, + "match_time": "5m" + } + ] +} +``` +- 响应示例: +``` +{ + "status": "success", + "message": "Rules synced to Prometheus" +} +``` + +## Alertmanager 集成 + +- 目标:将 Prometheus 触发的告警通过 Alertmanager 转发到监控告警模块 +- `alertmanager.yml` 配置示例: +```yaml +global: + resolve_timeout: 5m + +route: + group_by: ['alertname', 'cluster', 'service'] + group_wait: 10s + group_interval: 10s + repeat_interval: 1h + receiver: 
'zeroops-alert-webhook' + +receivers: + - name: 'zeroops-alert-webhook' + webhook_configs: + - url: 'http://alert-module:8080/v1/integrations/alertmanager/webhook' + send_resolved: true +``` +- 说明: + - `url`:监控告警模块的 webhook 地址(按实际部署修改主机与端口) + - `send_resolved`:为 `true` 时,告警恢复也会通知 + +## 支持的服务 + +当前 mock/s3 环境下: +- `metadata-service` +- `storage-service` +- `queue-service` +- `third-party-service`(原文为 third-party-servrice,已更正) +- `mock-error-service` -### 支持的服务列表 +所有服务的版本信息通过标签 `service_version` 暴露。 -当前 mock/s3 环境中支持的服务: -- `metadata-service` - 元数据管理服务 -- `storage-service` - 存储服务 -- `queue-service` - 消息队列服务 -- `third-party-service` - 第三方集成服务 -- `mock-error-service` - 错误模拟服务 +## 错误码 -所有服务的版本信息通过 `service_version` 标签暴露。 \ No newline at end of file +- `METRIC_NOT_FOUND`:指标不存在 +- `SERVICE_NOT_FOUND`:服务不存在 +- `INVALID_PARAMETER`:请求参数不合法(如时间格式不正确) +- `INTERNAL_ERROR`:内部服务器错误 +- `PROMETHEUS_ERROR`:Prometheus 查询失败 diff --git a/internal/prometheus_adapter/api/alert_api.go b/internal/prometheus_adapter/api/alert_api.go new file mode 100644 index 0000000..cb3e968 --- /dev/null +++ b/internal/prometheus_adapter/api/alert_api.go @@ -0,0 +1,36 @@ +package api + +import ( + "net/http" + + "github.com/fox-gonic/fox" + "github.com/qiniu/zeroops/internal/prometheus_adapter/model" +) + +// setupAlertRouters 设置告警相关路由 +func (api *Api) setupAlertRouters(router *fox.Engine) { + router.POST("/v1/alert-rules/sync", api.SyncRules) +} + +// SyncRules 同步规则到Prometheus +// 接收从监控告警模块发来的规则列表,生成Prometheus规则文件并重载配置 +func (api *Api) SyncRules(c *fox.Context) { + var req model.SyncRulesRequest + if err := c.ShouldBindJSON(&req); err != nil { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "Invalid request body: "+err.Error(), nil) + return + } + + err := api.alertService.SyncRulesToPrometheus(req.Rules, req.RuleMetas) + if err != nil { + SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, + "Failed to sync rules to Prometheus: 
"+err.Error(), nil) + return + } + + c.JSON(http.StatusOK, map[string]string{ + "status": "success", + "message": "Rules synced to Prometheus", + }) +} diff --git a/internal/prometheus_adapter/api/api.go b/internal/prometheus_adapter/api/api.go index 4de0c96..2b6e432 100644 --- a/internal/prometheus_adapter/api/api.go +++ b/internal/prometheus_adapter/api/api.go @@ -1,20 +1,27 @@ package api import ( + "fmt" + "time" + "github.com/fox-gonic/fox" + "github.com/qiniu/zeroops/internal/prometheus_adapter/model" "github.com/qiniu/zeroops/internal/prometheus_adapter/service" + "github.com/rs/zerolog/log" ) // Api Prometheus Adapter API type Api struct { metricService *service.MetricService + alertService *service.AlertService router *fox.Engine } // NewApi 创建新的 API -func NewApi(metricService *service.MetricService, router *fox.Engine) (*Api, error) { +func NewApi(metricService *service.MetricService, alertService *service.AlertService, router *fox.Engine) (*Api, error) { api := &Api{ metricService: metricService, + alertService: alertService, router: router, } @@ -26,4 +33,86 @@ func NewApi(metricService *service.MetricService, router *fox.Engine) (*Api, err func (api *Api) setupRouters(router *fox.Engine) { // 指标相关路由 api.setupMetricRouters(router) + // 告警相关路由 + api.setupAlertRouters(router) +} + +// ========== 通用辅助方法 ========== + +// SendErrorResponse 发送错误响应(可被其他API模块使用) +func SendErrorResponse(c *fox.Context, statusCode int, errorCode, message string, extras map[string]string) { + errorDetail := model.ErrorDetail{ + Code: errorCode, + Message: message, + } + + // 添加额外的字段 + if extras != nil { + if service, ok := extras["service"]; ok { + errorDetail.Service = service + } + if metric, ok := extras["metric"]; ok { + errorDetail.Metric = metric + } + if parameter, ok := extras["parameter"]; ok { + errorDetail.Parameter = parameter + } + if value, ok := extras["value"]; ok { + errorDetail.Value = value + } + } + + response := model.ErrorResponse{ + Error: errorDetail, + } + 
+ c.JSON(statusCode, response) +} + +// ParseTimeRange 解析时间范围参数 +func ParseTimeRange(startStr, endStr string) (time.Time, time.Time, error) { + var start, end time.Time + var err error + + // 如果没有指定开始时间,默认为1小时前 + if startStr == "" { + start = time.Now().Add(-1 * time.Hour) + } else { + start, err = time.Parse(time.RFC3339, startStr) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("invalid start time format: %w", err) + } + } + + // 如果没有指定结束时间,默认为当前时间 + if endStr == "" { + end = time.Now() + } else { + end, err = time.Parse(time.RFC3339, endStr) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("invalid end time format: %w", err) + } + } + + // 验证时间范围的合理性 + if end.Before(start) { + return time.Time{}, time.Time{}, fmt.Errorf("end time must be after start time") + } + + return start, end, nil +} + +// ParseStep 解析步长参数 +func ParseStep(stepStr string) time.Duration { + if stepStr == "" { + return time.Minute // 默认1分钟 + } + + duration, err := time.ParseDuration(stepStr) + if err != nil { + log.Warn().Str("step", stepStr).Msg("invalid step format, using default") + return time.Minute + } + + return duration } diff --git a/internal/prometheus_adapter/api/metric_api.go b/internal/prometheus_adapter/api/metric_api.go index 431bfdf..832362f 100644 --- a/internal/prometheus_adapter/api/metric_api.go +++ b/internal/prometheus_adapter/api/metric_api.go @@ -4,7 +4,6 @@ import ( "errors" "fmt" "net/http" - "time" "github.com/fox-gonic/fox" "github.com/qiniu/zeroops/internal/prometheus_adapter/model" @@ -24,7 +23,7 @@ func (api *Api) GetMetrics(c *fox.Context) { response, err := api.metricService.GetAvailableMetrics(ctx) if err != nil { log.Error().Err(err).Msg("failed to get available metrics") - api.sendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, "获取指标列表失败", nil) + SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, "获取指标列表失败", nil) return } @@ -46,16 +45,16 @@ func (api *Api) 
QueryMetric(c *fox.Context) { stepStr := c.Query("step") // 解析时间参数 - start, end, err := api.parseTimeRange(startStr, endStr) + start, end, err := ParseTimeRange(startStr, endStr) if err != nil { log.Error().Err(err).Msg("invalid time parameters") - api.sendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, fmt.Sprintf("参数 'start/end' 格式错误: %s", err.Error()), nil) return } // 解析步长参数 - step := api.parseStep(stepStr) + step := ParseStep(stepStr) // 查询指标 response, err := api.metricService.QueryMetric(ctx, serviceName, metricName, version, start, end, step) @@ -67,54 +66,6 @@ func (api *Api) QueryMetric(c *fox.Context) { c.JSON(http.StatusOK, response) } -// parseTimeRange 解析时间范围参数 -func (api *Api) parseTimeRange(startStr, endStr string) (time.Time, time.Time, error) { - var start, end time.Time - var err error - - // 如果没有指定开始时间,默认为1小时前 - if startStr == "" { - start = time.Now().Add(-1 * time.Hour) - } else { - start, err = time.Parse(time.RFC3339, startStr) - if err != nil { - return time.Time{}, time.Time{}, fmt.Errorf("invalid start time format: %w", err) - } - } - - // 如果没有指定结束时间,默认为当前时间 - if endStr == "" { - end = time.Now() - } else { - end, err = time.Parse(time.RFC3339, endStr) - if err != nil { - return time.Time{}, time.Time{}, fmt.Errorf("invalid end time format: %w", err) - } - } - - // 验证时间范围的合理性 - if end.Before(start) { - return time.Time{}, time.Time{}, fmt.Errorf("end time must be after start time") - } - - return start, end, nil -} - -// parseStep 解析步长参数 -func (api *Api) parseStep(stepStr string) time.Duration { - if stepStr == "" { - return time.Minute // 默认1分钟 - } - - duration, err := time.ParseDuration(stepStr) - if err != nil { - log.Warn().Str("step", stepStr).Msg("invalid step format, using default") - return time.Minute - } - - return duration -} - // handleQueryError 处理查询错误 func (api *Api) handleQueryError(c *fox.Context, err error, service, 
metric string) { var serviceNotFound *model.ServiceNotFoundError @@ -124,52 +75,22 @@ func (api *Api) handleQueryError(c *fox.Context, err error, service, metric stri switch { case errors.As(err, &serviceNotFound): log.Error().Err(err).Str("service", service).Msg("service not found") - api.sendErrorResponse(c, http.StatusNotFound, model.ErrorCodeServiceNotFound, + SendErrorResponse(c, http.StatusNotFound, model.ErrorCodeServiceNotFound, err.Error(), map[string]string{"service": service}) case errors.As(err, &metricNotFound): log.Error().Err(err).Str("metric", metric).Msg("metric not found") - api.sendErrorResponse(c, http.StatusNotFound, model.ErrorCodeMetricNotFound, + SendErrorResponse(c, http.StatusNotFound, model.ErrorCodeMetricNotFound, err.Error(), map[string]string{"metric": metric}) case errors.As(err, &prometheusError): log.Error().Err(err).Msg("prometheus query error") - api.sendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodePrometheusError, + SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodePrometheusError, "Prometheus 查询失败", nil) default: log.Error().Err(err).Msg("unexpected error during metric query") - api.sendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, + SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, "内部服务器错误", nil) } } - -// sendErrorResponse 发送错误响应 -func (api *Api) sendErrorResponse(c *fox.Context, statusCode int, errorCode, message string, extras map[string]string) { - errorDetail := model.ErrorDetail{ - Code: errorCode, - Message: message, - } - - // 添加额外的字段 - if extras != nil { - if service, ok := extras["service"]; ok { - errorDetail.Service = service - } - if metric, ok := extras["metric"]; ok { - errorDetail.Metric = metric - } - if parameter, ok := extras["parameter"]; ok { - errorDetail.Parameter = parameter - } - if value, ok := extras["value"]; ok { - errorDetail.Value = value - } - } - - response := model.ErrorResponse{ - Error: 
errorDetail, - } - - c.JSON(statusCode, response) -} diff --git a/internal/prometheus_adapter/model/alert.go b/internal/prometheus_adapter/model/alert.go new file mode 100644 index 0000000..605c5fb --- /dev/null +++ b/internal/prometheus_adapter/model/alert.go @@ -0,0 +1,20 @@ +package model + +// AlertRule 告警规则表 - 定义告警规则模板 +type AlertRule struct { + Name string `json:"name" gorm:"type:varchar(255);primaryKey"` + Description string `json:"description" gorm:"type:text"` + Expr string `json:"expr" gorm:"type:text;not null"` + Op string `json:"op" gorm:"type:enum('>', '<', '=', '!=');not null"` + Severity string `json:"severity" gorm:"type:varchar(50);not null"` +} + +// AlertRuleMeta 告警规则元信息表 - 存储服务级别的告警配置 +// 用于将告警规则模板实例化为具体的服务告警 +type AlertRuleMeta struct { + AlertName string `json:"alert_name" gorm:"type:varchar(255);primaryKey"` + Labels string `json:"labels" gorm:"type:text"` // JSON格式的服务标签,如:{"service":"storage-service","version":"1.0.0"} + Threshold float64 `json:"threshold"` // 告警阈值 + WatchTime int `json:"watch_time"` // 持续时间(秒),对应Prometheus的for字段 + MatchTime string `json:"match_time" gorm:"type:text"` // 时间范围表达式 +} diff --git a/internal/prometheus_adapter/model/api.go b/internal/prometheus_adapter/model/api.go index efef1d2..f7f6e7f 100644 --- a/internal/prometheus_adapter/model/api.go +++ b/internal/prometheus_adapter/model/api.go @@ -2,7 +2,7 @@ package model import "time" -// ===== API 响应结构体 ===== +// ===== 指标相关 API ===== // MetricListResponse 指标列表响应(对应 GET /v1/metrics) type MetricListResponse struct { @@ -22,3 +22,58 @@ type MetricDataPoint struct { Timestamp time.Time `json:"timestamp"` Value float64 `json:"value"` } + +// ===== 告警规则相关 API ===== + +// CreateAlertRuleRequest 创建告警规则请求 +type CreateAlertRuleRequest struct { + Name string `json:"name" binding:"required"` + Description string `json:"description,omitempty"` + Expr string `json:"expr" binding:"required"` + Op string `json:"op" binding:"required,oneof=> < = !="` + Severity string 
`json:"severity" binding:"required"` + + // 元信息字段(可选) + Labels map[string]string `json:"labels,omitempty"` + Threshold float64 `json:"threshold,omitempty"` + WatchTime int `json:"watch_time,omitempty"` + MatchTime string `json:"match_time,omitempty"` +} + +// UpdateAlertRuleRequest 更新告警规则请求 +type UpdateAlertRuleRequest struct { + Description *string `json:"description,omitempty"` + Expr *string `json:"expr,omitempty"` + Op *string `json:"op,omitempty" binding:"omitempty,oneof=> < = !="` + Severity *string `json:"severity,omitempty"` + + // 元信息字段(可选) + Labels map[string]string `json:"labels,omitempty"` + Threshold *float64 `json:"threshold,omitempty"` + WatchTime *int `json:"watch_time,omitempty"` + MatchTime *string `json:"match_time,omitempty"` +} + +// CreateAlertRuleMetaRequest 创建告警规则元信息请求 +type CreateAlertRuleMetaRequest struct { + AlertName string `json:"alert_name" binding:"required"` + Labels map[string]string `json:"labels" binding:"required"` + Threshold float64 `json:"threshold" binding:"required"` + WatchTime int `json:"watch_time,omitempty"` + MatchTime string `json:"match_time,omitempty"` +} + +// UpdateAlertRuleMetaRequest 更新告警规则元信息请求 +type UpdateAlertRuleMetaRequest struct { + Labels map[string]string `json:"labels,omitempty"` + Threshold *float64 `json:"threshold,omitempty"` + WatchTime *int `json:"watch_time,omitempty"` + MatchTime *string `json:"match_time,omitempty"` +} + +// SyncRulesRequest 同步规则请求 +// 从监控告警模块发送过来的完整规则列表 +type SyncRulesRequest struct { + Rules []AlertRule `json:"rules"` // 告警规则列表 + RuleMetas []AlertRuleMeta `json:"rule_metas"` // 规则元信息列表 +} diff --git a/internal/prometheus_adapter/model/constants.go b/internal/prometheus_adapter/model/constants.go index 3e8727b..3992eae 100644 --- a/internal/prometheus_adapter/model/constants.go +++ b/internal/prometheus_adapter/model/constants.go @@ -7,4 +7,5 @@ const ( ErrorCodeInvalidParameter = "INVALID_PARAMETER" ErrorCodePrometheusError = "PROMETHEUS_ERROR" ErrorCodeInternalError = 
"INTERNAL_ERROR" + ErrorCodeRuleNotFound = "RULE_NOT_FOUND" ) diff --git a/internal/prometheus_adapter/model/prometheus_rule.go b/internal/prometheus_adapter/model/prometheus_rule.go new file mode 100644 index 0000000..3d5c9e4 --- /dev/null +++ b/internal/prometheus_adapter/model/prometheus_rule.go @@ -0,0 +1,21 @@ +package model + +// PrometheusRule Prometheus规则文件中的单个规则 +type PrometheusRule struct { + Alert string `yaml:"alert"` + Expr string `yaml:"expr"` + For string `yaml:"for,omitempty"` + Labels map[string]string `yaml:"labels,omitempty"` + Annotations map[string]string `yaml:"annotations,omitempty"` +} + +// PrometheusRuleGroup Prometheus规则组 +type PrometheusRuleGroup struct { + Name string `yaml:"name"` + Rules []PrometheusRule `yaml:"rules"` +} + +// PrometheusRuleFile Prometheus规则文件结构 +type PrometheusRuleFile struct { + Groups []PrometheusRuleGroup `yaml:"groups"` +} diff --git a/internal/prometheus_adapter/server.go b/internal/prometheus_adapter/server.go index e921668..d9fb2f4 100644 --- a/internal/prometheus_adapter/server.go +++ b/internal/prometheus_adapter/server.go @@ -17,6 +17,7 @@ type PrometheusAdapterServer struct { config *config.Config promClient *client.PrometheusClient metricService *service.MetricService + alertService *service.AlertService api *api.Api } @@ -37,10 +38,14 @@ func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, e // 创建指标服务 metricService := service.NewMetricService(promClient) + // 创建告警服务 + alertService := service.NewAlertService(promClient) + server := &PrometheusAdapterServer{ config: cfg, promClient: promClient, metricService: metricService, + alertService: alertService, } log.Info().Str("prometheus_address", prometheusAddr).Msg("Prometheus Adapter initialized successfully") @@ -50,10 +55,11 @@ func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, e // UseApi 设置 API 路由 func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error { var err error - s.api, err = 
api.NewApi(s.metricService, router) + s.api, err = api.NewApi(s.metricService, s.alertService, router) if err != nil { return fmt.Errorf("failed to initialize API: %w", err) } + return nil } diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go new file mode 100644 index 0000000..2be8095 --- /dev/null +++ b/internal/prometheus_adapter/service/alert_service.go @@ -0,0 +1,282 @@ +package service + +import ( + "encoding/json" + "fmt" + "net/http" + "os" + "path/filepath" + "strings" + + "github.com/qiniu/zeroops/internal/prometheus_adapter/client" + "github.com/qiniu/zeroops/internal/prometheus_adapter/model" + "github.com/rs/zerolog/log" + "gopkg.in/yaml.v3" +) + +// AlertService 告警服务 - 仅负责与Prometheus交互,不存储规则 +type AlertService struct { + promClient *client.PrometheusClient + rulesFilePath string +} + +// NewAlertService 创建告警服务 +func NewAlertService(promClient *client.PrometheusClient) *AlertService { + rulesFilePath := os.Getenv("PROMETHEUS_RULES_FILE") + if rulesFilePath == "" { + rulesFilePath = "/etc/prometheus/rules/alert_rules.yml" + } + + return &AlertService{ + promClient: promClient, + rulesFilePath: rulesFilePath, + } +} + +// SyncRulesToPrometheus 同步规则到Prometheus +// 接收完整的规则列表,生成Prometheus规则文件并重载配置 +func (s *AlertService) SyncRulesToPrometheus(rules []model.AlertRule, ruleMetas []model.AlertRuleMeta) error { + // 构建Prometheus规则文件 + prometheusRules := s.buildPrometheusRules(rules, ruleMetas) + + // 写入规则文件 + if err := s.writeRulesFile(prometheusRules); err != nil { + return fmt.Errorf("failed to write rules file: %w", err) + } + + // 通知Prometheus重新加载配置 + if err := s.reloadPrometheus(); err != nil { + log.Warn().Err(err).Msg("Failed to reload Prometheus, rules file has been updated") + // 不返回错误,因为文件已经更新成功 + } + + log.Info(). + Int("rules_count", len(rules)). + Int("metas_count", len(ruleMetas)). 
+ Msg("Rules synced to Prometheus successfully") + + return nil +} + +// buildPrometheusRules 构建Prometheus规则 +func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas []model.AlertRuleMeta) *model.PrometheusRuleFile { + promRules := []model.PrometheusRule{} + + // 创建规则名到规则的映射 + ruleMap := make(map[string]*model.AlertRule) + for i := range rules { + ruleMap[rules[i].Name] = &rules[i] + } + + // 为每个元信息生成Prometheus规则 + for _, meta := range ruleMetas { + // 查找对应的规则模板 + var rule *model.AlertRule + + // 尝试从alert_name中提取规则名 + // 假设alert_name格式为: {rule_name}_{service}_{version} 或类似格式 + for ruleName, r := range ruleMap { + if strings.HasPrefix(meta.AlertName, ruleName) { + rule = r + break + } + } + + if rule == nil { + log.Warn(). + Str("alert_name", meta.AlertName). + Msg("No matching rule template found for alert meta, skipping") + continue + } + + // 解析标签 + var labels map[string]string + if meta.Labels != "" { + if err := json.Unmarshal([]byte(meta.Labels), &labels); err != nil { + log.Warn(). + Err(err). + Str("alert_name", meta.AlertName). 
+ Msg("Failed to parse labels, using empty labels") + labels = make(map[string]string) + } + } else { + labels = make(map[string]string) + } + + // 添加severity标签 + labels["severity"] = rule.Severity + labels["rule_name"] = rule.Name + + // 构建表达式 + expr := s.buildExpression(rule, &meta) + + // 构建注释 + annotations := map[string]string{ + "description": rule.Description, + "summary": fmt.Sprintf("%s %s %f", rule.Expr, rule.Op, meta.Threshold), + } + + // 计算for字段 + forDuration := "" + if meta.WatchTime > 0 { + forDuration = fmt.Sprintf("%ds", meta.WatchTime) + } + + promRule := model.PrometheusRule{ + Alert: meta.AlertName, + Expr: expr, + For: forDuration, + Labels: labels, + Annotations: annotations, + } + + promRules = append(promRules, promRule) + } + + // 如果没有元信息,为每个规则创建默认规则 + if len(ruleMetas) == 0 { + for _, rule := range rules { + labels := map[string]string{ + "severity": rule.Severity, + } + + annotations := map[string]string{ + "description": rule.Description, + "summary": fmt.Sprintf("%s triggered", rule.Name), + } + + promRule := model.PrometheusRule{ + Alert: rule.Name, + Expr: rule.Expr, + Labels: labels, + Annotations: annotations, + } + + promRules = append(promRules, promRule) + } + } + + return &model.PrometheusRuleFile{ + Groups: []model.PrometheusRuleGroup{ + { + Name: "zeroops_alerts", + Rules: promRules, + }, + }, + } +} + +// buildExpression 构建PromQL表达式 +func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertRuleMeta) string { + expr := rule.Expr + + // 解析标签并添加到表达式中 + var labels map[string]string + if meta.Labels != "" { + json.Unmarshal([]byte(meta.Labels), &labels) + } + + if len(labels) > 0 { + labelMatchers := []string{} + for k, v := range labels { + // 跳过内部使用的标签 + if k == "rule_name" { + continue + } + labelMatchers = append(labelMatchers, fmt.Sprintf(`%s="%s"`, k, v)) + } + + if len(labelMatchers) > 0 { + // 如果表达式包含{,说明已经有标签选择器 + if strings.Contains(expr, "{") { + expr = strings.Replace(expr, "}", 
","+strings.Join(labelMatchers, ",")+"}", 1) + } else { + // 在指标名后添加标签选择器 + // 查找第一个非字母数字下划线的字符 + metricEnd := 0 + for i, ch := range expr { + if !((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || + (ch >= '0' && ch <= '9') || ch == '_') { + metricEnd = i + break + } + } + if metricEnd == 0 { + metricEnd = len(expr) + } + expr = expr[:metricEnd] + "{" + strings.Join(labelMatchers, ",") + "}" + expr[metricEnd:] + } + } + } + + // 添加时间范围 + if meta.MatchTime != "" { + // 查找最后一个指标,添加时间范围 + if !strings.Contains(expr, "[") { + // 简单处理:在第一个空格前添加时间范围 + parts := strings.SplitN(expr, " ", 2) + if len(parts) == 2 { + expr = parts[0] + "[" + meta.MatchTime + "] " + parts[1] + } else { + expr = expr + "[" + meta.MatchTime + "]" + } + } + } + + // 添加比较操作符和阈值 + if meta.Threshold != 0 { + expr = fmt.Sprintf("%s %s %f", expr, rule.Op, meta.Threshold) + } + + return expr +} + +// writeRulesFile 写入规则文件 +func (s *AlertService) writeRulesFile(rules *model.PrometheusRuleFile) error { + // 确保目录存在 + dir := filepath.Dir(s.rulesFilePath) + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("failed to create rules directory: %w", err) + } + + // 序列化为YAML + data, err := yaml.Marshal(rules) + if err != nil { + return fmt.Errorf("failed to marshal rules: %w", err) + } + + // 写入文件 + if err := os.WriteFile(s.rulesFilePath, data, 0644); err != nil { + return fmt.Errorf("failed to write rules file: %w", err) + } + + log.Info(). + Str("file", s.rulesFilePath). + Int("groups", len(rules.Groups)). 
+ Msg("Prometheus rules file updated") + + return nil +} + +// reloadPrometheus 重新加载Prometheus配置 +func (s *AlertService) reloadPrometheus() error { + prometheusURL := os.Getenv("PROMETHEUS_ADDRESS") + if prometheusURL == "" { + prometheusURL = "http://10.210.10.33:9090" + } + + reloadURL := fmt.Sprintf("%s/-/reload", strings.TrimSuffix(prometheusURL, "/")) + + resp, err := http.Post(reloadURL, "text/plain", nil) + if err != nil { + return fmt.Errorf("failed to reload Prometheus: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("Prometheus reload failed with status: %d", resp.StatusCode) + } + + log.Info().Msg("Prometheus configuration reloaded") + return nil +} From 54b8f4d0b3c12f41ccc7201bb53559c3dd9793e3 Mon Sep 17 00:00:00 2001 From: dnj Date: Wed, 24 Sep 2025 19:22:01 +0800 Subject: [PATCH 07/18] =?UTF-8?q?feat(prometheus=5Fadapter):=20=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=E5=91=8A=E8=AD=A6=E8=A7=84=E5=88=99=E5=A2=9E=E9=87=8F?= =?UTF-8?q?=E6=9B=B4=E6=96=B0=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ps:未跑通 --- cmd/prometheus_adapter/main.go | 50 +++ docs/prometheus_adapter/README.md | 85 ++++- go.mod | 2 +- internal/prometheus_adapter/api/alert_api.go | 86 +++++ internal/prometheus_adapter/model/alert.go | 19 +- internal/prometheus_adapter/model/api.go | 22 +- .../service/alert_service.go | 249 +++++++++++-- .../prometheus_adapter/test_alert_update.sh | 91 +++++ scripts/prometheus_adapter/build.sh | 180 +++++++++ scripts/prometheus_adapter/deploy.sh | 350 ++++++++++++++++++ 10 files changed, 1071 insertions(+), 63 deletions(-) create mode 100644 cmd/prometheus_adapter/main.go create mode 100755 internal/prometheus_adapter/test_alert_update.sh create mode 100755 scripts/prometheus_adapter/build.sh create mode 100755 scripts/prometheus_adapter/deploy.sh diff --git a/cmd/prometheus_adapter/main.go b/cmd/prometheus_adapter/main.go new file mode 100644 
index 0000000..9f45442 --- /dev/null +++ b/cmd/prometheus_adapter/main.go @@ -0,0 +1,50 @@ +package main + +import ( + "os" + + "github.com/fox-gonic/fox" + "github.com/qiniu/zeroops/internal/config" + prometheusadapter "github.com/qiniu/zeroops/internal/prometheus_adapter" + "github.com/rs/zerolog" + "github.com/rs/zerolog/log" +) + +func main() { + // 配置日志 + log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr}) + + log.Info().Msg("Starting Prometheus Adapter server") + + // 加载配置 + cfg := &config.Config{ + Server: config.ServerConfig{ + BindAddr: ":9999", // 默认端口 + }, + } + + // 如果有环境变量,使用环境变量的端口 + if port := os.Getenv("ADAPTER_PORT"); port != "" { + cfg.Server.BindAddr = ":" + port + } + + // 创建 Prometheus Adapter 服务器 + adapter, err := prometheusadapter.NewPrometheusAdapterServer(cfg) + if err != nil { + log.Fatal().Err(err).Msg("Failed to create Prometheus Adapter server") + } + + // 创建路由 + router := fox.New() + + // 启动 API + if err := adapter.UseApi(router); err != nil { + log.Fatal().Err(err).Msg("Failed to setup API routes") + } + + // 启动服务器 + log.Info().Msgf("Starting Prometheus Adapter on %s", cfg.Server.BindAddr) + if err := router.Run(cfg.Server.BindAddr); err != nil { + log.Fatal().Err(err).Msg("Failed to start server") + } +} diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md index 81aa9f5..0a92312 100644 --- a/docs/prometheus_adapter/README.md +++ b/docs/prometheus_adapter/README.md @@ -139,10 +139,11 @@ internal/prometheus_adapter/ ### 告警规则同步 +#### 1. 
全量同步规则 - 方法与路径:`POST /v1/alert-rules/sync` - 功能:接收监控告警模块发送的完整规则列表,生成 Prometheus 规则文件并触发重载(全量同步) - 请求体示例: -``` +```json { "rules": [ { @@ -155,23 +156,95 @@ internal/prometheus_adapter/ ], "rule_metas": [ { - "alert_name": "high_cpu_usage_storage_v1", + "alert_name": "high_cpu_usage", // 与规则模板的name字段保持一致 "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}", "threshold": 90, - "watch_time": 300, - "match_time": "5m" + "watch_time": 300 } ] } ``` - 响应示例: -``` +```json { "status": "success", "message": "Rules synced to Prometheus" } ``` +#### 2. 更新单个规则模板 +- 方法与路径:`PUT /v1/alert-rules/:rule_name` +- 功能:更新指定的告警规则模板,系统会自动查找所有使用该规则的元信息并重新生成 Prometheus 规则 +- 路径参数: + - `rule_name`:规则名称(如 `high_cpu_usage`) +- 请求体示例: +```json +{ + "description": "CPU使用率异常告警(更新后)", + "expr": "avg(system_cpu_usage_percent)", + "op": ">=", + "severity": "critical" +} +``` +- 响应示例: +```json +{ + "status": "success", + "message": "Rule 'high_cpu_usage' updated and synced to Prometheus", + "affected_metas": 3 // 影响的元信息数量 +} +``` + +#### 3. 
更新单个规则元信息 +- 方法与路径:`PUT /v1/alert-rules/meta` +- 功能:更新指定规则的元信息,系统会根据对应的规则模板重新生成 Prometheus 规则 +- 请求体示例: +```json +{ + "rule_name": "high_cpu_usage", // 必填,对应规则模板的name + "labels": "{\"service\":\"storage-service\",\"version\":\"2.0.0\"}", // 必填,用于唯一标识 + "threshold": 85, + "watch_time": 600 +} +``` +- 响应示例: +```json +{ + "status": "success", + "message": "Rule meta updated and synced to Prometheus", + "rule_name": "high_cpu_usage", + "labels": "{\"service\":\"storage-service\",\"version\":\"2.0.0\"}" +} +``` + +#### 规则生成机制 +- **规则模板与元信息关联**:通过 `alert_name` 字段关联 + - `AlertRule.name` = `AlertRuleMeta.alert_name` +- **元信息唯一标识**:通过 `alert_name` + `labels` 的组合唯一确定一个元信息记录 +- **Prometheus 告警生成**: + - 所有基于同一规则模板的告警使用相同的 `alert` 名称(即规则模板的 `name`) + - 通过 `labels` 区分不同的服务实例 + +#### 字段说明 +- **AlertRule(规则模板)**: + - `name`:规则名称,作为 Prometheus 的 alert 名称 + - `description`:规则描述,可读的 title + - `expr`:PromQL 表达式,如 `sum(apitime) by (service, version)`,可包含时间范围 + - `op`:比较操作符(`>`, `<`, `=`, `!=`) + - `severity`:告警等级,通常进入告警的 labels.severity +- **AlertRuleMeta(元信息)**: + - `alert_name`:关联的规则名称(对应 alert_rules.name) + - `labels`:JSON 格式的标签,用于筛选特定服务(如 `{"service":"s3","version":"v1"}`) + - `threshold`:告警阈值 + - `watch_time`:持续时间(秒),对应 Prometheus 的 `for` 字段 + +#### 增量更新说明 +- **增量更新**:新接口支持增量更新,只需传入需要修改的字段 +- **自动匹配**: + - 更新规则模板时,系统自动查找所有 `alert_name` 匹配的元信息并重新生成规则 + - 更新元信息时,系统根据 `alert_name` + `labels` 查找并更新对应的元信息 +- **缓存机制**:系统在内存中缓存当前的规则和元信息,支持快速增量更新 + ## Alertmanager 集成 - 目标:将 Prometheus 触发的告警通过 Alertmanager 转发到监控告警模块 @@ -203,7 +276,7 @@ receivers: - `metadata-service` - `storage-service` - `queue-service` -- `third-party-service`(原文为 third-party-servrice,已更正) +- `third-party-service` - `mock-error-service` 所有服务的版本信息通过标签 `service_version` 暴露。 diff --git a/go.mod b/go.mod index 8015824..94b9643 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( github.com/prometheus/common v0.66.1 github.com/redis/go-redis/v9 v9.5.1 github.com/rs/zerolog v1.34.0 + gopkg.in/yaml.v3 v3.0.1 ) require ( 
@@ -52,5 +53,4 @@ require ( golang.org/x/sys v0.35.0 // indirect golang.org/x/text v0.28.0 // indirect google.golang.org/protobuf v1.36.8 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/internal/prometheus_adapter/api/alert_api.go b/internal/prometheus_adapter/api/alert_api.go index cb3e968..3206087 100644 --- a/internal/prometheus_adapter/api/alert_api.go +++ b/internal/prometheus_adapter/api/alert_api.go @@ -1,6 +1,7 @@ package api import ( + "fmt" "net/http" "github.com/fox-gonic/fox" @@ -10,6 +11,8 @@ import ( // setupAlertRouters 设置告警相关路由 func (api *Api) setupAlertRouters(router *fox.Engine) { router.POST("/v1/alert-rules/sync", api.SyncRules) + router.PUT("/v1/alert-rules/:rule_name", api.UpdateRule) + router.PUT("/v1/alert-rules/meta", api.UpdateRuleMeta) } // SyncRules 同步规则到Prometheus @@ -34,3 +37,86 @@ func (api *Api) SyncRules(c *fox.Context) { "message": "Rules synced to Prometheus", }) } + +// UpdateRule 更新单个规则模板 +// 只更新指定的规则,系统会自动查找所有使用该规则的元信息并重新生成 +func (api *Api) UpdateRule(c *fox.Context) { + ruleName := c.Param("rule_name") + if ruleName == "" { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "Rule name is required", nil) + return + } + + var req model.UpdateAlertRuleRequest + if err := c.ShouldBindJSON(&req); err != nil { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "Invalid request body: "+err.Error(), nil) + return + } + + // 构建完整的规则对象 + rule := model.AlertRule{ + Name: ruleName, + Description: req.Description, + Expr: req.Expr, + Op: req.Op, + Severity: req.Severity, + } + + err := api.alertService.UpdateRule(rule) + if err != nil { + SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, + "Failed to update rule: "+err.Error(), nil) + return + } + + // 获取受影响的元信息数量 + affectedCount := api.alertService.GetAffectedMetas(ruleName) + + c.JSON(http.StatusOK, map[string]interface{}{ + "status": "success", + "message": fmt.Sprintf("Rule 
'%s' updated and synced to Prometheus", ruleName), + "affected_metas": affectedCount, + }) +} + +// UpdateRuleMeta 更新单个规则元信息 +// 通过 alert_name + labels 唯一确定一个元信息记录 +func (api *Api) UpdateRuleMeta(c *fox.Context) { + var req model.UpdateAlertRuleMetaRequest + if err := c.ShouldBindJSON(&req); err != nil { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "Invalid request body: "+err.Error(), nil) + return + } + + // alert_name 和 labels 是必填的 + if req.AlertName == "" || req.Labels == "" { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "alert_name and labels are required", nil) + return + } + + // 构建完整的元信息对象 + meta := model.AlertRuleMeta{ + AlertName: req.AlertName, + Labels: req.Labels, + Threshold: req.Threshold, + WatchTime: req.WatchTime, + } + + err := api.alertService.UpdateRuleMeta(meta) + if err != nil { + SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, + "Failed to update rule meta: "+err.Error(), nil) + return + } + + c.JSON(http.StatusOK, map[string]interface{}{ + "status": "success", + "message": "Rule meta updated and synced to Prometheus", + "alert_name": req.AlertName, + "labels": req.Labels, + }) +} diff --git a/internal/prometheus_adapter/model/alert.go b/internal/prometheus_adapter/model/alert.go index 605c5fb..566a143 100644 --- a/internal/prometheus_adapter/model/alert.go +++ b/internal/prometheus_adapter/model/alert.go @@ -2,19 +2,18 @@ package model // AlertRule 告警规则表 - 定义告警规则模板 type AlertRule struct { - Name string `json:"name" gorm:"type:varchar(255);primaryKey"` - Description string `json:"description" gorm:"type:text"` - Expr string `json:"expr" gorm:"type:text;not null"` - Op string `json:"op" gorm:"type:enum('>', '<', '=', '!=');not null"` - Severity string `json:"severity" gorm:"type:varchar(50);not null"` + Name string `json:"name" gorm:"type:varchar(255);primaryKey"` // 主键,告警规则名称 + Description string `json:"description" gorm:"type:text"` // 
可读标题,可拼接渲染为可读的 title + Expr string `json:"expr" gorm:"type:text;not null"` // 左侧业务指标表达式,如 sum(apitime) by (service, version) + Op string `json:"op" gorm:"type:varchar(4);not null"` // 阈值比较方式(>, <, =, !=) + Severity string `json:"severity" gorm:"type:varchar(32);not null"` // 告警等级,通常进入告警的 labels.severity } // AlertRuleMeta 告警规则元信息表 - 存储服务级别的告警配置 // 用于将告警规则模板实例化为具体的服务告警 type AlertRuleMeta struct { - AlertName string `json:"alert_name" gorm:"type:varchar(255);primaryKey"` - Labels string `json:"labels" gorm:"type:text"` // JSON格式的服务标签,如:{"service":"storage-service","version":"1.0.0"} - Threshold float64 `json:"threshold"` // 告警阈值 - WatchTime int `json:"watch_time"` // 持续时间(秒),对应Prometheus的for字段 - MatchTime string `json:"match_time" gorm:"type:text"` // 时间范围表达式 + AlertName string `json:"alert_name" gorm:"type:varchar(255);index"` // 关联 alert_rules.name + Labels string `json:"labels" gorm:"type:jsonb"` // 适用标签,如 {"service":"s3","version":"v1"},为空表示全局 + Threshold float64 `json:"threshold"` // 阈值(会被渲染成特定规则的 threshold metric 数值) + WatchTime int `json:"watch_time"` // 持续时长(映射 Prometheus rule 的 for) } diff --git a/internal/prometheus_adapter/model/api.go b/internal/prometheus_adapter/model/api.go index f7f6e7f..4dc8421 100644 --- a/internal/prometheus_adapter/model/api.go +++ b/internal/prometheus_adapter/model/api.go @@ -42,16 +42,10 @@ type CreateAlertRuleRequest struct { // UpdateAlertRuleRequest 更新告警规则请求 type UpdateAlertRuleRequest struct { - Description *string `json:"description,omitempty"` - Expr *string `json:"expr,omitempty"` - Op *string `json:"op,omitempty" binding:"omitempty,oneof=> < = !="` - Severity *string `json:"severity,omitempty"` - - // 元信息字段(可选) - Labels map[string]string `json:"labels,omitempty"` - Threshold *float64 `json:"threshold,omitempty"` - WatchTime *int `json:"watch_time,omitempty"` - MatchTime *string `json:"match_time,omitempty"` + Description string `json:"description,omitempty"` + Expr string `json:"expr,omitempty"` + Op string 
`json:"op,omitempty" binding:"omitempty,oneof=> < = !="` + Severity string `json:"severity,omitempty"` } // CreateAlertRuleMetaRequest 创建告警规则元信息请求 @@ -65,10 +59,10 @@ type CreateAlertRuleMetaRequest struct { // UpdateAlertRuleMetaRequest 更新告警规则元信息请求 type UpdateAlertRuleMetaRequest struct { - Labels map[string]string `json:"labels,omitempty"` - Threshold *float64 `json:"threshold,omitempty"` - WatchTime *int `json:"watch_time,omitempty"` - MatchTime *string `json:"match_time,omitempty"` + AlertName string `json:"alert_name" binding:"required"` + Labels string `json:"labels" binding:"required"` + Threshold float64 `json:"threshold"` + WatchTime int `json:"watch_time"` } // SyncRulesRequest 同步规则请求 diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go index 2be8095..0e4854a 100644 --- a/internal/prometheus_adapter/service/alert_service.go +++ b/internal/prometheus_adapter/service/alert_service.go @@ -5,6 +5,7 @@ import ( "fmt" "net/http" "os" + "os/exec" "path/filepath" "strings" @@ -18,24 +19,34 @@ import ( type AlertService struct { promClient *client.PrometheusClient rulesFilePath string + // 内存中缓存当前规则,用于增量更新 + currentRules []model.AlertRule + currentRuleMetas []model.AlertRuleMeta } // NewAlertService 创建告警服务 func NewAlertService(promClient *client.PrometheusClient) *AlertService { rulesFilePath := os.Getenv("PROMETHEUS_RULES_FILE") if rulesFilePath == "" { - rulesFilePath = "/etc/prometheus/rules/alert_rules.yml" + // 在本地生成规则文件,用于调试和后续同步到远程容器 + rulesFilePath = "./prometheus_rules/alert_rules.yml" } return &AlertService{ - promClient: promClient, - rulesFilePath: rulesFilePath, + promClient: promClient, + rulesFilePath: rulesFilePath, + currentRules: []model.AlertRule{}, + currentRuleMetas: []model.AlertRuleMeta{}, } } // SyncRulesToPrometheus 同步规则到Prometheus // 接收完整的规则列表,生成Prometheus规则文件并重载配置 func (s *AlertService) SyncRulesToPrometheus(rules []model.AlertRule, ruleMetas []model.AlertRuleMeta) error 
{ + // 保存到内存缓存 + s.currentRules = rules + s.currentRuleMetas = ruleMetas + // 构建Prometheus规则文件 prometheusRules := s.buildPrometheusRules(rules, ruleMetas) @@ -73,14 +84,9 @@ func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas [ // 查找对应的规则模板 var rule *model.AlertRule - // 尝试从alert_name中提取规则名 - // 假设alert_name格式为: {rule_name}_{service}_{version} 或类似格式 - for ruleName, r := range ruleMap { - if strings.HasPrefix(meta.AlertName, ruleName) { - rule = r - break - } - } + // 通过 alert_name 直接查找对应的规则模板 + // AlertRuleMeta.alert_name 关联 AlertRule.name + rule = ruleMap[meta.AlertName] if rule == nil { log.Warn(). @@ -105,7 +111,6 @@ func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas [ // 添加severity标签 labels["severity"] = rule.Severity - labels["rule_name"] = rule.Name // 构建表达式 expr := s.buildExpression(rule, &meta) @@ -122,8 +127,9 @@ func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas [ forDuration = fmt.Sprintf("%ds", meta.WatchTime) } + // 使用规则名作为 alert 名称,通过 labels 区分不同实例 promRule := model.PrometheusRule{ - Alert: meta.AlertName, + Alert: rule.Name, // 使用规则名作为 alert 名称 Expr: expr, For: forDuration, Labels: labels, @@ -179,10 +185,6 @@ func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertR if len(labels) > 0 { labelMatchers := []string{} for k, v := range labels { - // 跳过内部使用的标签 - if k == "rule_name" { - continue - } labelMatchers = append(labelMatchers, fmt.Sprintf(`%s="%s"`, k, v)) } @@ -209,20 +211,6 @@ func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertR } } - // 添加时间范围 - if meta.MatchTime != "" { - // 查找最后一个指标,添加时间范围 - if !strings.Contains(expr, "[") { - // 简单处理:在第一个空格前添加时间范围 - parts := strings.SplitN(expr, " ", 2) - if len(parts) == 2 { - expr = parts[0] + "[" + meta.MatchTime + "] " + parts[1] - } else { - expr = expr + "[" + meta.MatchTime + "]" - } - } - } - // 添加比较操作符和阈值 if meta.Threshold != 0 { expr = fmt.Sprintf("%s %s %f", expr, 
rule.Op, meta.Threshold) @@ -253,7 +241,13 @@ func (s *AlertService) writeRulesFile(rules *model.PrometheusRuleFile) error { log.Info(). Str("file", s.rulesFilePath). Int("groups", len(rules.Groups)). - Msg("Prometheus rules file updated") + Msg("Prometheus rules file updated locally") + + // 同步到 Prometheus 容器 + if err := s.syncToPrometheusContainer(); err != nil { + log.Warn().Err(err).Msg("Failed to sync rules to Prometheus container") + // 不返回错误,因为本地文件已经生成成功 + } return nil } @@ -280,3 +274,194 @@ func (s *AlertService) reloadPrometheus() error { log.Info().Msg("Prometheus configuration reloaded") return nil } + +// syncToPrometheusContainer 同步规则文件到本地 Prometheus 容器 +func (s *AlertService) syncToPrometheusContainer() error { + // 获取容器名称,默认为 mock-s3-prometheus + containerName := os.Getenv("PROMETHEUS_CONTAINER") + if containerName == "" { + containerName = "mock-s3-prometheus" + } + + // 1. 创建容器内的规则目录(如果不存在) + cmdMkdir := exec.Command("docker", "exec", containerName, "mkdir", "-p", "/etc/prometheus/rules") + if output, err := cmdMkdir.CombinedOutput(); err != nil { + // 目录可能已存在,记录日志但不返回错误 + log.Debug(). + Str("output", string(output)). + Msg("mkdir in container (may already exist)") + } + + // 2. 将规则文件拷贝到容器内 + cmdCopy := exec.Command("docker", "cp", s.rulesFilePath, fmt.Sprintf("%s:/etc/prometheus/rules/alert_rules.yml", containerName)) + if output, err := cmdCopy.CombinedOutput(); err != nil { + return fmt.Errorf("failed to copy rules file to container: %w, output: %s", err, string(output)) + } + + log.Info(). + Str("container", containerName). + Str("file", s.rulesFilePath). + Msg("Rules synced to Prometheus container") + + // 3. 
确保 Prometheus 配置包含 rule_files + if err := s.ensurePrometheusRuleConfig(containerName); err != nil { + log.Warn().Err(err).Msg("Failed to ensure Prometheus rule configuration") + } + + return nil +} + +// ensurePrometheusRuleConfig 确保 Prometheus 配置文件包含 rule_files 配置 +func (s *AlertService) ensurePrometheusRuleConfig(containerName string) error { + configPath := "/etc/prometheus/prometheus.yml" + + // 1. 检查配置文件是否已包含 rule_files + cmdCheck := exec.Command("docker", "exec", containerName, "grep", "-q", "rule_files:", configPath) + if err := cmdCheck.Run(); err == nil { + // 已经包含 rule_files,不需要修改 + log.Debug().Msg("Prometheus config already contains rule_files") + return nil + } + + log.Info().Msg("Adding rule_files configuration to Prometheus") + + // 3. 在 global 部分后添加 rule_files 配置 + // 使用 sed 在 global: 块后插入 rule_files 配置 + sedScript := `'/^global:/,/^[^[:space:]]/ { + /^[^[:space:]]/ { + i\ +# Alert rules\ +rule_files:\ + - "/etc/prometheus/rules/*.yml"\ + + } + }'` + + cmdSed := exec.Command("docker", "exec", containerName, "sh", "-c", + fmt.Sprintf(`sed -i '%s' %s`, sedScript, configPath)) + + if output, err := cmdSed.CombinedOutput(); err != nil { + // 如果 sed 失败,尝试使用更简单的方法 + log.Warn(). + Str("output", string(output)). + Msg("sed failed, trying alternative method") + + // 使用 awk 方法 + awkScript := `awk '/^global:/ {print; getline; print; print "# Alert rules"; print "rule_files:"; print " - \"/etc/prometheus/rules/*.yml\""; next} {print}' %s > %s.tmp && mv %s.tmp %s` + cmdAwk := exec.Command("docker", "exec", containerName, "sh", "-c", + fmt.Sprintf(awkScript, configPath, configPath, configPath, configPath)) + + if output, err := cmdAwk.CombinedOutput(); err != nil { + return fmt.Errorf("failed to add rule_files to config: %w, output: %s", err, string(output)) + } + } + + log.Info().Msg("Successfully added rule_files configuration to Prometheus") + + // 4. 
重启 Prometheus 容器以应用配置 + cmdRestart := exec.Command("docker", "restart", containerName) + if output, err := cmdRestart.CombinedOutput(); err != nil { + return fmt.Errorf("failed to restart Prometheus: %w, output: %s", err, string(output)) + } + + log.Info().Msg("Prometheus restarted with new configuration") + return nil +} + +// UpdateRule 更新单个规则模板 +// 只更新传入的规则,其他规则和所有元信息保持不变 +func (s *AlertService) UpdateRule(rule model.AlertRule) error { + // 查找并更新规则 + found := false + for i, r := range s.currentRules { + if r.Name == rule.Name { + s.currentRules[i] = rule + found = true + break + } + } + + if !found { + // 如果规则不存在,添加新规则 + s.currentRules = append(s.currentRules, rule) + } + + // 统计受影响的元信息数量 + affectedCount := 0 + for _, meta := range s.currentRuleMetas { + if meta.AlertName == rule.Name { + affectedCount++ + } + } + + log.Info(). + Str("rule", rule.Name). + Int("affected_metas", affectedCount). + Msg("Updating rule and affected metas") + + // 使用更新后的规则重新生成并同步 + return s.regenerateAndSync() +} + +// UpdateRuleMeta 更新单个规则元信息 +// 通过 alert_name + labels 唯一确定一个元信息记录 +func (s *AlertService) UpdateRuleMeta(meta model.AlertRuleMeta) error { + // 查找并更新元信息 + found := false + for i, m := range s.currentRuleMetas { + // 通过 alert_name + labels 唯一确定 + if m.AlertName == meta.AlertName && m.Labels == meta.Labels { + s.currentRuleMetas[i] = meta + found = true + break + } + } + + if !found { + // 如果元信息不存在,添加新元信息 + s.currentRuleMetas = append(s.currentRuleMetas, meta) + } + + log.Info(). + Str("alert_name", meta.AlertName). + Str("labels", meta.Labels). 
+ Msg("Updating rule meta") + + // 使用更新后的元信息重新生成并同步 + return s.regenerateAndSync() +} + +// regenerateAndSync 使用当前内存中的规则和元信息重新生成Prometheus规则并同步 +func (s *AlertService) regenerateAndSync() error { + // 构建Prometheus规则文件 + prometheusRules := s.buildPrometheusRules(s.currentRules, s.currentRuleMetas) + + // 写入规则文件 + if err := s.writeRulesFile(prometheusRules); err != nil { + return fmt.Errorf("failed to write rules file: %w", err) + } + + // 通知Prometheus重新加载配置 + if err := s.reloadPrometheus(); err != nil { + log.Warn().Err(err).Msg("Failed to reload Prometheus, rules file has been updated") + // 不返回错误,因为文件已经更新成功 + } + + log.Info(). + Int("rules_count", len(s.currentRules)). + Int("metas_count", len(s.currentRuleMetas)). + Msg("Rules regenerated and synced to Prometheus") + + return nil +} + +// GetAffectedMetas 获取受影响的元信息数量 +func (s *AlertService) GetAffectedMetas(ruleName string) int { + count := 0 + for _, meta := range s.currentRuleMetas { + if meta.AlertName == ruleName { + count++ + } + } + return count +} diff --git a/internal/prometheus_adapter/test_alert_update.sh b/internal/prometheus_adapter/test_alert_update.sh new file mode 100755 index 0000000..a8af0ea --- /dev/null +++ b/internal/prometheus_adapter/test_alert_update.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +# 测试增量更新告警规则功能 + +BASE_URL="http://localhost:8080" + +echo "=== 测试增量更新告警规则 ===" + +# 1. 先进行全量同步,创建初始规则 +echo -e "\n1. 全量同步规则..." 
+curl -X POST ${BASE_URL}/v1/alert-rules/sync \ + -H "Content-Type: application/json" \ + -d '{ + "rules": [ + { + "name": "high_cpu_usage", + "description": "CPU使用率过高告警", + "expr": "system_cpu_usage_percent", + "op": ">", + "severity": "warning" + }, + { + "name": "high_memory_usage", + "description": "内存使用率过高告警", + "expr": "system_memory_usage_percent", + "op": ">", + "severity": "warning" + } + ], + "rule_metas": [ + { + "alert_name": "high_cpu_usage", + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}", + "threshold": 80, + "watch_time": 300 + }, + { + "alert_name": "high_cpu_usage", + "labels": "{\"service\":\"metadata-service\",\"version\":\"1.0.0\"}", + "threshold": 85, + "watch_time": 300 + }, + { + "alert_name": "high_memory_usage", + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}", + "threshold": 90, + "watch_time": 600 + } + ] + }' | jq . + +sleep 2 + +# 2. 更新单个规则模板 +echo -e "\n2. 更新规则模板 high_cpu_usage..." +curl -X PUT ${BASE_URL}/v1/alert-rules/high_cpu_usage \ + -H "Content-Type: application/json" \ + -d '{ + "description": "CPU使用率异常告警(更新后)", + "expr": "avg(system_cpu_usage_percent[5m])", + "op": ">=", + "severity": "critical" + }' | jq . + +sleep 2 + +# 3. 更新单个规则元信息 +echo -e "\n3. 更新规则元信息..." +curl -X PUT ${BASE_URL}/v1/alert-rules/meta \ + -H "Content-Type: application/json" \ + -d '{ + "alert_name": "high_cpu_usage", + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}", + "threshold": 75, + "watch_time": 600 + }' | jq . + +sleep 2 + +# 4. 添加新的元信息 +echo -e "\n4. 添加新的元信息..." +curl -X PUT ${BASE_URL}/v1/alert-rules/meta \ + -H "Content-Type: application/json" \ + -d '{ + "alert_name": "high_memory_usage", + "labels": "{\"service\":\"queue-service\",\"version\":\"2.0.0\"}", + "threshold": 95, + "watch_time": 300 + }' | jq . 
+ +echo -e "\n=== 测试完成 ===" \ No newline at end of file diff --git a/scripts/prometheus_adapter/build.sh b/scripts/prometheus_adapter/build.sh new file mode 100755 index 0000000..ec5c08a --- /dev/null +++ b/scripts/prometheus_adapter/build.sh @@ -0,0 +1,180 @@ +#!/bin/bash + +# Prometheus Adapter 打包脚本 +# 将编译产物和必要文件打包到 build 目录 + +set -e + +# 颜色输出 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# 打印日志函数 +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +# 项目根目录 +PROJECT_ROOT=$(cd "$(dirname "$0")"/../.. && pwd) +cd "$PROJECT_ROOT" + +# 配置 +APP_NAME="prometheus_adapter" +BUILD_DIR="build/${APP_NAME}" +VERSION=$(git describe --tags --always --dirty 2>/dev/null || echo "dev") +BUILD_TIME=$(date -u '+%Y-%m-%d_%H:%M:%S') +GOOS=${GOOS:-linux} +GOARCH=${GOARCH:-amd64} + +log_info "开始构建 ${APP_NAME}" +log_info "版本: ${VERSION}" +log_info "构建时间: ${BUILD_TIME}" +log_info "目标系统: ${GOOS}/${GOARCH}" + +# 清理旧的构建目录 +if [ -d "$BUILD_DIR" ]; then + log_warn "清理旧的构建目录..." + rm -rf "$BUILD_DIR" +fi + +# 创建构建目录 +log_info "创建构建目录..." +mkdir -p "$BUILD_DIR/bin" +mkdir -p "$BUILD_DIR/docs" +mkdir -p "$BUILD_DIR/scripts" + +# 编译二进制文件 +log_info "编译 ${APP_NAME}..." +LDFLAGS="-X main.Version=${VERSION} -X main.BuildTime=${BUILD_TIME}" +CGO_ENABLED=0 GOOS=$GOOS GOARCH=$GOARCH go build \ + -ldflags "$LDFLAGS" \ + -o "$BUILD_DIR/bin/${APP_NAME}" \ + "./cmd/${APP_NAME}" + +if [ $? -ne 0 ]; then + log_error "编译失败" + exit 1 +fi + +# 复制文档 +log_info "复制文档..." +if [ -f "docs/${APP_NAME}/README.md" ]; then + cp "docs/${APP_NAME}/README.md" "$BUILD_DIR/docs/" +fi + +# 复制测试脚本 +log_info "复制脚本..." +if [ -f "internal/${APP_NAME}/test_alert_update.sh" ]; then + cp "internal/${APP_NAME}/test_alert_update.sh" "$BUILD_DIR/scripts/" + chmod +x "$BUILD_DIR/scripts/test_alert_update.sh" +fi + +# 创建启动脚本 +log_info "创建启动脚本..." 
+cat > "$BUILD_DIR/start.sh" << 'EOF' +#!/bin/bash + +# Prometheus Adapter 启动脚本 + +# 默认配置 +PROMETHEUS_URL=${PROMETHEUS_URL:-"http://localhost:9090"} +PORT=${PORT:-8080} +LOG_LEVEL=${LOG_LEVEL:-"info"} + +# 获取脚本所在目录 +SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) +BIN_PATH="$SCRIPT_DIR/bin/prometheus_adapter" + +# 检查二进制文件 +if [ ! -f "$BIN_PATH" ]; then + echo "错误: 找不到可执行文件 $BIN_PATH" + exit 1 +fi + +# 启动参数 +ARGS="" +ARGS="$ARGS --prometheus-url=$PROMETHEUS_URL" +ARGS="$ARGS --port=$PORT" +ARGS="$ARGS --log-level=$LOG_LEVEL" + +echo "启动 Prometheus Adapter..." +echo "Prometheus URL: $PROMETHEUS_URL" +echo "监听端口: $PORT" +echo "日志级别: $LOG_LEVEL" + +# 启动服务 +exec "$BIN_PATH" $ARGS +EOF +chmod +x "$BUILD_DIR/start.sh" + +# 创建停止脚本 +log_info "创建停止脚本..." +cat > "$BUILD_DIR/stop.sh" << 'EOF' +#!/bin/bash + +# Prometheus Adapter 停止脚本 + +APP_NAME="prometheus_adapter" + +# 查找进程 +PID=$(ps aux | grep -v grep | grep "$APP_NAME" | awk '{print $2}') + +if [ -z "$PID" ]; then + echo "没有找到运行中的 $APP_NAME 进程" + exit 0 +fi + +echo "停止 $APP_NAME (PID: $PID)..." +kill -TERM $PID + +# 等待进程退出 +sleep 2 + +# 检查是否还在运行 +if ps -p $PID > /dev/null 2>&1; then + echo "强制停止进程..." + kill -KILL $PID +fi + +echo "$APP_NAME 已停止" +EOF +chmod +x "$BUILD_DIR/stop.sh" + +# 创建版本信息文件 +log_info "创建版本信息..." +cat > "$BUILD_DIR/VERSION" << EOF +Application: ${APP_NAME} +Version: ${VERSION} +Build Time: ${BUILD_TIME} +Build OS/Arch: ${GOOS}/${GOARCH} +EOF + +# 打包成 tar.gz +ARCHIVE_NAME="${APP_NAME}_${VERSION}_${GOOS}_${GOARCH}.tar.gz" +log_info "创建归档文件: $ARCHIVE_NAME" +cd build +tar -czf "$ARCHIVE_NAME" "$APP_NAME" +cd .. + +# 输出构建信息 +log_info "构建成功!" 
+echo "" +echo "构建产物:" +echo " - 目录: $BUILD_DIR" +echo " - 归档: build/$ARCHIVE_NAME" +echo "" +echo "文件列表:" +ls -lah "$BUILD_DIR/" +echo "" +echo "归档大小:" +ls -lah "build/$ARCHIVE_NAME" \ No newline at end of file diff --git a/scripts/prometheus_adapter/deploy.sh b/scripts/prometheus_adapter/deploy.sh new file mode 100755 index 0000000..85bddf7 --- /dev/null +++ b/scripts/prometheus_adapter/deploy.sh @@ -0,0 +1,350 @@ +#!/bin/bash + +# Prometheus Adapter 部署脚本 +# 将打包好的文件解压并部署到指定目录 + +set -e + +# 颜色输出 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 打印日志函数 +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_debug() { + echo -e "${BLUE}[DEBUG]${NC} $1" +} + +# 显示使用帮助 +show_usage() { + cat << EOF +使用方法: + $0 [选项] <归档文件> + +选项: + -d, --deploy-dir DIR 指定部署目录 (默认: /home/qboxserver/zeroops_prometheus_adapter) + -b, --backup 部署前备份现有目录 + -s, --start 部署后自动启动服务 + -r, --restart 如果服务已运行则重启 + -f, --force 强制部署,不询问确认 + -h, --help 显示此帮助信息 + +示例: + $0 prometheus_adapter_v1.0.0_linux_amd64.tar.gz + $0 -d /opt/prometheus_adapter -b -s prometheus_adapter.tar.gz + $0 --backup --restart prometheus_adapter.tar.gz + +EOF + exit 0 +} + +# 默认配置 +DEPLOY_DIR="/home/qboxserver/zeroops_prometheus_adapter" +BACKUP=false +START_SERVICE=false +RESTART_SERVICE=false +FORCE_DEPLOY=false +ARCHIVE_FILE="" + +# 解析命令行参数 +while [[ $# -gt 0 ]]; do + case $1 in + -d|--deploy-dir) + DEPLOY_DIR="$2" + shift 2 + ;; + -b|--backup) + BACKUP=true + shift + ;; + -s|--start) + START_SERVICE=true + shift + ;; + -r|--restart) + RESTART_SERVICE=true + shift + ;; + -f|--force) + FORCE_DEPLOY=true + shift + ;; + -h|--help) + show_usage + ;; + *) + if [ -z "$ARCHIVE_FILE" ]; then + ARCHIVE_FILE="$1" + else + log_error "未知参数: $1" + show_usage + fi + shift + ;; + esac +done + +# 检查归档文件参数 +if [ -z "$ARCHIVE_FILE" ]; then + log_error "请指定要部署的归档文件" + 
show_usage +fi + +# 检查归档文件是否存在 +if [ ! -f "$ARCHIVE_FILE" ]; then + log_error "找不到归档文件: $ARCHIVE_FILE" + exit 1 +fi + +# 获取归档文件的绝对路径 +ARCHIVE_FILE=$(realpath "$ARCHIVE_FILE") + +log_info "部署配置:" +log_info " 归档文件: $ARCHIVE_FILE" +log_info " 部署目录: $DEPLOY_DIR" +log_info " 备份现有: $BACKUP" +log_info " 自动启动: $START_SERVICE" +log_info " 重启服务: $RESTART_SERVICE" + +# 确认部署 +if [ "$FORCE_DEPLOY" = false ]; then + echo -n "确认部署? (y/N): " + read -r CONFIRM + if [ "$CONFIRM" != "y" ] && [ "$CONFIRM" != "Y" ]; then + log_warn "部署已取消" + exit 0 + fi +fi + +# 检查是否有运行中的服务 +check_running_service() { + local pid=$(ps aux | grep -v grep | grep "prometheus_adapter" | grep -v "$0" | awk '{print $2}') + if [ -n "$pid" ]; then + echo "$pid" + fi +} + +# 停止运行中的服务 +stop_service() { + local pid=$1 + if [ -n "$pid" ]; then + log_warn "停止运行中的服务 (PID: $pid)..." + kill -TERM "$pid" 2>/dev/null || true + + # 等待进程退出 + local count=0 + while [ $count -lt 10 ] && ps -p "$pid" > /dev/null 2>&1; do + sleep 1 + count=$((count + 1)) + done + + # 如果还没退出,强制停止 + if ps -p "$pid" > /dev/null 2>&1; then + log_warn "强制停止进程..." + kill -KILL "$pid" 2>/dev/null || true + fi + + log_info "服务已停止" + fi +} + +# 检查运行中的服务 +RUNNING_PID=$(check_running_service) +if [ -n "$RUNNING_PID" ]; then + log_warn "检测到运行中的 prometheus_adapter 服务 (PID: $RUNNING_PID)" + if [ "$RESTART_SERVICE" = true ] || [ "$FORCE_DEPLOY" = true ]; then + stop_service "$RUNNING_PID" + else + log_error "服务正在运行,请先停止服务或使用 -r/--restart 选项" + exit 1 + fi +fi + +# 备份现有目录 +if [ "$BACKUP" = true ] && [ -d "$DEPLOY_DIR" ]; then + BACKUP_DIR="${DEPLOY_DIR}_backup_$(date +%Y%m%d_%H%M%S)" + log_info "备份现有目录到: $BACKUP_DIR" + + # 需要sudo权限 + if [ -w "$(dirname "$DEPLOY_DIR")" ]; then + mv "$DEPLOY_DIR" "$BACKUP_DIR" + else + log_warn "需要管理员权限来备份目录" + sudo mv "$DEPLOY_DIR" "$BACKUP_DIR" + fi +fi + +# 创建临时解压目录 +TEMP_DIR=$(mktemp -d) +log_info "创建临时目录: $TEMP_DIR" + +# 解压归档文件 +log_info "解压归档文件..." 
+tar -xzf "$ARCHIVE_FILE" -C "$TEMP_DIR" + +# 查找解压后的目录 +EXTRACTED_DIR=$(find "$TEMP_DIR" -maxdepth 1 -type d -name "prometheus_adapter" | head -n 1) +if [ -z "$EXTRACTED_DIR" ]; then + log_error "解压失败:找不到 prometheus_adapter 目录" + rm -rf "$TEMP_DIR" + exit 1 +fi + +# 创建部署目录(如果需要sudo) +log_info "创建部署目录..." +if [ -w "$(dirname "$DEPLOY_DIR")" ]; then + mkdir -p "$(dirname "$DEPLOY_DIR")" +else + log_warn "需要管理员权限来创建部署目录" + sudo mkdir -p "$(dirname "$DEPLOY_DIR")" +fi + +# 移动到部署目录 +log_info "部署到: $DEPLOY_DIR" +if [ -w "$(dirname "$DEPLOY_DIR")" ]; then + if [ -d "$DEPLOY_DIR" ]; then + rm -rf "$DEPLOY_DIR" + fi + mv "$EXTRACTED_DIR" "$DEPLOY_DIR" +else + log_warn "需要管理员权限来部署" + if [ -d "$DEPLOY_DIR" ]; then + sudo rm -rf "$DEPLOY_DIR" + fi + sudo mv "$EXTRACTED_DIR" "$DEPLOY_DIR" +fi + +# 设置权限 +log_info "设置文件权限..." +if [ -w "$DEPLOY_DIR" ]; then + chmod +x "$DEPLOY_DIR/bin/prometheus_adapter" + chmod +x "$DEPLOY_DIR/start.sh" + chmod +x "$DEPLOY_DIR/stop.sh" + [ -f "$DEPLOY_DIR/scripts/test_alert_update.sh" ] && chmod +x "$DEPLOY_DIR/scripts/test_alert_update.sh" +else + sudo chmod +x "$DEPLOY_DIR/bin/prometheus_adapter" + sudo chmod +x "$DEPLOY_DIR/start.sh" + sudo chmod +x "$DEPLOY_DIR/stop.sh" + [ -f "$DEPLOY_DIR/scripts/test_alert_update.sh" ] && sudo chmod +x "$DEPLOY_DIR/scripts/test_alert_update.sh" +fi + +# 清理临时目录 +rm -rf "$TEMP_DIR" + +# 显示部署信息 +log_info "部署成功!" +echo "" +echo "部署信息:" +echo " 目录: $DEPLOY_DIR" +echo "" +echo "版本信息:" +if [ -f "$DEPLOY_DIR/VERSION" ]; then + cat "$DEPLOY_DIR/VERSION" +else + echo " 无版本信息" +fi +echo "" +echo "文件列表:" +ls -lah "$DEPLOY_DIR/" + +# 创建systemd服务文件(可选) +create_systemd_service() { + local service_name="prometheus-adapter" + local service_file="/etc/systemd/system/${service_name}.service" + + log_info "创建 systemd 服务..." 
+ + cat << EOF | sudo tee "$service_file" > /dev/null +[Unit] +Description=Prometheus Adapter Service +After=network.target + +[Service] +Type=simple +User=qboxserver +Group=qboxserver +WorkingDirectory=$DEPLOY_DIR +Environment="PROMETHEUS_URL=http://localhost:9090" +Environment="PORT=8080" +Environment="LOG_LEVEL=info" +ExecStart=$DEPLOY_DIR/bin/prometheus_adapter +ExecStop=$DEPLOY_DIR/stop.sh +Restart=on-failure +RestartSec=10 + +[Install] +WantedBy=multi-user.target +EOF + + sudo systemctl daemon-reload + log_info "Systemd 服务已创建: ${service_name}.service" + echo "" + echo "可以使用以下命令管理服务:" + echo " 启动: sudo systemctl start ${service_name}" + echo " 停止: sudo systemctl stop ${service_name}" + echo " 重启: sudo systemctl restart ${service_name}" + echo " 状态: sudo systemctl status ${service_name}" + echo " 开机自启: sudo systemctl enable ${service_name}" +} + +# 询问是否创建systemd服务 +if [ "$FORCE_DEPLOY" = false ]; then + echo "" + echo -n "是否创建 systemd 服务? (y/N): " + read -r CREATE_SERVICE + if [ "$CREATE_SERVICE" = "y" ] || [ "$CREATE_SERVICE" = "Y" ]; then + create_systemd_service + fi +fi + +# 启动服务 +if [ "$START_SERVICE" = true ] || [ "$RESTART_SERVICE" = true ]; then + log_info "启动服务..." + + # 设置环境变量 + export PROMETHEUS_URL="${PROMETHEUS_URL:-http://localhost:9090}" + export PORT="${PORT:-8080}" + export LOG_LEVEL="${LOG_LEVEL:-info}" + + # 启动服务 + cd "$DEPLOY_DIR" + nohup ./start.sh > prometheus_adapter.log 2>&1 & + + # 等待服务启动 + sleep 2 + + # 检查是否启动成功 + NEW_PID=$(check_running_service) + if [ -n "$NEW_PID" ]; then + log_info "服务已启动 (PID: $NEW_PID)" + echo "" + echo "查看日志: tail -f $DEPLOY_DIR/prometheus_adapter.log" + else + log_error "服务启动失败,请检查日志" + exit 1 + fi +else + echo "" + echo "手动启动服务:" + echo " cd $DEPLOY_DIR" + echo " ./start.sh" + echo "" + echo "或使用后台模式:" + echo " nohup ./start.sh > prometheus_adapter.log 2>&1 &" +fi + +log_info "部署完成!" 
\ No newline at end of file From f6c1ad1888ca9343dc24738e56cce7697d9fdb5d Mon Sep 17 00:00:00 2001 From: Ding Date: Wed, 24 Sep 2025 22:35:22 +0800 Subject: [PATCH 08/18] =?UTF-8?q?feat(observability):=20=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?Prometheus=E5=91=8A=E8=AD=A6=E8=A7=84=E5=88=99=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E5=B9=B6=E4=BC=98=E5=8C=96=E8=A7=84=E5=88=99=E5=90=8C?= =?UTF-8?q?=E6=AD=A5=E6=9C=BA=E5=88=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 在prometheus.yml中配置告警规则文件路径 - 修改docker-compose.yml挂载规则目录 - 重构AlertService,移除本地文件存储,直接写入容器 - 添加容器内规则文件写入的容错机制 --- .../service/alert_service.go | 69 ++++++++++--------- mock/s3/deployments/docker-compose.yml | 1 + .../deployments/observability/prometheus.yml | 4 ++ 3 files changed, 43 insertions(+), 31 deletions(-) diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go index 0e4854a..f9fe908 100644 --- a/internal/prometheus_adapter/service/alert_service.go +++ b/internal/prometheus_adapter/service/alert_service.go @@ -6,7 +6,6 @@ import ( "net/http" "os" "os/exec" - "path/filepath" "strings" "github.com/qiniu/zeroops/internal/prometheus_adapter/client" @@ -17,8 +16,7 @@ import ( // AlertService 告警服务 - 仅负责与Prometheus交互,不存储规则 type AlertService struct { - promClient *client.PrometheusClient - rulesFilePath string + promClient *client.PrometheusClient // 内存中缓存当前规则,用于增量更新 currentRules []model.AlertRule currentRuleMetas []model.AlertRuleMeta @@ -26,15 +24,8 @@ type AlertService struct { // NewAlertService 创建告警服务 func NewAlertService(promClient *client.PrometheusClient) *AlertService { - rulesFilePath := os.Getenv("PROMETHEUS_RULES_FILE") - if rulesFilePath == "" { - // 在本地生成规则文件,用于调试和后续同步到远程容器 - rulesFilePath = "./prometheus_rules/alert_rules.yml" - } - return &AlertService{ promClient: promClient, - rulesFilePath: rulesFilePath, currentRules: []model.AlertRule{}, currentRuleMetas: []model.AlertRuleMeta{}, } @@ 
-221,34 +212,50 @@ func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertR // writeRulesFile 写入规则文件 func (s *AlertService) writeRulesFile(rules *model.PrometheusRuleFile) error { - // 确保目录存在 - dir := filepath.Dir(s.rulesFilePath) - if err := os.MkdirAll(dir, 0755); err != nil { - return fmt.Errorf("failed to create rules directory: %w", err) - } - // 序列化为YAML data, err := yaml.Marshal(rules) if err != nil { return fmt.Errorf("failed to marshal rules: %w", err) } - // 写入文件 - if err := os.WriteFile(s.rulesFilePath, data, 0644); err != nil { - return fmt.Errorf("failed to write rules file: %w", err) + // 获取容器名称 + containerName := os.Getenv("PROMETHEUS_CONTAINER") + if containerName == "" { + containerName = "mock-s3-prometheus" } - log.Info(). - Str("file", s.rulesFilePath). - Int("groups", len(rules.Groups)). - Msg("Prometheus rules file updated locally") + // 直接写入到容器内的规则目录 + // 使用docker exec和echo命令写入文件 + cmd := exec.Command("docker", "exec", containerName, "sh", "-c", + fmt.Sprintf("cat > /etc/prometheus/rules/alert_rules.yml << 'EOF'\n%s\nEOF", string(data))) - // 同步到 Prometheus 容器 - if err := s.syncToPrometheusContainer(); err != nil { - log.Warn().Err(err).Msg("Failed to sync rules to Prometheus container") - // 不返回错误,因为本地文件已经生成成功 + if output, err := cmd.CombinedOutput(); err != nil { + // 如果直接写入容器失败,尝试使用临时文件+docker cp + log.Warn(). + Err(err). + Str("output", string(output)). + Msg("Failed to write directly to container, trying docker cp") + + // 写入临时文件 + tmpFile := "/tmp/prometheus_alert_rules.yml" + if err := os.WriteFile(tmpFile, data, 0644); err != nil { + return fmt.Errorf("failed to write temp rules file: %w", err) + } + + // 使用docker cp复制到容器 + if err := s.syncRuleFileToContainer(tmpFile); err != nil { + return fmt.Errorf("failed to sync to container: %w", err) + } + + // 清理临时文件 + os.Remove(tmpFile) } + log.Info(). + Str("container", containerName). + Int("groups", len(rules.Groups)). 
+ Msg("Prometheus rules file updated in container") + return nil } @@ -275,8 +282,8 @@ func (s *AlertService) reloadPrometheus() error { return nil } -// syncToPrometheusContainer 同步规则文件到本地 Prometheus 容器 -func (s *AlertService) syncToPrometheusContainer() error { +// syncRuleFileToContainer 同步规则文件到容器 +func (s *AlertService) syncRuleFileToContainer(filePath string) error { // 获取容器名称,默认为 mock-s3-prometheus containerName := os.Getenv("PROMETHEUS_CONTAINER") if containerName == "" { @@ -293,14 +300,14 @@ func (s *AlertService) syncToPrometheusContainer() error { } // 2. 将规则文件拷贝到容器内 - cmdCopy := exec.Command("docker", "cp", s.rulesFilePath, fmt.Sprintf("%s:/etc/prometheus/rules/alert_rules.yml", containerName)) + cmdCopy := exec.Command("docker", "cp", filePath, fmt.Sprintf("%s:/etc/prometheus/rules/alert_rules.yml", containerName)) if output, err := cmdCopy.CombinedOutput(); err != nil { return fmt.Errorf("failed to copy rules file to container: %w, output: %s", err, string(output)) } log.Info(). Str("container", containerName). - Str("file", s.rulesFilePath). + Str("file", filePath). Msg("Rules synced to Prometheus container") // 3. 
确保 Prometheus 配置包含 rule_files diff --git a/mock/s3/deployments/docker-compose.yml b/mock/s3/deployments/docker-compose.yml index 61f13cd..377ec3d 100644 --- a/mock/s3/deployments/docker-compose.yml +++ b/mock/s3/deployments/docker-compose.yml @@ -84,6 +84,7 @@ services: volumes: - prometheus-data:/prometheus - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./prometheus/rules:/etc/prometheus/rules:rw command: - '--config.file=/etc/prometheus/prometheus.yml' - '--storage.tsdb.path=/prometheus' diff --git a/mock/s3/deployments/observability/prometheus.yml b/mock/s3/deployments/observability/prometheus.yml index 2bcabb8..35fb014 100644 --- a/mock/s3/deployments/observability/prometheus.yml +++ b/mock/s3/deployments/observability/prometheus.yml @@ -5,6 +5,10 @@ global: cluster: mock-s3 environment: docker +# 告警规则文件 +rule_files: + - "/etc/prometheus/rules/*.yml" + scrape_configs: # Prometheus自身的指标 - job_name: 'prometheus' From 0b636a1a78cc9b83649d721d49976d6e1eb454c6 Mon Sep 17 00:00:00 2001 From: Ding Date: Thu, 25 Sep 2025 11:22:24 +0800 Subject: [PATCH 09/18] =?UTF-8?q?refactor(prometheus=5Fadapter):=20?= =?UTF-8?q?=E9=87=8D=E6=9E=84=E5=91=8A=E8=AD=A6=E8=A7=84=E5=88=99=E7=AE=A1?= =?UTF-8?q?=E7=90=86API=E5=92=8C=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 将watch_time字段从AlertRuleMeta移到AlertRule中 - 移除全量同步接口,改为增量更新方式 - 实现批量更新规则元信息的API - 重构服务层代码结构,提高可维护性 - 更新文档 --- docs/prometheus_adapter/README.md | 72 ++--- internal/prometheus_adapter/api/alert_api.go | 83 +++--- internal/prometheus_adapter/model/alert.go | 2 +- internal/prometheus_adapter/model/api.go | 17 +- .../service/alert_service.go | 249 ++++++++---------- 5 files changed, 182 insertions(+), 241 deletions(-) diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md index 0a92312..650b6f8 100644 --- a/docs/prometheus_adapter/README.md +++ b/docs/prometheus_adapter/README.md @@ -10,7 +10,7 @@ - 架构设计 - API 
参考 - 指标查询 - - 告警规则同步 + - 告警规则管理 - Alertmanager 集成 - 支持的服务 - 错误码 @@ -36,7 +36,7 @@ internal/prometheus_adapter/ ├── api/ # API 层,处理 HTTP 请求 │ ├── api.go # API 基础结构和初始化 │ ├── metric_api.go # 指标相关的 API 处理器 -│ └── alert_api.go # 告警规则同步 API 处理器 +│ └── alert_api.go # 告警规则管理 API 处理器 ├── service/ # 业务逻辑层 │ ├── metric_service.go # 指标查询服务实现 │ └── alert_service.go # 告警规则同步服务实现 @@ -137,42 +137,9 @@ internal/prometheus_adapter/ } ``` -### 告警规则同步 +### 告警规则管理 -#### 1. 全量同步规则 -- 方法与路径:`POST /v1/alert-rules/sync` -- 功能:接收监控告警模块发送的完整规则列表,生成 Prometheus 规则文件并触发重载(全量同步) -- 请求体示例: -```json -{ - "rules": [ - { - "name": "high_cpu_usage", - "description": "CPU使用率过高告警", - "expr": "system_cpu_usage_percent", - "op": ">", - "severity": "warning" - } - ], - "rule_metas": [ - { - "alert_name": "high_cpu_usage", // 与规则模板的name字段保持一致 - "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}", - "threshold": 90, - "watch_time": 300 - } - ] -} -``` -- 响应示例: -```json -{ - "status": "success", - "message": "Rules synced to Prometheus" -} -``` - -#### 2. 更新单个规则模板 +#### 1. 更新单个规则模板 - 方法与路径:`PUT /v1/alert-rules/:rule_name` - 功能:更新指定的告警规则模板,系统会自动查找所有使用该规则的元信息并重新生成 Prometheus 规则 - 路径参数: @@ -183,7 +150,8 @@ internal/prometheus_adapter/ "description": "CPU使用率异常告警(更新后)", "expr": "avg(system_cpu_usage_percent)", "op": ">=", - "severity": "critical" + "severity": "critical", + "watch_time": 300 } ``` - 响应示例: @@ -195,25 +163,33 @@ internal/prometheus_adapter/ } ``` -#### 3. 更新单个规则元信息 -- 方法与路径:`PUT /v1/alert-rules/meta` -- 功能:更新指定规则的元信息,系统会根据对应的规则模板重新生成 Prometheus 规则 +#### 2. 
批量更新规则元信息 +- 方法与路径:`PUT /v1/alert-rules-meta/:rule_name` +- 功能:批量更新指定规则的元信息,系统会根据对应的规则模板重新生成 Prometheus 规则 +- 路径参数: + - `rule_name`:规则名称(如 `high_cpu_usage`) - 请求体示例: ```json { - "rule_name": "high_cpu_usage", // 必填,对应规则模板的name - "labels": "{\"service\":\"storage-service\",\"version\":\"2.0.0\"}", // 必填,用于唯一标识 - "threshold": 85, - "watch_time": 600 + "metas": [ + { + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}", // 必填,用于唯一标识 + "threshold": 85 + }, + { + "labels": "{\"service\":\"storage-service\",\"version\":\"2.0.0\"}", // 必填,用于唯一标识 + "threshold": 90 + } + ] } ``` - 响应示例: ```json { "status": "success", - "message": "Rule meta updated and synced to Prometheus", + "message": "Rule metas updated and synced to Prometheus", "rule_name": "high_cpu_usage", - "labels": "{\"service\":\"storage-service\",\"version\":\"2.0.0\"}" + "updated_count": 2 } ``` @@ -232,11 +208,11 @@ internal/prometheus_adapter/ - `expr`:PromQL 表达式,如 `sum(apitime) by (service, version)`,可包含时间范围 - `op`:比较操作符(`>`, `<`, `=`, `!=`) - `severity`:告警等级,通常进入告警的 labels.severity + - `watch_time`:持续时间(秒),对应 Prometheus 的 `for` 字段 - **AlertRuleMeta(元信息)**: - `alert_name`:关联的规则名称(对应 alert_rules.name) - `labels`:JSON 格式的标签,用于筛选特定服务(如 `{"service":"s3","version":"v1"}`) - `threshold`:告警阈值 - - `watch_time`:持续时间(秒),对应 Prometheus 的 `for` 字段 #### 增量更新说明 - **增量更新**:新接口支持增量更新,只需传入需要修改的字段 diff --git a/internal/prometheus_adapter/api/alert_api.go b/internal/prometheus_adapter/api/alert_api.go index 3206087..8724803 100644 --- a/internal/prometheus_adapter/api/alert_api.go +++ b/internal/prometheus_adapter/api/alert_api.go @@ -10,32 +10,8 @@ import ( // setupAlertRouters 设置告警相关路由 func (api *Api) setupAlertRouters(router *fox.Engine) { - router.POST("/v1/alert-rules/sync", api.SyncRules) router.PUT("/v1/alert-rules/:rule_name", api.UpdateRule) - router.PUT("/v1/alert-rules/meta", api.UpdateRuleMeta) -} - -// SyncRules 同步规则到Prometheus -// 接收从监控告警模块发来的规则列表,生成Prometheus规则文件并重载配置 -func (api *Api) 
SyncRules(c *fox.Context) { - var req model.SyncRulesRequest - if err := c.ShouldBindJSON(&req); err != nil { - SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, - "Invalid request body: "+err.Error(), nil) - return - } - - err := api.alertService.SyncRulesToPrometheus(req.Rules, req.RuleMetas) - if err != nil { - SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, - "Failed to sync rules to Prometheus: "+err.Error(), nil) - return - } - - c.JSON(http.StatusOK, map[string]string{ - "status": "success", - "message": "Rules synced to Prometheus", - }) + router.PUT("/v1/alert-rules-meta/:rule_name", api.UpdateRuleMetas) } // UpdateRule 更新单个规则模板 @@ -62,6 +38,7 @@ func (api *Api) UpdateRule(c *fox.Context) { Expr: req.Expr, Op: req.Op, Severity: req.Severity, + WatchTime: req.WatchTime, } err := api.alertService.UpdateRule(rule) @@ -81,9 +58,16 @@ func (api *Api) UpdateRule(c *fox.Context) { }) } -// UpdateRuleMeta 更新单个规则元信息 -// 通过 alert_name + labels 唯一确定一个元信息记录 -func (api *Api) UpdateRuleMeta(c *fox.Context) { +// UpdateRuleMetas 批量更新规则元信息 +// 通过 rule_name + labels 唯一确定一个元信息记录 +func (api *Api) UpdateRuleMetas(c *fox.Context) { + ruleName := c.Param("rule_name") + if ruleName == "" { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "Rule name is required", nil) + return + } + var req model.UpdateAlertRuleMetaRequest if err := c.ShouldBindJSON(&req); err != nil { SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, @@ -91,32 +75,35 @@ func (api *Api) UpdateRuleMeta(c *fox.Context) { return } - // alert_name 和 labels 是必填的 - if req.AlertName == "" || req.Labels == "" { + if len(req.Metas) == 0 { SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, - "alert_name and labels are required", nil) + "At least one meta update is required", nil) return } - // 构建完整的元信息对象 - meta := model.AlertRuleMeta{ - AlertName: req.AlertName, - Labels: 
req.Labels, - Threshold: req.Threshold, - WatchTime: req.WatchTime, - } - - err := api.alertService.UpdateRuleMeta(meta) - if err != nil { - SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, - "Failed to update rule meta: "+err.Error(), nil) - return + // 批量更新元信息 + updatedCount := 0 + for _, metaUpdate := range req.Metas { + // 构建完整的元信息对象 + meta := model.AlertRuleMeta{ + AlertName: ruleName, + Labels: metaUpdate.Labels, + Threshold: metaUpdate.Threshold, + } + + err := api.alertService.UpdateRuleMeta(meta) + if err != nil { + SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, + fmt.Sprintf("Failed to update rule meta: %v", err), nil) + return + } + updatedCount++ } c.JSON(http.StatusOK, map[string]interface{}{ - "status": "success", - "message": "Rule meta updated and synced to Prometheus", - "alert_name": req.AlertName, - "labels": req.Labels, + "status": "success", + "message": "Rule metas updated and synced to Prometheus", + "rule_name": ruleName, + "updated_count": updatedCount, }) } diff --git a/internal/prometheus_adapter/model/alert.go b/internal/prometheus_adapter/model/alert.go index 566a143..e64a047 100644 --- a/internal/prometheus_adapter/model/alert.go +++ b/internal/prometheus_adapter/model/alert.go @@ -7,6 +7,7 @@ type AlertRule struct { Expr string `json:"expr" gorm:"type:text;not null"` // 左侧业务指标表达式,如 sum(apitime) by (service, version) Op string `json:"op" gorm:"type:varchar(4);not null"` // 阈值比较方式(>, <, =, !=) Severity string `json:"severity" gorm:"type:varchar(32);not null"` // 告警等级,通常进入告警的 labels.severity + WatchTime int `json:"watch_time"` // 持续时长(秒),映射 Prometheus rule 的 for 字段 } // AlertRuleMeta 告警规则元信息表 - 存储服务级别的告警配置 @@ -15,5 +16,4 @@ type AlertRuleMeta struct { AlertName string `json:"alert_name" gorm:"type:varchar(255);index"` // 关联 alert_rules.name Labels string `json:"labels" gorm:"type:jsonb"` // 适用标签,如 {"service":"s3","version":"v1"},为空表示全局 Threshold float64 
`json:"threshold"` // 阈值(会被渲染成特定规则的 threshold metric 数值) - WatchTime int `json:"watch_time"` // 持续时长(映射 Prometheus rule 的 for) } diff --git a/internal/prometheus_adapter/model/api.go b/internal/prometheus_adapter/model/api.go index 4dc8421..775bdd9 100644 --- a/internal/prometheus_adapter/model/api.go +++ b/internal/prometheus_adapter/model/api.go @@ -46,6 +46,7 @@ type UpdateAlertRuleRequest struct { Expr string `json:"expr,omitempty"` Op string `json:"op,omitempty" binding:"omitempty,oneof=> < = !="` Severity string `json:"severity,omitempty"` + WatchTime int `json:"watch_time,omitempty"` // 持续时长(秒) } // CreateAlertRuleMetaRequest 创建告警规则元信息请求 @@ -57,17 +58,13 @@ type CreateAlertRuleMetaRequest struct { MatchTime string `json:"match_time,omitempty"` } -// UpdateAlertRuleMetaRequest 更新告警规则元信息请求 +// UpdateAlertRuleMetaRequest 批量更新告警规则元信息请求 type UpdateAlertRuleMetaRequest struct { - AlertName string `json:"alert_name" binding:"required"` - Labels string `json:"labels" binding:"required"` - Threshold float64 `json:"threshold"` - WatchTime int `json:"watch_time"` + Metas []AlertRuleMetaUpdate `json:"metas" binding:"required"` } -// SyncRulesRequest 同步规则请求 -// 从监控告警模块发送过来的完整规则列表 -type SyncRulesRequest struct { - Rules []AlertRule `json:"rules"` // 告警规则列表 - RuleMetas []AlertRuleMeta `json:"rule_metas"` // 规则元信息列表 +// AlertRuleMetaUpdate 单个规则元信息更新项 +type AlertRuleMetaUpdate struct { + Labels string `json:"labels" binding:"required"` // 必填,用于唯一标识 + Threshold float64 `json:"threshold"` } diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go index f9fe908..cfae3f6 100644 --- a/internal/prometheus_adapter/service/alert_service.go +++ b/internal/prometheus_adapter/service/alert_service.go @@ -31,15 +31,88 @@ func NewAlertService(promClient *client.PrometheusClient) *AlertService { } } -// SyncRulesToPrometheus 同步规则到Prometheus -// 接收完整的规则列表,生成Prometheus规则文件并重载配置 -func (s *AlertService) 
SyncRulesToPrometheus(rules []model.AlertRule, ruleMetas []model.AlertRuleMeta) error { - // 保存到内存缓存 - s.currentRules = rules - s.currentRuleMetas = ruleMetas +// ========== 公开 API 方法 ========== +// UpdateRule 更新单个规则模板 +// 只更新传入的规则,其他规则和所有元信息保持不变 +func (s *AlertService) UpdateRule(rule model.AlertRule) error { + // 查找并更新规则 + found := false + for i, r := range s.currentRules { + if r.Name == rule.Name { + s.currentRules[i] = rule + found = true + break + } + } + + if !found { + // 如果规则不存在,添加新规则 + s.currentRules = append(s.currentRules, rule) + } + + // 统计受影响的元信息数量 + affectedCount := 0 + for _, meta := range s.currentRuleMetas { + if meta.AlertName == rule.Name { + affectedCount++ + } + } + + log.Info(). + Str("rule", rule.Name). + Int("affected_metas", affectedCount). + Msg("Updating rule and affected metas") + + // 使用更新后的规则重新生成并同步 + return s.regenerateAndSync() +} + +// UpdateRuleMeta 更新单个规则元信息 +// 通过 alert_name + labels 唯一确定一个元信息记录 +func (s *AlertService) UpdateRuleMeta(meta model.AlertRuleMeta) error { + // 查找并更新元信息 + found := false + for i, m := range s.currentRuleMetas { + // 通过 alert_name + labels 唯一确定 + if m.AlertName == meta.AlertName && m.Labels == meta.Labels { + s.currentRuleMetas[i] = meta + found = true + break + } + } + + if !found { + // 如果元信息不存在,添加新元信息 + s.currentRuleMetas = append(s.currentRuleMetas, meta) + } + + log.Info(). + Str("alert_name", meta.AlertName). + Str("labels", meta.Labels). 
+ Msg("Updating rule meta") + + // 使用更新后的元信息重新生成并同步 + return s.regenerateAndSync() +} + +// GetAffectedMetas 获取受影响的元信息数量 +func (s *AlertService) GetAffectedMetas(ruleName string) int { + count := 0 + for _, meta := range s.currentRuleMetas { + if meta.AlertName == ruleName { + count++ + } + } + return count +} + +// ========== 内部核心方法 ========== + +// regenerateAndSync 使用当前内存中的规则和元信息重新生成Prometheus规则并同步 +func (s *AlertService) regenerateAndSync() error { // 构建Prometheus规则文件 - prometheusRules := s.buildPrometheusRules(rules, ruleMetas) + prometheusRules := s.buildPrometheusRules(s.currentRules, s.currentRuleMetas) // 写入规则文件 if err := s.writeRulesFile(prometheusRules); err != nil { @@ -53,13 +126,15 @@ func (s *AlertService) SyncRulesToPrometheus(rules []model.AlertRule, ruleMetas } log.Info(). - Int("rules_count", len(rules)). - Int("metas_count", len(ruleMetas)). - Msg("Rules synced to Prometheus successfully") + Int("rules_count", len(s.currentRules)). + Int("metas_count", len(s.currentRuleMetas)). 
+ Msg("Rules regenerated and synced to Prometheus") return nil } +// ========== 规则构建相关方法 ========== + // buildPrometheusRules 构建Prometheus规则 func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas []model.AlertRuleMeta) *model.PrometheusRuleFile { promRules := []model.PrometheusRule{} @@ -114,8 +189,8 @@ func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas [ // 计算for字段 forDuration := "" - if meta.WatchTime > 0 { - forDuration = fmt.Sprintf("%ds", meta.WatchTime) + if rule.WatchTime > 0 { + forDuration = fmt.Sprintf("%ds", rule.WatchTime) } // 使用规则名作为 alert 名称,通过 labels 区分不同实例 @@ -182,7 +257,7 @@ func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertR if len(labelMatchers) > 0 { // 如果表达式包含{,说明已经有标签选择器 if strings.Contains(expr, "{") { - expr = strings.Replace(expr, "}", ","+strings.Join(labelMatchers, ",")+"}", 1) + expr = strings.Replace(expr, "}", ","+strings.Join(labelMatchers, ",")+"}}", 1) } else { // 在指标名后添加标签选择器 // 查找第一个非字母数字下划线的字符 @@ -210,6 +285,8 @@ func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertR return expr } +// ========== 文件操作相关方法 ========== + // writeRulesFile 写入规则文件 func (s *AlertService) writeRulesFile(rules *model.PrometheusRuleFile) error { // 序列化为YAML @@ -259,29 +336,6 @@ func (s *AlertService) writeRulesFile(rules *model.PrometheusRuleFile) error { return nil } -// reloadPrometheus 重新加载Prometheus配置 -func (s *AlertService) reloadPrometheus() error { - prometheusURL := os.Getenv("PROMETHEUS_ADDRESS") - if prometheusURL == "" { - prometheusURL = "http://10.210.10.33:9090" - } - - reloadURL := fmt.Sprintf("%s/-/reload", strings.TrimSuffix(prometheusURL, "/")) - - resp, err := http.Post(reloadURL, "text/plain", nil) - if err != nil { - return fmt.Errorf("failed to reload Prometheus: %w", err) - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - return fmt.Errorf("Prometheus reload failed with status: %d", resp.StatusCode) - } - 
- log.Info().Msg("Prometheus configuration reloaded") - return nil -} - // syncRuleFileToContainer 同步规则文件到容器 func (s *AlertService) syncRuleFileToContainer(filePath string) error { // 获取容器名称,默认为 mock-s3-prometheus @@ -318,6 +372,31 @@ func (s *AlertService) syncRuleFileToContainer(filePath string) error { return nil } +// ========== Prometheus 配置相关方法 ========== + +// reloadPrometheus 重新加载Prometheus配置 +func (s *AlertService) reloadPrometheus() error { + prometheusURL := os.Getenv("PROMETHEUS_ADDRESS") + if prometheusURL == "" { + prometheusURL = "http://10.210.10.33:9090" + } + + reloadURL := fmt.Sprintf("%s/-/reload", strings.TrimSuffix(prometheusURL, "/")) + + resp, err := http.Post(reloadURL, "text/plain", nil) + if err != nil { + return fmt.Errorf("failed to reload Prometheus: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("Prometheus reload failed with status: %d", resp.StatusCode) + } + + log.Info().Msg("Prometheus configuration reloaded") + return nil +} + // ensurePrometheusRuleConfig 确保 Prometheus 配置文件包含 rule_files 配置 func (s *AlertService) ensurePrometheusRuleConfig(containerName string) error { configPath := "/etc/prometheus/prometheus.yml" @@ -374,101 +453,3 @@ rule_files:\ log.Info().Msg("Prometheus restarted with new configuration") return nil } - -// UpdateRule 更新单个规则模板 -// 只更新传入的规则,其他规则和所有元信息保持不变 -func (s *AlertService) UpdateRule(rule model.AlertRule) error { - // 查找并更新规则 - found := false - for i, r := range s.currentRules { - if r.Name == rule.Name { - s.currentRules[i] = rule - found = true - break - } - } - - if !found { - // 如果规则不存在,添加新规则 - s.currentRules = append(s.currentRules, rule) - } - - // 统计受影响的元信息数量 - affectedCount := 0 - for _, meta := range s.currentRuleMetas { - if meta.AlertName == rule.Name { - affectedCount++ - } - } - - log.Info(). - Str("rule", rule.Name). - Int("affected_metas", affectedCount). 
- Msg("Updating rule and affected metas") - - // 使用更新后的规则重新生成并同步 - return s.regenerateAndSync() -} - -// UpdateRuleMeta 更新单个规则元信息 -// 通过 alert_name + labels 唯一确定一个元信息记录 -func (s *AlertService) UpdateRuleMeta(meta model.AlertRuleMeta) error { - // 查找并更新元信息 - found := false - for i, m := range s.currentRuleMetas { - // 通过 alert_name + labels 唯一确定 - if m.AlertName == meta.AlertName && m.Labels == meta.Labels { - s.currentRuleMetas[i] = meta - found = true - break - } - } - - if !found { - // 如果元信息不存在,添加新元信息 - s.currentRuleMetas = append(s.currentRuleMetas, meta) - } - - log.Info(). - Str("alert_name", meta.AlertName). - Str("labels", meta.Labels). - Msg("Updating rule meta") - - // 使用更新后的元信息重新生成并同步 - return s.regenerateAndSync() -} - -// regenerateAndSync 使用当前内存中的规则和元信息重新生成Prometheus规则并同步 -func (s *AlertService) regenerateAndSync() error { - // 构建Prometheus规则文件 - prometheusRules := s.buildPrometheusRules(s.currentRules, s.currentRuleMetas) - - // 写入规则文件 - if err := s.writeRulesFile(prometheusRules); err != nil { - return fmt.Errorf("failed to write rules file: %w", err) - } - - // 通知Prometheus重新加载配置 - if err := s.reloadPrometheus(); err != nil { - log.Warn().Err(err).Msg("Failed to reload Prometheus, rules file has been updated") - // 不返回错误,因为文件已经更新成功 - } - - log.Info(). - Int("rules_count", len(s.currentRules)). - Int("metas_count", len(s.currentRuleMetas)). 
- Msg("Rules regenerated and synced to Prometheus") - - return nil -} - -// GetAffectedMetas 获取受影响的元信息数量 -func (s *AlertService) GetAffectedMetas(ruleName string) int { - count := 0 - for _, meta := range s.currentRuleMetas { - if meta.AlertName == ruleName { - count++ - } - } - return count -} From 40d41e8c28802765a37b6ef70e4b1ba135a8fdbc Mon Sep 17 00:00:00 2001 From: dnj Date: Thu, 25 Sep 2025 14:53:17 +0800 Subject: [PATCH 10/18] =?UTF-8?q?feat(prometheus=5Fadapter):=20=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=E5=91=8A=E8=AD=A6=E8=A7=84=E5=88=99=E6=8C=81=E4=B9=85?= =?UTF-8?q?=E5=8C=96=E4=B8=8E=E4=BC=98=E9=9B=85=E5=85=B3=E9=97=AD=E5=8A=9F?= =?UTF-8?q?=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加告警规则本地文件持久化功能,支持启动时加载和关闭时保存规则 重构关闭逻辑实现优雅关闭,包括保存当前规则状态 更新构建和部署脚本以处理规则文件目录 修改测试脚本以适配新的增量更新接口 --- cmd/prometheus_adapter/main.go | 43 ++++- .../prometheus_adapter/rules/alert_rules.yml | 5 + internal/prometheus_adapter/server.go | 18 +- .../service/alert_service.go | 156 +++++++++++++++++- .../prometheus_adapter/test_alert_update.sh | 125 ++++++++------ scripts/prometheus_adapter/build.sh | 18 ++ scripts/prometheus_adapter/deploy.sh | 10 +- 7 files changed, 319 insertions(+), 56 deletions(-) create mode 100644 internal/prometheus_adapter/rules/alert_rules.yml diff --git a/cmd/prometheus_adapter/main.go b/cmd/prometheus_adapter/main.go index 9f45442..847516b 100644 --- a/cmd/prometheus_adapter/main.go +++ b/cmd/prometheus_adapter/main.go @@ -1,7 +1,11 @@ package main import ( + "context" "os" + "os/signal" + "syscall" + "time" "github.com/fox-gonic/fox" "github.com/qiniu/zeroops/internal/config" @@ -42,9 +46,40 @@ func main() { log.Fatal().Err(err).Msg("Failed to setup API routes") } - // 启动服务器 - log.Info().Msgf("Starting Prometheus Adapter on %s", cfg.Server.BindAddr) - if err := router.Run(cfg.Server.BindAddr); err != nil { - log.Fatal().Err(err).Msg("Failed to start server") + // 设置信号处理 + sigChan := make(chan os.Signal, 1) + 
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + + // 创建一个用于优雅关闭的context + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // 在goroutine中启动服务器 + serverErr := make(chan error, 1) + go func() { + log.Info().Msgf("Starting Prometheus Adapter on %s", cfg.Server.BindAddr) + if err := router.Run(cfg.Server.BindAddr); err != nil { + serverErr <- err + } + }() + + // 等待信号或服务器错误 + select { + case sig := <-sigChan: + log.Info().Msgf("Received signal %s, shutting down...", sig) + + // 创建超时context + shutdownCtx, shutdownCancel := context.WithTimeout(ctx, 10*time.Second) + defer shutdownCancel() + + // 调用adapter的Shutdown方法 + if err := adapter.Close(shutdownCtx); err != nil { + log.Error().Err(err).Msg("Error during shutdown") + } + + log.Info().Msg("Shutdown complete") + + case err := <-serverErr: + log.Fatal().Err(err).Msg("Server error") } } diff --git a/internal/prometheus_adapter/rules/alert_rules.yml b/internal/prometheus_adapter/rules/alert_rules.yml new file mode 100644 index 0000000..7dd73ca --- /dev/null +++ b/internal/prometheus_adapter/rules/alert_rules.yml @@ -0,0 +1,5 @@ +# Prometheus Alert Rules +# This file is managed by the Prometheus Adapter service +# It will be loaded on startup and saved on shutdown + +groups: [] \ No newline at end of file diff --git a/internal/prometheus_adapter/server.go b/internal/prometheus_adapter/server.go index d9fb2f4..35f5b08 100644 --- a/internal/prometheus_adapter/server.go +++ b/internal/prometheus_adapter/server.go @@ -1,6 +1,7 @@ package prometheusadapter import ( + "context" "fmt" "os" @@ -63,9 +64,18 @@ func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error { return nil } -// Close 关闭服务器 -func (s *PrometheusAdapterServer) Close() error { - // 当前没有需要关闭的资源 - log.Info().Msg("Prometheus Adapter server closed") +// Close 优雅关闭服务器 +func (s *PrometheusAdapterServer) Close(ctx context.Context) error { + log.Info().Msg("Starting shutdown...") + + // 调用 alertService 的 Shutdown 方法保存规则 + 
if s.alertService != nil { + if err := s.alertService.Shutdown(); err != nil { + log.Error().Err(err).Msg("Failed to shutdown alert service") + return err + } + } + + log.Info().Msg("Prometheus Adapter server shut down") return nil } diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go index cfae3f6..a254ae1 100644 --- a/internal/prometheus_adapter/service/alert_service.go +++ b/internal/prometheus_adapter/service/alert_service.go @@ -6,6 +6,8 @@ import ( "net/http" "os" "os/exec" + "path/filepath" + "strconv" "strings" "github.com/qiniu/zeroops/internal/prometheus_adapter/client" @@ -20,15 +22,167 @@ type AlertService struct { // 内存中缓存当前规则,用于增量更新 currentRules []model.AlertRule currentRuleMetas []model.AlertRuleMeta + // 本地规则文件路径 + localRulesPath string } // NewAlertService 创建告警服务 func NewAlertService(promClient *client.PrometheusClient) *AlertService { - return &AlertService{ + service := &AlertService{ promClient: promClient, currentRules: []model.AlertRule{}, currentRuleMetas: []model.AlertRuleMeta{}, + localRulesPath: "../rules/alert_rules.yml", } + + // 启动时尝试加载本地规则 + if err := service.LoadRulesFromFile(); err != nil { + log.Warn().Err(err).Msg("Failed to load rules from file, starting with empty rules") + } + + return service +} + +// ========== 持久化方法 ========== + +// LoadRulesFromFile 从本地文件加载规则 +func (s *AlertService) LoadRulesFromFile() error { + // 检查文件是否存在 + if _, err := os.Stat(s.localRulesPath); os.IsNotExist(err) { + log.Info().Str("path", s.localRulesPath).Msg("Local rules file does not exist, skipping load") + return nil + } + + // 读取文件内容 + data, err := os.ReadFile(s.localRulesPath) + if err != nil { + return fmt.Errorf("failed to read local rules file: %w", err) + } + + // 解析规则文件 + var rulesFile model.PrometheusRuleFile + if err := yaml.Unmarshal(data, &rulesFile); err != nil { + return fmt.Errorf("failed to parse rules file: %w", err) + } + + // 从Prometheus格式转换回内部格式 + s.currentRules 
= []model.AlertRule{} + s.currentRuleMetas = []model.AlertRuleMeta{} + + // 用于去重的map + ruleMap := make(map[string]*model.AlertRule) + + for _, group := range rulesFile.Groups { + for _, rule := range group.Rules { + // 提取基础规则信息 + ruleName := rule.Alert + + // 从annotations中获取description + description := "" + if desc, ok := rule.Annotations["description"]; ok { + description = desc + } + + // 从labels中获取severity + severity := "warning" + if sev, ok := rule.Labels["severity"]; ok { + severity = sev + delete(rule.Labels, "severity") // 移除severity,剩下的是meta的labels + } + + // 创建或更新规则模板 + if _, exists := ruleMap[ruleName]; !exists { + alertRule := model.AlertRule{ + Name: ruleName, + Description: description, + Expr: rule.Expr, + Severity: severity, + } + + // 解析For字段获取WatchTime + if rule.For != "" { + // 简单解析,假设格式为 "300s" 或 "5m" + if strings.HasSuffix(rule.For, "s") { + if seconds, err := strconv.Atoi(strings.TrimSuffix(rule.For, "s")); err == nil { + alertRule.WatchTime = seconds + } + } else if strings.HasSuffix(rule.For, "m") { + if minutes, err := strconv.Atoi(strings.TrimSuffix(rule.For, "m")); err == nil { + alertRule.WatchTime = minutes * 60 + } + } + } + + ruleMap[ruleName] = &alertRule + s.currentRules = append(s.currentRules, alertRule) + } + + // 创建元信息 + if len(rule.Labels) > 0 { + labelsJSON, _ := json.Marshal(rule.Labels) + meta := model.AlertRuleMeta{ + AlertName: ruleName, + Labels: string(labelsJSON), + } + + // 从表达式中提取threshold(简单实现) + // 假设表达式类似 "metric > 80" 或 "metric{labels} > 80" + parts := strings.Split(rule.Expr, " ") + if len(parts) >= 3 { + if threshold, err := strconv.ParseFloat(parts[len(parts)-1], 64); err == nil { + meta.Threshold = threshold + } + } + + s.currentRuleMetas = append(s.currentRuleMetas, meta) + } + } + } + + log.Info(). + Int("rules", len(s.currentRules)). + Int("metas", len(s.currentRuleMetas)). + Str("path", s.localRulesPath). 
+ Msg("Loaded rules from local file") + + return nil +} + +// SaveRulesToFile 保存规则到本地文件 +func (s *AlertService) SaveRulesToFile() error { + // 确保目录存在 + dir := filepath.Dir(s.localRulesPath) + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("failed to create rules directory: %w", err) + } + + // 构建Prometheus规则文件格式 + prometheusRules := s.buildPrometheusRules(s.currentRules, s.currentRuleMetas) + + // 序列化为YAML + data, err := yaml.Marshal(prometheusRules) + if err != nil { + return fmt.Errorf("failed to marshal rules: %w", err) + } + + // 写入文件 + if err := os.WriteFile(s.localRulesPath, data, 0644); err != nil { + return fmt.Errorf("failed to write rules file: %w", err) + } + + log.Info(). + Int("rules", len(s.currentRules)). + Int("metas", len(s.currentRuleMetas)). + Str("path", s.localRulesPath). + Msg("Saved rules to local file") + + return nil +} + +// Shutdown 优雅关闭,保存当前规则 +func (s *AlertService) Shutdown() error { + log.Info().Msg("Shutting down alert service, saving rules...") + return s.SaveRulesToFile() } // ========== 公开 API 方法 ========== diff --git a/internal/prometheus_adapter/test_alert_update.sh b/internal/prometheus_adapter/test_alert_update.sh index a8af0ea..4400b43 100755 --- a/internal/prometheus_adapter/test_alert_update.sh +++ b/internal/prometheus_adapter/test_alert_update.sh @@ -2,49 +2,69 @@ # 测试增量更新告警规则功能 -BASE_URL="http://localhost:8080" +BASE_URL="http://localhost:9999" echo "=== 测试增量更新告警规则 ===" -# 1. 先进行全量同步,创建初始规则 -echo -e "\n1. 全量同步规则..." -curl -X POST ${BASE_URL}/v1/alert-rules/sync \ +# 1. 初始化规则(使用增量更新接口) +echo -e "\n1. 创建初始规则..." 
+ +# 1.1 创建 high_cpu_usage 规则模板 +echo -e "\n1.1 创建规则模板: high_cpu_usage" +curl -X PUT ${BASE_URL}/v1/alert-rules/high_cpu_usage \ -H "Content-Type: application/json" \ -d '{ - "rules": [ - { - "name": "high_cpu_usage", - "description": "CPU使用率过高告警", - "expr": "system_cpu_usage_percent", - "op": ">", - "severity": "warning" - }, - { - "name": "high_memory_usage", - "description": "内存使用率过高告警", - "expr": "system_memory_usage_percent", - "op": ">", - "severity": "warning" - } - ], - "rule_metas": [ + "description": "CPU使用率过高告警", + "expr": "system_cpu_usage_percent", + "op": ">", + "severity": "warning", + "watch_time": 300 + }' | jq . + +sleep 1 + +# 1.2 创建 high_memory_usage 规则模板 +echo -e "\n1.2 创建规则模板: high_memory_usage" +curl -X PUT ${BASE_URL}/v1/alert-rules/high_memory_usage \ + -H "Content-Type: application/json" \ + -d '{ + "description": "内存使用率过高告警", + "expr": "system_memory_usage_percent", + "op": ">", + "severity": "warning", + "watch_time": 600 + }' | jq . + +sleep 1 + +# 1.3 设置 high_cpu_usage 规则的元信息 +echo -e "\n1.3 设置规则元信息: high_cpu_usage" +curl -X PUT ${BASE_URL}/v1/alert-rules-meta/high_cpu_usage \ + -H "Content-Type: application/json" \ + -d '{ + "metas": [ { - "alert_name": "high_cpu_usage", "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}", - "threshold": 80, - "watch_time": 300 + "threshold": 80 }, { - "alert_name": "high_cpu_usage", "labels": "{\"service\":\"metadata-service\",\"version\":\"1.0.0\"}", - "threshold": 85, - "watch_time": 300 - }, + "threshold": 85 + } + ] + }' | jq . + +sleep 1 + +# 1.4 设置 high_memory_usage 规则的元信息 +echo -e "\n1.4 设置规则元信息: high_memory_usage" +curl -X PUT ${BASE_URL}/v1/alert-rules-meta/high_memory_usage \ + -H "Content-Type: application/json" \ + -d '{ + "metas": [ { - "alert_name": "high_memory_usage", "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}", - "threshold": 90, - "watch_time": 600 + "threshold": 90 } ] }' | jq . 
@@ -59,33 +79,46 @@ curl -X PUT ${BASE_URL}/v1/alert-rules/high_cpu_usage \ "description": "CPU使用率异常告警(更新后)", "expr": "avg(system_cpu_usage_percent[5m])", "op": ">=", - "severity": "critical" + "severity": "critical", + "watch_time": 300 }' | jq . sleep 2 -# 3. 更新单个规则元信息 -echo -e "\n3. 更新规则元信息..." -curl -X PUT ${BASE_URL}/v1/alert-rules/meta \ +# 3. 批量更新规则元信息 +echo -e "\n3. 批量更新规则元信息(high_cpu_usage)..." +curl -X PUT ${BASE_URL}/v1/alert-rules-meta/high_cpu_usage \ -H "Content-Type: application/json" \ -d '{ - "alert_name": "high_cpu_usage", - "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}", - "threshold": 75, - "watch_time": 600 + "metas": [ + { + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}", + "threshold": 75 + }, + { + "labels": "{\"service\":\"metadata-service\",\"version\":\"1.0.0\"}", + "threshold": 88 + } + ] }' | jq . sleep 2 -# 4. 添加新的元信息 -echo -e "\n4. 添加新的元信息..." -curl -X PUT ${BASE_URL}/v1/alert-rules/meta \ +# 4. 批量更新规则元信息(添加新的服务) +echo -e "\n4. 批量更新规则元信息(high_memory_usage - 添加新服务)..." +curl -X PUT ${BASE_URL}/v1/alert-rules-meta/high_memory_usage \ -H "Content-Type: application/json" \ -d '{ - "alert_name": "high_memory_usage", - "labels": "{\"service\":\"queue-service\",\"version\":\"2.0.0\"}", - "threshold": 95, - "watch_time": 300 + "metas": [ + { + "labels": "{\"service\":\"queue-service\",\"version\":\"2.0.0\"}", + "threshold": 95 + }, + { + "labels": "{\"service\":\"third-party-service\",\"version\":\"1.0.0\"}", + "threshold": 92 + } + ] }' | jq . echo -e "\n=== 测试完成 ===" \ No newline at end of file diff --git a/scripts/prometheus_adapter/build.sh b/scripts/prometheus_adapter/build.sh index ec5c08a..dbcbcf1 100755 --- a/scripts/prometheus_adapter/build.sh +++ b/scripts/prometheus_adapter/build.sh @@ -52,6 +52,7 @@ log_info "创建构建目录..." mkdir -p "$BUILD_DIR/bin" mkdir -p "$BUILD_DIR/docs" mkdir -p "$BUILD_DIR/scripts" +mkdir -p "$BUILD_DIR/rules" # 编译二进制文件 log_info "编译 ${APP_NAME}..." 
@@ -79,6 +80,23 @@ if [ -f "internal/${APP_NAME}/test_alert_update.sh" ]; then chmod +x "$BUILD_DIR/scripts/test_alert_update.sh" fi +# 复制规则文件 +log_info "复制规则文件..." +if [ -d "internal/${APP_NAME}/rules" ]; then + cp -r "internal/${APP_NAME}/rules/"* "$BUILD_DIR/rules/" 2>/dev/null || true + log_info "已复制规则文件到 $BUILD_DIR/rules/" +else + # 如果没有规则文件夹,创建一个空的规则文件 + log_warn "未找到规则目录,创建默认规则文件..." + cat > "$BUILD_DIR/rules/alert_rules.yml" << 'RULES_EOF' +# Prometheus Alert Rules +# This file is managed by the Prometheus Adapter service +# It will be loaded on startup and saved on shutdown + +groups: [] +RULES_EOF +fi + # 创建启动脚本 log_info "创建启动脚本..." cat > "$BUILD_DIR/start.sh" << 'EOF' diff --git a/scripts/prometheus_adapter/deploy.sh b/scripts/prometheus_adapter/deploy.sh index 85bddf7..01ccdf8 100755 --- a/scripts/prometheus_adapter/deploy.sh +++ b/scripts/prometheus_adapter/deploy.sh @@ -235,11 +235,19 @@ if [ -w "$DEPLOY_DIR" ]; then chmod +x "$DEPLOY_DIR/start.sh" chmod +x "$DEPLOY_DIR/stop.sh" [ -f "$DEPLOY_DIR/scripts/test_alert_update.sh" ] && chmod +x "$DEPLOY_DIR/scripts/test_alert_update.sh" + # 确保 rules 目录可写 + chmod 755 "$DEPLOY_DIR/rules" + [ -f "$DEPLOY_DIR/rules/alert_rules.yml" ] && chmod 644 "$DEPLOY_DIR/rules/alert_rules.yml" else sudo chmod +x "$DEPLOY_DIR/bin/prometheus_adapter" sudo chmod +x "$DEPLOY_DIR/start.sh" sudo chmod +x "$DEPLOY_DIR/stop.sh" [ -f "$DEPLOY_DIR/scripts/test_alert_update.sh" ] && sudo chmod +x "$DEPLOY_DIR/scripts/test_alert_update.sh" + # 确保 rules 目录可写 + sudo chmod 755 "$DEPLOY_DIR/rules" + [ -f "$DEPLOY_DIR/rules/alert_rules.yml" ] && sudo chmod 644 "$DEPLOY_DIR/rules/alert_rules.yml" + # 设置 rules 目录的所有者为服务运行用户 + sudo chown -R qboxserver:qboxserver "$DEPLOY_DIR/rules" fi # 清理临时目录 @@ -277,7 +285,7 @@ After=network.target Type=simple User=qboxserver Group=qboxserver -WorkingDirectory=$DEPLOY_DIR +WorkingDirectory=$DEPLOY_DIR/bin Environment="PROMETHEUS_URL=http://localhost:9090" Environment="PORT=8080" 
Environment="LOG_LEVEL=info" From 271763ea277900efbfe93b74c803cbf33687bf77 Mon Sep 17 00:00:00 2001 From: dnj Date: Thu, 25 Sep 2025 16:15:30 +0800 Subject: [PATCH 11/18] =?UTF-8?q?feat(prometheus):=20=E5=AE=9E=E7=8E=B0?= =?UTF-8?q?=E4=B8=BB=E5=8A=A8=E6=8B=89=E5=8F=96=E5=91=8A=E8=AD=A6=E7=9A=84?= =?UTF-8?q?webhook=E6=9C=8D=E5=8A=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加prometheus_adapter.yml配置文件支持 重构alert_service使用配置而非环境变量 新增alert_webhook_service实现告警轮询推送 更新build.sh和deploy.sh支持配置文件部署 更新README文档说明新的webhook架构 --- docs/prometheus_adapter/README.md | 63 +++-- .../client/prometheus_client.go | 44 ++- internal/prometheus_adapter/config/config.go | 165 ++++++++++++ .../config/prometheus_adapter.yml | 27 ++ .../model/prometheus_alert.go | 49 ++++ internal/prometheus_adapter/server.go | 52 ++-- .../service/alert_service.go | 24 +- .../service/alert_webhook_service.go | 255 ++++++++++++++++++ scripts/prometheus_adapter/build.sh | 41 ++- scripts/prometheus_adapter/deploy.sh | 18 +- 10 files changed, 665 insertions(+), 73 deletions(-) create mode 100644 internal/prometheus_adapter/config/config.go create mode 100644 internal/prometheus_adapter/config/prometheus_adapter.yml create mode 100644 internal/prometheus_adapter/model/prometheus_alert.go create mode 100644 internal/prometheus_adapter/service/alert_webhook_service.go diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md index 650b6f8..0b9d5ca 100644 --- a/docs/prometheus_adapter/README.md +++ b/docs/prometheus_adapter/README.md @@ -221,30 +221,51 @@ internal/prometheus_adapter/ - 更新元信息时,系统根据 `alert_name` + `labels` 查找并更新对应的元信息 - **缓存机制**:系统在内存中缓存当前的规则和元信息,支持快速增量更新 -## Alertmanager 集成 +## 告警接收 Webhook -- 目标:将 Prometheus 触发的告警通过 Alertmanager 转发到监控告警模块 -- `alertmanager.yml` 配置示例: -```yaml -global: - resolve_timeout: 5m +- 目标:实现自定义 webhook 服务,主动从 Prometheus 拉取告警并转发到监控告警模块 +- 实现方式: + - 通过 Prometheus Alerts API 获取告警 + - 定期轮询 Prometheus 的 
`/api/v1/alerts` 端点 + - 将获取的告警格式化后 POST 到监控告警模块 -route: - group_by: ['alertname', 'cluster', 'service'] - group_wait: 10s - group_interval: 10s - repeat_interval: 1h - receiver: 'zeroops-alert-webhook' - -receivers: - - name: 'zeroops-alert-webhook' - webhook_configs: - - url: 'http://alert-module:8080/v1/integrations/alertmanager/webhook' - send_resolved: true +### Webhook 服务架构 +``` +┌─────────────────┐ +│ Prometheus │ +│ (告警规则引擎) │ +└────────┬────────┘ + │ Pull (轮询) + │ GET /api/v1/alerts + ▼ +┌─────────────────┐ +│ Alert Webhook │ +│ (自定义服务) │ +└────────┬────────┘ + │ Push + │ POST /v1/integrations/prometheus/alerts + ▼ +┌─────────────────┐ +│ 监控告警模块 │ +│ (告警处理中心) │ +└─────────────────┘ ``` -- 说明: - - `url`:监控告警模块的 webhook 地址(按实际部署修改主机与端口) - - `send_resolved`:为 `true` 时,告警恢复也会通知 + +### 实现细节 +- **轮询机制**: + - 每 10 秒从 Prometheus 拉取一次活跃告警 + - 通过 `GET http://prometheus:9090/api/v1/alerts` 获取告警列表 + - 维护告警状态缓存,避免重复推送 + +- **告警格式转换**: + - 将 Prometheus 告警格式转换为监控告警模块所需格式 + - 包含告警名称、标签、严重程度、开始时间等信息 + - 支持告警恢复状态通知 + +- **推送目标**: + - URL: `http://alert-module:8080/v1/integrations/prometheus/alerts` + - Method: POST + - Content-Type: application/json ## 支持的服务 diff --git a/internal/prometheus_adapter/client/prometheus_client.go b/internal/prometheus_adapter/client/prometheus_client.go index 7bf0a3a..a42b58b 100644 --- a/internal/prometheus_adapter/client/prometheus_client.go +++ b/internal/prometheus_adapter/client/prometheus_client.go @@ -2,18 +2,24 @@ package client import ( "context" + "encoding/json" "fmt" + "io" + "net/http" "time" "github.com/prometheus/client_golang/api" v1 "github.com/prometheus/client_golang/api/prometheus/v1" promModel "github.com/prometheus/common/model" "github.com/qiniu/zeroops/internal/prometheus_adapter/model" + "github.com/rs/zerolog/log" ) // PrometheusClient Prometheus 客户端 type PrometheusClient struct { - api v1.API + api v1.API + httpClient *http.Client + baseURL string } // NewPrometheusClient 创建新的 Prometheus 客户端 @@ -26,7 +32,9 @@ func 
NewPrometheusClient(address string) (*PrometheusClient, error) { } return &PrometheusClient{ - api: v1.NewAPI(client), + api: v1.NewAPI(client), + httpClient: &http.Client{Timeout: 10 * time.Second}, + baseURL: address, }, nil } @@ -142,3 +150,35 @@ func BuildQuery(service, metric, version string) string { query += "}" return query } + +// GetAlerts 获取 Prometheus 当前的告警 +func (c *PrometheusClient) GetAlerts(ctx context.Context) (*model.PrometheusAlertsResponse, error) { + url := fmt.Sprintf("%s/api/v1/alerts", c.baseURL) + + req, err := http.NewRequestWithContext(ctx, "GET", url, nil) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to query alerts: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("prometheus returned status %d: %s", resp.StatusCode, string(body)) + } + + var alertsResp model.PrometheusAlertsResponse + if err := json.NewDecoder(resp.Body).Decode(&alertsResp); err != nil { + return nil, fmt.Errorf("failed to decode response: %w", err) + } + + log.Debug(). + Int("alert_count", len(alertsResp.Data.Alerts)). 
+ Msg("Retrieved alerts from Prometheus") + + return &alertsResp, nil +} diff --git a/internal/prometheus_adapter/config/config.go b/internal/prometheus_adapter/config/config.go new file mode 100644 index 0000000..38f6998 --- /dev/null +++ b/internal/prometheus_adapter/config/config.go @@ -0,0 +1,165 @@ +package config + +import ( + "fmt" + "os" + "time" + + "github.com/rs/zerolog/log" + "gopkg.in/yaml.v3" +) + +// PrometheusAdapterConfig Prometheus Adapter 配置 +type PrometheusAdapterConfig struct { + Prometheus PrometheusConfig `yaml:"prometheus"` + AlertWebhook AlertWebhookConfig `yaml:"alert_webhook"` + AlertRules AlertRulesConfig `yaml:"alert_rules"` + Server ServerConfig `yaml:"server"` +} + +// PrometheusConfig Prometheus 服务配置 +type PrometheusConfig struct { + Address string `yaml:"address"` // Prometheus 地址 + ContainerName string `yaml:"container_name"` // 容器名称 +} + +// AlertWebhookConfig 告警 Webhook 配置 +type AlertWebhookConfig struct { + URL string `yaml:"url"` // Webhook URL + PollingInterval string `yaml:"polling_interval"` // 轮询间隔 +} + +// AlertRulesConfig 告警规则配置 +type AlertRulesConfig struct { + LocalFile string `yaml:"local_file"` // 本地规则文件 + PrometheusRulesDir string `yaml:"prometheus_rules_dir"` // Prometheus 规则目录 +} + +// ServerConfig 服务器配置 +type ServerConfig struct { + BindAddr string `yaml:"bind_addr"` // 监听地址 +} + +// LoadConfig 加载配置文件 +func LoadConfig(configPath string) (*PrometheusAdapterConfig, error) { + // 如果没有指定配置文件,使用默认路径 + if configPath == "" { + configPath = "internal/prometheus_adapter/config/prometheus_adapter.yml" + } + + // 读取配置文件 + data, err := os.ReadFile(configPath) + if err != nil { + // 如果文件不存在,返回默认配置 + if os.IsNotExist(err) { + log.Warn().Msg("Config file not found, using default configuration") + return getDefaultConfig(), nil + } + return nil, fmt.Errorf("failed to read config file: %w", err) + } + + // 解析配置 + var config PrometheusAdapterConfig + if err := yaml.Unmarshal(data, &config); err != nil { + return nil, 
fmt.Errorf("failed to parse config file: %w", err) + } + + // 应用环境变量覆盖 + applyEnvOverrides(&config) + + // 验证配置 + if err := validateConfig(&config); err != nil { + return nil, fmt.Errorf("invalid configuration: %w", err) + } + + log.Info(). + Str("config_file", configPath). + Msg("Configuration loaded successfully") + + return &config, nil +} + +// getDefaultConfig 获取默认配置 +func getDefaultConfig() *PrometheusAdapterConfig { + return &PrometheusAdapterConfig{ + Prometheus: PrometheusConfig{ + Address: "http://10.210.10.33:9090", + ContainerName: "mock-s3-prometheus", + }, + AlertWebhook: AlertWebhookConfig{ + URL: "http://alert-module:8080/v1/integrations/prometheus/alerts", + PollingInterval: "10s", + }, + AlertRules: AlertRulesConfig{ + LocalFile: "../rules/alert_rules.yml", + PrometheusRulesDir: "/etc/prometheus/rules/", + }, + Server: ServerConfig{ + BindAddr: "0.0.0.0:9999", + }, + } +} + +// applyEnvOverrides 应用环境变量覆盖 +func applyEnvOverrides(config *PrometheusAdapterConfig) { + // Prometheus 配置 + if addr := os.Getenv("PROMETHEUS_ADDRESS"); addr != "" { + config.Prometheus.Address = addr + } + if container := os.Getenv("PROMETHEUS_CONTAINER"); container != "" { + config.Prometheus.ContainerName = container + } + + // Alert Webhook 配置 + if url := os.Getenv("ALERT_WEBHOOK_URL"); url != "" { + config.AlertWebhook.URL = url + } + if interval := os.Getenv("ALERT_POLLING_INTERVAL"); interval != "" { + config.AlertWebhook.PollingInterval = interval + } + + // Server 配置 + if bindAddr := os.Getenv("SERVER_BIND_ADDR"); bindAddr != "" { + config.Server.BindAddr = bindAddr + } +} + +// validateConfig 验证配置 +func validateConfig(config *PrometheusAdapterConfig) error { + // 验证 Prometheus 地址 + if config.Prometheus.Address == "" { + return fmt.Errorf("prometheus address is required") + } + + // 验证轮询间隔 + if config.AlertWebhook.PollingInterval != "" { + if _, err := time.ParseDuration(config.AlertWebhook.PollingInterval); err != nil { + return fmt.Errorf("invalid polling interval: 
%w", err) + } + } + + // 验证服务器地址 + if config.Server.BindAddr == "" { + return fmt.Errorf("server bind address is required") + } + + return nil +} + +// GetPollingInterval 获取轮询间隔的 Duration +func (c *AlertWebhookConfig) GetPollingInterval() time.Duration { + if c.PollingInterval == "" { + return 10 * time.Second + } + + duration, err := time.ParseDuration(c.PollingInterval) + if err != nil { + log.Warn(). + Err(err). + Str("interval", c.PollingInterval). + Msg("Invalid polling interval, using default") + return 10 * time.Second + } + + return duration +} diff --git a/internal/prometheus_adapter/config/prometheus_adapter.yml b/internal/prometheus_adapter/config/prometheus_adapter.yml new file mode 100644 index 0000000..a3eab43 --- /dev/null +++ b/internal/prometheus_adapter/config/prometheus_adapter.yml @@ -0,0 +1,27 @@ +# Prometheus Adapter 配置文件 + +# Prometheus 服务配置 +prometheus: + # Prometheus 服务地址 + address: "http://10.210.10.33:9090" + # 容器名称(用于规则同步) + container_name: "mock-s3-prometheus" + +# 告警 Webhook 服务配置 +alert_webhook: + # 监控告警模块地址 + url: "http://alert-module:8080/v1/integrations/prometheus/alerts" + # 轮询间隔 + polling_interval: "10s" + +# 告警规则管理配置 +alert_rules: + # 本地规则文件路径 + local_file: "../rules/alert_rules.yml" + # Prometheus 规则目录 + prometheus_rules_dir: "/etc/prometheus/rules/" + +# 服务器配置 +server: + # 服务监听地址 + bind_addr: "0.0.0.0:9999" \ No newline at end of file diff --git a/internal/prometheus_adapter/model/prometheus_alert.go b/internal/prometheus_adapter/model/prometheus_alert.go new file mode 100644 index 0000000..9dab331 --- /dev/null +++ b/internal/prometheus_adapter/model/prometheus_alert.go @@ -0,0 +1,49 @@ +package model + +import ( + "time" +) + +// PrometheusAlert Prometheus 告警 API 响应结构 +type PrometheusAlert struct { + Labels map[string]string `json:"labels"` + Annotations map[string]string `json:"annotations"` + State string `json:"state"` // pending, firing + ActiveAt time.Time `json:"activeAt"` + Value string `json:"value"` // 触发告警时的值 +} + 
+// PrometheusAlertsResponse Prometheus /api/v1/alerts 响应 +type PrometheusAlertsResponse struct { + Status string `json:"status"` + Data struct { + Alerts []PrometheusAlert `json:"alerts"` + } `json:"data"` +} + +// AlertmanagerWebhookAlert 单个告警 +type AlertmanagerWebhookAlert struct { + Status string `json:"status"` // "firing" or "resolved" + Labels map[string]string `json:"labels"` // 包含 alertname, service, severity, idc, service_version 等 + Annotations map[string]string `json:"annotations"` // 包含 summary, description + StartsAt string `json:"startsAt"` // RFC3339 格式时间 + EndsAt string `json:"endsAt"` // RFC3339 格式时间 + GeneratorURL string `json:"generatorURL"` // Prometheus 查询链接 + Fingerprint string `json:"fingerprint"` // 告警唯一标识 +} + +// AlertmanagerWebhookRequest 发送到监控告警模块的请求格式 +type AlertmanagerWebhookRequest struct { + Receiver string `json:"receiver"` // "our-webhook" + Status string `json:"status"` // "firing" or "resolved" + Alerts []AlertmanagerWebhookAlert `json:"alerts"` + GroupLabels map[string]string `json:"groupLabels"` // 分组标签 + CommonLabels map[string]string `json:"commonLabels"` // 公共标签 + Version string `json:"version"` // "4" +} + +// AlertWebhookResponse 告警推送响应 +type AlertWebhookResponse struct { + Status string `json:"status"` + Message string `json:"message"` +} diff --git a/internal/prometheus_adapter/server.go b/internal/prometheus_adapter/server.go index 35f5b08..81a2824 100644 --- a/internal/prometheus_adapter/server.go +++ b/internal/prometheus_adapter/server.go @@ -3,35 +3,37 @@ package prometheusadapter import ( "context" "fmt" - "os" "github.com/fox-gonic/fox" "github.com/qiniu/zeroops/internal/config" "github.com/qiniu/zeroops/internal/prometheus_adapter/api" "github.com/qiniu/zeroops/internal/prometheus_adapter/client" + promconfig "github.com/qiniu/zeroops/internal/prometheus_adapter/config" "github.com/qiniu/zeroops/internal/prometheus_adapter/service" "github.com/rs/zerolog/log" ) // PrometheusAdapterServer Prometheus Adapter 服务器 
type PrometheusAdapterServer struct { - config *config.Config - promClient *client.PrometheusClient - metricService *service.MetricService - alertService *service.AlertService - api *api.Api + config *config.Config + promConfig *promconfig.PrometheusAdapterConfig + promClient *client.PrometheusClient + metricService *service.MetricService + alertService *service.AlertService + alertWebhookService *service.AlertWebhookService + api *api.Api } // NewPrometheusAdapterServer 创建新的 Prometheus Adapter 服务器 func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, error) { - // 使用环境变量或默认值获取 Prometheus 地址 - prometheusAddr := os.Getenv("PROMETHEUS_ADDRESS") - if prometheusAddr == "" { - prometheusAddr = "http://10.210.10.33:9090/" + // 加载 Prometheus Adapter 配置 + promConfig, err := promconfig.LoadConfig("") + if err != nil { + return nil, fmt.Errorf("failed to load prometheus adapter config: %w", err) } // 创建 Prometheus 客户端 - promClient, err := client.NewPrometheusClient(prometheusAddr) + promClient, err := client.NewPrometheusClient(promConfig.Prometheus.Address) if err != nil { return nil, fmt.Errorf("failed to create prometheus client: %w", err) } @@ -40,16 +42,27 @@ func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, e metricService := service.NewMetricService(promClient) // 创建告警服务 - alertService := service.NewAlertService(promClient) + alertService := service.NewAlertService(promClient, promConfig) + + // 创建告警 Webhook 服务 + alertWebhookService := service.NewAlertWebhookService(promClient, promConfig) server := &PrometheusAdapterServer{ - config: cfg, - promClient: promClient, - metricService: metricService, - alertService: alertService, + config: cfg, + promConfig: promConfig, + promClient: promClient, + metricService: metricService, + alertService: alertService, + alertWebhookService: alertWebhookService, + } + + // 启动告警 Webhook 服务 + if err := alertWebhookService.Start(); err != nil { + log.Error().Err(err).Msg("Failed to 
start alert webhook service") + // 不返回错误,允许服务继续运行 } - log.Info().Str("prometheus_address", prometheusAddr).Msg("Prometheus Adapter initialized successfully") + log.Info().Str("prometheus_address", promConfig.Prometheus.Address).Msg("Prometheus Adapter initialized successfully") return server, nil } @@ -68,6 +81,11 @@ func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error { func (s *PrometheusAdapterServer) Close(ctx context.Context) error { log.Info().Msg("Starting shutdown...") + // 停止告警 Webhook 服务 + if s.alertWebhookService != nil { + s.alertWebhookService.Stop() + } + // 调用 alertService 的 Shutdown 方法保存规则 if s.alertService != nil { if err := s.alertService.Shutdown(); err != nil { diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go index a254ae1..f74d4a3 100644 --- a/internal/prometheus_adapter/service/alert_service.go +++ b/internal/prometheus_adapter/service/alert_service.go @@ -11,6 +11,7 @@ import ( "strings" "github.com/qiniu/zeroops/internal/prometheus_adapter/client" + promconfig "github.com/qiniu/zeroops/internal/prometheus_adapter/config" "github.com/qiniu/zeroops/internal/prometheus_adapter/model" "github.com/rs/zerolog/log" "gopkg.in/yaml.v3" @@ -19,6 +20,7 @@ import ( // AlertService 告警服务 - 仅负责与Prometheus交互,不存储规则 type AlertService struct { promClient *client.PrometheusClient + config *promconfig.PrometheusAdapterConfig // 内存中缓存当前规则,用于增量更新 currentRules []model.AlertRule currentRuleMetas []model.AlertRuleMeta @@ -27,12 +29,13 @@ type AlertService struct { } // NewAlertService 创建告警服务 -func NewAlertService(promClient *client.PrometheusClient) *AlertService { +func NewAlertService(promClient *client.PrometheusClient, config *promconfig.PrometheusAdapterConfig) *AlertService { service := &AlertService{ promClient: promClient, + config: config, currentRules: []model.AlertRule{}, currentRuleMetas: []model.AlertRuleMeta{}, - localRulesPath: "../rules/alert_rules.yml", + 
localRulesPath: config.AlertRules.LocalFile, } // 启动时尝试加载本地规则 @@ -301,12 +304,9 @@ func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas [ // 为每个元信息生成Prometheus规则 for _, meta := range ruleMetas { - // 查找对应的规则模板 - var rule *model.AlertRule - // 通过 alert_name 直接查找对应的规则模板 // AlertRuleMeta.alert_name 关联 AlertRule.name - rule = ruleMap[meta.AlertName] + var rule *model.AlertRule = ruleMap[meta.AlertName] if rule == nil { log.Warn(). @@ -450,10 +450,7 @@ func (s *AlertService) writeRulesFile(rules *model.PrometheusRuleFile) error { } // 获取容器名称 - containerName := os.Getenv("PROMETHEUS_CONTAINER") - if containerName == "" { - containerName = "mock-s3-prometheus" - } + containerName := s.config.Prometheus.ContainerName // 直接写入到容器内的规则目录 // 使用docker exec和echo命令写入文件 @@ -530,10 +527,7 @@ func (s *AlertService) syncRuleFileToContainer(filePath string) error { // reloadPrometheus 重新加载Prometheus配置 func (s *AlertService) reloadPrometheus() error { - prometheusURL := os.Getenv("PROMETHEUS_ADDRESS") - if prometheusURL == "" { - prometheusURL = "http://10.210.10.33:9090" - } + prometheusURL := s.config.Prometheus.Address reloadURL := fmt.Sprintf("%s/-/reload", strings.TrimSuffix(prometheusURL, "/")) @@ -544,7 +538,7 @@ func (s *AlertService) reloadPrometheus() error { defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - return fmt.Errorf("Prometheus reload failed with status: %d", resp.StatusCode) + return fmt.Errorf("prometheus reload failed with status: %d", resp.StatusCode) } log.Info().Msg("Prometheus configuration reloaded") diff --git a/internal/prometheus_adapter/service/alert_webhook_service.go b/internal/prometheus_adapter/service/alert_webhook_service.go new file mode 100644 index 0000000..ac211ac --- /dev/null +++ b/internal/prometheus_adapter/service/alert_webhook_service.go @@ -0,0 +1,255 @@ +package service + +import ( + "bytes" + "context" + "crypto/md5" + "encoding/json" + "fmt" + "io" + "net/http" + "sync" + "time" + + 
"github.com/qiniu/zeroops/internal/prometheus_adapter/client" + promconfig "github.com/qiniu/zeroops/internal/prometheus_adapter/config" + "github.com/qiniu/zeroops/internal/prometheus_adapter/model" + "github.com/rs/zerolog/log" +) + +// AlertWebhookService 告警 Webhook 服务 +type AlertWebhookService struct { + promClient *client.PrometheusClient + config *promconfig.PrometheusAdapterConfig + webhookURL string + pollingInterval time.Duration + httpClient *http.Client + alertCache map[string]*model.PrometheusAlert // 缓存已发送的告警 + cacheMutex sync.RWMutex + stopCh chan struct{} + running bool + runningMutex sync.Mutex +} + +// NewAlertWebhookService 创建告警 Webhook 服务 +func NewAlertWebhookService(promClient *client.PrometheusClient, config *promconfig.PrometheusAdapterConfig) *AlertWebhookService { + return &AlertWebhookService{ + promClient: promClient, + config: config, + webhookURL: config.AlertWebhook.URL, + pollingInterval: config.AlertWebhook.GetPollingInterval(), + httpClient: &http.Client{Timeout: 30 * time.Second}, + alertCache: make(map[string]*model.PrometheusAlert), + stopCh: make(chan struct{}), + } +} + +// Start 启动告警轮询服务 +func (s *AlertWebhookService) Start() error { + s.runningMutex.Lock() + defer s.runningMutex.Unlock() + + if s.running { + return fmt.Errorf("alert webhook service already running") + } + + s.running = true + go s.pollAlerts() + + log.Info(). + Str("webhook_url", s.webhookURL). + Dur("interval", s.pollingInterval). 
+ Msg("Alert webhook service started") + + return nil +} + +// Stop 停止告警轮询服务 +func (s *AlertWebhookService) Stop() { + s.runningMutex.Lock() + defer s.runningMutex.Unlock() + + if !s.running { + return + } + + close(s.stopCh) + s.running = false + + log.Info().Msg("Alert webhook service stopped") +} + +// pollAlerts 轮询告警 +func (s *AlertWebhookService) pollAlerts() { + ticker := time.NewTicker(s.pollingInterval) + defer ticker.Stop() + + // 立即执行一次 + s.fetchAndProcessAlerts() + + for { + select { + case <-ticker.C: + s.fetchAndProcessAlerts() + case <-s.stopCh: + return + } + } +} + +// fetchAndProcessAlerts 获取并处理告警 +func (s *AlertWebhookService) fetchAndProcessAlerts() { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + // 从 Prometheus 获取告警 + alertsResp, err := s.promClient.GetAlerts(ctx) + if err != nil { + log.Error().Err(err).Msg("Failed to fetch alerts from Prometheus") + return + } + + // 处理告警 + firingAlerts := []model.PrometheusAlert{} + resolvedAlerts := []model.PrometheusAlert{} + + s.cacheMutex.Lock() + defer s.cacheMutex.Unlock() + + // 分类告警 + currentAlerts := make(map[string]*model.PrometheusAlert) + for _, alert := range alertsResp.Data.Alerts { + fingerprint := s.generateFingerprint(alert) + currentAlerts[fingerprint] = &alert + + // 检查是否是新告警或状态变更 + cachedAlert, exists := s.alertCache[fingerprint] + if !exists || cachedAlert.State != alert.State { + if alert.State == "firing" { + firingAlerts = append(firingAlerts, alert) + } + } + } + + // 检查已恢复的告警 + for fingerprint, cachedAlert := range s.alertCache { + if _, exists := currentAlerts[fingerprint]; !exists { + // 告警已恢复 + resolvedAlert := *cachedAlert + resolvedAlert.State = "resolved" + resolvedAlerts = append(resolvedAlerts, resolvedAlert) + } + } + + // 更新缓存 + s.alertCache = currentAlerts + + // 发送告警 + if len(firingAlerts) > 0 { + if err := s.sendAlerts(firingAlerts, "firing"); err != nil { + log.Error().Err(err).Msg("Failed to send firing alerts") + } + } 
+ + if len(resolvedAlerts) > 0 { + if err := s.sendAlerts(resolvedAlerts, "resolved"); err != nil { + log.Error().Err(err).Msg("Failed to send resolved alerts") + } + } +} + +// sendAlerts 发送告警到监控模块 +func (s *AlertWebhookService) sendAlerts(alerts []model.PrometheusAlert, status string) error { + webhookAlerts := []model.AlertmanagerWebhookAlert{} + + // 收集所有标签用于 groupLabels 和 commonLabels + commonLabels := map[string]string{} + firstAlert := true + + for _, alert := range alerts { + // 生成 fingerprint + fingerprint := s.generateFingerprint(alert) + + // 转换时间格式 + startsAt := alert.ActiveAt.Format(time.RFC3339) + endsAt := "0001-01-01T00:00:00Z" + if status == "resolved" { + endsAt = time.Now().Format(time.RFC3339) + } + + // 构造 GeneratorURL + generatorURL := fmt.Sprintf("http://prometheus/graph?g0.expr=%s", alert.Labels["alertname"]) + + webhookAlert := model.AlertmanagerWebhookAlert{ + Status: status, + Labels: alert.Labels, + Annotations: alert.Annotations, + StartsAt: startsAt, + EndsAt: endsAt, + GeneratorURL: generatorURL, + Fingerprint: fingerprint, + } + webhookAlerts = append(webhookAlerts, webhookAlert) + + // 收集公共标签(取第一个告警的标签作为公共标签) + if firstAlert { + for k, v := range alert.Labels { + commonLabels[k] = v + } + firstAlert = false + } + } + + groupLabels := map[string]string{} + if alertName, ok := commonLabels["alertname"]; ok { + groupLabels["alertname"] = alertName + } + + // 构造请求 + req := model.AlertmanagerWebhookRequest{ + Receiver: "prometheus_adapter", + Status: status, + Alerts: webhookAlerts, + GroupLabels: groupLabels, + CommonLabels: commonLabels, + Version: "1", + } + + // 发送请求 + jsonData, err := json.Marshal(req) + if err != nil { + return fmt.Errorf("failed to marshal request: %w", err) + } + + resp, err := s.httpClient.Post(s.webhookURL, "application/json", bytes.NewBuffer(jsonData)) + if err != nil { + return fmt.Errorf("failed to send webhook: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := 
io.ReadAll(resp.Body) + return fmt.Errorf("webhook returned status %d: %s", resp.StatusCode, string(body)) + } + + log.Info(). + Str("status", status). + Int("alert_count", len(alerts)). + Str("webhook_url", s.webhookURL). + Msg("Successfully sent alerts to webhook") + + return nil +} + +// generateFingerprint 生成告警的唯一标识 +func (s *AlertWebhookService) generateFingerprint(alert model.PrometheusAlert) string { + // 基于标签生成指纹 + labels := "" + for k, v := range alert.Labels { + labels += fmt.Sprintf("%s=%s,", k, v) + } + + h := md5.New() + h.Write([]byte(labels)) + return fmt.Sprintf("%x", h.Sum(nil))[:16] +} diff --git a/scripts/prometheus_adapter/build.sh b/scripts/prometheus_adapter/build.sh index dbcbcf1..d7b5025 100755 --- a/scripts/prometheus_adapter/build.sh +++ b/scripts/prometheus_adapter/build.sh @@ -50,6 +50,7 @@ fi # 创建构建目录 log_info "创建构建目录..." mkdir -p "$BUILD_DIR/bin" +mkdir -p "$BUILD_DIR/config" mkdir -p "$BUILD_DIR/docs" mkdir -p "$BUILD_DIR/scripts" mkdir -p "$BUILD_DIR/rules" @@ -67,6 +68,15 @@ if [ $? -ne 0 ]; then exit 1 fi +# 复制配置文件 +log_info "复制配置文件..." +if [ -f "internal/${APP_NAME}/config/prometheus_adapter.yml" ]; then + cp "internal/${APP_NAME}/config/prometheus_adapter.yml" "$BUILD_DIR/config/" + log_info "已复制配置文件到 $BUILD_DIR/config/" +else + log_warn "未找到配置文件,使用默认配置" +fi + # 复制文档 log_info "复制文档..." if [ -f "docs/${APP_NAME}/README.md" ]; then @@ -104,14 +114,10 @@ cat > "$BUILD_DIR/start.sh" << 'EOF' # Prometheus Adapter 启动脚本 -# 默认配置 -PROMETHEUS_URL=${PROMETHEUS_URL:-"http://localhost:9090"} -PORT=${PORT:-8080} -LOG_LEVEL=${LOG_LEVEL:-"info"} - # 获取脚本所在目录 SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) BIN_PATH="$SCRIPT_DIR/bin/prometheus_adapter" +CONFIG_FILE="$SCRIPT_DIR/config/prometheus_adapter.yml" # 检查二进制文件 if [ ! -f "$BIN_PATH" ]; then @@ -119,19 +125,26 @@ if [ ! 
-f "$BIN_PATH" ]; then exit 1 fi -# 启动参数 -ARGS="" -ARGS="$ARGS --prometheus-url=$PROMETHEUS_URL" -ARGS="$ARGS --port=$PORT" -ARGS="$ARGS --log-level=$LOG_LEVEL" +# 检查配置文件 +if [ -f "$CONFIG_FILE" ]; then + echo "使用配置文件: $CONFIG_FILE" +else + echo "警告: 找不到配置文件 $CONFIG_FILE,将使用默认配置" +fi + +# 环境变量(可选,用于覆盖配置文件) +# export PROMETHEUS_ADDRESS="http://localhost:9090" +# export ALERT_WEBHOOK_URL="http://alert-module:8080/v1/integrations/prometheus/alerts" +# export ALERT_POLLING_INTERVAL="10s" +# export SERVER_BIND_ADDR="0.0.0.0:9999" echo "启动 Prometheus Adapter..." -echo "Prometheus URL: $PROMETHEUS_URL" -echo "监听端口: $PORT" -echo "日志级别: $LOG_LEVEL" + +# 切换到 bin 目录,以便程序能正确找到相对路径的配置文件 +cd "$SCRIPT_DIR" # 启动服务 -exec "$BIN_PATH" $ARGS +exec "$BIN_PATH" EOF chmod +x "$BUILD_DIR/start.sh" diff --git a/scripts/prometheus_adapter/deploy.sh b/scripts/prometheus_adapter/deploy.sh index 01ccdf8..ad68564 100755 --- a/scripts/prometheus_adapter/deploy.sh +++ b/scripts/prometheus_adapter/deploy.sh @@ -235,6 +235,9 @@ if [ -w "$DEPLOY_DIR" ]; then chmod +x "$DEPLOY_DIR/start.sh" chmod +x "$DEPLOY_DIR/stop.sh" [ -f "$DEPLOY_DIR/scripts/test_alert_update.sh" ] && chmod +x "$DEPLOY_DIR/scripts/test_alert_update.sh" + # 确保 config 目录和配置文件可读 + chmod 755 "$DEPLOY_DIR/config" + [ -f "$DEPLOY_DIR/config/prometheus_adapter.yml" ] && chmod 644 "$DEPLOY_DIR/config/prometheus_adapter.yml" # 确保 rules 目录可写 chmod 755 "$DEPLOY_DIR/rules" [ -f "$DEPLOY_DIR/rules/alert_rules.yml" ] && chmod 644 "$DEPLOY_DIR/rules/alert_rules.yml" @@ -243,11 +246,16 @@ else sudo chmod +x "$DEPLOY_DIR/start.sh" sudo chmod +x "$DEPLOY_DIR/stop.sh" [ -f "$DEPLOY_DIR/scripts/test_alert_update.sh" ] && sudo chmod +x "$DEPLOY_DIR/scripts/test_alert_update.sh" + # 确保 config 目录和配置文件可读 + sudo chmod 755 "$DEPLOY_DIR/config" + [ -f "$DEPLOY_DIR/config/prometheus_adapter.yml" ] && sudo chmod 644 "$DEPLOY_DIR/config/prometheus_adapter.yml" # 确保 rules 目录可写 sudo chmod 755 "$DEPLOY_DIR/rules" [ -f "$DEPLOY_DIR/rules/alert_rules.yml" ] && 
sudo chmod 644 "$DEPLOY_DIR/rules/alert_rules.yml" # 设置 rules 目录的所有者为服务运行用户 sudo chown -R qboxserver:qboxserver "$DEPLOY_DIR/rules" + # 确保配置文件也可以被服务用户读取 + sudo chown qboxserver:qboxserver "$DEPLOY_DIR/config/prometheus_adapter.yml" fi # 清理临时目录 @@ -285,10 +293,12 @@ After=network.target Type=simple User=qboxserver Group=qboxserver -WorkingDirectory=$DEPLOY_DIR/bin -Environment="PROMETHEUS_URL=http://localhost:9090" -Environment="PORT=8080" -Environment="LOG_LEVEL=info" +WorkingDirectory=$DEPLOY_DIR +# 可选:通过环境变量覆盖配置 +#Environment="PROMETHEUS_ADDRESS=http://localhost:9090" +#Environment="ALERT_WEBHOOK_URL=http://alert-module:8080/v1/integrations/prometheus/alerts" +#Environment="ALERT_POLLING_INTERVAL=10s" +#Environment="SERVER_BIND_ADDR=0.0.0.0:9999" ExecStart=$DEPLOY_DIR/bin/prometheus_adapter ExecStop=$DEPLOY_DIR/stop.sh Restart=on-failure From b6e76bd7b68d28d82c3735683e10cbbbab14a2ec Mon Sep 17 00:00:00 2001 From: dnj Date: Thu, 25 Sep 2025 16:23:16 +0800 Subject: [PATCH 12/18] =?UTF-8?q?feat(=E9=85=8D=E7=BD=AE):=20=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E4=BB=8E=E5=A4=9A=E4=B8=AA=E9=BB=98=E8=AE=A4=E8=B7=AF?= =?UTF-8?q?=E5=BE=84=E5=8A=A0=E8=BD=BD=E9=85=8D=E7=BD=AE=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- internal/prometheus_adapter/config/config.go | 22 ++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/internal/prometheus_adapter/config/config.go b/internal/prometheus_adapter/config/config.go index 38f6998..39dd023 100644 --- a/internal/prometheus_adapter/config/config.go +++ b/internal/prometheus_adapter/config/config.go @@ -42,9 +42,27 @@ type ServerConfig struct { // LoadConfig 加载配置文件 func LoadConfig(configPath string) (*PrometheusAdapterConfig, error) { - // 如果没有指定配置文件,使用默认路径 + // 如果没有指定配置文件,尝试多个默认路径 if configPath == "" { - configPath = "internal/prometheus_adapter/config/prometheus_adapter.yml" + // 尝试的路径列表(按优先级) + possiblePaths := []string{ + 
"config/prometheus_adapter.yml", // 部署环境:相对于工作目录 + "internal/prometheus_adapter/config/prometheus_adapter.yml", // 开发环境:源码目录 + "./prometheus_adapter.yml", // 当前目录 + } + + for _, path := range possiblePaths { + if _, err := os.Stat(path); err == nil { + configPath = path + log.Info().Str("path", path).Msg("Found config file") + break + } + } + + // 如果都找不到,使用第一个路径(稍后会返回默认配置) + if configPath == "" { + configPath = possiblePaths[0] + } } // 读取配置文件 From 4bc9b158c6908798d918950a333b649633a79f83 Mon Sep 17 00:00:00 2001 From: dnj Date: Thu, 25 Sep 2025 18:07:57 +0800 Subject: [PATCH 13/18] =?UTF-8?q?feat(prometheus):=20=E5=AE=9E=E7=8E=B0Ale?= =?UTF-8?q?rtmanager=E5=85=BC=E5=AE=B9API=E5=B9=B6=E9=87=8D=E6=9E=84?= =?UTF-8?q?=E5=91=8A=E8=AD=A6=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增Alertmanager API v2兼容接口用于接收Prometheus告警 - 重构告警服务架构,替换原有的轮询模式为推送模式 - 添加docker-compose配置支持Prometheus管理API - 移除过时的AlertWebhookService实现 --- .../api/alertmanager_api.go | 25 ++ internal/prometheus_adapter/api/api.go | 18 +- internal/prometheus_adapter/model/alert.go | 9 + internal/prometheus_adapter/server.go | 45 ++-- .../service/alert_service.go | 2 +- .../service/alert_webhook_service.go | 255 ------------------ .../service/alertmanager_service.go | 215 +++++++++++++++ mock/s3/deployments/docker-compose.yml | 3 + .../deployments/observability/prometheus.yml | 8 + 9 files changed, 290 insertions(+), 290 deletions(-) create mode 100644 internal/prometheus_adapter/api/alertmanager_api.go delete mode 100644 internal/prometheus_adapter/service/alert_webhook_service.go create mode 100644 internal/prometheus_adapter/service/alertmanager_service.go diff --git a/internal/prometheus_adapter/api/alertmanager_api.go b/internal/prometheus_adapter/api/alertmanager_api.go new file mode 100644 index 0000000..a50f2c9 --- /dev/null +++ b/internal/prometheus_adapter/api/alertmanager_api.go @@ -0,0 +1,25 @@ +package api + +import ( + 
"github.com/fox-gonic/fox" + "github.com/qiniu/zeroops/internal/prometheus_adapter/service" +) + +// setupAlertmanagerRouters 设置 Alertmanager 兼容路由 +// 这些路由模拟 Alertmanager API,接收 Prometheus 的告警推送 +func (api *Api) setupAlertmanagerRouters(router *fox.Engine, alertmanagerService *service.AlertmanagerService) { + // Alertmanager API v2 告警接收端点 + router.POST("/api/v2/alerts", func(c *fox.Context) { + alertmanagerService.HandleAlertsV2(c.Writer, c.Request) + }) + + // 健康检查端点 + router.GET("/-/healthy", func(c *fox.Context) { + alertmanagerService.HandleHealthCheck(c.Writer, c.Request) + }) + + // 就绪检查端点 + router.GET("/-/ready", func(c *fox.Context) { + alertmanagerService.HandleReadyCheck(c.Writer, c.Request) + }) +} diff --git a/internal/prometheus_adapter/api/api.go b/internal/prometheus_adapter/api/api.go index 2b6e432..351544d 100644 --- a/internal/prometheus_adapter/api/api.go +++ b/internal/prometheus_adapter/api/api.go @@ -12,17 +12,19 @@ import ( // Api Prometheus Adapter API type Api struct { - metricService *service.MetricService - alertService *service.AlertService - router *fox.Engine + metricService *service.MetricService + alertService *service.AlertService + alertmanagerService *service.AlertmanagerService + router *fox.Engine } // NewApi 创建新的 API -func NewApi(metricService *service.MetricService, alertService *service.AlertService, router *fox.Engine) (*Api, error) { +func NewApi(metricService *service.MetricService, alertService *service.AlertService, alertmanagerService *service.AlertmanagerService, router *fox.Engine) (*Api, error) { api := &Api{ - metricService: metricService, - alertService: alertService, - router: router, + metricService: metricService, + alertService: alertService, + alertmanagerService: alertmanagerService, + router: router, } api.setupRouters(router) @@ -35,6 +37,8 @@ func (api *Api) setupRouters(router *fox.Engine) { api.setupMetricRouters(router) // 告警相关路由 api.setupAlertRouters(router) + // Alertmanager 兼容路由 + 
api.setupAlertmanagerRouters(router, api.alertmanagerService) } // ========== 通用辅助方法 ========== diff --git a/internal/prometheus_adapter/model/alert.go b/internal/prometheus_adapter/model/alert.go index e64a047..497b541 100644 --- a/internal/prometheus_adapter/model/alert.go +++ b/internal/prometheus_adapter/model/alert.go @@ -17,3 +17,12 @@ type AlertRuleMeta struct { Labels string `json:"labels" gorm:"type:jsonb"` // 适用标签,如 {"service":"s3","version":"v1"},为空表示全局 Threshold float64 `json:"threshold"` // 阈值(会被渲染成特定规则的 threshold metric 数值) } + +// AlertmanagerAlert 符合 Alertmanager API v2 的告警格式 +type AlertmanagerAlert struct { + Labels map[string]string `json:"labels"` + Annotations map[string]string `json:"annotations,omitempty"` + StartsAt string `json:"startsAt,omitempty"` // RFC3339 格式 + EndsAt string `json:"endsAt,omitempty"` // RFC3339 格式 + GeneratorURL string `json:"generatorURL,omitempty"` +} diff --git a/internal/prometheus_adapter/server.go b/internal/prometheus_adapter/server.go index 81a2824..b2de452 100644 --- a/internal/prometheus_adapter/server.go +++ b/internal/prometheus_adapter/server.go @@ -15,13 +15,13 @@ import ( // PrometheusAdapterServer Prometheus Adapter 服务器 type PrometheusAdapterServer struct { - config *config.Config - promConfig *promconfig.PrometheusAdapterConfig - promClient *client.PrometheusClient - metricService *service.MetricService - alertService *service.AlertService - alertWebhookService *service.AlertWebhookService - api *api.Api + config *config.Config + promConfig *promconfig.PrometheusAdapterConfig + promClient *client.PrometheusClient + metricService *service.MetricService + alertService *service.AlertService + alertmanagerProxyService *service.AlertmanagerService + api *api.Api } // NewPrometheusAdapterServer 创建新的 Prometheus Adapter 服务器 @@ -44,22 +44,16 @@ func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, e // 创建告警服务 alertService := service.NewAlertService(promClient, promConfig) - // 创建告警 
Webhook 服务 - alertWebhookService := service.NewAlertWebhookService(promClient, promConfig) + // 创建 Alertmanager 代理服务 + alertmanagerProxyService := service.NewAlertmanagerProxyService(promConfig) server := &PrometheusAdapterServer{ - config: cfg, - promConfig: promConfig, - promClient: promClient, - metricService: metricService, - alertService: alertService, - alertWebhookService: alertWebhookService, - } - - // 启动告警 Webhook 服务 - if err := alertWebhookService.Start(); err != nil { - log.Error().Err(err).Msg("Failed to start alert webhook service") - // 不返回错误,允许服务继续运行 + config: cfg, + promConfig: promConfig, + promClient: promClient, + metricService: metricService, + alertService: alertService, + alertmanagerProxyService: alertmanagerProxyService, } log.Info().Str("prometheus_address", promConfig.Prometheus.Address).Msg("Prometheus Adapter initialized successfully") @@ -69,11 +63,13 @@ func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, e // UseApi 设置 API 路由 func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error { var err error - s.api, err = api.NewApi(s.metricService, s.alertService, router) + s.api, err = api.NewApi(s.metricService, s.alertService, s.alertmanagerProxyService, router) if err != nil { return fmt.Errorf("failed to initialize API: %w", err) } + log.Info().Msg("All API endpoints registered") + return nil } @@ -81,11 +77,6 @@ func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error { func (s *PrometheusAdapterServer) Close(ctx context.Context) error { log.Info().Msg("Starting shutdown...") - // 停止告警 Webhook 服务 - if s.alertWebhookService != nil { - s.alertWebhookService.Stop() - } - // 调用 alertService 的 Shutdown 方法保存规则 if s.alertService != nil { if err := s.alertService.Shutdown(); err != nil { diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go index f74d4a3..1c2290c 100644 --- a/internal/prometheus_adapter/service/alert_service.go +++ 
b/internal/prometheus_adapter/service/alert_service.go @@ -411,7 +411,7 @@ func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertR if len(labelMatchers) > 0 { // 如果表达式包含{,说明已经有标签选择器 if strings.Contains(expr, "{") { - expr = strings.Replace(expr, "}", ","+strings.Join(labelMatchers, ",")+"}}", 1) + expr = strings.Replace(expr, "}", ","+strings.Join(labelMatchers, ",")+"}", 1) } else { // 在指标名后添加标签选择器 // 查找第一个非字母数字下划线的字符 diff --git a/internal/prometheus_adapter/service/alert_webhook_service.go b/internal/prometheus_adapter/service/alert_webhook_service.go deleted file mode 100644 index ac211ac..0000000 --- a/internal/prometheus_adapter/service/alert_webhook_service.go +++ /dev/null @@ -1,255 +0,0 @@ -package service - -import ( - "bytes" - "context" - "crypto/md5" - "encoding/json" - "fmt" - "io" - "net/http" - "sync" - "time" - - "github.com/qiniu/zeroops/internal/prometheus_adapter/client" - promconfig "github.com/qiniu/zeroops/internal/prometheus_adapter/config" - "github.com/qiniu/zeroops/internal/prometheus_adapter/model" - "github.com/rs/zerolog/log" -) - -// AlertWebhookService 告警 Webhook 服务 -type AlertWebhookService struct { - promClient *client.PrometheusClient - config *promconfig.PrometheusAdapterConfig - webhookURL string - pollingInterval time.Duration - httpClient *http.Client - alertCache map[string]*model.PrometheusAlert // 缓存已发送的告警 - cacheMutex sync.RWMutex - stopCh chan struct{} - running bool - runningMutex sync.Mutex -} - -// NewAlertWebhookService 创建告警 Webhook 服务 -func NewAlertWebhookService(promClient *client.PrometheusClient, config *promconfig.PrometheusAdapterConfig) *AlertWebhookService { - return &AlertWebhookService{ - promClient: promClient, - config: config, - webhookURL: config.AlertWebhook.URL, - pollingInterval: config.AlertWebhook.GetPollingInterval(), - httpClient: &http.Client{Timeout: 30 * time.Second}, - alertCache: make(map[string]*model.PrometheusAlert), - stopCh: make(chan struct{}), - } -} - -// Start 
启动告警轮询服务 -func (s *AlertWebhookService) Start() error { - s.runningMutex.Lock() - defer s.runningMutex.Unlock() - - if s.running { - return fmt.Errorf("alert webhook service already running") - } - - s.running = true - go s.pollAlerts() - - log.Info(). - Str("webhook_url", s.webhookURL). - Dur("interval", s.pollingInterval). - Msg("Alert webhook service started") - - return nil -} - -// Stop 停止告警轮询服务 -func (s *AlertWebhookService) Stop() { - s.runningMutex.Lock() - defer s.runningMutex.Unlock() - - if !s.running { - return - } - - close(s.stopCh) - s.running = false - - log.Info().Msg("Alert webhook service stopped") -} - -// pollAlerts 轮询告警 -func (s *AlertWebhookService) pollAlerts() { - ticker := time.NewTicker(s.pollingInterval) - defer ticker.Stop() - - // 立即执行一次 - s.fetchAndProcessAlerts() - - for { - select { - case <-ticker.C: - s.fetchAndProcessAlerts() - case <-s.stopCh: - return - } - } -} - -// fetchAndProcessAlerts 获取并处理告警 -func (s *AlertWebhookService) fetchAndProcessAlerts() { - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - - // 从 Prometheus 获取告警 - alertsResp, err := s.promClient.GetAlerts(ctx) - if err != nil { - log.Error().Err(err).Msg("Failed to fetch alerts from Prometheus") - return - } - - // 处理告警 - firingAlerts := []model.PrometheusAlert{} - resolvedAlerts := []model.PrometheusAlert{} - - s.cacheMutex.Lock() - defer s.cacheMutex.Unlock() - - // 分类告警 - currentAlerts := make(map[string]*model.PrometheusAlert) - for _, alert := range alertsResp.Data.Alerts { - fingerprint := s.generateFingerprint(alert) - currentAlerts[fingerprint] = &alert - - // 检查是否是新告警或状态变更 - cachedAlert, exists := s.alertCache[fingerprint] - if !exists || cachedAlert.State != alert.State { - if alert.State == "firing" { - firingAlerts = append(firingAlerts, alert) - } - } - } - - // 检查已恢复的告警 - for fingerprint, cachedAlert := range s.alertCache { - if _, exists := currentAlerts[fingerprint]; !exists { - // 告警已恢复 - resolvedAlert := 
*cachedAlert - resolvedAlert.State = "resolved" - resolvedAlerts = append(resolvedAlerts, resolvedAlert) - } - } - - // 更新缓存 - s.alertCache = currentAlerts - - // 发送告警 - if len(firingAlerts) > 0 { - if err := s.sendAlerts(firingAlerts, "firing"); err != nil { - log.Error().Err(err).Msg("Failed to send firing alerts") - } - } - - if len(resolvedAlerts) > 0 { - if err := s.sendAlerts(resolvedAlerts, "resolved"); err != nil { - log.Error().Err(err).Msg("Failed to send resolved alerts") - } - } -} - -// sendAlerts 发送告警到监控模块 -func (s *AlertWebhookService) sendAlerts(alerts []model.PrometheusAlert, status string) error { - webhookAlerts := []model.AlertmanagerWebhookAlert{} - - // 收集所有标签用于 groupLabels 和 commonLabels - commonLabels := map[string]string{} - firstAlert := true - - for _, alert := range alerts { - // 生成 fingerprint - fingerprint := s.generateFingerprint(alert) - - // 转换时间格式 - startsAt := alert.ActiveAt.Format(time.RFC3339) - endsAt := "0001-01-01T00:00:00Z" - if status == "resolved" { - endsAt = time.Now().Format(time.RFC3339) - } - - // 构造 GeneratorURL - generatorURL := fmt.Sprintf("http://prometheus/graph?g0.expr=%s", alert.Labels["alertname"]) - - webhookAlert := model.AlertmanagerWebhookAlert{ - Status: status, - Labels: alert.Labels, - Annotations: alert.Annotations, - StartsAt: startsAt, - EndsAt: endsAt, - GeneratorURL: generatorURL, - Fingerprint: fingerprint, - } - webhookAlerts = append(webhookAlerts, webhookAlert) - - // 收集公共标签(取第一个告警的标签作为公共标签) - if firstAlert { - for k, v := range alert.Labels { - commonLabels[k] = v - } - firstAlert = false - } - } - - groupLabels := map[string]string{} - if alertName, ok := commonLabels["alertname"]; ok { - groupLabels["alertname"] = alertName - } - - // 构造请求 - req := model.AlertmanagerWebhookRequest{ - Receiver: "prometheus_adapter", - Status: status, - Alerts: webhookAlerts, - GroupLabels: groupLabels, - CommonLabels: commonLabels, - Version: "1", - } - - // 发送请求 - jsonData, err := json.Marshal(req) - if err 
!= nil { - return fmt.Errorf("failed to marshal request: %w", err) - } - - resp, err := s.httpClient.Post(s.webhookURL, "application/json", bytes.NewBuffer(jsonData)) - if err != nil { - return fmt.Errorf("failed to send webhook: %w", err) - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - body, _ := io.ReadAll(resp.Body) - return fmt.Errorf("webhook returned status %d: %s", resp.StatusCode, string(body)) - } - - log.Info(). - Str("status", status). - Int("alert_count", len(alerts)). - Str("webhook_url", s.webhookURL). - Msg("Successfully sent alerts to webhook") - - return nil -} - -// generateFingerprint 生成告警的唯一标识 -func (s *AlertWebhookService) generateFingerprint(alert model.PrometheusAlert) string { - // 基于标签生成指纹 - labels := "" - for k, v := range alert.Labels { - labels += fmt.Sprintf("%s=%s,", k, v) - } - - h := md5.New() - h.Write([]byte(labels)) - return fmt.Sprintf("%x", h.Sum(nil))[:16] -} diff --git a/internal/prometheus_adapter/service/alertmanager_service.go b/internal/prometheus_adapter/service/alertmanager_service.go new file mode 100644 index 0000000..e91d721 --- /dev/null +++ b/internal/prometheus_adapter/service/alertmanager_service.go @@ -0,0 +1,215 @@ +package service + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + promconfig "github.com/qiniu/zeroops/internal/prometheus_adapter/config" + "github.com/qiniu/zeroops/internal/prometheus_adapter/model" + "github.com/rs/zerolog/log" +) + +// AlertmanagerService Alertmanager 服务 +// 接收 Prometheus 的告警推送并转发到监控告警模块 +type AlertmanagerService struct { + config *promconfig.PrometheusAdapterConfig + webhookURL string + httpClient *http.Client + resolveTimeout time.Duration +} + +// NewAlertmanagerProxyService 创建新的 Alertmanager 代理服务 +func NewAlertmanagerProxyService(config *promconfig.PrometheusAdapterConfig) *AlertmanagerService { + return &AlertmanagerService{ + config: config, + webhookURL: config.AlertWebhook.URL, + httpClient: &http.Client{Timeout: 
30 * time.Second}, + resolveTimeout: 5 * time.Minute, // 默认 resolve_timeout + } +} + +// HandleAlertsV2 处理 Prometheus 推送的告警 +// 实现 POST /api/v2/alerts 接口 +func (s *AlertmanagerService) HandleAlertsV2(w http.ResponseWriter, r *http.Request) { + // 检查 Content-Type + contentType := r.Header.Get("Content-Type") + if contentType != "application/json" && contentType != "" { + http.Error(w, "Content-Type must be application/json", http.StatusBadRequest) + return + } + + // 解析 Prometheus 发送的告警 + var alerts []model.AlertmanagerAlert + body, err := io.ReadAll(r.Body) + if err != nil { + log.Error().Err(err).Msg("Failed to read request body") + http.Error(w, "Failed to read request", http.StatusBadRequest) + return + } + defer r.Body.Close() + + if err := json.Unmarshal(body, &alerts); err != nil { + log.Error(). + Err(err). + Str("body", string(body)). + Msg("Failed to unmarshal alerts") + http.Error(w, "Invalid JSON", http.StatusBadRequest) + return + } + + // 处理时间戳:如果缺失则设置默认值 + now := time.Now() + for i := range alerts { + // 如果 startsAt 缺失,设置为当前时间 + if alerts[i].StartsAt == "" { + alerts[i].StartsAt = now.Format(time.RFC3339) + } + // 如果 endsAt 缺失,设置为当前时间 + resolve_timeout + if alerts[i].EndsAt == "" { + alerts[i].EndsAt = now.Add(s.resolveTimeout).Format(time.RFC3339) + } + } + + log.Info(). + Int("alert_count", len(alerts)). 
+ Msg("Received alerts from Prometheus") + + // 转发告警到监控模块 + if err := s.forwardAlertsV2(alerts); err != nil { + log.Error().Err(err).Msg("Failed to forward alerts") + // 返回 500 让 Prometheus 重试 + http.Error(w, "Failed to forward alerts", http.StatusInternalServerError) + return + } + + // 返回成功响应(Alertmanager API v2 返回空 JSON) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + w.Write([]byte("{}")) +} + +// HandleHealthCheck 健康检查接口 +// 实现 GET /-/healthy +func (s *AlertmanagerService) HandleHealthCheck(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write([]byte("OK")) +} + +// HandleReadyCheck 就绪检查接口 +// 实现 GET /-/ready +func (s *AlertmanagerService) HandleReadyCheck(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write([]byte("OK")) +} + +// forwardAlertsV2 转发告警到监控告警模块 +func (s *AlertmanagerService) forwardAlertsV2(alerts []model.AlertmanagerAlert) error { + // 转换为 Alertmanager webhook 格式 + webhookAlerts := []model.AlertmanagerWebhookAlert{} + commonLabels := map[string]string{} + groupLabels := map[string]string{} + + // 统计告警状态,用于确定总体状态 + hasFiring := false + + for _, alert := range alerts { + // 确定告警状态:通过比较 endsAt 和当前时间 + status := "firing" + if alert.EndsAt != "" { + endsAtTime, err := time.Parse(time.RFC3339, alert.EndsAt) + if err == nil && endsAtTime.Before(time.Now()) { + status = "resolved" + } else { + hasFiring = true + } + } else { + hasFiring = true + } + + // 生成 fingerprint + fingerprint := s.generateFingerprint(alert.Labels) + + // 构造 GeneratorURL + generatorURL := alert.GeneratorURL + if generatorURL == "" && alert.Labels["alertname"] != "" { + generatorURL = fmt.Sprintf("http://prometheus/graph?g0.expr=%s", alert.Labels["alertname"]) + } + + webhookAlert := model.AlertmanagerWebhookAlert{ + Status: status, + Labels: alert.Labels, + Annotations: alert.Annotations, + StartsAt: alert.StartsAt, // 已经是 RFC3339 格式 + EndsAt: alert.EndsAt, // 已经是 RFC3339 格式 + 
GeneratorURL: generatorURL, + Fingerprint: fingerprint, + } + webhookAlerts = append(webhookAlerts, webhookAlert) + + // 收集公共标签 + if len(commonLabels) == 0 { + for k, v := range alert.Labels { + commonLabels[k] = v + } + } + } + + // 设置 groupLabels + if alertName, ok := commonLabels["alertname"]; ok { + groupLabels["alertname"] = alertName + } + + // 确定总体状态:如果有任何 firing 的告警,总体状态为 firing,否则为 resolved + overallStatus := "resolved" + if hasFiring { + overallStatus = "firing" + } + + // 构造 webhook 请求 + req := model.AlertmanagerWebhookRequest{ + Receiver: "prometheus_adapter", + Status: overallStatus, // 根据告警实际状态设置 + Alerts: webhookAlerts, + GroupLabels: groupLabels, + CommonLabels: commonLabels, + Version: "1", + } + + // 发送到监控告警模块 + jsonData, err := json.Marshal(req) + if err != nil { + return fmt.Errorf("failed to marshal webhook request: %w", err) + } + + resp, err := s.httpClient.Post(s.webhookURL, "application/json", bytes.NewBuffer(jsonData)) + if err != nil { + return fmt.Errorf("failed to send webhook: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("webhook returned status %d: %s", resp.StatusCode, string(body)) + } + + log.Info(). + Int("alert_count", len(alerts)). + Str("webhook_url", s.webhookURL). 
+ Msg("Successfully forwarded alerts to monitoring module") + + return nil +} + +// generateFingerprint 生成告警指纹 +func (s *AlertmanagerService) generateFingerprint(labels map[string]string) string { + // 简化版指纹生成 + result := "" + for k, v := range labels { + result += fmt.Sprintf("%s:%s,", k, v) + } + return fmt.Sprintf("%x", result)[:16] +} diff --git a/mock/s3/deployments/docker-compose.yml b/mock/s3/deployments/docker-compose.yml index 377ec3d..9f30f9e 100644 --- a/mock/s3/deployments/docker-compose.yml +++ b/mock/s3/deployments/docker-compose.yml @@ -89,6 +89,9 @@ services: - '--config.file=/etc/prometheus/prometheus.yml' - '--storage.tsdb.path=/prometheus' - '--web.enable-lifecycle' + - '--web.enable-admin-api' # 启用管理API以支持配置重载 + extra_hosts: + - "host.docker.internal:host-gateway" # 允许容器访问宿主机 restart: unless-stopped # Grafana - 可视化 diff --git a/mock/s3/deployments/observability/prometheus.yml b/mock/s3/deployments/observability/prometheus.yml index 35fb014..6c9fb04 100644 --- a/mock/s3/deployments/observability/prometheus.yml +++ b/mock/s3/deployments/observability/prometheus.yml @@ -9,6 +9,14 @@ global: rule_files: - "/etc/prometheus/rules/*.yml" +# Alerting 配置 - 将告警发送到 Prometheus Adapter (伪 Alertmanager) +alerting: + alertmanagers: + - static_configs: + - targets: + - 'host.docker.internal:8081' # Prometheus Adapter 运行在宿主机的 8081 端口 + api_version: v2 # 使用 Alertmanager API v2 + scrape_configs: # Prometheus自身的指标 - job_name: 'prometheus' From 285412f197bd78dcdb40b28818710010a09d4eda Mon Sep 17 00:00:00 2001 From: dnj Date: Thu, 25 Sep 2025 18:21:02 +0800 Subject: [PATCH 14/18] =?UTF-8?q?feat(prometheus=5Fadapter):=20=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E8=8E=B7=E5=8F=96=E7=BB=91=E5=AE=9A=E5=9C=B0=E5=9D=80?= =?UTF-8?q?=E6=96=B9=E6=B3=95=E5=B9=B6=E4=BC=98=E5=8C=96=E7=AB=AF=E5=8F=A3?= =?UTF-8?q?=E9=85=8D=E7=BD=AE=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmd/prometheus_adapter/main.go | 28 
++++++++++++++++----------- internal/prometheus_adapter/server.go | 8 ++++++++ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/cmd/prometheus_adapter/main.go b/cmd/prometheus_adapter/main.go index 847516b..696018b 100644 --- a/cmd/prometheus_adapter/main.go +++ b/cmd/prometheus_adapter/main.go @@ -20,22 +20,28 @@ func main() { log.Info().Msg("Starting Prometheus Adapter server") - // 加载配置 - cfg := &config.Config{ - Server: config.ServerConfig{ - BindAddr: ":9999", // 默认端口 - }, + // 加载 Prometheus Adapter 配置 + adapter, err := prometheusadapter.NewPrometheusAdapterServer(&config.Config{}) + if err != nil { + log.Fatal().Err(err).Msg("Failed to create Prometheus Adapter server") + } + + // 获取 Prometheus Adapter 内部配置的绑定地址 + bindAddr := ":9999" // 默认端口 + if adapter.GetBindAddr() != "" { + bindAddr = adapter.GetBindAddr() } - // 如果有环境变量,使用环境变量的端口 + // 如果有环境变量,优先使用环境变量的端口 if port := os.Getenv("ADAPTER_PORT"); port != "" { - cfg.Server.BindAddr = ":" + port + bindAddr = ":" + port } - // 创建 Prometheus Adapter 服务器 - adapter, err := prometheusadapter.NewPrometheusAdapterServer(cfg) - if err != nil { - log.Fatal().Err(err).Msg("Failed to create Prometheus Adapter server") + // 更新配置(虽然已经创建了 adapter,但需要端口信息用于启动服务器) + cfg := &config.Config{ + Server: config.ServerConfig{ + BindAddr: bindAddr, + }, } // 创建路由 diff --git a/internal/prometheus_adapter/server.go b/internal/prometheus_adapter/server.go index b2de452..c8e321c 100644 --- a/internal/prometheus_adapter/server.go +++ b/internal/prometheus_adapter/server.go @@ -60,6 +60,14 @@ func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, e return server, nil } +// GetBindAddr 获取配置文件中的绑定地址 +func (s *PrometheusAdapterServer) GetBindAddr() string { + if s.promConfig != nil && s.promConfig.Server.BindAddr != "" { + return s.promConfig.Server.BindAddr + } + return "" +} + // UseApi 设置 API 路由 func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error { var err error From 
705363e64bc4d92ede5e77ee4b1f7b4a457d0b47 Mon Sep 17 00:00:00 2001 From: dnj Date: Sun, 28 Sep 2025 10:49:12 +0800 Subject: [PATCH 15/18] =?UTF-8?q?fix(prometheus=5Fadapter):=20=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=E5=91=8A=E8=AD=A6webhook=E8=B7=AF=E5=BE=84=E5=B9=B6?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0REDACTED=E5=AD=97=E6=AE=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/prometheus_adapter/README.md | 4 ++-- internal/prometheus_adapter/config/config.go | 2 +- internal/prometheus_adapter/config/prometheus_adapter.yml | 2 +- internal/prometheus_adapter/model/prometheus_alert.go | 1 + internal/prometheus_adapter/service/alertmanager_service.go | 1 + internal/prometheus_adapter/test_alert_update.sh | 2 +- scripts/prometheus_adapter/build.sh | 2 +- scripts/prometheus_adapter/deploy.sh | 2 +- 8 files changed, 9 insertions(+), 7 deletions(-) diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md index 0b9d5ca..65d4714 100644 --- a/docs/prometheus_adapter/README.md +++ b/docs/prometheus_adapter/README.md @@ -243,7 +243,7 @@ internal/prometheus_adapter/ │ (自定义服务) │ └────────┬────────┘ │ Push - │ POST /v1/integrations/prometheus/alerts + │ POST /v1/integrations/alertmanager/webhook ▼ ┌─────────────────┐ │ 监控告警模块 │ @@ -263,7 +263,7 @@ internal/prometheus_adapter/ - 支持告警恢复状态通知 - **推送目标**: - - URL: `http://alert-module:8080/v1/integrations/prometheus/alerts` + - URL: `http://alert-module:8080/v1/integrations/alertmanager/webhook` - Method: POST - Content-Type: application/json diff --git a/internal/prometheus_adapter/config/config.go b/internal/prometheus_adapter/config/config.go index 39dd023..5fe7fe8 100644 --- a/internal/prometheus_adapter/config/config.go +++ b/internal/prometheus_adapter/config/config.go @@ -105,7 +105,7 @@ func getDefaultConfig() *PrometheusAdapterConfig { ContainerName: "mock-s3-prometheus", }, AlertWebhook: AlertWebhookConfig{ - URL: 
"http://alert-module:8080/v1/integrations/prometheus/alerts", + URL: "http://alert-module:8080/v1/integrations/alertmanager/webhook", PollingInterval: "10s", }, AlertRules: AlertRulesConfig{ diff --git a/internal/prometheus_adapter/config/prometheus_adapter.yml b/internal/prometheus_adapter/config/prometheus_adapter.yml index a3eab43..55f0b56 100644 --- a/internal/prometheus_adapter/config/prometheus_adapter.yml +++ b/internal/prometheus_adapter/config/prometheus_adapter.yml @@ -10,7 +10,7 @@ prometheus: # 告警 Webhook 服务配置 alert_webhook: # 监控告警模块地址 - url: "http://alert-module:8080/v1/integrations/prometheus/alerts" + url: "http://alert-module:8080/v1/integrations/alertmanager/webhook" # 轮询间隔 polling_interval: "10s" diff --git a/internal/prometheus_adapter/model/prometheus_alert.go b/internal/prometheus_adapter/model/prometheus_alert.go index 9dab331..c809359 100644 --- a/internal/prometheus_adapter/model/prometheus_alert.go +++ b/internal/prometheus_adapter/model/prometheus_alert.go @@ -39,6 +39,7 @@ type AlertmanagerWebhookRequest struct { Alerts []AlertmanagerWebhookAlert `json:"alerts"` GroupLabels map[string]string `json:"groupLabels"` // 分组标签 CommonLabels map[string]string `json:"commonLabels"` // 公共标签 + Alert string `json:"alert"` // "REDACTED" Version string `json:"version"` // "4" } diff --git a/internal/prometheus_adapter/service/alertmanager_service.go b/internal/prometheus_adapter/service/alertmanager_service.go index e91d721..5ff5e90 100644 --- a/internal/prometheus_adapter/service/alertmanager_service.go +++ b/internal/prometheus_adapter/service/alertmanager_service.go @@ -176,6 +176,7 @@ func (s *AlertmanagerService) forwardAlertsV2(alerts []model.AlertmanagerAlert) Alerts: webhookAlerts, GroupLabels: groupLabels, CommonLabels: commonLabels, + Alert: "REDACTED", Version: "1", } diff --git a/internal/prometheus_adapter/test_alert_update.sh b/internal/prometheus_adapter/test_alert_update.sh index 4400b43..f57fceb 100755 --- 
a/internal/prometheus_adapter/test_alert_update.sh +++ b/internal/prometheus_adapter/test_alert_update.sh @@ -2,7 +2,7 @@ # 测试增量更新告警规则功能 -BASE_URL="http://localhost:9999" +BASE_URL="http://10.210.10.33:9999" echo "=== 测试增量更新告警规则 ===" diff --git a/scripts/prometheus_adapter/build.sh b/scripts/prometheus_adapter/build.sh index d7b5025..06d7213 100755 --- a/scripts/prometheus_adapter/build.sh +++ b/scripts/prometheus_adapter/build.sh @@ -134,7 +134,7 @@ fi # 环境变量(可选,用于覆盖配置文件) # export PROMETHEUS_ADDRESS="http://localhost:9090" -# export ALERT_WEBHOOK_URL="http://alert-module:8080/v1/integrations/prometheus/alerts" +# export ALERT_WEBHOOK_URL="http://alert-module:8080/v1/integrations/alertmanager/webhook" # export ALERT_POLLING_INTERVAL="10s" # export SERVER_BIND_ADDR="0.0.0.0:9999" diff --git a/scripts/prometheus_adapter/deploy.sh b/scripts/prometheus_adapter/deploy.sh index ad68564..dc56736 100755 --- a/scripts/prometheus_adapter/deploy.sh +++ b/scripts/prometheus_adapter/deploy.sh @@ -296,7 +296,7 @@ Group=qboxserver WorkingDirectory=$DEPLOY_DIR # 可选:通过环境变量覆盖配置 #Environment="PROMETHEUS_ADDRESS=http://localhost:9090" -#Environment="ALERT_WEBHOOK_URL=http://alert-module:8080/v1/integrations/prometheus/alerts" +#Environment="ALERT_WEBHOOK_URL=http://alert-module:8080/v1/integrations/alertmanager/webhook" #Environment="ALERT_POLLING_INTERVAL=10s" #Environment="SERVER_BIND_ADDR=0.0.0.0:9999" ExecStart=$DEPLOY_DIR/bin/prometheus_adapter From 6de179393f3a64a988d86f4fbe40c162f0a161d0 Mon Sep 17 00:00:00 2001 From: dnj Date: Sun, 28 Sep 2025 11:13:54 +0800 Subject: [PATCH 16/18] =?UTF-8?q?feat(=E5=91=8A=E8=AD=A6=E8=A7=84=E5=88=99?= =?UTF-8?q?):=20=E6=B7=BB=E5=8A=A0=E5=88=A0=E9=99=A4=E8=A7=84=E5=88=99?= =?UTF-8?q?=E6=A8=A1=E6=9D=BF=E5=92=8C=E5=85=83=E4=BF=A1=E6=81=AF=E7=9A=84?= =?UTF-8?q?API=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 实现删除告警规则模板及其关联元信息的功能,包括: 1. 添加DELETE /v1/alert-rules/:rule_name接口删除规则模板 2. 
添加DELETE /v1/alert-rules-meta/:rule_name接口删除特定元信息 3. 更新相关文档说明删除操作的使用方法 --- docs/prometheus_adapter/README.md | 54 ++++++++++++++ internal/prometheus_adapter/api/alert_api.go | 70 +++++++++++++++++++ internal/prometheus_adapter/model/api.go | 5 ++ .../service/alert_service.go | 64 +++++++++++++++++ 4 files changed, 193 insertions(+) diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md index 65d4714..0167488 100644 --- a/docs/prometheus_adapter/README.md +++ b/docs/prometheus_adapter/README.md @@ -193,6 +193,60 @@ internal/prometheus_adapter/ } ``` +#### 3. 删除规则模板 +- 方法与路径:`DELETE /v1/alert-rules/:rule_name` +- 功能:删除指定的告警规则模板及其所有关联的元信息 +- 路径参数: + - `rule_name`:规则名称(如 `high_cpu_usage`) +- 响应示例: +```json +{ + "status": "success", + "message": "Rule 'high_cpu_usage' and 3 associated metas deleted successfully", + "rule_name": "high_cpu_usage", + "deleted_metas": 3 +} +``` +- 错误响应示例(规则不存在): +```json +{ + "error": { + "code": "INVALID_PARAMETER", + "message": "rule 'invalid_rule' not found" + } +} +``` + +#### 4. 
删除规则元信息 +- 方法与路径:`DELETE /v1/alert-rules-meta/:rule_name` +- 功能:删除指定规则下的特定元信息(通过 labels 唯一标识) +- 路径参数: + - `rule_name`:规则名称(如 `high_cpu_usage`) +- 请求体示例: +```json +{ + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}" +} +``` +- 响应示例: +```json +{ + "status": "success", + "message": "Rule meta deleted successfully", + "rule_name": "high_cpu_usage", + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}" +} +``` +- 错误响应示例(元信息不存在): +```json +{ + "error": { + "code": "INVALID_PARAMETER", + "message": "rule meta not found for rule 'high_cpu_usage' with labels '{\"service\":\"invalid-service\"}'" + } +} +``` + #### 规则生成机制 - **规则模板与元信息关联**:通过 `alert_name` 字段关联 - `AlertRule.name` = `AlertRuleMeta.alert_name` diff --git a/internal/prometheus_adapter/api/alert_api.go b/internal/prometheus_adapter/api/alert_api.go index 8724803..786de6f 100644 --- a/internal/prometheus_adapter/api/alert_api.go +++ b/internal/prometheus_adapter/api/alert_api.go @@ -12,6 +12,8 @@ import ( func (api *Api) setupAlertRouters(router *fox.Engine) { router.PUT("/v1/alert-rules/:rule_name", api.UpdateRule) router.PUT("/v1/alert-rules-meta/:rule_name", api.UpdateRuleMetas) + router.DELETE("/v1/alert-rules/:rule_name", api.DeleteRule) + router.DELETE("/v1/alert-rules-meta/:rule_name", api.DeleteRuleMeta) } // UpdateRule 更新单个规则模板 @@ -107,3 +109,71 @@ func (api *Api) UpdateRuleMetas(c *fox.Context) { "updated_count": updatedCount, }) } + +// DeleteRule 删除单个规则模板及其所有关联的元信息 +func (api *Api) DeleteRule(c *fox.Context) { + ruleName := c.Param("rule_name") + if ruleName == "" { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "Rule name is required", nil) + return + } + + // 获取受影响的元信息数量 + affectedCount := api.alertService.GetAffectedMetas(ruleName) + + err := api.alertService.DeleteRule(ruleName) + if err != nil { + if err.Error() == fmt.Sprintf("rule '%s' not found", ruleName) { + SendErrorResponse(c, http.StatusNotFound, 
model.ErrorCodeInvalidParameter, + err.Error(), nil) + } else { + SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, + "Failed to delete rule: "+err.Error(), nil) + } + return + } + + c.JSON(http.StatusOK, map[string]interface{}{ + "status": "success", + "message": fmt.Sprintf("Rule '%s' and %d associated metas deleted successfully", ruleName, affectedCount), + "rule_name": ruleName, + "deleted_metas": affectedCount, + }) +} + +// DeleteRuleMeta 删除单个规则元信息 +func (api *Api) DeleteRuleMeta(c *fox.Context) { + ruleName := c.Param("rule_name") + if ruleName == "" { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "Rule name is required", nil) + return + } + + var req model.DeleteAlertRuleMetaRequest + if err := c.ShouldBindJSON(&req); err != nil { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "Invalid request body: "+err.Error(), nil) + return + } + + err := api.alertService.DeleteRuleMeta(ruleName, req.Labels) + if err != nil { + if err.Error() == fmt.Sprintf("rule meta not found for rule '%s' with labels '%s'", ruleName, req.Labels) { + SendErrorResponse(c, http.StatusNotFound, model.ErrorCodeInvalidParameter, + err.Error(), nil) + } else { + SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, + "Failed to delete rule meta: "+err.Error(), nil) + } + return + } + + c.JSON(http.StatusOK, map[string]interface{}{ + "status": "success", + "message": "Rule meta deleted successfully", + "rule_name": ruleName, + "labels": req.Labels, + }) +} diff --git a/internal/prometheus_adapter/model/api.go b/internal/prometheus_adapter/model/api.go index 775bdd9..9138f68 100644 --- a/internal/prometheus_adapter/model/api.go +++ b/internal/prometheus_adapter/model/api.go @@ -68,3 +68,8 @@ type AlertRuleMetaUpdate struct { Labels string `json:"labels" binding:"required"` // 必填,用于唯一标识 Threshold float64 `json:"threshold"` } + +// DeleteAlertRuleMetaRequest 
删除告警规则元信息请求 +type DeleteAlertRuleMetaRequest struct { + Labels string `json:"labels" binding:"required"` // 必填,用于唯一标识要删除的元信息 +} diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go index 1c2290c..1a1d115 100644 --- a/internal/prometheus_adapter/service/alert_service.go +++ b/internal/prometheus_adapter/service/alert_service.go @@ -264,6 +264,70 @@ func (s *AlertService) GetAffectedMetas(ruleName string) int { return count } +// DeleteRule 删除单个规则模板及其所有关联的元信息 +func (s *AlertService) DeleteRule(ruleName string) error { + // 查找并删除规则模板 + ruleFound := false + for i, rule := range s.currentRules { + if rule.Name == ruleName { + // 从切片中删除规则 + s.currentRules = append(s.currentRules[:i], s.currentRules[i+1:]...) + ruleFound = true + break + } + } + + if !ruleFound { + return fmt.Errorf("rule '%s' not found", ruleName) + } + + // 删除所有关联的元信息 + deletedMetaCount := 0 + newMetas := []model.AlertRuleMeta{} + for _, meta := range s.currentRuleMetas { + if meta.AlertName != ruleName { + newMetas = append(newMetas, meta) + } else { + deletedMetaCount++ + } + } + s.currentRuleMetas = newMetas + + log.Info(). + Str("rule", ruleName). + Int("deleted_metas", deletedMetaCount). + Msg("Rule and associated metas deleted") + + // 重新生成并同步 + return s.regenerateAndSync() +} + +// DeleteRuleMeta 删除单个规则元信息 +func (s *AlertService) DeleteRuleMeta(ruleName, labels string) error { + // 查找并删除匹配的元信息 + found := false + for i, meta := range s.currentRuleMetas { + if meta.AlertName == ruleName && meta.Labels == labels { + // 从切片中删除元信息 + s.currentRuleMetas = append(s.currentRuleMetas[:i], s.currentRuleMetas[i+1:]...) + found = true + break + } + } + + if !found { + return fmt.Errorf("rule meta not found for rule '%s' with labels '%s'", ruleName, labels) + } + + log.Info(). + Str("rule", ruleName). + Str("labels", labels). 
+ Msg("Rule meta deleted") + + // 重新生成并同步 + return s.regenerateAndSync() +} + // ========== 内部核心方法 ========== // regenerateAndSync 使用当前内存中的规则和元信息重新生成Prometheus规则并同步 From 4c7dab9fef28f1f3a0c373edb1efde78aceccb31 Mon Sep 17 00:00:00 2001 From: dnj Date: Sun, 28 Sep 2025 11:26:29 +0800 Subject: [PATCH 17/18] =?UTF-8?q?refactor(prometheus=5Fadapter):=20?= =?UTF-8?q?=E5=90=88=E5=B9=B6=E5=B9=B6=E6=89=A9=E5=B1=95=E5=91=8A=E8=AD=A6?= =?UTF-8?q?=E8=A7=84=E5=88=99=E6=B5=8B=E8=AF=95=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../{test_alert_update.sh => test_alert.sh} | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) rename internal/prometheus_adapter/{test_alert_update.sh => test_alert.sh} (59%) diff --git a/internal/prometheus_adapter/test_alert_update.sh b/internal/prometheus_adapter/test_alert.sh similarity index 59% rename from internal/prometheus_adapter/test_alert_update.sh rename to internal/prometheus_adapter/test_alert.sh index f57fceb..cc74248 100755 --- a/internal/prometheus_adapter/test_alert_update.sh +++ b/internal/prometheus_adapter/test_alert.sh @@ -121,4 +121,68 @@ curl -X PUT ${BASE_URL}/v1/alert-rules-meta/high_memory_usage \ ] }' | jq . +sleep 2 + +# 5. 测试删除规则元信息 +echo -e "\n5. 删除规则元信息(删除 high_cpu_usage 的 storage-service)..." +curl -X DELETE ${BASE_URL}/v1/alert-rules-meta/high_cpu_usage \ + -H "Content-Type: application/json" \ + -d '{ + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}" + }' | jq . + +sleep 2 + +# 6. 测试删除不存在的规则元信息(应该返回404) +echo -e "\n6. 删除不存在的规则元信息(测试错误处理)..." +curl -X DELETE ${BASE_URL}/v1/alert-rules-meta/high_cpu_usage \ + -H "Content-Type: application/json" \ + -d '{ + "labels": "{\"service\":\"non-existent-service\",\"version\":\"1.0.0\"}" + }' | jq . + +sleep 2 + +# 7. 测试删除整个规则模板 +echo -e "\n7. 删除整个规则模板(删除 high_memory_usage 及其所有元信息)..." +curl -X DELETE ${BASE_URL}/v1/alert-rules/high_memory_usage | jq . + +sleep 2 + +# 8. 
测试删除不存在的规则模板(应该返回404) +echo -e "\n8. 删除不存在的规则模板(测试错误处理)..." +curl -X DELETE ${BASE_URL}/v1/alert-rules/non_existent_rule | jq . + +sleep 2 + +# 9. 验证删除结果 - 查看剩余的规则 +echo -e "\n9. 验证删除结果..." +echo "9.1 尝试更新已删除的规则模板(应该创建新规则):" +curl -X PUT ${BASE_URL}/v1/alert-rules/high_memory_usage \ + -H "Content-Type: application/json" \ + -d '{ + "description": "重新创建的内存告警规则", + "expr": "system_memory_usage_percent", + "op": ">", + "severity": "warning", + "watch_time": 300 + }' | jq . + +sleep 1 + +echo -e "\n9.2 查看当前 high_cpu_usage 的受影响元信息数量(应该只剩1个):" +curl -X PUT ${BASE_URL}/v1/alert-rules/high_cpu_usage \ + -H "Content-Type: application/json" \ + -d '{ + "description": "验证剩余元信息的规则更新" + }' | jq . + +echo -e "\n=== 删除功能测试完成 ===" +echo -e "\n测试总结:" +echo "✓ 测试了删除单个规则元信息" +echo "✓ 测试了删除不存在的规则元信息(错误处理)" +echo "✓ 测试了删除整个规则模板及其所有元信息" +echo "✓ 测试了删除不存在的规则模板(错误处理)" +echo "✓ 验证了删除操作的实际效果" + echo -e "\n=== 测试完成 ===" \ No newline at end of file From 4555e066326958d25da3ba29f99f56922867516f Mon Sep 17 00:00:00 2001 From: dnj Date: Mon, 29 Sep 2025 10:03:47 +0800 Subject: [PATCH 18/18] =?UTF-8?q?fix(prometheus=5Fadapter):=20=E6=94=B9?= =?UTF-8?q?=E8=BF=9B=E6=9C=8D=E5=8A=A1=E8=BF=9B=E7=A8=8B=E7=AE=A1=E7=90=86?= =?UTF-8?q?=E5=92=8C=E5=91=8A=E8=AD=A6=E8=A1=A8=E8=BE=BE=E5=BC=8F=E7=94=9F?= =?UTF-8?q?=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 在deploy.sh和build.sh中添加PID文件管理,优化服务启动和停止流程 - 修复alert_service.go中告警表达式生成的标签处理逻辑 - 使用%g代替%f格式化浮点数以避免科学计数法显示 --- .../service/alert_service.go | 55 ++++++++++---- scripts/prometheus_adapter/build.sh | 75 ++++++++++++++++--- scripts/prometheus_adapter/deploy.sh | 40 ++++++++-- 3 files changed, 137 insertions(+), 33 deletions(-) diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go index 1a1d115..fdd1b5c 100644 --- a/internal/prometheus_adapter/service/alert_service.go +++ b/internal/prometheus_adapter/service/alert_service.go @@ -402,7 
+402,7 @@ func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas [ // 构建注释 annotations := map[string]string{ "description": rule.Description, - "summary": fmt.Sprintf("%s %s %f", rule.Expr, rule.Op, meta.Threshold), + "summary": fmt.Sprintf("%s %s %g", rule.Expr, rule.Op, meta.Threshold), } // 计算for字段 @@ -475,29 +475,54 @@ func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertR if len(labelMatchers) > 0 { // 如果表达式包含{,说明已经有标签选择器 if strings.Contains(expr, "{") { - expr = strings.Replace(expr, "}", ","+strings.Join(labelMatchers, ",")+"}", 1) - } else { - // 在指标名后添加标签选择器 - // 查找第一个非字母数字下划线的字符 - metricEnd := 0 - for i, ch := range expr { - if !((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || - (ch >= '0' && ch <= '9') || ch == '_') { - metricEnd = i - break + // 查找第一个 { 后的内容 + start := strings.Index(expr, "{") + end := strings.Index(expr[start:], "}") + if end != -1 { + end += start + existingLabels := strings.TrimSpace(expr[start+1 : end]) + if existingLabels == "" { + // 空的标签选择器,直接替换 + expr = expr[:start+1] + strings.Join(labelMatchers, ",") + expr[end:] + } else { + // 已有标签,需要检查是否重复 + existingLabelMap := make(map[string]bool) + // 解析现有标签 + labelPairs := strings.Split(existingLabels, ",") + for _, pair := range labelPairs { + if strings.Contains(pair, "=") { + key := strings.TrimSpace(strings.Split(pair, "=")[0]) + if key != "" { + existingLabelMap[key] = true + } + } + } + // 只添加不重复的标签 + newLabels := []string{} + for k, v := range labels { + if !existingLabelMap[k] && k != "" && v != "" { + newLabels = append(newLabels, fmt.Sprintf(`%s="%s"`, k, v)) + } + } + if len(newLabels) > 0 { + expr = expr[:end] + "," + strings.Join(newLabels, ",") + expr[end:] + } } } - if metricEnd == 0 { - metricEnd = len(expr) + } else { + // 对于没有标签的简单指标,只处理单个单词的情况 + // 如果表达式包含空格、括号等,不进行标签注入 + if !strings.ContainsAny(expr, " ()[]{}") { + // 只有单个指标名,可以安全添加标签 + expr = expr + "{" + strings.Join(labelMatchers, ",") + "}" } - expr = 
expr[:metricEnd] + "{" + strings.Join(labelMatchers, ",") + "}" + expr[metricEnd:] } } } // 添加比较操作符和阈值 if meta.Threshold != 0 { - expr = fmt.Sprintf("%s %s %f", expr, rule.Op, meta.Threshold) + expr = fmt.Sprintf("%s %s %g", expr, rule.Op, meta.Threshold) } return expr diff --git a/scripts/prometheus_adapter/build.sh b/scripts/prometheus_adapter/build.sh index 06d7213..d75f0b8 100755 --- a/scripts/prometheus_adapter/build.sh +++ b/scripts/prometheus_adapter/build.sh @@ -118,6 +118,8 @@ cat > "$BUILD_DIR/start.sh" << 'EOF' SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) BIN_PATH="$SCRIPT_DIR/bin/prometheus_adapter" CONFIG_FILE="$SCRIPT_DIR/config/prometheus_adapter.yml" +PID_FILE="$SCRIPT_DIR/prometheus_adapter.pid" +LOG_FILE="$SCRIPT_DIR/prometheus_adapter.log" # 检查二进制文件 if [ ! -f "$BIN_PATH" ]; then @@ -125,6 +127,17 @@ if [ ! -f "$BIN_PATH" ]; then exit 1 fi +# 检查是否已在运行 +if [ -f "$PID_FILE" ]; then + PID=$(cat "$PID_FILE") + if kill -0 "$PID" 2>/dev/null; then + echo "Prometheus Adapter已在运行 (PID: $PID)" + exit 1 + else + rm -f "$PID_FILE" + fi +fi + # 检查配置文件 if [ -f "$CONFIG_FILE" ]; then echo "使用配置文件: $CONFIG_FILE" @@ -140,11 +153,23 @@ fi echo "启动 Prometheus Adapter..." -# 切换到 bin 目录,以便程序能正确找到相对路径的配置文件 +# 切换到脚本目录 cd "$SCRIPT_DIR" -# 启动服务 -exec "$BIN_PATH" +# 后台启动服务 +nohup "$BIN_PATH" > "$LOG_FILE" 2>&1 & +PID=$! 
+ +# 保存PID +echo $PID > "$PID_FILE" + +echo "Prometheus Adapter已启动" +echo "PID: $PID" +echo "日志文件: $LOG_FILE" +echo "PID文件: $PID_FILE" +echo "" +echo "查看日志: tail -f $LOG_FILE" +echo "停止服务: ./stop.sh" EOF chmod +x "$BUILD_DIR/start.sh" @@ -155,10 +180,29 @@ cat > "$BUILD_DIR/stop.sh" << 'EOF' # Prometheus Adapter 停止脚本 +# 获取脚本所在目录 +SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) +PID_FILE="$SCRIPT_DIR/prometheus_adapter.pid" APP_NAME="prometheus_adapter" -# 查找进程 -PID=$(ps aux | grep -v grep | grep "$APP_NAME" | awk '{print $2}') +# 优先从PID文件读取 +if [ -f "$PID_FILE" ]; then + PID=$(cat "$PID_FILE" 2>/dev/null) + if [ -n "$PID" ] && kill -0 "$PID" 2>/dev/null; then + echo "从PID文件获取进程ID: $PID" + else + echo "PID文件中的进程已不存在,清理PID文件" + rm -f "$PID_FILE" + PID="" + fi +else + PID="" +fi + +# 如果PID文件不存在或进程已死,通过进程名查找 +if [ -z "$PID" ]; then + PID=$(ps aux | grep -v grep | grep "$APP_NAME" | awk '{print $2}') +fi if [ -z "$PID" ]; then echo "没有找到运行中的 $APP_NAME 进程" @@ -166,15 +210,24 @@ if [ -z "$PID" ]; then fi echo "停止 $APP_NAME (PID: $PID)..." -kill -TERM $PID +kill -TERM $PID 2>/dev/null || true # 等待进程退出 -sleep 2 +count=0 +while [ $count -lt 10 ] && ps -p "$PID" > /dev/null 2>&1; do + sleep 1 + count=$((count + 1)) +done + +# 检查是否已退出 +if ps -p "$PID" > /dev/null 2>&1; then + echo "强制停止 $APP_NAME..." + kill -KILL "$PID" 2>/dev/null || true +fi -# 检查是否还在运行 -if ps -p $PID > /dev/null 2>&1; then - echo "强制停止进程..." 
- kill -KILL $PID +# 清理PID文件 +if [ -f "$PID_FILE" ]; then + rm -f "$PID_FILE" fi echo "$APP_NAME 已停止" diff --git a/scripts/prometheus_adapter/deploy.sh b/scripts/prometheus_adapter/deploy.sh index dc56736..6f53a6f 100755 --- a/scripts/prometheus_adapter/deploy.sh +++ b/scripts/prometheus_adapter/deploy.sh @@ -132,6 +132,16 @@ fi # 检查是否有运行中的服务 check_running_service() { + # 优先从PID文件读取 + if [ -f "$DEPLOY_DIR/prometheus_adapter.pid" ]; then + local pid=$(cat "$DEPLOY_DIR/prometheus_adapter.pid" 2>/dev/null) + if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then + echo "$pid" + return + fi + fi + + # 如果PID文件不存在或进程已死,通过进程名查找 local pid=$(ps aux | grep -v grep | grep "prometheus_adapter" | grep -v "$0" | awk '{print $2}') if [ -n "$pid" ]; then echo "$pid" @@ -158,6 +168,11 @@ stop_service() { kill -KILL "$pid" 2>/dev/null || true fi + # 清理PID文件 + if [ -f "$DEPLOY_DIR/prometheus_adapter.pid" ]; then + rm -f "$DEPLOY_DIR/prometheus_adapter.pid" + fi + log_info "服务已停止" fi } @@ -340,17 +355,27 @@ if [ "$START_SERVICE" = true ] || [ "$RESTART_SERVICE" = true ]; then # 启动服务 cd "$DEPLOY_DIR" - nohup ./start.sh > prometheus_adapter.log 2>&1 & + + # 直接启动二进制文件而不是通过start.sh脚本 + nohup ./bin/prometheus_adapter > prometheus_adapter.log 2>&1 & + PID=$! + + # 保存PID到文件 + echo $PID > prometheus_adapter.pid + + log_info "服务已启动 (PID: $PID)" + echo "PID文件: $DEPLOY_DIR/prometheus_adapter.pid" + echo "日志文件: $DEPLOY_DIR/prometheus_adapter.log" # 等待服务启动 sleep 2 # 检查是否启动成功 - NEW_PID=$(check_running_service) - if [ -n "$NEW_PID" ]; then - log_info "服务已启动 (PID: $NEW_PID)" + if kill -0 "$PID" 2>/dev/null; then + log_info "服务启动成功,正在运行" echo "" echo "查看日志: tail -f $DEPLOY_DIR/prometheus_adapter.log" + echo "停止服务: kill \$(cat $DEPLOY_DIR/prometheus_adapter.pid)" else log_error "服务启动失败,请检查日志" exit 1 @@ -359,10 +384,11 @@ else echo "" echo "手动启动服务:" echo " cd $DEPLOY_DIR" - echo " ./start.sh" + echo " nohup ./bin/prometheus_adapter > prometheus_adapter.log 2>&1 &" + echo " echo \$! 
> prometheus_adapter.pid" echo "" - echo "或使用后台模式:" - echo " nohup ./start.sh > prometheus_adapter.log 2>&1 &" + echo "停止服务:" + echo " kill \$(cat prometheus_adapter.pid)" fi log_info "部署完成!" \ No newline at end of file