diff --git a/cmd/prometheus_adapter/main.go b/cmd/prometheus_adapter/main.go new file mode 100644 index 0000000..696018b --- /dev/null +++ b/cmd/prometheus_adapter/main.go @@ -0,0 +1,91 @@ +package main + +import ( + "context" + "os" + "os/signal" + "syscall" + "time" + + "github.com/fox-gonic/fox" + "github.com/qiniu/zeroops/internal/config" + prometheusadapter "github.com/qiniu/zeroops/internal/prometheus_adapter" + "github.com/rs/zerolog" + "github.com/rs/zerolog/log" +) + +func main() { + // 配置日志 + log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr}) + + log.Info().Msg("Starting Prometheus Adapter server") + + // 加载 Prometheus Adapter 配置 + adapter, err := prometheusadapter.NewPrometheusAdapterServer(&config.Config{}) + if err != nil { + log.Fatal().Err(err).Msg("Failed to create Prometheus Adapter server") + } + + // 获取 Prometheus Adapter 内部配置的绑定地址 + bindAddr := ":9999" // 默认端口 + if adapter.GetBindAddr() != "" { + bindAddr = adapter.GetBindAddr() + } + + // 如果有环境变量,优先使用环境变量的端口 + if port := os.Getenv("ADAPTER_PORT"); port != "" { + bindAddr = ":" + port + } + + // 更新配置(虽然已经创建了 adapter,但需要端口信息用于启动服务器) + cfg := &config.Config{ + Server: config.ServerConfig{ + BindAddr: bindAddr, + }, + } + + // 创建路由 + router := fox.New() + + // 启动 API + if err := adapter.UseApi(router); err != nil { + log.Fatal().Err(err).Msg("Failed to setup API routes") + } + + // 设置信号处理 + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + + // 创建一个用于优雅关闭的context + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // 在goroutine中启动服务器 + serverErr := make(chan error, 1) + go func() { + log.Info().Msgf("Starting Prometheus Adapter on %s", cfg.Server.BindAddr) + if err := router.Run(cfg.Server.BindAddr); err != nil { + serverErr <- err + } + }() + + // 等待信号或服务器错误 + select { + case sig := <-sigChan: + log.Info().Msgf("Received signal %s, shutting down...", sig) + + // 创建超时context + shutdownCtx, shutdownCancel := 
context.WithTimeout(ctx, 10*time.Second) + defer shutdownCancel() + + // 调用adapter的Shutdown方法 + if err := adapter.Close(shutdownCtx); err != nil { + log.Error().Err(err).Msg("Error during shutdown") + } + + log.Info().Msg("Shutdown complete") + + case err := <-serverErr: + log.Fatal().Err(err).Msg("Server error") + } +} diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md new file mode 100644 index 0000000..0167488 --- /dev/null +++ b/docs/prometheus_adapter/README.md @@ -0,0 +1,341 @@ +# Prometheus Adapter + +基于 Prometheus 的指标查询与告警规则同步适配层,提供统一的 REST API: +- 按服务与版本查询任意 Prometheus 指标 +- 同步告警规则到 Prometheus 并触发重载 + +目录 +- 概述 +- 快速开始 +- 架构设计 +- API 参考 + - 指标查询 + - 告警规则管理 +- Alertmanager 集成 +- 支持的服务 +- 错误码 + +## 概述 + +Prometheus Adapter 作为内部系统与 Prometheus 之间的适配层: +- 向上暴露简洁、统一的 HTTP API +- 向下负责 PromQL 查询与 Prometheus 规则文件管理 + +## 架构设计 + +- 分层设计 + - API 层(`internal/prometheus_adapter/api`):HTTP 请求处理、参数校验、错误格式化 + - Service 层(`internal/prometheus_adapter/service`):业务逻辑、指标与服务存在性校验、数据装配 + - Client 层(`internal/prometheus_adapter/client`):与 Prometheus API 交互、PromQL 构建、结果转换 + - Model 层(`internal/prometheus_adapter/model`):统一数据模型、错误类型、常量 + +- 目录结构 +``` +internal/prometheus_adapter/ +├── server.go # 服务器主入口,负责初始化和生命周期管理 +├── api/ # API 层,处理 HTTP 请求 +│ ├── api.go # API 基础结构和初始化 +│ ├── metric_api.go # 指标相关的 API 处理器 +│ └── alert_api.go # 告警规则管理 API 处理器 +├── service/ # 业务逻辑层 +│ ├── metric_service.go # 指标查询服务实现 +│ └── alert_service.go # 告警规则同步服务实现 +├── client/ # Prometheus 客户端 +│ └── prometheus_client.go # 封装 Prometheus API 调用 +└── model/ # 数据模型 + ├── api.go # API 请求响应模型 + ├── alert.go # 告警规则模型 + ├── constants.go # 常量定义(错误码等) + ├── error.go # 错误类型定义 + └── prometheus.go # Prometheus 规则文件模型 +``` + +- 核心组件 + - PrometheusAdapterServer:初始化客户端与路由,管理服务生命周期 + - PrometheusClient:`QueryRange`、`GetAvailableMetrics`、`CheckMetricExists`、`CheckServiceExists`、`BuildQuery` + - MetricService:参数校验、动态指标发现、错误转换 + - AlertService:告警规则同步、Prometheus 规则文件生成、配置重载 + +## API + +### 
指标查询 + +1) 获取可用指标列表 +- 方法与路径:`GET /v1/metrics` +- 用途:列出当前可查询的所有指标名称 +- 响应示例: +``` +{ + "metrics": [ + "system_cpu_usage_percent", + "system_memory_usage_percent", + "system_disk_usage_percent", + "system_network_qps", + "system_machine_online_status", + "http_latency" + ] +} +``` + +2) 查询指定服务的指标时间序列 +- 方法与路径:`GET /v1/metrics/{service}/{metric}` +- 路径参数: + - `service`:服务名(必填) + - `metric`:指标名(必填,需为 Prometheus 中存在的指标) +- 查询参数: + - `version`:服务版本(选填;不传则返回所有版本) + - `start`:开始时间(选填,RFC3339) + - `end`:结束时间(选填,RFC3339) + - `step`:步长(选填,如 `1m`、`5m`、`1h`;默认 `1m`) +- 请求示例: + - `GET /v1/metrics/metadata-service/system_cpu_usage_percent?version=1.0.0` + - `GET /v1/metrics/storage-service/system_memory_usage_percent?version=1.0.0` + - `GET /v1/metrics/storage-service/http_latency?version=1.0.0` + - `GET /v1/metrics/storage-service/system_network_qps?version=1.0.0` +- 成功响应示例: +``` +{ + "service": "metadata-service", + "version": "1.0.0", + "metric": "system_cpu_usage_percent", + "data": [ + { "timestamp": "2024-01-01T00:00:00Z", "value": 45.2 }, + { "timestamp": "2024-01-01T00:01:00Z", "value": 48.5 } + ] +} +``` +- 错误响应示例: + - 指标不存在(404): +``` +{ + "error": { + "code": "METRIC_NOT_FOUND", + "message": "指标 'invalid_metric' 不存在", + "metric": "invalid_metric" + } +} +``` + - 服务不存在(404): +``` +{ + "error": { + "code": "SERVICE_NOT_FOUND", + "message": "服务 'invalid-service' 不存在", + "service": "invalid-service" + } +} +``` + - 参数错误(400): +``` +{ + "error": { + "code": "INVALID_PARAMETER", + "message": "参数 'start' 格式错误: invalid-time", + "parameter": "start", + "value": "invalid-time" + } +} +``` + +### 告警规则管理 + +#### 1. 
更新单个规则模板 +- 方法与路径:`PUT /v1/alert-rules/:rule_name` +- 功能:更新指定的告警规则模板,系统会自动查找所有使用该规则的元信息并重新生成 Prometheus 规则 +- 路径参数: + - `rule_name`:规则名称(如 `high_cpu_usage`) +- 请求体示例: +```json +{ + "description": "CPU使用率异常告警(更新后)", + "expr": "avg(system_cpu_usage_percent)", + "op": ">=", + "severity": "critical", + "watch_time": 300 +} +``` +- 响应示例: +```json +{ + "status": "success", + "message": "Rule 'high_cpu_usage' updated and synced to Prometheus", + "affected_metas": 3 // 影响的元信息数量 +} +``` + +#### 2. 批量更新规则元信息 +- 方法与路径:`PUT /v1/alert-rules-meta/:rule_name` +- 功能:批量更新指定规则的元信息,系统会根据对应的规则模板重新生成 Prometheus 规则 +- 路径参数: + - `rule_name`:规则名称(如 `high_cpu_usage`) +- 请求体示例: +```json +{ + "metas": [ + { + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}", // 必填,用于唯一标识 + "threshold": 85 + }, + { + "labels": "{\"service\":\"storage-service\",\"version\":\"2.0.0\"}", // 必填,用于唯一标识 + "threshold": 90 + } + ] +} +``` +- 响应示例: +```json +{ + "status": "success", + "message": "Rule metas updated and synced to Prometheus", + "rule_name": "high_cpu_usage", + "updated_count": 2 +} +``` + +#### 3. 删除规则模板 +- 方法与路径:`DELETE /v1/alert-rules/:rule_name` +- 功能:删除指定的告警规则模板及其所有关联的元信息 +- 路径参数: + - `rule_name`:规则名称(如 `high_cpu_usage`) +- 响应示例: +```json +{ + "status": "success", + "message": "Rule 'high_cpu_usage' and 3 associated metas deleted successfully", + "rule_name": "high_cpu_usage", + "deleted_metas": 3 +} +``` +- 错误响应示例(规则不存在): +```json +{ + "error": { + "code": "INVALID_PARAMETER", + "message": "rule 'invalid_rule' not found" + } +} +``` + +#### 4. 
删除规则元信息 +- 方法与路径:`DELETE /v1/alert-rules-meta/:rule_name` +- 功能:删除指定规则下的特定元信息(通过 labels 唯一标识) +- 路径参数: + - `rule_name`:规则名称(如 `high_cpu_usage`) +- 请求体示例: +```json +{ + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}" +} +``` +- 响应示例: +```json +{ + "status": "success", + "message": "Rule meta deleted successfully", + "rule_name": "high_cpu_usage", + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}" +} +``` +- 错误响应示例(元信息不存在): +```json +{ + "error": { + "code": "INVALID_PARAMETER", + "message": "rule meta not found for rule 'high_cpu_usage' with labels '{\"service\":\"invalid-service\"}'" + } +} +``` + +#### 规则生成机制 +- **规则模板与元信息关联**:通过 `alert_name` 字段关联 + - `AlertRule.name` = `AlertRuleMeta.alert_name` +- **元信息唯一标识**:通过 `alert_name` + `labels` 的组合唯一确定一个元信息记录 +- **Prometheus 告警生成**: + - 所有基于同一规则模板的告警使用相同的 `alert` 名称(即规则模板的 `name`) + - 通过 `labels` 区分不同的服务实例 + +#### 字段说明 +- **AlertRule(规则模板)**: + - `name`:规则名称,作为 Prometheus 的 alert 名称 + - `description`:规则描述,可读的 title + - `expr`:PromQL 表达式,如 `sum(apitime) by (service, version)`,可包含时间范围 + - `op`:比较操作符(`>`, `<`, `=`, `!=`) + - `severity`:告警等级,通常进入告警的 labels.severity + - `watch_time`:持续时间(秒),对应 Prometheus 的 `for` 字段 +- **AlertRuleMeta(元信息)**: + - `alert_name`:关联的规则名称(对应 alert_rules.name) + - `labels`:JSON 格式的标签,用于筛选特定服务(如 `{"service":"s3","version":"v1"}`) + - `threshold`:告警阈值 + +#### 增量更新说明 +- **增量更新**:新接口支持增量更新,只需传入需要修改的字段 +- **自动匹配**: + - 更新规则模板时,系统自动查找所有 `alert_name` 匹配的元信息并重新生成规则 + - 更新元信息时,系统根据 `alert_name` + `labels` 查找并更新对应的元信息 +- **缓存机制**:系统在内存中缓存当前的规则和元信息,支持快速增量更新 + +## 告警接收 Webhook + +- 目标:实现自定义 webhook 服务,主动从 Prometheus 拉取告警并转发到监控告警模块 +- 实现方式: + - 通过 Prometheus Alerts API 获取告警 + - 定期轮询 Prometheus 的 `/api/v1/alerts` 端点 + - 将获取的告警格式化后 POST 到监控告警模块 + +### Webhook 服务架构 +``` +┌─────────────────┐ +│ Prometheus │ +│ (告警规则引擎) │ +└────────┬────────┘ + │ Pull (轮询) + │ GET /api/v1/alerts + ▼ +┌─────────────────┐ +│ Alert Webhook │ +│ (自定义服务) │ +└────────┬────────┘ + │ Push + │ POST 
/v1/integrations/alertmanager/webhook + ▼ +┌─────────────────┐ +│ 监控告警模块 │ +│ (告警处理中心) │ +└─────────────────┘ +``` + +### 实现细节 +- **轮询机制**: + - 每 10 秒从 Prometheus 拉取一次活跃告警 + - 通过 `GET http://prometheus:9090/api/v1/alerts` 获取告警列表 + - 维护告警状态缓存,避免重复推送 + +- **告警格式转换**: + - 将 Prometheus 告警格式转换为监控告警模块所需格式 + - 包含告警名称、标签、严重程度、开始时间等信息 + - 支持告警恢复状态通知 + +- **推送目标**: + - URL: `http://alert-module:8080/v1/integrations/alertmanager/webhook` + - Method: POST + - Content-Type: application/json + +## 支持的服务 + +当前 mock/s3 环境下: +- `metadata-service` +- `storage-service` +- `queue-service` +- `third-party-service` +- `mock-error-service` + +所有服务的版本信息通过标签 `service_version` 暴露。 + +## 错误码 + +- `METRIC_NOT_FOUND`:指标不存在 +- `SERVICE_NOT_FOUND`:服务不存在 +- `INVALID_PARAMETER`:请求参数不合法(如时间格式不正确) +- `INTERNAL_ERROR`:内部服务器错误 +- `PROMETHEUS_ERROR`:Prometheus 查询失败 diff --git a/go.mod b/go.mod index 6094f9c..94b9643 100644 --- a/go.mod +++ b/go.mod @@ -7,14 +7,17 @@ require ( github.com/google/uuid v1.6.0 github.com/jackc/pgx/v5 v5.5.5 github.com/lib/pq v1.10.9 + github.com/prometheus/client_golang v1.23.2 + github.com/prometheus/common v0.66.1 github.com/redis/go-redis/v9 v9.5.1 github.com/rs/zerolog v1.34.0 + gopkg.in/yaml.v3 v3.0.1 ) require ( github.com/bytedance/sonic v1.13.3 // indirect github.com/bytedance/sonic/loader v0.2.4 // indirect - github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cloudwego/base64x v0.1.5 // indirect github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/gabriel-vasile/mimetype v1.4.9 // indirect @@ -39,14 +42,15 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/natefinch/lumberjack v2.0.0+incompatible // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect + github.com/prometheus/client_model v0.6.2 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.3.0 // indirect + go.yaml.in/yaml/v2 v2.4.2 // 
indirect golang.org/x/arch v0.18.0 // indirect - golang.org/x/crypto v0.39.0 // indirect - golang.org/x/net v0.41.0 // indirect - golang.org/x/sync v0.15.0 // indirect - golang.org/x/sys v0.33.0 // indirect - golang.org/x/text v0.26.0 // indirect - google.golang.org/protobuf v1.36.6 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect + golang.org/x/crypto v0.41.0 // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/sync v0.16.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/text v0.28.0 // indirect + google.golang.org/protobuf v1.36.8 // indirect ) diff --git a/go.sum b/go.sum index 04e9b56..2c92f4d 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,7 @@ github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak= github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= @@ -9,8 +11,8 @@ github.com/bytedance/sonic v1.13.3/go.mod h1:o68xyaF9u2gvVBuGHPlUVCy+ZfmNNO5ETf1 github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU= github.com/bytedance/sonic/loader v0.2.4 h1:ZWCw4stuXUsn1/+zQDqeE7JKP+QO47tz7QCNan80NzY= github.com/bytedance/sonic/loader v0.2.4/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI= -github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= -github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod 
h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cloudwego/base64x v0.1.5 h1:XPciSp1xaq2VCSt6lF0phncD4koWyULpl5bUxbfCyP4= github.com/cloudwego/base64x v0.1.5/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w= github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY= @@ -56,6 +58,8 @@ github.com/jackc/pgx/v5 v5.5.5 h1:amBjrZVmksIdNjxGW/IiIMzxMKZFelXbUoPNb+8sjQw= github.com/jackc/pgx/v5 v5.5.5/go.mod h1:ez9gk+OAat140fv9ErkZDYFWmXLfV+++K0uAOiwgm1A= github.com/jackc/puddle/v2 v2.2.1 h1:RhxXJtFG022u4ibrCSMSiu5aOq1i77R3OHKNJj77OAk= github.com/jackc/puddle/v2 v2.2.1/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= +github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= +github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= @@ -84,6 +88,10 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= 
github.com/natefinch/lumberjack v2.0.0+incompatible h1:4QJd3OLAMgj7ph+yZTuX13Ld4UpgHp07nNdFX7mqFfM= github.com/natefinch/lumberjack v2.0.0+incompatible/go.mod h1:Wi9p2TTF5DG5oU+6YfsmYQpsTIOm0B1VNzQg9Mw6nPk= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= @@ -92,6 +100,14 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/redis/go-redis/v9 v9.5.1 h1:H1X4D3yHPaYrkL5X06Wh6xNVM/pX0Ft4RV0vMGvLBh8= github.com/redis/go-redis/v9 v9.5.1/go.mod h1:hdY0cQFCN4fnSYT6TkisLufl/4W5UIXyv0b/CLO2V2M= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= @@ -113,23 +129,27 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08= github.com/ugorji/go/codec v1.3.0 
h1:Qd2W2sQawAfG8XSvzwhBeoGq71zXOC/Q1E9y/wUcsUA= github.com/ugorji/go/codec v1.3.0/go.mod h1:pRBVtBSKl77K30Bv8R2P+cLSGaTtex6fsA2Wjqmfxj4= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= golang.org/x/arch v0.18.0 h1:WN9poc33zL4AzGxqf8VtpKUnGvMi8O9lhNyBMF/85qc= golang.org/x/arch v0.18.0/go.mod h1:bdwinDaKcfZUGpH09BB7ZmOfhalA8lQdzl62l8gGWsk= -golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= -golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= -golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= -golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= +golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/text 
v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= -golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= diff --git a/internal/prometheus_adapter/api/alert_api.go b/internal/prometheus_adapter/api/alert_api.go new file mode 100644 index 0000000..786de6f --- /dev/null +++ b/internal/prometheus_adapter/api/alert_api.go @@ -0,0 +1,179 @@ +package api + +import ( + "fmt" + "net/http" + + "github.com/fox-gonic/fox" + "github.com/qiniu/zeroops/internal/prometheus_adapter/model" +) + +// setupAlertRouters 设置告警相关路由 +func (api *Api) setupAlertRouters(router *fox.Engine) { + router.PUT("/v1/alert-rules/:rule_name", api.UpdateRule) + router.PUT("/v1/alert-rules-meta/:rule_name", api.UpdateRuleMetas) + router.DELETE("/v1/alert-rules/:rule_name", api.DeleteRule) + router.DELETE("/v1/alert-rules-meta/:rule_name", api.DeleteRuleMeta) +} + +// UpdateRule 更新单个规则模板 +// 只更新指定的规则,系统会自动查找所有使用该规则的元信息并重新生成 +func (api *Api) UpdateRule(c *fox.Context) { + ruleName := c.Param("rule_name") + if 
ruleName == "" { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "Rule name is required", nil) + return + } + + var req model.UpdateAlertRuleRequest + if err := c.ShouldBindJSON(&req); err != nil { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "Invalid request body: "+err.Error(), nil) + return + } + + // 构建完整的规则对象 + rule := model.AlertRule{ + Name: ruleName, + Description: req.Description, + Expr: req.Expr, + Op: req.Op, + Severity: req.Severity, + WatchTime: req.WatchTime, + } + + err := api.alertService.UpdateRule(rule) + if err != nil { + SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, + "Failed to update rule: "+err.Error(), nil) + return + } + + // 获取受影响的元信息数量 + affectedCount := api.alertService.GetAffectedMetas(ruleName) + + c.JSON(http.StatusOK, map[string]interface{}{ + "status": "success", + "message": fmt.Sprintf("Rule '%s' updated and synced to Prometheus", ruleName), + "affected_metas": affectedCount, + }) +} + +// UpdateRuleMetas 批量更新规则元信息 +// 通过 rule_name + labels 唯一确定一个元信息记录 +func (api *Api) UpdateRuleMetas(c *fox.Context) { + ruleName := c.Param("rule_name") + if ruleName == "" { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "Rule name is required", nil) + return + } + + var req model.UpdateAlertRuleMetaRequest + if err := c.ShouldBindJSON(&req); err != nil { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "Invalid request body: "+err.Error(), nil) + return + } + + if len(req.Metas) == 0 { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "At least one meta update is required", nil) + return + } + + // 批量更新元信息 + updatedCount := 0 + for _, metaUpdate := range req.Metas { + // 构建完整的元信息对象 + meta := model.AlertRuleMeta{ + AlertName: ruleName, + Labels: metaUpdate.Labels, + Threshold: metaUpdate.Threshold, + } + + err := 
api.alertService.UpdateRuleMeta(meta) + if err != nil { + SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, + fmt.Sprintf("Failed to update rule meta: %v", err), nil) + return + } + updatedCount++ + } + + c.JSON(http.StatusOK, map[string]interface{}{ + "status": "success", + "message": "Rule metas updated and synced to Prometheus", + "rule_name": ruleName, + "updated_count": updatedCount, + }) +} + +// DeleteRule 删除单个规则模板及其所有关联的元信息 +func (api *Api) DeleteRule(c *fox.Context) { + ruleName := c.Param("rule_name") + if ruleName == "" { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "Rule name is required", nil) + return + } + + // 获取受影响的元信息数量 + affectedCount := api.alertService.GetAffectedMetas(ruleName) + + err := api.alertService.DeleteRule(ruleName) + if err != nil { + if err.Error() == fmt.Sprintf("rule '%s' not found", ruleName) { + SendErrorResponse(c, http.StatusNotFound, model.ErrorCodeInvalidParameter, + err.Error(), nil) + } else { + SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, + "Failed to delete rule: "+err.Error(), nil) + } + return + } + + c.JSON(http.StatusOK, map[string]interface{}{ + "status": "success", + "message": fmt.Sprintf("Rule '%s' and %d associated metas deleted successfully", ruleName, affectedCount), + "rule_name": ruleName, + "deleted_metas": affectedCount, + }) +} + +// DeleteRuleMeta 删除单个规则元信息 +func (api *Api) DeleteRuleMeta(c *fox.Context) { + ruleName := c.Param("rule_name") + if ruleName == "" { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "Rule name is required", nil) + return + } + + var req model.DeleteAlertRuleMetaRequest + if err := c.ShouldBindJSON(&req); err != nil { + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + "Invalid request body: "+err.Error(), nil) + return + } + + err := api.alertService.DeleteRuleMeta(ruleName, req.Labels) + if err != nil { + if 
err.Error() == fmt.Sprintf("rule meta not found for rule '%s' with labels '%s'", ruleName, req.Labels) { + SendErrorResponse(c, http.StatusNotFound, model.ErrorCodeInvalidParameter, + err.Error(), nil) + } else { + SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, + "Failed to delete rule meta: "+err.Error(), nil) + } + return + } + + c.JSON(http.StatusOK, map[string]interface{}{ + "status": "success", + "message": "Rule meta deleted successfully", + "rule_name": ruleName, + "labels": req.Labels, + }) +} diff --git a/internal/prometheus_adapter/api/alertmanager_api.go b/internal/prometheus_adapter/api/alertmanager_api.go new file mode 100644 index 0000000..a50f2c9 --- /dev/null +++ b/internal/prometheus_adapter/api/alertmanager_api.go @@ -0,0 +1,25 @@ +package api + +import ( + "github.com/fox-gonic/fox" + "github.com/qiniu/zeroops/internal/prometheus_adapter/service" +) + +// setupAlertmanagerRouters 设置 Alertmanager 兼容路由 +// 这些路由模拟 Alertmanager API,接收 Prometheus 的告警推送 +func (api *Api) setupAlertmanagerRouters(router *fox.Engine, alertmanagerService *service.AlertmanagerService) { + // Alertmanager API v2 告警接收端点 + router.POST("/api/v2/alerts", func(c *fox.Context) { + alertmanagerService.HandleAlertsV2(c.Writer, c.Request) + }) + + // 健康检查端点 + router.GET("/-/healthy", func(c *fox.Context) { + alertmanagerService.HandleHealthCheck(c.Writer, c.Request) + }) + + // 就绪检查端点 + router.GET("/-/ready", func(c *fox.Context) { + alertmanagerService.HandleReadyCheck(c.Writer, c.Request) + }) +} diff --git a/internal/prometheus_adapter/api/api.go b/internal/prometheus_adapter/api/api.go new file mode 100644 index 0000000..351544d --- /dev/null +++ b/internal/prometheus_adapter/api/api.go @@ -0,0 +1,122 @@ +package api + +import ( + "fmt" + "time" + + "github.com/fox-gonic/fox" + "github.com/qiniu/zeroops/internal/prometheus_adapter/model" + "github.com/qiniu/zeroops/internal/prometheus_adapter/service" + "github.com/rs/zerolog/log" +) + +// Api 
// ParseTimeRange parses the optional RFC3339 start and end query parameters
// into a time range.
//
// Defaults:
//   - an empty endStr means "now"
//   - an empty startStr means one hour before the end of the range
//
// The default start is anchored to the end of the range rather than to the
// current time. Otherwise a request that only supplies an end more than one
// hour in the past would always be rejected as "end before start".
//
// It returns an error when either value is not valid RFC3339 (start is
// checked first) or when end lies before start. On error both returned times
// are the zero value.
func ParseTimeRange(startStr, endStr string) (time.Time, time.Time, error) {
	var start, end time.Time
	var err error

	if startStr != "" {
		start, err = time.Parse(time.RFC3339, startStr)
		if err != nil {
			return time.Time{}, time.Time{}, fmt.Errorf("invalid start time format: %w", err)
		}
	}

	if endStr == "" {
		end = time.Now()
	} else {
		end, err = time.Parse(time.RFC3339, endStr)
		if err != nil {
			return time.Time{}, time.Time{}, fmt.Errorf("invalid end time format: %w", err)
		}
	}

	// Apply the default only after end is known so the window is always
	// [end-1h, end] when the caller omits start.
	if startStr == "" {
		start = end.Add(-time.Hour)
	}

	if end.Before(start) {
		return time.Time{}, time.Time{}, fmt.Errorf("end time must be after start time")
	}

	return start, end, nil
}
c.Query("start") + endStr := c.Query("end") + stepStr := c.Query("step") + + // 解析时间参数 + start, end, err := ParseTimeRange(startStr, endStr) + if err != nil { + log.Error().Err(err).Msg("invalid time parameters") + SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter, + fmt.Sprintf("参数 'start/end' 格式错误: %s", err.Error()), nil) + return + } + + // 解析步长参数 + step := ParseStep(stepStr) + + // 查询指标 + response, err := api.metricService.QueryMetric(ctx, serviceName, metricName, version, start, end, step) + if err != nil { + api.handleQueryError(c, err, serviceName, metricName) + return + } + + c.JSON(http.StatusOK, response) +} + +// handleQueryError 处理查询错误 +func (api *Api) handleQueryError(c *fox.Context, err error, service, metric string) { + var serviceNotFound *model.ServiceNotFoundError + var metricNotFound *model.MetricNotFoundError + var prometheusError *model.PrometheusError + + switch { + case errors.As(err, &serviceNotFound): + log.Error().Err(err).Str("service", service).Msg("service not found") + SendErrorResponse(c, http.StatusNotFound, model.ErrorCodeServiceNotFound, + err.Error(), map[string]string{"service": service}) + + case errors.As(err, &metricNotFound): + log.Error().Err(err).Str("metric", metric).Msg("metric not found") + SendErrorResponse(c, http.StatusNotFound, model.ErrorCodeMetricNotFound, + err.Error(), map[string]string{"metric": metric}) + + case errors.As(err, &prometheusError): + log.Error().Err(err).Msg("prometheus query error") + SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodePrometheusError, + "Prometheus 查询失败", nil) + + default: + log.Error().Err(err).Msg("unexpected error during metric query") + SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, + "内部服务器错误", nil) + } +} diff --git a/internal/prometheus_adapter/client/prometheus_client.go b/internal/prometheus_adapter/client/prometheus_client.go new file mode 100644 index 0000000..a42b58b --- /dev/null +++ 
b/internal/prometheus_adapter/client/prometheus_client.go @@ -0,0 +1,184 @@ +package client + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + "github.com/prometheus/client_golang/api" + v1 "github.com/prometheus/client_golang/api/prometheus/v1" + promModel "github.com/prometheus/common/model" + "github.com/qiniu/zeroops/internal/prometheus_adapter/model" + "github.com/rs/zerolog/log" +) + +// PrometheusClient Prometheus 客户端 +type PrometheusClient struct { + api v1.API + httpClient *http.Client + baseURL string +} + +// NewPrometheusClient 创建新的 Prometheus 客户端 +func NewPrometheusClient(address string) (*PrometheusClient, error) { + client, err := api.NewClient(api.Config{ + Address: address, + }) + if err != nil { + return nil, fmt.Errorf("failed to create prometheus client: %w", err) + } + + return &PrometheusClient{ + api: v1.NewAPI(client), + httpClient: &http.Client{Timeout: 10 * time.Second}, + baseURL: address, + }, nil +} + +// QueryRange 执行范围查询 +func (c *PrometheusClient) QueryRange(ctx context.Context, query string, start, end time.Time, step time.Duration) ([]model.MetricDataPoint, error) { + r := v1.Range{ + Start: start, + End: end, + Step: step, + } + + result, warnings, err := c.api.QueryRange(ctx, query, r) + if err != nil { + return nil, fmt.Errorf("failed to query prometheus: %w", err) + } + + if len(warnings) > 0 { + // 记录警告但不返回错误 + fmt.Printf("Prometheus warnings: %v\n", warnings) + } + + // 转换结果为我们的数据格式 + matrix, ok := result.(promModel.Matrix) + if !ok { + return nil, fmt.Errorf("unexpected result type: %T", result) + } + + var dataPoints []model.MetricDataPoint + for _, sample := range matrix { + for _, pair := range sample.Values { + dataPoints = append(dataPoints, model.MetricDataPoint{ + Timestamp: pair.Timestamp.Time(), + Value: float64(pair.Value), + }) + } + } + + return dataPoints, nil +} + +// GetAvailableMetrics 获取所有可用的指标名称 +func (c *PrometheusClient) GetAvailableMetrics(ctx context.Context) ([]string, 
// BuildQuery builds the PromQL selector for one metric of one service.
//
// The selector always matches on service_name; a service_version matcher is
// added when version is non-empty. Label values are written with %q, which
// escapes quotes, backslashes and control characters in the form PromQL string
// literals accept, so a value taken from a request cannot close the string and
// append its own PromQL.
//
// A metric made only of letters, digits, '_' and ':' (not starting with a
// digit) is written in the usual `name{...}` form, which is byte-for-byte what
// this function produced before. Any other metric text is matched through a
// quoted __name__ matcher instead of being pasted into the query.
func BuildQuery(service, metric, version string) string {
	matchers := fmt.Sprintf(`service_name=%q`, service)
	if version != "" {
		matchers += fmt.Sprintf(`,service_version=%q`, version)
	}

	// An empty metric keeps the historical `{service_name="..."}` output.
	plainName := true
	for i, r := range metric {
		letter := r == '_' || r == ':' || (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
		digit := r >= '0' && r <= '9'
		if !letter && !(digit && i > 0) {
			plainName = false
			break
		}
	}

	if plainName {
		return metric + "{" + matchers + "}"
	}

	return fmt.Sprintf(`{__name__=%q,%s}`, metric, matchers)
}
// AlertWebhookConfig holds the alert webhook settings.
type AlertWebhookConfig struct {
	// URL is the webhook URL.
	URL string `yaml:"url"` // Webhook URL
	// PollingInterval is the polling interval as a duration string such as
	// "10s"; validateConfig and GetPollingInterval parse it with
	// time.ParseDuration.
	PollingInterval string `yaml:"polling_interval"` // polling interval
}
// Prometheus 规则目录 +} + +// ServerConfig 服务器配置 +type ServerConfig struct { + BindAddr string `yaml:"bind_addr"` // 监听地址 +} + +// LoadConfig 加载配置文件 +func LoadConfig(configPath string) (*PrometheusAdapterConfig, error) { + // 如果没有指定配置文件,尝试多个默认路径 + if configPath == "" { + // 尝试的路径列表(按优先级) + possiblePaths := []string{ + "config/prometheus_adapter.yml", // 部署环境:相对于工作目录 + "internal/prometheus_adapter/config/prometheus_adapter.yml", // 开发环境:源码目录 + "./prometheus_adapter.yml", // 当前目录 + } + + for _, path := range possiblePaths { + if _, err := os.Stat(path); err == nil { + configPath = path + log.Info().Str("path", path).Msg("Found config file") + break + } + } + + // 如果都找不到,使用第一个路径(稍后会返回默认配置) + if configPath == "" { + configPath = possiblePaths[0] + } + } + + // 读取配置文件 + data, err := os.ReadFile(configPath) + if err != nil { + // 如果文件不存在,返回默认配置 + if os.IsNotExist(err) { + log.Warn().Msg("Config file not found, using default configuration") + return getDefaultConfig(), nil + } + return nil, fmt.Errorf("failed to read config file: %w", err) + } + + // 解析配置 + var config PrometheusAdapterConfig + if err := yaml.Unmarshal(data, &config); err != nil { + return nil, fmt.Errorf("failed to parse config file: %w", err) + } + + // 应用环境变量覆盖 + applyEnvOverrides(&config) + + // 验证配置 + if err := validateConfig(&config); err != nil { + return nil, fmt.Errorf("invalid configuration: %w", err) + } + + log.Info(). + Str("config_file", configPath). 
+ Msg("Configuration loaded successfully") + + return &config, nil +} + +// getDefaultConfig 获取默认配置 +func getDefaultConfig() *PrometheusAdapterConfig { + return &PrometheusAdapterConfig{ + Prometheus: PrometheusConfig{ + Address: "http://10.210.10.33:9090", + ContainerName: "mock-s3-prometheus", + }, + AlertWebhook: AlertWebhookConfig{ + URL: "http://alert-module:8080/v1/integrations/alertmanager/webhook", + PollingInterval: "10s", + }, + AlertRules: AlertRulesConfig{ + LocalFile: "../rules/alert_rules.yml", + PrometheusRulesDir: "/etc/prometheus/rules/", + }, + Server: ServerConfig{ + BindAddr: "0.0.0.0:9999", + }, + } +} + +// applyEnvOverrides 应用环境变量覆盖 +func applyEnvOverrides(config *PrometheusAdapterConfig) { + // Prometheus 配置 + if addr := os.Getenv("PROMETHEUS_ADDRESS"); addr != "" { + config.Prometheus.Address = addr + } + if container := os.Getenv("PROMETHEUS_CONTAINER"); container != "" { + config.Prometheus.ContainerName = container + } + + // Alert Webhook 配置 + if url := os.Getenv("ALERT_WEBHOOK_URL"); url != "" { + config.AlertWebhook.URL = url + } + if interval := os.Getenv("ALERT_POLLING_INTERVAL"); interval != "" { + config.AlertWebhook.PollingInterval = interval + } + + // Server 配置 + if bindAddr := os.Getenv("SERVER_BIND_ADDR"); bindAddr != "" { + config.Server.BindAddr = bindAddr + } +} + +// validateConfig 验证配置 +func validateConfig(config *PrometheusAdapterConfig) error { + // 验证 Prometheus 地址 + if config.Prometheus.Address == "" { + return fmt.Errorf("prometheus address is required") + } + + // 验证轮询间隔 + if config.AlertWebhook.PollingInterval != "" { + if _, err := time.ParseDuration(config.AlertWebhook.PollingInterval); err != nil { + return fmt.Errorf("invalid polling interval: %w", err) + } + } + + // 验证服务器地址 + if config.Server.BindAddr == "" { + return fmt.Errorf("server bind address is required") + } + + return nil +} + +// GetPollingInterval 获取轮询间隔的 Duration +func (c *AlertWebhookConfig) GetPollingInterval() time.Duration { + if 
// AlertRule is the alert rule table: it defines an alert rule template.
type AlertRule struct {
	Name        string `json:"name" gorm:"type:varchar(255);primaryKey"`   // primary key; the alert rule name
	Description string `json:"description" gorm:"type:text"`               // human-readable title; may be rendered into a readable title
	Expr        string `json:"expr" gorm:"type:text;not null"`             // left-hand metric expression, e.g. sum(apitime) by (service, version)
	Op          string `json:"op" gorm:"type:varchar(4);not null"`         // threshold comparison operator (>, <, =, !=)
	Severity    string `json:"severity" gorm:"type:varchar(32);not null"`  // alert level; usually ends up in the alert's labels.severity
	WatchTime   int    `json:"watch_time"`                                 // duration in seconds; maps to the Prometheus rule "for" field
}
// AlertmanagerAlert is an alert in the format of the Alertmanager API v2.
// Every field except Labels is omitted from the JSON output when empty.
type AlertmanagerAlert struct {
	Labels       map[string]string `json:"labels"`
	Annotations  map[string]string `json:"annotations,omitempty"`
	StartsAt     string            `json:"startsAt,omitempty"` // RFC3339 timestamp
	EndsAt       string            `json:"endsAt,omitempty"`   // RFC3339 timestamp
	GeneratorURL string            `json:"generatorURL,omitempty"`
}
// CreateAlertRuleMetaRequest is the request body for creating alert rule meta
// information. AlertName, Labels and Threshold carry a `required` binding;
// WatchTime and MatchTime are optional.
//
// NOTE(review): Threshold is a float64 with binding:"required". Presumably the
// binding validator treats the zero value as missing, which would make a
// threshold of 0 impossible to submit — confirm against the validator in use.
type CreateAlertRuleMetaRequest struct {
	AlertName string            `json:"alert_name" binding:"required"`
	Labels    map[string]string `json:"labels" binding:"required"`
	Threshold float64           `json:"threshold" binding:"required"`
	WatchTime int               `json:"watch_time,omitempty"`
	MatchTime string            `json:"match_time,omitempty"`
}
// ServiceNotFoundError reports that a service does not exist. Service holds
// the name that was looked up.
type ServiceNotFoundError struct {
	Service string
}

// Error implements the error interface; the message embeds the service name.
func (e *ServiceNotFoundError) Error() string {
	const format = "服务 '%s' 不存在"
	return fmt.Sprintf(format, e.Service)
}
// AlertmanagerWebhookRequest is the request format sent to the monitoring
// alert module.
type AlertmanagerWebhookRequest struct {
	Receiver     string                     `json:"receiver"`     // "our-webhook"
	Status       string                     `json:"status"`       // "firing" or "resolved"
	Alerts       []AlertmanagerWebhookAlert `json:"alerts"`       // the individual alerts carried by this request
	GroupLabels  map[string]string          `json:"groupLabels"`  // grouping labels
	CommonLabels map[string]string          `json:"commonLabels"` // common labels
	// NOTE(review): the meaning of this field cannot be determined from this
	// file; the original example value is a placeholder — confirm with the
	// receiver's contract.
	Alert   string `json:"alert"`   // "REDACTED"
	Version string `json:"version"` // "4"
}
// PrometheusAdapterServer is the Prometheus Adapter server. It holds the
// configuration, the Prometheus client and the services that
// NewPrometheusAdapterServer creates, plus the API that UseApi sets up.
type PrometheusAdapterServer struct {
	config                   *config.Config                      // configuration passed to NewPrometheusAdapterServer
	promConfig               *promconfig.PrometheusAdapterConfig // adapter configuration returned by promconfig.LoadConfig
	promClient               *client.PrometheusClient            // client created for promConfig.Prometheus.Address
	metricService            *service.MetricService              // metric service built on promClient
	alertService             *service.AlertService               // alert service; Close calls its Shutdown method
	alertmanagerProxyService *service.AlertmanagerService        // Alertmanager proxy service
	api                      *api.Api                            // set by UseApi; nil until then
}
代理服务 + alertmanagerProxyService := service.NewAlertmanagerProxyService(promConfig) + + server := &PrometheusAdapterServer{ + config: cfg, + promConfig: promConfig, + promClient: promClient, + metricService: metricService, + alertService: alertService, + alertmanagerProxyService: alertmanagerProxyService, + } + + log.Info().Str("prometheus_address", promConfig.Prometheus.Address).Msg("Prometheus Adapter initialized successfully") + return server, nil +} + +// GetBindAddr 获取配置文件中的绑定地址 +func (s *PrometheusAdapterServer) GetBindAddr() string { + if s.promConfig != nil && s.promConfig.Server.BindAddr != "" { + return s.promConfig.Server.BindAddr + } + return "" +} + +// UseApi 设置 API 路由 +func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error { + var err error + s.api, err = api.NewApi(s.metricService, s.alertService, s.alertmanagerProxyService, router) + if err != nil { + return fmt.Errorf("failed to initialize API: %w", err) + } + + log.Info().Msg("All API endpoints registered") + + return nil +} + +// Close 优雅关闭服务器 +func (s *PrometheusAdapterServer) Close(ctx context.Context) error { + log.Info().Msg("Starting shutdown...") + + // 调用 alertService 的 Shutdown 方法保存规则 + if s.alertService != nil { + if err := s.alertService.Shutdown(); err != nil { + log.Error().Err(err).Msg("Failed to shutdown alert service") + return err + } + } + + log.Info().Msg("Prometheus Adapter server shut down") + return nil +} diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go new file mode 100644 index 0000000..fdd1b5c --- /dev/null +++ b/internal/prometheus_adapter/service/alert_service.go @@ -0,0 +1,692 @@ +package service + +import ( + "encoding/json" + "fmt" + "net/http" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + + "github.com/qiniu/zeroops/internal/prometheus_adapter/client" + promconfig "github.com/qiniu/zeroops/internal/prometheus_adapter/config" + 
"github.com/qiniu/zeroops/internal/prometheus_adapter/model" + "github.com/rs/zerolog/log" + "gopkg.in/yaml.v3" +) + +// AlertService 告警服务 - 仅负责与Prometheus交互,不存储规则 +type AlertService struct { + promClient *client.PrometheusClient + config *promconfig.PrometheusAdapterConfig + // 内存中缓存当前规则,用于增量更新 + currentRules []model.AlertRule + currentRuleMetas []model.AlertRuleMeta + // 本地规则文件路径 + localRulesPath string +} + +// NewAlertService 创建告警服务 +func NewAlertService(promClient *client.PrometheusClient, config *promconfig.PrometheusAdapterConfig) *AlertService { + service := &AlertService{ + promClient: promClient, + config: config, + currentRules: []model.AlertRule{}, + currentRuleMetas: []model.AlertRuleMeta{}, + localRulesPath: config.AlertRules.LocalFile, + } + + // 启动时尝试加载本地规则 + if err := service.LoadRulesFromFile(); err != nil { + log.Warn().Err(err).Msg("Failed to load rules from file, starting with empty rules") + } + + return service +} + +// ========== 持久化方法 ========== + +// LoadRulesFromFile 从本地文件加载规则 +func (s *AlertService) LoadRulesFromFile() error { + // 检查文件是否存在 + if _, err := os.Stat(s.localRulesPath); os.IsNotExist(err) { + log.Info().Str("path", s.localRulesPath).Msg("Local rules file does not exist, skipping load") + return nil + } + + // 读取文件内容 + data, err := os.ReadFile(s.localRulesPath) + if err != nil { + return fmt.Errorf("failed to read local rules file: %w", err) + } + + // 解析规则文件 + var rulesFile model.PrometheusRuleFile + if err := yaml.Unmarshal(data, &rulesFile); err != nil { + return fmt.Errorf("failed to parse rules file: %w", err) + } + + // 从Prometheus格式转换回内部格式 + s.currentRules = []model.AlertRule{} + s.currentRuleMetas = []model.AlertRuleMeta{} + + // 用于去重的map + ruleMap := make(map[string]*model.AlertRule) + + for _, group := range rulesFile.Groups { + for _, rule := range group.Rules { + // 提取基础规则信息 + ruleName := rule.Alert + + // 从annotations中获取description + description := "" + if desc, ok := rule.Annotations["description"]; ok { + 
description = desc + } + + // 从labels中获取severity + severity := "warning" + if sev, ok := rule.Labels["severity"]; ok { + severity = sev + delete(rule.Labels, "severity") // 移除severity,剩下的是meta的labels + } + + // 创建或更新规则模板 + if _, exists := ruleMap[ruleName]; !exists { + alertRule := model.AlertRule{ + Name: ruleName, + Description: description, + Expr: rule.Expr, + Severity: severity, + } + + // 解析For字段获取WatchTime + if rule.For != "" { + // 简单解析,假设格式为 "300s" 或 "5m" + if strings.HasSuffix(rule.For, "s") { + if seconds, err := strconv.Atoi(strings.TrimSuffix(rule.For, "s")); err == nil { + alertRule.WatchTime = seconds + } + } else if strings.HasSuffix(rule.For, "m") { + if minutes, err := strconv.Atoi(strings.TrimSuffix(rule.For, "m")); err == nil { + alertRule.WatchTime = minutes * 60 + } + } + } + + ruleMap[ruleName] = &alertRule + s.currentRules = append(s.currentRules, alertRule) + } + + // 创建元信息 + if len(rule.Labels) > 0 { + labelsJSON, _ := json.Marshal(rule.Labels) + meta := model.AlertRuleMeta{ + AlertName: ruleName, + Labels: string(labelsJSON), + } + + // 从表达式中提取threshold(简单实现) + // 假设表达式类似 "metric > 80" 或 "metric{labels} > 80" + parts := strings.Split(rule.Expr, " ") + if len(parts) >= 3 { + if threshold, err := strconv.ParseFloat(parts[len(parts)-1], 64); err == nil { + meta.Threshold = threshold + } + } + + s.currentRuleMetas = append(s.currentRuleMetas, meta) + } + } + } + + log.Info(). + Int("rules", len(s.currentRules)). + Int("metas", len(s.currentRuleMetas)). + Str("path", s.localRulesPath). 
+ Msg("Loaded rules from local file") + + return nil +} + +// SaveRulesToFile 保存规则到本地文件 +func (s *AlertService) SaveRulesToFile() error { + // 确保目录存在 + dir := filepath.Dir(s.localRulesPath) + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("failed to create rules directory: %w", err) + } + + // 构建Prometheus规则文件格式 + prometheusRules := s.buildPrometheusRules(s.currentRules, s.currentRuleMetas) + + // 序列化为YAML + data, err := yaml.Marshal(prometheusRules) + if err != nil { + return fmt.Errorf("failed to marshal rules: %w", err) + } + + // 写入文件 + if err := os.WriteFile(s.localRulesPath, data, 0644); err != nil { + return fmt.Errorf("failed to write rules file: %w", err) + } + + log.Info(). + Int("rules", len(s.currentRules)). + Int("metas", len(s.currentRuleMetas)). + Str("path", s.localRulesPath). + Msg("Saved rules to local file") + + return nil +} + +// Shutdown 优雅关闭,保存当前规则 +func (s *AlertService) Shutdown() error { + log.Info().Msg("Shutting down alert service, saving rules...") + return s.SaveRulesToFile() +} + +// ========== 公开 API 方法 ========== + +// UpdateRule 更新单个规则模板 +// 只更新传入的规则,其他规则和所有元信息保持不变 +func (s *AlertService) UpdateRule(rule model.AlertRule) error { + // 查找并更新规则 + found := false + for i, r := range s.currentRules { + if r.Name == rule.Name { + s.currentRules[i] = rule + found = true + break + } + } + + if !found { + // 如果规则不存在,添加新规则 + s.currentRules = append(s.currentRules, rule) + } + + // 统计受影响的元信息数量 + affectedCount := 0 + for _, meta := range s.currentRuleMetas { + if meta.AlertName == rule.Name { + affectedCount++ + } + } + + log.Info(). + Str("rule", rule.Name). + Int("affected_metas", affectedCount). 
+ Msg("Updating rule and affected metas") + + // 使用更新后的规则重新生成并同步 + return s.regenerateAndSync() +} + +// UpdateRuleMeta 更新单个规则元信息 +// 通过 alert_name + labels 唯一确定一个元信息记录 +func (s *AlertService) UpdateRuleMeta(meta model.AlertRuleMeta) error { + // 查找并更新元信息 + found := false + for i, m := range s.currentRuleMetas { + // 通过 alert_name + labels 唯一确定 + if m.AlertName == meta.AlertName && m.Labels == meta.Labels { + s.currentRuleMetas[i] = meta + found = true + break + } + } + + if !found { + // 如果元信息不存在,添加新元信息 + s.currentRuleMetas = append(s.currentRuleMetas, meta) + } + + log.Info(). + Str("alert_name", meta.AlertName). + Str("labels", meta.Labels). + Msg("Updating rule meta") + + // 使用更新后的元信息重新生成并同步 + return s.regenerateAndSync() +} + +// GetAffectedMetas 获取受影响的元信息数量 +func (s *AlertService) GetAffectedMetas(ruleName string) int { + count := 0 + for _, meta := range s.currentRuleMetas { + if meta.AlertName == ruleName { + count++ + } + } + return count +} + +// DeleteRule 删除单个规则模板及其所有关联的元信息 +func (s *AlertService) DeleteRule(ruleName string) error { + // 查找并删除规则模板 + ruleFound := false + for i, rule := range s.currentRules { + if rule.Name == ruleName { + // 从切片中删除规则 + s.currentRules = append(s.currentRules[:i], s.currentRules[i+1:]...) + ruleFound = true + break + } + } + + if !ruleFound { + return fmt.Errorf("rule '%s' not found", ruleName) + } + + // 删除所有关联的元信息 + deletedMetaCount := 0 + newMetas := []model.AlertRuleMeta{} + for _, meta := range s.currentRuleMetas { + if meta.AlertName != ruleName { + newMetas = append(newMetas, meta) + } else { + deletedMetaCount++ + } + } + s.currentRuleMetas = newMetas + + log.Info(). + Str("rule", ruleName). + Int("deleted_metas", deletedMetaCount). 
+ Msg("Rule and associated metas deleted") + + // 重新生成并同步 + return s.regenerateAndSync() +} + +// DeleteRuleMeta 删除单个规则元信息 +func (s *AlertService) DeleteRuleMeta(ruleName, labels string) error { + // 查找并删除匹配的元信息 + found := false + for i, meta := range s.currentRuleMetas { + if meta.AlertName == ruleName && meta.Labels == labels { + // 从切片中删除元信息 + s.currentRuleMetas = append(s.currentRuleMetas[:i], s.currentRuleMetas[i+1:]...) + found = true + break + } + } + + if !found { + return fmt.Errorf("rule meta not found for rule '%s' with labels '%s'", ruleName, labels) + } + + log.Info(). + Str("rule", ruleName). + Str("labels", labels). + Msg("Rule meta deleted") + + // 重新生成并同步 + return s.regenerateAndSync() +} + +// ========== 内部核心方法 ========== + +// regenerateAndSync 使用当前内存中的规则和元信息重新生成Prometheus规则并同步 +func (s *AlertService) regenerateAndSync() error { + // 构建Prometheus规则文件 + prometheusRules := s.buildPrometheusRules(s.currentRules, s.currentRuleMetas) + + // 写入规则文件 + if err := s.writeRulesFile(prometheusRules); err != nil { + return fmt.Errorf("failed to write rules file: %w", err) + } + + // 通知Prometheus重新加载配置 + if err := s.reloadPrometheus(); err != nil { + log.Warn().Err(err).Msg("Failed to reload Prometheus, rules file has been updated") + // 不返回错误,因为文件已经更新成功 + } + + log.Info(). + Int("rules_count", len(s.currentRules)). + Int("metas_count", len(s.currentRuleMetas)). 
+ Msg("Rules regenerated and synced to Prometheus") + + return nil +} + +// ========== 规则构建相关方法 ========== + +// buildPrometheusRules 构建Prometheus规则 +func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas []model.AlertRuleMeta) *model.PrometheusRuleFile { + promRules := []model.PrometheusRule{} + + // 创建规则名到规则的映射 + ruleMap := make(map[string]*model.AlertRule) + for i := range rules { + ruleMap[rules[i].Name] = &rules[i] + } + + // 为每个元信息生成Prometheus规则 + for _, meta := range ruleMetas { + // 通过 alert_name 直接查找对应的规则模板 + // AlertRuleMeta.alert_name 关联 AlertRule.name + var rule *model.AlertRule = ruleMap[meta.AlertName] + + if rule == nil { + log.Warn(). + Str("alert_name", meta.AlertName). + Msg("No matching rule template found for alert meta, skipping") + continue + } + + // 解析标签 + var labels map[string]string + if meta.Labels != "" { + if err := json.Unmarshal([]byte(meta.Labels), &labels); err != nil { + log.Warn(). + Err(err). + Str("alert_name", meta.AlertName). 
+ Msg("Failed to parse labels, using empty labels") + labels = make(map[string]string) + } + } else { + labels = make(map[string]string) + } + + // 添加severity标签 + labels["severity"] = rule.Severity + + // 构建表达式 + expr := s.buildExpression(rule, &meta) + + // 构建注释 + annotations := map[string]string{ + "description": rule.Description, + "summary": fmt.Sprintf("%s %s %g", rule.Expr, rule.Op, meta.Threshold), + } + + // 计算for字段 + forDuration := "" + if rule.WatchTime > 0 { + forDuration = fmt.Sprintf("%ds", rule.WatchTime) + } + + // 使用规则名作为 alert 名称,通过 labels 区分不同实例 + promRule := model.PrometheusRule{ + Alert: rule.Name, // 使用规则名作为 alert 名称 + Expr: expr, + For: forDuration, + Labels: labels, + Annotations: annotations, + } + + promRules = append(promRules, promRule) + } + + // 如果没有元信息,为每个规则创建默认规则 + if len(ruleMetas) == 0 { + for _, rule := range rules { + labels := map[string]string{ + "severity": rule.Severity, + } + + annotations := map[string]string{ + "description": rule.Description, + "summary": fmt.Sprintf("%s triggered", rule.Name), + } + + promRule := model.PrometheusRule{ + Alert: rule.Name, + Expr: rule.Expr, + Labels: labels, + Annotations: annotations, + } + + promRules = append(promRules, promRule) + } + } + + return &model.PrometheusRuleFile{ + Groups: []model.PrometheusRuleGroup{ + { + Name: "zeroops_alerts", + Rules: promRules, + }, + }, + } +} + +// buildExpression 构建PromQL表达式 +func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertRuleMeta) string { + expr := rule.Expr + + // 解析标签并添加到表达式中 + var labels map[string]string + if meta.Labels != "" { + json.Unmarshal([]byte(meta.Labels), &labels) + } + + if len(labels) > 0 { + labelMatchers := []string{} + for k, v := range labels { + labelMatchers = append(labelMatchers, fmt.Sprintf(`%s="%s"`, k, v)) + } + + if len(labelMatchers) > 0 { + // 如果表达式包含{,说明已经有标签选择器 + if strings.Contains(expr, "{") { + // 查找第一个 { 后的内容 + start := strings.Index(expr, "{") + end := 
strings.Index(expr[start:], "}") + if end != -1 { + end += start + existingLabels := strings.TrimSpace(expr[start+1 : end]) + if existingLabels == "" { + // 空的标签选择器,直接替换 + expr = expr[:start+1] + strings.Join(labelMatchers, ",") + expr[end:] + } else { + // 已有标签,需要检查是否重复 + existingLabelMap := make(map[string]bool) + // 解析现有标签 + labelPairs := strings.Split(existingLabels, ",") + for _, pair := range labelPairs { + if strings.Contains(pair, "=") { + key := strings.TrimSpace(strings.Split(pair, "=")[0]) + if key != "" { + existingLabelMap[key] = true + } + } + } + // 只添加不重复的标签 + newLabels := []string{} + for k, v := range labels { + if !existingLabelMap[k] && k != "" && v != "" { + newLabels = append(newLabels, fmt.Sprintf(`%s="%s"`, k, v)) + } + } + if len(newLabels) > 0 { + expr = expr[:end] + "," + strings.Join(newLabels, ",") + expr[end:] + } + } + } + } else { + // 对于没有标签的简单指标,只处理单个单词的情况 + // 如果表达式包含空格、括号等,不进行标签注入 + if !strings.ContainsAny(expr, " ()[]{}") { + // 只有单个指标名,可以安全添加标签 + expr = expr + "{" + strings.Join(labelMatchers, ",") + "}" + } + } + } + } + + // 添加比较操作符和阈值 + if meta.Threshold != 0 { + expr = fmt.Sprintf("%s %s %g", expr, rule.Op, meta.Threshold) + } + + return expr +} + +// ========== 文件操作相关方法 ========== + +// writeRulesFile 写入规则文件 +func (s *AlertService) writeRulesFile(rules *model.PrometheusRuleFile) error { + // 序列化为YAML + data, err := yaml.Marshal(rules) + if err != nil { + return fmt.Errorf("failed to marshal rules: %w", err) + } + + // 获取容器名称 + containerName := s.config.Prometheus.ContainerName + + // 直接写入到容器内的规则目录 + // 使用docker exec和echo命令写入文件 + cmd := exec.Command("docker", "exec", containerName, "sh", "-c", + fmt.Sprintf("cat > /etc/prometheus/rules/alert_rules.yml << 'EOF'\n%s\nEOF", string(data))) + + if output, err := cmd.CombinedOutput(); err != nil { + // 如果直接写入容器失败,尝试使用临时文件+docker cp + log.Warn(). + Err(err). + Str("output", string(output)). 
+ Msg("Failed to write directly to container, trying docker cp") + + // 写入临时文件 + tmpFile := "/tmp/prometheus_alert_rules.yml" + if err := os.WriteFile(tmpFile, data, 0644); err != nil { + return fmt.Errorf("failed to write temp rules file: %w", err) + } + + // 使用docker cp复制到容器 + if err := s.syncRuleFileToContainer(tmpFile); err != nil { + return fmt.Errorf("failed to sync to container: %w", err) + } + + // 清理临时文件 + os.Remove(tmpFile) + } + + log.Info(). + Str("container", containerName). + Int("groups", len(rules.Groups)). + Msg("Prometheus rules file updated in container") + + return nil +} + +// syncRuleFileToContainer 同步规则文件到容器 +func (s *AlertService) syncRuleFileToContainer(filePath string) error { + // 获取容器名称,默认为 mock-s3-prometheus + containerName := os.Getenv("PROMETHEUS_CONTAINER") + if containerName == "" { + containerName = "mock-s3-prometheus" + } + + // 1. 创建容器内的规则目录(如果不存在) + cmdMkdir := exec.Command("docker", "exec", containerName, "mkdir", "-p", "/etc/prometheus/rules") + if output, err := cmdMkdir.CombinedOutput(); err != nil { + // 目录可能已存在,记录日志但不返回错误 + log.Debug(). + Str("output", string(output)). + Msg("mkdir in container (may already exist)") + } + + // 2. 将规则文件拷贝到容器内 + cmdCopy := exec.Command("docker", "cp", filePath, fmt.Sprintf("%s:/etc/prometheus/rules/alert_rules.yml", containerName)) + if output, err := cmdCopy.CombinedOutput(); err != nil { + return fmt.Errorf("failed to copy rules file to container: %w, output: %s", err, string(output)) + } + + log.Info(). + Str("container", containerName). + Str("file", filePath). + Msg("Rules synced to Prometheus container") + + // 3. 
确保 Prometheus 配置包含 rule_files + if err := s.ensurePrometheusRuleConfig(containerName); err != nil { + log.Warn().Err(err).Msg("Failed to ensure Prometheus rule configuration") + } + + return nil +} + +// ========== Prometheus 配置相关方法 ========== + +// reloadPrometheus 重新加载Prometheus配置 +func (s *AlertService) reloadPrometheus() error { + prometheusURL := s.config.Prometheus.Address + + reloadURL := fmt.Sprintf("%s/-/reload", strings.TrimSuffix(prometheusURL, "/")) + + resp, err := http.Post(reloadURL, "text/plain", nil) + if err != nil { + return fmt.Errorf("failed to reload Prometheus: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("prometheus reload failed with status: %d", resp.StatusCode) + } + + log.Info().Msg("Prometheus configuration reloaded") + return nil +} + +// ensurePrometheusRuleConfig 确保 Prometheus 配置文件包含 rule_files 配置 +func (s *AlertService) ensurePrometheusRuleConfig(containerName string) error { + configPath := "/etc/prometheus/prometheus.yml" + + // 1. 检查配置文件是否已包含 rule_files + cmdCheck := exec.Command("docker", "exec", containerName, "grep", "-q", "rule_files:", configPath) + if err := cmdCheck.Run(); err == nil { + // 已经包含 rule_files,不需要修改 + log.Debug().Msg("Prometheus config already contains rule_files") + return nil + } + + log.Info().Msg("Adding rule_files configuration to Prometheus") + + // 3. 在 global 部分后添加 rule_files 配置 + // 使用 sed 在 global: 块后插入 rule_files 配置 + sedScript := `'/^global:/,/^[^[:space:]]/ { + /^[^[:space:]]/ { + i\ +# Alert rules\ +rule_files:\ + - "/etc/prometheus/rules/*.yml"\ + + } + }'` + + cmdSed := exec.Command("docker", "exec", containerName, "sh", "-c", + fmt.Sprintf(`sed -i '%s' %s`, sedScript, configPath)) + + if output, err := cmdSed.CombinedOutput(); err != nil { + // 如果 sed 失败,尝试使用更简单的方法 + log.Warn(). + Str("output", string(output)). 
+ Msg("sed failed, trying alternative method") + + // 使用 awk 方法 + awkScript := `awk '/^global:/ {print; getline; print; print "# Alert rules"; print "rule_files:"; print " - \"/etc/prometheus/rules/*.yml\""; next} {print}' %s > %s.tmp && mv %s.tmp %s` + cmdAwk := exec.Command("docker", "exec", containerName, "sh", "-c", + fmt.Sprintf(awkScript, configPath, configPath, configPath, configPath)) + + if output, err := cmdAwk.CombinedOutput(); err != nil { + return fmt.Errorf("failed to add rule_files to config: %w, output: %s", err, string(output)) + } + } + + log.Info().Msg("Successfully added rule_files configuration to Prometheus") + + // 4. 重启 Prometheus 容器以应用配置 + cmdRestart := exec.Command("docker", "restart", containerName) + if output, err := cmdRestart.CombinedOutput(); err != nil { + return fmt.Errorf("failed to restart Prometheus: %w, output: %s", err, string(output)) + } + + log.Info().Msg("Prometheus restarted with new configuration") + return nil +} diff --git a/internal/prometheus_adapter/service/alertmanager_service.go b/internal/prometheus_adapter/service/alertmanager_service.go new file mode 100644 index 0000000..5ff5e90 --- /dev/null +++ b/internal/prometheus_adapter/service/alertmanager_service.go @@ -0,0 +1,216 @@ +package service + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + promconfig "github.com/qiniu/zeroops/internal/prometheus_adapter/config" + "github.com/qiniu/zeroops/internal/prometheus_adapter/model" + "github.com/rs/zerolog/log" +) + +// AlertmanagerService Alertmanager 服务 +// 接收 Prometheus 的告警推送并转发到监控告警模块 +type AlertmanagerService struct { + config *promconfig.PrometheusAdapterConfig + webhookURL string + httpClient *http.Client + resolveTimeout time.Duration +} + +// NewAlertmanagerProxyService 创建新的 Alertmanager 代理服务 +func NewAlertmanagerProxyService(config *promconfig.PrometheusAdapterConfig) *AlertmanagerService { + return &AlertmanagerService{ + config: config, + webhookURL: config.AlertWebhook.URL, 
+ httpClient: &http.Client{Timeout: 30 * time.Second}, + resolveTimeout: 5 * time.Minute, // 默认 resolve_timeout + } +} + +// HandleAlertsV2 处理 Prometheus 推送的告警 +// 实现 POST /api/v2/alerts 接口 +func (s *AlertmanagerService) HandleAlertsV2(w http.ResponseWriter, r *http.Request) { + // 检查 Content-Type + contentType := r.Header.Get("Content-Type") + if contentType != "application/json" && contentType != "" { + http.Error(w, "Content-Type must be application/json", http.StatusBadRequest) + return + } + + // 解析 Prometheus 发送的告警 + var alerts []model.AlertmanagerAlert + body, err := io.ReadAll(r.Body) + if err != nil { + log.Error().Err(err).Msg("Failed to read request body") + http.Error(w, "Failed to read request", http.StatusBadRequest) + return + } + defer r.Body.Close() + + if err := json.Unmarshal(body, &alerts); err != nil { + log.Error(). + Err(err). + Str("body", string(body)). + Msg("Failed to unmarshal alerts") + http.Error(w, "Invalid JSON", http.StatusBadRequest) + return + } + + // 处理时间戳:如果缺失则设置默认值 + now := time.Now() + for i := range alerts { + // 如果 startsAt 缺失,设置为当前时间 + if alerts[i].StartsAt == "" { + alerts[i].StartsAt = now.Format(time.RFC3339) + } + // 如果 endsAt 缺失,设置为当前时间 + resolve_timeout + if alerts[i].EndsAt == "" { + alerts[i].EndsAt = now.Add(s.resolveTimeout).Format(time.RFC3339) + } + } + + log.Info(). + Int("alert_count", len(alerts)). 
+ Msg("Received alerts from Prometheus") + + // 转发告警到监控模块 + if err := s.forwardAlertsV2(alerts); err != nil { + log.Error().Err(err).Msg("Failed to forward alerts") + // 返回 500 让 Prometheus 重试 + http.Error(w, "Failed to forward alerts", http.StatusInternalServerError) + return + } + + // 返回成功响应(Alertmanager API v2 返回空 JSON) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + w.Write([]byte("{}")) +} + +// HandleHealthCheck 健康检查接口 +// 实现 GET /-/healthy +func (s *AlertmanagerService) HandleHealthCheck(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write([]byte("OK")) +} + +// HandleReadyCheck 就绪检查接口 +// 实现 GET /-/ready +func (s *AlertmanagerService) HandleReadyCheck(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write([]byte("OK")) +} + +// forwardAlertsV2 转发告警到监控告警模块 +func (s *AlertmanagerService) forwardAlertsV2(alerts []model.AlertmanagerAlert) error { + // 转换为 Alertmanager webhook 格式 + webhookAlerts := []model.AlertmanagerWebhookAlert{} + commonLabels := map[string]string{} + groupLabels := map[string]string{} + + // 统计告警状态,用于确定总体状态 + hasFiring := false + + for _, alert := range alerts { + // 确定告警状态:通过比较 endsAt 和当前时间 + status := "firing" + if alert.EndsAt != "" { + endsAtTime, err := time.Parse(time.RFC3339, alert.EndsAt) + if err == nil && endsAtTime.Before(time.Now()) { + status = "resolved" + } else { + hasFiring = true + } + } else { + hasFiring = true + } + + // 生成 fingerprint + fingerprint := s.generateFingerprint(alert.Labels) + + // 构造 GeneratorURL + generatorURL := alert.GeneratorURL + if generatorURL == "" && alert.Labels["alertname"] != "" { + generatorURL = fmt.Sprintf("http://prometheus/graph?g0.expr=%s", alert.Labels["alertname"]) + } + + webhookAlert := model.AlertmanagerWebhookAlert{ + Status: status, + Labels: alert.Labels, + Annotations: alert.Annotations, + StartsAt: alert.StartsAt, // 已经是 RFC3339 格式 + EndsAt: alert.EndsAt, // 已经是 RFC3339 格式 + 
GeneratorURL: generatorURL, + Fingerprint: fingerprint, + } + webhookAlerts = append(webhookAlerts, webhookAlert) + + // 收集公共标签 + if len(commonLabels) == 0 { + for k, v := range alert.Labels { + commonLabels[k] = v + } + } + } + + // 设置 groupLabels + if alertName, ok := commonLabels["alertname"]; ok { + groupLabels["alertname"] = alertName + } + + // 确定总体状态:如果有任何 firing 的告警,总体状态为 firing,否则为 resolved + overallStatus := "resolved" + if hasFiring { + overallStatus = "firing" + } + + // 构造 webhook 请求 + req := model.AlertmanagerWebhookRequest{ + Receiver: "prometheus_adapter", + Status: overallStatus, // 根据告警实际状态设置 + Alerts: webhookAlerts, + GroupLabels: groupLabels, + CommonLabels: commonLabels, + Alert: "REDACTED", + Version: "1", + } + + // 发送到监控告警模块 + jsonData, err := json.Marshal(req) + if err != nil { + return fmt.Errorf("failed to marshal webhook request: %w", err) + } + + resp, err := s.httpClient.Post(s.webhookURL, "application/json", bytes.NewBuffer(jsonData)) + if err != nil { + return fmt.Errorf("failed to send webhook: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("webhook returned status %d: %s", resp.StatusCode, string(body)) + } + + log.Info(). + Int("alert_count", len(alerts)). + Str("webhook_url", s.webhookURL). 
+ Msg("Successfully forwarded alerts to monitoring module") + + return nil +} + +// generateFingerprint 生成告警指纹 +func (s *AlertmanagerService) generateFingerprint(labels map[string]string) string { + // 简化版指纹生成 + result := "" + for k, v := range labels { + result += fmt.Sprintf("%s:%s,", k, v) + } + return fmt.Sprintf("%x", result)[:16] +} diff --git a/internal/prometheus_adapter/service/metric_service.go b/internal/prometheus_adapter/service/metric_service.go new file mode 100644 index 0000000..fff69c2 --- /dev/null +++ b/internal/prometheus_adapter/service/metric_service.go @@ -0,0 +1,80 @@ +package service + +import ( + "context" + "time" + + "github.com/qiniu/zeroops/internal/prometheus_adapter/client" + "github.com/qiniu/zeroops/internal/prometheus_adapter/model" + "github.com/rs/zerolog/log" +) + +// MetricService 指标服务 +type MetricService struct { + promClient *client.PrometheusClient +} + +// NewMetricService 创建指标服务 +func NewMetricService(promClient *client.PrometheusClient) *MetricService { + return &MetricService{ + promClient: promClient, + } +} + +// GetAvailableMetrics 获取可用的指标列表 +func (s *MetricService) GetAvailableMetrics(ctx context.Context) (*model.MetricListResponse, error) { + // 从 Prometheus 动态获取指标列表 + metrics, err := s.promClient.GetAvailableMetrics(ctx) + if err != nil { + log.Error().Err(err).Msg("failed to get available metrics from prometheus") + return nil, &model.PrometheusError{Message: err.Error()} + } + + return &model.MetricListResponse{ + Metrics: metrics, + }, nil +} + +// QueryMetric 查询指标数据 +func (s *MetricService) QueryMetric(ctx context.Context, service, metric, version string, start, end time.Time, step time.Duration) (*model.MetricQueryResponse, error) { + // 动态验证服务是否存在 + serviceExists, err := s.promClient.CheckServiceExists(ctx, service) + if err != nil { + log.Error().Err(err).Str("service", service).Msg("failed to check service existence") + return nil, &model.PrometheusError{Message: err.Error()} + } + if !serviceExists { + 
return nil, &model.ServiceNotFoundError{Service: service} + } + + // 动态验证指标是否存在 + metricExists, err := s.promClient.CheckMetricExists(ctx, metric) + if err != nil { + log.Error().Err(err).Str("metric", metric).Msg("failed to check metric existence") + return nil, &model.PrometheusError{Message: err.Error()} + } + if !metricExists { + return nil, &model.MetricNotFoundError{Metric: metric} + } + + // 构建 PromQL 查询 + query := client.BuildQuery(service, metric, version) + log.Debug().Str("query", query).Msg("executing prometheus query") + + // 执行查询 + dataPoints, err := s.promClient.QueryRange(ctx, query, start, end, step) + if err != nil { + log.Error().Err(err).Str("query", query).Msg("failed to query prometheus") + return nil, &model.PrometheusError{Message: err.Error()} + } + + // 构建响应 + response := &model.MetricQueryResponse{ + Service: service, + Version: version, + Metric: metric, + Data: dataPoints, + } + + return response, nil +} diff --git a/internal/prometheus_adapter/test_alert.sh b/internal/prometheus_adapter/test_alert.sh new file mode 100755 index 0000000..cc74248 --- /dev/null +++ b/internal/prometheus_adapter/test_alert.sh @@ -0,0 +1,188 @@ +#!/bin/bash + +# 测试增量更新告警规则功能 + +BASE_URL="http://10.210.10.33:9999" + +echo "=== 测试增量更新告警规则 ===" + +# 1. 初始化规则(使用增量更新接口) +echo -e "\n1. 创建初始规则..." + +# 1.1 创建 high_cpu_usage 规则模板 +echo -e "\n1.1 创建规则模板: high_cpu_usage" +curl -X PUT ${BASE_URL}/v1/alert-rules/high_cpu_usage \ + -H "Content-Type: application/json" \ + -d '{ + "description": "CPU使用率过高告警", + "expr": "system_cpu_usage_percent", + "op": ">", + "severity": "warning", + "watch_time": 300 + }' | jq . + +sleep 1 + +# 1.2 创建 high_memory_usage 规则模板 +echo -e "\n1.2 创建规则模板: high_memory_usage" +curl -X PUT ${BASE_URL}/v1/alert-rules/high_memory_usage \ + -H "Content-Type: application/json" \ + -d '{ + "description": "内存使用率过高告警", + "expr": "system_memory_usage_percent", + "op": ">", + "severity": "warning", + "watch_time": 600 + }' | jq . 
+ +sleep 1 + +# 1.3 设置 high_cpu_usage 规则的元信息 +echo -e "\n1.3 设置规则元信息: high_cpu_usage" +curl -X PUT ${BASE_URL}/v1/alert-rules-meta/high_cpu_usage \ + -H "Content-Type: application/json" \ + -d '{ + "metas": [ + { + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}", + "threshold": 80 + }, + { + "labels": "{\"service\":\"metadata-service\",\"version\":\"1.0.0\"}", + "threshold": 85 + } + ] + }' | jq . + +sleep 1 + +# 1.4 设置 high_memory_usage 规则的元信息 +echo -e "\n1.4 设置规则元信息: high_memory_usage" +curl -X PUT ${BASE_URL}/v1/alert-rules-meta/high_memory_usage \ + -H "Content-Type: application/json" \ + -d '{ + "metas": [ + { + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}", + "threshold": 90 + } + ] + }' | jq . + +sleep 2 + +# 2. 更新单个规则模板 +echo -e "\n2. 更新规则模板 high_cpu_usage..." +curl -X PUT ${BASE_URL}/v1/alert-rules/high_cpu_usage \ + -H "Content-Type: application/json" \ + -d '{ + "description": "CPU使用率异常告警(更新后)", + "expr": "avg(system_cpu_usage_percent[5m])", + "op": ">=", + "severity": "critical", + "watch_time": 300 + }' | jq . + +sleep 2 + +# 3. 批量更新规则元信息 +echo -e "\n3. 批量更新规则元信息(high_cpu_usage)..." +curl -X PUT ${BASE_URL}/v1/alert-rules-meta/high_cpu_usage \ + -H "Content-Type: application/json" \ + -d '{ + "metas": [ + { + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}", + "threshold": 75 + }, + { + "labels": "{\"service\":\"metadata-service\",\"version\":\"1.0.0\"}", + "threshold": 88 + } + ] + }' | jq . + +sleep 2 + +# 4. 批量更新规则元信息(添加新的服务) +echo -e "\n4. 批量更新规则元信息(high_memory_usage - 添加新服务)..." +curl -X PUT ${BASE_URL}/v1/alert-rules-meta/high_memory_usage \ + -H "Content-Type: application/json" \ + -d '{ + "metas": [ + { + "labels": "{\"service\":\"queue-service\",\"version\":\"2.0.0\"}", + "threshold": 95 + }, + { + "labels": "{\"service\":\"third-party-service\",\"version\":\"1.0.0\"}", + "threshold": 92 + } + ] + }' | jq . + +sleep 2 + +# 5. 测试删除规则元信息 +echo -e "\n5. 
删除规则元信息(删除 high_cpu_usage 的 storage-service)..." +curl -X DELETE ${BASE_URL}/v1/alert-rules-meta/high_cpu_usage \ + -H "Content-Type: application/json" \ + -d '{ + "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}" + }' | jq . + +sleep 2 + +# 6. 测试删除不存在的规则元信息(应该返回404) +echo -e "\n6. 删除不存在的规则元信息(测试错误处理)..." +curl -X DELETE ${BASE_URL}/v1/alert-rules-meta/high_cpu_usage \ + -H "Content-Type: application/json" \ + -d '{ + "labels": "{\"service\":\"non-existent-service\",\"version\":\"1.0.0\"}" + }' | jq . + +sleep 2 + +# 7. 测试删除整个规则模板 +echo -e "\n7. 删除整个规则模板(删除 high_memory_usage 及其所有元信息)..." +curl -X DELETE ${BASE_URL}/v1/alert-rules/high_memory_usage | jq . + +sleep 2 + +# 8. 测试删除不存在的规则模板(应该返回404) +echo -e "\n8. 删除不存在的规则模板(测试错误处理)..." +curl -X DELETE ${BASE_URL}/v1/alert-rules/non_existent_rule | jq . + +sleep 2 + +# 9. 验证删除结果 - 查看剩余的规则 +echo -e "\n9. 验证删除结果..." +echo "9.1 尝试更新已删除的规则模板(应该创建新规则):" +curl -X PUT ${BASE_URL}/v1/alert-rules/high_memory_usage \ + -H "Content-Type: application/json" \ + -d '{ + "description": "重新创建的内存告警规则", + "expr": "system_memory_usage_percent", + "op": ">", + "severity": "warning", + "watch_time": 300 + }' | jq . + +sleep 1 + +echo -e "\n9.2 查看当前 high_cpu_usage 的受影响元信息数量(应该只剩1个):" +curl -X PUT ${BASE_URL}/v1/alert-rules/high_cpu_usage \ + -H "Content-Type: application/json" \ + -d '{ + "description": "验证剩余元信息的规则更新" + }' | jq . 
+ +echo -e "\n=== 删除功能测试完成 ===" +echo -e "\n测试总结:" +echo "✓ 测试了删除单个规则元信息" +echo "✓ 测试了删除不存在的规则元信息(错误处理)" +echo "✓ 测试了删除整个规则模板及其所有元信息" +echo "✓ 测试了删除不存在的规则模板(错误处理)" +echo "✓ 验证了删除操作的实际效果" + +echo -e "\n=== 测试完成 ===" \ No newline at end of file diff --git a/mock/s3/DEPLOYMENT.md b/mock/s3/DEPLOYMENT.md index 6e69125..8a93c65 100644 --- a/mock/s3/DEPLOYMENT.md +++ b/mock/s3/DEPLOYMENT.md @@ -55,11 +55,12 @@ sudo supervisorctl start zeroops_* ps aux | grep zeroops_ # 健康检查 -curl http://localhost:8181/health # metadata-service -curl http://localhost:8191/health # storage-service -curl http://localhost:8201/health # queue-service -curl http://localhost:8211/health # third-party-service -curl http://localhost:8221/health # mock-error-service +``` +for port in 8182 8183 8191 8192 8201 8202 8211 8221; do + echo "Checking port $port:" + curl -s -o /dev/null -w "HTTP Status: %{http_code}\n" http://localhost:$port/metrics ||echo "Failed" +done +``` # 查看日志 tail -f /home/qboxserver/zeroops_metadata_1/logs/service.log diff --git a/mock/s3/deployments/docker-compose.yml b/mock/s3/deployments/docker-compose.yml index 61f13cd..9f30f9e 100644 --- a/mock/s3/deployments/docker-compose.yml +++ b/mock/s3/deployments/docker-compose.yml @@ -84,10 +84,14 @@ services: volumes: - prometheus-data:/prometheus - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./prometheus/rules:/etc/prometheus/rules:rw command: - '--config.file=/etc/prometheus/prometheus.yml' - '--storage.tsdb.path=/prometheus' - '--web.enable-lifecycle' + - '--web.enable-admin-api' # 启用管理API以支持配置重载 + extra_hosts: + - "host.docker.internal:host-gateway" # 允许容器访问宿主机 restart: unless-stopped # Grafana - 可视化 diff --git a/mock/s3/deployments/observability/prometheus.yml b/mock/s3/deployments/observability/prometheus.yml index 2bcabb8..6c9fb04 100644 --- a/mock/s3/deployments/observability/prometheus.yml +++ b/mock/s3/deployments/observability/prometheus.yml @@ -5,6 +5,18 @@ global: cluster: mock-s3 environment: 
docker +# 告警规则文件 +rule_files: + - "/etc/prometheus/rules/*.yml" + +# Alerting 配置 - 将告警发送到 Prometheus Adapter (伪 Alertmanager) +alerting: + alertmanagers: + - static_configs: + - targets: + - 'host.docker.internal:8081' # Prometheus Adapter 运行在宿主机的 8081 端口 + api_version: v2 # 使用 Alertmanager API v2 + scrape_configs: # Prometheus自身的指标 - job_name: 'prometheus' diff --git a/mock/s3/services/mock-error/internal/handler/mock_error_handler.go b/mock/s3/services/mock-error/internal/handler/mock_error_handler.go index 18be6b6..dfd1f33 100644 --- a/mock/s3/services/mock-error/internal/handler/mock_error_handler.go +++ b/mock/s3/services/mock-error/internal/handler/mock_error_handler.go @@ -85,13 +85,13 @@ func (h *MockErrorHandler) deleteMetricAnomaly(c *gin.Context) { // checkMetricInjection 检查是否应该注入指标异常 func (h *MockErrorHandler) checkMetricInjection(c *gin.Context) { - ctx := c.Request.Context() + ctx := c.Request.Context() - var request struct { - Service string `json:"service" binding:"required"` - MetricName string `json:"metric_name" binding:"required"` - Instance string `json:"instance"` - } + var request struct { + Service string `json:"service" binding:"required"` + MetricName string `json:"metric_name" binding:"required"` + Instance string `json:"instance"` + } if err := c.ShouldBindJSON(&request); err != nil { h.logger.Error(ctx, "Failed to bind metric injection check request", observability.Error(err)) @@ -99,14 +99,14 @@ func (h *MockErrorHandler) checkMetricInjection(c *gin.Context) { return } - anomaly, shouldInject := h.errorService.ShouldInjectError(ctx, request.Service, request.MetricName, request.Instance) + anomaly, shouldInject := h.errorService.ShouldInjectError(ctx, request.Service, request.MetricName, request.Instance) - response := gin.H{ - "should_inject": shouldInject, - "service": request.Service, - "metric_name": request.MetricName, - "instance": request.Instance, - } + response := gin.H{ + "should_inject": shouldInject, + "service": 
request.Service, + "metric_name": request.MetricName, + "instance": request.Instance, + } if shouldInject { response["anomaly"] = anomaly diff --git a/mock/s3/services/mock-error/internal/service/mock_error_service.go b/mock/s3/services/mock-error/internal/service/mock_error_service.go index cdaf0c4..b445c82 100644 --- a/mock/s3/services/mock-error/internal/service/mock_error_service.go +++ b/mock/s3/services/mock-error/internal/service/mock_error_service.go @@ -118,19 +118,19 @@ func (s *MockErrorService) ShouldInjectError(ctx context.Context, service, metri s.stats.TotalRequests++ s.stats.LastUpdated = time.Now() - for _, rule := range s.rules { - if !rule.Enabled { - continue - } - - // 检查服务匹配 - if rule.Service != "" && rule.Service != service { - continue - } - // 检查实例匹配(如果指定了实例,则必须匹配) - if rule.Instance != "" && rule.Instance != instance { - continue - } + for _, rule := range s.rules { + if !rule.Enabled { + continue + } + + // 检查服务匹配 + if rule.Service != "" && rule.Service != service { + continue + } + // 检查实例匹配(如果指定了实例,则必须匹配) + if rule.Instance != "" && rule.Instance != instance { + continue + } // 检查指标名称匹配 if rule.MetricName != "" && rule.MetricName != metricName { @@ -167,14 +167,14 @@ func (s *MockErrorService) ShouldInjectError(ctx context.Context, service, metri "rule_id": rule.ID, } - s.logger.Info(ctx, "Metric anomaly injected", - observability.String("rule_id", rule.ID), - observability.String("service", service), - observability.String("instance", instance), - observability.String("metric_name", metricName), - observability.String("anomaly_type", rule.AnomalyType), - observability.Float64("target_value", rule.TargetValue), - observability.Int("triggered_count", rule.Triggered)) + s.logger.Info(ctx, "Metric anomaly injected", + observability.String("rule_id", rule.ID), + observability.String("service", service), + observability.String("instance", instance), + observability.String("metric_name", metricName), + observability.String("anomaly_type", 
rule.AnomalyType), + observability.Float64("target_value", rule.TargetValue), + observability.Int("triggered_count", rule.Triggered)) return anomaly, true } diff --git a/mock/s3/shared/interfaces/error_injector.go b/mock/s3/shared/interfaces/error_injector.go index 4feb187..c59894a 100644 --- a/mock/s3/shared/interfaces/error_injector.go +++ b/mock/s3/shared/interfaces/error_injector.go @@ -15,7 +15,7 @@ type MetricAnomalyService interface { ListRules(ctx context.Context) ([]*models.MetricAnomalyRule, error) // 指标异常注入核心功能 - ShouldInjectError(ctx context.Context, service, metricName, instance string) (map[string]any, bool) + ShouldInjectError(ctx context.Context, service, metricName, instance string) (map[string]any, bool) } // MetricInjector HTTP指标异常注入器接口 diff --git a/mock/s3/shared/middleware/error_injection/error_injection.go b/mock/s3/shared/middleware/error_injection/error_injection.go index 9eb98a1..42dd606 100644 --- a/mock/s3/shared/middleware/error_injection/error_injection.go +++ b/mock/s3/shared/middleware/error_injection/error_injection.go @@ -1,16 +1,16 @@ package error_injection import ( - "context" - "fmt" - "mocks3/shared/client" - "mocks3/shared/models" - "mocks3/shared/observability" - "mocks3/shared/utils" - "net/http" - "strconv" - "sync" - "time" + "context" + "fmt" + "mocks3/shared/client" + "mocks3/shared/models" + "mocks3/shared/observability" + "mocks3/shared/utils" + "net/http" + "strconv" + "sync" + "time" ) // MetricInjectorConfig 指标异常注入器配置 @@ -34,6 +34,7 @@ type CacheConfig struct { type MetricInjector struct { mockErrorClient *client.BaseHTTPClient serviceName string + serviceVersion string // 添加服务版本字段 logger *observability.Logger // 缓存 @@ -56,7 +57,7 @@ type CachedAnomaly struct { } // NewMetricInjector 从YAML配置创建指标异常注入器 -func NewMetricInjector(configPath string, serviceName string, logger *observability.Logger) (*MetricInjector, error) { +func NewMetricInjector(configPath string, serviceName string, serviceVersion string, logger 
*observability.Logger) (*MetricInjector, error) { // 加载配置文件 var config MetricInjectorConfig if err := utils.LoadConfig(configPath, &config); err != nil { @@ -84,6 +85,7 @@ func NewMetricInjector(configPath string, serviceName string, logger *observabil injector := &MetricInjector{ mockErrorClient: client, serviceName: serviceName, + serviceVersion: serviceVersion, logger: logger, cache: make(map[string]*CachedAnomaly), cacheTTL: config.Cache.TTL, @@ -102,12 +104,13 @@ func NewMetricInjector(configPath string, serviceName string, logger *observabil } // NewMetricInjectorWithDefaults 使用默认配置创建指标异常注入器 -func NewMetricInjectorWithDefaults(mockErrorServiceURL string, serviceName string, logger *observability.Logger) *MetricInjector { +func NewMetricInjectorWithDefaults(mockErrorServiceURL string, serviceName string, serviceVersion string, logger *observability.Logger) *MetricInjector { client := client.NewBaseHTTPClient(mockErrorServiceURL, 5*time.Second, "metric-injector", logger) injector := &MetricInjector{ mockErrorClient: client, serviceName: serviceName, + serviceVersion: serviceVersion, logger: logger, cache: make(map[string]*CachedAnomaly), cacheTTL: 30 * time.Second, @@ -125,11 +128,9 @@ func NewMetricInjectorWithDefaults(mockErrorServiceURL string, serviceName strin // InjectMetricAnomaly 检查并注入指标异常 func (mi *MetricInjector) InjectMetricAnomaly(ctx context.Context, metricName string, originalValue float64) float64 { - // 计算实例标识,用于实例级注入与缓存 - instanceID := utils.GetInstanceID(mi.serviceName) - - // 检查缓存(加入实例维度) - cacheKey := mi.serviceName + ":" + instanceID + ":" + metricName + // 使用服务版本作为注入维度,同一版本的所有实例共享相同的异常注入 + // 检查缓存(基于服务版本) + cacheKey := mi.serviceName + ":" + mi.serviceVersion + ":" + metricName mi.cacheMu.RLock() if cached, exists := mi.cache[cacheKey]; exists && time.Now().Before(cached.ExpiresAt) { mi.cacheMu.RUnlock() @@ -140,20 +141,20 @@ func (mi *MetricInjector) InjectMetricAnomaly(ctx context.Context, metricName st } mi.cacheMu.RUnlock() - // 查询Mock 
Error Service获取异常规则 - request := map[string]string{ - "service": mi.serviceName, - "metric_name": metricName, - "instance": instanceID, - } - - var response struct { - ShouldInject bool `json:"should_inject"` - Service string `json:"service"` - MetricName string `json:"metric_name"` - Instance string `json:"instance"` - Anomaly map[string]any `json:"anomaly,omitempty"` - } + // 查询Mock Error Service获取异常规则(基于版本) + request := map[string]string{ + "service": mi.serviceName, + "version": mi.serviceVersion, + "metric_name": metricName, + } + + var response struct { + ShouldInject bool `json:"should_inject"` + Service string `json:"service"` + Version string `json:"version"` + MetricName string `json:"metric_name"` + Anomaly map[string]any `json:"anomaly,omitempty"` + } // 使用较短的超时时间避免影响正常指标收集 opts := client.RequestOptions{ diff --git a/mock/s3/shared/middleware/error_injection/http_latency_injector.go b/mock/s3/shared/middleware/error_injection/http_latency_injector.go new file mode 100644 index 0000000..f26de3c --- /dev/null +++ b/mock/s3/shared/middleware/error_injection/http_latency_injector.go @@ -0,0 +1,183 @@ +package error_injection + +import ( + "context" + "mocks3/shared/client" + "mocks3/shared/observability" + "sync" + "time" +) + +// HTTPLatencyInjector HTTP请求延迟注入器 +type HTTPLatencyInjector struct { + mockErrorClient *client.BaseHTTPClient + serviceName string + serviceVersion string + logger *observability.Logger + + // 缓存 + cache map[string]*CachedLatencyConfig + cacheMu sync.RWMutex + cacheTTL time.Duration +} + +// CachedLatencyConfig 缓存的延迟配置 +type CachedLatencyConfig struct { + Config *LatencyConfig + ExpiresAt time.Time +} + +// LatencyConfig 延迟配置 +type LatencyConfig struct { + ShouldInject bool `json:"should_inject"` + Latency time.Duration `json:"latency"` // 注入的延迟时间 + Probability float64 `json:"probability"` // 注入概率 (0-1) + Pattern string `json:"pattern"` // 路径匹配模式(可选) +} + +// NewHTTPLatencyInjector 创建HTTP延迟注入器 +func 
NewHTTPLatencyInjector(mockErrorServiceURL string, serviceName, serviceVersion string, logger *observability.Logger) *HTTPLatencyInjector { + client := client.NewBaseHTTPClient(mockErrorServiceURL, 5*time.Second, "latency-injector", logger) + + injector := &HTTPLatencyInjector{ + mockErrorClient: client, + serviceName: serviceName, + serviceVersion: serviceVersion, + logger: logger, + cache: make(map[string]*CachedLatencyConfig), + cacheTTL: 30 * time.Second, + } + + // 启动缓存清理 + go injector.cleanupCache() + + return injector +} + +// GetLatencyConfig 获取延迟配置 +func (h *HTTPLatencyInjector) GetLatencyConfig(ctx context.Context, path string) (*LatencyConfig, error) { + // 构建缓存键(基于版本) + cacheKey := h.serviceName + ":" + h.serviceVersion + ":" + path + + // 检查缓存 + h.cacheMu.RLock() + if cached, exists := h.cache[cacheKey]; exists && time.Now().Before(cached.ExpiresAt) { + h.cacheMu.RUnlock() + return cached.Config, nil + } + h.cacheMu.RUnlock() + + // 查询Mock Error Service获取延迟配置 + request := map[string]string{ + "service": h.serviceName, + "version": h.serviceVersion, + "path": path, + "type": "http_latency", + } + + var response struct { + ShouldInject bool `json:"should_inject"` + Latency int64 `json:"latency_ms"` // 毫秒 + Probability float64 `json:"probability"` + Pattern string `json:"pattern"` + } + + opts := client.RequestOptions{ + Method: "POST", + Path: "/api/v1/latency-inject/check", + Body: request, + } + + err := h.mockErrorClient.DoRequestWithJSON(ctx, opts, &response) + if err != nil { + h.logger.Debug(ctx, "Failed to check latency injection", + observability.Error(err), + observability.String("path", path)) + // 失败时缓存空结果 + h.updateCache(cacheKey, nil) + return nil, nil + } + + // 构建配置 + var config *LatencyConfig + if response.ShouldInject { + config = &LatencyConfig{ + ShouldInject: true, + Latency: time.Duration(response.Latency) * time.Millisecond, + Probability: response.Probability, + Pattern: response.Pattern, + } + } + + // 更新缓存 + 
h.updateCache(cacheKey, config)
+
+	return config, nil
+}
+
+// InjectLatency waits for the latency configured for path, if any, and returns how long it
+// actually waited (0 when nothing was injected). The wait ends early if ctx is cancelled.
+func (h *HTTPLatencyInjector) InjectLatency(ctx context.Context, path string) time.Duration {
+	config, err := h.GetLatencyConfig(ctx, path)
+	if err != nil || config == nil || !config.ShouldInject || config.Latency <= 0 {
+		return 0
+	}
+
+	// Probabilistic sampling. NOTE: the clock's low digits are only a rough random source.
+	if config.Probability < 1.0 && time.Now().UnixNano()%100 >= int64(config.Probability*100) {
+		return 0
+	}
+
+	h.logger.Info(ctx, "Injecting HTTP latency",
+		observability.String("service", h.serviceName),
+		observability.String("version", h.serviceVersion),
+		observability.String("path", path),
+		observability.Duration("latency", config.Latency))
+
+	// Wait on a timer instead of time.Sleep so an abandoned request stops waiting promptly.
+	start := time.Now()
+	timer := time.NewTimer(config.Latency)
+	defer timer.Stop()
+	select {
+	case <-timer.C:
+		return config.Latency
+	case <-ctx.Done():
+		// Cancelled mid-wait: report only the delay that was really applied.
+		return time.Since(start)
+	}
+}
+
+// updateCache stores config under key with a fresh TTL; a nil config is cached as "no injection".
+func (h *HTTPLatencyInjector) updateCache(key string, config *LatencyConfig) {
+	h.cacheMu.Lock()
+	defer h.cacheMu.Unlock()
+
+	h.cache[key] = &CachedLatencyConfig{
+		Config:    config,
+		ExpiresAt: time.Now().Add(h.cacheTTL),
+	}
+}
+
+// cleanupCache evicts expired entries once a minute. NOTE: the loop has no stop signal.
+func (h *HTTPLatencyInjector) cleanupCache() {
+	ticker := time.NewTicker(1 * time.Minute)
+	defer ticker.Stop()
+
+	for range ticker.C {
+		h.cacheMu.Lock()
+		now := time.Now()
+		for key, cached := range h.cache {
+			if now.After(cached.ExpiresAt) {
+				delete(h.cache, key)
+			}
+		}
+		h.cacheMu.Unlock()
+	}
+}
+
+// Cleanup drops every cached entry. It does not stop the cleanupCache goroutine.
+func (h *HTTPLatencyInjector) Cleanup() {
+	h.cacheMu.Lock()
+	defer h.cacheMu.Unlock()
+	h.cache = make(map[string]*CachedLatencyConfig)
+}
diff --git a/mock/s3/shared/models/error.go b/mock/s3/shared/models/error.go
index 7ebd5b2..2643054 100644
--- a/mock/s3/shared/models/error.go
+++ b/mock/s3/shared/models/error.go
@@ -6,13 +6,13 @@ import (
 
 // MetricAnomalyRule 指标异常注入规则
 type MetricAnomalyRule struct {
-	ID string `json:"id"`
-	Name string `json:"name"`
-	Service string `json:"service"` // 目标服务
-	Instance string
`json:"instance,omitempty"` // 目标实例,可选 - MetricName string `json:"metric_name"` // 目标指标名称 - AnomalyType string `json:"anomaly_type"` - Enabled bool `json:"enabled"` + ID string `json:"id"` + Name string `json:"name"` + Service string `json:"service"` // 目标服务 + Instance string `json:"instance,omitempty"` // 目标实例,可选 + MetricName string `json:"metric_name"` // 目标指标名称 + AnomalyType string `json:"anomaly_type"` + Enabled bool `json:"enabled"` // 异常参数 TargetValue float64 `json:"target_value"` // 目标异常值 diff --git a/mock/s3/shared/observability/metrics.go b/mock/s3/shared/observability/metrics.go index 3c5bdeb..d6a8343 100644 --- a/mock/s3/shared/observability/metrics.go +++ b/mock/s3/shared/observability/metrics.go @@ -12,6 +12,7 @@ import ( "time" "github.com/prometheus/procfs" + "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" ) @@ -47,6 +48,9 @@ type MetricCollector struct { networkQPS metric.Float64Gauge machineOnlineStatus metric.Int64Gauge + // HTTP 请求指标 + httpRequestDuration metric.Float64Histogram + // 统计状态 cpuStats *CPUStats networkStats *NetworkStats @@ -55,6 +59,10 @@ type MetricCollector struct { // 错误注入器 metricInjector MetricInjector + + // 服务属性 + serviceName string + serviceVersion string } // NewMetricCollector 创建指标收集器 @@ -83,6 +91,12 @@ func NewMetricCollector(meter metric.Meter, logger *Logger) (*MetricCollector, e return collector, nil } +// SetServiceInfo 设置服务信息 +func (c *MetricCollector) SetServiceInfo(serviceName, serviceVersion string) { + c.serviceName = serviceName + c.serviceVersion = serviceVersion +} + // SetMetricInjector 设置错误注入器 func (c *MetricCollector) SetMetricInjector(injector MetricInjector) { c.metricInjector = injector @@ -139,6 +153,18 @@ func (c *MetricCollector) initMetrics() error { return err } + // HTTP 请求时延 + if c.httpRequestDuration, err = c.meter.Float64Histogram( + "http_latency", + metric.WithDescription("HTTP server request duration in seconds"), + metric.WithUnit("s"), + 
metric.WithExplicitBucketBoundaries( + 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5, 7.5, 10, + ), + ); err != nil { + return err + } + return nil } @@ -413,7 +439,13 @@ func (c *MetricCollector) collectNetworkMetrics(ctx context.Context) { finalValue = c.metricInjector.InjectMetricAnomaly(ctx, "system_network_qps", qps) } - c.networkQPS.Record(ctx, finalValue) + // 添加服务版本标签(exported_job 冗余,已通过 service_name 资源属性暴露) + attrs := []attribute.KeyValue{} + if c.serviceVersion != "" { + attrs = append(attrs, attribute.String("service_version", c.serviceVersion)) + } + + c.networkQPS.Record(ctx, finalValue, metric.WithAttributes(attrs...)) } c.networkStats.lastUpdate = now } @@ -486,3 +518,21 @@ func (c *MetricCollector) updateMachineStatus(ctx context.Context) { c.machineOnlineStatus.Record(ctx, int64(finalValue)) } + +// RecordHTTPRequestDuration 记录 HTTP 请求时延 +func (c *MetricCollector) RecordHTTPRequestDuration(ctx context.Context, duration float64, method, path string, statusCode int) { + // 构建属性标签(移除 exported_job,保留 service_version) + attrs := []attribute.KeyValue{ + attribute.String("http.method", method), + attribute.String("http.route", path), + attribute.Int("http.status_code", statusCode), + } + + // 添加服务版本(必要标签,用于版本区分) + if c.serviceVersion != "" { + attrs = append(attrs, attribute.String("service_version", c.serviceVersion)) + } + + // 记录时延(以秒为单位) + c.httpRequestDuration.Record(ctx, duration, metric.WithAttributes(attrs...)) +} diff --git a/mock/s3/shared/observability/middleware.go b/mock/s3/shared/observability/middleware.go index 265bc3e..5b22f85 100644 --- a/mock/s3/shared/observability/middleware.go +++ b/mock/s3/shared/observability/middleware.go @@ -1,16 +1,23 @@ package observability import ( + "context" "time" "github.com/gin-gonic/gin" "go.opentelemetry.io/contrib/instrumentation/github.com/gin-gonic/gin/otelgin" ) +// LatencyInjector 延迟注入器接口 +type LatencyInjector interface { + InjectLatency(ctx context.Context, path string) 
time.Duration
+}
+
 // HTTPMiddleware HTTP监控中间件
 type HTTPMiddleware struct {
-	collector *MetricCollector
-	logger *Logger
+	collector       *MetricCollector
+	logger          *Logger
+	latencyInjector LatencyInjector
 }
 
 // NewHTTPMiddleware 创建HTTP中间件
@@ -21,34 +28,77 @@ func NewHTTPMiddleware(collector *MetricCollector, logger *Logger) *HTTPMiddlewa
 	}
 }
 
+// SetLatencyInjector installs the injector consulted before each request; nil disables injection.
+func (m *HTTPMiddleware) SetLatencyInjector(injector LatencyInjector) {
+	m.latencyInjector = injector
+}
+
 // GinMetricsMiddleware Gin指标中间件
 func (m *HTTPMiddleware) GinMetricsMiddleware() gin.HandlerFunc {
 	return func(c *gin.Context) {
+		// Use the route template as the path label; fall back to the raw URL path when it is empty.
+		path := c.FullPath()
+		if path == "" {
+			path = c.Request.URL.Path
+		}
+
+		// Start timing before injecting latency so that the measured duration
+		// (and the http_latency metric derived from it) includes the injected delay.
+		start := time.Now()
+
+		// Inject latency ahead of the handler when an injector is configured.
+		var injectedLatency time.Duration
+		if m.latencyInjector != nil {
+			injectedLatency = m.latencyInjector.InjectLatency(c.Request.Context(), path)
+		}
-		start := time.Now()
 
 		// 处理请求
 		c.Next()
 
-		// 计算基本信息用于日志记录
+		// Request duration, including any injected latency.
 		duration := time.Since(start)
 		statusCode := c.Writer.Status()
 
+		// Record the HTTP request duration metric, in seconds.
+		if m.collector != nil {
+			durationSeconds := duration.Seconds()
+			m.collector.RecordHTTPRequestDuration(
+				c.Request.Context(),
+				durationSeconds,
+				c.Request.Method,
+				path,
+				statusCode,
+			)
+		}
+
 		// 只记录错误请求的日志
 		if statusCode >= 400 {
 			m.logger.Warn(c.Request.Context(), "HTTP request completed with error",
 				String("method", c.Request.Method),
-				String("path", c.FullPath()),
+				String("path", path),
 				Int("status", statusCode),
 				Duration("duration", duration),
+				Duration("injected_latency", injectedLatency),
 			)
 		}
 
-		m.logger.Info(c.Request.Context(), "HTTP request completed",
-			String("method", c.Request.Method),
-			String("path", c.FullPath()),
-			Int("status", statusCode),
-			Duration("duration", duration),
-		)
+		// Log every request; include the injected latency when there was one.
+		if injectedLatency > 0 {
+			m.logger.Info(c.Request.Context(), "HTTP request completed with injected latency",
+				String("method", c.Request.Method),
+				String("path", path),
+				Int("status", statusCode),
+
Duration("duration", duration), + Duration("injected_latency", injectedLatency), + ) + } else { + m.logger.Info(c.Request.Context(), "HTTP request completed", + String("method", c.Request.Method), + String("path", path), + Int("status", statusCode), + Duration("duration", duration), + ) + } } } diff --git a/mock/s3/shared/observability/observability.go b/mock/s3/shared/observability/observability.go index a6efafa..7d238ce 100644 --- a/mock/s3/shared/observability/observability.go +++ b/mock/s3/shared/observability/observability.go @@ -30,6 +30,9 @@ func Setup(serviceName string, configPath string) (*Providers, *MetricCollector, return nil, nil, nil, fmt.Errorf("failed to create metric collector: %w", err) } + // 设置服务信息到指标收集器 + collector.SetServiceInfo(config.ServiceName, config.ServiceVersion) + // 创建HTTP中间件 httpMiddleware := NewHTTPMiddleware(collector, providers.Logger) diff --git a/mock/s3/shared/observability/providers.go b/mock/s3/shared/observability/providers.go index 6f93882..0f28ba5 100644 --- a/mock/s3/shared/observability/providers.go +++ b/mock/s3/shared/observability/providers.go @@ -1,12 +1,11 @@ package observability import ( - "context" - "fmt" - "mocks3/shared/observability/config" - "mocks3/shared/utils" + "context" + "fmt" + "mocks3/shared/observability/config" - "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" @@ -183,14 +182,11 @@ func (p *Providers) Shutdown(ctx context.Context) error { // createResource 创建OTEL资源 func createResource(config *config.ObservabilityConfig) (*resource.Resource, error) { - // 使用统一的实例ID生成器 - instanceID := utils.GetInstanceID(config.ServiceName) - return resource.New(context.Background(), - resource.WithAttributes( - semconv.ServiceName(config.ServiceName), - semconv.ServiceVersion(config.ServiceVersion), - 
semconv.DeploymentEnvironment(config.Environment), - semconv.ServiceInstanceID(instanceID), - ), - ) + return resource.New(context.Background(), + resource.WithAttributes( + semconv.ServiceName(config.ServiceName), + semconv.ServiceVersion(config.ServiceVersion), + semconv.DeploymentEnvironment(config.Environment), + ), + ) } diff --git a/mock/s3/shared/server/service_bootstrap.go b/mock/s3/shared/server/service_bootstrap.go index aa7d986..6be5825 100644 --- a/mock/s3/shared/server/service_bootstrap.go +++ b/mock/s3/shared/server/service_bootstrap.go @@ -1,15 +1,15 @@ package server import ( - "context" - "fmt" - "mocks3/shared/observability" - "net/http" - "net" - "os" - "os/signal" - "syscall" - "time" + "context" + "fmt" + "mocks3/shared/observability" + "net" + "net/http" + "os" + "os/signal" + "syscall" + "time" "github.com/gin-gonic/gin" "mocks3/shared/middleware/consul" @@ -47,7 +47,8 @@ type ServiceBootstrap struct { HTTPMiddleware *observability.HTTPMiddleware // 错误注入 - MetricInjector *error_injection.MetricInjector + MetricInjector *error_injection.MetricInjector + LatencyInjector *error_injection.HTTPLatencyInjector // Consul客户端 ConsulClient consul.ConsulClient @@ -174,7 +175,7 @@ func (sb *ServiceBootstrap) setupObservability() error { // setupConsulRegistration 设置Consul服务注册 func (sb *ServiceBootstrap) setupConsulRegistration() error { - ctx := context.Background() + ctx := context.Background() // 检查配置是否支持Consul consulConfig, ok := sb.Config.(ConsulServiceConfig) @@ -191,23 +192,23 @@ func (sb *ServiceBootstrap) setupConsulRegistration() error { sb.ConsulClient = consulClient - // 注册服务到Consul - // 优先使用可达的容器/主机实例IP地址进行注册,确保多实例下目标唯一 - var registerAddress string - if sb.Config.GetHost() == "0.0.0.0" { - // 允许通过环境变量覆盖对外公布地址 - if envAddr := os.Getenv("ADVERTISE_ADDR"); envAddr != "" { - registerAddress = envAddr - } else { - ip, err := detectAdvertiseAddr() - if err != nil { - return fmt.Errorf("failed to detect advertise address: %w", err) - } - 
registerAddress = ip - } - } else { - registerAddress = sb.Config.GetHost() - } + // 注册服务到Consul + // 优先使用可达的容器/主机实例IP地址进行注册,确保多实例下目标唯一 + var registerAddress string + if sb.Config.GetHost() == "0.0.0.0" { + // 允许通过环境变量覆盖对外公布地址 + if envAddr := os.Getenv("ADVERTISE_ADDR"); envAddr != "" { + registerAddress = envAddr + } else { + ip, err := detectAdvertiseAddr() + if err != nil { + return fmt.Errorf("failed to detect advertise address: %w", err) + } + registerAddress = ip + } + } else { + registerAddress = sb.Config.GetHost() + } err = consul.RegisterService(ctx, consulClient, sb.Config.GetServiceName(), @@ -217,79 +218,83 @@ func (sb *ServiceBootstrap) setupConsulRegistration() error { return fmt.Errorf("failed to register service with Consul: %w", err) } - sb.Logger.Info(ctx, "Service registered with Consul successfully", - observability.String("consul_addr", consulConfig.GetConsulAddress()), - observability.String("service_name", sb.Config.GetServiceName()), - observability.String("register_address", registerAddress)) + sb.Logger.Info(ctx, "Service registered with Consul successfully", + observability.String("consul_addr", consulConfig.GetConsulAddress()), + observability.String("service_name", sb.Config.GetServiceName()), + observability.String("register_address", registerAddress)) - return nil + return nil } // detectAdvertiseAddr 自动探测一个非回环的IPv4地址,优先选择常见容器网卡 func detectAdvertiseAddr() (string, error) { - // 优先尝试常见的容器网卡名称 - preferredIfaces := []string{"eth0", "ens3", "ens4", "en0"} - for _, name := range preferredIfaces { - ifi, err := net.InterfaceByName(name) - if err == nil && (ifi.Flags&net.FlagUp) != 0 { - addrs, err := ifi.Addrs() - if err == nil { - if ip := firstIPv4(addrs); ip != "" { - return ip, nil - } - } - } - } - - // 回退:遍历所有网卡,取第一个非回环且Up的IPv4 - ifaces, err := net.Interfaces() - if err != nil { - return "", err - } - for _, ifi := range ifaces { - if (ifi.Flags&net.FlagUp) == 0 || (ifi.Flags&net.FlagLoopback) != 0 { - continue - } - addrs, err := 
ifi.Addrs() - if err != nil { - continue - } - if ip := firstIPv4(addrs); ip != "" { - return ip, nil - } - } - return "", fmt.Errorf("no non-loopback IPv4 address found") + // 优先尝试常见的容器网卡名称 + preferredIfaces := []string{"eth0", "ens3", "ens4", "en0"} + for _, name := range preferredIfaces { + ifi, err := net.InterfaceByName(name) + if err == nil && (ifi.Flags&net.FlagUp) != 0 { + addrs, err := ifi.Addrs() + if err == nil { + if ip := firstIPv4(addrs); ip != "" { + return ip, nil + } + } + } + } + + // 回退:遍历所有网卡,取第一个非回环且Up的IPv4 + ifaces, err := net.Interfaces() + if err != nil { + return "", err + } + for _, ifi := range ifaces { + if (ifi.Flags&net.FlagUp) == 0 || (ifi.Flags&net.FlagLoopback) != 0 { + continue + } + addrs, err := ifi.Addrs() + if err != nil { + continue + } + if ip := firstIPv4(addrs); ip != "" { + return ip, nil + } + } + return "", fmt.Errorf("no non-loopback IPv4 address found") } func firstIPv4(addrs []net.Addr) string { - for _, a := range addrs { - var ip net.IP - switch v := a.(type) { - case *net.IPNet: - ip = v.IP - case *net.IPAddr: - ip = v.IP - } - if ip == nil { - continue - } - ip4 := ip.To4() - if ip4 == nil || ip4.IsLoopback() { - continue - } - return ip4.String() - } - return "" + for _, a := range addrs { + var ip net.IP + switch v := a.(type) { + case *net.IPNet: + ip = v.IP + case *net.IPAddr: + ip = v.IP + } + if ip == nil { + continue + } + ip4 := ip.To4() + if ip4 == nil || ip4.IsLoopback() { + continue + } + return ip4.String() + } + return "" } // setupErrorInjection 设置错误注入中间件 func (sb *ServiceBootstrap) setupErrorInjection() error { ctx := context.Background() - // 尝试从配置文件加载 + // 获取服务版本(默认为 1.0.0) + serviceVersion := "1.0.0" + + // 尝试从配置文件加载指标注入器 metricInjector, err := error_injection.NewMetricInjector( sb.MetricInjectorConfigPath, sb.Config.GetServiceName(), + serviceVersion, sb.Logger, ) @@ -300,6 +305,7 @@ func (sb *ServiceBootstrap) setupErrorInjection() error { sb.MetricInjector = 
error_injection.NewMetricInjectorWithDefaults( "http://mock-error-service:8085", sb.Config.GetServiceName(), + serviceVersion, sb.Logger, ) } else { @@ -307,7 +313,24 @@ func (sb *ServiceBootstrap) setupErrorInjection() error { } if sb.MetricInjector != nil { - sb.Logger.Info(ctx, "Metric injector initialized successfully") + sb.Logger.Info(ctx, "Metric injector initialized successfully", + observability.String("service_version", serviceVersion)) + } + + // 创建HTTP延迟注入器 + sb.LatencyInjector = error_injection.NewHTTPLatencyInjector( + "http://mock-error-service:8085", + sb.Config.GetServiceName(), + serviceVersion, + sb.Logger, + ) + + // 将延迟注入器连接到HTTP中间件 + if sb.HTTPMiddleware != nil && sb.LatencyInjector != nil { + sb.HTTPMiddleware.SetLatencyInjector(sb.LatencyInjector) + sb.Logger.Info(ctx, "HTTP latency injector connected to middleware", + observability.String("service", sb.Config.GetServiceName()), + observability.String("version", serviceVersion)) } return nil @@ -407,6 +430,12 @@ func (sb *ServiceBootstrap) waitForShutdown(server *http.Server) { sb.Logger.Info(ctx, "Metric injector cleaned up") } + // 清理延迟注入器资源 + if sb.LatencyInjector != nil { + sb.LatencyInjector.Cleanup() + sb.Logger.Info(ctx, "Latency injector cleaned up") + } + // 关闭HTTP服务器 shutdownCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() diff --git a/mock/s3/shared/utils/instance.go b/mock/s3/shared/utils/instance.go index 8f8e519..79ad013 100644 --- a/mock/s3/shared/utils/instance.go +++ b/mock/s3/shared/utils/instance.go @@ -62,7 +62,7 @@ func GetInstanceID(serviceName string) string { func generateInstanceID(serviceName string) string { // 清理服务名:移除常见后缀,转换为小写 cleanServiceName := cleanServiceName(serviceName) - + // 生成8位短UUID shortUUID := generateShortUUID() if shortUUID == "" { @@ -75,7 +75,7 @@ func generateInstanceID(serviceName string) string { // cleanServiceName 清理服务名 func cleanServiceName(serviceName string) string { name := 
strings.ToLower(serviceName) - + // 移除常见后缀 suffixes := []string{"-service", "_service", "service"} for _, suffix := range suffixes { @@ -84,11 +84,11 @@ func cleanServiceName(serviceName string) string { break } } - + // 替换特殊字符为连字符 name = strings.ReplaceAll(name, "_", "-") name = strings.ReplaceAll(name, " ", "-") - + return name } @@ -106,4 +106,4 @@ func ResetInstanceID() { instanceIDMutex.Lock() defer instanceIDMutex.Unlock() cachedInstanceID = "" -} \ No newline at end of file +} diff --git a/scripts/prometheus_adapter/build.sh b/scripts/prometheus_adapter/build.sh new file mode 100755 index 0000000..d75f0b8 --- /dev/null +++ b/scripts/prometheus_adapter/build.sh @@ -0,0 +1,264 @@ +#!/bin/bash + +# Prometheus Adapter 打包脚本 +# 将编译产物和必要文件打包到 build 目录 + +set -e + +# 颜色输出 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# 打印日志函数 +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +# 项目根目录 +PROJECT_ROOT=$(cd "$(dirname "$0")"/../.. && pwd) +cd "$PROJECT_ROOT" + +# 配置 +APP_NAME="prometheus_adapter" +BUILD_DIR="build/${APP_NAME}" +VERSION=$(git describe --tags --always --dirty 2>/dev/null || echo "dev") +BUILD_TIME=$(date -u '+%Y-%m-%d_%H:%M:%S') +GOOS=${GOOS:-linux} +GOARCH=${GOARCH:-amd64} + +log_info "开始构建 ${APP_NAME}" +log_info "版本: ${VERSION}" +log_info "构建时间: ${BUILD_TIME}" +log_info "目标系统: ${GOOS}/${GOARCH}" + +# 清理旧的构建目录 +if [ -d "$BUILD_DIR" ]; then + log_warn "清理旧的构建目录..." + rm -rf "$BUILD_DIR" +fi + +# 创建构建目录 +log_info "创建构建目录..." +mkdir -p "$BUILD_DIR/bin" +mkdir -p "$BUILD_DIR/config" +mkdir -p "$BUILD_DIR/docs" +mkdir -p "$BUILD_DIR/scripts" +mkdir -p "$BUILD_DIR/rules" + +# 编译二进制文件 +log_info "编译 ${APP_NAME}..." 
+LDFLAGS="-X main.Version=${VERSION} -X main.BuildTime=${BUILD_TIME}" +CGO_ENABLED=0 GOOS=$GOOS GOARCH=$GOARCH go build \ + -ldflags "$LDFLAGS" \ + -o "$BUILD_DIR/bin/${APP_NAME}" \ + "./cmd/${APP_NAME}" + +if [ $? -ne 0 ]; then + log_error "编译失败" + exit 1 +fi + +# 复制配置文件 +log_info "复制配置文件..." +if [ -f "internal/${APP_NAME}/config/prometheus_adapter.yml" ]; then + cp "internal/${APP_NAME}/config/prometheus_adapter.yml" "$BUILD_DIR/config/" + log_info "已复制配置文件到 $BUILD_DIR/config/" +else + log_warn "未找到配置文件,使用默认配置" +fi + +# 复制文档 +log_info "复制文档..." +if [ -f "docs/${APP_NAME}/README.md" ]; then + cp "docs/${APP_NAME}/README.md" "$BUILD_DIR/docs/" +fi + +# 复制测试脚本 +log_info "复制脚本..." +if [ -f "internal/${APP_NAME}/test_alert_update.sh" ]; then + cp "internal/${APP_NAME}/test_alert_update.sh" "$BUILD_DIR/scripts/" + chmod +x "$BUILD_DIR/scripts/test_alert_update.sh" +fi + +# 复制规则文件 +log_info "复制规则文件..." +if [ -d "internal/${APP_NAME}/rules" ]; then + cp -r "internal/${APP_NAME}/rules/"* "$BUILD_DIR/rules/" 2>/dev/null || true + log_info "已复制规则文件到 $BUILD_DIR/rules/" +else + # 如果没有规则文件夹,创建一个空的规则文件 + log_warn "未找到规则目录,创建默认规则文件..." + cat > "$BUILD_DIR/rules/alert_rules.yml" << 'RULES_EOF' +# Prometheus Alert Rules +# This file is managed by the Prometheus Adapter service +# It will be loaded on startup and saved on shutdown + +groups: [] +RULES_EOF +fi + +# 创建启动脚本 +log_info "创建启动脚本..." +cat > "$BUILD_DIR/start.sh" << 'EOF' +#!/bin/bash + +# Prometheus Adapter 启动脚本 + +# 获取脚本所在目录 +SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) +BIN_PATH="$SCRIPT_DIR/bin/prometheus_adapter" +CONFIG_FILE="$SCRIPT_DIR/config/prometheus_adapter.yml" +PID_FILE="$SCRIPT_DIR/prometheus_adapter.pid" +LOG_FILE="$SCRIPT_DIR/prometheus_adapter.log" + +# 检查二进制文件 +if [ ! 
-f "$BIN_PATH" ]; then + echo "错误: 找不到可执行文件 $BIN_PATH" + exit 1 +fi + +# 检查是否已在运行 +if [ -f "$PID_FILE" ]; then + PID=$(cat "$PID_FILE") + if kill -0 "$PID" 2>/dev/null; then + echo "Prometheus Adapter已在运行 (PID: $PID)" + exit 1 + else + rm -f "$PID_FILE" + fi +fi + +# 检查配置文件 +if [ -f "$CONFIG_FILE" ]; then + echo "使用配置文件: $CONFIG_FILE" +else + echo "警告: 找不到配置文件 $CONFIG_FILE,将使用默认配置" +fi + +# 环境变量(可选,用于覆盖配置文件) +# export PROMETHEUS_ADDRESS="http://localhost:9090" +# export ALERT_WEBHOOK_URL="http://alert-module:8080/v1/integrations/alertmanager/webhook" +# export ALERT_POLLING_INTERVAL="10s" +# export SERVER_BIND_ADDR="0.0.0.0:9999" + +echo "启动 Prometheus Adapter..." + +# 切换到脚本目录 +cd "$SCRIPT_DIR" + +# 后台启动服务 +nohup "$BIN_PATH" > "$LOG_FILE" 2>&1 & +PID=$! + +# 保存PID +echo $PID > "$PID_FILE" + +echo "Prometheus Adapter已启动" +echo "PID: $PID" +echo "日志文件: $LOG_FILE" +echo "PID文件: $PID_FILE" +echo "" +echo "查看日志: tail -f $LOG_FILE" +echo "停止服务: ./stop.sh" +EOF +chmod +x "$BUILD_DIR/start.sh" + +# 创建停止脚本 +log_info "创建停止脚本..." +cat > "$BUILD_DIR/stop.sh" << 'EOF' +#!/bin/bash + +# Prometheus Adapter 停止脚本 + +# 获取脚本所在目录 +SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) +PID_FILE="$SCRIPT_DIR/prometheus_adapter.pid" +APP_NAME="prometheus_adapter" + +# 优先从PID文件读取 +if [ -f "$PID_FILE" ]; then + PID=$(cat "$PID_FILE" 2>/dev/null) + if [ -n "$PID" ] && kill -0 "$PID" 2>/dev/null; then + echo "从PID文件获取进程ID: $PID" + else + echo "PID文件中的进程已不存在,清理PID文件" + rm -f "$PID_FILE" + PID="" + fi +else + PID="" +fi + +# 如果PID文件不存在或进程已死,通过进程名查找 +if [ -z "$PID" ]; then + PID=$(ps aux | grep -v grep | grep "$APP_NAME" | awk '{print $2}') +fi + +if [ -z "$PID" ]; then + echo "没有找到运行中的 $APP_NAME 进程" + exit 0 +fi + +echo "停止 $APP_NAME (PID: $PID)..." +kill -TERM $PID 2>/dev/null || true + +# 等待进程退出 +count=0 +while [ $count -lt 10 ] && ps -p "$PID" > /dev/null 2>&1; do + sleep 1 + count=$((count + 1)) +done + +# 检查是否已退出 +if ps -p "$PID" > /dev/null 2>&1; then + echo "强制停止 $APP_NAME..." 
+ kill -KILL "$PID" 2>/dev/null || true +fi + +# 清理PID文件 +if [ -f "$PID_FILE" ]; then + rm -f "$PID_FILE" +fi + +echo "$APP_NAME 已停止" +EOF +chmod +x "$BUILD_DIR/stop.sh" + +# 创建版本信息文件 +log_info "创建版本信息..." +cat > "$BUILD_DIR/VERSION" << EOF +Application: ${APP_NAME} +Version: ${VERSION} +Build Time: ${BUILD_TIME} +Build OS/Arch: ${GOOS}/${GOARCH} +EOF + +# 打包成 tar.gz +ARCHIVE_NAME="${APP_NAME}_${VERSION}_${GOOS}_${GOARCH}.tar.gz" +log_info "创建归档文件: $ARCHIVE_NAME" +cd build +tar -czf "$ARCHIVE_NAME" "$APP_NAME" +cd .. + +# 输出构建信息 +log_info "构建成功!" +echo "" +echo "构建产物:" +echo " - 目录: $BUILD_DIR" +echo " - 归档: build/$ARCHIVE_NAME" +echo "" +echo "文件列表:" +ls -lah "$BUILD_DIR/" +echo "" +echo "归档大小:" +ls -lah "build/$ARCHIVE_NAME" \ No newline at end of file diff --git a/scripts/prometheus_adapter/deploy.sh b/scripts/prometheus_adapter/deploy.sh new file mode 100755 index 0000000..6f53a6f --- /dev/null +++ b/scripts/prometheus_adapter/deploy.sh @@ -0,0 +1,394 @@ +#!/bin/bash + +# Prometheus Adapter 部署脚本 +# 将打包好的文件解压并部署到指定目录 + +set -e + +# 颜色输出 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 打印日志函数 +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_debug() { + echo -e "${BLUE}[DEBUG]${NC} $1" +} + +# 显示使用帮助 +show_usage() { + cat << EOF +使用方法: + $0 [选项] <归档文件> + +选项: + -d, --deploy-dir DIR 指定部署目录 (默认: /home/qboxserver/zeroops_prometheus_adapter) + -b, --backup 部署前备份现有目录 + -s, --start 部署后自动启动服务 + -r, --restart 如果服务已运行则重启 + -f, --force 强制部署,不询问确认 + -h, --help 显示此帮助信息 + +示例: + $0 prometheus_adapter_v1.0.0_linux_amd64.tar.gz + $0 -d /opt/prometheus_adapter -b -s prometheus_adapter.tar.gz + $0 --backup --restart prometheus_adapter.tar.gz + +EOF + exit 0 +} + +# 默认配置 +DEPLOY_DIR="/home/qboxserver/zeroops_prometheus_adapter" +BACKUP=false +START_SERVICE=false +RESTART_SERVICE=false +FORCE_DEPLOY=false 
+ARCHIVE_FILE="" + +# 解析命令行参数 +while [[ $# -gt 0 ]]; do + case $1 in + -d|--deploy-dir) + DEPLOY_DIR="$2" + shift 2 + ;; + -b|--backup) + BACKUP=true + shift + ;; + -s|--start) + START_SERVICE=true + shift + ;; + -r|--restart) + RESTART_SERVICE=true + shift + ;; + -f|--force) + FORCE_DEPLOY=true + shift + ;; + -h|--help) + show_usage + ;; + *) + if [ -z "$ARCHIVE_FILE" ]; then + ARCHIVE_FILE="$1" + else + log_error "未知参数: $1" + show_usage + fi + shift + ;; + esac +done + +# 检查归档文件参数 +if [ -z "$ARCHIVE_FILE" ]; then + log_error "请指定要部署的归档文件" + show_usage +fi + +# 检查归档文件是否存在 +if [ ! -f "$ARCHIVE_FILE" ]; then + log_error "找不到归档文件: $ARCHIVE_FILE" + exit 1 +fi + +# 获取归档文件的绝对路径 +ARCHIVE_FILE=$(realpath "$ARCHIVE_FILE") + +log_info "部署配置:" +log_info " 归档文件: $ARCHIVE_FILE" +log_info " 部署目录: $DEPLOY_DIR" +log_info " 备份现有: $BACKUP" +log_info " 自动启动: $START_SERVICE" +log_info " 重启服务: $RESTART_SERVICE" + +# 确认部署 +if [ "$FORCE_DEPLOY" = false ]; then + echo -n "确认部署? (y/N): " + read -r CONFIRM + if [ "$CONFIRM" != "y" ] && [ "$CONFIRM" != "Y" ]; then + log_warn "部署已取消" + exit 0 + fi +fi + +# 检查是否有运行中的服务 +check_running_service() { + # 优先从PID文件读取 + if [ -f "$DEPLOY_DIR/prometheus_adapter.pid" ]; then + local pid=$(cat "$DEPLOY_DIR/prometheus_adapter.pid" 2>/dev/null) + if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then + echo "$pid" + return + fi + fi + + # 如果PID文件不存在或进程已死,通过进程名查找 + local pid=$(ps aux | grep -v grep | grep "prometheus_adapter" | grep -v "$0" | awk '{print $2}') + if [ -n "$pid" ]; then + echo "$pid" + fi +} + +# 停止运行中的服务 +stop_service() { + local pid=$1 + if [ -n "$pid" ]; then + log_warn "停止运行中的服务 (PID: $pid)..." + kill -TERM "$pid" 2>/dev/null || true + + # 等待进程退出 + local count=0 + while [ $count -lt 10 ] && ps -p "$pid" > /dev/null 2>&1; do + sleep 1 + count=$((count + 1)) + done + + # 如果还没退出,强制停止 + if ps -p "$pid" > /dev/null 2>&1; then + log_warn "强制停止进程..." 
+ kill -KILL "$pid" 2>/dev/null || true + fi + + # 清理PID文件 + if [ -f "$DEPLOY_DIR/prometheus_adapter.pid" ]; then + rm -f "$DEPLOY_DIR/prometheus_adapter.pid" + fi + + log_info "服务已停止" + fi +} + +# 检查运行中的服务 +RUNNING_PID=$(check_running_service) +if [ -n "$RUNNING_PID" ]; then + log_warn "检测到运行中的 prometheus_adapter 服务 (PID: $RUNNING_PID)" + if [ "$RESTART_SERVICE" = true ] || [ "$FORCE_DEPLOY" = true ]; then + stop_service "$RUNNING_PID" + else + log_error "服务正在运行,请先停止服务或使用 -r/--restart 选项" + exit 1 + fi +fi + +# 备份现有目录 +if [ "$BACKUP" = true ] && [ -d "$DEPLOY_DIR" ]; then + BACKUP_DIR="${DEPLOY_DIR}_backup_$(date +%Y%m%d_%H%M%S)" + log_info "备份现有目录到: $BACKUP_DIR" + + # 需要sudo权限 + if [ -w "$(dirname "$DEPLOY_DIR")" ]; then + mv "$DEPLOY_DIR" "$BACKUP_DIR" + else + log_warn "需要管理员权限来备份目录" + sudo mv "$DEPLOY_DIR" "$BACKUP_DIR" + fi +fi + +# 创建临时解压目录 +TEMP_DIR=$(mktemp -d) +log_info "创建临时目录: $TEMP_DIR" + +# 解压归档文件 +log_info "解压归档文件..." +tar -xzf "$ARCHIVE_FILE" -C "$TEMP_DIR" + +# 查找解压后的目录 +EXTRACTED_DIR=$(find "$TEMP_DIR" -maxdepth 1 -type d -name "prometheus_adapter" | head -n 1) +if [ -z "$EXTRACTED_DIR" ]; then + log_error "解压失败:找不到 prometheus_adapter 目录" + rm -rf "$TEMP_DIR" + exit 1 +fi + +# 创建部署目录(如果需要sudo) +log_info "创建部署目录..." +if [ -w "$(dirname "$DEPLOY_DIR")" ]; then + mkdir -p "$(dirname "$DEPLOY_DIR")" +else + log_warn "需要管理员权限来创建部署目录" + sudo mkdir -p "$(dirname "$DEPLOY_DIR")" +fi + +# 移动到部署目录 +log_info "部署到: $DEPLOY_DIR" +if [ -w "$(dirname "$DEPLOY_DIR")" ]; then + if [ -d "$DEPLOY_DIR" ]; then + rm -rf "$DEPLOY_DIR" + fi + mv "$EXTRACTED_DIR" "$DEPLOY_DIR" +else + log_warn "需要管理员权限来部署" + if [ -d "$DEPLOY_DIR" ]; then + sudo rm -rf "$DEPLOY_DIR" + fi + sudo mv "$EXTRACTED_DIR" "$DEPLOY_DIR" +fi + +# 设置权限 +log_info "设置文件权限..." 
+# Decide once whether permission changes need sudo: they do when the deploy
+# directory is not writable by the current user.
+if [ -w "$DEPLOY_DIR" ]; then
+    DEPLOY_NEEDS_SUDO=false
+else
+    DEPLOY_NEEDS_SUDO=true
+fi
+
+# Run the given command directly, or through sudo when the deploy directory is
+# not writable by the current user.
+run_in_deploy_dir() {
+    if [ "$DEPLOY_NEEDS_SUDO" = true ]; then
+        sudo "$@"
+    else
+        "$@"
+    fi
+}
+
+# Executables shipped in the archive.
+run_in_deploy_dir chmod +x "$DEPLOY_DIR/bin/prometheus_adapter"
+run_in_deploy_dir chmod +x "$DEPLOY_DIR/start.sh"
+run_in_deploy_dir chmod +x "$DEPLOY_DIR/stop.sh"
+if [ -f "$DEPLOY_DIR/scripts/test_alert_update.sh" ]; then
+    run_in_deploy_dir chmod +x "$DEPLOY_DIR/scripts/test_alert_update.sh"
+fi
+
+# Make sure the config directory and config file are readable.
+run_in_deploy_dir chmod 755 "$DEPLOY_DIR/config"
+if [ -f "$DEPLOY_DIR/config/prometheus_adapter.yml" ]; then
+    run_in_deploy_dir chmod 644 "$DEPLOY_DIR/config/prometheus_adapter.yml"
+fi
+
+# Make sure the rules directory is writable by its owner.
+run_in_deploy_dir chmod 755 "$DEPLOY_DIR/rules"
+if [ -f "$DEPLOY_DIR/rules/alert_rules.yml" ]; then
+    run_in_deploy_dir chmod 644 "$DEPLOY_DIR/rules/alert_rules.yml"
+fi
+
+# Ownership is only adjusted on the sudo path, where the tree was installed as
+# root: hand the rules directory and the config file to the service user.
+if [ "$DEPLOY_NEEDS_SUDO" = true ]; then
+    sudo chown -R qboxserver:qboxserver "$DEPLOY_DIR/rules"
+    # Guarded like the chmod above, so a package without a config file does
+    # not make chown fail.
+    if [ -f "$DEPLOY_DIR/config/prometheus_adapter.yml" ]; then
+        sudo chown qboxserver:qboxserver "$DEPLOY_DIR/config/prometheus_adapter.yml"
+    fi
+fi
+
+# Remove the scratch directory used for unpacking.
+rm -rf "$TEMP_DIR"
+
+# Print a deployment summary: target directory, VERSION file (if any), listing.
+log_info "部署成功!"
+echo ""
+echo "部署信息:"
+echo " 目录: $DEPLOY_DIR"
+echo ""
+echo "版本信息:"
+if [ -f "$DEPLOY_DIR/VERSION" ]; then
+    cat "$DEPLOY_DIR/VERSION"
+else
+    echo " 无版本信息"
+fi
+echo ""
+echo "文件列表:"
+ls -lah "$DEPLOY_DIR/"
+
+# create_systemd_service writes /etc/systemd/system/prometheus-adapter.service
+# (through sudo tee), reloads systemd, and prints the management commands.
+# The heredoc delimiter is unquoted on purpose so $DEPLOY_DIR is expanded into
+# the unit file at generation time.
+# NOTE(review): ExecStop points at stop.sh, whose contents are not visible
+# here; confirm it works for a systemd-started process, which does not go
+# through the PID-file path used by the nohup start below.
+create_systemd_service() {
+    local service_name="prometheus-adapter"
+    local service_file="/etc/systemd/system/${service_name}.service"
+
+    log_info "创建 systemd 服务..."
+
+    cat << EOF | sudo tee "$service_file" > /dev/null
+[Unit]
+Description=Prometheus Adapter Service
+After=network.target
+
+[Service]
+Type=simple
+User=qboxserver
+Group=qboxserver
+WorkingDirectory=$DEPLOY_DIR
+# 可选:通过环境变量覆盖配置
+#Environment="PROMETHEUS_ADDRESS=http://localhost:9090"
+#Environment="ALERT_WEBHOOK_URL=http://alert-module:8080/v1/integrations/alertmanager/webhook"
+#Environment="ALERT_POLLING_INTERVAL=10s"
+#Environment="SERVER_BIND_ADDR=0.0.0.0:9999"
+ExecStart=$DEPLOY_DIR/bin/prometheus_adapter
+ExecStop=$DEPLOY_DIR/stop.sh
+Restart=on-failure
+RestartSec=10
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+    sudo systemctl daemon-reload
+    log_info "Systemd 服务已创建: ${service_name}.service"
+    echo ""
+    echo "可以使用以下命令管理服务:"
+    echo " 启动: sudo systemctl start ${service_name}"
+    echo " 停止: sudo systemctl stop ${service_name}"
+    echo " 重启: sudo systemctl restart ${service_name}"
+    echo " 状态: sudo systemctl status ${service_name}"
+    echo " 开机自启: sudo systemctl enable ${service_name}"
+}
+
+# Offer to create the systemd unit unless this is a forced deploy.
+if [ "$FORCE_DEPLOY" = false ]; then
+    echo ""
+    echo -n "是否创建 systemd 服务? (y/N): "
+    # read returns non-zero on EOF (e.g. stdin is not a terminal); treat that
+    # as "no" instead of letting a failed read abort the deployment.
+    read -r CREATE_SERVICE || CREATE_SERVICE=""
+    if [ "$CREATE_SERVICE" = "y" ] || [ "$CREATE_SERVICE" = "Y" ]; then
+        create_systemd_service
+    fi
+fi
+
+# Start the service in the background when start or restart was requested;
+# otherwise print manual start/stop instructions.
+if [ "$START_SERVICE" = true ] || [ "$RESTART_SERVICE" = true ]; then
+    log_info "启动服务..."
+
+    # Environment handed to the service process.
+    # NOTE(review): the systemd unit template above lists PROMETHEUS_ADDRESS and
+    # SERVER_BIND_ADDR as the override variables, and the adapter's main.go
+    # reads ADAPTER_PORT; the names exported here may not be consumed by the
+    # binary at all - confirm before relying on them.
+    export PROMETHEUS_URL="${PROMETHEUS_URL:-http://localhost:9090}"
+    export PORT="${PORT:-8080}"
+    export LOG_LEVEL="${LOG_LEVEL:-info}"
+
+    # Everything below uses paths relative to the deploy directory, so refuse
+    # to continue if we cannot enter it.
+    if ! cd "$DEPLOY_DIR"; then
+        log_error "无法进入部署目录: $DEPLOY_DIR"
+        exit 1
+    fi
+
+    # Launch the binary directly rather than through start.sh.
+    nohup ./bin/prometheus_adapter > prometheus_adapter.log 2>&1 &
+    PID=$!
+
+    # Record the PID of the background process.
+    echo "$PID" > prometheus_adapter.pid
+
+    log_info "服务已启动 (PID: $PID)"
+    echo "PID文件: $DEPLOY_DIR/prometheus_adapter.pid"
+    echo "日志文件: $DEPLOY_DIR/prometheus_adapter.log"
+
+    # Give the process a moment to either settle or crash.
+    sleep 2
+
+    # kill -0 only probes whether the process still exists.
+    if kill -0 "$PID" 2>/dev/null; then
+        log_info "服务启动成功,正在运行"
+        echo ""
+        echo "查看日志: tail -f $DEPLOY_DIR/prometheus_adapter.log"
+        echo "停止服务: kill \$(cat $DEPLOY_DIR/prometheus_adapter.pid)"
+    else
+        # Do not leave a PID file that points at a process that already died.
+        rm -f prometheus_adapter.pid
+        log_error "服务启动失败,请检查日志"
+        exit 1
+    fi
+else
+    echo ""
+    echo "手动启动服务:"
+    echo " cd $DEPLOY_DIR"
+    echo " nohup ./bin/prometheus_adapter > prometheus_adapter.log 2>&1 &"
+    echo " echo \$! > prometheus_adapter.pid"
+    echo ""
+    echo "停止服务:"
+    echo " kill \$(cat prometheus_adapter.pid)"
+fi
+
+log_info "部署完成!"
\ No newline at end of file