From 23f91e46cae463c7d74b5eeb172d5d008808e370 Mon Sep 17 00:00:00 2001
From: dnj <shdnj@qq.com>
Date: Fri, 19 Sep 2025 16:41:33 +0800
Subject: [PATCH 01/18] =?UTF-8?q?feat(observability):=20=E6=B7=BB=E5=8A=A0?=
 =?UTF-8?q?HTTP=E8=AF=B7=E6=B1=82=E6=97=B6=E5=BB=B6=E6=8C=87=E6=A0=87?=
 =?UTF-8?q?=E6=94=B6=E9=9B=86=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

实现HTTP中间件记录请求时延并导出到Prometheus指标
添加服务信息到指标收集器
统一代码格式和修复缩进问题
添加Prometheus Adapter API文档
---
 docs/prometheus_adapter/API.md                | 271 ++++++++++++++++++
 .../internal/handler/mock_error_handler.go    |  26 +-
 .../internal/service/mock_error_service.go    |  42 +--
 mock/s3/shared/interfaces/error_injector.go   |   2 +-
 .../error_injection/error_injection.go        |  54 ++--
 mock/s3/shared/models/error.go                |  14 +-
 mock/s3/shared/observability/metrics.go       |  58 +++-
 mock/s3/shared/observability/middleware.go    |  18 +-
 mock/s3/shared/observability/observability.go |   3 +
 mock/s3/shared/observability/providers.go     |  30 +-
 mock/s3/shared/server/service_bootstrap.go    | 164 +++++------
 mock/s3/shared/utils/instance.go              |  10 +-
 12 files changed, 519 insertions(+), 173 deletions(-)
 create mode 100644 docs/prometheus_adapter/API.md

diff --git a/docs/prometheus_adapter/API.md b/docs/prometheus_adapter/API.md
new file mode 100644
index 0000000..4cfe228
--- /dev/null
+++ b/docs/prometheus_adapter/API.md
@@ -0,0 +1,271 @@
+# Prometheus Adapter API 文档
+
+## 概述
+
+Prometheus Adapter 提供从 Prometheus 获取服务 QPS 和平均时延指标的 RESTful API 接口。支持按服务名称和版本进行查询。
+
+> **当前状态**：
+> - QPS 指标：已实现，使用 `system_network_qps` 指标（基于网络包统计）
+> - 时延指标：已实现，使用 OpenTelemetry 指标 `http.server.request.duration_seconds`（HTTP 请求真实时延；导出到 Prometheus 后指标名中的 `.` 会转换为 `_`，即 `http_server_request_duration_seconds`）
+
+## API
+
+### 1. 获取服务 QPS 指标
+
+**GET** `/v1/metrics/:service/qps`
+
+获取指定服务的 QPS（每秒请求数）指标数据。
+
+#### 路径参数
+- `service` (string, required): 服务名称
+
+#### 查询参数
+- `version` (string, optional): 服务版本，不指定则返回所有版本
+- `start` (string, optional): 开始时间 (RFC3339 格式，如: 2024-01-01T00:00:00Z)
+- `end` (string, optional): 结束时间 (RFC3339 格式，如: 2024-01-01T01:00:00Z)
+- `step` (string, optional): 时间步长 (如: 1m, 5m, 1h)，默认 1m
+
+#### 请求示例
+```bash
+GET /v1/metrics/metadata-service/qps?version=1.0.0&start=2024-01-01T00:00:00Z&end=2024-01-01T01:00:00Z&step=1m
+```
+
+#### 响应示例
+```json
+{
+  "service": "metadata-service",
+  "version": "1.0.0",
+  "metric_type": "qps",
+  "data": [
+    {
+      "timestamp": "2024-01-01T00:00:00Z",
+      "value": 150.5
+    },
+    {
+      "timestamp": "2024-01-01T00:01:00Z",
+      "value": 148.2
+    }
+  ],
+  "summary": {
+    "min": 120.1,
+    "max": 180.3,
+    "avg": 152.8,
+    "total_points": 60
+  }
+}
+```
+
+### 2. 获取服务平均时延指标
+
+**GET** `/v1/metrics/:service/latency`
+
+获取指定服务的平均响应时延指标数据（单位：秒）。
+
+#### 路径参数
+- `service` (string, required): 服务名称
+
+#### 查询参数
+- `version` (string, optional): 服务版本，不指定则返回所有版本
+- `start` (string, optional): 开始时间 (RFC3339 格式)
+- `end` (string, optional): 结束时间 (RFC3339 格式)
+- `step` (string, optional): 时间步长，默认 1m
+- `percentile` (string, optional): 百分位数 (p50, p95, p99)，默认 p50
+
+#### 请求示例
+```bash
+GET /v1/metrics/storage-service/latency?version=1.0.0&percentile=p95&start=2024-01-01T00:00:00Z&end=2024-01-01T01:00:00Z
+```
+
+#### 响应示例
+```json
+{
+  "service": "storage-service",
+  "version": "1.0.0",
+  "metric_type": "latency",
+  "percentile": "p95",
+  "data": [
+    {
+      "timestamp": "2024-01-01T00:00:00Z",
+      "value": 0.1258
+    },
+    {
+      "timestamp": "2024-01-01T00:01:00Z",
+      "value": 0.1321
+    }
+  ],
+  "summary": {
+    "min": 0.0985,
+    "max": 0.2012,
+    "avg": 0.1289,
+    "total_points": 60
+  }
+}
+```
+
+### 3. 获取服务综合指标
+
+**GET** `/v1/metrics/:service/overview`
+
+同时获取指定服务的 QPS 和时延指标概览。
+
+#### 路径参数
+- `service` (string, required): 服务名称
+
+#### 查询参数
+- `version` (string, optional): 服务版本
+- `start` (string, optional): 开始时间 (RFC3339 格式)
+- `end` (string, optional): 结束时间 (RFC3339 格式)
+
+#### 响应示例
+```json
+{
+  "service": "queue-service",
+  "version": "1.0.0",
+  "time_range": {
+    "start": "2024-01-01T00:00:00Z",
+    "end": "2024-01-01T01:00:00Z"
+  },
+  "metrics": {
+    "qps": {
+      "current": 152.8,
+      "avg": 148.5,
+      "max": 180.3,
+      "min": 120.1
+    },
+    "latency": {
+      "p50": 0.0852,
+      "p95": 0.1289,
+      "p99": 0.2012
+    }
+  }
+}
+```
+
+### 4. 获取可用服务列表
+
+**GET** `/v1/services`
+
+获取 Prometheus 中可监控的服务列表。
+
+#### 查询参数
+- `prefix` (string, optional): 服务名前缀过滤
+
+#### 响应示例
+```json
+{
+  "services": [
+    {
+      "name": "metadata-service",
+      "versions": ["1.0.0"],
+      "active_versions": ["1.0.0"],
+      "last_updated": "2024-01-01T01:00:00Z"
+    },
+    {
+      "name": "storage-service",
+      "versions": ["1.0.0"],
+      "active_versions": ["1.0.0"],
+      "last_updated": "2024-01-01T00:45:00Z"
+    },
+    {
+      "name": "queue-service",
+      "versions": ["1.0.0"],
+      "active_versions": ["1.0.0"],
+      "last_updated": "2024-01-01T00:30:00Z"
+    },
+    {
+      "name": "third-party-service",
+      "versions": ["1.0.0"],
+      "active_versions": ["1.0.0"],
+      "last_updated": "2024-01-01T00:20:00Z"
+    },
+    {
+      "name": "mock-error-service",
+      "versions": ["1.0.0"],
+      "active_versions": ["1.0.0"],
+      "last_updated": "2024-01-01T00:15:00Z"
+    }
+  ],
+  "total": 5
+}
+```
+
+## 错误响应
+
+所有 API 在出错时返回统一的错误格式：
+
+```json
+{
+  "error": "error_code",
+  "message": "详细错误描述",
+  "details": {
+    "field": "具体错误字段"
+  }
+}
+```
+
+### 常见错误码
+
+- `400 Bad Request`: 请求参数错误
+- `404 Not Found`: 服务或版本不存在
+- `500 Internal Server Error`: 内部服务器错误
+- `503 Service Unavailable`: Prometheus 连接失败
+
+## 实现说明
+
+### Prometheus 查询语法
+
+API 内部使用的 Prometheus 查询示例：
+
+#### QPS 查询
+```promql
+# 网络包 QPS（当前实现）
+system_network_qps{exported_job="metadata-service",service_version="1.0.0"}
+
+# 计算5分钟平均 QPS（system_network_qps 是 Gauge 类型，应使用 avg_over_time 而不是 rate）
+avg_over_time(system_network_qps{exported_job="metadata-service",service_version="1.0.0"}[5m])
+```
+
+#### 平均时延查询
+```promql
+# P95 时延（95分位数）
+histogram_quantile(0.95, rate(http_server_request_duration_seconds_bucket{exported_job="metadata-service",service_version="1.0.0"}[5m]))
+
+# P50 时延（中位数）
+histogram_quantile(0.50, rate(http_server_request_duration_seconds_bucket{exported_job="metadata-service",service_version="1.0.0"}[5m]))
+
+# P99 时延（99分位数）
+histogram_quantile(0.99, rate(http_server_request_duration_seconds_bucket{exported_job="metadata-service",service_version="1.0.0"}[5m]))
+
+# 平均时延
+rate(http_server_request_duration_seconds_sum{exported_job="metadata-service",service_version="1.0.0"}[5m])
+/
+rate(http_server_request_duration_seconds_count{exported_job="metadata-service",service_version="1.0.0"}[5m])
+```
+
+### 配置要求
+
+需要在配置文件中指定：
+- Prometheus 服务器地址：`http://10.210.10.33:9090`
+- 查询超时时间：30秒
+- 默认时间范围：最近1小时
+- 服务标签映射：
+  - 服务名：`exported_job`（在指标中作为标签）
+  - 版本号：`service_version`（在指标中作为标签）
+  - 实例标识：通过 OpenTelemetry 的 `service.instance.id` 属性设置
+
+### 支持的服务列表
+
+当前 mock/s3 环境中支持的服务：
+- `metadata-service` - 元数据管理服务（版本：1.0.0）
+- `storage-service` - 存储服务（版本：1.0.0）
+- `queue-service` - 消息队列服务（版本：1.0.0）
+- `third-party-service` - 第三方集成服务（版本：1.0.0）
+- `mock-error-service` - 错误模拟服务（版本：1.0.0）
+
+所有服务的版本信息通过 `service_version` 标签暴露。
+
+### 缓存策略
+
+- 指标数据缓存时间：30秒
+- 服务列表缓存时间：5分钟
+- 支持 ETag 缓存验证
\ No newline at end of file
diff --git a/mock/s3/services/mock-error/internal/handler/mock_error_handler.go b/mock/s3/services/mock-error/internal/handler/mock_error_handler.go
index 18be6b6..dfd1f33 100644
--- a/mock/s3/services/mock-error/internal/handler/mock_error_handler.go
+++ b/mock/s3/services/mock-error/internal/handler/mock_error_handler.go
@@ -85,13 +85,13 @@ func (h *MockErrorHandler) deleteMetricAnomaly(c *gin.Context) {
 
 // checkMetricInjection 检查是否应该注入指标异常
 func (h *MockErrorHandler) checkMetricInjection(c *gin.Context) {
-    ctx := c.Request.Context()
+	ctx := c.Request.Context()
 
-    var request struct {
-        Service    string `json:"service" binding:"required"`
-        MetricName string `json:"metric_name" binding:"required"`
-        Instance   string `json:"instance"`
-    }
+	var request struct {
+		Service    string `json:"service" binding:"required"`
+		MetricName string `json:"metric_name" binding:"required"`
+		Instance   string `json:"instance"`
+	}
 
 	if err := c.ShouldBindJSON(&request); err != nil {
 		h.logger.Error(ctx, "Failed to bind metric injection check request", observability.Error(err))
@@ -99,14 +99,14 @@ func (h *MockErrorHandler) checkMetricInjection(c *gin.Context) {
 		return
 	}
 
-    anomaly, shouldInject := h.errorService.ShouldInjectError(ctx, request.Service, request.MetricName, request.Instance)
+	anomaly, shouldInject := h.errorService.ShouldInjectError(ctx, request.Service, request.MetricName, request.Instance)
 
-    response := gin.H{
-        "should_inject": shouldInject,
-        "service":       request.Service,
-        "metric_name":   request.MetricName,
-        "instance":      request.Instance,
-    }
+	response := gin.H{
+		"should_inject": shouldInject,
+		"service":       request.Service,
+		"metric_name":   request.MetricName,
+		"instance":      request.Instance,
+	}
 
 	if shouldInject {
 		response["anomaly"] = anomaly
diff --git a/mock/s3/services/mock-error/internal/service/mock_error_service.go b/mock/s3/services/mock-error/internal/service/mock_error_service.go
index cdaf0c4..b445c82 100644
--- a/mock/s3/services/mock-error/internal/service/mock_error_service.go
+++ b/mock/s3/services/mock-error/internal/service/mock_error_service.go
@@ -118,19 +118,19 @@ func (s *MockErrorService) ShouldInjectError(ctx context.Context, service, metri
 	s.stats.TotalRequests++
 	s.stats.LastUpdated = time.Now()
 
-    for _, rule := range s.rules {
-        if !rule.Enabled {
-            continue
-        }
-
-        // 检查服务匹配
-        if rule.Service != "" && rule.Service != service {
-            continue
-        }
-        // 检查实例匹配（如果指定了实例，则必须匹配）
-        if rule.Instance != "" && rule.Instance != instance {
-            continue
-        }
+	for _, rule := range s.rules {
+		if !rule.Enabled {
+			continue
+		}
+
+		// 检查服务匹配
+		if rule.Service != "" && rule.Service != service {
+			continue
+		}
+		// 检查实例匹配（如果指定了实例，则必须匹配）
+		if rule.Instance != "" && rule.Instance != instance {
+			continue
+		}
 
 		// 检查指标名称匹配
 		if rule.MetricName != "" && rule.MetricName != metricName {
@@ -167,14 +167,14 @@ func (s *MockErrorService) ShouldInjectError(ctx context.Context, service, metri
 			"rule_id":      rule.ID,
 		}
 
-        s.logger.Info(ctx, "Metric anomaly injected",
-            observability.String("rule_id", rule.ID),
-            observability.String("service", service),
-            observability.String("instance", instance),
-            observability.String("metric_name", metricName),
-            observability.String("anomaly_type", rule.AnomalyType),
-            observability.Float64("target_value", rule.TargetValue),
-            observability.Int("triggered_count", rule.Triggered))
+		s.logger.Info(ctx, "Metric anomaly injected",
+			observability.String("rule_id", rule.ID),
+			observability.String("service", service),
+			observability.String("instance", instance),
+			observability.String("metric_name", metricName),
+			observability.String("anomaly_type", rule.AnomalyType),
+			observability.Float64("target_value", rule.TargetValue),
+			observability.Int("triggered_count", rule.Triggered))
 
 		return anomaly, true
 	}
diff --git a/mock/s3/shared/interfaces/error_injector.go b/mock/s3/shared/interfaces/error_injector.go
index 4feb187..c59894a 100644
--- a/mock/s3/shared/interfaces/error_injector.go
+++ b/mock/s3/shared/interfaces/error_injector.go
@@ -15,7 +15,7 @@ type MetricAnomalyService interface {
 	ListRules(ctx context.Context) ([]*models.MetricAnomalyRule, error)
 
 	// 指标异常注入核心功能
-    ShouldInjectError(ctx context.Context, service, metricName, instance string) (map[string]any, bool)
+	ShouldInjectError(ctx context.Context, service, metricName, instance string) (map[string]any, bool)
 }
 
 // MetricInjector HTTP指标异常注入器接口
diff --git a/mock/s3/shared/middleware/error_injection/error_injection.go b/mock/s3/shared/middleware/error_injection/error_injection.go
index 9eb98a1..dec2c6c 100644
--- a/mock/s3/shared/middleware/error_injection/error_injection.go
+++ b/mock/s3/shared/middleware/error_injection/error_injection.go
@@ -1,16 +1,16 @@
 package error_injection
 
 import (
-    "context"
-    "fmt"
-    "mocks3/shared/client"
-    "mocks3/shared/models"
-    "mocks3/shared/observability"
-    "mocks3/shared/utils"
-    "net/http"
-    "strconv"
-    "sync"
-    "time"
+	"context"
+	"fmt"
+	"mocks3/shared/client"
+	"mocks3/shared/models"
+	"mocks3/shared/observability"
+	"mocks3/shared/utils"
+	"net/http"
+	"strconv"
+	"sync"
+	"time"
 )
 
 // MetricInjectorConfig 指标异常注入器配置
@@ -125,11 +125,11 @@ func NewMetricInjectorWithDefaults(mockErrorServiceURL string, serviceName strin
 
 // InjectMetricAnomaly 检查并注入指标异常
 func (mi *MetricInjector) InjectMetricAnomaly(ctx context.Context, metricName string, originalValue float64) float64 {
-    // 计算实例标识，用于实例级注入与缓存
-    instanceID := utils.GetInstanceID(mi.serviceName)
+	// 计算实例标识，用于实例级注入与缓存
+	instanceID := utils.GetInstanceID(mi.serviceName)
 
-    // 检查缓存（加入实例维度）
-    cacheKey := mi.serviceName + ":" + instanceID + ":" + metricName
+	// 检查缓存（加入实例维度）
+	cacheKey := mi.serviceName + ":" + instanceID + ":" + metricName
 	mi.cacheMu.RLock()
 	if cached, exists := mi.cache[cacheKey]; exists && time.Now().Before(cached.ExpiresAt) {
 		mi.cacheMu.RUnlock()
@@ -141,19 +141,19 @@ func (mi *MetricInjector) InjectMetricAnomaly(ctx context.Context, metricName st
 	mi.cacheMu.RUnlock()
 
 	// 查询Mock Error Service获取异常规则
-    request := map[string]string{
-        "service":     mi.serviceName,
-        "metric_name": metricName,
-        "instance":    instanceID,
-    }
-
-    var response struct {
-        ShouldInject bool           `json:"should_inject"`
-        Service      string         `json:"service"`
-        MetricName   string         `json:"metric_name"`
-        Instance     string         `json:"instance"`
-        Anomaly      map[string]any `json:"anomaly,omitempty"`
-    }
+	request := map[string]string{
+		"service":     mi.serviceName,
+		"metric_name": metricName,
+		"instance":    instanceID,
+	}
+
+	var response struct {
+		ShouldInject bool           `json:"should_inject"`
+		Service      string         `json:"service"`
+		MetricName   string         `json:"metric_name"`
+		Instance     string         `json:"instance"`
+		Anomaly      map[string]any `json:"anomaly,omitempty"`
+	}
 
 	// 使用较短的超时时间避免影响正常指标收集
 	opts := client.RequestOptions{
diff --git a/mock/s3/shared/models/error.go b/mock/s3/shared/models/error.go
index 7ebd5b2..2643054 100644
--- a/mock/s3/shared/models/error.go
+++ b/mock/s3/shared/models/error.go
@@ -6,13 +6,13 @@ import (
 
 // MetricAnomalyRule 指标异常注入规则
 type MetricAnomalyRule struct {
-    ID          string `json:"id"`
-    Name        string `json:"name"`
-    Service     string `json:"service"`     // 目标服务
-    Instance    string `json:"instance,omitempty"` // 目标实例，可选
-    MetricName  string `json:"metric_name"` // 目标指标名称
-    AnomalyType string `json:"anomaly_type"`
-    Enabled     bool   `json:"enabled"`
+	ID          string `json:"id"`
+	Name        string `json:"name"`
+	Service     string `json:"service"`            // 目标服务
+	Instance    string `json:"instance,omitempty"` // 目标实例，可选
+	MetricName  string `json:"metric_name"`        // 目标指标名称
+	AnomalyType string `json:"anomaly_type"`
+	Enabled     bool   `json:"enabled"`
 
 	// 异常参数
 	TargetValue float64       `json:"target_value"` // 目标异常值
diff --git a/mock/s3/shared/observability/metrics.go b/mock/s3/shared/observability/metrics.go
index 3c5bdeb..755161e 100644
--- a/mock/s3/shared/observability/metrics.go
+++ b/mock/s3/shared/observability/metrics.go
@@ -12,6 +12,7 @@ import (
 	"time"
 
 	"github.com/prometheus/procfs"
+	"go.opentelemetry.io/otel/attribute"
 	"go.opentelemetry.io/otel/metric"
 )
 
@@ -47,6 +48,9 @@ type MetricCollector struct {
 	networkQPS          metric.Float64Gauge
 	machineOnlineStatus metric.Int64Gauge
 
+	// HTTP 请求指标
+	httpRequestDuration metric.Float64Histogram
+
 	// 统计状态
 	cpuStats     *CPUStats
 	networkStats *NetworkStats
@@ -55,6 +59,10 @@ type MetricCollector struct {
 
 	// 错误注入器
 	metricInjector MetricInjector
+
+	// 服务属性
+	serviceName    string
+	serviceVersion string
 }
 
 // NewMetricCollector 创建指标收集器
@@ -83,6 +91,12 @@ func NewMetricCollector(meter metric.Meter, logger *Logger) (*MetricCollector, e
 	return collector, nil
 }
 
+// SetServiceInfo 设置服务信息
+func (c *MetricCollector) SetServiceInfo(serviceName, serviceVersion string) {
+	c.serviceName = serviceName
+	c.serviceVersion = serviceVersion
+}
+
 // SetMetricInjector 设置错误注入器
 func (c *MetricCollector) SetMetricInjector(injector MetricInjector) {
 	c.metricInjector = injector
@@ -139,6 +153,18 @@ func (c *MetricCollector) initMetrics() error {
 		return err
 	}
 
+	// HTTP 请求时延 (使用 Prometheus 兼容的命名)
+	if c.httpRequestDuration, err = c.meter.Float64Histogram(
+		"http.server.request.duration_seconds",
+		metric.WithDescription("HTTP server request duration in seconds"),
+		metric.WithUnit("s"),
+		metric.WithExplicitBucketBoundaries(
+			0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5, 7.5, 10,
+		),
+	); err != nil {
+		return err
+	}
+
 	return nil
 }
 
@@ -413,7 +439,16 @@ func (c *MetricCollector) collectNetworkMetrics(ctx context.Context) {
 			finalValue = c.metricInjector.InjectMetricAnomaly(ctx, "system_network_qps", qps)
 		}
 
-		c.networkQPS.Record(ctx, finalValue)
+		// 添加服务属性作为标签
+		attrs := []attribute.KeyValue{}
+		if c.serviceName != "" {
+			attrs = append(attrs, attribute.String("exported_job", c.serviceName))
+		}
+		if c.serviceVersion != "" {
+			attrs = append(attrs, attribute.String("service_version", c.serviceVersion))
+		}
+
+		c.networkQPS.Record(ctx, finalValue, metric.WithAttributes(attrs...))
 	}
 	c.networkStats.lastUpdate = now
 }
@@ -486,3 +521,24 @@ func (c *MetricCollector) updateMachineStatus(ctx context.Context) {
 
 	c.machineOnlineStatus.Record(ctx, int64(finalValue))
 }
+
+// RecordHTTPRequestDuration 记录 HTTP 请求时延
+func (c *MetricCollector) RecordHTTPRequestDuration(ctx context.Context, duration float64, method, path string, statusCode int) {
+	// 构建属性标签
+	attrs := []attribute.KeyValue{
+		attribute.String("http.method", method),
+		attribute.String("http.route", path),
+		attribute.Int("http.status_code", statusCode),
+	}
+
+	// 添加服务属性
+	if c.serviceName != "" {
+		attrs = append(attrs, attribute.String("exported_job", c.serviceName))
+	}
+	if c.serviceVersion != "" {
+		attrs = append(attrs, attribute.String("service_version", c.serviceVersion))
+	}
+
+	// 记录时延（以秒为单位）
+	c.httpRequestDuration.Record(ctx, duration, metric.WithAttributes(attrs...))
+}
diff --git a/mock/s3/shared/observability/middleware.go b/mock/s3/shared/observability/middleware.go
index 265bc3e..a07e8b8 100644
--- a/mock/s3/shared/observability/middleware.go
+++ b/mock/s3/shared/observability/middleware.go
@@ -29,10 +29,26 @@ func (m *HTTPMiddleware) GinMetricsMiddleware() gin.HandlerFunc {
 		// 处理请求
 		c.Next()
 
-		// 计算基本信息用于日志记录
+		// 计算请求时延
 		duration := time.Since(start)
 		statusCode := c.Writer.Status()
 
+		// 记录 HTTP 请求时延指标（以秒为单位）
+		if m.collector != nil {
+			durationSeconds := duration.Seconds()
+			path := c.FullPath()
+			if path == "" {
+				path = c.Request.URL.Path // 如果没有匹配的路由，使用原始路径
+			}
+			m.collector.RecordHTTPRequestDuration(
+				c.Request.Context(),
+				durationSeconds,
+				c.Request.Method,
+				path,
+				statusCode,
+			)
+		}
+
 		// 只记录错误请求的日志
 		if statusCode >= 400 {
 			m.logger.Warn(c.Request.Context(), "HTTP request completed with error",
diff --git a/mock/s3/shared/observability/observability.go b/mock/s3/shared/observability/observability.go
index a6efafa..7d238ce 100644
--- a/mock/s3/shared/observability/observability.go
+++ b/mock/s3/shared/observability/observability.go
@@ -30,6 +30,9 @@ func Setup(serviceName string, configPath string) (*Providers, *MetricCollector,
 		return nil, nil, nil, fmt.Errorf("failed to create metric collector: %w", err)
 	}
 
+	// 设置服务信息到指标收集器
+	collector.SetServiceInfo(config.ServiceName, config.ServiceVersion)
+
 	// 创建HTTP中间件
 	httpMiddleware := NewHTTPMiddleware(collector, providers.Logger)
 
diff --git a/mock/s3/shared/observability/providers.go b/mock/s3/shared/observability/providers.go
index 6f93882..4ca00ba 100644
--- a/mock/s3/shared/observability/providers.go
+++ b/mock/s3/shared/observability/providers.go
@@ -1,12 +1,12 @@
 package observability
 
 import (
-    "context"
-    "fmt"
-    "mocks3/shared/observability/config"
-    "mocks3/shared/utils"
+	"context"
+	"fmt"
+	"mocks3/shared/observability/config"
+	"mocks3/shared/utils"
 
-    "go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel"
 	"go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp"
 	"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
 	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
@@ -183,14 +183,14 @@ func (p *Providers) Shutdown(ctx context.Context) error {
 
 // createResource 创建OTEL资源
 func createResource(config *config.ObservabilityConfig) (*resource.Resource, error) {
-    // 使用统一的实例ID生成器
-    instanceID := utils.GetInstanceID(config.ServiceName)
-    return resource.New(context.Background(),
-        resource.WithAttributes(
-            semconv.ServiceName(config.ServiceName),
-            semconv.ServiceVersion(config.ServiceVersion),
-            semconv.DeploymentEnvironment(config.Environment),
-            semconv.ServiceInstanceID(instanceID),
-        ),
-    )
+	// 使用统一的实例ID生成器
+	instanceID := utils.GetInstanceID(config.ServiceName)
+	return resource.New(context.Background(),
+		resource.WithAttributes(
+			semconv.ServiceName(config.ServiceName),
+			semconv.ServiceVersion(config.ServiceVersion),
+			semconv.DeploymentEnvironment(config.Environment),
+			semconv.ServiceInstanceID(instanceID),
+		),
+	)
 }
diff --git a/mock/s3/shared/server/service_bootstrap.go b/mock/s3/shared/server/service_bootstrap.go
index aa7d986..5e48b74 100644
--- a/mock/s3/shared/server/service_bootstrap.go
+++ b/mock/s3/shared/server/service_bootstrap.go
@@ -1,15 +1,15 @@
 package server
 
 import (
-    "context"
-    "fmt"
-    "mocks3/shared/observability"
-    "net/http"
-    "net"
-    "os"
-    "os/signal"
-    "syscall"
-    "time"
+	"context"
+	"fmt"
+	"mocks3/shared/observability"
+	"net"
+	"net/http"
+	"os"
+	"os/signal"
+	"syscall"
+	"time"
 
 	"github.com/gin-gonic/gin"
 	"mocks3/shared/middleware/consul"
@@ -174,7 +174,7 @@ func (sb *ServiceBootstrap) setupObservability() error {
 
 // setupConsulRegistration 设置Consul服务注册
 func (sb *ServiceBootstrap) setupConsulRegistration() error {
-    ctx := context.Background()
+	ctx := context.Background()
 
 	// 检查配置是否支持Consul
 	consulConfig, ok := sb.Config.(ConsulServiceConfig)
@@ -191,23 +191,23 @@ func (sb *ServiceBootstrap) setupConsulRegistration() error {
 
 	sb.ConsulClient = consulClient
 
-    // 注册服务到Consul
-    // 优先使用可达的容器/主机实例IP地址进行注册，确保多实例下目标唯一
-    var registerAddress string
-    if sb.Config.GetHost() == "0.0.0.0" {
-        // 允许通过环境变量覆盖对外公布地址
-        if envAddr := os.Getenv("ADVERTISE_ADDR"); envAddr != "" {
-            registerAddress = envAddr
-        } else {
-            ip, err := detectAdvertiseAddr()
-            if err != nil {
-                return fmt.Errorf("failed to detect advertise address: %w", err)
-            }
-            registerAddress = ip
-        }
-    } else {
-        registerAddress = sb.Config.GetHost()
-    }
+	// 注册服务到Consul
+	// 优先使用可达的容器/主机实例IP地址进行注册，确保多实例下目标唯一
+	var registerAddress string
+	if sb.Config.GetHost() == "0.0.0.0" {
+		// 允许通过环境变量覆盖对外公布地址
+		if envAddr := os.Getenv("ADVERTISE_ADDR"); envAddr != "" {
+			registerAddress = envAddr
+		} else {
+			ip, err := detectAdvertiseAddr()
+			if err != nil {
+				return fmt.Errorf("failed to detect advertise address: %w", err)
+			}
+			registerAddress = ip
+		}
+	} else {
+		registerAddress = sb.Config.GetHost()
+	}
 
 	err = consul.RegisterService(ctx, consulClient,
 		sb.Config.GetServiceName(),
@@ -217,69 +217,69 @@ func (sb *ServiceBootstrap) setupConsulRegistration() error {
 		return fmt.Errorf("failed to register service with Consul: %w", err)
 	}
 
-    sb.Logger.Info(ctx, "Service registered with Consul successfully",
-        observability.String("consul_addr", consulConfig.GetConsulAddress()),
-        observability.String("service_name", sb.Config.GetServiceName()),
-        observability.String("register_address", registerAddress))
+	sb.Logger.Info(ctx, "Service registered with Consul successfully",
+		observability.String("consul_addr", consulConfig.GetConsulAddress()),
+		observability.String("service_name", sb.Config.GetServiceName()),
+		observability.String("register_address", registerAddress))
 
-    return nil
+	return nil
 }
 
 // detectAdvertiseAddr 自动探测一个非回环的IPv4地址，优先选择常见容器网卡
 func detectAdvertiseAddr() (string, error) {
-    // 优先尝试常见的容器网卡名称
-    preferredIfaces := []string{"eth0", "ens3", "ens4", "en0"}
-    for _, name := range preferredIfaces {
-        ifi, err := net.InterfaceByName(name)
-        if err == nil && (ifi.Flags&net.FlagUp) != 0 {
-            addrs, err := ifi.Addrs()
-            if err == nil {
-                if ip := firstIPv4(addrs); ip != "" {
-                    return ip, nil
-                }
-            }
-        }
-    }
-
-    // 回退：遍历所有网卡，取第一个非回环且Up的IPv4
-    ifaces, err := net.Interfaces()
-    if err != nil {
-        return "", err
-    }
-    for _, ifi := range ifaces {
-        if (ifi.Flags&net.FlagUp) == 0 || (ifi.Flags&net.FlagLoopback) != 0 {
-            continue
-        }
-        addrs, err := ifi.Addrs()
-        if err != nil {
-            continue
-        }
-        if ip := firstIPv4(addrs); ip != "" {
-            return ip, nil
-        }
-    }
-    return "", fmt.Errorf("no non-loopback IPv4 address found")
+	// 优先尝试常见的容器网卡名称
+	preferredIfaces := []string{"eth0", "ens3", "ens4", "en0"}
+	for _, name := range preferredIfaces {
+		ifi, err := net.InterfaceByName(name)
+		if err == nil && (ifi.Flags&net.FlagUp) != 0 {
+			addrs, err := ifi.Addrs()
+			if err == nil {
+				if ip := firstIPv4(addrs); ip != "" {
+					return ip, nil
+				}
+			}
+		}
+	}
+
+	// 回退：遍历所有网卡，取第一个非回环且Up的IPv4
+	ifaces, err := net.Interfaces()
+	if err != nil {
+		return "", err
+	}
+	for _, ifi := range ifaces {
+		if (ifi.Flags&net.FlagUp) == 0 || (ifi.Flags&net.FlagLoopback) != 0 {
+			continue
+		}
+		addrs, err := ifi.Addrs()
+		if err != nil {
+			continue
+		}
+		if ip := firstIPv4(addrs); ip != "" {
+			return ip, nil
+		}
+	}
+	return "", fmt.Errorf("no non-loopback IPv4 address found")
 }
 
 func firstIPv4(addrs []net.Addr) string {
-    for _, a := range addrs {
-        var ip net.IP
-        switch v := a.(type) {
-        case *net.IPNet:
-            ip = v.IP
-        case *net.IPAddr:
-            ip = v.IP
-        }
-        if ip == nil {
-            continue
-        }
-        ip4 := ip.To4()
-        if ip4 == nil || ip4.IsLoopback() {
-            continue
-        }
-        return ip4.String()
-    }
-    return ""
+	for _, a := range addrs {
+		var ip net.IP
+		switch v := a.(type) {
+		case *net.IPNet:
+			ip = v.IP
+		case *net.IPAddr:
+			ip = v.IP
+		}
+		if ip == nil {
+			continue
+		}
+		ip4 := ip.To4()
+		if ip4 == nil || ip4.IsLoopback() {
+			continue
+		}
+		return ip4.String()
+	}
+	return ""
 }
 
 // setupErrorInjection 设置错误注入中间件
diff --git a/mock/s3/shared/utils/instance.go b/mock/s3/shared/utils/instance.go
index 8f8e519..79ad013 100644
--- a/mock/s3/shared/utils/instance.go
+++ b/mock/s3/shared/utils/instance.go
@@ -62,7 +62,7 @@ func GetInstanceID(serviceName string) string {
 func generateInstanceID(serviceName string) string {
 	// 清理服务名：移除常见后缀，转换为小写
 	cleanServiceName := cleanServiceName(serviceName)
-	
+
 	// 生成8位短UUID
 	shortUUID := generateShortUUID()
 	if shortUUID == "" {
@@ -75,7 +75,7 @@ func generateInstanceID(serviceName string) string {
 // cleanServiceName 清理服务名
 func cleanServiceName(serviceName string) string {
 	name := strings.ToLower(serviceName)
-	
+
 	// 移除常见后缀
 	suffixes := []string{"-service", "_service", "service"}
 	for _, suffix := range suffixes {
@@ -84,11 +84,11 @@ func cleanServiceName(serviceName string) string {
 			break
 		}
 	}
-	
+
 	// 替换特殊字符为连字符
 	name = strings.ReplaceAll(name, "_", "-")
 	name = strings.ReplaceAll(name, " ", "-")
-	
+
 	return name
 }
 
@@ -106,4 +106,4 @@ func ResetInstanceID() {
 	instanceIDMutex.Lock()
 	defer instanceIDMutex.Unlock()
 	cachedInstanceID = ""
-}
\ No newline at end of file
+}

From d5dc444a9ef6ffa4937b99c0061a3140d8f1b148 Mon Sep 17 00:00:00 2001
From: dnj <shdnj@qq.com>
Date: Fri, 19 Sep 2025 17:23:12 +0800
Subject: [PATCH 02/18] =?UTF-8?q?feat(observability):=20=E6=B7=BB=E5=8A=A0?=
 =?UTF-8?q?HTTP=E5=BB=B6=E8=BF=9F=E6=B3=A8=E5=85=A5=E5=8A=9F=E8=83=BD?=
 =?UTF-8?q?=E5=B9=B6=E4=BC=98=E5=8C=96=E6=8C=87=E6=A0=87=E6=94=B6=E9=9B=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

重构指标注入器以支持服务版本维度
移除冗余的exported_job标签和实例ID生成
新增HTTP延迟注入器并与中间件集成
---
 .../error_injection/error_injection.go        |  21 +-
 .../error_injection/http_latency_injector.go  | 183 ++++++++++++++++++
 mock/s3/shared/observability/metrics.go       |  12 +-
 mock/s3/shared/observability/middleware.go    |  60 ++++--
 mock/s3/shared/observability/providers.go     |   4 -
 mock/s3/shared/server/service_bootstrap.go    |  35 +++-
 6 files changed, 275 insertions(+), 40 deletions(-)
 create mode 100644 mock/s3/shared/middleware/error_injection/http_latency_injector.go

diff --git a/mock/s3/shared/middleware/error_injection/error_injection.go b/mock/s3/shared/middleware/error_injection/error_injection.go
index dec2c6c..42dd606 100644
--- a/mock/s3/shared/middleware/error_injection/error_injection.go
+++ b/mock/s3/shared/middleware/error_injection/error_injection.go
@@ -34,6 +34,7 @@ type CacheConfig struct {
 type MetricInjector struct {
 	mockErrorClient *client.BaseHTTPClient
 	serviceName     string
+	serviceVersion  string // 添加服务版本字段
 	logger          *observability.Logger
 
 	// 缓存
@@ -56,7 +57,7 @@ type CachedAnomaly struct {
 }
 
 // NewMetricInjector 从YAML配置创建指标异常注入器
-func NewMetricInjector(configPath string, serviceName string, logger *observability.Logger) (*MetricInjector, error) {
+func NewMetricInjector(configPath string, serviceName string, serviceVersion string, logger *observability.Logger) (*MetricInjector, error) {
 	// 加载配置文件
 	var config MetricInjectorConfig
 	if err := utils.LoadConfig(configPath, &config); err != nil {
@@ -84,6 +85,7 @@ func NewMetricInjector(configPath string, serviceName string, logger *observabil
 	injector := &MetricInjector{
 		mockErrorClient: client,
 		serviceName:     serviceName,
+		serviceVersion:  serviceVersion,
 		logger:          logger,
 		cache:           make(map[string]*CachedAnomaly),
 		cacheTTL:        config.Cache.TTL,
@@ -102,12 +104,13 @@ func NewMetricInjector(configPath string, serviceName string, logger *observabil
 }
 
 // NewMetricInjectorWithDefaults 使用默认配置创建指标异常注入器
-func NewMetricInjectorWithDefaults(mockErrorServiceURL string, serviceName string, logger *observability.Logger) *MetricInjector {
+func NewMetricInjectorWithDefaults(mockErrorServiceURL string, serviceName string, serviceVersion string, logger *observability.Logger) *MetricInjector {
 	client := client.NewBaseHTTPClient(mockErrorServiceURL, 5*time.Second, "metric-injector", logger)
 
 	injector := &MetricInjector{
 		mockErrorClient: client,
 		serviceName:     serviceName,
+		serviceVersion:  serviceVersion,
 		logger:          logger,
 		cache:           make(map[string]*CachedAnomaly),
 		cacheTTL:        30 * time.Second,
@@ -125,11 +128,9 @@ func NewMetricInjectorWithDefaults(mockErrorServiceURL string, serviceName strin
 
 // InjectMetricAnomaly 检查并注入指标异常
 func (mi *MetricInjector) InjectMetricAnomaly(ctx context.Context, metricName string, originalValue float64) float64 {
-	// 计算实例标识，用于实例级注入与缓存
-	instanceID := utils.GetInstanceID(mi.serviceName)
-
-	// 检查缓存（加入实例维度）
-	cacheKey := mi.serviceName + ":" + instanceID + ":" + metricName
+	// 使用服务版本作为注入维度，同一版本的所有实例共享相同的异常注入
+	// 检查缓存（基于服务版本）
+	cacheKey := mi.serviceName + ":" + mi.serviceVersion + ":" + metricName
 	mi.cacheMu.RLock()
 	if cached, exists := mi.cache[cacheKey]; exists && time.Now().Before(cached.ExpiresAt) {
 		mi.cacheMu.RUnlock()
@@ -140,18 +141,18 @@ func (mi *MetricInjector) InjectMetricAnomaly(ctx context.Context, metricName st
 	}
 	mi.cacheMu.RUnlock()
 
-	// 查询Mock Error Service获取异常规则
+	// 查询Mock Error Service获取异常规则（基于版本）
 	request := map[string]string{
 		"service":     mi.serviceName,
+		"version":     mi.serviceVersion,
 		"metric_name": metricName,
-		"instance":    instanceID,
 	}
 
 	var response struct {
 		ShouldInject bool           `json:"should_inject"`
 		Service      string         `json:"service"`
+		Version      string         `json:"version"`
 		MetricName   string         `json:"metric_name"`
-		Instance     string         `json:"instance"`
 		Anomaly      map[string]any `json:"anomaly,omitempty"`
 	}
 
diff --git a/mock/s3/shared/middleware/error_injection/http_latency_injector.go b/mock/s3/shared/middleware/error_injection/http_latency_injector.go
new file mode 100644
index 0000000..f26de3c
--- /dev/null
+++ b/mock/s3/shared/middleware/error_injection/http_latency_injector.go
@@ -0,0 +1,183 @@
+package error_injection
+
+import (
+	"context"
+	"mocks3/shared/client"
+	"mocks3/shared/observability"
+	"sync"
+	"time"
+)
+
+// HTTPLatencyInjector injects artificial delay into HTTP requests based on latency rules fetched through mockErrorClient.
+type HTTPLatencyInjector struct {
+	mockErrorClient *client.BaseHTTPClient
+	serviceName     string
+	serviceVersion  string
+	logger          *observability.Logger
+
+	// Lookup cache keyed by "service:version:path"; cacheMu guards cache and cacheTTL is the lifetime of each entry.
+	cache    map[string]*CachedLatencyConfig
+	cacheMu  sync.RWMutex
+	cacheTTL time.Duration
+}
+
+// CachedLatencyConfig is one cache entry: a latency config (nil when no rule applies or the lookup failed) and its expiry time.
+type CachedLatencyConfig struct {
+	Config    *LatencyConfig
+	ExpiresAt time.Time
+}
+
+// LatencyConfig describes a latency rule to apply to a request path.
+type LatencyConfig struct {
+	ShouldInject bool          `json:"should_inject"`
+	Latency      time.Duration `json:"latency"`     // delay to inject
+	Probability  float64       `json:"probability"` // injection probability (0-1)
+	Pattern      string        `json:"pattern"`     // path match pattern (optional)
+}
+
+// NewHTTPLatencyInjector builds an injector with a client for mockErrorServiceURL, an empty cache with a 30s TTL, and a running cleanup goroutine.
+func NewHTTPLatencyInjector(mockErrorServiceURL string, serviceName, serviceVersion string, logger *observability.Logger) *HTTPLatencyInjector {
+	client := client.NewBaseHTTPClient(mockErrorServiceURL, 5*time.Second, "latency-injector", logger)
+
+	injector := &HTTPLatencyInjector{
+		mockErrorClient: client,
+		serviceName:     serviceName,
+		serviceVersion:  serviceVersion,
+		logger:          logger,
+		cache:           make(map[string]*CachedLatencyConfig),
+		cacheTTL:        30 * time.Second,
+	}
+
+	// Start the background cache sweeper. NOTE(review): nothing in this file ever stops this goroutine.
+	go injector.cleanupCache()
+
+	return injector
+}
+
+// GetLatencyConfig returns the latency rule for path from the cache or the mock error service; it returns (nil, nil) when no rule applies or the lookup fails.
+func (h *HTTPLatencyInjector) GetLatencyConfig(ctx context.Context, path string) (*LatencyConfig, error) {
+	// The cache key is scoped by service name, service version and path.
+	cacheKey := h.serviceName + ":" + h.serviceVersion + ":" + path
+
+	// Serve from the cache while the entry is unexpired (a cached nil config means "no rule").
+	h.cacheMu.RLock()
+	if cached, exists := h.cache[cacheKey]; exists && time.Now().Before(cached.ExpiresAt) {
+		h.cacheMu.RUnlock()
+		return cached.Config, nil
+	}
+	h.cacheMu.RUnlock()
+
+	// Cache miss: ask the mock error service for the latency rule of this path.
+	request := map[string]string{
+		"service": h.serviceName,
+		"version": h.serviceVersion,
+		"path":    path,
+		"type":    "http_latency",
+	}
+
+	var response struct {
+		ShouldInject bool    `json:"should_inject"`
+		Latency      int64   `json:"latency_ms"` // milliseconds
+		Probability  float64 `json:"probability"`
+		Pattern      string  `json:"pattern"`
+	}
+
+	opts := client.RequestOptions{
+		Method: "POST",
+		Path:   "/api/v1/latency-inject/check",
+		Body:   request,
+	}
+
+	err := h.mockErrorClient.DoRequestWithJSON(ctx, opts, &response)
+	if err != nil {
+		h.logger.Debug(ctx, "Failed to check latency injection",
+			observability.Error(err),
+			observability.String("path", path))
+		// Cache a nil result on failure so the service is not re-queried until the TTL lapses; the error is not propagated.
+		h.updateCache(cacheKey, nil)
+		return nil, nil
+	}
+
+	// Build a config only when injection is requested; otherwise config stays nil.
+	var config *LatencyConfig
+	if response.ShouldInject {
+		config = &LatencyConfig{
+			ShouldInject: true,
+			Latency:      time.Duration(response.Latency) * time.Millisecond,
+			Probability:  response.Probability,
+			Pattern:      response.Pattern,
+		}
+	}
+
+	// Cache the outcome, including a nil config.
+	h.updateCache(cacheKey, config)
+
+	return config, nil
+}
+
+// InjectLatency delays the caller by the latency configured for path, if any, and
+// returns the time actually waited (0 when nothing was injected).
+func (h *HTTPLatencyInjector) InjectLatency(ctx context.Context, path string) time.Duration {
+	config, err := h.GetLatencyConfig(ctx, path)
+	if err != nil || config == nil || !config.ShouldInject || config.Latency <= 0 {
+		return 0
+	}
+
+	// Probability gate: simple clock-based sampling (production should use a better random source).
+	if config.Probability < 1.0 && time.Now().UnixNano()%100 >= int64(config.Probability*100) {
+		return 0
+	}
+
+	h.logger.Info(ctx, "Injecting HTTP latency",
+		observability.String("service", h.serviceName),
+		observability.String("version", h.serviceVersion),
+		observability.String("path", path),
+		observability.Duration("latency", config.Latency))
+
+	// Wait on a timer instead of time.Sleep so a cancelled request context ends the delay early.
+	begin := time.Now()
+	timer := time.NewTimer(config.Latency)
+	defer timer.Stop()
+	select {
+	case <-timer.C:
+		return config.Latency
+	case <-ctx.Done():
+		// The request was cancelled; report only the delay that really elapsed.
+		return time.Since(begin)
+	}
+}
+
+// updateCache stores config (which may be nil) under key with an expiry of now + cacheTTL, holding the write lock.
+func (h *HTTPLatencyInjector) updateCache(key string, config *LatencyConfig) {
+	h.cacheMu.Lock()
+	defer h.cacheMu.Unlock()
+
+	h.cache[key] = &CachedLatencyConfig{
+		Config:    config,
+		ExpiresAt: time.Now().Add(h.cacheTTL),
+	}
+}
+
+// cleanupCache deletes expired cache entries once a minute; the loop has no exit condition, so it runs until the process ends.
+func (h *HTTPLatencyInjector) cleanupCache() {
+	ticker := time.NewTicker(1 * time.Minute)
+	defer ticker.Stop()
+
+	for range ticker.C {
+		h.cacheMu.Lock()
+		now := time.Now()
+		for key, cached := range h.cache {
+			if now.After(cached.ExpiresAt) {
+				delete(h.cache, key)
+			}
+		}
+		h.cacheMu.Unlock()
+	}
+}
+
+// Cleanup discards all cached entries by replacing the cache map; it does not stop the cleanupCache goroutine.
+func (h *HTTPLatencyInjector) Cleanup() {
+	h.cacheMu.Lock()
+	defer h.cacheMu.Unlock()
+	h.cache = make(map[string]*CachedLatencyConfig)
+}
diff --git a/mock/s3/shared/observability/metrics.go b/mock/s3/shared/observability/metrics.go
index 755161e..4f00649 100644
--- a/mock/s3/shared/observability/metrics.go
+++ b/mock/s3/shared/observability/metrics.go
@@ -439,11 +439,8 @@ func (c *MetricCollector) collectNetworkMetrics(ctx context.Context) {
 			finalValue = c.metricInjector.InjectMetricAnomaly(ctx, "system_network_qps", qps)
 		}
 
-		// 添加服务属性作为标签
+		// 添加服务版本标签（exported_job 冗余，已通过 service_name 资源属性暴露）
 		attrs := []attribute.KeyValue{}
-		if c.serviceName != "" {
-			attrs = append(attrs, attribute.String("exported_job", c.serviceName))
-		}
 		if c.serviceVersion != "" {
 			attrs = append(attrs, attribute.String("service_version", c.serviceVersion))
 		}
@@ -524,17 +521,14 @@ func (c *MetricCollector) updateMachineStatus(ctx context.Context) {
 
 // RecordHTTPRequestDuration 记录 HTTP 请求时延
 func (c *MetricCollector) RecordHTTPRequestDuration(ctx context.Context, duration float64, method, path string, statusCode int) {
-	// 构建属性标签
+	// 构建属性标签（移除 exported_job，保留 service_version）
 	attrs := []attribute.KeyValue{
 		attribute.String("http.method", method),
 		attribute.String("http.route", path),
 		attribute.Int("http.status_code", statusCode),
 	}
 
-	// 添加服务属性
-	if c.serviceName != "" {
-		attrs = append(attrs, attribute.String("exported_job", c.serviceName))
-	}
+	// 添加服务版本（必要标签，用于版本区分）
 	if c.serviceVersion != "" {
 		attrs = append(attrs, attribute.String("service_version", c.serviceVersion))
 	}
diff --git a/mock/s3/shared/observability/middleware.go b/mock/s3/shared/observability/middleware.go
index a07e8b8..5b22f85 100644
--- a/mock/s3/shared/observability/middleware.go
+++ b/mock/s3/shared/observability/middleware.go
@@ -1,16 +1,23 @@
 package observability
 
 import (
+	"context"
 	"time"
 
 	"github.com/gin-gonic/gin"
 	"go.opentelemetry.io/contrib/instrumentation/github.com/gin-gonic/gin/otelgin"
 )
 
+// LatencyInjector 延迟注入器接口
+type LatencyInjector interface {
+	InjectLatency(ctx context.Context, path string) time.Duration
+}
+
 // HTTPMiddleware HTTP监控中间件
 type HTTPMiddleware struct {
-	collector *MetricCollector
-	logger    *Logger
+	collector       *MetricCollector
+	logger          *Logger
+	latencyInjector LatencyInjector
 }
 
 // NewHTTPMiddleware 创建HTTP中间件
@@ -21,25 +28,38 @@ func NewHTTPMiddleware(collector *MetricCollector, logger *Logger) *HTTPMiddlewa
 	}
 }
 
+// SetLatencyInjector 设置延迟注入器
+func (m *HTTPMiddleware) SetLatencyInjector(injector LatencyInjector) {
+	m.latencyInjector = injector
+}
+
 // GinMetricsMiddleware Gin指标中间件
 func (m *HTTPMiddleware) GinMetricsMiddleware() gin.HandlerFunc {
 	return func(c *gin.Context) {
 		start := time.Now()
+
+		// Resolve the route template; fall back to the raw URL path when no route matched.
+		path := c.FullPath()
+		if path == "" {
+			path = c.Request.URL.Path
+		}
+
+		// Inject latency only after start is captured, so the recorded duration really includes the injected delay.
+		var injectedLatency time.Duration
+		if m.latencyInjector != nil {
+			injectedLatency = m.latencyInjector.InjectLatency(c.Request.Context(), path)
+		}
 
 		// 处理请求
 		c.Next()
 
-		// 计算请求时延
+		// 计算请求时延（包含注入的延迟）
 		duration := time.Since(start)
 		statusCode := c.Writer.Status()
 
 		// 记录 HTTP 请求时延指标（以秒为单位）
 		if m.collector != nil {
 			durationSeconds := duration.Seconds()
-			path := c.FullPath()
-			if path == "" {
-				path = c.Request.URL.Path // 如果没有匹配的路由，使用原始路径
-			}
 			m.collector.RecordHTTPRequestDuration(
 				c.Request.Context(),
 				durationSeconds,
@@ -53,18 +73,30 @@ func (m *HTTPMiddleware) GinMetricsMiddleware() gin.HandlerFunc {
 		if statusCode >= 400 {
 			m.logger.Warn(c.Request.Context(), "HTTP request completed with error",
 				String("method", c.Request.Method),
-				String("path", c.FullPath()),
+				String("path", path),
 				Int("status", statusCode),
 				Duration("duration", duration),
+				Duration("injected_latency", injectedLatency),
 			)
 		}
 
-		m.logger.Info(c.Request.Context(), "HTTP request completed",
-			String("method", c.Request.Method),
-			String("path", c.FullPath()),
-			Int("status", statusCode),
-			Duration("duration", duration),
-		)
+		// 记录请求信息（如果有注入延迟，记录在日志中）
+		if injectedLatency > 0 {
+			m.logger.Info(c.Request.Context(), "HTTP request completed with injected latency",
+				String("method", c.Request.Method),
+				String("path", path),
+				Int("status", statusCode),
+				Duration("duration", duration),
+				Duration("injected_latency", injectedLatency),
+			)
+		} else {
+			m.logger.Info(c.Request.Context(), "HTTP request completed",
+				String("method", c.Request.Method),
+				String("path", path),
+				Int("status", statusCode),
+				Duration("duration", duration),
+			)
+		}
 	}
 }
 
diff --git a/mock/s3/shared/observability/providers.go b/mock/s3/shared/observability/providers.go
index 4ca00ba..0f28ba5 100644
--- a/mock/s3/shared/observability/providers.go
+++ b/mock/s3/shared/observability/providers.go
@@ -4,7 +4,6 @@ import (
 	"context"
 	"fmt"
 	"mocks3/shared/observability/config"
-	"mocks3/shared/utils"
 
 	"go.opentelemetry.io/otel"
 	"go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp"
@@ -183,14 +182,11 @@ func (p *Providers) Shutdown(ctx context.Context) error {
 
 // createResource 创建OTEL资源
 func createResource(config *config.ObservabilityConfig) (*resource.Resource, error) {
-	// 使用统一的实例ID生成器
-	instanceID := utils.GetInstanceID(config.ServiceName)
 	return resource.New(context.Background(),
 		resource.WithAttributes(
 			semconv.ServiceName(config.ServiceName),
 			semconv.ServiceVersion(config.ServiceVersion),
 			semconv.DeploymentEnvironment(config.Environment),
-			semconv.ServiceInstanceID(instanceID),
 		),
 	)
 }
diff --git a/mock/s3/shared/server/service_bootstrap.go b/mock/s3/shared/server/service_bootstrap.go
index 5e48b74..6be5825 100644
--- a/mock/s3/shared/server/service_bootstrap.go
+++ b/mock/s3/shared/server/service_bootstrap.go
@@ -47,7 +47,8 @@ type ServiceBootstrap struct {
 	HTTPMiddleware *observability.HTTPMiddleware
 
 	// 错误注入
-	MetricInjector *error_injection.MetricInjector
+	MetricInjector  *error_injection.MetricInjector
+	LatencyInjector *error_injection.HTTPLatencyInjector
 
 	// Consul客户端
 	ConsulClient consul.ConsulClient
@@ -286,10 +287,14 @@ func firstIPv4(addrs []net.Addr) string {
 func (sb *ServiceBootstrap) setupErrorInjection() error {
 	ctx := context.Background()
 
-	// 尝试从配置文件加载
+	// 获取服务版本（默认为 1.0.0）
+	serviceVersion := "1.0.0"
+
+	// 尝试从配置文件加载指标注入器
 	metricInjector, err := error_injection.NewMetricInjector(
 		sb.MetricInjectorConfigPath,
 		sb.Config.GetServiceName(),
+		serviceVersion,
 		sb.Logger,
 	)
 
@@ -300,6 +305,7 @@ func (sb *ServiceBootstrap) setupErrorInjection() error {
 		sb.MetricInjector = error_injection.NewMetricInjectorWithDefaults(
 			"http://mock-error-service:8085",
 			sb.Config.GetServiceName(),
+			serviceVersion,
 			sb.Logger,
 		)
 	} else {
@@ -307,7 +313,24 @@ func (sb *ServiceBootstrap) setupErrorInjection() error {
 	}
 
 	if sb.MetricInjector != nil {
-		sb.Logger.Info(ctx, "Metric injector initialized successfully")
+		sb.Logger.Info(ctx, "Metric injector initialized successfully",
+			observability.String("service_version", serviceVersion))
+	}
+
+	// 创建HTTP延迟注入器
+	sb.LatencyInjector = error_injection.NewHTTPLatencyInjector(
+		"http://mock-error-service:8085",
+		sb.Config.GetServiceName(),
+		serviceVersion,
+		sb.Logger,
+	)
+
+	// 将延迟注入器连接到HTTP中间件
+	if sb.HTTPMiddleware != nil && sb.LatencyInjector != nil {
+		sb.HTTPMiddleware.SetLatencyInjector(sb.LatencyInjector)
+		sb.Logger.Info(ctx, "HTTP latency injector connected to middleware",
+			observability.String("service", sb.Config.GetServiceName()),
+			observability.String("version", serviceVersion))
 	}
 
 	return nil
@@ -407,6 +430,12 @@ func (sb *ServiceBootstrap) waitForShutdown(server *http.Server) {
 		sb.Logger.Info(ctx, "Metric injector cleaned up")
 	}
 
+	// 清理延迟注入器资源
+	if sb.LatencyInjector != nil {
+		sb.LatencyInjector.Cleanup()
+		sb.Logger.Info(ctx, "Latency injector cleaned up")
+	}
+
 	// 关闭HTTP服务器
 	shutdownCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
 	defer cancel()

From c8ba0993bde138c8bc7c451c2fef96f67a80b67c Mon Sep 17 00:00:00 2001
From: dnj <shdnj@qq.com>
Date: Fri, 19 Sep 2025 17:35:17 +0800
Subject: [PATCH 03/18] =?UTF-8?q?refactor(metrics):=20=E7=AE=80=E5=8C=96HT?=
 =?UTF-8?q?TP=E6=97=B6=E5=BB=B6=E6=8C=87=E6=A0=87=E5=90=8D=E7=A7=B0?=
 =?UTF-8?q?=E5=B9=B6=E6=9B=B4=E6=96=B0=E5=81=A5=E5=BA=B7=E6=A3=80=E6=9F=A5?=
 =?UTF-8?q?=E8=84=9A=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

更新HTTP时延指标名称使其更简洁，同时改进健康检查脚本以支持批量检查多个端口
---
 mock/s3/DEPLOYMENT.md                   | 11 ++++++-----
 mock/s3/shared/observability/metrics.go |  4 ++--
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/mock/s3/DEPLOYMENT.md b/mock/s3/DEPLOYMENT.md
index 6e69125..8a93c65 100644
--- a/mock/s3/DEPLOYMENT.md
+++ b/mock/s3/DEPLOYMENT.md
@@ -55,11 +55,12 @@ sudo supervisorctl start zeroops_*
 ps aux | grep zeroops_
 
 # 健康检查
-curl http://localhost:8181/health  # metadata-service
-curl http://localhost:8191/health  # storage-service
-curl http://localhost:8201/health  # queue-service
-curl http://localhost:8211/health  # third-party-service
-curl http://localhost:8221/health  # mock-error-service
+for port in 8182 8183 8191 8192 8201 8202 8211 8221; do
+    echo "Checking port $port:"
+    curl -s -o /dev/null -w "HTTP Status: %{http_code}\n" \
+        "http://localhost:${port}/metrics" \
+        || echo "Failed"
+done
 
 # 查看日志
 tail -f /home/qboxserver/zeroops_metadata_1/logs/service.log
diff --git a/mock/s3/shared/observability/metrics.go b/mock/s3/shared/observability/metrics.go
index 4f00649..d6a8343 100644
--- a/mock/s3/shared/observability/metrics.go
+++ b/mock/s3/shared/observability/metrics.go
@@ -153,9 +153,9 @@ func (c *MetricCollector) initMetrics() error {
 		return err
 	}
 
-	// HTTP 请求时延 (使用 Prometheus 兼容的命名)
+	// HTTP 请求时延
 	if c.httpRequestDuration, err = c.meter.Float64Histogram(
-		"http.server.request.duration_seconds",
+		"http_latency",
 		metric.WithDescription("HTTP server request duration in seconds"),
 		metric.WithUnit("s"),
 		metric.WithExplicitBucketBoundaries(

From 334a81d70c9d9a70c9147386a9bb41f0368649ff Mon Sep 17 00:00:00 2001
From: Ding <shdnj@qq.com>
Date: Sat, 20 Sep 2025 20:09:14 +0800
Subject: [PATCH 04/18] =?UTF-8?q?feat(prometheus=5Fadapter):=20=E6=96=B0?=
 =?UTF-8?q?=E5=A2=9Eprometheus=5Fadapter=E5=AE=9E=E7=8E=B0=E6=8C=87?=
 =?UTF-8?q?=E6=A0=87=E6=9F=A5=E8=AF=A2=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

实现Prometheus适配器模块，包括以下主要功能：
- 添加Prometheus客户端封装，支持指标查询和范围查询
- 实现指标服务层，提供指标列表获取和指标数据查询
- 添加API路由和控制器，提供RESTful接口
- 定义模型结构体和错误处理机制
- 更新依赖添加Prometheus客户端库
- 编写详细API文档说明接口使用方式
---
 docs/prometheus_adapter/API.md                | 271 ------------------
 docs/prometheus_adapter/README.md             | 140 +++++++++
 go.mod                                        |  18 +-
 go.sum                                        |  48 +++-
 internal/prometheus_adapter/api/api.go        |  29 ++
 internal/prometheus_adapter/api/metric_api.go | 175 +++++++++++
 .../client/prometheus_client.go               | 144 ++++++++++
 internal/prometheus_adapter/model/api.go      |  24 ++
 .../prometheus_adapter/model/constants.go     |  10 +
 internal/prometheus_adapter/model/error.go    |  49 ++++
 internal/prometheus_adapter/server.go         |  65 +++++
 .../service/metric_service.go                 |  80 ++++++
 12 files changed, 761 insertions(+), 292 deletions(-)
 delete mode 100644 docs/prometheus_adapter/API.md
 create mode 100644 docs/prometheus_adapter/README.md
 create mode 100644 internal/prometheus_adapter/api/api.go
 create mode 100644 internal/prometheus_adapter/api/metric_api.go
 create mode 100644 internal/prometheus_adapter/client/prometheus_client.go
 create mode 100644 internal/prometheus_adapter/model/api.go
 create mode 100644 internal/prometheus_adapter/model/constants.go
 create mode 100644 internal/prometheus_adapter/model/error.go
 create mode 100644 internal/prometheus_adapter/server.go
 create mode 100644 internal/prometheus_adapter/service/metric_service.go

diff --git a/docs/prometheus_adapter/API.md b/docs/prometheus_adapter/API.md
deleted file mode 100644
index 4cfe228..0000000
--- a/docs/prometheus_adapter/API.md
+++ /dev/null
@@ -1,271 +0,0 @@
-# Prometheus Adapter API 文档
-
-## 概述
-
-Prometheus Adapter 提供从 Prometheus 获取服务 QPS 和平均时延指标的 RESTful API 接口。支持按服务名称和版本进行查询。
-
-> **当前状态**：
-> - QPS 指标：已实现，使用 `system_network_qps` 指标（基于网络包统计）
-> - 时延指标：已实现，使用 `http.server.request.duration_seconds` 指标（HTTP 请求真实时延）
-
-## API
-
-### 1. 获取服务 QPS 指标
-
-**GET** `/v1/metrics/:service/qps`
-
-获取指定服务的 QPS（每秒请求数）指标数据。
-
-#### 路径参数
-- `service` (string, required): 服务名称
-
-#### 查询参数
-- `version` (string, optional): 服务版本，不指定则返回所有版本
-- `start` (string, optional): 开始时间 (RFC3339 格式，如: 2024-01-01T00:00:00Z)
-- `end` (string, optional): 结束时间 (RFC3339 格式，如: 2024-01-01T01:00:00Z)
-- `step` (string, optional): 时间步长 (如: 1m, 5m, 1h)，默认 1m
-
-#### 请求示例
-```bash
-GET /v1/metrics/metadata-service/qps?version=1.0.0&start=2024-01-01T00:00:00Z&end=2024-01-01T01:00:00Z&step=1m
-```
-
-#### 响应示例
-```json
-{
-  "service": "metadata-service",
-  "version": "1.0.0",
-  "metric_type": "qps",
-  "data": [
-    {
-      "timestamp": "2024-01-01T00:00:00Z",
-      "value": 150.5
-    },
-    {
-      "timestamp": "2024-01-01T00:01:00Z",
-      "value": 148.2
-    }
-  ],
-  "summary": {
-    "min": 120.1,
-    "max": 180.3,
-    "avg": 152.8,
-    "total_points": 60
-  }
-}
-```
-
-### 2. 获取服务平均时延指标
-
-**GET** `/v1/metrics/:service/latency`
-
-获取指定服务的平均响应时延指标数据（单位：秒）。
-
-#### 路径参数
-- `service` (string, required): 服务名称
-
-#### 查询参数
-- `version` (string, optional): 服务版本，不指定则返回所有版本
-- `start` (string, optional): 开始时间 (RFC3339 格式)
-- `end` (string, optional): 结束时间 (RFC3339 格式)
-- `step` (string, optional): 时间步长，默认 1m
-- `percentile` (string, optional): 百分位数 (p50, p95, p99)，默认 p50
-
-#### 请求示例
-```bash
-GET /v1/metrics/storage-service/latency?version=1.0.0&percentile=p95&start=2024-01-01T00:00:00Z&end=2024-01-01T01:00:00Z
-```
-
-#### 响应示例
-```json
-{
-  "service": "storage-service",
-  "version": "1.0.0",
-  "metric_type": "latency",
-  "percentile": "p95",
-  "data": [
-    {
-      "timestamp": "2024-01-01T00:00:00Z",
-      "value": 125.8
-    },
-    {
-      "timestamp": "2024-01-01T00:01:00Z",
-      "value": 132.1
-    }
-  ],
-  "summary": {
-    "min": 98.5,
-    "max": 201.2,
-    "avg": 128.9,
-    "total_points": 60
-  }
-}
-```
-
-### 3. 获取服务综合指标
-
-**GET** `/v1/metrics/:service/overview`
-
-同时获取指定服务的 QPS 和时延指标概览。
-
-#### 路径参数
-- `service` (string, required): 服务名称
-
-#### 查询参数
-- `version` (string, optional): 服务版本
-- `start` (string, optional): 开始时间 (RFC3339 格式)
-- `end` (string, optional): 结束时间 (RFC3339 格式)
-
-#### 响应示例
-```json
-{
-  "service": "queue-service",
-  "version": "1.0.0",
-  "time_range": {
-    "start": "2024-01-01T00:00:00Z",
-    "end": "2024-01-01T01:00:00Z"
-  },
-  "metrics": {
-    "qps": {
-      "current": 152.8,
-      "avg": 148.5,
-      "max": 180.3,
-      "min": 120.1
-    },
-    "latency": {
-      "p50": 85.2,
-      "p95": 128.9,
-      "p99": 201.2
-    }
-  }
-}
-```
-
-### 4. 获取可用服务列表
-
-**GET** `/v1/services`
-
-获取 Prometheus 中可监控的服务列表。
-
-#### 查询参数
-- `prefix` (string, optional): 服务名前缀过滤
-
-#### 响应示例
-```json
-{
-  "services": [
-    {
-      "name": "metadata-service",
-      "versions": ["1.0.0"],
-      "active_versions": ["1.0.0"],
-      "last_updated": "2024-01-01T01:00:00Z"
-    },
-    {
-      "name": "storage-service",
-      "versions": ["1.0.0"],
-      "active_versions": ["1.0.0"],
-      "last_updated": "2024-01-01T00:45:00Z"
-    },
-    {
-      "name": "queue-service",
-      "versions": ["1.0.0"],
-      "active_versions": ["1.0.0"],
-      "last_updated": "2024-01-01T00:30:00Z"
-    },
-    {
-      "name": "third-party-service",
-      "versions": ["1.0.0"],
-      "active_versions": ["1.0.0"],
-      "last_updated": "2024-01-01T00:20:00Z"
-    },
-    {
-      "name": "mock-error-service",
-      "versions": ["1.0.0"],
-      "active_versions": ["1.0.0"],
-      "last_updated": "2024-01-01T00:15:00Z"
-    }
-  ],
-  "total": 5
-}
-```
-
-## 错误响应
-
-所有 API 在出错时返回统一的错误格式：
-
-```json
-{
-  "error": "error_code",
-  "message": "详细错误描述",
-  "details": {
-    "field": "具体错误字段"
-  }
-}
-```
-
-### 常见错误码
-
-- `400 Bad Request`: 请求参数错误
-- `404 Not Found`: 服务或版本不存在
-- `500 Internal Server Error`: 内部服务器错误
-- `503 Service Unavailable`: Prometheus 连接失败
-
-## 实现说明
-
-### Prometheus 查询语法
-
-API 内部使用的 Prometheus 查询示例：
-
-#### QPS 查询
-```promql
-# 网络包 QPS（当前实现）
-system_network_qps{exported_job="metadata-service",service_version="1.0.0"}
-
-# 计算5分钟平均 QPS
-rate(system_network_qps{exported_job="metadata-service",service_version="1.0.0"}[5m])
-```
-
-#### 平均时延查询
-```promql
-# P95 时延（95分位数）
-histogram_quantile(0.95, rate(http.server.request.duration_seconds_bucket{exported_job="metadata-service",service_version="1.0.0"}[5m]))
-
-# P50 时延（中位数）
-histogram_quantile(0.50, rate(http.server.request.duration_seconds_bucket{exported_job="metadata-service",service_version="1.0.0"}[5m]))
-
-# P99 时延（99分位数）
-histogram_quantile(0.99, rate(http.server.request.duration_seconds_bucket{exported_job="metadata-service",service_version="1.0.0"}[5m]))
-
-# 平均时延
-rate(http.server.request.duration_seconds_sum{exported_job="metadata-service",service_version="1.0.0"}[5m])
-/
-rate(http.server.request.duration_seconds_count{exported_job="metadata-service",service_version="1.0.0"}[5m])
-```
-
-### 配置要求
-
-需要在配置文件中指定：
-- Prometheus 服务器地址：`http://10.210.10.33:9090`
-- 查询超时时间：30秒
-- 默认时间范围：最近1小时
-- 服务标签映射：
-  - 服务名：`exported_job`（在指标中作为标签）
-  - 版本号：`service_version`（在指标中作为标签）
-  - 实例标识：通过 OpenTelemetry 的 `service.instance.id` 属性设置
-
-### 支持的服务列表
-
-当前 mock/s3 环境中支持的服务：
-- `metadata-service` - 元数据管理服务（版本：1.0.0）
-- `storage-service` - 存储服务（版本：1.0.0）
-- `queue-service` - 消息队列服务（版本：1.0.0）
-- `third-party-service` - 第三方集成服务（版本：1.0.0）
-- `mock-error-service` - 错误模拟服务（版本：1.0.0）
-
-所有服务的版本信息通过 `service_version` 标签暴露。
-
-### 缓存策略
-
-- 指标数据缓存时间：30秒
-- 服务列表缓存时间：5分钟
-- 支持 ETag 缓存验证
\ No newline at end of file
diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md
new file mode 100644
index 0000000..30ca4be
--- /dev/null
+++ b/docs/prometheus_adapter/README.md
@@ -0,0 +1,140 @@
+# Prometheus Adapter API 文档
+
+## 概述
+
+Prometheus Adapter 提供从 Prometheus 获取服务指标的 RESTful API 接口。支持按服务名称和版本进行查询。
+
+## API
+
+### 1. 获取可用指标列表
+
+**GET** `/v1/metrics`
+
+获取所有可用的指标列表。
+
+#### 请求示例
+```bash
+GET /v1/metrics
+```
+
+#### 响应示例
+```json
+{
+  "metrics": [
+    "system_cpu_usage_percent",
+    "system_memory_usage_percent",
+    "system_disk_usage_percent",
+    "system_network_qps",
+    "system_machine_online_status",
+    "http_latency"
+  ]
+}
+```
+
+### 2. 通用指标查询接口
+
+**GET** `/v1/metrics/:service/:metric`
+
+获取指定服务的任意指标时间序列数据。指标不存在则返回错误。
+
+#### 路径参数
+- `service` (string, required): 服务名称
+- `metric` (string, required): 指标名称（必须是 Prometheus 中实际存在的指标）
+
+#### 查询参数
+- `version` (string, optional): 服务版本，不指定则返回所有版本
+- `start` (string, optional): 开始时间 (RFC3339 格式，如: 2024-01-01T00:00:00Z)
+- `end` (string, optional): 结束时间 (RFC3339 格式，如: 2024-01-01T01:00:00Z)
+- `step` (string, optional): 时间步长 (如: 1m, 5m, 1h)，默认 1m
+
+#### 请求示例
+
+1. **查询 CPU 使用率：**
+```bash
+GET /v1/metrics/metadata-service/system_cpu_usage_percent?version=1.0.0
+```
+
+2. **查询内存使用率：**
+```bash
+GET /v1/metrics/storage-service/system_memory_usage_percent?version=1.0.0
+```
+
+3. **查询 HTTP 请求延迟：**
+```bash
+GET /v1/metrics/storage-service/http_latency?version=1.0.0
+```
+
+4. **查询网络 QPS：**
+```bash
+GET /v1/metrics/storage-service/system_network_qps?version=1.0.0
+```
+
+#### 成功响应示例
+
+**HTTP 200 OK**
+```json
+{
+  "service": "metadata-service",
+  "version": "1.0.0",
+  "metric": "system_cpu_usage_percent",
+  "data": [
+    {
+      "timestamp": "2024-01-01T00:00:00Z",
+      "value": 45.2
+    },
+    {
+      "timestamp": "2024-01-01T00:01:00Z",
+      "value": 48.5
+    }
+  ]
+}
+```
+
+#### 错误响应示例
+
+**指标不存在时 - HTTP 404 Not Found**
+```json
+{
+  "error": {
+    "code": "METRIC_NOT_FOUND",
+    "message": "指标 'invalid_metric' 不存在",
+    "metric": "invalid_metric"
+  }
+}
+```
+
+**服务不存在时 - HTTP 404 Not Found**
+```json
+{
+  "error": {
+    "code": "SERVICE_NOT_FOUND",
+    "message": "服务 'invalid-service' 不存在",
+    "service": "invalid-service"
+  }
+}
+```
+
+**参数错误时 - HTTP 400 Bad Request**
+```json
+{
+  "error": {
+    "code": "INVALID_PARAMETER",
+    "message": "参数 'start' 格式错误: invalid-time",
+    "parameter": "start",
+    "value": "invalid-time"
+  }
+}
+```
+
+## 实现说明
+
+### 支持的服务列表
+
+当前 mock/s3 环境中支持的服务：
+- `metadata-service` - 元数据管理服务
+- `storage-service` - 存储服务
+- `queue-service` - 消息队列服务
+- `third-party-service` - 第三方集成服务
+- `mock-error-service` - 错误模拟服务
+
+所有服务的版本信息通过 `service_version` 标签暴露。
\ No newline at end of file
diff --git a/go.mod b/go.mod
index 6094f9c..8015824 100644
--- a/go.mod
+++ b/go.mod
@@ -7,6 +7,8 @@ require (
 	github.com/google/uuid v1.6.0
 	github.com/jackc/pgx/v5 v5.5.5
 	github.com/lib/pq v1.10.9
+	github.com/prometheus/client_golang v1.23.2
+	github.com/prometheus/common v0.66.1
 	github.com/redis/go-redis/v9 v9.5.1
 	github.com/rs/zerolog v1.34.0
 )
@@ -14,7 +16,7 @@ require (
 require (
 	github.com/bytedance/sonic v1.13.3 // indirect
 	github.com/bytedance/sonic/loader v0.2.4 // indirect
-	github.com/cespare/xxhash/v2 v2.2.0 // indirect
+	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/cloudwego/base64x v0.1.5 // indirect
 	github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
 	github.com/gabriel-vasile/mimetype v1.4.9 // indirect
@@ -39,14 +41,16 @@ require (
 	github.com/modern-go/reflect2 v1.0.2 // indirect
 	github.com/natefinch/lumberjack v2.0.0+incompatible // indirect
 	github.com/pelletier/go-toml/v2 v2.2.4 // indirect
+	github.com/prometheus/client_model v0.6.2 // indirect
 	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
 	github.com/ugorji/go/codec v1.3.0 // indirect
+	go.yaml.in/yaml/v2 v2.4.2 // indirect
 	golang.org/x/arch v0.18.0 // indirect
-	golang.org/x/crypto v0.39.0 // indirect
-	golang.org/x/net v0.41.0 // indirect
-	golang.org/x/sync v0.15.0 // indirect
-	golang.org/x/sys v0.33.0 // indirect
-	golang.org/x/text v0.26.0 // indirect
-	google.golang.org/protobuf v1.36.6 // indirect
+	golang.org/x/crypto v0.41.0 // indirect
+	golang.org/x/net v0.43.0 // indirect
+	golang.org/x/sync v0.16.0 // indirect
+	golang.org/x/sys v0.35.0 // indirect
+	golang.org/x/text v0.28.0 // indirect
+	google.golang.org/protobuf v1.36.8 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
diff --git a/go.sum b/go.sum
index 04e9b56..2c92f4d 100644
--- a/go.sum
+++ b/go.sum
@@ -1,5 +1,7 @@
 github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak=
 github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
+github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
+github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
 github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
 github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
@@ -9,8 +11,8 @@ github.com/bytedance/sonic v1.13.3/go.mod h1:o68xyaF9u2gvVBuGHPlUVCy+ZfmNNO5ETf1
 github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
 github.com/bytedance/sonic/loader v0.2.4 h1:ZWCw4stuXUsn1/+zQDqeE7JKP+QO47tz7QCNan80NzY=
 github.com/bytedance/sonic/loader v0.2.4/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI=
-github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
-github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
+github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
+github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
 github.com/cloudwego/base64x v0.1.5 h1:XPciSp1xaq2VCSt6lF0phncD4koWyULpl5bUxbfCyP4=
 github.com/cloudwego/base64x v0.1.5/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w=
 github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY=
@@ -56,6 +58,8 @@ github.com/jackc/pgx/v5 v5.5.5 h1:amBjrZVmksIdNjxGW/IiIMzxMKZFelXbUoPNb+8sjQw=
 github.com/jackc/pgx/v5 v5.5.5/go.mod h1:ez9gk+OAat140fv9ErkZDYFWmXLfV+++K0uAOiwgm1A=
 github.com/jackc/puddle/v2 v2.2.1 h1:RhxXJtFG022u4ibrCSMSiu5aOq1i77R3OHKNJj77OAk=
 github.com/jackc/puddle/v2 v2.2.1/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
+github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
+github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
 github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
 github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
@@ -84,6 +88,10 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
 github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
+github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
+github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
 github.com/natefinch/lumberjack v2.0.0+incompatible h1:4QJd3OLAMgj7ph+yZTuX13Ld4UpgHp07nNdFX7mqFfM=
 github.com/natefinch/lumberjack v2.0.0+incompatible/go.mod h1:Wi9p2TTF5DG5oU+6YfsmYQpsTIOm0B1VNzQg9Mw6nPk=
 github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
@@ -92,6 +100,14 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
+github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
+github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
+github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
+github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
+github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
+github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
+github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
 github.com/redis/go-redis/v9 v9.5.1 h1:H1X4D3yHPaYrkL5X06Wh6xNVM/pX0Ft4RV0vMGvLBh8=
 github.com/redis/go-redis/v9 v9.5.1/go.mod h1:hdY0cQFCN4fnSYT6TkisLufl/4W5UIXyv0b/CLO2V2M=
 github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
@@ -113,23 +129,27 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS
 github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
 github.com/ugorji/go/codec v1.3.0 h1:Qd2W2sQawAfG8XSvzwhBeoGq71zXOC/Q1E9y/wUcsUA=
 github.com/ugorji/go/codec v1.3.0/go.mod h1:pRBVtBSKl77K30Bv8R2P+cLSGaTtex6fsA2Wjqmfxj4=
+go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
+go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
 golang.org/x/arch v0.18.0 h1:WN9poc33zL4AzGxqf8VtpKUnGvMi8O9lhNyBMF/85qc=
 golang.org/x/arch v0.18.0/go.mod h1:bdwinDaKcfZUGpH09BB7ZmOfhalA8lQdzl62l8gGWsk=
-golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM=
-golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U=
-golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw=
-golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA=
-golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8=
-golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
+golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4=
+golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc=
+golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE=
+golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg=
+golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI=
+golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU=
+golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw=
+golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
 golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
-golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
-golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M=
-golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA=
-google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY=
-google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
+golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI=
+golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
+golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng=
+golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU=
+google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
+google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
diff --git a/internal/prometheus_adapter/api/api.go b/internal/prometheus_adapter/api/api.go
new file mode 100644
index 0000000..4de0c96
--- /dev/null
+++ b/internal/prometheus_adapter/api/api.go
@@ -0,0 +1,29 @@
+package api
+
+import (
+	"github.com/fox-gonic/fox"
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/service"
+)
+
+// Api Prometheus Adapter API
+type Api struct {
+	metricService *service.MetricService
+	router        *fox.Engine
+}
+
+// NewApi 创建新的 API
+func NewApi(metricService *service.MetricService, router *fox.Engine) (*Api, error) {
+	api := &Api{
+		metricService: metricService,
+		router:        router,
+	}
+
+	api.setupRouters(router)
+	return api, nil
+}
+
+// setupRouters 设置路由
+func (api *Api) setupRouters(router *fox.Engine) {
+	// 指标相关路由
+	api.setupMetricRouters(router)
+}
diff --git a/internal/prometheus_adapter/api/metric_api.go b/internal/prometheus_adapter/api/metric_api.go
new file mode 100644
index 0000000..431bfdf
--- /dev/null
+++ b/internal/prometheus_adapter/api/metric_api.go
@@ -0,0 +1,175 @@
+package api
+
+import (
+	"errors"
+	"fmt"
+	"net/http"
+	"time"
+
+	"github.com/fox-gonic/fox"
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/model"
+	"github.com/rs/zerolog/log"
+)
+
+// setupMetricRouters 设置指标相关路由
+func (api *Api) setupMetricRouters(router *fox.Engine) {
+	router.GET("/v1/metrics", api.GetMetrics)
+	router.GET("/v1/metrics/:service/:metric", api.QueryMetric)
+}
+
+// GetMetrics 获取可用指标列表（GET /v1/metrics）
+func (api *Api) GetMetrics(c *fox.Context) {
+	ctx := c.Request.Context()
+
+	response, err := api.metricService.GetAvailableMetrics(ctx)
+	if err != nil {
+		log.Error().Err(err).Msg("failed to get available metrics")
+		api.sendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, "获取指标列表失败", nil)
+		return
+	}
+
+	c.JSON(http.StatusOK, response)
+}
+
+// QueryMetric 查询指标数据（GET /v1/metrics/:service/:metric）
+func (api *Api) QueryMetric(c *fox.Context) {
+	ctx := c.Request.Context()
+
+	// 获取路径参数
+	serviceName := c.Param("service")
+	metricName := c.Param("metric")
+
+	// 获取查询参数
+	version := c.Query("version")
+	startStr := c.Query("start")
+	endStr := c.Query("end")
+	stepStr := c.Query("step")
+
+	// 解析时间参数
+	start, end, err := api.parseTimeRange(startStr, endStr)
+	if err != nil {
+		log.Error().Err(err).Msg("invalid time parameters")
+		api.sendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter,
+			fmt.Sprintf("参数 'start/end' 格式错误: %s", err.Error()), nil)
+		return
+	}
+
+	// 解析步长参数
+	step := api.parseStep(stepStr)
+
+	// 查询指标
+	response, err := api.metricService.QueryMetric(ctx, serviceName, metricName, version, start, end, step)
+	if err != nil {
+		api.handleQueryError(c, err, serviceName, metricName)
+		return
+	}
+
+	c.JSON(http.StatusOK, response)
+}
+
+// parseTimeRange 解析时间范围参数
+func (api *Api) parseTimeRange(startStr, endStr string) (time.Time, time.Time, error) {
+	var start, end time.Time
+	var err error
+
+	// 如果没有指定开始时间，默认为1小时前
+	if startStr == "" {
+		start = time.Now().Add(-1 * time.Hour)
+	} else {
+		start, err = time.Parse(time.RFC3339, startStr)
+		if err != nil {
+			return time.Time{}, time.Time{}, fmt.Errorf("invalid start time format: %w", err)
+		}
+	}
+
+	// 如果没有指定结束时间，默认为当前时间
+	if endStr == "" {
+		end = time.Now()
+	} else {
+		end, err = time.Parse(time.RFC3339, endStr)
+		if err != nil {
+			return time.Time{}, time.Time{}, fmt.Errorf("invalid end time format: %w", err)
+		}
+	}
+
+	// 验证时间范围的合理性
+	if end.Before(start) {
+		return time.Time{}, time.Time{}, fmt.Errorf("end time must be after start time")
+	}
+
+	return start, end, nil
+}
+
+// parseStep 解析步长参数
+func (api *Api) parseStep(stepStr string) time.Duration {
+	if stepStr == "" {
+		return time.Minute // 默认1分钟
+	}
+
+	duration, err := time.ParseDuration(stepStr)
+	if err != nil {
+		log.Warn().Str("step", stepStr).Msg("invalid step format, using default")
+		return time.Minute
+	}
+
+	return duration
+}
+
+// handleQueryError 处理查询错误
+func (api *Api) handleQueryError(c *fox.Context, err error, service, metric string) {
+	var serviceNotFound *model.ServiceNotFoundError
+	var metricNotFound *model.MetricNotFoundError
+	var prometheusError *model.PrometheusError
+
+	switch {
+	case errors.As(err, &serviceNotFound):
+		log.Error().Err(err).Str("service", service).Msg("service not found")
+		api.sendErrorResponse(c, http.StatusNotFound, model.ErrorCodeServiceNotFound,
+			err.Error(), map[string]string{"service": service})
+
+	case errors.As(err, &metricNotFound):
+		log.Error().Err(err).Str("metric", metric).Msg("metric not found")
+		api.sendErrorResponse(c, http.StatusNotFound, model.ErrorCodeMetricNotFound,
+			err.Error(), map[string]string{"metric": metric})
+
+	case errors.As(err, &prometheusError):
+		log.Error().Err(err).Msg("prometheus query error")
+		api.sendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodePrometheusError,
+			"Prometheus 查询失败", nil)
+
+	default:
+		log.Error().Err(err).Msg("unexpected error during metric query")
+		api.sendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError,
+			"内部服务器错误", nil)
+	}
+}
+
+// sendErrorResponse 发送错误响应
+func (api *Api) sendErrorResponse(c *fox.Context, statusCode int, errorCode, message string, extras map[string]string) {
+	errorDetail := model.ErrorDetail{
+		Code:    errorCode,
+		Message: message,
+	}
+
+	// 添加额外的字段
+	if extras != nil {
+		if service, ok := extras["service"]; ok {
+			errorDetail.Service = service
+		}
+		if metric, ok := extras["metric"]; ok {
+			errorDetail.Metric = metric
+		}
+		if parameter, ok := extras["parameter"]; ok {
+			errorDetail.Parameter = parameter
+		}
+		if value, ok := extras["value"]; ok {
+			errorDetail.Value = value
+		}
+	}
+
+	response := model.ErrorResponse{
+		Error: errorDetail,
+	}
+
+	c.JSON(statusCode, response)
+}
diff --git a/internal/prometheus_adapter/client/prometheus_client.go b/internal/prometheus_adapter/client/prometheus_client.go
new file mode 100644
index 0000000..7bf0a3a
--- /dev/null
+++ b/internal/prometheus_adapter/client/prometheus_client.go
@@ -0,0 +1,144 @@
+package client
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/prometheus/client_golang/api"
+	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
+	promModel "github.com/prometheus/common/model"
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/model"
+)
+
+// PrometheusClient Prometheus 客户端
+type PrometheusClient struct {
+	api v1.API
+}
+
+// NewPrometheusClient 创建新的 Prometheus 客户端
+func NewPrometheusClient(address string) (*PrometheusClient, error) {
+	client, err := api.NewClient(api.Config{
+		Address: address,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to create prometheus client: %w", err)
+	}
+
+	return &PrometheusClient{
+		api: v1.NewAPI(client),
+	}, nil
+}
+
+// QueryRange 执行范围查询
+func (c *PrometheusClient) QueryRange(ctx context.Context, query string, start, end time.Time, step time.Duration) ([]model.MetricDataPoint, error) {
+	r := v1.Range{
+		Start: start,
+		End:   end,
+		Step:  step,
+	}
+
+	result, warnings, err := c.api.QueryRange(ctx, query, r)
+	if err != nil {
+		return nil, fmt.Errorf("failed to query prometheus: %w", err)
+	}
+
+	if len(warnings) > 0 {
+		// 记录警告但不返回错误
+		fmt.Printf("Prometheus warnings: %v\n", warnings)
+	}
+
+	// 转换结果为我们的数据格式
+	matrix, ok := result.(promModel.Matrix)
+	if !ok {
+		return nil, fmt.Errorf("unexpected result type: %T", result)
+	}
+
+	var dataPoints []model.MetricDataPoint
+	for _, sample := range matrix {
+		for _, pair := range sample.Values {
+			dataPoints = append(dataPoints, model.MetricDataPoint{
+				Timestamp: pair.Timestamp.Time(),
+				Value:     float64(pair.Value),
+			})
+		}
+	}
+
+	return dataPoints, nil
+}
+
+// GetAvailableMetrics returns every metric name Prometheus reports for the last hour.
+func (c *PrometheusClient) GetAvailableMetrics(ctx context.Context) ([]string, error) {
+	// List the values of the __name__ label (i.e. the metric names) seen between now-1h and now.
+	result, warnings, err := c.api.LabelValues(ctx, "__name__", nil, time.Now().Add(-time.Hour), time.Now())
+	if err != nil {
+		return nil, fmt.Errorf("failed to get metrics: %w", err)
+	}
+
+	if len(warnings) > 0 {
+		fmt.Printf("Prometheus warnings: %v\n", warnings)
+	}
+
+	// Convert to a plain string slice; no filtering is applied, every name is returned as-is.
+	metrics := make([]string, 0)
+	for _, m := range result {
+		metricName := string(m)
+		metrics = append(metrics, metricName)
+	}
+
+	return metrics, nil
+}
+
+// CheckMetricExists reports whether an instant query for the given metric name returns any series.
+func (c *PrometheusClient) CheckMetricExists(ctx context.Context, metric string) (bool, error) {
+	// %q emits an escaped, double-quoted literal, so quotes/backslashes in metric cannot break out of the matcher.
+	query := fmt.Sprintf(`{__name__=%q}`, metric)
+	result, _, err := c.api.Query(ctx, query, time.Now())
+	if err != nil {
+		return false, fmt.Errorf("failed to check metric existence: %w", err)
+	}
+
+	// A non-empty vector or matrix means the metric exists; any other result type counts as absent.
+	switch v := result.(type) {
+	case promModel.Vector:
+		return len(v) > 0, nil
+	case promModel.Matrix:
+		return len(v) > 0, nil
+	default:
+		return false, nil
+	}
+}
+
+// CheckServiceExists reports whether an instant query for series labelled service_name=<service> returns anything.
+func (c *PrometheusClient) CheckServiceExists(ctx context.Context, service string) (bool, error) {
+	// %q emits an escaped, double-quoted literal, so quotes/backslashes in service cannot break out of the matcher.
+	query := fmt.Sprintf(`{service_name=%q}`, service)
+	result, _, err := c.api.Query(ctx, query, time.Now())
+	if err != nil {
+		return false, fmt.Errorf("failed to check service existence: %w", err)
+	}
+
+	// A non-empty vector or matrix means the service exists; any other result type counts as absent.
+	switch v := result.(type) {
+	case promModel.Vector:
+		return len(v) > 0, nil
+	case promModel.Matrix:
+		return len(v) > 0, nil
+	default:
+		return false, nil
+	}
+}
+
+// BuildQuery builds the selector metric{service_name="…"[,service_version="…"]}; metric is inserted verbatim.
+func BuildQuery(service, metric, version string) string {
+	// %q escapes quotes and backslashes so a label value cannot terminate its matcher and inject PromQL.
+	query := fmt.Sprintf(`%s{service_name=%q`, metric, service)
+
+	// Only add the version matcher when a version was requested; otherwise all versions match.
+	if version != "" {
+		query += fmt.Sprintf(`,service_version=%q`, version)
+	}
+
+	query += "}"
+	return query
+}
diff --git a/internal/prometheus_adapter/model/api.go b/internal/prometheus_adapter/model/api.go
new file mode 100644
index 0000000..efef1d2
--- /dev/null
+++ b/internal/prometheus_adapter/model/api.go
@@ -0,0 +1,24 @@
+package model
+
+import "time"
+
+// ===== API 响应结构体 =====
+
+// MetricListResponse 指标列表响应（对应 GET /v1/metrics）
+type MetricListResponse struct {
+	Metrics []string `json:"metrics"`
+}
+
+// MetricQueryResponse 指标查询响应（对应 GET /v1/metrics/:service/:metric）
+type MetricQueryResponse struct {
+	Service string            `json:"service"`
+	Version string            `json:"version,omitempty"`
+	Metric  string            `json:"metric"`
+	Data    []MetricDataPoint `json:"data"`
+}
+
+// MetricDataPoint 指标数据点
+type MetricDataPoint struct {
+	Timestamp time.Time `json:"timestamp"`
+	Value     float64   `json:"value"`
+}
diff --git a/internal/prometheus_adapter/model/constants.go b/internal/prometheus_adapter/model/constants.go
new file mode 100644
index 0000000..3e8727b
--- /dev/null
+++ b/internal/prometheus_adapter/model/constants.go
@@ -0,0 +1,10 @@
+package model
+
+// 错误码常量
+const (
+	ErrorCodeMetricNotFound   = "METRIC_NOT_FOUND"
+	ErrorCodeServiceNotFound  = "SERVICE_NOT_FOUND"
+	ErrorCodeInvalidParameter = "INVALID_PARAMETER"
+	ErrorCodePrometheusError  = "PROMETHEUS_ERROR"
+	ErrorCodeInternalError    = "INTERNAL_ERROR"
+)
diff --git a/internal/prometheus_adapter/model/error.go b/internal/prometheus_adapter/model/error.go
new file mode 100644
index 0000000..aacbb63
--- /dev/null
+++ b/internal/prometheus_adapter/model/error.go
@@ -0,0 +1,49 @@
+package model
+
+import "fmt"
+
+// ===== 错误响应结构体 =====
+
+// ErrorResponse 错误响应
+type ErrorResponse struct {
+	Error ErrorDetail `json:"error"`
+}
+
+// ErrorDetail 错误详情
+type ErrorDetail struct {
+	Code      string `json:"code"`
+	Message   string `json:"message"`
+	Service   string `json:"service,omitempty"`
+	Metric    string `json:"metric,omitempty"`
+	Parameter string `json:"parameter,omitempty"`
+	Value     string `json:"value,omitempty"`
+}
+
+// ===== 自定义错误类型 =====
+
+// ServiceNotFoundError 服务不存在错误
+type ServiceNotFoundError struct {
+	Service string
+}
+
+func (e *ServiceNotFoundError) Error() string {
+	return fmt.Sprintf("服务 '%s' 不存在", e.Service)
+}
+
+// MetricNotFoundError 指标不存在错误
+type MetricNotFoundError struct {
+	Metric string
+}
+
+func (e *MetricNotFoundError) Error() string {
+	return fmt.Sprintf("指标 '%s' 不存在", e.Metric)
+}
+
+// PrometheusError Prometheus 查询错误
+type PrometheusError struct {
+	Message string
+}
+
+func (e *PrometheusError) Error() string {
+	return fmt.Sprintf("Prometheus 查询错误: %s", e.Message)
+}
diff --git a/internal/prometheus_adapter/server.go b/internal/prometheus_adapter/server.go
new file mode 100644
index 0000000..1940839
--- /dev/null
+++ b/internal/prometheus_adapter/server.go
@@ -0,0 +1,65 @@
+package prometheusadapter
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/fox-gonic/fox"
+	"github.com/qiniu/zeroops/internal/config"
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/api"
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/client"
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/service"
+	"github.com/rs/zerolog/log"
+)
+
+// PrometheusAdapterServer Prometheus Adapter 服务器
+type PrometheusAdapterServer struct {
+	config        *config.Config
+	promClient    *client.PrometheusClient
+	metricService *service.MetricService
+	api           *api.Api
+}
+
+// NewPrometheusAdapterServer 创建新的 Prometheus Adapter 服务器
+func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, error) {
+	// 使用环境变量或默认值获取 Prometheus 地址
+	prometheusAddr := os.Getenv("PROMETHEUS_ADDRESS")
+	if prometheusAddr == "" {
+		prometheusAddr = "http://localhost:9090"
+	}
+
+	// 创建 Prometheus 客户端
+	promClient, err := client.NewPrometheusClient(prometheusAddr)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create prometheus client: %w", err)
+	}
+
+	// 创建指标服务
+	metricService := service.NewMetricService(promClient)
+
+	server := &PrometheusAdapterServer{
+		config:        cfg,
+		promClient:    promClient,
+		metricService: metricService,
+	}
+
+	log.Info().Str("prometheus_address", prometheusAddr).Msg("Prometheus Adapter initialized successfully")
+	return server, nil
+}
+
+// UseApi 设置 API 路由
+func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error {
+	var err error
+	s.api, err = api.NewApi(s.metricService, router)
+	if err != nil {
+		return fmt.Errorf("failed to initialize API: %w", err)
+	}
+	return nil
+}
+
+// Close 关闭服务器
+func (s *PrometheusAdapterServer) Close() error {
+	// 当前没有需要关闭的资源
+	log.Info().Msg("Prometheus Adapter server closed")
+	return nil
+}
diff --git a/internal/prometheus_adapter/service/metric_service.go b/internal/prometheus_adapter/service/metric_service.go
new file mode 100644
index 0000000..fff69c2
--- /dev/null
+++ b/internal/prometheus_adapter/service/metric_service.go
@@ -0,0 +1,80 @@
+package service
+
+import (
+	"context"
+	"time"
+
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/client"
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/model"
+	"github.com/rs/zerolog/log"
+)
+
+// MetricService 指标服务
+type MetricService struct {
+	promClient *client.PrometheusClient
+}
+
+// NewMetricService 创建指标服务
+func NewMetricService(promClient *client.PrometheusClient) *MetricService {
+	return &MetricService{
+		promClient: promClient,
+	}
+}
+
+// GetAvailableMetrics 获取可用的指标列表
+func (s *MetricService) GetAvailableMetrics(ctx context.Context) (*model.MetricListResponse, error) {
+	// 从 Prometheus 动态获取指标列表
+	metrics, err := s.promClient.GetAvailableMetrics(ctx)
+	if err != nil {
+		log.Error().Err(err).Msg("failed to get available metrics from prometheus")
+		return nil, &model.PrometheusError{Message: err.Error()}
+	}
+
+	return &model.MetricListResponse{
+		Metrics: metrics,
+	}, nil
+}
+
+// QueryMetric 查询指标数据
+func (s *MetricService) QueryMetric(ctx context.Context, service, metric, version string, start, end time.Time, step time.Duration) (*model.MetricQueryResponse, error) {
+	// 动态验证服务是否存在
+	serviceExists, err := s.promClient.CheckServiceExists(ctx, service)
+	if err != nil {
+		log.Error().Err(err).Str("service", service).Msg("failed to check service existence")
+		return nil, &model.PrometheusError{Message: err.Error()}
+	}
+	if !serviceExists {
+		return nil, &model.ServiceNotFoundError{Service: service}
+	}
+
+	// 动态验证指标是否存在
+	metricExists, err := s.promClient.CheckMetricExists(ctx, metric)
+	if err != nil {
+		log.Error().Err(err).Str("metric", metric).Msg("failed to check metric existence")
+		return nil, &model.PrometheusError{Message: err.Error()}
+	}
+	if !metricExists {
+		return nil, &model.MetricNotFoundError{Metric: metric}
+	}
+
+	// 构建 PromQL 查询
+	query := client.BuildQuery(service, metric, version)
+	log.Debug().Str("query", query).Msg("executing prometheus query")
+
+	// 执行查询
+	dataPoints, err := s.promClient.QueryRange(ctx, query, start, end, step)
+	if err != nil {
+		log.Error().Err(err).Str("query", query).Msg("failed to query prometheus")
+		return nil, &model.PrometheusError{Message: err.Error()}
+	}
+
+	// 构建响应
+	response := &model.MetricQueryResponse{
+		Service: service,
+		Version: version,
+		Metric:  metric,
+		Data:    dataPoints,
+	}
+
+	return response, nil
+}

From 48b34b0cb7dbc5519e8a139165b8289a965512ac Mon Sep 17 00:00:00 2001
From: dnj <shdnj@qq.com>
Date: Tue, 23 Sep 2025 11:52:29 +0800
Subject: [PATCH 05/18] =?UTF-8?q?fix(prometheus=5Fadapter):=20=E6=9B=B4?=
 =?UTF-8?q?=E6=96=B0=E9=BB=98=E8=AE=A4Prometheus=E5=9C=B0=E5=9D=80?=
 =?UTF-8?q?=E5=B9=B6=E5=AE=8C=E5=96=84=E6=96=87=E6=A1=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/prometheus_adapter/README.md     | 75 ++++++++++++++++++++++++++-
 internal/prometheus_adapter/server.go |  2 +-
 2 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md
index 30ca4be..7c1d400 100644
--- a/docs/prometheus_adapter/README.md
+++ b/docs/prometheus_adapter/README.md
@@ -1,9 +1,82 @@
-# Prometheus Adapter API 文档
+# Prometheus Adapter 模块文档
 
 ## 概述
 
 Prometheus Adapter 提供从 Prometheus 获取服务指标的 RESTful API 接口。支持按服务名称和版本进行查询。
 
+## 架构设计
+
+### 模块结构
+
+```
+internal/prometheus_adapter/
+├── server.go           # 服务器主入口，负责初始化和生命周期管理
+├── api/                # API 层，处理 HTTP 请求
+│   ├── api.go         # API 基础结构和初始化
+│   └── metric_api.go  # 指标相关的 API 处理器
+├── service/            # 业务逻辑层
+│   └── metric_service.go  # 指标查询服务实现
+├── client/             # Prometheus 客户端
+│   └── prometheus_client.go  # 封装 Prometheus API 调用
+└── model/              # 数据模型
+    ├── api.go         # API 请求响应模型
+    ├── constants.go   # 常量定义（错误码等）
+    └── error.go       # 错误类型定义
+```
+
+### 层次设计
+
+1. **API 层** (`api/`)
+   - 处理 HTTP 请求和响应
+   - 参数验证和解析
+   - 错误响应格式化
+
+2. **Service 层** (`service/`)
+   - 业务逻辑处理
+   - 指标和服务存在性验证
+   - 数据转换和组装
+
+3. **Client 层** (`client/`)
+   - 与 Prometheus API 交互
+   - PromQL 查询构建
+   - 结果数据转换
+
+4. **Model 层** (`model/`)
+   - 统一的数据模型定义
+   - 错误类型和错误码
+   - 请求响应结构体
+
+### 核心组件
+
+#### PrometheusAdapterServer
+主服务器组件，负责：
+- 初始化 Prometheus 客户端
+- 创建服务实例
+- 设置 API 路由
+- 管理生命周期
+
+#### PrometheusClient
+Prometheus 客户端封装，提供：
+- `QueryRange`: 执行时间范围查询
+- `GetAvailableMetrics`: 获取所有可用指标
+- `CheckMetricExists`: 检查指标是否存在
+- `CheckServiceExists`: 检查服务是否存在
+- `BuildQuery`: 构建 PromQL 查询语句
+
+#### MetricService
+业务逻辑服务，实现：
+- 动态指标发现
+- 查询参数验证
+- 错误处理和转换
+
+## 配置说明
+
+### 环境变量
+
+| 变量名 | 说明 | 默认值 |
+|--------|------|--------|
+| PROMETHEUS_ADDRESS | Prometheus 服务器地址 | http://10.210.10.33:9090 |
+
 ## API
 
 ### 1. 获取可用指标列表
diff --git a/internal/prometheus_adapter/server.go b/internal/prometheus_adapter/server.go
index 1940839..e921668 100644
--- a/internal/prometheus_adapter/server.go
+++ b/internal/prometheus_adapter/server.go
@@ -25,7 +25,7 @@ func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, e
 	// 使用环境变量或默认值获取 Prometheus 地址
 	prometheusAddr := os.Getenv("PROMETHEUS_ADDRESS")
 	if prometheusAddr == "" {
-		prometheusAddr = "http://localhost:9090"
+		prometheusAddr = "http://10.210.10.33:9090/"
 	}
 
 	// 创建 Prometheus 客户端

From 855c51451c47cc7bd5f1b34853411f030ea9948e Mon Sep 17 00:00:00 2001
From: dnj <shdnj@qq.com>
Date: Tue, 23 Sep 2025 17:24:57 +0800
Subject: [PATCH 06/18] =?UTF-8?q?feat(=E5=91=8A=E8=AD=A6):=20=E5=AE=9E?=
 =?UTF-8?q?=E7=8E=B0=E5=91=8A=E8=AD=A6=E8=A7=84=E5=88=99=E5=90=8C=E6=AD=A5?=
 =?UTF-8?q?=E5=8A=9F=E8=83=BD=E5=B9=B6=E9=87=8D=E6=9E=84API=E5=B1=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

新增告警规则同步API及服务实现，支持将规则同步到Prometheus并触发重载
重构API层提取公共错误处理和工具方法到通用模块
添加相关模型定义和文档更新
---
 docs/prometheus_adapter/README.md             | 298 +++++++++---------
 internal/prometheus_adapter/api/alert_api.go  |  36 +++
 internal/prometheus_adapter/api/api.go        |  91 +++++-
 internal/prometheus_adapter/api/metric_api.go |  95 +-----
 internal/prometheus_adapter/model/alert.go    |  20 ++
 internal/prometheus_adapter/model/api.go      |  57 +++-
 .../prometheus_adapter/model/constants.go     |   1 +
 .../model/prometheus_rule.go                  |  21 ++
 internal/prometheus_adapter/server.go         |   8 +-
 .../service/alert_service.go                  | 282 +++++++++++++++++
 10 files changed, 672 insertions(+), 237 deletions(-)
 create mode 100644 internal/prometheus_adapter/api/alert_api.go
 create mode 100644 internal/prometheus_adapter/model/alert.go
 create mode 100644 internal/prometheus_adapter/model/prometheus_rule.go
 create mode 100644 internal/prometheus_adapter/service/alert_service.go

diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md
index 7c1d400..81aa9f5 100644
--- a/docs/prometheus_adapter/README.md
+++ b/docs/prometheus_adapter/README.md
@@ -1,97 +1,70 @@
-# Prometheus Adapter 模块文档
+# Prometheus Adapter
+
+基于 Prometheus 的指标查询与告警规则同步适配层，提供统一的 REST API：
+- 按服务与版本查询任意 Prometheus 指标
+- 同步告警规则到 Prometheus 并触发重载
+
+目录
+- 概述
+- 快速开始
+- 架构设计
+- API 参考
+  - 指标查询
+  - 告警规则同步
+- Alertmanager 集成
+- 支持的服务
+- 错误码
 
 ## 概述
 
-Prometheus Adapter 提供从 Prometheus 获取服务指标的 RESTful API 接口。支持按服务名称和版本进行查询。
+Prometheus Adapter 作为内部系统与 Prometheus 之间的适配层：
+- 向上暴露简洁、统一的 HTTP API
+- 向下负责 PromQL 查询与 Prometheus 规则文件管理
 
 ## 架构设计
 
-### 模块结构
+- 分层设计
+  - API 层（`internal/prometheus_adapter/api`）：HTTP 请求处理、参数校验、错误格式化
+  - Service 层（`internal/prometheus_adapter/service`）：业务逻辑、指标与服务存在性校验、数据装配
+  - Client 层（`internal/prometheus_adapter/client`）：与 Prometheus API 交互、PromQL 构建、结果转换
+  - Model 层（`internal/prometheus_adapter/model`）：统一数据模型、错误类型、常量
 
+- 目录结构
 ```
 internal/prometheus_adapter/
-├── server.go           # 服务器主入口，负责初始化和生命周期管理
-├── api/                # API 层，处理 HTTP 请求
-│   ├── api.go         # API 基础结构和初始化
-│   └── metric_api.go  # 指标相关的 API 处理器
-├── service/            # 业务逻辑层
-│   └── metric_service.go  # 指标查询服务实现
-├── client/             # Prometheus 客户端
+├── server.go              # 服务器主入口，负责初始化和生命周期管理
+├── api/                   # API 层，处理 HTTP 请求
+│   ├── api.go            # API 基础结构和初始化
+│   ├── metric_api.go     # 指标相关的 API 处理器
+│   └── alert_api.go      # 告警规则同步 API 处理器
+├── service/               # 业务逻辑层
+│   ├── metric_service.go # 指标查询服务实现
+│   └── alert_service.go  # 告警规则同步服务实现
+├── client/                # Prometheus 客户端
 │   └── prometheus_client.go  # 封装 Prometheus API 调用
-└── model/              # 数据模型
-    ├── api.go         # API 请求响应模型
-    ├── constants.go   # 常量定义（错误码等）
-    └── error.go       # 错误类型定义
+└── model/                 # 数据模型
+    ├── api.go            # API 请求响应模型
+    ├── alert.go          # 告警规则模型
+    ├── constants.go      # 常量定义（错误码等）
+    ├── error.go          # 错误类型定义
+    └── prometheus_rule.go # Prometheus 规则文件模型
 ```
 
-### 层次设计
+- 核心组件
+  - PrometheusAdapterServer：初始化客户端与路由，管理服务生命周期
+  - PrometheusClient：`QueryRange`、`GetAvailableMetrics`、`CheckMetricExists`、`CheckServiceExists`、`BuildQuery`
+  - MetricService：参数校验、动态指标发现、错误转换
+  - AlertService：告警规则同步、Prometheus 规则文件生成、配置重载
 
-1. **API 层** (`api/`)
-   - 处理 HTTP 请求和响应
-   - 参数验证和解析
-   - 错误响应格式化
+## API 
 
-2. **Service 层** (`service/`)
-   - 业务逻辑处理
-   - 指标和服务存在性验证
-   - 数据转换和组装
+### 指标查询
 
-3. **Client 层** (`client/`)
-   - 与 Prometheus API 交互
-   - PromQL 查询构建
-   - 结果数据转换
-
-4. **Model 层** (`model/`)
-   - 统一的数据模型定义
-   - 错误类型和错误码
-   - 请求响应结构体
-
-### 核心组件
-
-#### PrometheusAdapterServer
-主服务器组件，负责：
-- 初始化 Prometheus 客户端
-- 创建服务实例
-- 设置 API 路由
-- 管理生命周期
-
-#### PrometheusClient
-Prometheus 客户端封装，提供：
-- `QueryRange`: 执行时间范围查询
-- `GetAvailableMetrics`: 获取所有可用指标
-- `CheckMetricExists`: 检查指标是否存在
-- `CheckServiceExists`: 检查服务是否存在
-- `BuildQuery`: 构建 PromQL 查询语句
-
-#### MetricService
-业务逻辑服务，实现：
-- 动态指标发现
-- 查询参数验证
-- 错误处理和转换
-
-## 配置说明
-
-### 环境变量
-
-| 变量名 | 说明 | 默认值 |
-|--------|------|--------|
-| PROMETHEUS_ADDRESS | Prometheus 服务器地址 | http://10.210.10.33:9090 |
-
-## API
-
-### 1. 获取可用指标列表
-
-**GET** `/v1/metrics`
-
-获取所有可用的指标列表。
-
-#### 请求示例
-```bash
-GET /v1/metrics
+1) 获取可用指标列表
+- 方法与路径：`GET /v1/metrics`
+- 用途：列出当前可查询的所有指标名称
+- 响应示例：
 ```
-
-#### 响应示例
-```json
 {
   "metrics": [
     "system_cpu_usage_percent",
@@ -104,69 +77,36 @@ GET /v1/metrics
 }
 ```
 
-### 2. 通用指标查询接口
-
-**GET** `/v1/metrics/:service/:metric`
-
-获取指定服务的任意指标时间序列数据。指标不存在则返回错误。
-
-#### 路径参数
-- `service` (string, required): 服务名称
-- `metric` (string, required): 指标名称（必须是 Prometheus 中实际存在的指标）
-
-#### 查询参数
-- `version` (string, optional): 服务版本，不指定则返回所有版本
-- `start` (string, optional): 开始时间 (RFC3339 格式，如: 2024-01-01T00:00:00Z)
-- `end` (string, optional): 结束时间 (RFC3339 格式，如: 2024-01-01T01:00:00Z)
-- `step` (string, optional): 时间步长 (如: 1m, 5m, 1h)，默认 1m
-
-#### 请求示例
-
-1. **查询 CPU 使用率：**
-```bash
-GET /v1/metrics/metadata-service/system_cpu_usage_percent?version=1.0.0
-```
-
-2. **查询内存使用率：**
-```bash
-GET /v1/metrics/storage-service/system_memory_usage_percent?version=1.0.0
-```
-
-3. **查询 HTTP 请求延迟：**
-```bash
-GET /v1/metrics/storage-service/http_latency?version=1.0.0
+2) 查询指定服务的指标时间序列
+- 方法与路径：`GET /v1/metrics/{service}/{metric}`
+- 路径参数：
+  - `service`：服务名（必填）
+  - `metric`：指标名（必填，需为 Prometheus 中存在的指标）
+- 查询参数：
+  - `version`：服务版本（选填；不传则返回所有版本）
+  - `start`：开始时间（选填，RFC3339）
+  - `end`：结束时间（选填，RFC3339）
+  - `step`：步长（选填，如 `1m`、`5m`、`1h`；默认 `1m`）
+- 请求示例：
+  - `GET /v1/metrics/metadata-service/system_cpu_usage_percent?version=1.0.0`
+  - `GET /v1/metrics/storage-service/system_memory_usage_percent?version=1.0.0`
+  - `GET /v1/metrics/storage-service/http_latency?version=1.0.0`
+  - `GET /v1/metrics/storage-service/system_network_qps?version=1.0.0`
+- 成功响应示例：
 ```
-
-4. **查询网络 QPS：**
-```bash
-GET /v1/metrics/storage-service/system_network_qps?version=1.0.0
-```
-
-#### 成功响应示例
-
-**HTTP 200 OK**
-```json
 {
   "service": "metadata-service",
   "version": "1.0.0",
   "metric": "system_cpu_usage_percent",
   "data": [
-    {
-      "timestamp": "2024-01-01T00:00:00Z",
-      "value": 45.2
-    },
-    {
-      "timestamp": "2024-01-01T00:01:00Z",
-      "value": 48.5
-    }
+    { "timestamp": "2024-01-01T00:00:00Z", "value": 45.2 },
+    { "timestamp": "2024-01-01T00:01:00Z", "value": 48.5 }
   ]
 }
 ```
-
-#### 错误响应示例
-
-**指标不存在时 - HTTP 404 Not Found**
-```json
+- 错误响应示例：
+  - 指标不存在（404）：
+```
 {
   "error": {
     "code": "METRIC_NOT_FOUND",
@@ -175,9 +115,8 @@ GET /v1/metrics/storage-service/system_network_qps?version=1.0.0
   }
 }
 ```
-
-**服务不存在时 - HTTP 404 Not Found**
-```json
+  - 服务不存在（404）：
+```
 {
   "error": {
     "code": "SERVICE_NOT_FOUND",
@@ -186,9 +125,8 @@ GET /v1/metrics/storage-service/system_network_qps?version=1.0.0
   }
 }
 ```
-
-**参数错误时 - HTTP 400 Bad Request**
-```json
+  - 参数错误（400）：
+```
 {
   "error": {
     "code": "INVALID_PARAMETER",
@@ -199,15 +137,81 @@ GET /v1/metrics/storage-service/system_network_qps?version=1.0.0
 }
 ```
 
-## 实现说明
+### 告警规则同步
+
+- 方法与路径：`POST /v1/alert-rules/sync`
+- 功能：接收监控告警模块发送的完整规则列表，生成 Prometheus 规则文件并触发重载（全量同步）
+- 请求体示例：
+```
+{
+  "rules": [
+    {
+      "name": "high_cpu_usage",
+      "description": "CPU使用率过高告警",
+      "expr": "system_cpu_usage_percent",
+      "op": ">",
+      "severity": "warning"
+    }
+  ],
+  "rule_metas": [
+    {
+      "alert_name": "high_cpu_usage_storage_v1",
+      "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}",
+      "threshold": 90,
+      "watch_time": 300,
+      "match_time": "5m"
+    }
+  ]
+}
+```
+- 响应示例：
+```
+{
+  "status": "success",
+  "message": "Rules synced to Prometheus"
+}
+```
+
+## Alertmanager 集成
+
+- 目标：将 Prometheus 触发的告警通过 Alertmanager 转发到监控告警模块
+- `alertmanager.yml` 配置示例：
+```yaml
+global:
+  resolve_timeout: 5m
+
+route:
+  group_by: ['alertname', 'cluster', 'service']
+  group_wait: 10s
+  group_interval: 10s
+  repeat_interval: 1h
+  receiver: 'zeroops-alert-webhook'
+
+receivers:
+  - name: 'zeroops-alert-webhook'
+    webhook_configs:
+      - url: 'http://alert-module:8080/v1/integrations/alertmanager/webhook'
+        send_resolved: true
+```
+- 说明：
+  - `url`：监控告警模块的 webhook 地址（按实际部署修改主机与端口）
+  - `send_resolved`：为 `true` 时，告警恢复也会通知
+
+## 支持的服务
+
+当前 mock/s3 环境下：
+- `metadata-service`
+- `storage-service`
+- `queue-service`
+- `third-party-service`
+- `mock-error-service`
 
-### 支持的服务列表
+所有服务的版本信息通过标签 `service_version` 暴露。
 
-当前 mock/s3 环境中支持的服务：
-- `metadata-service` - 元数据管理服务
-- `storage-service` - 存储服务
-- `queue-service` - 消息队列服务
-- `third-party-service` - 第三方集成服务
-- `mock-error-service` - 错误模拟服务
+## 错误码
 
-所有服务的版本信息通过 `service_version` 标签暴露。
\ No newline at end of file
+- `METRIC_NOT_FOUND`：指标不存在
+- `SERVICE_NOT_FOUND`：服务不存在
+- `INVALID_PARAMETER`：请求参数不合法（如时间格式不正确）
+- `INTERNAL_ERROR`：内部服务器错误
+- `PROMETHEUS_ERROR`：Prometheus 查询失败
diff --git a/internal/prometheus_adapter/api/alert_api.go b/internal/prometheus_adapter/api/alert_api.go
new file mode 100644
index 0000000..cb3e968
--- /dev/null
+++ b/internal/prometheus_adapter/api/alert_api.go
@@ -0,0 +1,36 @@
+package api
+
+import (
+	"net/http"
+
+	"github.com/fox-gonic/fox"
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/model"
+)
+
+// setupAlertRouters 设置告警相关路由
+func (api *Api) setupAlertRouters(router *fox.Engine) {
+	router.POST("/v1/alert-rules/sync", api.SyncRules)
+}
+
+// SyncRules 同步规则到Prometheus
+// 接收从监控告警模块发来的规则列表，生成Prometheus规则文件并重载配置
+func (api *Api) SyncRules(c *fox.Context) {
+	var req model.SyncRulesRequest
+	if err := c.ShouldBindJSON(&req); err != nil {
+		SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter,
+			"Invalid request body: "+err.Error(), nil)
+		return
+	}
+
+	err := api.alertService.SyncRulesToPrometheus(req.Rules, req.RuleMetas)
+	if err != nil {
+		SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError,
+			"Failed to sync rules to Prometheus: "+err.Error(), nil)
+		return
+	}
+
+	c.JSON(http.StatusOK, map[string]string{
+		"status":  "success",
+		"message": "Rules synced to Prometheus",
+	})
+}
diff --git a/internal/prometheus_adapter/api/api.go b/internal/prometheus_adapter/api/api.go
index 4de0c96..2b6e432 100644
--- a/internal/prometheus_adapter/api/api.go
+++ b/internal/prometheus_adapter/api/api.go
@@ -1,20 +1,27 @@
 package api
 
 import (
+	"fmt"
+	"time"
+
 	"github.com/fox-gonic/fox"
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/model"
 	"github.com/qiniu/zeroops/internal/prometheus_adapter/service"
+	"github.com/rs/zerolog/log"
 )
 
 // Api Prometheus Adapter API
 type Api struct {
 	metricService *service.MetricService
+	alertService  *service.AlertService
 	router        *fox.Engine
 }
 
 // NewApi 创建新的 API
-func NewApi(metricService *service.MetricService, router *fox.Engine) (*Api, error) {
+func NewApi(metricService *service.MetricService, alertService *service.AlertService, router *fox.Engine) (*Api, error) {
 	api := &Api{
 		metricService: metricService,
+		alertService:  alertService,
 		router:        router,
 	}
 
@@ -26,4 +33,86 @@ func NewApi(metricService *service.MetricService, router *fox.Engine) (*Api, err
 func (api *Api) setupRouters(router *fox.Engine) {
 	// 指标相关路由
 	api.setupMetricRouters(router)
+	// 告警相关路由
+	api.setupAlertRouters(router)
+}
+
+// ========== 通用辅助方法 ==========
+
+// SendErrorResponse 发送错误响应（可被其他API模块使用）
+func SendErrorResponse(c *fox.Context, statusCode int, errorCode, message string, extras map[string]string) {
+	errorDetail := model.ErrorDetail{
+		Code:    errorCode,
+		Message: message,
+	}
+
+	// 添加额外的字段
+	if extras != nil {
+		if service, ok := extras["service"]; ok {
+			errorDetail.Service = service
+		}
+		if metric, ok := extras["metric"]; ok {
+			errorDetail.Metric = metric
+		}
+		if parameter, ok := extras["parameter"]; ok {
+			errorDetail.Parameter = parameter
+		}
+		if value, ok := extras["value"]; ok {
+			errorDetail.Value = value
+		}
+	}
+
+	response := model.ErrorResponse{
+		Error: errorDetail,
+	}
+
+	c.JSON(statusCode, response)
+}
+
+// ParseTimeRange 解析时间范围参数
+func ParseTimeRange(startStr, endStr string) (time.Time, time.Time, error) {
+	var start, end time.Time
+	var err error
+
+	// 如果没有指定开始时间，默认为1小时前
+	if startStr == "" {
+		start = time.Now().Add(-1 * time.Hour)
+	} else {
+		start, err = time.Parse(time.RFC3339, startStr)
+		if err != nil {
+			return time.Time{}, time.Time{}, fmt.Errorf("invalid start time format: %w", err)
+		}
+	}
+
+	// 如果没有指定结束时间，默认为当前时间
+	if endStr == "" {
+		end = time.Now()
+	} else {
+		end, err = time.Parse(time.RFC3339, endStr)
+		if err != nil {
+			return time.Time{}, time.Time{}, fmt.Errorf("invalid end time format: %w", err)
+		}
+	}
+
+	// 验证时间范围的合理性
+	if end.Before(start) {
+		return time.Time{}, time.Time{}, fmt.Errorf("end time must be after start time")
+	}
+
+	return start, end, nil
+}
+
+// ParseStep 解析步长参数
+func ParseStep(stepStr string) time.Duration {
+	if stepStr == "" {
+		return time.Minute // 默认1分钟
+	}
+
+	duration, err := time.ParseDuration(stepStr)
+	if err != nil {
+		log.Warn().Str("step", stepStr).Msg("invalid step format, using default")
+		return time.Minute
+	}
+
+	return duration
 }
diff --git a/internal/prometheus_adapter/api/metric_api.go b/internal/prometheus_adapter/api/metric_api.go
index 431bfdf..832362f 100644
--- a/internal/prometheus_adapter/api/metric_api.go
+++ b/internal/prometheus_adapter/api/metric_api.go
@@ -4,7 +4,6 @@ import (
 	"errors"
 	"fmt"
 	"net/http"
-	"time"
 
 	"github.com/fox-gonic/fox"
 	"github.com/qiniu/zeroops/internal/prometheus_adapter/model"
@@ -24,7 +23,7 @@ func (api *Api) GetMetrics(c *fox.Context) {
 	response, err := api.metricService.GetAvailableMetrics(ctx)
 	if err != nil {
 		log.Error().Err(err).Msg("failed to get available metrics")
-		api.sendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, "获取指标列表失败", nil)
+		SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError, "获取指标列表失败", nil)
 		return
 	}
 
@@ -46,16 +45,16 @@ func (api *Api) QueryMetric(c *fox.Context) {
 	stepStr := c.Query("step")
 
 	// 解析时间参数
-	start, end, err := api.parseTimeRange(startStr, endStr)
+	start, end, err := ParseTimeRange(startStr, endStr)
 	if err != nil {
 		log.Error().Err(err).Msg("invalid time parameters")
-		api.sendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter,
+		SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter,
 			fmt.Sprintf("参数 'start/end' 格式错误: %s", err.Error()), nil)
 		return
 	}
 
 	// 解析步长参数
-	step := api.parseStep(stepStr)
+	step := ParseStep(stepStr)
 
 	// 查询指标
 	response, err := api.metricService.QueryMetric(ctx, serviceName, metricName, version, start, end, step)
@@ -67,54 +66,6 @@ func (api *Api) QueryMetric(c *fox.Context) {
 	c.JSON(http.StatusOK, response)
 }
 
-// parseTimeRange 解析时间范围参数
-func (api *Api) parseTimeRange(startStr, endStr string) (time.Time, time.Time, error) {
-	var start, end time.Time
-	var err error
-
-	// 如果没有指定开始时间，默认为1小时前
-	if startStr == "" {
-		start = time.Now().Add(-1 * time.Hour)
-	} else {
-		start, err = time.Parse(time.RFC3339, startStr)
-		if err != nil {
-			return time.Time{}, time.Time{}, fmt.Errorf("invalid start time format: %w", err)
-		}
-	}
-
-	// 如果没有指定结束时间，默认为当前时间
-	if endStr == "" {
-		end = time.Now()
-	} else {
-		end, err = time.Parse(time.RFC3339, endStr)
-		if err != nil {
-			return time.Time{}, time.Time{}, fmt.Errorf("invalid end time format: %w", err)
-		}
-	}
-
-	// 验证时间范围的合理性
-	if end.Before(start) {
-		return time.Time{}, time.Time{}, fmt.Errorf("end time must be after start time")
-	}
-
-	return start, end, nil
-}
-
-// parseStep 解析步长参数
-func (api *Api) parseStep(stepStr string) time.Duration {
-	if stepStr == "" {
-		return time.Minute // 默认1分钟
-	}
-
-	duration, err := time.ParseDuration(stepStr)
-	if err != nil {
-		log.Warn().Str("step", stepStr).Msg("invalid step format, using default")
-		return time.Minute
-	}
-
-	return duration
-}
-
 // handleQueryError 处理查询错误
 func (api *Api) handleQueryError(c *fox.Context, err error, service, metric string) {
 	var serviceNotFound *model.ServiceNotFoundError
@@ -124,52 +75,22 @@ func (api *Api) handleQueryError(c *fox.Context, err error, service, metric stri
 	switch {
 	case errors.As(err, &serviceNotFound):
 		log.Error().Err(err).Str("service", service).Msg("service not found")
-		api.sendErrorResponse(c, http.StatusNotFound, model.ErrorCodeServiceNotFound,
+		SendErrorResponse(c, http.StatusNotFound, model.ErrorCodeServiceNotFound,
 			err.Error(), map[string]string{"service": service})
 
 	case errors.As(err, &metricNotFound):
 		log.Error().Err(err).Str("metric", metric).Msg("metric not found")
-		api.sendErrorResponse(c, http.StatusNotFound, model.ErrorCodeMetricNotFound,
+		SendErrorResponse(c, http.StatusNotFound, model.ErrorCodeMetricNotFound,
 			err.Error(), map[string]string{"metric": metric})
 
 	case errors.As(err, &prometheusError):
 		log.Error().Err(err).Msg("prometheus query error")
-		api.sendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodePrometheusError,
+		SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodePrometheusError,
 			"Prometheus 查询失败", nil)
 
 	default:
 		log.Error().Err(err).Msg("unexpected error during metric query")
-		api.sendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError,
+		SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError,
 			"内部服务器错误", nil)
 	}
 }
-
-// sendErrorResponse 发送错误响应
-func (api *Api) sendErrorResponse(c *fox.Context, statusCode int, errorCode, message string, extras map[string]string) {
-	errorDetail := model.ErrorDetail{
-		Code:    errorCode,
-		Message: message,
-	}
-
-	// 添加额外的字段
-	if extras != nil {
-		if service, ok := extras["service"]; ok {
-			errorDetail.Service = service
-		}
-		if metric, ok := extras["metric"]; ok {
-			errorDetail.Metric = metric
-		}
-		if parameter, ok := extras["parameter"]; ok {
-			errorDetail.Parameter = parameter
-		}
-		if value, ok := extras["value"]; ok {
-			errorDetail.Value = value
-		}
-	}
-
-	response := model.ErrorResponse{
-		Error: errorDetail,
-	}
-
-	c.JSON(statusCode, response)
-}
diff --git a/internal/prometheus_adapter/model/alert.go b/internal/prometheus_adapter/model/alert.go
new file mode 100644
index 0000000..605c5fb
--- /dev/null
+++ b/internal/prometheus_adapter/model/alert.go
@@ -0,0 +1,20 @@
+package model
+
+// AlertRule 告警规则表 - 定义告警规则模板
+type AlertRule struct {
+	Name        string `json:"name" gorm:"type:varchar(255);primaryKey"`
+	Description string `json:"description" gorm:"type:text"`
+	Expr        string `json:"expr" gorm:"type:text;not null"`
+	Op          string `json:"op" gorm:"type:enum('>', '<', '=', '!=');not null"`
+	Severity    string `json:"severity" gorm:"type:varchar(50);not null"`
+}
+
+// AlertRuleMeta 告警规则元信息表 - 存储服务级别的告警配置
+// 用于将告警规则模板实例化为具体的服务告警
+type AlertRuleMeta struct {
+	AlertName string  `json:"alert_name" gorm:"type:varchar(255);primaryKey"`
+	Labels    string  `json:"labels" gorm:"type:text"`     // JSON格式的服务标签，如：{"service":"storage-service","version":"1.0.0"}
+	Threshold float64 `json:"threshold"`                   // 告警阈值
+	WatchTime int     `json:"watch_time"`                  // 持续时间（秒），对应Prometheus的for字段
+	MatchTime string  `json:"match_time" gorm:"type:text"` // 时间范围表达式
+}
diff --git a/internal/prometheus_adapter/model/api.go b/internal/prometheus_adapter/model/api.go
index efef1d2..f7f6e7f 100644
--- a/internal/prometheus_adapter/model/api.go
+++ b/internal/prometheus_adapter/model/api.go
@@ -2,7 +2,7 @@ package model
 
 import "time"
 
-// ===== API 响应结构体 =====
+// ===== 指标相关 API =====
 
 // MetricListResponse 指标列表响应（对应 GET /v1/metrics）
 type MetricListResponse struct {
@@ -22,3 +22,58 @@ type MetricDataPoint struct {
 	Timestamp time.Time `json:"timestamp"`
 	Value     float64   `json:"value"`
 }
+
+// ===== 告警规则相关 API =====
+
+// CreateAlertRuleRequest 创建告警规则请求
+type CreateAlertRuleRequest struct {
+	Name        string `json:"name" binding:"required"`
+	Description string `json:"description,omitempty"`
+	Expr        string `json:"expr" binding:"required"`
+	Op          string `json:"op" binding:"required,oneof=> < = !="`
+	Severity    string `json:"severity" binding:"required"`
+
+	// 元信息字段（可选）
+	Labels    map[string]string `json:"labels,omitempty"`
+	Threshold float64           `json:"threshold,omitempty"`
+	WatchTime int               `json:"watch_time,omitempty"`
+	MatchTime string            `json:"match_time,omitempty"`
+}
+
+// UpdateAlertRuleRequest 更新告警规则请求
+type UpdateAlertRuleRequest struct {
+	Description *string `json:"description,omitempty"`
+	Expr        *string `json:"expr,omitempty"`
+	Op          *string `json:"op,omitempty" binding:"omitempty,oneof=> < = !="`
+	Severity    *string `json:"severity,omitempty"`
+
+	// 元信息字段（可选）
+	Labels    map[string]string `json:"labels,omitempty"`
+	Threshold *float64          `json:"threshold,omitempty"`
+	WatchTime *int              `json:"watch_time,omitempty"`
+	MatchTime *string           `json:"match_time,omitempty"`
+}
+
+// CreateAlertRuleMetaRequest 创建告警规则元信息请求
+type CreateAlertRuleMetaRequest struct {
+	AlertName string            `json:"alert_name" binding:"required"`
+	Labels    map[string]string `json:"labels" binding:"required"`
+	Threshold float64           `json:"threshold" binding:"required"`
+	WatchTime int               `json:"watch_time,omitempty"`
+	MatchTime string            `json:"match_time,omitempty"`
+}
+
+// UpdateAlertRuleMetaRequest 更新告警规则元信息请求
+type UpdateAlertRuleMetaRequest struct {
+	Labels    map[string]string `json:"labels,omitempty"`
+	Threshold *float64          `json:"threshold,omitempty"`
+	WatchTime *int              `json:"watch_time,omitempty"`
+	MatchTime *string           `json:"match_time,omitempty"`
+}
+
+// SyncRulesRequest 同步规则请求
+// 从监控告警模块发送过来的完整规则列表
+type SyncRulesRequest struct {
+	Rules     []AlertRule     `json:"rules"`      // 告警规则列表
+	RuleMetas []AlertRuleMeta `json:"rule_metas"` // 规则元信息列表
+}
diff --git a/internal/prometheus_adapter/model/constants.go b/internal/prometheus_adapter/model/constants.go
index 3e8727b..3992eae 100644
--- a/internal/prometheus_adapter/model/constants.go
+++ b/internal/prometheus_adapter/model/constants.go
@@ -7,4 +7,5 @@ const (
 	ErrorCodeInvalidParameter = "INVALID_PARAMETER"
 	ErrorCodePrometheusError  = "PROMETHEUS_ERROR"
 	ErrorCodeInternalError    = "INTERNAL_ERROR"
+	ErrorCodeRuleNotFound     = "RULE_NOT_FOUND"
 )
diff --git a/internal/prometheus_adapter/model/prometheus_rule.go b/internal/prometheus_adapter/model/prometheus_rule.go
new file mode 100644
index 0000000..3d5c9e4
--- /dev/null
+++ b/internal/prometheus_adapter/model/prometheus_rule.go
@@ -0,0 +1,21 @@
+package model
+
+// PrometheusRule Prometheus规则文件中的单个规则
+type PrometheusRule struct {
+	Alert       string            `yaml:"alert"`
+	Expr        string            `yaml:"expr"`
+	For         string            `yaml:"for,omitempty"`
+	Labels      map[string]string `yaml:"labels,omitempty"`
+	Annotations map[string]string `yaml:"annotations,omitempty"`
+}
+
+// PrometheusRuleGroup Prometheus规则组
+type PrometheusRuleGroup struct {
+	Name  string           `yaml:"name"`
+	Rules []PrometheusRule `yaml:"rules"`
+}
+
+// PrometheusRuleFile Prometheus规则文件结构
+type PrometheusRuleFile struct {
+	Groups []PrometheusRuleGroup `yaml:"groups"`
+}
diff --git a/internal/prometheus_adapter/server.go b/internal/prometheus_adapter/server.go
index e921668..d9fb2f4 100644
--- a/internal/prometheus_adapter/server.go
+++ b/internal/prometheus_adapter/server.go
@@ -17,6 +17,7 @@ type PrometheusAdapterServer struct {
 	config        *config.Config
 	promClient    *client.PrometheusClient
 	metricService *service.MetricService
+	alertService  *service.AlertService
 	api           *api.Api
 }
 
@@ -37,10 +38,14 @@ func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, e
 	// 创建指标服务
 	metricService := service.NewMetricService(promClient)
 
+	// 创建告警服务
+	alertService := service.NewAlertService(promClient)
+
 	server := &PrometheusAdapterServer{
 		config:        cfg,
 		promClient:    promClient,
 		metricService: metricService,
+		alertService:  alertService,
 	}
 
 	log.Info().Str("prometheus_address", prometheusAddr).Msg("Prometheus Adapter initialized successfully")
@@ -50,10 +55,11 @@ func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, e
 // UseApi 设置 API 路由
 func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error {
 	var err error
-	s.api, err = api.NewApi(s.metricService, router)
+	s.api, err = api.NewApi(s.metricService, s.alertService, router)
 	if err != nil {
 		return fmt.Errorf("failed to initialize API: %w", err)
 	}
+
 	return nil
 }
 
diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go
new file mode 100644
index 0000000..2be8095
--- /dev/null
+++ b/internal/prometheus_adapter/service/alert_service.go
@@ -0,0 +1,282 @@
+package service
+
+import (
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/client"
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/model"
+	"github.com/rs/zerolog/log"
+	"gopkg.in/yaml.v3"
+)
+
+// AlertService 告警服务 - 仅负责与Prometheus交互，不存储规则
+type AlertService struct {
+	promClient    *client.PrometheusClient
+	rulesFilePath string
+}
+
+// NewAlertService 创建告警服务
+func NewAlertService(promClient *client.PrometheusClient) *AlertService {
+	rulesFilePath := os.Getenv("PROMETHEUS_RULES_FILE")
+	if rulesFilePath == "" {
+		rulesFilePath = "/etc/prometheus/rules/alert_rules.yml"
+	}
+
+	return &AlertService{
+		promClient:    promClient,
+		rulesFilePath: rulesFilePath,
+	}
+}
+
+// SyncRulesToPrometheus 同步规则到Prometheus
+// 接收完整的规则列表，生成Prometheus规则文件并重载配置
+func (s *AlertService) SyncRulesToPrometheus(rules []model.AlertRule, ruleMetas []model.AlertRuleMeta) error {
+	// 构建Prometheus规则文件
+	prometheusRules := s.buildPrometheusRules(rules, ruleMetas)
+
+	// 写入规则文件
+	if err := s.writeRulesFile(prometheusRules); err != nil {
+		return fmt.Errorf("failed to write rules file: %w", err)
+	}
+
+	// 通知Prometheus重新加载配置
+	if err := s.reloadPrometheus(); err != nil {
+		log.Warn().Err(err).Msg("Failed to reload Prometheus, rules file has been updated")
+		// 不返回错误，因为文件已经更新成功
+	}
+
+	log.Info().
+		Int("rules_count", len(rules)).
+		Int("metas_count", len(ruleMetas)).
+		Msg("Rules synced to Prometheus successfully")
+
+	return nil
+}
+
+// buildPrometheusRules 构建Prometheus规则
+func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas []model.AlertRuleMeta) *model.PrometheusRuleFile {
+	promRules := []model.PrometheusRule{}
+
+	// 创建规则名到规则的映射
+	ruleMap := make(map[string]*model.AlertRule)
+	for i := range rules {
+		ruleMap[rules[i].Name] = &rules[i]
+	}
+
+	// 为每个元信息生成Prometheus规则
+	for _, meta := range ruleMetas {
+		// 查找对应的规则模板
+		var rule *model.AlertRule
+
+		// 尝试从alert_name中提取规则名
+		// 假设alert_name格式为: {rule_name}_{service}_{version} 或类似格式
+		for ruleName, r := range ruleMap {
+			if strings.HasPrefix(meta.AlertName, ruleName) {
+				rule = r
+				break
+			}
+		}
+
+		if rule == nil {
+			log.Warn().
+				Str("alert_name", meta.AlertName).
+				Msg("No matching rule template found for alert meta, skipping")
+			continue
+		}
+
+		// 解析标签
+		var labels map[string]string
+		if meta.Labels != "" {
+			if err := json.Unmarshal([]byte(meta.Labels), &labels); err != nil {
+				log.Warn().
+					Err(err).
+					Str("alert_name", meta.AlertName).
+					Msg("Failed to parse labels, using empty labels")
+				labels = make(map[string]string)
+			}
+		} else {
+			labels = make(map[string]string)
+		}
+
+		// 添加severity标签
+		labels["severity"] = rule.Severity
+		labels["rule_name"] = rule.Name
+
+		// 构建表达式
+		expr := s.buildExpression(rule, &meta)
+
+		// 构建注释
+		annotations := map[string]string{
+			"description": rule.Description,
+			"summary":     fmt.Sprintf("%s %s %f", rule.Expr, rule.Op, meta.Threshold),
+		}
+
+		// 计算for字段
+		forDuration := ""
+		if meta.WatchTime > 0 {
+			forDuration = fmt.Sprintf("%ds", meta.WatchTime)
+		}
+
+		promRule := model.PrometheusRule{
+			Alert:       meta.AlertName,
+			Expr:        expr,
+			For:         forDuration,
+			Labels:      labels,
+			Annotations: annotations,
+		}
+
+		promRules = append(promRules, promRule)
+	}
+
+	// 如果没有元信息，为每个规则创建默认规则
+	if len(ruleMetas) == 0 {
+		for _, rule := range rules {
+			labels := map[string]string{
+				"severity": rule.Severity,
+			}
+
+			annotations := map[string]string{
+				"description": rule.Description,
+				"summary":     fmt.Sprintf("%s triggered", rule.Name),
+			}
+
+			promRule := model.PrometheusRule{
+				Alert:       rule.Name,
+				Expr:        rule.Expr,
+				Labels:      labels,
+				Annotations: annotations,
+			}
+
+			promRules = append(promRules, promRule)
+		}
+	}
+
+	return &model.PrometheusRuleFile{
+		Groups: []model.PrometheusRuleGroup{
+			{
+				Name:  "zeroops_alerts",
+				Rules: promRules,
+			},
+		},
+	}
+}
+
+// buildExpression 构建PromQL表达式
+func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertRuleMeta) string {
+	expr := rule.Expr
+
+	// 解析标签并添加到表达式中
+	var labels map[string]string
+	if meta.Labels != "" {
+		json.Unmarshal([]byte(meta.Labels), &labels)
+	}
+
+	if len(labels) > 0 {
+		labelMatchers := []string{}
+		for k, v := range labels {
+			// 跳过内部使用的标签
+			if k == "rule_name" {
+				continue
+			}
+			labelMatchers = append(labelMatchers, fmt.Sprintf(`%s="%s"`, k, v))
+		}
+
+		if len(labelMatchers) > 0 {
+			// 如果表达式包含{，说明已经有标签选择器
+			if strings.Contains(expr, "{") {
+				expr = strings.Replace(expr, "}", ","+strings.Join(labelMatchers, ",")+"}", 1)
+			} else {
+				// 在指标名后添加标签选择器
+				// 查找第一个非字母数字下划线的字符
+				metricEnd := 0
+				for i, ch := range expr {
+					if !((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
+						(ch >= '0' && ch <= '9') || ch == '_') {
+						metricEnd = i
+						break
+					}
+				}
+				if metricEnd == 0 {
+					metricEnd = len(expr)
+				}
+				expr = expr[:metricEnd] + "{" + strings.Join(labelMatchers, ",") + "}" + expr[metricEnd:]
+			}
+		}
+	}
+
+	// 添加时间范围
+	if meta.MatchTime != "" {
+		// 查找最后一个指标，添加时间范围
+		if !strings.Contains(expr, "[") {
+			// 简单处理：在第一个空格前添加时间范围
+			parts := strings.SplitN(expr, " ", 2)
+			if len(parts) == 2 {
+				expr = parts[0] + "[" + meta.MatchTime + "] " + parts[1]
+			} else {
+				expr = expr + "[" + meta.MatchTime + "]"
+			}
+		}
+	}
+
+	// 添加比较操作符和阈值
+	if meta.Threshold != 0 {
+		expr = fmt.Sprintf("%s %s %f", expr, rule.Op, meta.Threshold)
+	}
+
+	return expr
+}
+
+// writeRulesFile 写入规则文件
+func (s *AlertService) writeRulesFile(rules *model.PrometheusRuleFile) error {
+	// 确保目录存在
+	dir := filepath.Dir(s.rulesFilePath)
+	if err := os.MkdirAll(dir, 0755); err != nil {
+		return fmt.Errorf("failed to create rules directory: %w", err)
+	}
+
+	// 序列化为YAML
+	data, err := yaml.Marshal(rules)
+	if err != nil {
+		return fmt.Errorf("failed to marshal rules: %w", err)
+	}
+
+	// 写入文件
+	if err := os.WriteFile(s.rulesFilePath, data, 0644); err != nil {
+		return fmt.Errorf("failed to write rules file: %w", err)
+	}
+
+	log.Info().
+		Str("file", s.rulesFilePath).
+		Int("groups", len(rules.Groups)).
+		Msg("Prometheus rules file updated")
+
+	return nil
+}
+
+// reloadPrometheus 重新加载Prometheus配置
+func (s *AlertService) reloadPrometheus() error {
+	prometheusURL := os.Getenv("PROMETHEUS_ADDRESS")
+	if prometheusURL == "" {
+		prometheusURL = "http://10.210.10.33:9090"
+	}
+
+	reloadURL := fmt.Sprintf("%s/-/reload", strings.TrimSuffix(prometheusURL, "/"))
+
+	resp, err := http.Post(reloadURL, "text/plain", nil)
+	if err != nil {
+		return fmt.Errorf("failed to reload Prometheus: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("Prometheus reload failed with status: %d", resp.StatusCode)
+	}
+
+	log.Info().Msg("Prometheus configuration reloaded")
+	return nil
+}

From 54b8f4d0b3c12f41ccc7201bb53559c3dd9793e3 Mon Sep 17 00:00:00 2001
From: dnj <shdnj@qq.com>
Date: Wed, 24 Sep 2025 19:22:01 +0800
Subject: [PATCH 07/18] =?UTF-8?q?feat(prometheus=5Fadapter):=20=E5=AE=9E?=
 =?UTF-8?q?=E7=8E=B0=E5=91=8A=E8=AD=A6=E8=A7=84=E5=88=99=E5=A2=9E=E9=87=8F?=
 =?UTF-8?q?=E6=9B=B4=E6=96=B0=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ps：未跑通
---
 cmd/prometheus_adapter/main.go                |  50 +++
 docs/prometheus_adapter/README.md             |  85 ++++-
 go.mod                                        |   2 +-
 internal/prometheus_adapter/api/alert_api.go  |  86 +++++
 internal/prometheus_adapter/model/alert.go    |  19 +-
 internal/prometheus_adapter/model/api.go      |  22 +-
 .../service/alert_service.go                  | 249 +++++++++++--
 .../prometheus_adapter/test_alert_update.sh   |  91 +++++
 scripts/prometheus_adapter/build.sh           | 180 +++++++++
 scripts/prometheus_adapter/deploy.sh          | 350 ++++++++++++++++++
 10 files changed, 1071 insertions(+), 63 deletions(-)
 create mode 100644 cmd/prometheus_adapter/main.go
 create mode 100755 internal/prometheus_adapter/test_alert_update.sh
 create mode 100755 scripts/prometheus_adapter/build.sh
 create mode 100755 scripts/prometheus_adapter/deploy.sh

diff --git a/cmd/prometheus_adapter/main.go b/cmd/prometheus_adapter/main.go
new file mode 100644
index 0000000..9f45442
--- /dev/null
+++ b/cmd/prometheus_adapter/main.go
@@ -0,0 +1,50 @@
+package main
+
+import (
+	"os"
+
+	"github.com/fox-gonic/fox"
+	"github.com/qiniu/zeroops/internal/config"
+	prometheusadapter "github.com/qiniu/zeroops/internal/prometheus_adapter"
+	"github.com/rs/zerolog"
+	"github.com/rs/zerolog/log"
+)
+
+func main() {
+	// 配置日志
+	log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
+
+	log.Info().Msg("Starting Prometheus Adapter server")
+
+	// 加载配置
+	cfg := &config.Config{
+		Server: config.ServerConfig{
+			BindAddr: ":9999", // 默认端口
+		},
+	}
+
+	// 如果有环境变量，使用环境变量的端口
+	if port := os.Getenv("ADAPTER_PORT"); port != "" {
+		cfg.Server.BindAddr = ":" + port
+	}
+
+	// 创建 Prometheus Adapter 服务器
+	adapter, err := prometheusadapter.NewPrometheusAdapterServer(cfg)
+	if err != nil {
+		log.Fatal().Err(err).Msg("Failed to create Prometheus Adapter server")
+	}
+
+	// 创建路由
+	router := fox.New()
+
+	// 启动 API
+	if err := adapter.UseApi(router); err != nil {
+		log.Fatal().Err(err).Msg("Failed to setup API routes")
+	}
+
+	// 启动服务器
+	log.Info().Msgf("Starting Prometheus Adapter on %s", cfg.Server.BindAddr)
+	if err := router.Run(cfg.Server.BindAddr); err != nil {
+		log.Fatal().Err(err).Msg("Failed to start server")
+	}
+}
diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md
index 81aa9f5..0a92312 100644
--- a/docs/prometheus_adapter/README.md
+++ b/docs/prometheus_adapter/README.md
@@ -139,10 +139,11 @@ internal/prometheus_adapter/
 
 ### 告警规则同步
 
+#### 1. 全量同步规则
 - 方法与路径：`POST /v1/alert-rules/sync`
 - 功能：接收监控告警模块发送的完整规则列表，生成 Prometheus 规则文件并触发重载（全量同步）
 - 请求体示例：
-```
+```jsonc
 {
   "rules": [
     {
@@ -155,23 +156,95 @@ internal/prometheus_adapter/
   ],
   "rule_metas": [
     {
-      "alert_name": "high_cpu_usage_storage_v1",
+      "alert_name": "high_cpu_usage",  // 与规则模板的name字段保持一致
       "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}",
       "threshold": 90,
-      "watch_time": 300,
-      "match_time": "5m"
+      "watch_time": 300
     }
   ]
 }
 ```
 - 响应示例：
-```
+```json
 {
   "status": "success",
   "message": "Rules synced to Prometheus"
 }
 ```
 
+#### 2. 更新单个规则模板
+- 方法与路径：`PUT /v1/alert-rules/:rule_name`
+- 功能：更新指定的告警规则模板，系统会自动查找所有使用该规则的元信息并重新生成 Prometheus 规则
+- 路径参数：
+  - `rule_name`：规则名称（如 `high_cpu_usage`）
+- 请求体示例：
+```json
+{
+  "description": "CPU使用率异常告警（更新后）",
+  "expr": "avg(system_cpu_usage_percent)",
+  "op": ">",
+  "severity": "critical"
+}
+```
+- 响应示例：
+```json
+{
+  "status": "success",
+  "message": "Rule 'high_cpu_usage' updated and synced to Prometheus",
+  "affected_metas": 3  // 影响的元信息数量
+}
+```
+
+#### 3. 更新单个规则元信息
+- 方法与路径：`PUT /v1/alert-rules/meta`
+- 功能：更新指定规则的元信息，系统会根据对应的规则模板重新生成 Prometheus 规则
+- 请求体示例：
+```json
+{
+  "alert_name": "high_cpu_usage",  // 必填，对应规则模板的name
+  "labels": "{\"service\":\"storage-service\",\"version\":\"2.0.0\"}",  // 必填，用于唯一标识
+  "threshold": 85,
+  "watch_time": 600
+}
+```
+- 响应示例：
+```json
+{
+  "status": "success",
+  "message": "Rule meta updated and synced to Prometheus",
+  "alert_name": "high_cpu_usage",
+  "labels": "{\"service\":\"storage-service\",\"version\":\"2.0.0\"}"
+}
+```
+
+#### 规则生成机制
+- **规则模板与元信息关联**：通过 `alert_name` 字段关联
+  - `AlertRule.name` = `AlertRuleMeta.alert_name`
+- **元信息唯一标识**：通过 `alert_name` + `labels` 的组合唯一确定一个元信息记录
+- **Prometheus 告警生成**：
+  - 所有基于同一规则模板的告警使用相同的 `alert` 名称（即规则模板的 `name`）
+  - 通过 `labels` 区分不同的服务实例
+
+#### 字段说明
+- **AlertRule（规则模板）**：
+  - `name`：规则名称，作为 Prometheus 的 alert 名称
+  - `description`：规则描述，可读的 title
+  - `expr`：PromQL 表达式，如 `sum(apitime) by (service, version)`，可包含时间范围
+  - `op`：比较操作符（`>`, `<`, `=`, `!=`）
+  - `severity`：告警等级，通常进入告警的 labels.severity
+- **AlertRuleMeta（元信息）**：
+  - `alert_name`：关联的规则名称（对应 alert_rules.name）
+  - `labels`：JSON 格式的标签，用于筛选特定服务（如 `{"service":"s3","version":"v1"}`）
+  - `threshold`：告警阈值
+  - `watch_time`：持续时间（秒），对应 Prometheus 的 `for` 字段
+
+#### 增量更新说明
+- **增量更新**：新接口支持增量更新，只需传入需要修改的字段
+- **自动匹配**：
+  - 更新规则模板时，系统自动查找所有 `alert_name` 匹配的元信息并重新生成规则
+  - 更新元信息时，系统根据 `alert_name` + `labels` 查找并更新对应的元信息
+- **缓存机制**：系统在内存中缓存当前的规则和元信息，支持快速增量更新
+
 ## Alertmanager 集成
 
 - 目标：将 Prometheus 触发的告警通过 Alertmanager 转发到监控告警模块
@@ -203,7 +276,7 @@ receivers:
 - `metadata-service`
 - `storage-service`
 - `queue-service`
-- `third-party-service`（原文为 third-party-servrice，已更正）
+- `third-party-service`
 - `mock-error-service`
 
 所有服务的版本信息通过标签 `service_version` 暴露。
diff --git a/go.mod b/go.mod
index 8015824..94b9643 100644
--- a/go.mod
+++ b/go.mod
@@ -11,6 +11,7 @@ require (
 	github.com/prometheus/common v0.66.1
 	github.com/redis/go-redis/v9 v9.5.1
 	github.com/rs/zerolog v1.34.0
+	gopkg.in/yaml.v3 v3.0.1
 )
 
 require (
@@ -52,5 +53,4 @@ require (
 	golang.org/x/sys v0.35.0 // indirect
 	golang.org/x/text v0.28.0 // indirect
 	google.golang.org/protobuf v1.36.8 // indirect
-	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
diff --git a/internal/prometheus_adapter/api/alert_api.go b/internal/prometheus_adapter/api/alert_api.go
index cb3e968..3206087 100644
--- a/internal/prometheus_adapter/api/alert_api.go
+++ b/internal/prometheus_adapter/api/alert_api.go
@@ -1,6 +1,7 @@
 package api
 
 import (
+	"fmt"
 	"net/http"
 
 	"github.com/fox-gonic/fox"
@@ -10,6 +11,8 @@ import (
 // setupAlertRouters 设置告警相关路由
 func (api *Api) setupAlertRouters(router *fox.Engine) {
 	router.POST("/v1/alert-rules/sync", api.SyncRules)
+	router.PUT("/v1/alert-rules/:rule_name", api.UpdateRule)
+	router.PUT("/v1/alert-rules/meta", api.UpdateRuleMeta)
 }
 
 // SyncRules 同步规则到Prometheus
@@ -34,3 +37,86 @@ func (api *Api) SyncRules(c *fox.Context) {
 		"message": "Rules synced to Prometheus",
 	})
 }
+
+// UpdateRule handles PUT /v1/alert-rules/:rule_name and updates one rule template.
+// It answers 400 for a missing name or an invalid body, 500 when the service update fails, else 200.
+func (api *Api) UpdateRule(c *fox.Context) {
+	ruleName := c.Param("rule_name")
+	if ruleName == "" {
+		SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter,
+			"Rule name is required", nil)
+		return
+	}
+
+	var req model.UpdateAlertRuleRequest
+	if err := c.ShouldBindJSON(&req); err != nil {
+		SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter,
+			"Invalid request body: "+err.Error(), nil)
+		return
+	}
+
+	// Build the rule object: name from the path, all other fields from the request body
+	rule := model.AlertRule{
+		Name:        ruleName,
+		Description: req.Description,
+		Expr:        req.Expr,
+		Op:          req.Op,
+		Severity:    req.Severity,
+	}
+
+	err := api.alertService.UpdateRule(rule)
+	if err != nil {
+		SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError,
+			"Failed to update rule: "+err.Error(), nil)
+		return
+	}
+
+	// Number of cached metas whose alert_name equals this rule name (reported to the caller)
+	affectedCount := api.alertService.GetAffectedMetas(ruleName)
+
+	c.JSON(http.StatusOK, map[string]interface{}{
+		"status":         "success",
+		"message":        fmt.Sprintf("Rule '%s' updated and synced to Prometheus", ruleName),
+		"affected_metas": affectedCount,
+	})
+}
+
+// UpdateRuleMeta handles PUT /v1/alert-rules/meta and updates one rule meta record.
+// A meta record is identified by the pair alert_name + labels taken from the request body.
+func (api *Api) UpdateRuleMeta(c *fox.Context) {
+	var req model.UpdateAlertRuleMetaRequest
+	if err := c.ShouldBindJSON(&req); err != nil {
+		SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter,
+			"Invalid request body: "+err.Error(), nil)
+		return
+	}
+
+	// alert_name and labels are mandatory
+	if req.AlertName == "" || req.Labels == "" {
+		SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter,
+			"alert_name and labels are required", nil)
+		return
+	}
+
+	// Build the meta object from the request fields
+	meta := model.AlertRuleMeta{
+		AlertName: req.AlertName,
+		Labels:    req.Labels,
+		Threshold: req.Threshold,
+		WatchTime: req.WatchTime,
+	}
+
+	err := api.alertService.UpdateRuleMeta(meta)
+	if err != nil {
+		SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError,
+			"Failed to update rule meta: "+err.Error(), nil)
+		return
+	}
+
+	c.JSON(http.StatusOK, map[string]interface{}{
+		"status":     "success",
+		"message":    "Rule meta updated and synced to Prometheus",
+		"alert_name": req.AlertName,
+		"labels":     req.Labels,
+	})
+}
diff --git a/internal/prometheus_adapter/model/alert.go b/internal/prometheus_adapter/model/alert.go
index 605c5fb..566a143 100644
--- a/internal/prometheus_adapter/model/alert.go
+++ b/internal/prometheus_adapter/model/alert.go
@@ -2,19 +2,18 @@ package model
 
 // AlertRule 告警规则表 - 定义告警规则模板
 type AlertRule struct {
-	Name        string `json:"name" gorm:"type:varchar(255);primaryKey"`
-	Description string `json:"description" gorm:"type:text"`
-	Expr        string `json:"expr" gorm:"type:text;not null"`
-	Op          string `json:"op" gorm:"type:enum('>', '<', '=', '!=');not null"`
-	Severity    string `json:"severity" gorm:"type:varchar(50);not null"`
+	Name        string `json:"name" gorm:"type:varchar(255);primaryKey"`  // 主键，告警规则名称
+	Description string `json:"description" gorm:"type:text"`              // 可读标题，可拼接渲染为可读的 title
+	Expr        string `json:"expr" gorm:"type:text;not null"`            // 左侧业务指标表达式，如 sum(apitime) by (service, version)
+	Op          string `json:"op" gorm:"type:varchar(4);not null"`        // 阈值比较方式（>, <, =, !=）
+	Severity    string `json:"severity" gorm:"type:varchar(32);not null"` // 告警等级，通常进入告警的 labels.severity
 }
 
 // AlertRuleMeta 告警规则元信息表 - 存储服务级别的告警配置
 // 用于将告警规则模板实例化为具体的服务告警
 type AlertRuleMeta struct {
-	AlertName string  `json:"alert_name" gorm:"type:varchar(255);primaryKey"`
-	Labels    string  `json:"labels" gorm:"type:text"`     // JSON格式的服务标签，如：{"service":"storage-service","version":"1.0.0"}
-	Threshold float64 `json:"threshold"`                   // 告警阈值
-	WatchTime int     `json:"watch_time"`                  // 持续时间（秒），对应Prometheus的for字段
-	MatchTime string  `json:"match_time" gorm:"type:text"` // 时间范围表达式
+	AlertName string  `json:"alert_name" gorm:"type:varchar(255);index"` // 关联 alert_rules.name
+	Labels    string  `json:"labels" gorm:"type:jsonb"`                  // 适用标签，如 {"service":"s3","version":"v1"}，为空表示全局
+	Threshold float64 `json:"threshold"`                                 // 阈值（会被渲染成特定规则的 threshold metric 数值）
+	WatchTime int     `json:"watch_time"`                                // 持续时长（映射 Prometheus rule 的 for）
 }
diff --git a/internal/prometheus_adapter/model/api.go b/internal/prometheus_adapter/model/api.go
index f7f6e7f..4dc8421 100644
--- a/internal/prometheus_adapter/model/api.go
+++ b/internal/prometheus_adapter/model/api.go
@@ -42,16 +42,10 @@ type CreateAlertRuleRequest struct {
 
 // UpdateAlertRuleRequest 更新告警规则请求
 type UpdateAlertRuleRequest struct {
-	Description *string `json:"description,omitempty"`
-	Expr        *string `json:"expr,omitempty"`
-	Op          *string `json:"op,omitempty" binding:"omitempty,oneof=> < = !="`
-	Severity    *string `json:"severity,omitempty"`
-
-	// 元信息字段（可选）
-	Labels    map[string]string `json:"labels,omitempty"`
-	Threshold *float64          `json:"threshold,omitempty"`
-	WatchTime *int              `json:"watch_time,omitempty"`
-	MatchTime *string           `json:"match_time,omitempty"`
+	Description string `json:"description,omitempty"`
+	Expr        string `json:"expr,omitempty"`
+	Op          string `json:"op,omitempty" binding:"omitempty,oneof=> < = !="`
+	Severity    string `json:"severity,omitempty"`
 }
 
 // CreateAlertRuleMetaRequest 创建告警规则元信息请求
@@ -65,10 +59,10 @@ type CreateAlertRuleMetaRequest struct {
 
 // UpdateAlertRuleMetaRequest 更新告警规则元信息请求
 type UpdateAlertRuleMetaRequest struct {
-	Labels    map[string]string `json:"labels,omitempty"`
-	Threshold *float64          `json:"threshold,omitempty"`
-	WatchTime *int              `json:"watch_time,omitempty"`
-	MatchTime *string           `json:"match_time,omitempty"`
+	AlertName string  `json:"alert_name" binding:"required"`
+	Labels    string  `json:"labels" binding:"required"`
+	Threshold float64 `json:"threshold"`
+	WatchTime int     `json:"watch_time"`
 }
 
 // SyncRulesRequest 同步规则请求
diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go
index 2be8095..0e4854a 100644
--- a/internal/prometheus_adapter/service/alert_service.go
+++ b/internal/prometheus_adapter/service/alert_service.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"net/http"
 	"os"
+	"os/exec"
 	"path/filepath"
 	"strings"
 
@@ -18,24 +19,34 @@ import (
 type AlertService struct {
 	promClient    *client.PrometheusClient
 	rulesFilePath string
+	// 内存中缓存当前规则，用于增量更新
+	currentRules     []model.AlertRule
+	currentRuleMetas []model.AlertRuleMeta
 }
 
 // NewAlertService 创建告警服务
 func NewAlertService(promClient *client.PrometheusClient) *AlertService {
 	rulesFilePath := os.Getenv("PROMETHEUS_RULES_FILE")
 	if rulesFilePath == "" {
-		rulesFilePath = "/etc/prometheus/rules/alert_rules.yml"
+		// 在本地生成规则文件，用于调试和后续同步到远程容器
+		rulesFilePath = "./prometheus_rules/alert_rules.yml"
 	}
 
 	return &AlertService{
-		promClient:    promClient,
-		rulesFilePath: rulesFilePath,
+		promClient:       promClient,
+		rulesFilePath:    rulesFilePath,
+		currentRules:     []model.AlertRule{},
+		currentRuleMetas: []model.AlertRuleMeta{},
 	}
 }
 
 // SyncRulesToPrometheus 同步规则到Prometheus
 // 接收完整的规则列表，生成Prometheus规则文件并重载配置
 func (s *AlertService) SyncRulesToPrometheus(rules []model.AlertRule, ruleMetas []model.AlertRuleMeta) error {
+	// 保存到内存缓存
+	s.currentRules = rules
+	s.currentRuleMetas = ruleMetas
+
 	// 构建Prometheus规则文件
 	prometheusRules := s.buildPrometheusRules(rules, ruleMetas)
 
@@ -73,14 +84,9 @@ func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas [
 		// 查找对应的规则模板
 		var rule *model.AlertRule
 
-		// 尝试从alert_name中提取规则名
-		// 假设alert_name格式为: {rule_name}_{service}_{version} 或类似格式
-		for ruleName, r := range ruleMap {
-			if strings.HasPrefix(meta.AlertName, ruleName) {
-				rule = r
-				break
-			}
-		}
+		// 通过 alert_name 直接查找对应的规则模板
+		// AlertRuleMeta.alert_name 关联 AlertRule.name
+		rule = ruleMap[meta.AlertName]
 
 		if rule == nil {
 			log.Warn().
@@ -105,7 +111,6 @@ func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas [
 
 		// 添加severity标签
 		labels["severity"] = rule.Severity
-		labels["rule_name"] = rule.Name
 
 		// 构建表达式
 		expr := s.buildExpression(rule, &meta)
@@ -122,8 +127,9 @@ func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas [
 			forDuration = fmt.Sprintf("%ds", meta.WatchTime)
 		}
 
+		// 使用规则名作为 alert 名称，通过 labels 区分不同实例
 		promRule := model.PrometheusRule{
-			Alert:       meta.AlertName,
+			Alert:       rule.Name, // 使用规则名作为 alert 名称
 			Expr:        expr,
 			For:         forDuration,
 			Labels:      labels,
@@ -179,10 +185,6 @@ func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertR
 	if len(labels) > 0 {
 		labelMatchers := []string{}
 		for k, v := range labels {
-			// 跳过内部使用的标签
-			if k == "rule_name" {
-				continue
-			}
 			labelMatchers = append(labelMatchers, fmt.Sprintf(`%s="%s"`, k, v))
 		}
 
@@ -209,20 +211,6 @@ func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertR
 		}
 	}
 
-	// 添加时间范围
-	if meta.MatchTime != "" {
-		// 查找最后一个指标，添加时间范围
-		if !strings.Contains(expr, "[") {
-			// 简单处理：在第一个空格前添加时间范围
-			parts := strings.SplitN(expr, " ", 2)
-			if len(parts) == 2 {
-				expr = parts[0] + "[" + meta.MatchTime + "] " + parts[1]
-			} else {
-				expr = expr + "[" + meta.MatchTime + "]"
-			}
-		}
-	}
-
 	// 添加比较操作符和阈值
 	if meta.Threshold != 0 {
 		expr = fmt.Sprintf("%s %s %f", expr, rule.Op, meta.Threshold)
@@ -253,7 +241,13 @@ func (s *AlertService) writeRulesFile(rules *model.PrometheusRuleFile) error {
 	log.Info().
 		Str("file", s.rulesFilePath).
 		Int("groups", len(rules.Groups)).
-		Msg("Prometheus rules file updated")
+		Msg("Prometheus rules file updated locally")
+
+	// 同步到 Prometheus 容器
+	if err := s.syncToPrometheusContainer(); err != nil {
+		log.Warn().Err(err).Msg("Failed to sync rules to Prometheus container")
+		// 不返回错误，因为本地文件已经生成成功
+	}
 
 	return nil
 }
@@ -280,3 +274,194 @@ func (s *AlertService) reloadPrometheus() error {
 	log.Info().Msg("Prometheus configuration reloaded")
 	return nil
 }
+
+// syncToPrometheusContainer copies the local rules file into the Prometheus container via docker.
+func (s *AlertService) syncToPrometheusContainer() error {
+	// Container name comes from PROMETHEUS_CONTAINER and defaults to mock-s3-prometheus
+	containerName := os.Getenv("PROMETHEUS_CONTAINER")
+	if containerName == "" {
+		containerName = "mock-s3-prometheus"
+	}
+
+	// 1. Create the rules directory inside the container (mkdir -p succeeds if it already exists)
+	cmdMkdir := exec.Command("docker", "exec", containerName, "mkdir", "-p", "/etc/prometheus/rules")
+	if output, err := cmdMkdir.CombinedOutput(); err != nil {
+		// Best effort: keep going, but log the error itself; the docker cp below reports hard failures
+		log.Debug().Err(err).
+			Str("output", string(output)).
+			Msg("mkdir in container (may already exist)")
+	}
+
+	// 2. Copy the rules file into the container
+	cmdCopy := exec.Command("docker", "cp", s.rulesFilePath, fmt.Sprintf("%s:/etc/prometheus/rules/alert_rules.yml", containerName))
+	if output, err := cmdCopy.CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to copy rules file to container: %w, output: %s", err, string(output))
+	}
+
+	log.Info().
+		Str("container", containerName).
+		Str("file", s.rulesFilePath).
+		Msg("Rules synced to Prometheus container")
+
+	// 3. Make sure the Prometheus config declares rule_files; a failure here is only logged
+	if err := s.ensurePrometheusRuleConfig(containerName); err != nil {
+		log.Warn().Err(err).Msg("Failed to ensure Prometheus rule configuration")
+	}
+
+	return nil
+}
+
+// ensurePrometheusRuleConfig makes sure prometheus.yml in the given container has a
+// top-level rule_files entry for /etc/prometheus/rules/*.yml.
+//
+// If the key is already present nothing is changed. Otherwise the entry is appended
+// to the config file and the container is restarted so the new config is loaded.
+// An error is returned when the append or the restart command fails.
+// It only shells out to docker; nothing is written on the host.
+func (s *AlertService) ensurePrometheusRuleConfig(containerName string) error {
+	configPath := "/etc/prometheus/prometheus.yml"
+
+	// 1. Skip the edit when the config already declares rule_files.
+	// The pattern is anchored to column 0 so that a commented-out "# rule_files:"
+	// line is not mistaken for a real top-level key.
+	// NOTE(review): any non-zero exit, including a failing docker exec, is treated
+	// as "not configured" — confirm that is acceptable for a stopped container.
+	cmdCheck := exec.Command("docker", "exec", containerName, "grep", "-q", "^rule_files:", configPath)
+	if err := cmdCheck.Run(); err == nil {
+		log.Debug().Msg("Prometheus config already contains rule_files")
+		return nil
+	}
+
+	log.Info().Msg("Adding rule_files configuration to Prometheus")
+
+	// 2. Append rule_files as a new top-level key at the end of the file.
+	//
+	// Key order does not matter in a YAML mapping, so appending is equivalent to
+	// inserting after the global: section, and it leaves existing lines untouched:
+	//   - the former sed script was single-quoted twice (in its own literal and
+	//     again in the sh -c format string), so the shell split it into words
+	//     and sed never received it as one script;
+	//   - the awk fallback inserted the block after the first line below
+	//     "global:", cutting that section in two whenever it holds more than
+	//     one setting.
+	//
+	// printf expands the \n escapes itself and the format has no % directives.
+	// The leading \n keeps the comment on its own line even if the file does not
+	// end with a newline.
+	appendScript := fmt.Sprintf(
+		`printf '\n# Alert rules\nrule_files:\n  - "/etc/prometheus/rules/*.yml"\n' >> %s`,
+		configPath)
+	cmdAppend := exec.Command("docker", "exec", containerName, "sh", "-c", appendScript)
+	if output, err := cmdAppend.CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to add rule_files to config: %w, output: %s", err, string(output))
+	}
+
+	log.Info().Msg("Successfully added rule_files configuration to Prometheus")
+
+	// 3. Restart the Prometheus container so that it loads the changed config
+	cmdRestart := exec.Command("docker", "restart", containerName)
+	if output, err := cmdRestart.CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to restart Prometheus: %w, output: %s", err, string(output))
+	}
+
+	log.Info().Msg("Prometheus restarted with new configuration")
+	return nil
+}
+
+// UpdateRule updates one rule template in the in-memory cache, then regenerates and syncs.
+// Empty fields in rule keep the cached value; a rule with an unknown name is appended as-is.
+func (s *AlertService) UpdateRule(rule model.AlertRule) error {
+	found := false
+	for i := range s.currentRules {
+		cached := &s.currentRules[i]
+		if cached.Name != rule.Name {
+			continue
+		}
+		// Merge field by field: the update request omits unchanged fields, which
+		// arrive here as "", and must not wipe the stored expression or operator.
+		if rule.Description != "" {
+			cached.Description = rule.Description
+		}
+		if rule.Expr != "" {
+			cached.Expr = rule.Expr
+		}
+		if rule.Op != "" {
+			cached.Op = rule.Op
+		}
+		if rule.Severity != "" {
+			cached.Severity = rule.Severity
+		}
+		found = true
+		break
+	}
+	if !found {
+		s.currentRules = append(s.currentRules, rule)
+	}
+
+	log.Info().Str("rule", rule.Name).Int("affected_metas", s.GetAffectedMetas(rule.Name)).
+		Msg("Updating rule and affected metas")
+	return s.regenerateAndSync()
+}
+
+// UpdateRuleMeta replaces or appends one rule meta in the in-memory cache, then regenerates and syncs.
+// A meta record is identified by the pair alert_name + labels.
+func (s *AlertService) UpdateRuleMeta(meta model.AlertRuleMeta) error {
+	// Look for an existing record and overwrite it
+	found := false
+	for i, m := range s.currentRuleMetas {
+		// Labels are compared as raw strings, so the JSON text must match exactly
+		if m.AlertName == meta.AlertName && m.Labels == meta.Labels {
+			s.currentRuleMetas[i] = meta
+			found = true
+			break
+		}
+	}
+
+	if !found {
+		// No matching record: add the meta as a new one
+		s.currentRuleMetas = append(s.currentRuleMetas, meta)
+	}
+
+	log.Info().
+		Str("alert_name", meta.AlertName).
+		Str("labels", meta.Labels).
+		Msg("Updating rule meta")
+
+	// Regenerate the rules from the updated cache and sync them
+	return s.regenerateAndSync()
+}
+
+// regenerateAndSync rebuilds the Prometheus rules from the cached rules and metas and syncs them
+func (s *AlertService) regenerateAndSync() error {
+	// Build the Prometheus rule file structure
+	prometheusRules := s.buildPrometheusRules(s.currentRules, s.currentRuleMetas)
+
+	// Write the rules file
+	if err := s.writeRulesFile(prometheusRules); err != nil {
+		return fmt.Errorf("failed to write rules file: %w", err)
+	}
+
+	// Ask Prometheus to reload its configuration
+	if err := s.reloadPrometheus(); err != nil {
+		log.Warn().Err(err).Msg("Failed to reload Prometheus, rules file has been updated")
+		// Not returned as an error: the rules file itself was written successfully
+	}
+
+	log.Info().
+		Int("rules_count", len(s.currentRules)).
+		Int("metas_count", len(s.currentRuleMetas)).
+		Msg("Rules regenerated and synced to Prometheus")
+
+	return nil
+}
+
+// GetAffectedMetas returns how many cached metas have alert_name equal to ruleName
+func (s *AlertService) GetAffectedMetas(ruleName string) int {
+	count := 0
+	for _, meta := range s.currentRuleMetas {
+		if meta.AlertName == ruleName {
+			count++
+		}
+	}
+	return count
+}
diff --git a/internal/prometheus_adapter/test_alert_update.sh b/internal/prometheus_adapter/test_alert_update.sh
new file mode 100755
index 0000000..a8af0ea
--- /dev/null
+++ b/internal/prometheus_adapter/test_alert_update.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+# 测试增量更新告警规则功能
+
+BASE_URL="http://localhost:8080"
+
+echo "=== 测试增量更新告警规则 ==="
+
+# 1. 先进行全量同步，创建初始规则
+echo -e "\n1. 全量同步规则..."
+curl -X POST ${BASE_URL}/v1/alert-rules/sync \
+  -H "Content-Type: application/json" \
+  -d '{
+    "rules": [
+      {
+        "name": "high_cpu_usage",
+        "description": "CPU使用率过高告警",
+        "expr": "system_cpu_usage_percent",
+        "op": ">",
+        "severity": "warning"
+      },
+      {
+        "name": "high_memory_usage",
+        "description": "内存使用率过高告警",
+        "expr": "system_memory_usage_percent",
+        "op": ">",
+        "severity": "warning"
+      }
+    ],
+    "rule_metas": [
+      {
+        "alert_name": "high_cpu_usage",
+        "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}",
+        "threshold": 80,
+        "watch_time": 300
+      },
+      {
+        "alert_name": "high_cpu_usage",
+        "labels": "{\"service\":\"metadata-service\",\"version\":\"1.0.0\"}",
+        "threshold": 85,
+        "watch_time": 300
+      },
+      {
+        "alert_name": "high_memory_usage",
+        "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}",
+        "threshold": 90,
+        "watch_time": 600
+      }
+    ]
+  }' | jq .
+
+sleep 2
+
+# 2. 更新单个规则模板
+echo -e "\n2. 更新规则模板 high_cpu_usage..."
+curl -X PUT ${BASE_URL}/v1/alert-rules/high_cpu_usage \
+  -H "Content-Type: application/json" \
+  -d '{
+    "description": "CPU使用率异常告警（更新后）",
+    "expr": "avg(system_cpu_usage_percent[5m])",
+    "op": ">=",
+    "severity": "critical"
+  }' | jq .
+
+sleep 2
+
+# 3. 更新单个规则元信息
+echo -e "\n3. 更新规则元信息..."
+curl -X PUT ${BASE_URL}/v1/alert-rules/meta \
+  -H "Content-Type: application/json" \
+  -d '{
+    "alert_name": "high_cpu_usage",
+    "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}",
+    "threshold": 75,
+    "watch_time": 600
+  }' | jq .
+
+sleep 2
+
+# 4. 添加新的元信息
+echo -e "\n4. 添加新的元信息..."
+curl -X PUT ${BASE_URL}/v1/alert-rules/meta \
+  -H "Content-Type: application/json" \
+  -d '{
+    "alert_name": "high_memory_usage",
+    "labels": "{\"service\":\"queue-service\",\"version\":\"2.0.0\"}",
+    "threshold": 95,
+    "watch_time": 300
+  }' | jq .
+
+echo -e "\n=== 测试完成 ==="
\ No newline at end of file
diff --git a/scripts/prometheus_adapter/build.sh b/scripts/prometheus_adapter/build.sh
new file mode 100755
index 0000000..ec5c08a
--- /dev/null
+++ b/scripts/prometheus_adapter/build.sh
@@ -0,0 +1,180 @@
+#!/bin/bash
+
+# Prometheus Adapter 打包脚本
+# 将编译产物和必要文件打包到 build 目录
+
+set -e
+
+# 颜色输出
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# 打印日志函数
+log_info() {
+    echo -e "${GREEN}[INFO]${NC} $1"
+}
+
+log_error() {
+    echo -e "${RED}[ERROR]${NC} $1"
+}
+
+log_warn() {
+    echo -e "${YELLOW}[WARN]${NC} $1"
+}
+
+# 项目根目录
+PROJECT_ROOT=$(cd "$(dirname "$0")"/../.. && pwd)
+cd "$PROJECT_ROOT"
+
+# 配置
+APP_NAME="prometheus_adapter"
+BUILD_DIR="build/${APP_NAME}"
+VERSION=$(git describe --tags --always --dirty 2>/dev/null || echo "dev")
+BUILD_TIME=$(date -u '+%Y-%m-%d_%H:%M:%S')
+GOOS=${GOOS:-linux}
+GOARCH=${GOARCH:-amd64}
+
+log_info "开始构建 ${APP_NAME}"
+log_info "版本: ${VERSION}"
+log_info "构建时间: ${BUILD_TIME}"
+log_info "目标系统: ${GOOS}/${GOARCH}"
+
+# 清理旧的构建目录
+if [ -d "$BUILD_DIR" ]; then
+    log_warn "清理旧的构建目录..."
+    rm -rf "$BUILD_DIR"
+fi
+
+# 创建构建目录
+log_info "创建构建目录..."
+mkdir -p "$BUILD_DIR/bin"
+mkdir -p "$BUILD_DIR/docs"
+mkdir -p "$BUILD_DIR/scripts"
+
+# 编译二进制文件
+log_info "编译 ${APP_NAME}..."
+LDFLAGS="-X main.Version=${VERSION} -X main.BuildTime=${BUILD_TIME}"
+CGO_ENABLED=0 GOOS=$GOOS GOARCH=$GOARCH go build \
+    -ldflags "$LDFLAGS" \
+    -o "$BUILD_DIR/bin/${APP_NAME}" \
+    "./cmd/${APP_NAME}"
+
+if [ $? -ne 0 ]; then
+    log_error "编译失败"
+    exit 1
+fi
+
+# 复制文档
+log_info "复制文档..."
+if [ -f "docs/${APP_NAME}/README.md" ]; then
+    cp "docs/${APP_NAME}/README.md" "$BUILD_DIR/docs/"
+fi
+
+# 复制测试脚本
+log_info "复制脚本..."
+if [ -f "internal/${APP_NAME}/test_alert_update.sh" ]; then
+    cp "internal/${APP_NAME}/test_alert_update.sh" "$BUILD_DIR/scripts/"
+    chmod +x "$BUILD_DIR/scripts/test_alert_update.sh"
+fi
+
+# 创建启动脚本
+log_info "创建启动脚本..."
+cat > "$BUILD_DIR/start.sh" << 'EOF'
+#!/bin/bash
+
+# Prometheus Adapter 启动脚本
+
+# 默认配置
+PROMETHEUS_URL=${PROMETHEUS_URL:-"http://localhost:9090"}
+PORT=${PORT:-8080}
+LOG_LEVEL=${LOG_LEVEL:-"info"}
+
+# 获取脚本所在目录
+SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
+BIN_PATH="$SCRIPT_DIR/bin/prometheus_adapter"
+
+# 检查二进制文件
+if [ ! -f "$BIN_PATH" ]; then
+    echo "错误: 找不到可执行文件 $BIN_PATH"
+    exit 1
+fi
+
+# 启动参数
+ARGS=""
+ARGS="$ARGS --prometheus-url=$PROMETHEUS_URL"
+ARGS="$ARGS --port=$PORT"
+ARGS="$ARGS --log-level=$LOG_LEVEL"
+
+echo "启动 Prometheus Adapter..."
+echo "Prometheus URL: $PROMETHEUS_URL"
+echo "监听端口: $PORT"
+echo "日志级别: $LOG_LEVEL"
+
+# 启动服务
+exec "$BIN_PATH" $ARGS
+EOF
+chmod +x "$BUILD_DIR/start.sh"
+
+# 创建停止脚本
+log_info "创建停止脚本..."
+cat > "$BUILD_DIR/stop.sh" << 'EOF'
+#!/bin/bash
+
+# Prometheus Adapter 停止脚本
+
+APP_NAME="prometheus_adapter"
+
+# 查找进程
+PID=$(ps aux | grep -v grep | grep "$APP_NAME" | awk '{print $2}')
+
+if [ -z "$PID" ]; then
+    echo "没有找到运行中的 $APP_NAME 进程"
+    exit 0
+fi
+
+echo "停止 $APP_NAME (PID: $PID)..."
+kill -TERM $PID
+
+# 等待进程退出
+sleep 2
+
+# 检查是否还在运行
+if ps -p $PID > /dev/null 2>&1; then
+    echo "强制停止进程..."
+    kill -KILL $PID
+fi
+
+echo "$APP_NAME 已停止"
+EOF
+chmod +x "$BUILD_DIR/stop.sh"
+
+# 创建版本信息文件
+log_info "创建版本信息..."
+cat > "$BUILD_DIR/VERSION" << EOF
+Application: ${APP_NAME}
+Version: ${VERSION}
+Build Time: ${BUILD_TIME}
+Build OS/Arch: ${GOOS}/${GOARCH}
+EOF
+
+# 打包成 tar.gz
+ARCHIVE_NAME="${APP_NAME}_${VERSION}_${GOOS}_${GOARCH}.tar.gz"
+log_info "创建归档文件: $ARCHIVE_NAME"
+cd build
+tar -czf "$ARCHIVE_NAME" "$APP_NAME"
+cd ..
+
+# 输出构建信息
+log_info "构建成功!"
+echo ""
+echo "构建产物:"
+echo "  - 目录: $BUILD_DIR"
+echo "  - 归档: build/$ARCHIVE_NAME"
+echo ""
+echo "文件列表:"
+ls -lah "$BUILD_DIR/"
+echo ""
+echo "归档大小:"
+ls -lah "build/$ARCHIVE_NAME"
\ No newline at end of file
diff --git a/scripts/prometheus_adapter/deploy.sh b/scripts/prometheus_adapter/deploy.sh
new file mode 100755
index 0000000..85bddf7
--- /dev/null
+++ b/scripts/prometheus_adapter/deploy.sh
@@ -0,0 +1,350 @@
+#!/bin/bash
+
+# Prometheus Adapter 部署脚本
+# 将打包好的文件解压并部署到指定目录
+
+set -e
+
+# 颜色输出
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# 打印日志函数
+log_info() {
+    echo -e "${GREEN}[INFO]${NC} $1"
+}
+
+log_error() {
+    echo -e "${RED}[ERROR]${NC} $1"
+}
+
+log_warn() {
+    echo -e "${YELLOW}[WARN]${NC} $1"
+}
+
+log_debug() {
+    echo -e "${BLUE}[DEBUG]${NC} $1"
+}
+
+# 显示使用帮助
+show_usage() {
+    cat << EOF
+使用方法:
+    $0 [选项] <归档文件>
+
+选项:
+    -d, --deploy-dir DIR    指定部署目录 (默认: /home/qboxserver/zeroops_prometheus_adapter)
+    -b, --backup            部署前备份现有目录
+    -s, --start             部署后自动启动服务
+    -r, --restart           如果服务已运行则重启
+    -f, --force             强制部署，不询问确认
+    -h, --help              显示此帮助信息
+
+示例:
+    $0 prometheus_adapter_v1.0.0_linux_amd64.tar.gz
+    $0 -d /opt/prometheus_adapter -b -s prometheus_adapter.tar.gz
+    $0 --backup --restart prometheus_adapter.tar.gz
+
+EOF
+    exit 0
+}
+
+# 默认配置
+DEPLOY_DIR="/home/qboxserver/zeroops_prometheus_adapter"
+BACKUP=false
+START_SERVICE=false
+RESTART_SERVICE=false
+FORCE_DEPLOY=false
+ARCHIVE_FILE=""
+
+# 解析命令行参数
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -d|--deploy-dir)
+            DEPLOY_DIR="$2"
+            shift 2
+            ;;
+        -b|--backup)
+            BACKUP=true
+            shift
+            ;;
+        -s|--start)
+            START_SERVICE=true
+            shift
+            ;;
+        -r|--restart)
+            RESTART_SERVICE=true
+            shift
+            ;;
+        -f|--force)
+            FORCE_DEPLOY=true
+            shift
+            ;;
+        -h|--help)
+            show_usage
+            ;;
+        *)
+            if [ -z "$ARCHIVE_FILE" ]; then
+                ARCHIVE_FILE="$1"
+            else
+                log_error "未知参数: $1"
+                show_usage
+            fi
+            shift
+            ;;
+    esac
+done
+
+# 检查归档文件参数
+if [ -z "$ARCHIVE_FILE" ]; then
+    log_error "请指定要部署的归档文件"
+    show_usage
+fi
+
+# 检查归档文件是否存在
+if [ ! -f "$ARCHIVE_FILE" ]; then
+    log_error "找不到归档文件: $ARCHIVE_FILE"
+    exit 1
+fi
+
+# 获取归档文件的绝对路径
+ARCHIVE_FILE=$(realpath "$ARCHIVE_FILE")
+
+log_info "部署配置:"
+log_info "  归档文件: $ARCHIVE_FILE"
+log_info "  部署目录: $DEPLOY_DIR"
+log_info "  备份现有: $BACKUP"
+log_info "  自动启动: $START_SERVICE"
+log_info "  重启服务: $RESTART_SERVICE"
+
+# 确认部署
+if [ "$FORCE_DEPLOY" = false ]; then
+    echo -n "确认部署? (y/N): "
+    read -r CONFIRM
+    if [ "$CONFIRM" != "y" ] && [ "$CONFIRM" != "Y" ]; then
+        log_warn "部署已取消"
+        exit 0
+    fi
+fi
+
+# Print the PID of one running adapter process, or nothing when there is none
+check_running_service() {
+    local pid
+    # -f matches full command lines; accept only an executable named exactly prometheus_adapter
+    pid=$(pgrep -f '(^|/)prometheus_adapter( |$)' | head -n 1)
+    if [ -n "$pid" ]; then echo "$pid"; fi
+}
+
+# 停止运行中的服务
+stop_service() {
+    local pid=$1
+    if [ -n "$pid" ]; then
+        log_warn "停止运行中的服务 (PID: $pid)..."
+        kill -TERM "$pid" 2>/dev/null || true
+
+        # 等待进程退出
+        local count=0
+        while [ $count -lt 10 ] && ps -p "$pid" > /dev/null 2>&1; do
+            sleep 1
+            count=$((count + 1))
+        done
+
+        # 如果还没退出，强制停止
+        if ps -p "$pid" > /dev/null 2>&1; then
+            log_warn "强制停止进程..."
+            kill -KILL "$pid" 2>/dev/null || true
+        fi
+
+        log_info "服务已停止"
+    fi
+}
+
+# 检查运行中的服务
+RUNNING_PID=$(check_running_service)
+if [ -n "$RUNNING_PID" ]; then
+    log_warn "检测到运行中的 prometheus_adapter 服务 (PID: $RUNNING_PID)"
+    if [ "$RESTART_SERVICE" = true ] || [ "$FORCE_DEPLOY" = true ]; then
+        stop_service "$RUNNING_PID"
+    else
+        log_error "服务正在运行，请先停止服务或使用 -r/--restart 选项"
+        exit 1
+    fi
+fi
+
+# 备份现有目录
+if [ "$BACKUP" = true ] && [ -d "$DEPLOY_DIR" ]; then
+    BACKUP_DIR="${DEPLOY_DIR}_backup_$(date +%Y%m%d_%H%M%S)"
+    log_info "备份现有目录到: $BACKUP_DIR"
+
+    # 需要sudo权限
+    if [ -w "$(dirname "$DEPLOY_DIR")" ]; then
+        mv "$DEPLOY_DIR" "$BACKUP_DIR"
+    else
+        log_warn "需要管理员权限来备份目录"
+        sudo mv "$DEPLOY_DIR" "$BACKUP_DIR"
+    fi
+fi
+
+# 创建临时解压目录
+TEMP_DIR=$(mktemp -d)
+log_info "创建临时目录: $TEMP_DIR"
+
+# 解压归档文件
+log_info "解压归档文件..."
+tar -xzf "$ARCHIVE_FILE" -C "$TEMP_DIR"
+
+# 查找解压后的目录
+EXTRACTED_DIR=$(find "$TEMP_DIR" -maxdepth 1 -type d -name "prometheus_adapter" | head -n 1)
+if [ -z "$EXTRACTED_DIR" ]; then
+    log_error "解压失败：找不到 prometheus_adapter 目录"
+    rm -rf "$TEMP_DIR"
+    exit 1
+fi
+
+# 创建部署目录（如果需要sudo）
+log_info "创建部署目录..."
+if [ -w "$(dirname "$DEPLOY_DIR")" ]; then
+    mkdir -p "$(dirname "$DEPLOY_DIR")"
+else
+    log_warn "需要管理员权限来创建部署目录"
+    sudo mkdir -p "$(dirname "$DEPLOY_DIR")"
+fi
+
+# 移动到部署目录
+log_info "部署到: $DEPLOY_DIR"
+if [ -w "$(dirname "$DEPLOY_DIR")" ]; then
+    if [ -d "$DEPLOY_DIR" ]; then
+        rm -rf "$DEPLOY_DIR"
+    fi
+    mv "$EXTRACTED_DIR" "$DEPLOY_DIR"
+else
+    log_warn "需要管理员权限来部署"
+    if [ -d "$DEPLOY_DIR" ]; then
+        sudo rm -rf "$DEPLOY_DIR"
+    fi
+    sudo mv "$EXTRACTED_DIR" "$DEPLOY_DIR"
+fi
+
+# 设置权限
+log_info "设置文件权限..."
+if [ -w "$DEPLOY_DIR" ]; then
+    chmod +x "$DEPLOY_DIR/bin/prometheus_adapter"
+    chmod +x "$DEPLOY_DIR/start.sh"
+    chmod +x "$DEPLOY_DIR/stop.sh"
+    [ -f "$DEPLOY_DIR/scripts/test_alert_update.sh" ] && chmod +x "$DEPLOY_DIR/scripts/test_alert_update.sh"
+else
+    sudo chmod +x "$DEPLOY_DIR/bin/prometheus_adapter"
+    sudo chmod +x "$DEPLOY_DIR/start.sh"
+    sudo chmod +x "$DEPLOY_DIR/stop.sh"
+    [ -f "$DEPLOY_DIR/scripts/test_alert_update.sh" ] && sudo chmod +x "$DEPLOY_DIR/scripts/test_alert_update.sh"
+fi
+
+# 清理临时目录
+rm -rf "$TEMP_DIR"
+
+# 显示部署信息
+log_info "部署成功!"
+echo ""
+echo "部署信息:"
+echo "  目录: $DEPLOY_DIR"
+echo ""
+echo "版本信息:"
+if [ -f "$DEPLOY_DIR/VERSION" ]; then
+    cat "$DEPLOY_DIR/VERSION"
+else
+    echo "  无版本信息"
+fi
+echo ""
+echo "文件列表:"
+ls -lah "$DEPLOY_DIR/"
+
+# 创建systemd服务文件（可选）
+create_systemd_service() {
+    local service_name="prometheus-adapter"
+    local service_file="/etc/systemd/system/${service_name}.service"
+
+    log_info "创建 systemd 服务..."
+
+    cat << EOF | sudo tee "$service_file" > /dev/null
+[Unit]
+Description=Prometheus Adapter Service
+After=network.target
+
+[Service]
+Type=simple
+User=qboxserver
+Group=qboxserver
+WorkingDirectory=$DEPLOY_DIR
+Environment="PROMETHEUS_URL=http://localhost:9090"
+Environment="PORT=8080"
+Environment="LOG_LEVEL=info"
+ExecStart=$DEPLOY_DIR/bin/prometheus_adapter
+ExecStop=$DEPLOY_DIR/stop.sh
+Restart=on-failure
+RestartSec=10
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+    sudo systemctl daemon-reload
+    log_info "Systemd 服务已创建: ${service_name}.service"
+    echo ""
+    echo "可以使用以下命令管理服务:"
+    echo "  启动: sudo systemctl start ${service_name}"
+    echo "  停止: sudo systemctl stop ${service_name}"
+    echo "  重启: sudo systemctl restart ${service_name}"
+    echo "  状态: sudo systemctl status ${service_name}"
+    echo "  开机自启: sudo systemctl enable ${service_name}"
+}
+
+# 询问是否创建systemd服务
+if [ "$FORCE_DEPLOY" = false ]; then
+    echo ""
+    echo -n "是否创建 systemd 服务? (y/N): "
+    read -r CREATE_SERVICE
+    if [ "$CREATE_SERVICE" = "y" ] || [ "$CREATE_SERVICE" = "Y" ]; then
+        create_systemd_service
+    fi
+fi
+
+# 启动服务
+if [ "$START_SERVICE" = true ] || [ "$RESTART_SERVICE" = true ]; then
+    log_info "启动服务..."
+
+    # 设置环境变量
+    export PROMETHEUS_URL="${PROMETHEUS_URL:-http://localhost:9090}"
+    export PORT="${PORT:-8080}"
+    export LOG_LEVEL="${LOG_LEVEL:-info}"
+
+    # 启动服务
+    cd "$DEPLOY_DIR"
+    nohup ./start.sh > prometheus_adapter.log 2>&1 &
+
+    # 等待服务启动
+    sleep 2
+
+    # 检查是否启动成功
+    NEW_PID=$(check_running_service)
+    if [ -n "$NEW_PID" ]; then
+        log_info "服务已启动 (PID: $NEW_PID)"
+        echo ""
+        echo "查看日志: tail -f $DEPLOY_DIR/prometheus_adapter.log"
+    else
+        log_error "服务启动失败，请检查日志"
+        exit 1
+    fi
+else
+    echo ""
+    echo "手动启动服务:"
+    echo "  cd $DEPLOY_DIR"
+    echo "  ./start.sh"
+    echo ""
+    echo "或使用后台模式:"
+    echo "  nohup ./start.sh > prometheus_adapter.log 2>&1 &"
+fi
+
+log_info "部署完成!"
\ No newline at end of file

From f6c1ad1888ca9343dc24738e56cce7697d9fdb5d Mon Sep 17 00:00:00 2001
From: Ding <shdnj@qq.com>
Date: Wed, 24 Sep 2025 22:35:22 +0800
Subject: [PATCH 08/18] =?UTF-8?q?feat(observability):=20=E4=BF=AE=E5=A4=8D?=
 =?UTF-8?q?Prometheus=E5=91=8A=E8=AD=A6=E8=A7=84=E5=88=99=E6=94=AF?=
 =?UTF-8?q?=E6=8C=81=E5=B9=B6=E4=BC=98=E5=8C=96=E8=A7=84=E5=88=99=E5=90=8C?=
 =?UTF-8?q?=E6=AD=A5=E6=9C=BA=E5=88=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 在prometheus.yml中配置告警规则文件路径
- 修改docker-compose.yml挂载规则目录
- 重构AlertService，移除本地文件存储，直接写入容器
- 添加容器内规则文件写入的容错机制
---
 .../service/alert_service.go                  | 69 ++++++++++---------
 mock/s3/deployments/docker-compose.yml        |  1 +
 .../deployments/observability/prometheus.yml  |  4 ++
 3 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go
index 0e4854a..f9fe908 100644
--- a/internal/prometheus_adapter/service/alert_service.go
+++ b/internal/prometheus_adapter/service/alert_service.go
@@ -6,7 +6,6 @@ import (
 	"net/http"
 	"os"
 	"os/exec"
-	"path/filepath"
 	"strings"
 
 	"github.com/qiniu/zeroops/internal/prometheus_adapter/client"
@@ -17,8 +16,7 @@ import (
 
 // AlertService 告警服务 - 仅负责与Prometheus交互，不存储规则
 type AlertService struct {
-	promClient    *client.PrometheusClient
-	rulesFilePath string
+	promClient *client.PrometheusClient
 	// 内存中缓存当前规则，用于增量更新
 	currentRules     []model.AlertRule
 	currentRuleMetas []model.AlertRuleMeta
@@ -26,15 +24,8 @@ type AlertService struct {
 
 // NewAlertService 创建告警服务
 func NewAlertService(promClient *client.PrometheusClient) *AlertService {
-	rulesFilePath := os.Getenv("PROMETHEUS_RULES_FILE")
-	if rulesFilePath == "" {
-		// 在本地生成规则文件，用于调试和后续同步到远程容器
-		rulesFilePath = "./prometheus_rules/alert_rules.yml"
-	}
-
 	return &AlertService{
 		promClient:       promClient,
-		rulesFilePath:    rulesFilePath,
 		currentRules:     []model.AlertRule{},
 		currentRuleMetas: []model.AlertRuleMeta{},
 	}
@@ -221,34 +212,50 @@ func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertR
 
 // writeRulesFile 写入规则文件
 func (s *AlertService) writeRulesFile(rules *model.PrometheusRuleFile) error {
-	// 确保目录存在
-	dir := filepath.Dir(s.rulesFilePath)
-	if err := os.MkdirAll(dir, 0755); err != nil {
-		return fmt.Errorf("failed to create rules directory: %w", err)
-	}
-
 	// 序列化为YAML
 	data, err := yaml.Marshal(rules)
 	if err != nil {
 		return fmt.Errorf("failed to marshal rules: %w", err)
 	}
 
-	// 写入文件
-	if err := os.WriteFile(s.rulesFilePath, data, 0644); err != nil {
-		return fmt.Errorf("failed to write rules file: %w", err)
+	// 获取容器名称
+	containerName := os.Getenv("PROMETHEUS_CONTAINER")
+	if containerName == "" {
+		containerName = "mock-s3-prometheus"
 	}
 
-	log.Info().
-		Str("file", s.rulesFilePath).
-		Int("groups", len(rules.Groups)).
-		Msg("Prometheus rules file updated locally")
+	// Write straight into the container's rules directory. `docker exec -i` streams the YAML
+	// over stdin, so the payload is never parsed by the shell and is not limited by argv size.
+	cmd := exec.Command("docker", "exec", "-i", containerName, "sh", "-c", "cat > /etc/prometheus/rules/alert_rules.yml")
+	cmd.Stdin = strings.NewReader(string(data))
 
-	// 同步到 Prometheus 容器
-	if err := s.syncToPrometheusContainer(); err != nil {
-		log.Warn().Err(err).Msg("Failed to sync rules to Prometheus container")
-		// 不返回错误，因为本地文件已经生成成功
+	if output, err := cmd.CombinedOutput(); err != nil {
+		// 如果直接写入容器失败，尝试使用临时文件+docker cp
+		log.Warn().
+			Err(err).
+			Str("output", string(output)).
+			Msg("Failed to write directly to container, trying docker cp")
+
+		// Stage the YAML in a temp file on the host.
+		tmpFile := "/tmp/prometheus_alert_rules.yml"
+		if err := os.WriteFile(tmpFile, data, 0644); err != nil {
+			return fmt.Errorf("failed to write temp rules file: %w", err)
+		}
+		// Deferred so the temp file is removed on every return path,
+		// including a failed copy below.
+		defer os.Remove(tmpFile)
+
+		// Copy the staged file into the container with docker cp.
+		if err := s.syncRuleFileToContainer(tmpFile); err != nil {
+			return fmt.Errorf("failed to sync to container: %w", err)
+		}
 	}
 
+	log.Info().
+		Str("container", containerName).
+		Int("groups", len(rules.Groups)).
+		Msg("Prometheus rules file updated in container")
+
 	return nil
 }
 
@@ -275,8 +282,8 @@ func (s *AlertService) reloadPrometheus() error {
 	return nil
 }
 
-// syncToPrometheusContainer 同步规则文件到本地 Prometheus 容器
-func (s *AlertService) syncToPrometheusContainer() error {
+// syncRuleFileToContainer 同步规则文件到容器
+func (s *AlertService) syncRuleFileToContainer(filePath string) error {
 	// 获取容器名称，默认为 mock-s3-prometheus
 	containerName := os.Getenv("PROMETHEUS_CONTAINER")
 	if containerName == "" {
@@ -293,14 +300,14 @@ func (s *AlertService) syncToPrometheusContainer() error {
 	}
 
 	// 2. 将规则文件拷贝到容器内
-	cmdCopy := exec.Command("docker", "cp", s.rulesFilePath, fmt.Sprintf("%s:/etc/prometheus/rules/alert_rules.yml", containerName))
+	cmdCopy := exec.Command("docker", "cp", filePath, fmt.Sprintf("%s:/etc/prometheus/rules/alert_rules.yml", containerName))
 	if output, err := cmdCopy.CombinedOutput(); err != nil {
 		return fmt.Errorf("failed to copy rules file to container: %w, output: %s", err, string(output))
 	}
 
 	log.Info().
 		Str("container", containerName).
-		Str("file", s.rulesFilePath).
+		Str("file", filePath).
 		Msg("Rules synced to Prometheus container")
 
 	// 3. 确保 Prometheus 配置包含 rule_files
diff --git a/mock/s3/deployments/docker-compose.yml b/mock/s3/deployments/docker-compose.yml
index 61f13cd..377ec3d 100644
--- a/mock/s3/deployments/docker-compose.yml
+++ b/mock/s3/deployments/docker-compose.yml
@@ -84,6 +84,7 @@ services:
     volumes:
       - prometheus-data:/prometheus
       - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./prometheus/rules:/etc/prometheus/rules:rw
     command:
       - '--config.file=/etc/prometheus/prometheus.yml'
       - '--storage.tsdb.path=/prometheus'
diff --git a/mock/s3/deployments/observability/prometheus.yml b/mock/s3/deployments/observability/prometheus.yml
index 2bcabb8..35fb014 100644
--- a/mock/s3/deployments/observability/prometheus.yml
+++ b/mock/s3/deployments/observability/prometheus.yml
@@ -5,6 +5,10 @@ global:
     cluster: mock-s3
     environment: docker
 
+# 告警规则文件
+rule_files:
+  - "/etc/prometheus/rules/*.yml"
+
 scrape_configs:
   # Prometheus自身的指标
   - job_name: 'prometheus'

From 0b636a1a78cc9b83649d721d49976d6e1eb454c6 Mon Sep 17 00:00:00 2001
From: Ding <shdnj@qq.com>
Date: Thu, 25 Sep 2025 11:22:24 +0800
Subject: [PATCH 09/18] =?UTF-8?q?refactor(prometheus=5Fadapter):=20?=
 =?UTF-8?q?=E9=87=8D=E6=9E=84=E5=91=8A=E8=AD=A6=E8=A7=84=E5=88=99=E7=AE=A1?=
 =?UTF-8?q?=E7=90=86API=E5=92=8C=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 将watch_time字段从AlertRuleMeta移到AlertRule中
- 移除全量同步接口，改为增量更新方式
- 实现批量更新规则元信息的API
- 重构服务层代码结构，提高可维护性
- 更新文档
---
 docs/prometheus_adapter/README.md             |  72 ++---
 internal/prometheus_adapter/api/alert_api.go  |  83 +++---
 internal/prometheus_adapter/model/alert.go    |   2 +-
 internal/prometheus_adapter/model/api.go      |  17 +-
 .../service/alert_service.go                  | 249 ++++++++----------
 5 files changed, 182 insertions(+), 241 deletions(-)

diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md
index 0a92312..650b6f8 100644
--- a/docs/prometheus_adapter/README.md
+++ b/docs/prometheus_adapter/README.md
@@ -10,7 +10,7 @@
 - 架构设计
 - API 参考
   - 指标查询
-  - 告警规则同步
+  - 告警规则管理
 - Alertmanager 集成
 - 支持的服务
 - 错误码
@@ -36,7 +36,7 @@ internal/prometheus_adapter/
 ├── api/                   # API 层，处理 HTTP 请求
 │   ├── api.go            # API 基础结构和初始化
 │   ├── metric_api.go     # 指标相关的 API 处理器
-│   └── alert_api.go      # 告警规则同步 API 处理器
+│   └── alert_api.go      # 告警规则管理 API 处理器
 ├── service/               # 业务逻辑层
 │   ├── metric_service.go # 指标查询服务实现
 │   └── alert_service.go  # 告警规则同步服务实现
@@ -137,42 +137,9 @@ internal/prometheus_adapter/
 }
 ```
 
-### 告警规则同步
+### 告警规则管理
 
-#### 1. 全量同步规则
-- 方法与路径：`POST /v1/alert-rules/sync`
-- 功能：接收监控告警模块发送的完整规则列表，生成 Prometheus 规则文件并触发重载（全量同步）
-- 请求体示例：
-```json
-{
-  "rules": [
-    {
-      "name": "high_cpu_usage",
-      "description": "CPU使用率过高告警",
-      "expr": "system_cpu_usage_percent",
-      "op": ">",
-      "severity": "warning"
-    }
-  ],
-  "rule_metas": [
-    {
-      "alert_name": "high_cpu_usage",  // 与规则模板的name字段保持一致
-      "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}",
-      "threshold": 90,
-      "watch_time": 300
-    }
-  ]
-}
-```
-- 响应示例：
-```json
-{
-  "status": "success",
-  "message": "Rules synced to Prometheus"
-}
-```
-
-#### 2. 更新单个规则模板
+#### 1. 更新单个规则模板
 - 方法与路径：`PUT /v1/alert-rules/:rule_name`
 - 功能：更新指定的告警规则模板，系统会自动查找所有使用该规则的元信息并重新生成 Prometheus 规则
 - 路径参数：
@@ -183,7 +150,8 @@ internal/prometheus_adapter/
   "description": "CPU使用率异常告警（更新后）",
   "expr": "avg(system_cpu_usage_percent)",
   "op": ">=",
-  "severity": "critical"
+  "severity": "critical",
+  "watch_time": 300
 }
 ```
 - 响应示例：
@@ -195,25 +163,33 @@ internal/prometheus_adapter/
 }
 ```
 
-#### 3. 更新单个规则元信息
-- 方法与路径：`PUT /v1/alert-rules/meta`
-- 功能：更新指定规则的元信息，系统会根据对应的规则模板重新生成 Prometheus 规则
+#### 2. 批量更新规则元信息
+- 方法与路径：`PUT /v1/alert-rules-meta/:rule_name`
+- 功能：批量更新指定规则的元信息，系统会根据对应的规则模板重新生成 Prometheus 规则
+- 路径参数：
+  - `rule_name`：规则名称（如 `high_cpu_usage`）
 - 请求体示例：
 ```json
 {
-  "rule_name": "high_cpu_usage",  // 必填，对应规则模板的name
-  "labels": "{\"service\":\"storage-service\",\"version\":\"2.0.0\"}",  // 必填，用于唯一标识
-  "threshold": 85,
-  "watch_time": 600
+  "metas": [
+    {
+      "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}",  // 必填，用于唯一标识
+      "threshold": 85
+    },
+    {
+      "labels": "{\"service\":\"storage-service\",\"version\":\"2.0.0\"}",  // 必填，用于唯一标识
+      "threshold": 90
+    }
+  ]
 }
 ```
 - 响应示例：
 ```json
 {
   "status": "success",
-  "message": "Rule meta updated and synced to Prometheus",
+  "message": "Rule metas updated and synced to Prometheus",
   "rule_name": "high_cpu_usage",
-  "labels": "{\"service\":\"storage-service\",\"version\":\"2.0.0\"}"
+  "updated_count": 2
 }
 ```
 
@@ -232,11 +208,11 @@ internal/prometheus_adapter/
   - `expr`：PromQL 表达式，如 `sum(apitime) by (service, version)`，可包含时间范围
   - `op`：比较操作符（`>`, `<`, `=`, `!=`）
   - `severity`：告警等级，通常进入告警的 labels.severity
+  - `watch_time`：持续时间（秒），对应 Prometheus 的 `for` 字段
 - **AlertRuleMeta（元信息）**：
   - `alert_name`：关联的规则名称（对应 alert_rules.name）
   - `labels`：JSON 格式的标签，用于筛选特定服务（如 `{"service":"s3","version":"v1"}`）
   - `threshold`：告警阈值
-  - `watch_time`：持续时间（秒），对应 Prometheus 的 `for` 字段
 
 #### 增量更新说明
 - **增量更新**：新接口支持增量更新，只需传入需要修改的字段
diff --git a/internal/prometheus_adapter/api/alert_api.go b/internal/prometheus_adapter/api/alert_api.go
index 3206087..8724803 100644
--- a/internal/prometheus_adapter/api/alert_api.go
+++ b/internal/prometheus_adapter/api/alert_api.go
@@ -10,32 +10,8 @@ import (
 
 // setupAlertRouters 设置告警相关路由
 func (api *Api) setupAlertRouters(router *fox.Engine) {
-	router.POST("/v1/alert-rules/sync", api.SyncRules)
 	router.PUT("/v1/alert-rules/:rule_name", api.UpdateRule)
-	router.PUT("/v1/alert-rules/meta", api.UpdateRuleMeta)
-}
-
-// SyncRules 同步规则到Prometheus
-// 接收从监控告警模块发来的规则列表，生成Prometheus规则文件并重载配置
-func (api *Api) SyncRules(c *fox.Context) {
-	var req model.SyncRulesRequest
-	if err := c.ShouldBindJSON(&req); err != nil {
-		SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter,
-			"Invalid request body: "+err.Error(), nil)
-		return
-	}
-
-	err := api.alertService.SyncRulesToPrometheus(req.Rules, req.RuleMetas)
-	if err != nil {
-		SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError,
-			"Failed to sync rules to Prometheus: "+err.Error(), nil)
-		return
-	}
-
-	c.JSON(http.StatusOK, map[string]string{
-		"status":  "success",
-		"message": "Rules synced to Prometheus",
-	})
+	router.PUT("/v1/alert-rules-meta/:rule_name", api.UpdateRuleMetas)
 }
 
 // UpdateRule 更新单个规则模板
@@ -62,6 +38,7 @@ func (api *Api) UpdateRule(c *fox.Context) {
 		Expr:        req.Expr,
 		Op:          req.Op,
 		Severity:    req.Severity,
+		WatchTime:   req.WatchTime,
 	}
 
 	err := api.alertService.UpdateRule(rule)
@@ -81,9 +58,16 @@ func (api *Api) UpdateRule(c *fox.Context) {
 	})
 }
 
-// UpdateRuleMeta 更新单个规则元信息
-// 通过 alert_name + labels 唯一确定一个元信息记录
-func (api *Api) UpdateRuleMeta(c *fox.Context) {
+// UpdateRuleMetas 批量更新规则元信息
+// 通过 rule_name + labels 唯一确定一个元信息记录
+func (api *Api) UpdateRuleMetas(c *fox.Context) {
+	ruleName := c.Param("rule_name")
+	if ruleName == "" {
+		SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter,
+			"Rule name is required", nil)
+		return
+	}
+
 	var req model.UpdateAlertRuleMetaRequest
 	if err := c.ShouldBindJSON(&req); err != nil {
 		SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter,
@@ -91,32 +75,35 @@ func (api *Api) UpdateRuleMeta(c *fox.Context) {
 		return
 	}
 
-	// alert_name 和 labels 是必填的
-	if req.AlertName == "" || req.Labels == "" {
+	if len(req.Metas) == 0 {
 		SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter,
-			"alert_name and labels are required", nil)
+			"At least one meta update is required", nil)
 		return
 	}
 
-	// 构建完整的元信息对象
-	meta := model.AlertRuleMeta{
-		AlertName: req.AlertName,
-		Labels:    req.Labels,
-		Threshold: req.Threshold,
-		WatchTime: req.WatchTime,
-	}
-
-	err := api.alertService.UpdateRuleMeta(meta)
-	if err != nil {
-		SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError,
-			"Failed to update rule meta: "+err.Error(), nil)
-		return
+	// Apply the updates in order; the first failure aborts the request with a 500.
+	updatedCount := 0
+	for _, metaUpdate := range req.Metas {
+		// Build the full meta record; AlertName comes from the rule_name path parameter.
+		meta := model.AlertRuleMeta{
+			AlertName: ruleName,
+			Labels:    metaUpdate.Labels,
+			Threshold: metaUpdate.Threshold,
+		}
+
+		// Error text is built by concatenation, as in the other handlers, so no fmt import is needed.
+		if err := api.alertService.UpdateRuleMeta(meta); err != nil {
+			SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError,
+				"Failed to update rule meta: "+err.Error(), nil)
+			return
+		}
+		updatedCount++
+	}
 
 	c.JSON(http.StatusOK, map[string]interface{}{
-		"status":     "success",
-		"message":    "Rule meta updated and synced to Prometheus",
-		"alert_name": req.AlertName,
-		"labels":     req.Labels,
+		"status":        "success",
+		"message":       "Rule metas updated and synced to Prometheus",
+		"rule_name":     ruleName,
+		"updated_count": updatedCount,
 	})
 }
diff --git a/internal/prometheus_adapter/model/alert.go b/internal/prometheus_adapter/model/alert.go
index 566a143..e64a047 100644
--- a/internal/prometheus_adapter/model/alert.go
+++ b/internal/prometheus_adapter/model/alert.go
@@ -7,6 +7,7 @@ type AlertRule struct {
 	Expr        string `json:"expr" gorm:"type:text;not null"`            // 左侧业务指标表达式，如 sum(apitime) by (service, version)
 	Op          string `json:"op" gorm:"type:varchar(4);not null"`        // 阈值比较方式（>, <, =, !=）
 	Severity    string `json:"severity" gorm:"type:varchar(32);not null"` // 告警等级，通常进入告警的 labels.severity
+	WatchTime   int    `json:"watch_time"`                                // 持续时长（秒），映射 Prometheus rule 的 for 字段
 }
 
 // AlertRuleMeta 告警规则元信息表 - 存储服务级别的告警配置
@@ -15,5 +16,4 @@ type AlertRuleMeta struct {
 	AlertName string  `json:"alert_name" gorm:"type:varchar(255);index"` // 关联 alert_rules.name
 	Labels    string  `json:"labels" gorm:"type:jsonb"`                  // 适用标签，如 {"service":"s3","version":"v1"}，为空表示全局
 	Threshold float64 `json:"threshold"`                                 // 阈值（会被渲染成特定规则的 threshold metric 数值）
-	WatchTime int     `json:"watch_time"`                                // 持续时长（映射 Prometheus rule 的 for）
 }
diff --git a/internal/prometheus_adapter/model/api.go b/internal/prometheus_adapter/model/api.go
index 4dc8421..775bdd9 100644
--- a/internal/prometheus_adapter/model/api.go
+++ b/internal/prometheus_adapter/model/api.go
@@ -46,6 +46,7 @@ type UpdateAlertRuleRequest struct {
 	Expr        string `json:"expr,omitempty"`
 	Op          string `json:"op,omitempty" binding:"omitempty,oneof=> < = !="`
 	Severity    string `json:"severity,omitempty"`
+	WatchTime   int    `json:"watch_time,omitempty"` // 持续时长（秒）
 }
 
 // CreateAlertRuleMetaRequest 创建告警规则元信息请求
@@ -57,17 +58,13 @@ type CreateAlertRuleMetaRequest struct {
 	MatchTime string            `json:"match_time,omitempty"`
 }
 
-// UpdateAlertRuleMetaRequest 更新告警规则元信息请求
+// UpdateAlertRuleMetaRequest is the request body for batch-updating the metas of one rule.
 type UpdateAlertRuleMetaRequest struct {
-	AlertName string  `json:"alert_name" binding:"required"`
-	Labels    string  `json:"labels" binding:"required"`
-	Threshold float64 `json:"threshold"`
-	WatchTime int     `json:"watch_time"`
+	Metas []AlertRuleMetaUpdate `json:"metas" binding:"required,dive"` // dive validates each element, so an item without labels is rejected at bind time
 }
 
-// SyncRulesRequest 同步规则请求
-// 从监控告警模块发送过来的完整规则列表
-type SyncRulesRequest struct {
-	Rules     []AlertRule     `json:"rules"`      // 告警规则列表
-	RuleMetas []AlertRuleMeta `json:"rule_metas"` // 规则元信息列表
+// AlertRuleMetaUpdate 单个规则元信息更新项
+type AlertRuleMetaUpdate struct {
+	Labels    string  `json:"labels" binding:"required"` // 必填，用于唯一标识
+	Threshold float64 `json:"threshold"`
 }
diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go
index f9fe908..cfae3f6 100644
--- a/internal/prometheus_adapter/service/alert_service.go
+++ b/internal/prometheus_adapter/service/alert_service.go
@@ -31,15 +31,88 @@ func NewAlertService(promClient *client.PrometheusClient) *AlertService {
 	}
 }
 
-// SyncRulesToPrometheus 同步规则到Prometheus
-// 接收完整的规则列表，生成Prometheus规则文件并重载配置
-func (s *AlertService) SyncRulesToPrometheus(rules []model.AlertRule, ruleMetas []model.AlertRuleMeta) error {
-	// 保存到内存缓存
-	s.currentRules = rules
-	s.currentRuleMetas = ruleMetas
+// ========== 公开 API 方法 ==========
 
+// UpdateRule 更新单个规则模板
+// 只更新传入的规则，其他规则和所有元信息保持不变
+func (s *AlertService) UpdateRule(rule model.AlertRule) error {
+	// 查找并更新规则
+	found := false
+	for i, r := range s.currentRules {
+		if r.Name == rule.Name {
+			s.currentRules[i] = rule
+			found = true
+			break
+		}
+	}
+
+	if !found {
+		// 如果规则不存在，添加新规则
+		s.currentRules = append(s.currentRules, rule)
+	}
+
+	// 统计受影响的元信息数量
+	affectedCount := 0
+	for _, meta := range s.currentRuleMetas {
+		if meta.AlertName == rule.Name {
+			affectedCount++
+		}
+	}
+
+	log.Info().
+		Str("rule", rule.Name).
+		Int("affected_metas", affectedCount).
+		Msg("Updating rule and affected metas")
+
+	// 使用更新后的规则重新生成并同步
+	return s.regenerateAndSync()
+}
+
+// UpdateRuleMeta 更新单个规则元信息
+// 通过 alert_name + labels 唯一确定一个元信息记录
+func (s *AlertService) UpdateRuleMeta(meta model.AlertRuleMeta) error {
+	// 查找并更新元信息
+	found := false
+	for i, m := range s.currentRuleMetas {
+		// 通过 alert_name + labels 唯一确定
+		if m.AlertName == meta.AlertName && m.Labels == meta.Labels {
+			s.currentRuleMetas[i] = meta
+			found = true
+			break
+		}
+	}
+
+	if !found {
+		// 如果元信息不存在，添加新元信息
+		s.currentRuleMetas = append(s.currentRuleMetas, meta)
+	}
+
+	log.Info().
+		Str("alert_name", meta.AlertName).
+		Str("labels", meta.Labels).
+		Msg("Updating rule meta")
+
+	// 使用更新后的元信息重新生成并同步
+	return s.regenerateAndSync()
+}
+
+// GetAffectedMetas 获取受影响的元信息数量
+func (s *AlertService) GetAffectedMetas(ruleName string) int {
+	count := 0
+	for _, meta := range s.currentRuleMetas {
+		if meta.AlertName == ruleName {
+			count++
+		}
+	}
+	return count
+}
+
+// ========== 内部核心方法 ==========
+
+// regenerateAndSync 使用当前内存中的规则和元信息重新生成Prometheus规则并同步
+func (s *AlertService) regenerateAndSync() error {
 	// 构建Prometheus规则文件
-	prometheusRules := s.buildPrometheusRules(rules, ruleMetas)
+	prometheusRules := s.buildPrometheusRules(s.currentRules, s.currentRuleMetas)
 
 	// 写入规则文件
 	if err := s.writeRulesFile(prometheusRules); err != nil {
@@ -53,13 +126,15 @@ func (s *AlertService) SyncRulesToPrometheus(rules []model.AlertRule, ruleMetas
 	}
 
 	log.Info().
-		Int("rules_count", len(rules)).
-		Int("metas_count", len(ruleMetas)).
-		Msg("Rules synced to Prometheus successfully")
+		Int("rules_count", len(s.currentRules)).
+		Int("metas_count", len(s.currentRuleMetas)).
+		Msg("Rules regenerated and synced to Prometheus")
 
 	return nil
 }
 
+// ========== 规则构建相关方法 ==========
+
 // buildPrometheusRules 构建Prometheus规则
 func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas []model.AlertRuleMeta) *model.PrometheusRuleFile {
 	promRules := []model.PrometheusRule{}
@@ -114,8 +189,8 @@ func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas [
 
 		// 计算for字段
 		forDuration := ""
-		if meta.WatchTime > 0 {
-			forDuration = fmt.Sprintf("%ds", meta.WatchTime)
+		if rule.WatchTime > 0 {
+			forDuration = fmt.Sprintf("%ds", rule.WatchTime)
 		}
 
 		// 使用规则名作为 alert 名称，通过 labels 区分不同实例
@@ -182,7 +257,7 @@ func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertR
 		if len(labelMatchers) > 0 {
 			// 如果表达式包含{，说明已经有标签选择器
 			if strings.Contains(expr, "{") {
-				expr = strings.Replace(expr, "}", ","+strings.Join(labelMatchers, ",")+"}", 1)
+				expr = strings.Replace(expr, "}", ","+strings.Join(labelMatchers, ",")+"}", 1)
 			} else {
 				// 在指标名后添加标签选择器
 				// 查找第一个非字母数字下划线的字符
@@ -210,6 +285,8 @@ func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertR
 	return expr
 }
 
+// ========== 文件操作相关方法 ==========
+
 // writeRulesFile 写入规则文件
 func (s *AlertService) writeRulesFile(rules *model.PrometheusRuleFile) error {
 	// 序列化为YAML
@@ -259,29 +336,6 @@ func (s *AlertService) writeRulesFile(rules *model.PrometheusRuleFile) error {
 	return nil
 }
 
-// reloadPrometheus 重新加载Prometheus配置
-func (s *AlertService) reloadPrometheus() error {
-	prometheusURL := os.Getenv("PROMETHEUS_ADDRESS")
-	if prometheusURL == "" {
-		prometheusURL = "http://10.210.10.33:9090"
-	}
-
-	reloadURL := fmt.Sprintf("%s/-/reload", strings.TrimSuffix(prometheusURL, "/"))
-
-	resp, err := http.Post(reloadURL, "text/plain", nil)
-	if err != nil {
-		return fmt.Errorf("failed to reload Prometheus: %w", err)
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode != http.StatusOK {
-		return fmt.Errorf("Prometheus reload failed with status: %d", resp.StatusCode)
-	}
-
-	log.Info().Msg("Prometheus configuration reloaded")
-	return nil
-}
-
 // syncRuleFileToContainer 同步规则文件到容器
 func (s *AlertService) syncRuleFileToContainer(filePath string) error {
 	// 获取容器名称，默认为 mock-s3-prometheus
@@ -318,6 +372,31 @@ func (s *AlertService) syncRuleFileToContainer(filePath string) error {
 	return nil
 }
 
+// ========== Prometheus 配置相关方法 ==========
+
+// reloadPrometheus 重新加载Prometheus配置
+func (s *AlertService) reloadPrometheus() error {
+	prometheusURL := os.Getenv("PROMETHEUS_ADDRESS")
+	if prometheusURL == "" {
+		prometheusURL = "http://10.210.10.33:9090"
+	}
+
+	reloadURL := fmt.Sprintf("%s/-/reload", strings.TrimSuffix(prometheusURL, "/"))
+
+	resp, err := http.Post(reloadURL, "text/plain", nil)
+	if err != nil {
+		return fmt.Errorf("failed to reload Prometheus: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("Prometheus reload failed with status: %d", resp.StatusCode)
+	}
+
+	log.Info().Msg("Prometheus configuration reloaded")
+	return nil
+}
+
 // ensurePrometheusRuleConfig 确保 Prometheus 配置文件包含 rule_files 配置
 func (s *AlertService) ensurePrometheusRuleConfig(containerName string) error {
 	configPath := "/etc/prometheus/prometheus.yml"
@@ -374,101 +453,3 @@ rule_files:\
 	log.Info().Msg("Prometheus restarted with new configuration")
 	return nil
 }
-
-// UpdateRule 更新单个规则模板
-// 只更新传入的规则，其他规则和所有元信息保持不变
-func (s *AlertService) UpdateRule(rule model.AlertRule) error {
-	// 查找并更新规则
-	found := false
-	for i, r := range s.currentRules {
-		if r.Name == rule.Name {
-			s.currentRules[i] = rule
-			found = true
-			break
-		}
-	}
-
-	if !found {
-		// 如果规则不存在，添加新规则
-		s.currentRules = append(s.currentRules, rule)
-	}
-
-	// 统计受影响的元信息数量
-	affectedCount := 0
-	for _, meta := range s.currentRuleMetas {
-		if meta.AlertName == rule.Name {
-			affectedCount++
-		}
-	}
-
-	log.Info().
-		Str("rule", rule.Name).
-		Int("affected_metas", affectedCount).
-		Msg("Updating rule and affected metas")
-
-	// 使用更新后的规则重新生成并同步
-	return s.regenerateAndSync()
-}
-
-// UpdateRuleMeta 更新单个规则元信息
-// 通过 alert_name + labels 唯一确定一个元信息记录
-func (s *AlertService) UpdateRuleMeta(meta model.AlertRuleMeta) error {
-	// 查找并更新元信息
-	found := false
-	for i, m := range s.currentRuleMetas {
-		// 通过 alert_name + labels 唯一确定
-		if m.AlertName == meta.AlertName && m.Labels == meta.Labels {
-			s.currentRuleMetas[i] = meta
-			found = true
-			break
-		}
-	}
-
-	if !found {
-		// 如果元信息不存在，添加新元信息
-		s.currentRuleMetas = append(s.currentRuleMetas, meta)
-	}
-
-	log.Info().
-		Str("alert_name", meta.AlertName).
-		Str("labels", meta.Labels).
-		Msg("Updating rule meta")
-
-	// 使用更新后的元信息重新生成并同步
-	return s.regenerateAndSync()
-}
-
-// regenerateAndSync 使用当前内存中的规则和元信息重新生成Prometheus规则并同步
-func (s *AlertService) regenerateAndSync() error {
-	// 构建Prometheus规则文件
-	prometheusRules := s.buildPrometheusRules(s.currentRules, s.currentRuleMetas)
-
-	// 写入规则文件
-	if err := s.writeRulesFile(prometheusRules); err != nil {
-		return fmt.Errorf("failed to write rules file: %w", err)
-	}
-
-	// 通知Prometheus重新加载配置
-	if err := s.reloadPrometheus(); err != nil {
-		log.Warn().Err(err).Msg("Failed to reload Prometheus, rules file has been updated")
-		// 不返回错误，因为文件已经更新成功
-	}
-
-	log.Info().
-		Int("rules_count", len(s.currentRules)).
-		Int("metas_count", len(s.currentRuleMetas)).
-		Msg("Rules regenerated and synced to Prometheus")
-
-	return nil
-}
-
-// GetAffectedMetas 获取受影响的元信息数量
-func (s *AlertService) GetAffectedMetas(ruleName string) int {
-	count := 0
-	for _, meta := range s.currentRuleMetas {
-		if meta.AlertName == ruleName {
-			count++
-		}
-	}
-	return count
-}

From 40d41e8c28802765a37b6ef70e4b1ba135a8fdbc Mon Sep 17 00:00:00 2001
From: dnj <shdnj@qq.com>
Date: Thu, 25 Sep 2025 14:53:17 +0800
Subject: [PATCH 10/18] =?UTF-8?q?feat(prometheus=5Fadapter):=20=E5=AE=9E?=
 =?UTF-8?q?=E7=8E=B0=E5=91=8A=E8=AD=A6=E8=A7=84=E5=88=99=E6=8C=81=E4=B9=85?=
 =?UTF-8?q?=E5=8C=96=E4=B8=8E=E4=BC=98=E9=9B=85=E5=85=B3=E9=97=AD=E5=8A=9F?=
 =?UTF-8?q?=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

添加告警规则本地文件持久化功能，支持启动时加载和关闭时保存规则
重构关闭逻辑实现优雅关闭，包括保存当前规则状态
更新构建和部署脚本以处理规则文件目录
修改测试脚本以适配新的增量更新接口
---
 cmd/prometheus_adapter/main.go                |  43 ++++-
 .../prometheus_adapter/rules/alert_rules.yml  |   5 +
 internal/prometheus_adapter/server.go         |  18 +-
 .../service/alert_service.go                  | 156 +++++++++++++++++-
 .../prometheus_adapter/test_alert_update.sh   | 125 ++++++++------
 scripts/prometheus_adapter/build.sh           |  18 ++
 scripts/prometheus_adapter/deploy.sh          |  10 +-
 7 files changed, 319 insertions(+), 56 deletions(-)
 create mode 100644 internal/prometheus_adapter/rules/alert_rules.yml

diff --git a/cmd/prometheus_adapter/main.go b/cmd/prometheus_adapter/main.go
index 9f45442..847516b 100644
--- a/cmd/prometheus_adapter/main.go
+++ b/cmd/prometheus_adapter/main.go
@@ -1,7 +1,11 @@
 package main
 
 import (
+	"context"
 	"os"
+	"os/signal"
+	"syscall"
+	"time"
 
 	"github.com/fox-gonic/fox"
 	"github.com/qiniu/zeroops/internal/config"
@@ -42,9 +46,40 @@ func main() {
 		log.Fatal().Err(err).Msg("Failed to setup API routes")
 	}
 
-	// 启动服务器
-	log.Info().Msgf("Starting Prometheus Adapter on %s", cfg.Server.BindAddr)
-	if err := router.Run(cfg.Server.BindAddr); err != nil {
-		log.Fatal().Err(err).Msg("Failed to start server")
+	// Subscribe to SIGINT and SIGTERM; the buffered channel holds one pending signal.
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+
+	// Parent context for the shutdown deadline below; released by the deferred cancel().
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	// Run the server in a goroutine so main can wait on signals; a Run error is forwarded on serverErr.
+	serverErr := make(chan error, 1)
+	go func() {
+		log.Info().Msgf("Starting Prometheus Adapter on %s", cfg.Server.BindAddr)
+		if err := router.Run(cfg.Server.BindAddr); err != nil {
+			serverErr <- err
+		}
+	}()
+
+	// Block until a signal arrives or the server reports an error.
+	select {
+	case sig := <-sigChan:
+		log.Info().Msgf("Received signal %s, shutting down...", sig)
+
+		// Context with a 10-second timeout, handed to Close.
+		shutdownCtx, shutdownCancel := context.WithTimeout(ctx, 10*time.Second)
+		defer shutdownCancel()
+
+		// Close the adapter; a failure is logged and shutdown continues.
+		if err := adapter.Close(shutdownCtx); err != nil {
+			log.Error().Err(err).Msg("Error during shutdown")
+		}
+
+		log.Info().Msg("Shutdown complete")
+
+	case err := <-serverErr:
+		log.Fatal().Err(err).Msg("Server error")
 	}
 }
diff --git a/internal/prometheus_adapter/rules/alert_rules.yml b/internal/prometheus_adapter/rules/alert_rules.yml
new file mode 100644
index 0000000..7dd73ca
--- /dev/null
+++ b/internal/prometheus_adapter/rules/alert_rules.yml
@@ -0,0 +1,5 @@
+# Prometheus Alert Rules
+# This file is managed by the Prometheus Adapter service
+# It will be loaded on startup and saved on shutdown
+
+groups: []
\ No newline at end of file
diff --git a/internal/prometheus_adapter/server.go b/internal/prometheus_adapter/server.go
index d9fb2f4..35f5b08 100644
--- a/internal/prometheus_adapter/server.go
+++ b/internal/prometheus_adapter/server.go
@@ -1,6 +1,7 @@
 package prometheusadapter
 
 import (
+	"context"
 	"fmt"
 	"os"
 
@@ -63,9 +64,18 @@ func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error {
 	return nil
 }
 
-// Close 关闭服务器
-func (s *PrometheusAdapterServer) Close() error {
-	// 当前没有需要关闭的资源
-	log.Info().Msg("Prometheus Adapter server closed")
+// Close 优雅关闭服务器
+func (s *PrometheusAdapterServer) Close(ctx context.Context) error {
+	log.Info().Msg("Starting shutdown...")
+
+	// 调用 alertService 的 Shutdown 方法保存规则
+	if s.alertService != nil {
+		if err := s.alertService.Shutdown(); err != nil {
+			log.Error().Err(err).Msg("Failed to shutdown alert service")
+			return err
+		}
+	}
+
+	log.Info().Msg("Prometheus Adapter server shut down")
 	return nil
 }
diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go
index cfae3f6..a254ae1 100644
--- a/internal/prometheus_adapter/service/alert_service.go
+++ b/internal/prometheus_adapter/service/alert_service.go
@@ -6,6 +6,8 @@ import (
 	"net/http"
 	"os"
 	"os/exec"
+	"path/filepath"
+	"strconv"
 	"strings"
 
 	"github.com/qiniu/zeroops/internal/prometheus_adapter/client"
@@ -20,15 +22,167 @@ type AlertService struct {
 	// 内存中缓存当前规则，用于增量更新
 	currentRules     []model.AlertRule
 	currentRuleMetas []model.AlertRuleMeta
+	// 本地规则文件路径
+	localRulesPath string
 }
 
 // NewAlertService 创建告警服务
 func NewAlertService(promClient *client.PrometheusClient) *AlertService {
-	return &AlertService{
+	service := &AlertService{
 		promClient:       promClient,
 		currentRules:     []model.AlertRule{},
 		currentRuleMetas: []model.AlertRuleMeta{},
+		localRulesPath:   "../rules/alert_rules.yml",
 	}
+
+	// 启动时尝试加载本地规则
+	if err := service.LoadRulesFromFile(); err != nil {
+		log.Warn().Err(err).Msg("Failed to load rules from file, starting with empty rules")
+	}
+
+	return service
+}
+
+// ========== 持久化方法 ==========
+
+// LoadRulesFromFile rebuilds s.currentRules and s.currentRuleMetas from the YAML file at s.localRulesPath; a missing file is not an error. NOTE(review): Op is never restored and Expr is copied verbatim from the file — confirm that a load followed by a save reproduces the same rules.
+func (s *AlertService) LoadRulesFromFile() error {
+	// A missing file is not an error: log it and return without touching the caches.
+	if _, err := os.Stat(s.localRulesPath); os.IsNotExist(err) {
+		log.Info().Str("path", s.localRulesPath).Msg("Local rules file does not exist, skipping load")
+		return nil
+	}
+
+	// Read the whole file.
+	data, err := os.ReadFile(s.localRulesPath)
+	if err != nil {
+		return fmt.Errorf("failed to read local rules file: %w", err)
+	}
+
+	// Parse it as a Prometheus rule file.
+	var rulesFile model.PrometheusRuleFile
+	if err := yaml.Unmarshal(data, &rulesFile); err != nil {
+		return fmt.Errorf("failed to parse rules file: %w", err)
+	}
+
+	// Reset the caches, then convert from the Prometheus format back to the internal one.
+	s.currentRules = []model.AlertRule{}
+	s.currentRuleMetas = []model.AlertRuleMeta{}
+
+	// Tracks rule names already seen so each rule template is created only once.
+	ruleMap := make(map[string]*model.AlertRule)
+
+	for _, group := range rulesFile.Groups {
+		for _, rule := range group.Rules {
+			// The alert name is used as the rule template name.
+			ruleName := rule.Alert
+
+			// Description comes from the annotations (empty when absent).
+			description := ""
+			if desc, ok := rule.Annotations["description"]; ok {
+				description = desc
+			}
+
+			// Severity comes from the labels and defaults to "warning".
+			severity := "warning"
+			if sev, ok := rule.Labels["severity"]; ok {
+				severity = sev
+				delete(rule.Labels, "severity") // drop severity; the remaining labels belong to the meta
+			}
+
+			// Create the rule template the first time this name is seen; later entries with the same name do not update it.
+			if _, exists := ruleMap[ruleName]; !exists {
+				alertRule := model.AlertRule{
+					Name:        ruleName,
+					Description: description,
+					Expr:        rule.Expr,
+					Severity:    severity,
+				}
+
+				// Derive WatchTime (seconds) from the For field.
+				if rule.For != "" {
+					// Only plain "<n>s" and "<n>m" are handled; any other form leaves WatchTime at 0.
+					if strings.HasSuffix(rule.For, "s") {
+						if seconds, err := strconv.Atoi(strings.TrimSuffix(rule.For, "s")); err == nil {
+							alertRule.WatchTime = seconds
+						}
+					} else if strings.HasSuffix(rule.For, "m") {
+						if minutes, err := strconv.Atoi(strings.TrimSuffix(rule.For, "m")); err == nil {
+							alertRule.WatchTime = minutes * 60
+						}
+					}
+				}
+
+				ruleMap[ruleName] = &alertRule
+				s.currentRules = append(s.currentRules, alertRule)
+			}
+
+			// Create a meta only when labels remain after severity was removed.
+			if len(rule.Labels) > 0 {
+				labelsJSON, _ := json.Marshal(rule.Labels)
+				meta := model.AlertRuleMeta{
+					AlertName: ruleName,
+					Labels:    string(labelsJSON),
+				}
+
+				// Extract the threshold from the expression (simple approach): assumes a
+				// space-separated form like "metric > 80" or "metric{labels} > 80"; the last token is parsed as the number.
+				parts := strings.Split(rule.Expr, " ")
+				if len(parts) >= 3 {
+					if threshold, err := strconv.ParseFloat(parts[len(parts)-1], 64); err == nil {
+						meta.Threshold = threshold
+					}
+				}
+
+				s.currentRuleMetas = append(s.currentRuleMetas, meta)
+			}
+		}
+	}
+
+	log.Info().
+		Int("rules", len(s.currentRules)).
+		Int("metas", len(s.currentRuleMetas)).
+		Str("path", s.localRulesPath).
+		Msg("Loaded rules from local file")
+
+	return nil
+}
+
+// SaveRulesToFile 保存规则到本地文件
+func (s *AlertService) SaveRulesToFile() error {
+	// 确保目录存在
+	dir := filepath.Dir(s.localRulesPath)
+	if err := os.MkdirAll(dir, 0755); err != nil {
+		return fmt.Errorf("failed to create rules directory: %w", err)
+	}
+
+	// 构建Prometheus规则文件格式
+	prometheusRules := s.buildPrometheusRules(s.currentRules, s.currentRuleMetas)
+
+	// 序列化为YAML
+	data, err := yaml.Marshal(prometheusRules)
+	if err != nil {
+		return fmt.Errorf("failed to marshal rules: %w", err)
+	}
+
+	// 写入文件
+	if err := os.WriteFile(s.localRulesPath, data, 0644); err != nil {
+		return fmt.Errorf("failed to write rules file: %w", err)
+	}
+
+	log.Info().
+		Int("rules", len(s.currentRules)).
+		Int("metas", len(s.currentRuleMetas)).
+		Str("path", s.localRulesPath).
+		Msg("Saved rules to local file")
+
+	return nil
+}
+
+// Shutdown 优雅关闭，保存当前规则
+func (s *AlertService) Shutdown() error {
+	log.Info().Msg("Shutting down alert service, saving rules...")
+	return s.SaveRulesToFile()
 }
 
 // ========== 公开 API 方法 ==========
diff --git a/internal/prometheus_adapter/test_alert_update.sh b/internal/prometheus_adapter/test_alert_update.sh
index a8af0ea..4400b43 100755
--- a/internal/prometheus_adapter/test_alert_update.sh
+++ b/internal/prometheus_adapter/test_alert_update.sh
@@ -2,49 +2,69 @@
 
 # 测试增量更新告警规则功能
 
-BASE_URL="http://localhost:8080"
+BASE_URL="http://localhost:9999"
 
 echo "=== 测试增量更新告警规则 ==="
 
-# 1. 先进行全量同步，创建初始规则
-echo -e "\n1. 全量同步规则..."
-curl -X POST ${BASE_URL}/v1/alert-rules/sync \
+# 1. 初始化规则（使用增量更新接口）
+echo -e "\n1. 创建初始规则..."
+
+# 1.1 创建 high_cpu_usage 规则模板
+echo -e "\n1.1 创建规则模板: high_cpu_usage"
+curl -X PUT ${BASE_URL}/v1/alert-rules/high_cpu_usage \
   -H "Content-Type: application/json" \
   -d '{
-    "rules": [
-      {
-        "name": "high_cpu_usage",
-        "description": "CPU使用率过高告警",
-        "expr": "system_cpu_usage_percent",
-        "op": ">",
-        "severity": "warning"
-      },
-      {
-        "name": "high_memory_usage",
-        "description": "内存使用率过高告警",
-        "expr": "system_memory_usage_percent",
-        "op": ">",
-        "severity": "warning"
-      }
-    ],
-    "rule_metas": [
+    "description": "CPU使用率过高告警",
+    "expr": "system_cpu_usage_percent",
+    "op": ">",
+    "severity": "warning",
+    "watch_time": 300
+  }' | jq .
+
+sleep 1
+
+# 1.2 创建 high_memory_usage 规则模板
+echo -e "\n1.2 创建规则模板: high_memory_usage"
+curl -X PUT ${BASE_URL}/v1/alert-rules/high_memory_usage \
+  -H "Content-Type: application/json" \
+  -d '{
+    "description": "内存使用率过高告警",
+    "expr": "system_memory_usage_percent",
+    "op": ">",
+    "severity": "warning",
+    "watch_time": 600
+  }' | jq .
+
+sleep 1
+
+# 1.3 设置 high_cpu_usage 规则的元信息
+echo -e "\n1.3 设置规则元信息: high_cpu_usage"
+curl -X PUT ${BASE_URL}/v1/alert-rules-meta/high_cpu_usage \
+  -H "Content-Type: application/json" \
+  -d '{
+    "metas": [
       {
-        "alert_name": "high_cpu_usage",
         "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}",
-        "threshold": 80,
-        "watch_time": 300
+        "threshold": 80
       },
       {
-        "alert_name": "high_cpu_usage",
         "labels": "{\"service\":\"metadata-service\",\"version\":\"1.0.0\"}",
-        "threshold": 85,
-        "watch_time": 300
-      },
+        "threshold": 85
+      }
+    ]
+  }' | jq .
+
+sleep 1
+
+# 1.4 设置 high_memory_usage 规则的元信息
+echo -e "\n1.4 设置规则元信息: high_memory_usage"
+curl -X PUT ${BASE_URL}/v1/alert-rules-meta/high_memory_usage \
+  -H "Content-Type: application/json" \
+  -d '{
+    "metas": [
       {
-        "alert_name": "high_memory_usage",
         "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}",
-        "threshold": 90,
-        "watch_time": 600
+        "threshold": 90
       }
     ]
   }' | jq .
@@ -59,33 +79,46 @@ curl -X PUT ${BASE_URL}/v1/alert-rules/high_cpu_usage \
     "description": "CPU使用率异常告警（更新后）",
     "expr": "avg(system_cpu_usage_percent[5m])",
     "op": ">=",
-    "severity": "critical"
+    "severity": "critical",
+    "watch_time": 300
   }' | jq .
 
 sleep 2
 
-# 3. 更新单个规则元信息
-echo -e "\n3. 更新规则元信息..."
-curl -X PUT ${BASE_URL}/v1/alert-rules/meta \
+# 3. 批量更新规则元信息
+echo -e "\n3. 批量更新规则元信息（high_cpu_usage）..."
+curl -X PUT ${BASE_URL}/v1/alert-rules-meta/high_cpu_usage \
   -H "Content-Type: application/json" \
   -d '{
-    "alert_name": "high_cpu_usage",
-    "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}",
-    "threshold": 75,
-    "watch_time": 600
+    "metas": [
+      {
+        "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}",
+        "threshold": 75
+      },
+      {
+        "labels": "{\"service\":\"metadata-service\",\"version\":\"1.0.0\"}",
+        "threshold": 88
+      }
+    ]
   }' | jq .
 
 sleep 2
 
-# 4. 添加新的元信息
-echo -e "\n4. 添加新的元信息..."
-curl -X PUT ${BASE_URL}/v1/alert-rules/meta \
+# 4. 批量更新规则元信息（添加新的服务）
+echo -e "\n4. 批量更新规则元信息（high_memory_usage - 添加新服务）..."
+curl -X PUT ${BASE_URL}/v1/alert-rules-meta/high_memory_usage \
   -H "Content-Type: application/json" \
   -d '{
-    "alert_name": "high_memory_usage",
-    "labels": "{\"service\":\"queue-service\",\"version\":\"2.0.0\"}",
-    "threshold": 95,
-    "watch_time": 300
+    "metas": [
+      {
+        "labels": "{\"service\":\"queue-service\",\"version\":\"2.0.0\"}",
+        "threshold": 95
+      },
+      {
+        "labels": "{\"service\":\"third-party-service\",\"version\":\"1.0.0\"}",
+        "threshold": 92
+      }
+    ]
   }' | jq .
 
 echo -e "\n=== 测试完成 ==="
\ No newline at end of file
diff --git a/scripts/prometheus_adapter/build.sh b/scripts/prometheus_adapter/build.sh
index ec5c08a..dbcbcf1 100755
--- a/scripts/prometheus_adapter/build.sh
+++ b/scripts/prometheus_adapter/build.sh
@@ -52,6 +52,7 @@ log_info "创建构建目录..."
 mkdir -p "$BUILD_DIR/bin"
 mkdir -p "$BUILD_DIR/docs"
 mkdir -p "$BUILD_DIR/scripts"
+mkdir -p "$BUILD_DIR/rules"
 
 # 编译二进制文件
 log_info "编译 ${APP_NAME}..."
@@ -79,6 +80,23 @@ if [ -f "internal/${APP_NAME}/test_alert_update.sh" ]; then
     chmod +x "$BUILD_DIR/scripts/test_alert_update.sh"
 fi
 
+# 复制规则文件
+log_info "复制规则文件..."
+if [ -d "internal/${APP_NAME}/rules" ]; then
+    cp -r "internal/${APP_NAME}/rules/"* "$BUILD_DIR/rules/" 2>/dev/null || true
+    log_info "已复制规则文件到 $BUILD_DIR/rules/"
+else
+    # 如果没有规则文件夹，创建一个空的规则文件
+    log_warn "未找到规则目录，创建默认规则文件..."
+    cat > "$BUILD_DIR/rules/alert_rules.yml" << 'RULES_EOF'
+# Prometheus Alert Rules
+# This file is managed by the Prometheus Adapter service
+# It will be loaded on startup and saved on shutdown
+
+groups: []
+RULES_EOF
+fi
+
 # 创建启动脚本
 log_info "创建启动脚本..."
 cat > "$BUILD_DIR/start.sh" << 'EOF'
diff --git a/scripts/prometheus_adapter/deploy.sh b/scripts/prometheus_adapter/deploy.sh
index 85bddf7..01ccdf8 100755
--- a/scripts/prometheus_adapter/deploy.sh
+++ b/scripts/prometheus_adapter/deploy.sh
@@ -235,11 +235,19 @@ if [ -w "$DEPLOY_DIR" ]; then
     chmod +x "$DEPLOY_DIR/start.sh"
     chmod +x "$DEPLOY_DIR/stop.sh"
     [ -f "$DEPLOY_DIR/scripts/test_alert_update.sh" ] && chmod +x "$DEPLOY_DIR/scripts/test_alert_update.sh"
+    # 确保 rules 目录可写
+    chmod 755 "$DEPLOY_DIR/rules"
+    [ -f "$DEPLOY_DIR/rules/alert_rules.yml" ] && chmod 644 "$DEPLOY_DIR/rules/alert_rules.yml"
 else
     sudo chmod +x "$DEPLOY_DIR/bin/prometheus_adapter"
     sudo chmod +x "$DEPLOY_DIR/start.sh"
     sudo chmod +x "$DEPLOY_DIR/stop.sh"
     [ -f "$DEPLOY_DIR/scripts/test_alert_update.sh" ] && sudo chmod +x "$DEPLOY_DIR/scripts/test_alert_update.sh"
+    # 确保 rules 目录可写
+    sudo chmod 755 "$DEPLOY_DIR/rules"
+    [ -f "$DEPLOY_DIR/rules/alert_rules.yml" ] && sudo chmod 644 "$DEPLOY_DIR/rules/alert_rules.yml"
+    # 设置 rules 目录的所有者为服务运行用户
+    sudo chown -R qboxserver:qboxserver "$DEPLOY_DIR/rules"
 fi
 
 # 清理临时目录
@@ -277,7 +285,7 @@ After=network.target
 Type=simple
 User=qboxserver
 Group=qboxserver
-WorkingDirectory=$DEPLOY_DIR
+WorkingDirectory=$DEPLOY_DIR/bin
 Environment="PROMETHEUS_URL=http://localhost:9090"
 Environment="PORT=8080"
 Environment="LOG_LEVEL=info"

From 271763ea277900efbfe93b74c803cbf33687bf77 Mon Sep 17 00:00:00 2001
From: dnj <shdnj@qq.com>
Date: Thu, 25 Sep 2025 16:15:30 +0800
Subject: [PATCH 11/18] =?UTF-8?q?feat(prometheus):=20=E5=AE=9E=E7=8E=B0?=
 =?UTF-8?q?=E4=B8=BB=E5=8A=A8=E6=8B=89=E5=8F=96=E5=91=8A=E8=AD=A6=E7=9A=84?=
 =?UTF-8?q?webhook=E6=9C=8D=E5=8A=A1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

添加prometheus_adapter.yml配置文件支持
重构alert_service使用配置而非环境变量
新增alert_webhook_service实现告警轮询推送
更新build.sh和deploy.sh支持配置文件部署
更新README文档说明新的webhook架构
---
 docs/prometheus_adapter/README.md             |  63 +++--
 .../client/prometheus_client.go               |  44 ++-
 internal/prometheus_adapter/config/config.go  | 165 ++++++++++++
 .../config/prometheus_adapter.yml             |  27 ++
 .../model/prometheus_alert.go                 |  49 ++++
 internal/prometheus_adapter/server.go         |  52 ++--
 .../service/alert_service.go                  |  24 +-
 .../service/alert_webhook_service.go          | 255 ++++++++++++++++++
 scripts/prometheus_adapter/build.sh           |  41 ++-
 scripts/prometheus_adapter/deploy.sh          |  18 +-
 10 files changed, 665 insertions(+), 73 deletions(-)
 create mode 100644 internal/prometheus_adapter/config/config.go
 create mode 100644 internal/prometheus_adapter/config/prometheus_adapter.yml
 create mode 100644 internal/prometheus_adapter/model/prometheus_alert.go
 create mode 100644 internal/prometheus_adapter/service/alert_webhook_service.go

diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md
index 650b6f8..0b9d5ca 100644
--- a/docs/prometheus_adapter/README.md
+++ b/docs/prometheus_adapter/README.md
@@ -221,30 +221,51 @@ internal/prometheus_adapter/
   - 更新元信息时，系统根据 `alert_name` + `labels` 查找并更新对应的元信息
 - **缓存机制**：系统在内存中缓存当前的规则和元信息，支持快速增量更新
 
-## Alertmanager 集成
+## 告警接收 Webhook
 
-- 目标：将 Prometheus 触发的告警通过 Alertmanager 转发到监控告警模块
-- `alertmanager.yml` 配置示例：
-```yaml
-global:
-  resolve_timeout: 5m
+- 目标：实现自定义 webhook 服务，主动从 Prometheus 拉取告警并转发到监控告警模块
+- 实现方式：
+  - 通过 Prometheus Alerts API 获取告警
+  - 定期轮询 Prometheus 的 `/api/v1/alerts` 端点
+  - 将获取的告警格式化后 POST 到监控告警模块
 
-route:
-  group_by: ['alertname', 'cluster', 'service']
-  group_wait: 10s
-  group_interval: 10s
-  repeat_interval: 1h
-  receiver: 'zeroops-alert-webhook'
-
-receivers:
-  - name: 'zeroops-alert-webhook'
-    webhook_configs:
-      - url: 'http://alert-module:8080/v1/integrations/alertmanager/webhook'
-        send_resolved: true
+### Webhook 服务架构
+```
+┌─────────────────┐
+│   Prometheus    │
+│  (告警规则引擎)   │
+└────────┬────────┘
+         │ Pull (轮询)
+         │ GET /api/v1/alerts
+         ▼
+┌─────────────────┐
+│  Alert Webhook  │
+│   （自定义服务）  │
+└────────┬────────┘
+         │ Push
+         │ POST /v1/integrations/prometheus/alerts
+         ▼
+┌─────────────────┐
+│   监控告警模块    │
+│  (告警处理中心)   │
+└─────────────────┘
 ```
-- 说明：
-  - `url`：监控告警模块的 webhook 地址（按实际部署修改主机与端口）
-  - `send_resolved`：为 `true` 时，告警恢复也会通知
+
+### 实现细节
+- **轮询机制**：
+  - 每 10 秒从 Prometheus 拉取一次活跃告警
+  - 通过 `GET http://prometheus:9090/api/v1/alerts` 获取告警列表
+  - 维护告警状态缓存，避免重复推送
+
+- **告警格式转换**：
+  - 将 Prometheus 告警格式转换为监控告警模块所需格式
+  - 包含告警名称、标签、严重程度、开始时间等信息
+  - 支持告警恢复状态通知
+
+- **推送目标**：
+  - URL: `http://alert-module:8080/v1/integrations/prometheus/alerts`
+  - Method: POST
+  - Content-Type: application/json
 
 ## 支持的服务
 
diff --git a/internal/prometheus_adapter/client/prometheus_client.go b/internal/prometheus_adapter/client/prometheus_client.go
index 7bf0a3a..a42b58b 100644
--- a/internal/prometheus_adapter/client/prometheus_client.go
+++ b/internal/prometheus_adapter/client/prometheus_client.go
@@ -2,18 +2,24 @@ package client
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
+	"io"
+	"net/http"
 	"time"
 
 	"github.com/prometheus/client_golang/api"
 	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
 	promModel "github.com/prometheus/common/model"
 	"github.com/qiniu/zeroops/internal/prometheus_adapter/model"
+	"github.com/rs/zerolog/log"
 )
 
 // PrometheusClient Prometheus 客户端
 type PrometheusClient struct {
-	api v1.API
+	api        v1.API
+	httpClient *http.Client
+	baseURL    string
 }
 
 // NewPrometheusClient 创建新的 Prometheus 客户端
@@ -26,7 +32,9 @@ func NewPrometheusClient(address string) (*PrometheusClient, error) {
 	}
 
 	return &PrometheusClient{
-		api: v1.NewAPI(client),
+		api:        v1.NewAPI(client),
+		httpClient: &http.Client{Timeout: 10 * time.Second},
+		baseURL:    address,
 	}, nil
 }
 
@@ -142,3 +150,35 @@ func BuildQuery(service, metric, version string) string {
 	query += "}"
 	return query
 }
+
+// GetAlerts fetches the current alerts from the Prometheus /api/v1/alerts endpoint.
+// It fails on transport errors, non-200 responses, undecodable bodies and non-"success" payloads.
+func (c *PrometheusClient) GetAlerts(ctx context.Context) (*model.PrometheusAlertsResponse, error) {
+	// Drop trailing slashes so a base URL such as "http://host:9090/" does not yield "//api/...".
+	base := c.baseURL
+	for len(base) > 0 && base[len(base)-1] == '/' {
+		base = base[:len(base)-1]
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, base+"/api/v1/alerts", nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to query alerts: %w", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(resp.Body)
+		return nil, fmt.Errorf("prometheus returned status %d: %s", resp.StatusCode, string(body))
+	}
+	var alertsResp model.PrometheusAlertsResponse
+	if err := json.NewDecoder(resp.Body).Decode(&alertsResp); err != nil {
+		return nil, fmt.Errorf("failed to decode response: %w", err)
+	}
+	if alertsResp.Status != "success" {
+		return nil, fmt.Errorf("prometheus alerts query returned status %q", alertsResp.Status)
+	}
+	log.Debug().Int("alert_count", len(alertsResp.Data.Alerts)).Msg("Retrieved alerts from Prometheus")
+	return &alertsResp, nil
+}
diff --git a/internal/prometheus_adapter/config/config.go b/internal/prometheus_adapter/config/config.go
new file mode 100644
index 0000000..38f6998
--- /dev/null
+++ b/internal/prometheus_adapter/config/config.go
@@ -0,0 +1,165 @@
+package config
+
+import (
+	"fmt"
+	"os"
+	"time"
+
+	"github.com/rs/zerolog/log"
+	"gopkg.in/yaml.v3"
+)
+
+// PrometheusAdapterConfig is the top-level Prometheus Adapter configuration, one field per YAML section.
+type PrometheusAdapterConfig struct {
+	Prometheus   PrometheusConfig   `yaml:"prometheus"`
+	AlertWebhook AlertWebhookConfig `yaml:"alert_webhook"`
+	AlertRules   AlertRulesConfig   `yaml:"alert_rules"`
+	Server       ServerConfig       `yaml:"server"`
+}
+
+// PrometheusConfig holds the Prometheus server settings.
+type PrometheusConfig struct {
+	Address       string `yaml:"address"`        // Prometheus address
+	ContainerName string `yaml:"container_name"` // container name
+}
+
+// AlertWebhookConfig holds the alert webhook settings.
+type AlertWebhookConfig struct {
+	URL             string `yaml:"url"`              // Webhook URL
+	PollingInterval string `yaml:"polling_interval"` // polling interval (a time.ParseDuration string, e.g. "10s")
+}
+
+// AlertRulesConfig holds the alert rule storage settings.
+type AlertRulesConfig struct {
+	LocalFile          string `yaml:"local_file"`           // local rules file
+	PrometheusRulesDir string `yaml:"prometheus_rules_dir"` // Prometheus rules directory
+}
+
+// ServerConfig holds the HTTP server settings.
+type ServerConfig struct {
+	BindAddr string `yaml:"bind_addr"` // listen address
+}
+
+// LoadConfig loads the adapter configuration from the YAML file at configPath.
+// An empty configPath selects the built-in default path; if the file does not
+// exist, the built-in defaults are used. Environment overrides and validation
+// are applied in both cases, so env-only deployments are configured correctly.
+func LoadConfig(configPath string) (*PrometheusAdapterConfig, error) {
+	if configPath == "" {
+		configPath = "internal/prometheus_adapter/config/prometheus_adapter.yml"
+	}
+
+	var config *PrometheusAdapterConfig
+	data, readErr := os.ReadFile(configPath)
+	switch {
+	case readErr == nil:
+		config = &PrometheusAdapterConfig{}
+		if err := yaml.Unmarshal(data, config); err != nil {
+			return nil, fmt.Errorf("failed to parse config file: %w", err)
+		}
+	case os.IsNotExist(readErr):
+		// A missing file is not fatal: start from the defaults and fall through to
+		// the override/validation steps instead of returning early.
+		log.Warn().Msg("Config file not found, using default configuration")
+		config = getDefaultConfig()
+	default:
+		return nil, fmt.Errorf("failed to read config file: %w", readErr)
+	}
+
+	// Environment variables take precedence over file values and defaults.
+	applyEnvOverrides(config)
+
+	if err := validateConfig(config); err != nil {
+		return nil, fmt.Errorf("invalid configuration: %w", err)
+	}
+
+	if readErr == nil {
+		log.Info().Str("config_file", configPath).Msg("Configuration loaded successfully")
+	}
+	return config, nil
+}
+
+// getDefaultConfig returns the built-in default configuration.
+func getDefaultConfig() *PrometheusAdapterConfig {
+	return &PrometheusAdapterConfig{
+		Prometheus: PrometheusConfig{
+			Address:       "http://10.210.10.33:9090",
+			ContainerName: "mock-s3-prometheus",
+		},
+		AlertWebhook: AlertWebhookConfig{
+			URL:             "http://alert-module:8080/v1/integrations/prometheus/alerts",
+			PollingInterval: "10s",
+		},
+		AlertRules: AlertRulesConfig{
+			LocalFile:          "../rules/alert_rules.yml",
+			PrometheusRulesDir: "/etc/prometheus/rules/",
+		},
+		Server: ServerConfig{
+			BindAddr: "0.0.0.0:9999",
+		},
+	}
+}
+
+// applyEnvOverrides overwrites config fields with the values of non-empty environment variables.
+func applyEnvOverrides(config *PrometheusAdapterConfig) {
+	// Prometheus settings
+	if addr := os.Getenv("PROMETHEUS_ADDRESS"); addr != "" {
+		config.Prometheus.Address = addr
+	}
+	if container := os.Getenv("PROMETHEUS_CONTAINER"); container != "" {
+		config.Prometheus.ContainerName = container
+	}
+
+	// Alert webhook settings
+	if url := os.Getenv("ALERT_WEBHOOK_URL"); url != "" {
+		config.AlertWebhook.URL = url
+	}
+	if interval := os.Getenv("ALERT_POLLING_INTERVAL"); interval != "" {
+		config.AlertWebhook.PollingInterval = interval
+	}
+
+	// Server settings
+	if bindAddr := os.Getenv("SERVER_BIND_ADDR"); bindAddr != "" {
+		config.Server.BindAddr = bindAddr
+	}
+}
+
+// validateConfig returns an error if a required field is empty or the polling interval does not parse.
+func validateConfig(config *PrometheusAdapterConfig) error {
+	// The Prometheus address is required.
+	if config.Prometheus.Address == "" {
+		return fmt.Errorf("prometheus address is required")
+	}
+
+	// The polling interval is optional, but must be a valid duration string when set.
+	if config.AlertWebhook.PollingInterval != "" {
+		if _, err := time.ParseDuration(config.AlertWebhook.PollingInterval); err != nil {
+			return fmt.Errorf("invalid polling interval: %w", err)
+		}
+	}
+
+	// The server bind address is required.
+	if config.Server.BindAddr == "" {
+		return fmt.Errorf("server bind address is required")
+	}
+
+	return nil
+}
+
+// GetPollingInterval 获取轮询间隔的 Duration
+func (c *AlertWebhookConfig) GetPollingInterval() time.Duration {
+	if c.PollingInterval == "" {
+		return 10 * time.Second
+	}
+
+	duration, err := time.ParseDuration(c.PollingInterval)
+	if err != nil {
+		log.Warn().
+			Err(err).
+			Str("interval", c.PollingInterval).
+			Msg("Invalid polling interval, using default")
+		return 10 * time.Second
+	}
+
+	return duration
+}
diff --git a/internal/prometheus_adapter/config/prometheus_adapter.yml b/internal/prometheus_adapter/config/prometheus_adapter.yml
new file mode 100644
index 0000000..a3eab43
--- /dev/null
+++ b/internal/prometheus_adapter/config/prometheus_adapter.yml
@@ -0,0 +1,27 @@
+# Prometheus Adapter 配置文件
+
+# Prometheus 服务配置
+prometheus:
+  # Prometheus 服务地址
+  address: "http://10.210.10.33:9090"
+  # 容器名称（用于规则同步）
+  container_name: "mock-s3-prometheus"
+
+# 告警 Webhook 服务配置
+alert_webhook:
+  # 监控告警模块地址
+  url: "http://alert-module:8080/v1/integrations/prometheus/alerts"
+  # 轮询间隔
+  polling_interval: "10s"
+
+# 告警规则管理配置
+alert_rules:
+  # 本地规则文件路径
+  local_file: "../rules/alert_rules.yml"
+  # Prometheus 规则目录
+  prometheus_rules_dir: "/etc/prometheus/rules/"
+
+# 服务器配置
+server:
+  # 服务监听地址
+  bind_addr: "0.0.0.0:9999"
\ No newline at end of file
diff --git a/internal/prometheus_adapter/model/prometheus_alert.go b/internal/prometheus_adapter/model/prometheus_alert.go
new file mode 100644
index 0000000..9dab331
--- /dev/null
+++ b/internal/prometheus_adapter/model/prometheus_alert.go
@@ -0,0 +1,49 @@
+package model
+
+import (
+	"time"
+)
+
+// PrometheusAlert is a single alert entry in the Prometheus alerts API response.
+type PrometheusAlert struct {
+	Labels      map[string]string `json:"labels"`
+	Annotations map[string]string `json:"annotations"`
+	State       string            `json:"state"` // pending, firing
+	ActiveAt    time.Time         `json:"activeAt"`
+	Value       string            `json:"value"` // value at the time the alert was triggered
+}
+
+// PrometheusAlertsResponse is the response body of Prometheus /api/v1/alerts.
+type PrometheusAlertsResponse struct {
+	Status string `json:"status"`
+	Data   struct {
+		Alerts []PrometheusAlert `json:"alerts"`
+	} `json:"data"`
+}
+
+// AlertmanagerWebhookAlert is a single alert in the webhook payload.
+type AlertmanagerWebhookAlert struct {
+	Status       string            `json:"status"`       // "firing" or "resolved"
+	Labels       map[string]string `json:"labels"`       // includes alertname, service, severity, idc, service_version, etc.
+	Annotations  map[string]string `json:"annotations"`  // includes summary, description
+	StartsAt     string            `json:"startsAt"`     // RFC3339 timestamp
+	EndsAt       string            `json:"endsAt"`       // RFC3339 timestamp
+	GeneratorURL string            `json:"generatorURL"` // Prometheus query link
+	Fingerprint  string            `json:"fingerprint"`  // unique alert identifier
+}
+
+// AlertmanagerWebhookRequest is the request format sent to the monitoring/alerting module.
+type AlertmanagerWebhookRequest struct {
+	Receiver     string                     `json:"receiver"` // receiver name, e.g. "our-webhook"
+	Status       string                     `json:"status"`   // "firing" or "resolved"
+	Alerts       []AlertmanagerWebhookAlert `json:"alerts"`
+	GroupLabels  map[string]string          `json:"groupLabels"`  // grouping labels
+	CommonLabels map[string]string          `json:"commonLabels"` // common labels
+	Version      string                     `json:"version"`      // payload version, e.g. "4"
+}
+
+// AlertWebhookResponse is the response to an alert push.
+type AlertWebhookResponse struct {
+	Status  string `json:"status"`
+	Message string `json:"message"`
+}
diff --git a/internal/prometheus_adapter/server.go b/internal/prometheus_adapter/server.go
index 35f5b08..81a2824 100644
--- a/internal/prometheus_adapter/server.go
+++ b/internal/prometheus_adapter/server.go
@@ -3,35 +3,37 @@ package prometheusadapter
 import (
 	"context"
 	"fmt"
-	"os"
 
 	"github.com/fox-gonic/fox"
 	"github.com/qiniu/zeroops/internal/config"
 	"github.com/qiniu/zeroops/internal/prometheus_adapter/api"
 	"github.com/qiniu/zeroops/internal/prometheus_adapter/client"
+	promconfig "github.com/qiniu/zeroops/internal/prometheus_adapter/config"
 	"github.com/qiniu/zeroops/internal/prometheus_adapter/service"
 	"github.com/rs/zerolog/log"
 )
 
 // PrometheusAdapterServer Prometheus Adapter 服务器
 type PrometheusAdapterServer struct {
-	config        *config.Config
-	promClient    *client.PrometheusClient
-	metricService *service.MetricService
-	alertService  *service.AlertService
-	api           *api.Api
+	config              *config.Config
+	promConfig          *promconfig.PrometheusAdapterConfig
+	promClient          *client.PrometheusClient
+	metricService       *service.MetricService
+	alertService        *service.AlertService
+	alertWebhookService *service.AlertWebhookService
+	api                 *api.Api
 }
 
 // NewPrometheusAdapterServer 创建新的 Prometheus Adapter 服务器
 func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, error) {
-	// 使用环境变量或默认值获取 Prometheus 地址
-	prometheusAddr := os.Getenv("PROMETHEUS_ADDRESS")
-	if prometheusAddr == "" {
-		prometheusAddr = "http://10.210.10.33:9090/"
+	// 加载 Prometheus Adapter 配置
+	promConfig, err := promconfig.LoadConfig("")
+	if err != nil {
+		return nil, fmt.Errorf("failed to load prometheus adapter config: %w", err)
 	}
 
 	// 创建 Prometheus 客户端
-	promClient, err := client.NewPrometheusClient(prometheusAddr)
+	promClient, err := client.NewPrometheusClient(promConfig.Prometheus.Address)
 	if err != nil {
 		return nil, fmt.Errorf("failed to create prometheus client: %w", err)
 	}
@@ -40,16 +42,27 @@ func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, e
 	metricService := service.NewMetricService(promClient)
 
 	// 创建告警服务
-	alertService := service.NewAlertService(promClient)
+	alertService := service.NewAlertService(promClient, promConfig)
+
+	// 创建告警 Webhook 服务
+	alertWebhookService := service.NewAlertWebhookService(promClient, promConfig)
 
 	server := &PrometheusAdapterServer{
-		config:        cfg,
-		promClient:    promClient,
-		metricService: metricService,
-		alertService:  alertService,
+		config:              cfg,
+		promConfig:          promConfig,
+		promClient:          promClient,
+		metricService:       metricService,
+		alertService:        alertService,
+		alertWebhookService: alertWebhookService,
+	}
+
+	// 启动告警 Webhook 服务
+	if err := alertWebhookService.Start(); err != nil {
+		log.Error().Err(err).Msg("Failed to start alert webhook service")
+		// 不返回错误，允许服务继续运行
 	}
 
-	log.Info().Str("prometheus_address", prometheusAddr).Msg("Prometheus Adapter initialized successfully")
+	log.Info().Str("prometheus_address", promConfig.Prometheus.Address).Msg("Prometheus Adapter initialized successfully")
 	return server, nil
 }
 
@@ -68,6 +81,11 @@ func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error {
 func (s *PrometheusAdapterServer) Close(ctx context.Context) error {
 	log.Info().Msg("Starting shutdown...")
 
+	// 停止告警 Webhook 服务
+	if s.alertWebhookService != nil {
+		s.alertWebhookService.Stop()
+	}
+
 	// 调用 alertService 的 Shutdown 方法保存规则
 	if s.alertService != nil {
 		if err := s.alertService.Shutdown(); err != nil {
diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go
index a254ae1..f74d4a3 100644
--- a/internal/prometheus_adapter/service/alert_service.go
+++ b/internal/prometheus_adapter/service/alert_service.go
@@ -11,6 +11,7 @@ import (
 	"strings"
 
 	"github.com/qiniu/zeroops/internal/prometheus_adapter/client"
+	promconfig "github.com/qiniu/zeroops/internal/prometheus_adapter/config"
 	"github.com/qiniu/zeroops/internal/prometheus_adapter/model"
 	"github.com/rs/zerolog/log"
 	"gopkg.in/yaml.v3"
@@ -19,6 +20,7 @@ import (
 // AlertService 告警服务 - 仅负责与Prometheus交互，不存储规则
 type AlertService struct {
 	promClient *client.PrometheusClient
+	config     *promconfig.PrometheusAdapterConfig
 	// 内存中缓存当前规则，用于增量更新
 	currentRules     []model.AlertRule
 	currentRuleMetas []model.AlertRuleMeta
@@ -27,12 +29,13 @@ type AlertService struct {
 }
 
 // NewAlertService 创建告警服务
-func NewAlertService(promClient *client.PrometheusClient) *AlertService {
+func NewAlertService(promClient *client.PrometheusClient, config *promconfig.PrometheusAdapterConfig) *AlertService {
 	service := &AlertService{
 		promClient:       promClient,
+		config:           config,
 		currentRules:     []model.AlertRule{},
 		currentRuleMetas: []model.AlertRuleMeta{},
-		localRulesPath:   "../rules/alert_rules.yml",
+		localRulesPath:   config.AlertRules.LocalFile,
 	}
 
 	// 启动时尝试加载本地规则
@@ -301,12 +304,9 @@ func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas [
 
 	// 为每个元信息生成Prometheus规则
 	for _, meta := range ruleMetas {
-		// 查找对应的规则模板
-		var rule *model.AlertRule
-
 		// 通过 alert_name 直接查找对应的规则模板
 		// AlertRuleMeta.alert_name 关联 AlertRule.name
-		rule = ruleMap[meta.AlertName]
+		var rule *model.AlertRule = ruleMap[meta.AlertName]
 
 		if rule == nil {
 			log.Warn().
@@ -450,10 +450,7 @@ func (s *AlertService) writeRulesFile(rules *model.PrometheusRuleFile) error {
 	}
 
 	// 获取容器名称
-	containerName := os.Getenv("PROMETHEUS_CONTAINER")
-	if containerName == "" {
-		containerName = "mock-s3-prometheus"
-	}
+	containerName := s.config.Prometheus.ContainerName
 
 	// 直接写入到容器内的规则目录
 	// 使用docker exec和echo命令写入文件
@@ -530,10 +527,7 @@ func (s *AlertService) syncRuleFileToContainer(filePath string) error {
 
 // reloadPrometheus 重新加载Prometheus配置
 func (s *AlertService) reloadPrometheus() error {
-	prometheusURL := os.Getenv("PROMETHEUS_ADDRESS")
-	if prometheusURL == "" {
-		prometheusURL = "http://10.210.10.33:9090"
-	}
+	prometheusURL := s.config.Prometheus.Address
 
 	reloadURL := fmt.Sprintf("%s/-/reload", strings.TrimSuffix(prometheusURL, "/"))
 
@@ -544,7 +538,7 @@ func (s *AlertService) reloadPrometheus() error {
 	defer resp.Body.Close()
 
 	if resp.StatusCode != http.StatusOK {
-		return fmt.Errorf("Prometheus reload failed with status: %d", resp.StatusCode)
+		return fmt.Errorf("prometheus reload failed with status: %d", resp.StatusCode)
 	}
 
 	log.Info().Msg("Prometheus configuration reloaded")
diff --git a/internal/prometheus_adapter/service/alert_webhook_service.go b/internal/prometheus_adapter/service/alert_webhook_service.go
new file mode 100644
index 0000000..ac211ac
--- /dev/null
+++ b/internal/prometheus_adapter/service/alert_webhook_service.go
@@ -0,0 +1,255 @@
+package service
+
+import (
+	"bytes"
+	"context"
+	"crypto/md5"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"sync"
+	"time"
+
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/client"
+	promconfig "github.com/qiniu/zeroops/internal/prometheus_adapter/config"
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/model"
+	"github.com/rs/zerolog/log"
+)
+
+// AlertWebhookService polls Prometheus for alerts and forwards them to a webhook URL.
+type AlertWebhookService struct {
+	promClient      *client.PrometheusClient
+	config          *promconfig.PrometheusAdapterConfig
+	webhookURL      string
+	pollingInterval time.Duration
+	httpClient      *http.Client
+	alertCache      map[string]*model.PrometheusAlert // alerts seen in the previous poll, keyed by fingerprint
+	cacheMutex      sync.RWMutex
+	stopCh          chan struct{}
+	running         bool
+	runningMutex    sync.Mutex
+}
+
+// NewAlertWebhookService creates an alert webhook service; the webhook URL and polling interval come from config.
+func NewAlertWebhookService(promClient *client.PrometheusClient, config *promconfig.PrometheusAdapterConfig) *AlertWebhookService {
+	return &AlertWebhookService{
+		promClient:      promClient,
+		config:          config,
+		webhookURL:      config.AlertWebhook.URL,
+		pollingInterval: config.AlertWebhook.GetPollingInterval(),
+		httpClient:      &http.Client{Timeout: 30 * time.Second},
+		alertCache:      make(map[string]*model.PrometheusAlert),
+		stopCh:          make(chan struct{}),
+	}
+}
+
+// Start launches the alert polling goroutine; it returns an error if the service is already running.
+func (s *AlertWebhookService) Start() error {
+	s.runningMutex.Lock()
+	defer s.runningMutex.Unlock()
+
+	if s.running {
+		return fmt.Errorf("alert webhook service already running")
+	}
+
+	s.running = true
+	go s.pollAlerts()
+
+	log.Info().
+		Str("webhook_url", s.webhookURL).
+		Dur("interval", s.pollingInterval).
+		Msg("Alert webhook service started")
+
+	return nil
+}
+
+// Stop stops the alert polling service by closing stopCh; it does nothing if the service is not running.
+func (s *AlertWebhookService) Stop() {
+	s.runningMutex.Lock()
+	defer s.runningMutex.Unlock()
+
+	if !s.running {
+		return
+	}
+
+	close(s.stopCh)
+	s.running = false
+
+	log.Info().Msg("Alert webhook service stopped")
+}
+
+// pollAlerts fetches alerts once immediately, then once per ticker tick, until stopCh is closed.
+func (s *AlertWebhookService) pollAlerts() {
+	ticker := time.NewTicker(s.pollingInterval)
+	defer ticker.Stop()
+
+	// Run once immediately instead of waiting for the first tick.
+	s.fetchAndProcessAlerts()
+
+	for {
+		select {
+		case <-ticker.C:
+			s.fetchAndProcessAlerts()
+		case <-s.stopCh:
+			return
+		}
+	}
+}
+
+// fetchAndProcessAlerts polls Prometheus once, diffs the result against the
+// cache of the previous poll, and pushes newly firing and resolved alerts.
+func (s *AlertWebhookService) fetchAndProcessAlerts() {
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	// Fetch the current alerts from Prometheus.
+	alertsResp, err := s.promClient.GetAlerts(ctx)
+	if err != nil {
+		log.Error().Err(err).Msg("Failed to fetch alerts from Prometheus")
+		return
+	}
+
+	firingAlerts := []model.PrometheusAlert{}
+	resolvedAlerts := []model.PrometheusAlert{}
+	currentAlerts := make(map[string]*model.PrometheusAlert, len(alertsResp.Data.Alerts))
+
+	s.cacheMutex.Lock()
+	for i := range alertsResp.Data.Alerts {
+		// Point at the slice element, not at the loop variable: before Go 1.22 the
+		// loop variable is shared, so every cache entry would alias the last alert.
+		alert := &alertsResp.Data.Alerts[i]
+		fingerprint := s.generateFingerprint(*alert)
+		currentAlerts[fingerprint] = alert
+
+		// Report an alert when it is new or its state changed, and it is now firing.
+		cachedAlert, exists := s.alertCache[fingerprint]
+		if (!exists || cachedAlert.State != alert.State) && alert.State == "firing" {
+			firingAlerts = append(firingAlerts, *alert)
+		}
+	}
+
+	// A cached alert that is no longer reported has resolved. Only alerts that were
+	// firing are announced; a pending alert that vanished was never sent as firing.
+	for fingerprint, cachedAlert := range s.alertCache {
+		if _, exists := currentAlerts[fingerprint]; exists || cachedAlert.State != "firing" {
+			continue
+		}
+		resolvedAlert := *cachedAlert
+		resolvedAlert.State = "resolved"
+		resolvedAlerts = append(resolvedAlerts, resolvedAlert)
+	}
+
+	// Replace the cache, then release the lock before doing network I/O.
+	s.alertCache = currentAlerts
+	s.cacheMutex.Unlock()
+
+	// Push notifications; failures are logged and not retried.
+	if len(firingAlerts) > 0 {
+		if err := s.sendAlerts(firingAlerts, "firing"); err != nil {
+			log.Error().Err(err).Msg("Failed to send firing alerts")
+		}
+	}
+
+	if len(resolvedAlerts) > 0 {
+		if err := s.sendAlerts(resolvedAlerts, "resolved"); err != nil {
+			log.Error().Err(err).Msg("Failed to send resolved alerts")
+		}
+	}
+}
+
+// sendAlerts converts alerts to the webhook payload, POSTs it as JSON to s.webhookURL, and errors on any non-200 reply.
+func (s *AlertWebhookService) sendAlerts(alerts []model.PrometheusAlert, status string) error {
+	webhookAlerts := []model.AlertmanagerWebhookAlert{}
+
+	// Labels collected here feed both groupLabels and commonLabels.
+	commonLabels := map[string]string{}
+	firstAlert := true
+
+	for _, alert := range alerts {
+		// Compute the fingerprint.
+		fingerprint := s.generateFingerprint(alert)
+
+		// Format timestamps as RFC3339; endsAt stays at the zero time unless the batch is "resolved".
+		startsAt := alert.ActiveAt.Format(time.RFC3339)
+		endsAt := "0001-01-01T00:00:00Z"
+		if status == "resolved" {
+			endsAt = time.Now().Format(time.RFC3339)
+		}
+
+		// Build the GeneratorURL. NOTE(review): host is hard-coded and the alert name is not URL-escaped.
+		generatorURL := fmt.Sprintf("http://prometheus/graph?g0.expr=%s", alert.Labels["alertname"])
+
+		webhookAlert := model.AlertmanagerWebhookAlert{
+			Status:       status,
+			Labels:       alert.Labels,
+			Annotations:  alert.Annotations,
+			StartsAt:     startsAt,
+			EndsAt:       endsAt,
+			GeneratorURL: generatorURL,
+			Fingerprint:  fingerprint,
+		}
+		webhookAlerts = append(webhookAlerts, webhookAlert)
+
+		// Collect common labels (the first alert's labels are used as-is, not an intersection).
+		if firstAlert {
+			for k, v := range alert.Labels {
+				commonLabels[k] = v
+			}
+			firstAlert = false
+		}
+	}
+
+	groupLabels := map[string]string{}
+	if alertName, ok := commonLabels["alertname"]; ok {
+		groupLabels["alertname"] = alertName
+	}
+
+	// Build the request payload.
+	req := model.AlertmanagerWebhookRequest{
+		Receiver:     "prometheus_adapter",
+		Status:       status,
+		Alerts:       webhookAlerts,
+		GroupLabels:  groupLabels,
+		CommonLabels: commonLabels,
+		Version:      "1",
+	}
+
+	// Serialize and send the request.
+	jsonData, err := json.Marshal(req)
+	if err != nil {
+		return fmt.Errorf("failed to marshal request: %w", err)
+	}
+
+	resp, err := s.httpClient.Post(s.webhookURL, "application/json", bytes.NewBuffer(jsonData))
+	if err != nil {
+		return fmt.Errorf("failed to send webhook: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("webhook returned status %d: %s", resp.StatusCode, string(body))
+	}
+
+	log.Info().
+		Str("status", status).
+		Int("alert_count", len(alerts)).
+		Str("webhook_url", s.webhookURL).
+		Msg("Successfully sent alerts to webhook")
+
+	return nil
+}
+
+// generateFingerprint returns a stable identifier for an alert: the first 16 hex
+// characters of the MD5 of its label set. The labels are serialized with
+// json.Marshal, which emits map keys in sorted order; concatenating them while
+// ranging over the map would give a different string (and fingerprint) per call.
+func (s *AlertWebhookService) generateFingerprint(alert model.PrometheusAlert) string {
+	// Marshalling a map[string]string cannot fail, so the error is ignored.
+	encoded, _ := json.Marshal(alert.Labels)
+
+	h := md5.New()
+	h.Write(encoded)
+	return fmt.Sprintf("%x", h.Sum(nil))[:16]
+}
diff --git a/scripts/prometheus_adapter/build.sh b/scripts/prometheus_adapter/build.sh
index dbcbcf1..d7b5025 100755
--- a/scripts/prometheus_adapter/build.sh
+++ b/scripts/prometheus_adapter/build.sh
@@ -50,6 +50,7 @@ fi
 # 创建构建目录
 log_info "创建构建目录..."
 mkdir -p "$BUILD_DIR/bin"
+mkdir -p "$BUILD_DIR/config"
 mkdir -p "$BUILD_DIR/docs"
 mkdir -p "$BUILD_DIR/scripts"
 mkdir -p "$BUILD_DIR/rules"
@@ -67,6 +68,15 @@ if [ $? -ne 0 ]; then
     exit 1
 fi
 
+# 复制配置文件
+log_info "复制配置文件..."
+if [ -f "internal/${APP_NAME}/config/prometheus_adapter.yml" ]; then
+    cp "internal/${APP_NAME}/config/prometheus_adapter.yml" "$BUILD_DIR/config/"
+    log_info "已复制配置文件到 $BUILD_DIR/config/"
+else
+    log_warn "未找到配置文件，使用默认配置"
+fi
+
 # 复制文档
 log_info "复制文档..."
 if [ -f "docs/${APP_NAME}/README.md" ]; then
@@ -104,14 +114,10 @@ cat > "$BUILD_DIR/start.sh" << 'EOF'
 
 # Prometheus Adapter 启动脚本
 
-# 默认配置
-PROMETHEUS_URL=${PROMETHEUS_URL:-"http://localhost:9090"}
-PORT=${PORT:-8080}
-LOG_LEVEL=${LOG_LEVEL:-"info"}
-
 # 获取脚本所在目录
 SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
 BIN_PATH="$SCRIPT_DIR/bin/prometheus_adapter"
+CONFIG_FILE="$SCRIPT_DIR/config/prometheus_adapter.yml"
 
 # 检查二进制文件
 if [ ! -f "$BIN_PATH" ]; then
@@ -119,19 +125,26 @@ if [ ! -f "$BIN_PATH" ]; then
     exit 1
 fi
 
-# 启动参数
-ARGS=""
-ARGS="$ARGS --prometheus-url=$PROMETHEUS_URL"
-ARGS="$ARGS --port=$PORT"
-ARGS="$ARGS --log-level=$LOG_LEVEL"
+# 检查配置文件
+if [ -f "$CONFIG_FILE" ]; then
+    echo "使用配置文件: $CONFIG_FILE"
+else
+    echo "警告: 找不到配置文件 $CONFIG_FILE，将使用默认配置"
+fi
+
+# 环境变量（可选，用于覆盖配置文件）
+# export PROMETHEUS_ADDRESS="http://localhost:9090"
+# export ALERT_WEBHOOK_URL="http://alert-module:8080/v1/integrations/prometheus/alerts"
+# export ALERT_POLLING_INTERVAL="10s"
+# export SERVER_BIND_ADDR="0.0.0.0:9999"
 
 echo "启动 Prometheus Adapter..."
-echo "Prometheus URL: $PROMETHEUS_URL"
-echo "监听端口: $PORT"
-echo "日志级别: $LOG_LEVEL"
+
+# Switch to the script directory (the deploy root, not bin/) so the program can resolve config/prometheus_adapter.yml relative to its working directory; abort if the cd fails
+cd "$SCRIPT_DIR" || exit 1
 
 # 启动服务
-exec "$BIN_PATH" $ARGS
+exec "$BIN_PATH"
 EOF
 chmod +x "$BUILD_DIR/start.sh"
 
diff --git a/scripts/prometheus_adapter/deploy.sh b/scripts/prometheus_adapter/deploy.sh
index 01ccdf8..ad68564 100755
--- a/scripts/prometheus_adapter/deploy.sh
+++ b/scripts/prometheus_adapter/deploy.sh
@@ -235,6 +235,9 @@ if [ -w "$DEPLOY_DIR" ]; then
     chmod +x "$DEPLOY_DIR/start.sh"
     chmod +x "$DEPLOY_DIR/stop.sh"
     [ -f "$DEPLOY_DIR/scripts/test_alert_update.sh" ] && chmod +x "$DEPLOY_DIR/scripts/test_alert_update.sh"
+    # 确保 config 目录和配置文件可读
+    chmod 755 "$DEPLOY_DIR/config"
+    [ -f "$DEPLOY_DIR/config/prometheus_adapter.yml" ] && chmod 644 "$DEPLOY_DIR/config/prometheus_adapter.yml"
     # 确保 rules 目录可写
     chmod 755 "$DEPLOY_DIR/rules"
     [ -f "$DEPLOY_DIR/rules/alert_rules.yml" ] && chmod 644 "$DEPLOY_DIR/rules/alert_rules.yml"
@@ -243,11 +246,16 @@ else
     sudo chmod +x "$DEPLOY_DIR/start.sh"
     sudo chmod +x "$DEPLOY_DIR/stop.sh"
     [ -f "$DEPLOY_DIR/scripts/test_alert_update.sh" ] && sudo chmod +x "$DEPLOY_DIR/scripts/test_alert_update.sh"
+    # 确保 config 目录和配置文件可读
+    sudo chmod 755 "$DEPLOY_DIR/config"
+    [ -f "$DEPLOY_DIR/config/prometheus_adapter.yml" ] && sudo chmod 644 "$DEPLOY_DIR/config/prometheus_adapter.yml"
     # 确保 rules 目录可写
     sudo chmod 755 "$DEPLOY_DIR/rules"
     [ -f "$DEPLOY_DIR/rules/alert_rules.yml" ] && sudo chmod 644 "$DEPLOY_DIR/rules/alert_rules.yml"
     # 设置 rules 目录的所有者为服务运行用户
     sudo chown -R qboxserver:qboxserver "$DEPLOY_DIR/rules"
+    # Let the service user own the config file as well; the file is optional in the build output, so skip the chown when it is absent
+    [ -f "$DEPLOY_DIR/config/prometheus_adapter.yml" ] && sudo chown qboxserver:qboxserver "$DEPLOY_DIR/config/prometheus_adapter.yml"
 fi
 
 # 清理临时目录
@@ -285,10 +293,12 @@ After=network.target
 Type=simple
 User=qboxserver
 Group=qboxserver
-WorkingDirectory=$DEPLOY_DIR/bin
-Environment="PROMETHEUS_URL=http://localhost:9090"
-Environment="PORT=8080"
-Environment="LOG_LEVEL=info"
+WorkingDirectory=$DEPLOY_DIR
+# 可选：通过环境变量覆盖配置
+#Environment="PROMETHEUS_ADDRESS=http://localhost:9090"
+#Environment="ALERT_WEBHOOK_URL=http://alert-module:8080/v1/integrations/prometheus/alerts"
+#Environment="ALERT_POLLING_INTERVAL=10s"
+#Environment="SERVER_BIND_ADDR=0.0.0.0:9999"
 ExecStart=$DEPLOY_DIR/bin/prometheus_adapter
 ExecStop=$DEPLOY_DIR/stop.sh
 Restart=on-failure

From b6e76bd7b68d28d82c3735683e10cbbbab14a2ec Mon Sep 17 00:00:00 2001
From: dnj <shdnj@qq.com>
Date: Thu, 25 Sep 2025 16:23:16 +0800
Subject: [PATCH 12/18] =?UTF-8?q?feat(=E9=85=8D=E7=BD=AE):=20=E6=94=AF?=
 =?UTF-8?q?=E6=8C=81=E4=BB=8E=E5=A4=9A=E4=B8=AA=E9=BB=98=E8=AE=A4=E8=B7=AF?=
 =?UTF-8?q?=E5=BE=84=E5=8A=A0=E8=BD=BD=E9=85=8D=E7=BD=AE=E6=96=87=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 internal/prometheus_adapter/config/config.go | 22 ++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/internal/prometheus_adapter/config/config.go b/internal/prometheus_adapter/config/config.go
index 38f6998..39dd023 100644
--- a/internal/prometheus_adapter/config/config.go
+++ b/internal/prometheus_adapter/config/config.go
@@ -42,9 +42,27 @@ type ServerConfig struct {
 
 // LoadConfig 加载配置文件
 func LoadConfig(configPath string) (*PrometheusAdapterConfig, error) {
-	// 如果没有指定配置文件，使用默认路径
+	// 如果没有指定配置文件，尝试多个默认路径
 	if configPath == "" {
-		configPath = "internal/prometheus_adapter/config/prometheus_adapter.yml"
+		// 尝试的路径列表（按优先级）
+		possiblePaths := []string{
+			"config/prometheus_adapter.yml",                             // 部署环境：相对于工作目录
+			"internal/prometheus_adapter/config/prometheus_adapter.yml", // 开发环境：源码目录
+			"./prometheus_adapter.yml",                                  // 当前目录
+		}
+
+		for _, path := range possiblePaths {
+			if _, err := os.Stat(path); err == nil {
+				configPath = path
+				log.Info().Str("path", path).Msg("Found config file")
+				break
+			}
+		}
+
+		// 如果都找不到，使用第一个路径（稍后会返回默认配置）
+		if configPath == "" {
+			configPath = possiblePaths[0]
+		}
 	}
 
 	// 读取配置文件

From 4bc9b158c6908798d918950a333b649633a79f83 Mon Sep 17 00:00:00 2001
From: dnj <shdnj@qq.com>
Date: Thu, 25 Sep 2025 18:07:57 +0800
Subject: [PATCH 13/18] =?UTF-8?q?feat(prometheus):=20=E5=AE=9E=E7=8E=B0Ale?=
 =?UTF-8?q?rtmanager=E5=85=BC=E5=AE=B9API=E5=B9=B6=E9=87=8D=E6=9E=84?=
 =?UTF-8?q?=E5=91=8A=E8=AD=A6=E5=A4=84=E7=90=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 新增Alertmanager API v2兼容接口用于接收Prometheus告警
- 重构告警服务架构，替换原有的轮询模式为推送模式
- 添加docker-compose配置支持Prometheus管理API
- 移除过时的AlertWebhookService实现
---
 .../api/alertmanager_api.go                   |  25 ++
 internal/prometheus_adapter/api/api.go        |  18 +-
 internal/prometheus_adapter/model/alert.go    |   9 +
 internal/prometheus_adapter/server.go         |  45 ++--
 .../service/alert_service.go                  |   2 +-
 .../service/alert_webhook_service.go          | 255 ------------------
 .../service/alertmanager_service.go           | 215 +++++++++++++++
 mock/s3/deployments/docker-compose.yml        |   3 +
 .../deployments/observability/prometheus.yml  |   8 +
 9 files changed, 290 insertions(+), 290 deletions(-)
 create mode 100644 internal/prometheus_adapter/api/alertmanager_api.go
 delete mode 100644 internal/prometheus_adapter/service/alert_webhook_service.go
 create mode 100644 internal/prometheus_adapter/service/alertmanager_service.go

diff --git a/internal/prometheus_adapter/api/alertmanager_api.go b/internal/prometheus_adapter/api/alertmanager_api.go
new file mode 100644
index 0000000..a50f2c9
--- /dev/null
+++ b/internal/prometheus_adapter/api/alertmanager_api.go
@@ -0,0 +1,25 @@
+package api
+
+import (
+	"github.com/fox-gonic/fox"
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/service"
+)
+
+// setupAlertmanagerRouters registers the Alertmanager-compatible routes on router.
+// The routes mimic the Alertmanager API so that Prometheus can push alerts to this service; each one delegates to alertmanagerService.
+func (api *Api) setupAlertmanagerRouters(router *fox.Engine, alertmanagerService *service.AlertmanagerService) {
+	// Alertmanager API v2 alert-receiving endpoint
+	router.POST("/api/v2/alerts", func(c *fox.Context) {
+		alertmanagerService.HandleAlertsV2(c.Writer, c.Request)
+	})
+
+	// Health-check endpoint
+	router.GET("/-/healthy", func(c *fox.Context) {
+		alertmanagerService.HandleHealthCheck(c.Writer, c.Request)
+	})
+
+	// Readiness-check endpoint
+	router.GET("/-/ready", func(c *fox.Context) {
+		alertmanagerService.HandleReadyCheck(c.Writer, c.Request)
+	})
+}
diff --git a/internal/prometheus_adapter/api/api.go b/internal/prometheus_adapter/api/api.go
index 2b6e432..351544d 100644
--- a/internal/prometheus_adapter/api/api.go
+++ b/internal/prometheus_adapter/api/api.go
@@ -12,17 +12,19 @@ import (
 
 // Api Prometheus Adapter API
 type Api struct {
-	metricService *service.MetricService
-	alertService  *service.AlertService
-	router        *fox.Engine
+	metricService       *service.MetricService
+	alertService        *service.AlertService
+	alertmanagerService *service.AlertmanagerService
+	router              *fox.Engine
 }
 
 // NewApi 创建新的 API
-func NewApi(metricService *service.MetricService, alertService *service.AlertService, router *fox.Engine) (*Api, error) {
+func NewApi(metricService *service.MetricService, alertService *service.AlertService, alertmanagerService *service.AlertmanagerService, router *fox.Engine) (*Api, error) {
 	api := &Api{
-		metricService: metricService,
-		alertService:  alertService,
-		router:        router,
+		metricService:       metricService,
+		alertService:        alertService,
+		alertmanagerService: alertmanagerService,
+		router:              router,
 	}
 
 	api.setupRouters(router)
@@ -35,6 +37,8 @@ func (api *Api) setupRouters(router *fox.Engine) {
 	api.setupMetricRouters(router)
 	// 告警相关路由
 	api.setupAlertRouters(router)
+	// Alertmanager 兼容路由
+	api.setupAlertmanagerRouters(router, api.alertmanagerService)
 }
 
 // ========== 通用辅助方法 ==========
diff --git a/internal/prometheus_adapter/model/alert.go b/internal/prometheus_adapter/model/alert.go
index e64a047..497b541 100644
--- a/internal/prometheus_adapter/model/alert.go
+++ b/internal/prometheus_adapter/model/alert.go
@@ -17,3 +17,12 @@ type AlertRuleMeta struct {
 	Labels    string  `json:"labels" gorm:"type:jsonb"`                  // 适用标签，如 {"service":"s3","version":"v1"}，为空表示全局
 	Threshold float64 `json:"threshold"`                                 // 阈值（会被渲染成特定规则的 threshold metric 数值）
 }
+
+// AlertmanagerAlert is a single alert in the Alertmanager API v2 format, as decoded from the body of POST /api/v2/alerts.
+type AlertmanagerAlert struct {
+	Labels       map[string]string `json:"labels"`
+	Annotations  map[string]string `json:"annotations,omitempty"`
+	StartsAt     string            `json:"startsAt,omitempty"` // RFC3339 timestamp
+	EndsAt       string            `json:"endsAt,omitempty"`   // RFC3339 timestamp
+	GeneratorURL string            `json:"generatorURL,omitempty"`
+}
diff --git a/internal/prometheus_adapter/server.go b/internal/prometheus_adapter/server.go
index 81a2824..b2de452 100644
--- a/internal/prometheus_adapter/server.go
+++ b/internal/prometheus_adapter/server.go
@@ -15,13 +15,13 @@ import (
 
 // PrometheusAdapterServer Prometheus Adapter 服务器
 type PrometheusAdapterServer struct {
-	config              *config.Config
-	promConfig          *promconfig.PrometheusAdapterConfig
-	promClient          *client.PrometheusClient
-	metricService       *service.MetricService
-	alertService        *service.AlertService
-	alertWebhookService *service.AlertWebhookService
-	api                 *api.Api
+	config                   *config.Config
+	promConfig               *promconfig.PrometheusAdapterConfig
+	promClient               *client.PrometheusClient
+	metricService            *service.MetricService
+	alertService             *service.AlertService
+	alertmanagerProxyService *service.AlertmanagerService
+	api                      *api.Api
 }
 
 // NewPrometheusAdapterServer 创建新的 Prometheus Adapter 服务器
@@ -44,22 +44,16 @@ func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, e
 	// 创建告警服务
 	alertService := service.NewAlertService(promClient, promConfig)
 
-	// 创建告警 Webhook 服务
-	alertWebhookService := service.NewAlertWebhookService(promClient, promConfig)
+	// 创建 Alertmanager 代理服务
+	alertmanagerProxyService := service.NewAlertmanagerProxyService(promConfig)
 
 	server := &PrometheusAdapterServer{
-		config:              cfg,
-		promConfig:          promConfig,
-		promClient:          promClient,
-		metricService:       metricService,
-		alertService:        alertService,
-		alertWebhookService: alertWebhookService,
-	}
-
-	// 启动告警 Webhook 服务
-	if err := alertWebhookService.Start(); err != nil {
-		log.Error().Err(err).Msg("Failed to start alert webhook service")
-		// 不返回错误，允许服务继续运行
+		config:                   cfg,
+		promConfig:               promConfig,
+		promClient:               promClient,
+		metricService:            metricService,
+		alertService:             alertService,
+		alertmanagerProxyService: alertmanagerProxyService,
 	}
 
 	log.Info().Str("prometheus_address", promConfig.Prometheus.Address).Msg("Prometheus Adapter initialized successfully")
@@ -69,11 +63,13 @@ func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, e
 // UseApi 设置 API 路由
 func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error {
 	var err error
-	s.api, err = api.NewApi(s.metricService, s.alertService, router)
+	s.api, err = api.NewApi(s.metricService, s.alertService, s.alertmanagerProxyService, router)
 	if err != nil {
 		return fmt.Errorf("failed to initialize API: %w", err)
 	}
 
+	log.Info().Msg("All API endpoints registered")
+
 	return nil
 }
 
@@ -81,11 +77,6 @@ func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error {
 func (s *PrometheusAdapterServer) Close(ctx context.Context) error {
 	log.Info().Msg("Starting shutdown...")
 
-	// 停止告警 Webhook 服务
-	if s.alertWebhookService != nil {
-		s.alertWebhookService.Stop()
-	}
-
 	// 调用 alertService 的 Shutdown 方法保存规则
 	if s.alertService != nil {
 		if err := s.alertService.Shutdown(); err != nil {
diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go
index f74d4a3..1c2290c 100644
--- a/internal/prometheus_adapter/service/alert_service.go
+++ b/internal/prometheus_adapter/service/alert_service.go
@@ -411,7 +411,7 @@ func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertR
 		if len(labelMatchers) > 0 {
 			// 如果表达式包含{，说明已经有标签选择器
 			if strings.Contains(expr, "{") {
-				expr = strings.Replace(expr, "}", ","+strings.Join(labelMatchers, ",")+"}}", 1)
+				expr = strings.Replace(expr, "}", ","+strings.Join(labelMatchers, ",")+"}", 1)
 			} else {
 				// 在指标名后添加标签选择器
 				// 查找第一个非字母数字下划线的字符
diff --git a/internal/prometheus_adapter/service/alert_webhook_service.go b/internal/prometheus_adapter/service/alert_webhook_service.go
deleted file mode 100644
index ac211ac..0000000
--- a/internal/prometheus_adapter/service/alert_webhook_service.go
+++ /dev/null
@@ -1,255 +0,0 @@
-package service
-
-import (
-	"bytes"
-	"context"
-	"crypto/md5"
-	"encoding/json"
-	"fmt"
-	"io"
-	"net/http"
-	"sync"
-	"time"
-
-	"github.com/qiniu/zeroops/internal/prometheus_adapter/client"
-	promconfig "github.com/qiniu/zeroops/internal/prometheus_adapter/config"
-	"github.com/qiniu/zeroops/internal/prometheus_adapter/model"
-	"github.com/rs/zerolog/log"
-)
-
-// AlertWebhookService 告警 Webhook 服务
-type AlertWebhookService struct {
-	promClient      *client.PrometheusClient
-	config          *promconfig.PrometheusAdapterConfig
-	webhookURL      string
-	pollingInterval time.Duration
-	httpClient      *http.Client
-	alertCache      map[string]*model.PrometheusAlert // 缓存已发送的告警
-	cacheMutex      sync.RWMutex
-	stopCh          chan struct{}
-	running         bool
-	runningMutex    sync.Mutex
-}
-
-// NewAlertWebhookService 创建告警 Webhook 服务
-func NewAlertWebhookService(promClient *client.PrometheusClient, config *promconfig.PrometheusAdapterConfig) *AlertWebhookService {
-	return &AlertWebhookService{
-		promClient:      promClient,
-		config:          config,
-		webhookURL:      config.AlertWebhook.URL,
-		pollingInterval: config.AlertWebhook.GetPollingInterval(),
-		httpClient:      &http.Client{Timeout: 30 * time.Second},
-		alertCache:      make(map[string]*model.PrometheusAlert),
-		stopCh:          make(chan struct{}),
-	}
-}
-
-// Start 启动告警轮询服务
-func (s *AlertWebhookService) Start() error {
-	s.runningMutex.Lock()
-	defer s.runningMutex.Unlock()
-
-	if s.running {
-		return fmt.Errorf("alert webhook service already running")
-	}
-
-	s.running = true
-	go s.pollAlerts()
-
-	log.Info().
-		Str("webhook_url", s.webhookURL).
-		Dur("interval", s.pollingInterval).
-		Msg("Alert webhook service started")
-
-	return nil
-}
-
-// Stop 停止告警轮询服务
-func (s *AlertWebhookService) Stop() {
-	s.runningMutex.Lock()
-	defer s.runningMutex.Unlock()
-
-	if !s.running {
-		return
-	}
-
-	close(s.stopCh)
-	s.running = false
-
-	log.Info().Msg("Alert webhook service stopped")
-}
-
-// pollAlerts 轮询告警
-func (s *AlertWebhookService) pollAlerts() {
-	ticker := time.NewTicker(s.pollingInterval)
-	defer ticker.Stop()
-
-	// 立即执行一次
-	s.fetchAndProcessAlerts()
-
-	for {
-		select {
-		case <-ticker.C:
-			s.fetchAndProcessAlerts()
-		case <-s.stopCh:
-			return
-		}
-	}
-}
-
-// fetchAndProcessAlerts 获取并处理告警
-func (s *AlertWebhookService) fetchAndProcessAlerts() {
-	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
-	defer cancel()
-
-	// 从 Prometheus 获取告警
-	alertsResp, err := s.promClient.GetAlerts(ctx)
-	if err != nil {
-		log.Error().Err(err).Msg("Failed to fetch alerts from Prometheus")
-		return
-	}
-
-	// 处理告警
-	firingAlerts := []model.PrometheusAlert{}
-	resolvedAlerts := []model.PrometheusAlert{}
-
-	s.cacheMutex.Lock()
-	defer s.cacheMutex.Unlock()
-
-	// 分类告警
-	currentAlerts := make(map[string]*model.PrometheusAlert)
-	for _, alert := range alertsResp.Data.Alerts {
-		fingerprint := s.generateFingerprint(alert)
-		currentAlerts[fingerprint] = &alert
-
-		// 检查是否是新告警或状态变更
-		cachedAlert, exists := s.alertCache[fingerprint]
-		if !exists || cachedAlert.State != alert.State {
-			if alert.State == "firing" {
-				firingAlerts = append(firingAlerts, alert)
-			}
-		}
-	}
-
-	// 检查已恢复的告警
-	for fingerprint, cachedAlert := range s.alertCache {
-		if _, exists := currentAlerts[fingerprint]; !exists {
-			// 告警已恢复
-			resolvedAlert := *cachedAlert
-			resolvedAlert.State = "resolved"
-			resolvedAlerts = append(resolvedAlerts, resolvedAlert)
-		}
-	}
-
-	// 更新缓存
-	s.alertCache = currentAlerts
-
-	// 发送告警
-	if len(firingAlerts) > 0 {
-		if err := s.sendAlerts(firingAlerts, "firing"); err != nil {
-			log.Error().Err(err).Msg("Failed to send firing alerts")
-		}
-	}
-
-	if len(resolvedAlerts) > 0 {
-		if err := s.sendAlerts(resolvedAlerts, "resolved"); err != nil {
-			log.Error().Err(err).Msg("Failed to send resolved alerts")
-		}
-	}
-}
-
-// sendAlerts 发送告警到监控模块
-func (s *AlertWebhookService) sendAlerts(alerts []model.PrometheusAlert, status string) error {
-	webhookAlerts := []model.AlertmanagerWebhookAlert{}
-
-	// 收集所有标签用于 groupLabels 和 commonLabels
-	commonLabels := map[string]string{}
-	firstAlert := true
-
-	for _, alert := range alerts {
-		// 生成 fingerprint
-		fingerprint := s.generateFingerprint(alert)
-
-		// 转换时间格式
-		startsAt := alert.ActiveAt.Format(time.RFC3339)
-		endsAt := "0001-01-01T00:00:00Z"
-		if status == "resolved" {
-			endsAt = time.Now().Format(time.RFC3339)
-		}
-
-		// 构造 GeneratorURL
-		generatorURL := fmt.Sprintf("http://prometheus/graph?g0.expr=%s", alert.Labels["alertname"])
-
-		webhookAlert := model.AlertmanagerWebhookAlert{
-			Status:       status,
-			Labels:       alert.Labels,
-			Annotations:  alert.Annotations,
-			StartsAt:     startsAt,
-			EndsAt:       endsAt,
-			GeneratorURL: generatorURL,
-			Fingerprint:  fingerprint,
-		}
-		webhookAlerts = append(webhookAlerts, webhookAlert)
-
-		// 收集公共标签（取第一个告警的标签作为公共标签）
-		if firstAlert {
-			for k, v := range alert.Labels {
-				commonLabels[k] = v
-			}
-			firstAlert = false
-		}
-	}
-
-	groupLabels := map[string]string{}
-	if alertName, ok := commonLabels["alertname"]; ok {
-		groupLabels["alertname"] = alertName
-	}
-
-	// 构造请求
-	req := model.AlertmanagerWebhookRequest{
-		Receiver:     "prometheus_adapter",
-		Status:       status,
-		Alerts:       webhookAlerts,
-		GroupLabels:  groupLabels,
-		CommonLabels: commonLabels,
-		Version:      "1",
-	}
-
-	// 发送请求
-	jsonData, err := json.Marshal(req)
-	if err != nil {
-		return fmt.Errorf("failed to marshal request: %w", err)
-	}
-
-	resp, err := s.httpClient.Post(s.webhookURL, "application/json", bytes.NewBuffer(jsonData))
-	if err != nil {
-		return fmt.Errorf("failed to send webhook: %w", err)
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode != http.StatusOK {
-		body, _ := io.ReadAll(resp.Body)
-		return fmt.Errorf("webhook returned status %d: %s", resp.StatusCode, string(body))
-	}
-
-	log.Info().
-		Str("status", status).
-		Int("alert_count", len(alerts)).
-		Str("webhook_url", s.webhookURL).
-		Msg("Successfully sent alerts to webhook")
-
-	return nil
-}
-
-// generateFingerprint 生成告警的唯一标识
-func (s *AlertWebhookService) generateFingerprint(alert model.PrometheusAlert) string {
-	// 基于标签生成指纹
-	labels := ""
-	for k, v := range alert.Labels {
-		labels += fmt.Sprintf("%s=%s,", k, v)
-	}
-
-	h := md5.New()
-	h.Write([]byte(labels))
-	return fmt.Sprintf("%x", h.Sum(nil))[:16]
-}
diff --git a/internal/prometheus_adapter/service/alertmanager_service.go b/internal/prometheus_adapter/service/alertmanager_service.go
new file mode 100644
index 0000000..e91d721
--- /dev/null
+++ b/internal/prometheus_adapter/service/alertmanager_service.go
@@ -0,0 +1,215 @@
+package service
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"time"
+
+	promconfig "github.com/qiniu/zeroops/internal/prometheus_adapter/config"
+	"github.com/qiniu/zeroops/internal/prometheus_adapter/model"
+	"github.com/rs/zerolog/log"
+)
+
+// AlertmanagerService is the Alertmanager-compatible service.
+// It receives alerts pushed by Prometheus and forwards them to the monitoring/alerting module at webhookURL.
+type AlertmanagerService struct {
+	config         *promconfig.PrometheusAdapterConfig
+	webhookURL     string
+	httpClient     *http.Client
+	resolveTimeout time.Duration
+}
+
+// NewAlertmanagerProxyService creates an AlertmanagerService that forwards to config.AlertWebhook.URL using an HTTP client with a 30-second timeout.
+func NewAlertmanagerProxyService(config *promconfig.PrometheusAdapterConfig) *AlertmanagerService {
+	return &AlertmanagerService{
+		config:         config,
+		webhookURL:     config.AlertWebhook.URL,
+		httpClient:     &http.Client{Timeout: 30 * time.Second},
+		resolveTimeout: 5 * time.Minute, // default resolve_timeout, added to "now" when an alert has no endsAt
+	}
+}
+
+// HandleAlertsV2 handles alerts pushed by Prometheus: it decodes the JSON array, fills in missing timestamps and forwards the alerts to the webhook.
+// It implements POST /api/v2/alerts and answers 400 on a bad request, 500 when forwarding fails, and 200 with "{}" on success.
+func (s *AlertmanagerService) HandleAlertsV2(w http.ResponseWriter, r *http.Request) {
+	// Check Content-Type: only an empty header or exactly "application/json" passes (a value with parameters, e.g. "; charset=utf-8", is rejected)
+	contentType := r.Header.Get("Content-Type")
+	if contentType != "application/json" && contentType != "" {
+		http.Error(w, "Content-Type must be application/json", http.StatusBadRequest)
+		return
+	}
+
+	// Parse the alerts sent by Prometheus
+	var alerts []model.AlertmanagerAlert
+	body, err := io.ReadAll(r.Body)
+	if err != nil {
+		log.Error().Err(err).Msg("Failed to read request body")
+		http.Error(w, "Failed to read request", http.StatusBadRequest)
+		return
+	}
+	defer r.Body.Close()
+
+	if err := json.Unmarshal(body, &alerts); err != nil {
+		log.Error().
+			Err(err).
+			Str("body", string(body)).
+			Msg("Failed to unmarshal alerts")
+		http.Error(w, "Invalid JSON", http.StatusBadRequest)
+		return
+	}
+
+	// Normalise timestamps: fill in defaults where they are missing
+	now := time.Now()
+	for i := range alerts {
+		// A missing startsAt defaults to the current time
+		if alerts[i].StartsAt == "" {
+			alerts[i].StartsAt = now.Format(time.RFC3339)
+		}
+		// A missing endsAt defaults to the current time + resolve_timeout
+		if alerts[i].EndsAt == "" {
+			alerts[i].EndsAt = now.Add(s.resolveTimeout).Format(time.RFC3339)
+		}
+	}
+
+	log.Info().
+		Int("alert_count", len(alerts)).
+		Msg("Received alerts from Prometheus")
+
+	// Forward the alerts to the monitoring module
+	if err := s.forwardAlertsV2(alerts); err != nil {
+		log.Error().Err(err).Msg("Failed to forward alerts")
+		// Return 500, intended to make Prometheus retry (NOTE(review): confirm Prometheus actually retries failed notifications)
+		http.Error(w, "Failed to forward alerts", http.StatusInternalServerError)
+		return
+	}
+
+	// Return the success response (an empty JSON object, following the author's reading of Alertmanager API v2)
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(http.StatusOK)
+	w.Write([]byte("{}"))
+}
+
+// HandleHealthCheck is the health-check handler; it always answers 200 with the body "OK".
+// It implements GET /-/healthy.
+func (s *AlertmanagerService) HandleHealthCheck(w http.ResponseWriter, r *http.Request) {
+	w.WriteHeader(http.StatusOK)
+	w.Write([]byte("OK"))
+}
+
+// HandleReadyCheck is the readiness-check handler; it always answers 200 with the body "OK" and performs no dependency checks.
+// It implements GET /-/ready.
+func (s *AlertmanagerService) HandleReadyCheck(w http.ResponseWriter, r *http.Request) {
+	w.WriteHeader(http.StatusOK)
+	w.Write([]byte("OK"))
+}
+
+// forwardAlertsV2 转发告警到监控告警模块
+func (s *AlertmanagerService) forwardAlertsV2(alerts []model.AlertmanagerAlert) error {
+	// 转换为 Alertmanager webhook 格式
+	webhookAlerts := []model.AlertmanagerWebhookAlert{}
+	commonLabels := map[string]string{}
+	groupLabels := map[string]string{}
+
+	// 统计告警状态，用于确定总体状态
+	hasFiring := false
+
+	for _, alert := range alerts {
+		// 确定告警状态：通过比较 endsAt 和当前时间
+		status := "firing"
+		if alert.EndsAt != "" {
+			endsAtTime, err := time.Parse(time.RFC3339, alert.EndsAt)
+			if err == nil && endsAtTime.Before(time.Now()) {
+				status = "resolved"
+			} else {
+				hasFiring = true
+			}
+		} else {
+			hasFiring = true
+		}
+
+		// 生成 fingerprint
+		fingerprint := s.generateFingerprint(alert.Labels)
+
+		// 构造 GeneratorURL
+		generatorURL := alert.GeneratorURL
+		if generatorURL == "" && alert.Labels["alertname"] != "" {
+			generatorURL = fmt.Sprintf("http://prometheus/graph?g0.expr=%s", alert.Labels["alertname"])
+		}
+
+		webhookAlert := model.AlertmanagerWebhookAlert{
+			Status:       status,
+			Labels:       alert.Labels,
+			Annotations:  alert.Annotations,
+			StartsAt:     alert.StartsAt, // 已经是 RFC3339 格式
+			EndsAt:       alert.EndsAt,   // 已经是 RFC3339 格式
+			GeneratorURL: generatorURL,
+			Fingerprint:  fingerprint,
+		}
+		webhookAlerts = append(webhookAlerts, webhookAlert)
+
+		// 收集公共标签
+		if len(commonLabels) == 0 {
+			for k, v := range alert.Labels {
+				commonLabels[k] = v
+			}
+		}
+	}
+
+	// 设置 groupLabels
+	if alertName, ok := commonLabels["alertname"]; ok {
+		groupLabels["alertname"] = alertName
+	}
+
+	// 确定总体状态：如果有任何 firing 的告警，总体状态为 firing，否则为 resolved
+	overallStatus := "resolved"
+	if hasFiring {
+		overallStatus = "firing"
+	}
+
+	// 构造 webhook 请求
+	req := model.AlertmanagerWebhookRequest{
+		Receiver:     "prometheus_adapter",
+		Status:       overallStatus, // 根据告警实际状态设置
+		Alerts:       webhookAlerts,
+		GroupLabels:  groupLabels,
+		CommonLabels: commonLabels,
+		Version:      "1",
+	}
+
+	// 发送到监控告警模块
+	jsonData, err := json.Marshal(req)
+	if err != nil {
+		return fmt.Errorf("failed to marshal webhook request: %w", err)
+	}
+
+	resp, err := s.httpClient.Post(s.webhookURL, "application/json", bytes.NewBuffer(jsonData))
+	if err != nil {
+		return fmt.Errorf("failed to send webhook: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("webhook returned status %d: %s", resp.StatusCode, string(body))
+	}
+
+	log.Info().
+		Int("alert_count", len(alerts)).
+		Str("webhook_url", s.webhookURL).
+		Msg("Successfully forwarded alerts to monitoring module")
+
+	return nil
+}
+
+// generateFingerprint returns a deterministic 16-hex-digit fingerprint: the 64-bit FNV-1a hash of the labels' JSON encoding (json.Marshal emits map keys in sorted order).
+func (s *AlertmanagerService) generateFingerprint(labels map[string]string) string {
+	canonical, _ := json.Marshal(labels) // cannot fail for a map[string]string
+	hash := uint64(14695981039346656037)
+	for _, b := range canonical {
+		hash = (hash ^ uint64(b)) * 1099511628211 // FNV-1a step: XOR the byte in, then multiply by the 64-bit FNV prime
+	}
+	return fmt.Sprintf("%016x", hash) // zero-padded to a fixed width, so no slicing and no out-of-range panic on short or empty label sets
+}
diff --git a/mock/s3/deployments/docker-compose.yml b/mock/s3/deployments/docker-compose.yml
index 377ec3d..9f30f9e 100644
--- a/mock/s3/deployments/docker-compose.yml
+++ b/mock/s3/deployments/docker-compose.yml
@@ -89,6 +89,9 @@ services:
       - '--config.file=/etc/prometheus/prometheus.yml'
       - '--storage.tsdb.path=/prometheus'
       - '--web.enable-lifecycle'
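+      - '--web.enable-admin-api'  # Enables the TSDB admin API (snapshot / delete series / clean tombstones); config reload via /-/reload comes from --web.enable-lifecycle above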
+    extra_hosts:
+      - "host.docker.internal:host-gateway"  # 允许容器访问宿主机
     restart: unless-stopped
 
   # Grafana - 可视化
diff --git a/mock/s3/deployments/observability/prometheus.yml b/mock/s3/deployments/observability/prometheus.yml
index 35fb014..6c9fb04 100644
--- a/mock/s3/deployments/observability/prometheus.yml
+++ b/mock/s3/deployments/observability/prometheus.yml
@@ -9,6 +9,14 @@ global:
 rule_files:
   - "/etc/prometheus/rules/*.yml"
 
+# Alerting 配置 - 将告警发送到 Prometheus Adapter (伪 Alertmanager)
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - 'host.docker.internal:8081'  # Prometheus Adapter 运行在宿主机的 8081 端口
+      api_version: v2  # 使用 Alertmanager API v2
+
 scrape_configs:
   # Prometheus自身的指标
   - job_name: 'prometheus'

From 285412f197bd78dcdb40b28818710010a09d4eda Mon Sep 17 00:00:00 2001
From: dnj <shdnj@qq.com>
Date: Thu, 25 Sep 2025 18:21:02 +0800
Subject: [PATCH 14/18] =?UTF-8?q?feat(prometheus=5Fadapter):=20=E6=B7=BB?=
 =?UTF-8?q?=E5=8A=A0=E8=8E=B7=E5=8F=96=E7=BB=91=E5=AE=9A=E5=9C=B0=E5=9D=80?=
 =?UTF-8?q?=E6=96=B9=E6=B3=95=E5=B9=B6=E4=BC=98=E5=8C=96=E7=AB=AF=E5=8F=A3?=
 =?UTF-8?q?=E9=85=8D=E7=BD=AE=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cmd/prometheus_adapter/main.go        | 28 ++++++++++++++++-----------
 internal/prometheus_adapter/server.go |  8 ++++++++
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/cmd/prometheus_adapter/main.go b/cmd/prometheus_adapter/main.go
index 847516b..696018b 100644
--- a/cmd/prometheus_adapter/main.go
+++ b/cmd/prometheus_adapter/main.go
@@ -20,22 +20,28 @@ func main() {
 
 	log.Info().Msg("Starting Prometheus Adapter server")
 
-	// 加载配置
-	cfg := &config.Config{
-		Server: config.ServerConfig{
-			BindAddr: ":9999", // 默认端口
-		},
+	// 加载 Prometheus Adapter 配置
+	adapter, err := prometheusadapter.NewPrometheusAdapterServer(&config.Config{})
+	if err != nil {
+		log.Fatal().Err(err).Msg("Failed to create Prometheus Adapter server")
+	}
+
+	// 获取 Prometheus Adapter 内部配置的绑定地址
+	bindAddr := ":9999" // 默认端口
+	if adapter.GetBindAddr() != "" {
+		bindAddr = adapter.GetBindAddr()
 	}
 
-	// 如果有环境变量，使用环境变量的端口
+	// 如果有环境变量，优先使用环境变量的端口
 	if port := os.Getenv("ADAPTER_PORT"); port != "" {
-		cfg.Server.BindAddr = ":" + port
+		bindAddr = ":" + port
 	}
 
-	// 创建 Prometheus Adapter 服务器
-	adapter, err := prometheusadapter.NewPrometheusAdapterServer(cfg)
-	if err != nil {
-		log.Fatal().Err(err).Msg("Failed to create Prometheus Adapter server")
+	// 更新配置（虽然已经创建了 adapter，但需要端口信息用于启动服务器）
+	cfg := &config.Config{
+		Server: config.ServerConfig{
+			BindAddr: bindAddr,
+		},
 	}
 
 	// 创建路由
diff --git a/internal/prometheus_adapter/server.go b/internal/prometheus_adapter/server.go
index b2de452..c8e321c 100644
--- a/internal/prometheus_adapter/server.go
+++ b/internal/prometheus_adapter/server.go
@@ -60,6 +60,14 @@ func NewPrometheusAdapterServer(cfg *config.Config) (*PrometheusAdapterServer, e
 	return server, nil
 }
 
+// GetBindAddr returns the bind address from the loaded adapter configuration, or "" when no configuration is loaded or the address is unset.
+func (s *PrometheusAdapterServer) GetBindAddr() string {
+	if s.promConfig == nil {
+		return ""
+	}
+	return s.promConfig.Server.BindAddr
+}
+
 // UseApi 设置 API 路由
 func (s *PrometheusAdapterServer) UseApi(router *fox.Engine) error {
 	var err error

From 705363e64bc4d92ede5e77ee4b1f7b4a457d0b47 Mon Sep 17 00:00:00 2001
From: dnj <shdnj@qq.com>
Date: Sun, 28 Sep 2025 10:49:12 +0800
Subject: [PATCH 15/18] =?UTF-8?q?fix(prometheus=5Fadapter):=20=E6=9B=B4?=
 =?UTF-8?q?=E6=96=B0=E5=91=8A=E8=AD=A6webhook=E8=B7=AF=E5=BE=84=E5=B9=B6?=
 =?UTF-8?q?=E6=B7=BB=E5=8A=A0REDACTED=E5=AD=97=E6=AE=B5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/prometheus_adapter/README.md                           | 4 ++--
 internal/prometheus_adapter/config/config.go                | 2 +-
 internal/prometheus_adapter/config/prometheus_adapter.yml   | 2 +-
 internal/prometheus_adapter/model/prometheus_alert.go       | 1 +
 internal/prometheus_adapter/service/alertmanager_service.go | 1 +
 internal/prometheus_adapter/test_alert_update.sh            | 2 +-
 scripts/prometheus_adapter/build.sh                         | 2 +-
 scripts/prometheus_adapter/deploy.sh                        | 2 +-
 8 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md
index 0b9d5ca..65d4714 100644
--- a/docs/prometheus_adapter/README.md
+++ b/docs/prometheus_adapter/README.md
@@ -243,7 +243,7 @@ internal/prometheus_adapter/
 │   （自定义服务）  │
 └────────┬────────┘
          │ Push
-         │ POST /v1/integrations/prometheus/alerts
+         │ POST /v1/integrations/alertmanager/webhook
          ▼
 ┌─────────────────┐
 │   监控告警模块    │
@@ -263,7 +263,7 @@ internal/prometheus_adapter/
   - 支持告警恢复状态通知
 
 - **推送目标**：
-  - URL: `http://alert-module:8080/v1/integrations/prometheus/alerts`
+  - URL: `http://alert-module:8080/v1/integrations/alertmanager/webhook`
   - Method: POST
   - Content-Type: application/json
 
diff --git a/internal/prometheus_adapter/config/config.go b/internal/prometheus_adapter/config/config.go
index 39dd023..5fe7fe8 100644
--- a/internal/prometheus_adapter/config/config.go
+++ b/internal/prometheus_adapter/config/config.go
@@ -105,7 +105,7 @@ func getDefaultConfig() *PrometheusAdapterConfig {
 			ContainerName: "mock-s3-prometheus",
 		},
 		AlertWebhook: AlertWebhookConfig{
-			URL:             "http://alert-module:8080/v1/integrations/prometheus/alerts",
+			URL:             "http://alert-module:8080/v1/integrations/alertmanager/webhook",
 			PollingInterval: "10s",
 		},
 		AlertRules: AlertRulesConfig{
diff --git a/internal/prometheus_adapter/config/prometheus_adapter.yml b/internal/prometheus_adapter/config/prometheus_adapter.yml
index a3eab43..55f0b56 100644
--- a/internal/prometheus_adapter/config/prometheus_adapter.yml
+++ b/internal/prometheus_adapter/config/prometheus_adapter.yml
@@ -10,7 +10,7 @@ prometheus:
 # 告警 Webhook 服务配置
 alert_webhook:
   # 监控告警模块地址
-  url: "http://alert-module:8080/v1/integrations/prometheus/alerts"
+  url: "http://alert-module:8080/v1/integrations/alertmanager/webhook"
   # 轮询间隔
   polling_interval: "10s"
 
diff --git a/internal/prometheus_adapter/model/prometheus_alert.go b/internal/prometheus_adapter/model/prometheus_alert.go
index 9dab331..c809359 100644
--- a/internal/prometheus_adapter/model/prometheus_alert.go
+++ b/internal/prometheus_adapter/model/prometheus_alert.go
@@ -39,6 +39,7 @@ type AlertmanagerWebhookRequest struct {
 	Alerts       []AlertmanagerWebhookAlert `json:"alerts"`
 	GroupLabels  map[string]string          `json:"groupLabels"`  // 分组标签
 	CommonLabels map[string]string          `json:"commonLabels"` // 公共标签
+	Alert        string                     `json:"alert"`        // "REDACTED"
 	Version      string                     `json:"version"`      // "4"
 }
 
diff --git a/internal/prometheus_adapter/service/alertmanager_service.go b/internal/prometheus_adapter/service/alertmanager_service.go
index e91d721..5ff5e90 100644
--- a/internal/prometheus_adapter/service/alertmanager_service.go
+++ b/internal/prometheus_adapter/service/alertmanager_service.go
@@ -176,6 +176,7 @@ func (s *AlertmanagerService) forwardAlertsV2(alerts []model.AlertmanagerAlert)
 		Alerts:       webhookAlerts,
 		GroupLabels:  groupLabels,
 		CommonLabels: commonLabels,
+		Alert:        "REDACTED",
 		Version:      "1",
 	}
 
diff --git a/internal/prometheus_adapter/test_alert_update.sh b/internal/prometheus_adapter/test_alert_update.sh
index 4400b43..f57fceb 100755
--- a/internal/prometheus_adapter/test_alert_update.sh
+++ b/internal/prometheus_adapter/test_alert_update.sh
@@ -2,7 +2,7 @@
 
 # 测试增量更新告警规则功能
 
-BASE_URL="http://localhost:9999"
+BASE_URL="http://10.210.10.33:9999"
 
 echo "=== 测试增量更新告警规则 ==="
 
diff --git a/scripts/prometheus_adapter/build.sh b/scripts/prometheus_adapter/build.sh
index d7b5025..06d7213 100755
--- a/scripts/prometheus_adapter/build.sh
+++ b/scripts/prometheus_adapter/build.sh
@@ -134,7 +134,7 @@ fi
 
 # 环境变量（可选，用于覆盖配置文件）
 # export PROMETHEUS_ADDRESS="http://localhost:9090"
-# export ALERT_WEBHOOK_URL="http://alert-module:8080/v1/integrations/prometheus/alerts"
+# export ALERT_WEBHOOK_URL="http://alert-module:8080/v1/integrations/alertmanager/webhook"
 # export ALERT_POLLING_INTERVAL="10s"
 # export SERVER_BIND_ADDR="0.0.0.0:9999"
 
diff --git a/scripts/prometheus_adapter/deploy.sh b/scripts/prometheus_adapter/deploy.sh
index ad68564..dc56736 100755
--- a/scripts/prometheus_adapter/deploy.sh
+++ b/scripts/prometheus_adapter/deploy.sh
@@ -296,7 +296,7 @@ Group=qboxserver
 WorkingDirectory=$DEPLOY_DIR
 # 可选：通过环境变量覆盖配置
 #Environment="PROMETHEUS_ADDRESS=http://localhost:9090"
-#Environment="ALERT_WEBHOOK_URL=http://alert-module:8080/v1/integrations/prometheus/alerts"
+#Environment="ALERT_WEBHOOK_URL=http://alert-module:8080/v1/integrations/alertmanager/webhook"
 #Environment="ALERT_POLLING_INTERVAL=10s"
 #Environment="SERVER_BIND_ADDR=0.0.0.0:9999"
 ExecStart=$DEPLOY_DIR/bin/prometheus_adapter

From 6de179393f3a64a988d86f4fbe40c162f0a161d0 Mon Sep 17 00:00:00 2001
From: dnj <shdnj@qq.com>
Date: Sun, 28 Sep 2025 11:13:54 +0800
Subject: [PATCH 16/18] =?UTF-8?q?feat(=E5=91=8A=E8=AD=A6=E8=A7=84=E5=88=99?=
 =?UTF-8?q?):=20=E6=B7=BB=E5=8A=A0=E5=88=A0=E9=99=A4=E8=A7=84=E5=88=99?=
 =?UTF-8?q?=E6=A8=A1=E6=9D=BF=E5=92=8C=E5=85=83=E4=BF=A1=E6=81=AF=E7=9A=84?=
 =?UTF-8?q?API=E6=8E=A5=E5=8F=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

实现删除告警规则模板及其关联元信息的功能，包括：
1. 添加DELETE /v1/alert-rules/:rule_name接口删除规则模板
2. 添加DELETE /v1/alert-rules-meta/:rule_name接口删除特定元信息
3. 更新相关文档说明删除操作的使用方法
---
 docs/prometheus_adapter/README.md             | 54 ++++++++++++++
 internal/prometheus_adapter/api/alert_api.go  | 70 +++++++++++++++++++
 internal/prometheus_adapter/model/api.go      |  5 ++
 .../service/alert_service.go                  | 64 +++++++++++++++++
 4 files changed, 193 insertions(+)

diff --git a/docs/prometheus_adapter/README.md b/docs/prometheus_adapter/README.md
index 65d4714..0167488 100644
--- a/docs/prometheus_adapter/README.md
+++ b/docs/prometheus_adapter/README.md
@@ -193,6 +193,60 @@ internal/prometheus_adapter/
 }
 ```
 
+#### 3. 删除规则模板
+- 方法与路径：`DELETE /v1/alert-rules/:rule_name`
+- 功能：删除指定的告警规则模板及其所有关联的元信息
+- 路径参数：
+  - `rule_name`：规则名称（如 `high_cpu_usage`）
+- 响应示例：
+```json
+{
+  "status": "success",
+  "message": "Rule 'high_cpu_usage' and 3 associated metas deleted successfully",
+  "rule_name": "high_cpu_usage",
+  "deleted_metas": 3
+}
+```
+- 错误响应示例（规则不存在）：
+```json
+{
+  "error": {
+    "code": "INVALID_PARAMETER",
+    "message": "rule 'invalid_rule' not found"
+  }
+}
+```
+
+#### 4. 删除规则元信息
+- 方法与路径：`DELETE /v1/alert-rules-meta/:rule_name`
+- 功能：删除指定规则下的特定元信息（通过 labels 唯一标识）
+- 路径参数：
+  - `rule_name`：规则名称（如 `high_cpu_usage`）
+- 请求体示例：
+```json
+{
+  "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}"
+}
+```
+- 响应示例：
+```json
+{
+  "status": "success",
+  "message": "Rule meta deleted successfully",
+  "rule_name": "high_cpu_usage",
+  "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}"
+}
+```
+- 错误响应示例（元信息不存在）：
+```json
+{
+  "error": {
+    "code": "INVALID_PARAMETER",
+    "message": "rule meta not found for rule 'high_cpu_usage' with labels '{\"service\":\"invalid-service\"}'"
+  }
+}
+```
+
 #### 规则生成机制
 - **规则模板与元信息关联**：通过 `alert_name` 字段关联
   - `AlertRule.name` = `AlertRuleMeta.alert_name`
diff --git a/internal/prometheus_adapter/api/alert_api.go b/internal/prometheus_adapter/api/alert_api.go
index 8724803..786de6f 100644
--- a/internal/prometheus_adapter/api/alert_api.go
+++ b/internal/prometheus_adapter/api/alert_api.go
@@ -12,6 +12,8 @@ import (
 func (api *Api) setupAlertRouters(router *fox.Engine) {
 	router.PUT("/v1/alert-rules/:rule_name", api.UpdateRule)
 	router.PUT("/v1/alert-rules-meta/:rule_name", api.UpdateRuleMetas)
+	router.DELETE("/v1/alert-rules/:rule_name", api.DeleteRule)
+	router.DELETE("/v1/alert-rules-meta/:rule_name", api.DeleteRuleMeta)
 }
 
 // UpdateRule 更新单个规则模板
@@ -107,3 +109,71 @@ func (api *Api) UpdateRuleMetas(c *fox.Context) {
 		"updated_count": updatedCount,
 	})
 }
+
+// DeleteRule handles DELETE /v1/alert-rules/:rule_name: it deletes one rule template and all of its associated metas.
+func (api *Api) DeleteRule(c *fox.Context) {
+	ruleName := c.Param("rule_name")
+	if ruleName == "" {
+		SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter,
+			"Rule name is required", nil)
+		return
+	}
+
+	// Count the associated metas before deleting; the count is reported back in the response.
+	affectedCount := api.alertService.GetAffectedMetas(ruleName)
+
+	err := api.alertService.DeleteRule(ruleName)
+	if err != nil {
+		if err.Error() == fmt.Sprintf("rule '%s' not found", ruleName) { // NOTE(review): 404 depends on exact equality with the service's error text
+			SendErrorResponse(c, http.StatusNotFound, model.ErrorCodeInvalidParameter,
+				err.Error(), nil)
+		} else {
+			SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError,
+				"Failed to delete rule: "+err.Error(), nil)
+		}
+		return
+	}
+
+	c.JSON(http.StatusOK, map[string]interface{}{
+		"status":        "success",
+		"message":       fmt.Sprintf("Rule '%s' and %d associated metas deleted successfully", ruleName, affectedCount),
+		"rule_name":     ruleName,
+		"deleted_metas": affectedCount,
+	})
+}
+
+// DeleteRuleMeta handles DELETE /v1/alert-rules-meta/:rule_name: it deletes the single rule meta identified by the labels string in the JSON body.
+func (api *Api) DeleteRuleMeta(c *fox.Context) {
+	ruleName := c.Param("rule_name")
+	if ruleName == "" {
+		SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter,
+			"Rule name is required", nil)
+		return
+	}
+
+	var req model.DeleteAlertRuleMetaRequest
+	if err := c.ShouldBindJSON(&req); err != nil {
+		SendErrorResponse(c, http.StatusBadRequest, model.ErrorCodeInvalidParameter,
+			"Invalid request body: "+err.Error(), nil)
+		return
+	}
+
+	err := api.alertService.DeleteRuleMeta(ruleName, req.Labels)
+	if err != nil {
+		if err.Error() == fmt.Sprintf("rule meta not found for rule '%s' with labels '%s'", ruleName, req.Labels) { // NOTE(review): 404 depends on exact equality with the service's error text
+			SendErrorResponse(c, http.StatusNotFound, model.ErrorCodeInvalidParameter,
+				err.Error(), nil)
+		} else {
+			SendErrorResponse(c, http.StatusInternalServerError, model.ErrorCodeInternalError,
+				"Failed to delete rule meta: "+err.Error(), nil)
+		}
+		return
+	}
+
+	c.JSON(http.StatusOK, map[string]interface{}{
+		"status":    "success",
+		"message":   "Rule meta deleted successfully",
+		"rule_name": ruleName,
+		"labels":    req.Labels,
+	})
+}
diff --git a/internal/prometheus_adapter/model/api.go b/internal/prometheus_adapter/model/api.go
index 775bdd9..9138f68 100644
--- a/internal/prometheus_adapter/model/api.go
+++ b/internal/prometheus_adapter/model/api.go
@@ -68,3 +68,8 @@ type AlertRuleMetaUpdate struct {
 	Labels    string  `json:"labels" binding:"required"` // 必填，用于唯一标识
 	Threshold float64 `json:"threshold"`
 }
+
+// DeleteAlertRuleMetaRequest is the JSON request body for deleting an alert rule meta.
+type DeleteAlertRuleMetaRequest struct {
+	Labels string `json:"labels" binding:"required"` // required; uniquely identifies the meta to delete
+}
diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go
index 1c2290c..1a1d115 100644
--- a/internal/prometheus_adapter/service/alert_service.go
+++ b/internal/prometheus_adapter/service/alert_service.go
@@ -264,6 +264,70 @@ func (s *AlertService) GetAffectedMetas(ruleName string) int {
 	return count
 }
 
+// DeleteRule removes the rule template named ruleName and every meta whose AlertName equals it, then regenerates and syncs; it errors if no such rule exists.
+func (s *AlertService) DeleteRule(ruleName string) error {
+	// Find and remove the rule template
+	ruleFound := false
+	for i, rule := range s.currentRules {
+		if rule.Name == ruleName {
+			// Remove the rule from the slice; only the first match is removed because the loop breaks right after
+			s.currentRules = append(s.currentRules[:i], s.currentRules[i+1:]...)
+			ruleFound = true
+			break
+		}
+	}
+
+	if !ruleFound {
+		return fmt.Errorf("rule '%s' not found", ruleName)
+	}
+
+	// Remove all associated metas by rebuilding the slice without them
+	deletedMetaCount := 0
+	newMetas := []model.AlertRuleMeta{}
+	for _, meta := range s.currentRuleMetas {
+		if meta.AlertName != ruleName {
+			newMetas = append(newMetas, meta)
+		} else {
+			deletedMetaCount++
+		}
+	}
+	s.currentRuleMetas = newMetas
+
+	log.Info().
+		Str("rule", ruleName).
+		Int("deleted_metas", deletedMetaCount).
+		Msg("Rule and associated metas deleted")
+
+	// Regenerate and sync; in-memory state is already modified even if this returns an error
+	return s.regenerateAndSync()
+}
+
+// DeleteRuleMeta removes the first meta whose AlertName equals ruleName and whose Labels string equals labels, then regenerates and syncs; it errors if none matches.
+func (s *AlertService) DeleteRuleMeta(ruleName, labels string) error {
+	// Find and remove the matching meta (Labels is compared as a raw string)
+	found := false
+	for i, meta := range s.currentRuleMetas {
+		if meta.AlertName == ruleName && meta.Labels == labels {
+			// Remove the meta from the slice
+			s.currentRuleMetas = append(s.currentRuleMetas[:i], s.currentRuleMetas[i+1:]...)
+			found = true
+			break
+		}
+	}
+
+	if !found {
+		return fmt.Errorf("rule meta not found for rule '%s' with labels '%s'", ruleName, labels)
+	}
+
+	log.Info().
+		Str("rule", ruleName).
+		Str("labels", labels).
+		Msg("Rule meta deleted")
+
+	// Regenerate and sync; in-memory state is already modified even if this returns an error
+	return s.regenerateAndSync()
+}
+
 // ========== 内部核心方法 ==========
 
 // regenerateAndSync 使用当前内存中的规则和元信息重新生成Prometheus规则并同步

From 4c7dab9fef28f1f3a0c373edb1efde78aceccb31 Mon Sep 17 00:00:00 2001
From: dnj <shdnj@qq.com>
Date: Sun, 28 Sep 2025 11:26:29 +0800
Subject: [PATCH 17/18] =?UTF-8?q?refactor(prometheus=5Fadapter):=20?=
 =?UTF-8?q?=E5=90=88=E5=B9=B6=E5=B9=B6=E6=89=A9=E5=B1=95=E5=91=8A=E8=AD=A6?=
 =?UTF-8?q?=E8=A7=84=E5=88=99=E6=B5=8B=E8=AF=95=E8=84=9A=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../{test_alert_update.sh => test_alert.sh}   | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 rename internal/prometheus_adapter/{test_alert_update.sh => test_alert.sh} (59%)

diff --git a/internal/prometheus_adapter/test_alert_update.sh b/internal/prometheus_adapter/test_alert.sh
similarity index 59%
rename from internal/prometheus_adapter/test_alert_update.sh
rename to internal/prometheus_adapter/test_alert.sh
index f57fceb..cc74248 100755
--- a/internal/prometheus_adapter/test_alert_update.sh
+++ b/internal/prometheus_adapter/test_alert.sh
@@ -121,4 +121,68 @@ curl -X PUT ${BASE_URL}/v1/alert-rules-meta/high_memory_usage \
     ]
   }' | jq .
 
+sleep 2
+
+# 5. 测试删除规则元信息
+echo -e "\n5. 删除规则元信息（删除 high_cpu_usage 的 storage-service）..."
+curl -X DELETE ${BASE_URL}/v1/alert-rules-meta/high_cpu_usage \
+  -H "Content-Type: application/json" \
+  -d '{
+    "labels": "{\"service\":\"storage-service\",\"version\":\"1.0.0\"}"
+  }' | jq .
+
+sleep 2
+
+# 6. 测试删除不存在的规则元信息（应该返回404）
+echo -e "\n6. 删除不存在的规则元信息（测试错误处理）..."
+curl -X DELETE ${BASE_URL}/v1/alert-rules-meta/high_cpu_usage \
+  -H "Content-Type: application/json" \
+  -d '{
+    "labels": "{\"service\":\"non-existent-service\",\"version\":\"1.0.0\"}"
+  }' | jq .
+
+sleep 2
+
+# 7. 测试删除整个规则模板
+echo -e "\n7. 删除整个规则模板（删除 high_memory_usage 及其所有元信息）..."
+curl -X DELETE ${BASE_URL}/v1/alert-rules/high_memory_usage | jq .
+
+sleep 2
+
+# 8. 测试删除不存在的规则模板（应该返回404）
+echo -e "\n8. 删除不存在的规则模板（测试错误处理）..."
+curl -X DELETE ${BASE_URL}/v1/alert-rules/non_existent_rule | jq .
+
+sleep 2
+
+# 9. 验证删除结果 - 查看剩余的规则
+echo -e "\n9. 验证删除结果..."
+echo "9.1 尝试更新已删除的规则模板（应该创建新规则）："
+curl -X PUT ${BASE_URL}/v1/alert-rules/high_memory_usage \
+  -H "Content-Type: application/json" \
+  -d '{
+    "description": "重新创建的内存告警规则",
+    "expr": "system_memory_usage_percent",
+    "op": ">",
+    "severity": "warning",
+    "watch_time": 300
+  }' | jq .
+
+sleep 1
+
+echo -e "\n9.2 查看当前 high_cpu_usage 的受影响元信息数量（应该只剩1个）："
+curl -X PUT ${BASE_URL}/v1/alert-rules/high_cpu_usage \
+  -H "Content-Type: application/json" \
+  -d '{
+    "description": "验证剩余元信息的规则更新"
+  }' | jq .
+
+echo -e "\n=== 删除功能测试完成 ==="
+echo -e "\n测试总结："
+echo "✓ 测试了删除单个规则元信息"
+echo "✓ 测试了删除不存在的规则元信息（错误处理）"
+echo "✓ 测试了删除整个规则模板及其所有元信息"
+echo "✓ 测试了删除不存在的规则模板（错误处理）"
+echo "✓ 验证了删除操作的实际效果"
+
 echo -e "\n=== 测试完成 ==="
\ No newline at end of file

From 4555e066326958d25da3ba29f99f56922867516f Mon Sep 17 00:00:00 2001
From: dnj <shdnj@qq.com>
Date: Mon, 29 Sep 2025 10:03:47 +0800
Subject: [PATCH 18/18] =?UTF-8?q?fix(prometheus=5Fadapter):=20=E6=94=B9?=
 =?UTF-8?q?=E8=BF=9B=E6=9C=8D=E5=8A=A1=E8=BF=9B=E7=A8=8B=E7=AE=A1=E7=90=86?=
 =?UTF-8?q?=E5=92=8C=E5=91=8A=E8=AD=A6=E8=A1=A8=E8=BE=BE=E5=BC=8F=E7=94=9F?=
 =?UTF-8?q?=E6=88=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 在deploy.sh和build.sh中添加PID文件管理，优化服务启动和停止流程
- 修复alert_service.go中告警表达式生成的标签处理逻辑
- 使用%g代替%f格式化浮点数以去除多余的尾随零（如 80.000000 → 80）
---
 .../service/alert_service.go                  | 55 ++++++++++----
 scripts/prometheus_adapter/build.sh           | 75 ++++++++++++++++---
 scripts/prometheus_adapter/deploy.sh          | 40 ++++++++--
 3 files changed, 137 insertions(+), 33 deletions(-)

diff --git a/internal/prometheus_adapter/service/alert_service.go b/internal/prometheus_adapter/service/alert_service.go
index 1a1d115..fdd1b5c 100644
--- a/internal/prometheus_adapter/service/alert_service.go
+++ b/internal/prometheus_adapter/service/alert_service.go
@@ -402,7 +402,7 @@ func (s *AlertService) buildPrometheusRules(rules []model.AlertRule, ruleMetas [
 		// 构建注释
 		annotations := map[string]string{
 			"description": rule.Description,
-			"summary":     fmt.Sprintf("%s %s %f", rule.Expr, rule.Op, meta.Threshold),
+			"summary":     fmt.Sprintf("%s %s %g", rule.Expr, rule.Op, meta.Threshold),
 		}
 
 		// 计算for字段
@@ -475,29 +475,54 @@ func (s *AlertService) buildExpression(rule *model.AlertRule, meta *model.AlertR
 		if len(labelMatchers) > 0 {
 			// 如果表达式包含{，说明已经有标签选择器
 			if strings.Contains(expr, "{") {
-				expr = strings.Replace(expr, "}", ","+strings.Join(labelMatchers, ",")+"}", 1)
-			} else {
-				// 在指标名后添加标签选择器
-				// 查找第一个非字母数字下划线的字符
-				metricEnd := 0
-				for i, ch := range expr {
-					if !((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
-						(ch >= '0' && ch <= '9') || ch == '_') {
-						metricEnd = i
-						break
+				// 查找第一个 { 后的内容
+				start := strings.Index(expr, "{")
+				end := strings.Index(expr[start:], "}")
+				if end != -1 {
+					end += start
+					existingLabels := strings.TrimSpace(expr[start+1 : end])
+					if existingLabels == "" {
+						// 空的标签选择器，直接替换
+						expr = expr[:start+1] + strings.Join(labelMatchers, ",") + expr[end:]
+					} else {
+						// 已有标签，需要检查是否重复
+						existingLabelMap := make(map[string]bool)
+						// 解析现有标签
+						labelPairs := strings.Split(existingLabels, ",")
+						for _, pair := range labelPairs {
+							if strings.Contains(pair, "=") {
+								key := strings.TrimSpace(strings.Split(pair, "=")[0])
+								if key != "" {
+									existingLabelMap[key] = true
+								}
+							}
+						}
+						// 只添加不重复的标签
+						newLabels := []string{}
+						for k, v := range labels {
+							if !existingLabelMap[k] && k != "" && v != "" {
+								newLabels = append(newLabels, fmt.Sprintf(`%s="%s"`, k, v))
+							}
+						}
+						if len(newLabels) > 0 {
+							expr = expr[:end] + "," + strings.Join(newLabels, ",") + expr[end:]
+						}
 					}
 				}
-				if metricEnd == 0 {
-					metricEnd = len(expr)
+			} else {
+				// 对于没有标签的简单指标，只处理单个单词的情况
+				// 如果表达式包含空格、括号等，不进行标签注入
+				if !strings.ContainsAny(expr, " ()[]{}") {
+					// 只有单个指标名，可以安全添加标签
+					expr = expr + "{" + strings.Join(labelMatchers, ",") + "}"
 				}
-				expr = expr[:metricEnd] + "{" + strings.Join(labelMatchers, ",") + "}" + expr[metricEnd:]
 			}
 		}
 	}
 
 	// 添加比较操作符和阈值
 	if meta.Threshold != 0 {
-		expr = fmt.Sprintf("%s %s %f", expr, rule.Op, meta.Threshold)
+		expr = fmt.Sprintf("%s %s %g", expr, rule.Op, meta.Threshold)
 	}
 
 	return expr
diff --git a/scripts/prometheus_adapter/build.sh b/scripts/prometheus_adapter/build.sh
index 06d7213..d75f0b8 100755
--- a/scripts/prometheus_adapter/build.sh
+++ b/scripts/prometheus_adapter/build.sh
@@ -118,6 +118,8 @@ cat > "$BUILD_DIR/start.sh" << 'EOF'
 SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
 BIN_PATH="$SCRIPT_DIR/bin/prometheus_adapter"
 CONFIG_FILE="$SCRIPT_DIR/config/prometheus_adapter.yml"
+PID_FILE="$SCRIPT_DIR/prometheus_adapter.pid"
+LOG_FILE="$SCRIPT_DIR/prometheus_adapter.log"
 
 # 检查二进制文件
 if [ ! -f "$BIN_PATH" ]; then
@@ -125,6 +127,17 @@ if [ ! -f "$BIN_PATH" ]; then
     exit 1
 fi
 
+# 检查是否已在运行
+if [ -f "$PID_FILE" ]; then
+    PID=$(cat "$PID_FILE")
+    if kill -0 "$PID" 2>/dev/null; then
+        echo "Prometheus Adapter已在运行 (PID: $PID)"
+        exit 1
+    else
+        rm -f "$PID_FILE"
+    fi
+fi
+
 # 检查配置文件
 if [ -f "$CONFIG_FILE" ]; then
     echo "使用配置文件: $CONFIG_FILE"
@@ -140,11 +153,23 @@ fi
 
 echo "启动 Prometheus Adapter..."
 
-# 切换到 bin 目录，以便程序能正确找到相对路径的配置文件
+# 切换到脚本目录
 cd "$SCRIPT_DIR"
 
-# 启动服务
-exec "$BIN_PATH"
+# 后台启动服务
+nohup "$BIN_PATH" > "$LOG_FILE" 2>&1 &
+PID=$!
+
+# 保存PID
+echo $PID > "$PID_FILE"
+
+echo "Prometheus Adapter已启动"
+echo "PID: $PID"
+echo "日志文件: $LOG_FILE"
+echo "PID文件: $PID_FILE"
+echo ""
+echo "查看日志: tail -f $LOG_FILE"
+echo "停止服务: ./stop.sh"
 EOF
 chmod +x "$BUILD_DIR/start.sh"
 
@@ -155,10 +180,29 @@ cat > "$BUILD_DIR/stop.sh" << 'EOF'
 
 # Prometheus Adapter 停止脚本
 
+# 获取脚本所在目录
+SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
+PID_FILE="$SCRIPT_DIR/prometheus_adapter.pid"
 APP_NAME="prometheus_adapter"
 
-# 查找进程
-PID=$(ps aux | grep -v grep | grep "$APP_NAME" | awk '{print $2}')
+# 优先从PID文件读取
+if [ -f "$PID_FILE" ]; then
+    PID=$(cat "$PID_FILE" 2>/dev/null)
+    if [ -n "$PID" ] && kill -0 "$PID" 2>/dev/null; then
+        echo "从PID文件获取进程ID: $PID"
+    else
+        echo "PID文件中的进程已不存在，清理PID文件"
+        rm -f "$PID_FILE"
+        PID=""
+    fi
+else
+    PID=""
+fi
+
+# 如果PID文件不存在或进程已死，通过进程名查找
+if [ -z "$PID" ]; then
+    PID=$(ps aux | grep -v grep | grep "$APP_NAME" | awk '{print $2}')
+fi
 
 if [ -z "$PID" ]; then
     echo "没有找到运行中的 $APP_NAME 进程"
@@ -166,15 +210,24 @@ if [ -z "$PID" ]; then
 fi
 
 echo "停止 $APP_NAME (PID: $PID)..."
-kill -TERM $PID
+kill -TERM $PID 2>/dev/null || true
 
 # 等待进程退出
-sleep 2
+count=0
+while [ $count -lt 10 ] && ps -p "$PID" > /dev/null 2>&1; do
+    sleep 1
+    count=$((count + 1))
+done
+
+# 检查是否已退出
+if ps -p "$PID" > /dev/null 2>&1; then
+    echo "强制停止 $APP_NAME..."
+    kill -KILL "$PID" 2>/dev/null || true
+fi
 
-# 检查是否还在运行
-if ps -p $PID > /dev/null 2>&1; then
-    echo "强制停止进程..."
-    kill -KILL $PID
+# 清理PID文件
+if [ -f "$PID_FILE" ]; then
+    rm -f "$PID_FILE"
 fi
 
 echo "$APP_NAME 已停止"
diff --git a/scripts/prometheus_adapter/deploy.sh b/scripts/prometheus_adapter/deploy.sh
index dc56736..6f53a6f 100755
--- a/scripts/prometheus_adapter/deploy.sh
+++ b/scripts/prometheus_adapter/deploy.sh
@@ -132,6 +132,16 @@ fi
 
 # 检查是否有运行中的服务
 check_running_service() {
+    # 优先从PID文件读取
+    if [ -f "$DEPLOY_DIR/prometheus_adapter.pid" ]; then
+        local pid=$(cat "$DEPLOY_DIR/prometheus_adapter.pid" 2>/dev/null)
+        if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
+            echo "$pid"
+            return
+        fi
+    fi
+
+    # 如果PID文件不存在或进程已死，通过进程名查找
     local pid=$(ps aux | grep -v grep | grep "prometheus_adapter" | grep -v "$0" | awk '{print $2}')
     if [ -n "$pid" ]; then
         echo "$pid"
@@ -158,6 +168,11 @@ stop_service() {
             kill -KILL "$pid" 2>/dev/null || true
         fi
 
+        # 清理PID文件
+        if [ -f "$DEPLOY_DIR/prometheus_adapter.pid" ]; then
+            rm -f "$DEPLOY_DIR/prometheus_adapter.pid"
+        fi
+
         log_info "服务已停止"
     fi
 }
@@ -340,17 +355,27 @@ if [ "$START_SERVICE" = true ] || [ "$RESTART_SERVICE" = true ]; then
 
     # 启动服务
     cd "$DEPLOY_DIR"
-    nohup ./start.sh > prometheus_adapter.log 2>&1 &
+
+    # 直接启动二进制文件而不是通过start.sh脚本
+    nohup ./bin/prometheus_adapter > prometheus_adapter.log 2>&1 &
+    PID=$!
+
+    # 保存PID到文件
+    echo $PID > prometheus_adapter.pid
+
+    log_info "服务已启动 (PID: $PID)"
+    echo "PID文件: $DEPLOY_DIR/prometheus_adapter.pid"
+    echo "日志文件: $DEPLOY_DIR/prometheus_adapter.log"
 
     # 等待服务启动
     sleep 2
 
     # 检查是否启动成功
-    NEW_PID=$(check_running_service)
-    if [ -n "$NEW_PID" ]; then
-        log_info "服务已启动 (PID: $NEW_PID)"
+    if kill -0 "$PID" 2>/dev/null; then
+        log_info "服务启动成功，正在运行"
         echo ""
         echo "查看日志: tail -f $DEPLOY_DIR/prometheus_adapter.log"
+        echo "停止服务: kill \$(cat $DEPLOY_DIR/prometheus_adapter.pid)"
     else
         log_error "服务启动失败，请检查日志"
         exit 1
@@ -359,10 +384,11 @@ else
     echo ""
     echo "手动启动服务:"
     echo "  cd $DEPLOY_DIR"
-    echo "  ./start.sh"
+    echo "  nohup ./bin/prometheus_adapter > prometheus_adapter.log 2>&1 &"
+    echo "  echo \$! > prometheus_adapter.pid"
     echo ""
-    echo "或使用后台模式:"
-    echo "  nohup ./start.sh > prometheus_adapter.log 2>&1 &"
+    echo "停止服务:"
+    echo "  kill \$(cat prometheus_adapter.pid)"
 fi
 
 log_info "部署完成!"
\ No newline at end of file