From 29013ccb7b1ba0e4e152227b107141bf95fb3bc7 Mon Sep 17 00:00:00 2001 From: Brooks Swinnerton Date: Fri, 3 Apr 2026 00:10:45 -0400 Subject: [PATCH 01/10] fix: wire up events_ingested and events_rejected metrics Increment prefixd_events_ingested_total after each ban event is stored. Increment prefixd_events_rejected_total on duplicate events and guardrail rejections. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/api/handlers.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/api/handlers.rs b/src/api/handlers.rs index a0b047e..ec1579f 100644 --- a/src/api/handlers.rs +++ b/src/api/handlers.rs @@ -589,6 +589,9 @@ async fn handle_ban( .find_ban_event_by_external_id(&input.source, ext_id) .await { + crate::observability::metrics::EVENTS_REJECTED + .with_label_values(&[&input.source, "duplicate"]) + .inc(); return Err(AppError(PrefixdError::DuplicateEvent { detector_source: input.source.clone(), external_id: ext_id.clone(), @@ -602,6 +605,10 @@ async fn handle_ban( // Store event state.repo.insert_event(&event).await.map_err(AppError)?; + crate::observability::metrics::EVENTS_INGESTED + .with_label_values(&[&event.source, &event.attack_vector().to_string()]) + .inc(); + // Check if shutting down if state.is_shutting_down() { return Err(AppError(PrefixdError::ShuttingDown)); @@ -873,6 +880,9 @@ async fn handle_ban( .validate(&intent, state.repo.as_ref(), is_safelisted) .await { + crate::observability::metrics::EVENTS_REJECTED + .with_label_values(&[&event.source, "guardrail"]) + .inc(); tracing::warn!(error = %e, "guardrail rejected mitigation"); return Err(AppError(e)); } From 9627f2376e10dc5164db18537c8905c1c60476e1 Mon Sep 17 00:00:00 2001 From: Brooks Swinnerton Date: Fri, 3 Apr 2026 00:12:48 -0400 Subject: [PATCH 02/10] fix: wire up mitigation lifecycle metrics - MITIGATIONS_CREATED: increment on successful mitigation creation (both via event ingest and manual API) - MITIGATIONS_WITHDRAWN: increment on detector unban and operator withdrawal, with reason label - MITIGATIONS_EXPIRED: increment in reconciliation expire loop - MITIGATIONS_ACTIVE: set as gauge in reconciliation sync, grouped by action_type and pop Co-Authored-By: Claude Opus 4.6 (1M context) --- src/api/handlers.rs | 18 ++++++++++++++ src/scheduler/reconcile.rs | 48 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/src/api/handlers.rs b/src/api/handlers.rs index ec1579f..73b381b 100644 --- a/src/api/handlers.rs +++ b/src/api/handlers.rs @@ -540,6 +540,7 @@ async fn handle_unban( } // Update mitigation status + let action_type_str = mitigation.action_type.to_string(); mitigation.withdraw(Some(format!("Detector unban: {}", source))); state .repo @@ -547,6 +548,10 @@ async fn handle_unban( .await .map_err(AppError)?; + crate::observability::metrics::MITIGATIONS_WITHDRAWN + .with_label_values(&[&action_type_str, &mitigation.pop, "detector_unban"]) + .inc(); + // Broadcast withdrawal via WebSocket let _ = state .ws_broadcast @@ -917,6 +922,10 @@ async fn handle_ban( .await .map_err(AppError)?; + crate::observability::metrics::MITIGATIONS_CREATED + .with_label_values(&[&mitigation.action_type.to_string(), &state.settings.pop]) + .inc(); + // Resolve signal group to 'resolved' now that mitigation is confirmed if let Some(group_id) = signal_group_id { if let Ok(Some(mut group)) = state.repo.get_signal_group(group_id).await { @@ -1401,6 +1410,10 @@ pub async fn create_mitigation( return Ok(AppError(e).into_response()); } + crate::observability::metrics::MITIGATIONS_CREATED + .with_label_values(&[&mitigation.action_type.to_string(), &state.settings.pop]) + .inc(); + Ok(( StatusCode::CREATED, Json(MitigationResponse::from(&mitigation)), @@ -1451,6 +1464,7 @@ pub async fn withdraw_mitigation( .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; } + let action_type_str = mitigation.action_type.to_string(); mitigation.withdraw(Some(format!("{}: {}", req.operator_id, req.reason))); state .repo @@ -1458,6 +1472,10 @@ pub async fn withdraw_mitigation( .await .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + crate::observability::metrics::MITIGATIONS_WITHDRAWN + .with_label_values(&[&action_type_str, &mitigation.pop, "operator"]) + .inc(); + // Broadcast withdrawal via WebSocket let _ = state .ws_broadcast diff --git a/src/scheduler/reconcile.rs b/src/scheduler/reconcile.rs index 07f3d42..4c05a5d 100644 --- a/src/scheduler/reconcile.rs +++ b/src/scheduler/reconcile.rs @@ -88,6 +88,9 @@ impl ReconciliationLoop { // 3. Sync desired vs actual state self.sync_announcements().await?; + // 4. Update BGP session metrics + self.update_bgp_session_metrics().await; + Ok(()) } @@ -114,9 +117,15 @@ impl ReconciliationLoop { } // Update status + let action_type_str = mitigation.action_type.to_string(); + let pop = mitigation.pop.clone(); mitigation.expire(); self.repo.update_mitigation(&mitigation).await?; + crate::observability::metrics::MITIGATIONS_EXPIRED + .with_label_values(&[&action_type_str, &pop]) + .inc(); + // Broadcast expiry via WebSocket if let Some(ref tx) = self.ws_broadcast { let _ = tx.send(WsMessage::MitigationExpired { @@ -197,6 +206,25 @@ impl ReconciliationLoop { .with_label_values(&["local"]) .set(active.len() as f64); + // Update MITIGATIONS_ACTIVE gauge by action_type and pop + { + use std::collections::HashMap; + let mut counts: HashMap<(String, String), f64> = HashMap::new(); + for m in &active { + *counts + .entry((m.action_type.to_string(), m.pop.clone())) + .or_default() += 1.0; + } + // Reset all label combinations to zero first, then set observed values. + // This handles the case where a combination drops to zero. + crate::observability::metrics::MITIGATIONS_ACTIVE.reset(); + for ((action_type, pop), count) in &counts { + crate::observability::metrics::MITIGATIONS_ACTIVE + .with_label_values(&[action_type, pop]) + .set(*count); + } + } + // Get actual state from BGP let announced = self.announcer.list_active().await?; let announced_hashes: std::collections::HashSet<_> = @@ -245,6 +273,26 @@ impl ReconciliationLoop { Ok(()) } + async fn update_bgp_session_metrics(&self) { + match self.announcer.session_status().await { + Ok(peers) => { + for peer in &peers { + let value = if peer.state.is_established() { + 1.0 + } else { + 0.0 + }; + crate::observability::metrics::BGP_SESSION_UP + .with_label_values(&[&peer.name]) + .set(value); + } + } + Err(e) => { + tracing::warn!(error = %e, "failed to fetch BGP session status for metrics"); + } + } + } + fn build_flowspec_rule(&self, m: &crate::domain::Mitigation) -> FlowSpecRule { let nlri = FlowSpecNlri::from(&m.match_criteria); let action = FlowSpecAction::from((m.action_type, &m.action_params)); From 3791723c25cfbbd9c8beaaef5873774e419704e3 Mon Sep 17 00:00:00 2001 From: Brooks Swinnerton Date: Fri, 3 Apr 2026 00:15:44 -0400 Subject: [PATCH 03/10] fix: wire up announcements_total and announcements_latency metrics Instrument all announce() and withdraw() call sites with counters and latency histograms: - handle_ban announce - handle_unban withdraw - create_mitigation announce - withdraw_mitigation withdraw - bulk_withdraw withdraw - reconciliation expire withdraw - reconciliation re-announce Uses "unknown" as the peer label since GoBGP manages peer selection internally. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/api/handlers.rs | 57 +++++++++++++++++++++++++++++++++++++- src/scheduler/reconcile.rs | 22 +++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/src/api/handlers.rs b/src/api/handlers.rs index 73b381b..39855ad 100644 --- a/src/api/handlers.rs +++ b/src/api/handlers.rs @@ -533,9 +533,20 @@ async fn handle_unban( let action = FlowSpecAction::from((mitigation.action_type, &mitigation.action_params)); let rule = FlowSpecRule::new(nlri, action); + let start = std::time::Instant::now(); if let Err(e) = state.announcer.withdraw(&rule).await { + crate::observability::metrics::ANNOUNCEMENTS_TOTAL + .with_label_values(&["unknown", "error"]) + .inc(); tracing::error!(error = %e, "BGP withdrawal failed"); // Continue anyway - mark as withdrawn in DB + } else { + crate::observability::metrics::ANNOUNCEMENTS_TOTAL + .with_label_values(&["unknown", "withdrawn"]) + .inc(); + crate::observability::metrics::ANNOUNCEMENTS_LATENCY + .with_label_values(&["unknown"]) + .observe(start.elapsed().as_secs_f64()); } } @@ -903,7 +914,11 @@ async fn handle_ban( let action = FlowSpecAction::from((mitigation.action_type, &mitigation.action_params)); let rule = FlowSpecRule::new(nlri, action); + let start = std::time::Instant::now(); if let Err(e) = state.announcer.announce(&rule).await { + crate::observability::metrics::ANNOUNCEMENTS_TOTAL + .with_label_values(&["unknown", "error"]) + .inc(); tracing::error!(error = %e, "BGP announcement failed"); mitigation.reject(e.to_string()); state @@ -913,6 +928,12 @@ async fn handle_ban( .map_err(AppError)?; return Err(AppError(e)); } + crate::observability::metrics::ANNOUNCEMENTS_TOTAL + .with_label_values(&["unknown", "announced"]) + .inc(); + crate::observability::metrics::ANNOUNCEMENTS_LATENCY + .with_label_values(&["unknown"]) + .observe(start.elapsed().as_secs_f64()); } mitigation.activate(); @@ -1400,9 +1421,19 @@ pub async fn create_mitigation( let nlri = FlowSpecNlri::from(&mitigation.match_criteria); let action = FlowSpecAction::from((mitigation.action_type, &mitigation.action_params)); let rule = FlowSpecRule::new(nlri, action); + let start = std::time::Instant::now(); if let Err(e) = state.announcer.announce(&rule).await { + crate::observability::metrics::ANNOUNCEMENTS_TOTAL + .with_label_values(&["unknown", "error"]) + .inc(); return Ok(AppError(e).into_response()); } + crate::observability::metrics::ANNOUNCEMENTS_TOTAL + .with_label_values(&["unknown", "announced"]) + .inc(); + crate::observability::metrics::ANNOUNCEMENTS_LATENCY + .with_label_values(&["unknown"]) + .observe(start.elapsed().as_secs_f64()); } mitigation.activate(); @@ -1457,11 +1488,24 @@ pub async fn withdraw_mitigation( let nlri = FlowSpecNlri::from(&mitigation.match_criteria); let action = FlowSpecAction::from((mitigation.action_type, &mitigation.action_params)); let rule = FlowSpecRule::new(nlri, action); + let start = std::time::Instant::now(); state .announcer .withdraw(&rule) .await - .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + .map_err(|e| { + crate::observability::metrics::ANNOUNCEMENTS_TOTAL + .with_label_values(&["unknown", "error"]) + .inc(); + let _ = e; + StatusCode::INTERNAL_SERVER_ERROR + })?; + crate::observability::metrics::ANNOUNCEMENTS_TOTAL + .with_label_values(&["unknown", "withdrawn"]) + .inc(); + crate::observability::metrics::ANNOUNCEMENTS_LATENCY + .with_label_values(&["unknown"]) + .observe(start.elapsed().as_secs_f64()); } let action_type_str = mitigation.action_type.to_string(); @@ -1593,8 +1637,19 @@ pub async fn bulk_withdraw_mitigations( let nlri = FlowSpecNlri::from(&mitigation.match_criteria); let action = FlowSpecAction::from((mitigation.action_type, &mitigation.action_params)); let rule = FlowSpecRule::new(nlri, action); + let start = std::time::Instant::now(); if let Err(e) = state.announcer.withdraw(&rule).await { + crate::observability::metrics::ANNOUNCEMENTS_TOTAL + .with_label_values(&["unknown", "error"]) + .inc(); tracing::error!(error = %e, mitigation_id = %id, "BGP withdrawal failed in bulk withdraw"); + } else { + crate::observability::metrics::ANNOUNCEMENTS_TOTAL + .with_label_values(&["unknown", "withdrawn"]) + .inc(); + crate::observability::metrics::ANNOUNCEMENTS_LATENCY + .with_label_values(&["unknown"]) + .observe(start.elapsed().as_secs_f64()); } } diff --git a/src/scheduler/reconcile.rs b/src/scheduler/reconcile.rs index 4c05a5d..d2b2d37 100644 --- a/src/scheduler/reconcile.rs +++ b/src/scheduler/reconcile.rs @@ -107,12 +107,23 @@ impl ReconciliationLoop { // Withdraw BGP announcement if !self.dry_run { let rule = self.build_flowspec_rule(&mitigation); + let start = std::time::Instant::now(); if let Err(e) = self.announcer.withdraw(&rule).await { + crate::observability::metrics::ANNOUNCEMENTS_TOTAL + .with_label_values(&["unknown", "error"]) + .inc(); tracing::warn!( mitigation_id = %mitigation.mitigation_id, error = %e, "failed to withdraw expired mitigation" ); + } else { + crate::observability::metrics::ANNOUNCEMENTS_TOTAL + .with_label_values(&["unknown", "withdrawn"]) + .inc(); + crate::observability::metrics::ANNOUNCEMENTS_LATENCY + .with_label_values(&["unknown"]) + .observe(start.elapsed().as_secs_f64()); } } @@ -243,12 +254,23 @@ impl ReconciliationLoop { ); if !self.dry_run { + let start = std::time::Instant::now(); if let Err(e) = self.announcer.announce(&rule).await { + crate::observability::metrics::ANNOUNCEMENTS_TOTAL + .with_label_values(&["unknown", "error"]) + .inc(); tracing::error!( mitigation_id = %mitigation.mitigation_id, error = %e, "failed to re-announce" ); + } else { + crate::observability::metrics::ANNOUNCEMENTS_TOTAL + .with_label_values(&["unknown", "announced"]) + .inc(); + crate::observability::metrics::ANNOUNCEMENTS_LATENCY + .with_label_values(&["unknown"]) + .observe(start.elapsed().as_secs_f64()); } } } From ff560ab9aa0442c0c4740f9089321bbcb4fcb9bc Mon Sep 17 00:00:00 2001 From: Brooks Swinnerton Date: Fri, 3 Apr 2026 00:16:19 -0400 Subject: [PATCH 04/10] fix: wire up reconciliation_runs and bgp_session_up metrics - RECONCILIATION_RUNS: increment with success/error label after each reconciliation cycle (initial and periodic) - BGP_SESSION_UP: poll session_status() each cycle and set gauge per peer (1=established, 0=down) Co-Authored-By: Claude Opus 4.6 (1M context) --- src/scheduler/reconcile.rs | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/scheduler/reconcile.rs b/src/scheduler/reconcile.rs index d2b2d37..f308b21 100644 --- a/src/scheduler/reconcile.rs +++ b/src/scheduler/reconcile.rs @@ -55,8 +55,18 @@ impl ReconciliationLoop { ); // Initial reconciliation - if let Err(e) = self.reconcile().await { - tracing::error!(error = %e, "initial reconciliation failed"); + match self.reconcile().await { + Ok(()) => { + crate::observability::metrics::RECONCILIATION_RUNS + .with_label_values(&["success"]) + .inc(); + } + Err(e) => { + crate::observability::metrics::RECONCILIATION_RUNS + .with_label_values(&["error"]) + .inc(); + tracing::error!(error = %e, "initial reconciliation failed"); + } } let mut interval = tokio::time::interval(self.interval); @@ -65,8 +75,18 @@ impl ReconciliationLoop { loop { tokio::select! { _ = interval.tick() => { - if let Err(e) = self.reconcile().await { - tracing::error!(error = %e, "reconciliation failed"); + match self.reconcile().await { + Ok(()) => { + crate::observability::metrics::RECONCILIATION_RUNS + .with_label_values(&["success"]) + .inc(); + } + Err(e) => { + crate::observability::metrics::RECONCILIATION_RUNS + .with_label_values(&["error"]) + .inc(); + tracing::error!(error = %e, "reconciliation failed"); + } } } _ = shutdown.recv() => { From d46eeda6611da6d1fd737c8bc933a3c0f35dc7a3 Mon Sep 17 00:00:00 2001 From: Brooks Swinnerton Date: Fri, 3 Apr 2026 00:19:28 -0400 Subject: [PATCH 05/10] fix: wire up guardrail_rejections metric Increment prefixd_guardrail_rejections_total when guardrails reject a mitigation, with the variant name as the reason label (e.g. Safelisted, QuotaExceeded, TtlRequired). Co-Authored-By: Claude Opus 4.6 (1M context) --- src/api/handlers.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/api/handlers.rs b/src/api/handlers.rs index 39855ad..0488e9d 100644 --- a/src/api/handlers.rs +++ b/src/api/handlers.rs @@ -899,6 +899,15 @@ async fn handle_ban( crate::observability::metrics::EVENTS_REJECTED .with_label_values(&[&event.source, "guardrail"]) .inc(); + let reason = match &e { + PrefixdError::GuardrailViolation(g) => { + format!("{:?}", g).split_whitespace().next().unwrap_or("unknown").to_string() + } + _ => "unknown".to_string(), + }; + crate::observability::metrics::GUARDRAIL_REJECTIONS + .with_label_values(&[&reason]) + .inc(); tracing::warn!(error = %e, "guardrail rejected mitigation"); return Err(AppError(e)); } @@ -1410,6 +1419,15 @@ pub async fn create_mitigation( .validate(&intent, state.repo.as_ref(), is_safelisted) .await { + let reason = match &e { + PrefixdError::GuardrailViolation(g) => { + format!("{:?}", g).split_whitespace().next().unwrap_or("unknown").to_string() + } + _ => "unknown".to_string(), + }; + crate::observability::metrics::GUARDRAIL_REJECTIONS + .with_label_values(&[&reason]) + .inc(); return Ok(AppError(e).into_response()); } From acad85925723e8f93d15fc7e82a720ddaee819d2 Mon Sep 17 00:00:00 2001 From: Brooks Swinnerton Date: Fri, 3 Apr 2026 00:23:12 -0400 Subject: [PATCH 06/10] fix: fix type mismatches in metrics label values Use .as_str() for String fields passed alongside &str literals in with_label_values() calls. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/api/handlers.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/api/handlers.rs b/src/api/handlers.rs index 0488e9d..c13908b 100644 --- a/src/api/handlers.rs +++ b/src/api/handlers.rs @@ -560,7 +560,7 @@ async fn handle_unban( .map_err(AppError)?; crate::observability::metrics::MITIGATIONS_WITHDRAWN - .with_label_values(&[&action_type_str, &mitigation.pop, "detector_unban"]) + .with_label_values(&[action_type_str.as_str(), mitigation.pop.as_str(), "detector_unban"]) .inc(); // Broadcast withdrawal via WebSocket @@ -606,7 +606,7 @@ async fn handle_ban( .await { crate::observability::metrics::EVENTS_REJECTED - .with_label_values(&[&input.source, "duplicate"]) + .with_label_values(&[input.source.as_str(), "duplicate"]) .inc(); return Err(AppError(PrefixdError::DuplicateEvent { detector_source: input.source.clone(), @@ -897,7 +897,7 @@ async fn handle_ban( .await { crate::observability::metrics::EVENTS_REJECTED - .with_label_values(&[&event.source, "guardrail"]) + .with_label_values(&[event.source.as_str(), "guardrail"]) .inc(); let reason = match &e { PrefixdError::GuardrailViolation(g) => { @@ -1535,7 +1535,7 @@ pub async fn withdraw_mitigation( .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; crate::observability::metrics::MITIGATIONS_WITHDRAWN - .with_label_values(&[&action_type_str, &mitigation.pop, "operator"]) + .with_label_values(&[action_type_str.as_str(), mitigation.pop.as_str(), "operator"]) .inc(); // Broadcast withdrawal via WebSocket From 847dcf6b393820fc7febe3d20638eba123c065db Mon Sep 17 00:00:00 2001 From: Brooks Swinnerton Date: Fri, 3 Apr 2026 00:27:31 -0400 Subject: [PATCH 07/10] style: run cargo fmt Co-Authored-By: Claude Opus 4.6 (1M context) --- src/api/handlers.rs | 46 ++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/src/api/handlers.rs b/src/api/handlers.rs index c13908b..54e772e 100644 --- a/src/api/handlers.rs +++ b/src/api/handlers.rs @@ -560,7 +560,11 @@ async fn handle_unban( .map_err(AppError)?; crate::observability::metrics::MITIGATIONS_WITHDRAWN - .with_label_values(&[action_type_str.as_str(), mitigation.pop.as_str(), "detector_unban"]) + .with_label_values(&[ + action_type_str.as_str(), + mitigation.pop.as_str(), + "detector_unban", + ]) .inc(); // Broadcast withdrawal via WebSocket @@ -900,9 +904,11 @@ async fn handle_ban( .with_label_values(&[event.source.as_str(), "guardrail"]) .inc(); let reason = match &e { - PrefixdError::GuardrailViolation(g) => { - format!("{:?}", g).split_whitespace().next().unwrap_or("unknown").to_string() - } + PrefixdError::GuardrailViolation(g) => format!("{:?}", g) + .split_whitespace() + .next() + .unwrap_or("unknown") + .to_string(), _ => "unknown".to_string(), }; crate::observability::metrics::GUARDRAIL_REJECTIONS @@ -1420,9 +1426,11 @@ pub async fn create_mitigation( .await { let reason = match &e { - PrefixdError::GuardrailViolation(g) => { - format!("{:?}", g).split_whitespace().next().unwrap_or("unknown").to_string() - } + PrefixdError::GuardrailViolation(g) => format!("{:?}", g) + .split_whitespace() + .next() + .unwrap_or("unknown") + .to_string(), _ => "unknown".to_string(), }; crate::observability::metrics::GUARDRAIL_REJECTIONS @@ -1507,17 +1515,13 @@ pub async fn withdraw_mitigation( let action = FlowSpecAction::from((mitigation.action_type, &mitigation.action_params)); let rule = FlowSpecRule::new(nlri, action); let start = std::time::Instant::now(); - state - .announcer - .withdraw(&rule) - .await - .map_err(|e| { - crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["unknown", "error"]) - .inc(); - let _ = e; - StatusCode::INTERNAL_SERVER_ERROR - })?; + state.announcer.withdraw(&rule).await.map_err(|e| { + crate::observability::metrics::ANNOUNCEMENTS_TOTAL + .with_label_values(&["unknown", "error"]) + .inc(); + let _ = e; + StatusCode::INTERNAL_SERVER_ERROR + })?; crate::observability::metrics::ANNOUNCEMENTS_TOTAL .with_label_values(&["unknown", "withdrawn"]) .inc(); @@ -1535,7 +1539,11 @@ pub async fn withdraw_mitigation( .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; crate::observability::metrics::MITIGATIONS_WITHDRAWN - .with_label_values(&[action_type_str.as_str(), mitigation.pop.as_str(), "operator"]) + .with_label_values(&[ + action_type_str.as_str(), + mitigation.pop.as_str(), + "operator", + ]) .inc(); // Broadcast withdrawal via WebSocket From bcb49ce8d73955a439df4a4576f7d1277b8953ba Mon Sep 17 00:00:00 2001 From: Brooks Swinnerton Date: Fri, 3 Apr 2026 10:53:47 -0400 Subject: [PATCH 08/10] refactor: drop peer label from announcement metrics prefixd calls GoBGP's AddPath/DeletePath RPCs which operate on the global RIB. GoBGP distributes routes to all peers based on policy, so there is no per-peer dimension at the announce/withdraw layer. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/api/handlers.rs | 30 +++++++++++++++--------------- src/observability/metrics.rs | 4 ++-- src/scheduler/reconcile.rs | 12 ++++++------ 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/api/handlers.rs b/src/api/handlers.rs index 54e772e..674b328 100644 --- a/src/api/handlers.rs +++ b/src/api/handlers.rs @@ -536,16 +536,16 @@ async fn handle_unban( let start = std::time::Instant::now(); if let Err(e) = state.announcer.withdraw(&rule).await { crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["unknown", "error"]) + .with_label_values(&["error"]) .inc(); tracing::error!(error = %e, "BGP withdrawal failed"); // Continue anyway - mark as withdrawn in DB } else { crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["unknown", "withdrawn"]) + .with_label_values(&["withdrawn"]) .inc(); crate::observability::metrics::ANNOUNCEMENTS_LATENCY - .with_label_values(&["unknown"]) + .with_label_values(&[] as &[&str]) .observe(start.elapsed().as_secs_f64()); } } @@ -932,7 +932,7 @@ async fn handle_ban( let start = std::time::Instant::now(); if let Err(e) = state.announcer.announce(&rule).await { crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["unknown", "error"]) + .with_label_values(&["error"]) .inc(); tracing::error!(error = %e, "BGP announcement failed"); mitigation.reject(e.to_string()); @@ -944,10 +944,10 @@ async fn handle_ban( return Err(AppError(e)); } crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["unknown", "announced"]) + .with_label_values(&["announced"]) .inc(); crate::observability::metrics::ANNOUNCEMENTS_LATENCY - .with_label_values(&["unknown"]) + .with_label_values(&[] as &[&str]) .observe(start.elapsed().as_secs_f64()); } @@ -1450,15 +1450,15 @@ pub async fn create_mitigation( let start = std::time::Instant::now(); if let Err(e) = state.announcer.announce(&rule).await { crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["unknown", "error"]) + .with_label_values(&["error"]) .inc(); return Ok(AppError(e).into_response()); } crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["unknown", "announced"]) + .with_label_values(&["announced"]) .inc(); crate::observability::metrics::ANNOUNCEMENTS_LATENCY - .with_label_values(&["unknown"]) + .with_label_values(&[] as &[&str]) .observe(start.elapsed().as_secs_f64()); } @@ -1517,16 +1517,16 @@ pub async fn withdraw_mitigation( let start = std::time::Instant::now(); state.announcer.withdraw(&rule).await.map_err(|e| { crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["unknown", "error"]) + .with_label_values(&["error"]) .inc(); let _ = e; StatusCode::INTERNAL_SERVER_ERROR })?; crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["unknown", "withdrawn"]) + .with_label_values(&["withdrawn"]) .inc(); crate::observability::metrics::ANNOUNCEMENTS_LATENCY - .with_label_values(&["unknown"]) + .with_label_values(&[] as &[&str]) .observe(start.elapsed().as_secs_f64()); } @@ -1666,15 +1666,15 @@ pub async fn bulk_withdraw_mitigations( let start = std::time::Instant::now(); if let Err(e) = state.announcer.withdraw(&rule).await { crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["unknown", "error"]) + .with_label_values(&["error"]) .inc(); tracing::error!(error = %e, mitigation_id = %id, "BGP withdrawal failed in bulk withdraw"); } else { crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["unknown", "withdrawn"]) + .with_label_values(&["withdrawn"]) .inc(); crate::observability::metrics::ANNOUNCEMENTS_LATENCY - .with_label_values(&["unknown"]) + .with_label_values(&[] as &[&str]) .observe(start.elapsed().as_secs_f64()); } } diff --git a/src/observability/metrics.rs b/src/observability/metrics.rs index 36b6daf..b47e1c6 100644 --- a/src/observability/metrics.rs +++ b/src/observability/metrics.rs @@ -65,7 +65,7 @@ pub static ANNOUNCEMENTS_TOTAL: Lazy = Lazy::new(|| { register_counter_vec!( "prefixd_announcements_total", "Total number of BGP announcements", - &["peer", "status"] + &["status"] ) .unwrap() }); @@ -74,7 +74,7 @@ pub static ANNOUNCEMENTS_LATENCY: Lazy = Lazy::new(|| { register_histogram_vec!( "prefixd_announcements_latency_seconds", "BGP announcement latency in seconds", - &["peer"], + &[], vec![0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0] ) .unwrap() diff --git a/src/scheduler/reconcile.rs b/src/scheduler/reconcile.rs index f308b21..90e6c07 100644 --- a/src/scheduler/reconcile.rs +++ b/src/scheduler/reconcile.rs @@ -130,7 +130,7 @@ impl ReconciliationLoop { let start = std::time::Instant::now(); if let Err(e) = self.announcer.withdraw(&rule).await { crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["unknown", "error"]) + .with_label_values(&["error"]) .inc(); tracing::warn!( mitigation_id = %mitigation.mitigation_id, @@ -139,10 +139,10 @@ impl ReconciliationLoop { ); } else { crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["unknown", "withdrawn"]) + .with_label_values(&["withdrawn"]) .inc(); crate::observability::metrics::ANNOUNCEMENTS_LATENCY - .with_label_values(&["unknown"]) + .with_label_values(&[] as &[&str]) .observe(start.elapsed().as_secs_f64()); } } @@ -277,7 +277,7 @@ impl ReconciliationLoop { let start = std::time::Instant::now(); if let Err(e) = self.announcer.announce(&rule).await { crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["unknown", "error"]) + .with_label_values(&["error"]) .inc(); tracing::error!( mitigation_id = %mitigation.mitigation_id, @@ -286,10 +286,10 @@ impl ReconciliationLoop { ); } else { crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["unknown", "announced"]) + .with_label_values(&["announced"]) .inc(); crate::observability::metrics::ANNOUNCEMENTS_LATENCY - .with_label_values(&["unknown"]) + .with_label_values(&[] as &[&str]) .observe(start.elapsed().as_secs_f64()); } } From ce9a72b8e9c04fc6bd1c06fdf17a1b319aaafbf8 Mon Sep 17 00:00:00 2001 From: Brooks Swinnerton Date: Fri, 3 Apr 2026 10:57:27 -0400 Subject: [PATCH 09/10] fix: remove announcement metrics from error paths Failed announce/withdraw calls should not be counted as announcements. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/api/handlers.rs | 24 +++++------------------- src/scheduler/reconcile.rs | 6 ------ 2 files changed, 5 insertions(+), 25 deletions(-) diff --git a/src/api/handlers.rs b/src/api/handlers.rs index 674b328..5cbb24f 100644 --- a/src/api/handlers.rs +++ b/src/api/handlers.rs @@ -535,9 +535,6 @@ async fn handle_unban( let start = std::time::Instant::now(); if let Err(e) = state.announcer.withdraw(&rule).await { - crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["error"]) - .inc(); tracing::error!(error = %e, "BGP withdrawal failed"); // Continue anyway - mark as withdrawn in DB } else { @@ -931,9 +928,6 @@ async fn handle_ban( let start = std::time::Instant::now(); if let Err(e) = state.announcer.announce(&rule).await { - crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["error"]) - .inc(); tracing::error!(error = %e, "BGP announcement failed"); mitigation.reject(e.to_string()); state @@ -1449,9 +1443,6 @@ pub async fn create_mitigation( let rule = FlowSpecRule::new(nlri, action); let start = std::time::Instant::now(); if let Err(e) = state.announcer.announce(&rule).await { - crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["error"]) - .inc(); return Ok(AppError(e).into_response()); } crate::observability::metrics::ANNOUNCEMENTS_TOTAL @@ -1515,13 +1506,11 @@ pub async fn withdraw_mitigation( let action = FlowSpecAction::from((mitigation.action_type, &mitigation.action_params)); let rule = FlowSpecRule::new(nlri, action); let start = std::time::Instant::now(); - state.announcer.withdraw(&rule).await.map_err(|e| { - crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["error"]) - .inc(); - let _ = e; - StatusCode::INTERNAL_SERVER_ERROR - })?; + state + .announcer + .withdraw(&rule) + .await + .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; crate::observability::metrics::ANNOUNCEMENTS_TOTAL .with_label_values(&["withdrawn"]) .inc(); @@ -1665,9 +1654,6 @@ pub async fn bulk_withdraw_mitigations( let rule = FlowSpecRule::new(nlri, action); let start = std::time::Instant::now(); if let Err(e) = state.announcer.withdraw(&rule).await { - crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["error"]) - .inc(); tracing::error!(error = %e, mitigation_id = %id, "BGP withdrawal failed in bulk withdraw"); } else { crate::observability::metrics::ANNOUNCEMENTS_TOTAL diff --git a/src/scheduler/reconcile.rs b/src/scheduler/reconcile.rs index 90e6c07..9938b9d 100644 --- a/src/scheduler/reconcile.rs +++ b/src/scheduler/reconcile.rs @@ -129,9 +129,6 @@ impl ReconciliationLoop { let rule = self.build_flowspec_rule(&mitigation); let start = std::time::Instant::now(); if let Err(e) = self.announcer.withdraw(&rule).await { - crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["error"]) - .inc(); tracing::warn!( mitigation_id = %mitigation.mitigation_id, error = %e, @@ -276,9 +273,6 @@ impl ReconciliationLoop { if !self.dry_run { let start = std::time::Instant::now(); if let Err(e) = self.announcer.announce(&rule).await { - crate::observability::metrics::ANNOUNCEMENTS_TOTAL - .with_label_values(&["error"]) - .inc(); tracing::error!( mitigation_id = %mitigation.mitigation_id, error = %e, From 247676170652ac61b5d51d43332ccaa3348bd1b0 Mon Sep 17 00:00:00 2001 From: Brooks Swinnerton Date: Fri, 3 Apr 2026 11:11:44 -0400 Subject: [PATCH 10/10] refactor: change ANNOUNCEMENTS_LATENCY from HistogramVec to Histogram No labels are needed since the peer label was removed. Using a plain Histogram avoids the awkward empty slice cast at call sites. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/api/handlers.rs | 14 +++----------- src/observability/metrics.rs | 9 ++++----- src/scheduler/reconcile.rs | 5 ----- 3 files changed, 7 insertions(+), 21 deletions(-) diff --git a/src/api/handlers.rs b/src/api/handlers.rs index 5cbb24f..7887c4a 100644 --- a/src/api/handlers.rs +++ b/src/api/handlers.rs @@ -542,7 +542,6 @@ async fn handle_unban( .with_label_values(&["withdrawn"]) .inc(); crate::observability::metrics::ANNOUNCEMENTS_LATENCY - .with_label_values(&[] as &[&str]) .observe(start.elapsed().as_secs_f64()); } } @@ -940,9 +939,7 @@ async fn handle_ban( crate::observability::metrics::ANNOUNCEMENTS_TOTAL .with_label_values(&["announced"]) .inc(); - crate::observability::metrics::ANNOUNCEMENTS_LATENCY - .with_label_values(&[] as &[&str]) - .observe(start.elapsed().as_secs_f64()); + crate::observability::metrics::ANNOUNCEMENTS_LATENCY.observe(start.elapsed().as_secs_f64()); } mitigation.activate(); @@ -1448,9 +1445,7 @@ pub async fn create_mitigation( crate::observability::metrics::ANNOUNCEMENTS_TOTAL .with_label_values(&["announced"]) .inc(); - crate::observability::metrics::ANNOUNCEMENTS_LATENCY - .with_label_values(&[] as &[&str]) - .observe(start.elapsed().as_secs_f64()); + crate::observability::metrics::ANNOUNCEMENTS_LATENCY.observe(start.elapsed().as_secs_f64()); } mitigation.activate(); @@ -1514,9 +1509,7 @@ pub async fn withdraw_mitigation( crate::observability::metrics::ANNOUNCEMENTS_TOTAL .with_label_values(&["withdrawn"]) .inc(); - crate::observability::metrics::ANNOUNCEMENTS_LATENCY - .with_label_values(&[] as &[&str]) - .observe(start.elapsed().as_secs_f64()); + crate::observability::metrics::ANNOUNCEMENTS_LATENCY.observe(start.elapsed().as_secs_f64()); } let action_type_str = mitigation.action_type.to_string(); @@ -1660,7 +1653,6 @@ pub async fn bulk_withdraw_mitigations( .with_label_values(&["withdrawn"]) .inc(); crate::observability::metrics::ANNOUNCEMENTS_LATENCY - .with_label_values(&[] as &[&str]) .observe(start.elapsed().as_secs_f64()); } } diff --git a/src/observability/metrics.rs b/src/observability/metrics.rs index b47e1c6..d86e388 100644 --- a/src/observability/metrics.rs +++ b/src/observability/metrics.rs @@ -1,7 +1,7 @@ use once_cell::sync::Lazy; use prometheus::{ - CounterVec, Encoder, GaugeVec, HistogramVec, TextEncoder, register_counter_vec, - register_gauge_vec, register_histogram_vec, + CounterVec, Encoder, GaugeVec, Histogram, HistogramVec, TextEncoder, register_counter_vec, + register_gauge_vec, register_histogram, register_histogram_vec, }; // Event metrics @@ -70,11 +70,10 @@ pub static ANNOUNCEMENTS_TOTAL: Lazy = Lazy::new(|| { .unwrap() }); -pub static ANNOUNCEMENTS_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( +pub static ANNOUNCEMENTS_LATENCY: Lazy = Lazy::new(|| { + register_histogram!( "prefixd_announcements_latency_seconds", "BGP announcement latency in seconds", - &[], vec![0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0] ) .unwrap() diff --git a/src/scheduler/reconcile.rs b/src/scheduler/reconcile.rs index 9938b9d..c45f043 100644 --- a/src/scheduler/reconcile.rs +++ b/src/scheduler/reconcile.rs @@ -139,7 +139,6 @@ impl ReconciliationLoop { .with_label_values(&["withdrawn"]) .inc(); crate::observability::metrics::ANNOUNCEMENTS_LATENCY - .with_label_values(&[] as &[&str]) .observe(start.elapsed().as_secs_f64()); } } @@ -234,7 +233,6 @@ impl ReconciliationLoop { .with_label_values(&["local"]) .set(active.len() as f64); - // Update MITIGATIONS_ACTIVE gauge by action_type and pop { use std::collections::HashMap; let mut counts: HashMap<(String, String), f64> = HashMap::new(); @@ -243,8 +241,6 @@ impl ReconciliationLoop { .entry((m.action_type.to_string(), m.pop.clone())) .or_default() += 1.0; } - // Reset all label combinations to zero first, then set observed values. - // This handles the case where a combination drops to zero. crate::observability::metrics::MITIGATIONS_ACTIVE.reset(); for ((action_type, pop), count) in &counts { crate::observability::metrics::MITIGATIONS_ACTIVE @@ -283,7 +279,6 @@ impl ReconciliationLoop { .with_label_values(&["announced"]) .inc(); crate::observability::metrics::ANNOUNCEMENTS_LATENCY - .with_label_values(&[] as &[&str]) .observe(start.elapsed().as_secs_f64()); } }