diff --git a/src/health_monitoring_lib/rust/common.rs b/src/health_monitoring_lib/rust/common.rs index 82e4f887..2fc158f5 100644 --- a/src/health_monitoring_lib/rust/common.rs +++ b/src/health_monitoring_lib/rust/common.rs @@ -12,11 +12,13 @@ // ******************************************************************************* use crate::deadline::DeadlineEvaluationError; +use crate::heartbeat::HeartbeatEvaluationError; use crate::log::ScoreDebug; use crate::tag::MonitorTag; use core::hash::Hash; use core::time::Duration; use std::sync::Arc; +use std::time::Instant; /// Range of accepted time. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] @@ -48,7 +50,7 @@ pub(crate) trait Monitor { #[allow(dead_code)] pub(crate) enum MonitorEvaluationError { Deadline(DeadlineEvaluationError), - Heartbeat, + Heartbeat(HeartbeatEvaluationError), Logic, } @@ -58,12 +60,19 @@ impl From for MonitorEvaluationError { } } +impl From for MonitorEvaluationError { + fn from(value: HeartbeatEvaluationError) -> Self { + MonitorEvaluationError::Heartbeat(value) + } +} + /// Trait for evaluating monitors and reporting errors to be used by HealthMonitor. pub(crate) trait MonitorEvaluator { /// Run monitor evaluation. /// + /// - `hmon_starting_point` - starting point of all monitors. /// - `on_error` - error handling, containing tag of failing object and error code. - fn evaluate(&self, on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError)); + fn evaluate(&self, hmon_starting_point: Instant, on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError)); } /// Handle to a monitor evaluator, allowing for dynamic dispatch. 
@@ -78,7 +87,76 @@ impl MonitorEvalHandle { } impl MonitorEvaluator for MonitorEvalHandle { - fn evaluate(&self, on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError)) { - self.inner.evaluate(on_error) + fn evaluate(&self, hmon_starting_point: Instant, on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError)) { + self.inner.evaluate(hmon_starting_point, on_error) + } +} + +/// Get offset between HMON and monitor starting time points as integers. +pub(crate) fn hmon_time_offset(hmon_starting_point: Instant, monitor_starting_point: Instant) -> T +where + T: TryFrom, + >::Error: core::fmt::Debug, +{ + let result = hmon_starting_point.checked_duration_since(monitor_starting_point); + let duration_since = result.expect("HMON starting point is earlier than monitor starting point"); + duration_to_int(duration_since) +} + +/// Get duration as an integer. +pub(crate) fn duration_to_int(duration: Duration) -> T +where + T: TryFrom, + >::Error: core::fmt::Debug, +{ + let millis = duration.as_millis(); + T::try_from(millis).expect("Monitor running for too long") +} + +#[cfg(all(test, not(loom)))] +mod tests { + use crate::common::{duration_to_int, hmon_time_offset}; + use core::time::Duration; + use std::time::Instant; + + #[test] + fn hmon_time_offset_valid() { + let monitor_starting_point = Instant::now(); + let hmon_starting_point = Instant::now(); + let offset: u32 = hmon_time_offset(hmon_starting_point, monitor_starting_point); + // Allow small offset. 
+ assert!(offset < 10); + } + + #[test] + #[should_panic(expected = "HMON starting point is earlier than monitor starting point")] + fn hmon_time_offset_wrong_order() { + let hmon_starting_point = Instant::now(); + let monitor_starting_point = Instant::now(); + let _offset: u32 = hmon_time_offset(hmon_starting_point, monitor_starting_point); + } + + #[test] + #[should_panic(expected = "Monitor running for too long")] + fn hmon_time_offset_diff_too_large() { + const HUNDRED_DAYS_AS_SECS: u64 = 100 * 24 * 60 * 60; + let monitor_starting_point = Instant::now(); + let hmon_starting_point = Instant::now() + .checked_add(Duration::from_secs(HUNDRED_DAYS_AS_SECS)) + .unwrap(); + let _offset: u32 = hmon_time_offset(hmon_starting_point, monitor_starting_point); + } + + #[test] + fn duration_to_int_valid() { + let result: u32 = duration_to_int(Duration::from_millis(1234)); + assert_eq!(result, 1234); + } + + #[test] + #[should_panic(expected = "Monitor running for too long")] + fn duration_to_int_too_large() { + const HUNDRED_DAYS_AS_SECS: u64 = 100 * 24 * 60 * 60; + let _result: u32 = duration_to_int(Duration::from_secs(HUNDRED_DAYS_AS_SECS)); } } diff --git a/src/health_monitoring_lib/rust/deadline/deadline_monitor.rs b/src/health_monitoring_lib/rust/deadline/deadline_monitor.rs index 1f1c7cd8..5a138db9 100644 --- a/src/health_monitoring_lib/rust/deadline/deadline_monitor.rs +++ b/src/health_monitoring_lib/rust/deadline/deadline_monitor.rs @@ -10,13 +10,12 @@ // // SPDX-License-Identifier: Apache-2.0 // ******************************************************************************* -use crate::common::{Monitor, MonitorEvalHandle, MonitorEvaluationError, MonitorEvaluator}; +use crate::common::{duration_to_int, Monitor, MonitorEvalHandle, MonitorEvaluationError, MonitorEvaluator, TimeRange}; use crate::deadline::common::{DeadlineTemplate, StateIndex}; use crate::deadline::deadline_state::{DeadlineState, DeadlineStateSnapshot}; use crate::log::{error, warn, ScoreDebug}; use 
crate::protected_memory::ProtectedMemoryAllocator; use crate::tag::{DeadlineTag, MonitorTag}; -use crate::TimeRange; use core::hash::Hash; use std::collections::HashMap; use std::sync::Arc; @@ -153,7 +152,7 @@ impl Deadline { /// Caller must ensure that deadline is not used until it's stopped. /// After this call You shall assure there's only a single owner of the `Deadline` instance and it does not call start before stopping. pub(super) unsafe fn start_internal(&mut self) -> Result<(), DeadlineError> { - let now = self.monitor.now(); + let now = duration_to_int::(self.monitor.monitor_starting_point.elapsed()); let max_time = now + self.range.max.as_millis() as u32; let mut is_broken = false; @@ -178,7 +177,7 @@ impl Deadline { } pub(super) fn stop_internal(&mut self) { - let now = self.monitor.now(); + let now = duration_to_int::(self.monitor.monitor_starting_point.elapsed()); let max = self.range.max.as_millis() as u32; let min = self.range.min.as_millis() as u32; @@ -260,7 +259,7 @@ struct DeadlineMonitorInner { } impl MonitorEvaluator for DeadlineMonitorInner { - fn evaluate(&self, on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError)) { + fn evaluate(&self, _hmon_starting_point: Instant, on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError)) { for (deadline_tag, deadline) in self.active_deadlines.iter() { let snapshot = deadline.snapshot(); if snapshot.is_underrun() { @@ -275,7 +274,7 @@ impl MonitorEvaluator for DeadlineMonitorInner { "Deadline snapshot cannot be both running and stopped" ); - let now = self.now(); + let now = duration_to_int::(self.monitor_starting_point.elapsed()); let expected = snapshot.timestamp_ms(); if now > expected { // Deadline missed, report @@ -336,13 +335,6 @@ impl DeadlineMonitorInner { Err(DeadlineMonitorError::DeadlineNotFound) } } - - fn now(&self) -> u32 { - let duration = self.monitor_starting_point.elapsed(); - // As u32 can hold up to ~49 days in milliseconds, this should be sufficient for our use case - 
// We still have a room up to 60bits timestamp if needed in future - u32::try_from(duration.as_millis()).expect("Monitor running for too long") - } } #[score_testing_macros::test_mod_with_log] @@ -410,6 +402,7 @@ mod tests { #[test] fn start_stop_deadline_within_range_works() { let monitor = create_monitor_with_deadlines(); + let hmon_starting_point = Instant::now(); let mut deadline = monitor.get_deadline(DeadlineTag::from("deadline_long")).unwrap(); let handle = deadline.start().unwrap(); @@ -417,49 +410,57 @@ mod tests { drop(handle); // stop the deadline - monitor.inner.evaluate(&mut |monitor_tag, deadline_failure| { - panic!( - "Deadline {:?} should not have failed or underrun({:?})", - monitor_tag, deadline_failure - ); - }); + monitor + .inner + .evaluate(hmon_starting_point, &mut |monitor_tag, deadline_failure| { + panic!( + "Deadline {:?} should not have failed or underrun({:?})", + monitor_tag, deadline_failure + ); + }); } #[test] fn start_stop_deadline_outside_ranges_is_error_when_dropped_before_evaluate() { let monitor = create_monitor_with_deadlines(); + let hmon_starting_point = Instant::now(); let mut deadline = monitor.get_deadline(DeadlineTag::from("deadline_long")).unwrap(); let handle = deadline.start().unwrap(); drop(handle); // stop the deadline - monitor.inner.evaluate(&mut |monitor_tag, deadline_failure| { - assert_eq!( - deadline_failure, - DeadlineEvaluationError::TooEarly.into(), - "Deadline {:?} should not have failed({:?})", - monitor_tag, - deadline_failure - ); - }); + monitor + .inner + .evaluate(hmon_starting_point, &mut |monitor_tag, deadline_failure| { + assert_eq!( + deadline_failure, + DeadlineEvaluationError::TooEarly.into(), + "Deadline {:?} should not have failed({:?})", + monitor_tag, + deadline_failure + ); + }); } #[test] fn deadline_outside_time_range_is_error_when_dropped_after_evaluate() { let monitor = create_monitor_with_deadlines(); + let hmon_starting_point = Instant::now(); let mut deadline = 
monitor.get_deadline(DeadlineTag::from("deadline_long")).unwrap(); let handle = deadline.start().unwrap(); // So deadline stop happens after evaluate, still it should be reported as failed - monitor.inner.evaluate(&mut |monitor_tag, deadline_failure| { - assert_eq!( - deadline_failure, - DeadlineEvaluationError::TooEarly.into(), - "Deadline {:?} should not have failed({:?})", - monitor_tag, - deadline_failure - ); - }); + monitor + .inner + .evaluate(hmon_starting_point, &mut |monitor_tag, deadline_failure| { + assert_eq!( + deadline_failure, + DeadlineEvaluationError::TooEarly.into(), + "Deadline {:?} should not have failed({:?})", + monitor_tag, + deadline_failure + ); + }); drop(handle); // stop the deadline } @@ -467,6 +468,7 @@ mod tests { #[test] fn deadline_failed_on_first_run_and_then_restarted_is_evaluated_as_error() { let monitor = create_monitor_with_deadlines(); + let hmon_starting_point = Instant::now(); let mut deadline = monitor.get_deadline(DeadlineTag::from("deadline_long")).unwrap(); let handle = deadline.start().unwrap(); @@ -477,39 +479,45 @@ mod tests { let handle = deadline.start(); assert_eq!(handle.err(), Some(DeadlineError::DeadlineAlreadyFailed)); - monitor.inner.evaluate(&mut |monitor_tag, deadline_failure| { - assert_eq!( - deadline_failure, - DeadlineEvaluationError::TooEarly.into(), - "Deadline {:?} should not have failed ({:?})", - monitor_tag, - deadline_failure - ); - }); + monitor + .inner + .evaluate(hmon_starting_point, &mut |monitor_tag, deadline_failure| { + assert_eq!( + deadline_failure, + DeadlineEvaluationError::TooEarly.into(), + "Deadline {:?} should not have failed ({:?})", + monitor_tag, + deadline_failure + ); + }); } #[test] fn start_stop_deadline_outside_ranges_is_evaluated_as_error() { let monitor = create_monitor_with_deadlines(); + let hmon_starting_point = Instant::now(); let mut deadline = monitor.get_deadline(DeadlineTag::from("deadline_fast")).unwrap(); let handle = deadline.start().unwrap(); drop(handle); // 
stop the deadline - monitor.inner.evaluate(&mut |monitor_tag, deadline_failure| { - assert_eq!( - deadline_failure, - DeadlineEvaluationError::TooLate.into(), - "Deadline {:?} should not have failed({:?})", - monitor_tag, - deadline_failure - ); - }); + monitor + .inner + .evaluate(hmon_starting_point, &mut |monitor_tag, deadline_failure| { + assert_eq!( + deadline_failure, + DeadlineEvaluationError::TooLate.into(), + "Deadline {:?} should not have failed({:?})", + monitor_tag, + deadline_failure + ); + }); } #[test] fn monitor_with_multiple_running_deadlines() { let monitor = create_monitor_with_multiple_running_deadlines(); + let hmon_starting_point = Instant::now(); let mut deadline = monitor.get_deadline(DeadlineTag::from("deadline_fast1")).unwrap(); let _handle1 = deadline.start().unwrap(); @@ -524,16 +532,18 @@ mod tests { let mut cnt = 0; - monitor.inner.evaluate(&mut |monitor_tag, deadline_failure| { - cnt += 1; - assert_eq!( - deadline_failure, - DeadlineEvaluationError::TooLate.into(), - "Deadline {:?} should not have failed({:?})", - monitor_tag, - deadline_failure - ); - }); + monitor + .inner + .evaluate(hmon_starting_point, &mut |monitor_tag, deadline_failure| { + cnt += 1; + assert_eq!( + deadline_failure, + DeadlineEvaluationError::TooLate.into(), + "Deadline {:?} should not have failed({:?})", + monitor_tag, + deadline_failure + ); + }); assert_eq!(cnt, 3, "All three deadlines should have been evaluated"); } diff --git a/src/health_monitoring_lib/rust/heartbeat/heartbeat_monitor.rs b/src/health_monitoring_lib/rust/heartbeat/heartbeat_monitor.rs new file mode 100644 index 00000000..ec446367 --- /dev/null +++ b/src/health_monitoring_lib/rust/heartbeat/heartbeat_monitor.rs @@ -0,0 +1,751 @@ +// ******************************************************************************* +// Copyright (c) 2026 Contributors to the Eclipse Foundation +// +// See the NOTICE file(s) distributed with this work for additional +// information regarding copyright ownership. 
+// +// This program and the accompanying materials are made available under the +// terms of the Apache License Version 2.0 which is available at +// +// +// SPDX-License-Identifier: Apache-2.0 +// ******************************************************************************* + +use crate::common::{ + duration_to_int, hmon_time_offset, Monitor, MonitorEvalHandle, MonitorEvaluationError, MonitorEvaluator, TimeRange, +}; +use crate::heartbeat::heartbeat_state::{HeartbeatState, HeartbeatStateSnapshot}; +use crate::log::{error, warn}; +use crate::protected_memory::ProtectedMemoryAllocator; +use crate::tag::MonitorTag; +use crate::HealthMonitorError; +use core::sync::atomic::{AtomicU64, Ordering}; +use core::time::Duration; +use score_log::ScoreDebug; +use std::sync::Arc; +use std::time::Instant; + +/// Heartbeat evaluation errors. +#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash, ScoreDebug)] +pub(crate) enum HeartbeatEvaluationError { + /// Finished too early. + TooEarly, + /// Finished too late. + TooLate, + /// Multiple heartbeats observed. + MultipleHeartbeats, +} + +/// Builder for [`HeartbeatMonitor`]. +#[derive(Debug)] +pub struct HeartbeatMonitorBuilder { + /// Time range between heartbeats. + range: TimeRange, +} + +impl HeartbeatMonitorBuilder { + /// Create a new [`HeartbeatMonitorBuilder`]. + /// + /// - `range` - time range between heartbeats. + pub fn new(range: TimeRange) -> Self { + Self { range } + } + + /// Build the [`HeartbeatMonitor`]. + /// + /// - `monitor_tag` - tag of this monitor. + /// - `internal_processing_cycle` - health monitor processing cycle. + /// - `_allocator` - protected memory allocator. + pub(crate) fn build( + self, + monitor_tag: MonitorTag, + internal_processing_cycle: Duration, + _allocator: &ProtectedMemoryAllocator, + ) -> Result { + // Check range is valid. 
+ let range_min_ms = self.range.min.as_millis() as u64; + let internal_processing_cycle_ms = internal_processing_cycle.as_millis() as u64; + if range_min_ms * 2 <= internal_processing_cycle_ms { + error!( + "Internal processing cycle duration ({} ms) must be shorter than two shortest allowed ranges ({} ms).", + internal_processing_cycle_ms, range_min_ms + ); + return Err(HealthMonitorError::InvalidArgument); + } + + let inner = Arc::new(HeartbeatMonitorInner::new(monitor_tag, self.range)); + Ok(HeartbeatMonitor::new(inner)) + } +} + +/// Heartbeat monitor. +pub struct HeartbeatMonitor { + inner: Arc, +} + +impl HeartbeatMonitor { + /// Create a new [`HeartbeatMonitor`] instance. + fn new(inner: Arc) -> Self { + Self { inner } + } + + /// Provide a heartbeat. + pub fn heartbeat(&self) { + self.inner.heartbeat() + } +} + +impl Monitor for HeartbeatMonitor { + fn get_eval_handle(&self) -> crate::common::MonitorEvalHandle { + MonitorEvalHandle::new(Arc::new(HeartbeatMonitorHandle { + inner: Arc::clone(&self.inner), + start_timestamp: AtomicU64::new(0), + })) + } +} + +struct HeartbeatMonitorHandle { + inner: Arc, + start_timestamp: AtomicU64, +} + +impl MonitorEvaluator for HeartbeatMonitorHandle { + fn evaluate(&self, hmon_starting_point: Instant, on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError)) { + let start_timestamp = self.start_timestamp.load(Ordering::Relaxed); + let evaluate_result = self.inner.evaluate(start_timestamp, hmon_starting_point, on_error); + if let Some(new_start_timestamp) = evaluate_result { + self.start_timestamp.store(new_start_timestamp, Ordering::Relaxed); + } + } +} + +/// Time range using [`u64`]. +#[derive(ScoreDebug)] +struct InternalRange { + min: u64, + max: u64, +} + +impl InternalRange { + /// Create range using provided values. + fn new(min: u64, max: u64) -> Self { + assert!(min <= max, "provided min is greater than provided max"); + Self { min, max } + } + + /// Create range with values offset by timestamp. 
+ fn offset(&self, timestamp: u64) -> Self { + Self::new(self.min + timestamp, self.max + timestamp) + } +} + +impl From for InternalRange { + fn from(value: TimeRange) -> Self { + let min = duration_to_int(value.min); + let max = duration_to_int(value.max); + Self::new(min, max) + } +} + +pub(crate) struct HeartbeatMonitorInner { + /// Tag of this monitor. + monitor_tag: MonitorTag, + + /// Time range between heartbeats. + range: InternalRange, + + /// Monitor starting point. + /// Offset is calculated during evaluation in relation to provided health monitor starting point. + monitor_starting_point: Instant, + + /// Current heartbeat state. + /// Contains data in relation to [`Self::monitor_starting_point`]. + heartbeat_state: HeartbeatState, +} + +impl HeartbeatMonitorInner { + fn new(monitor_tag: MonitorTag, range: TimeRange) -> Self { + let monitor_starting_point = Instant::now(); + let heartbeat_state_snapshot = HeartbeatStateSnapshot::default(); + let heartbeat_state = HeartbeatState::new(heartbeat_state_snapshot); + Self { + monitor_tag, + range: InternalRange::from(range), + monitor_starting_point, + heartbeat_state, + } + } + + /// Provide a heartbeat. + fn heartbeat(&self) { + // Get current timestamp. + let now = duration_to_int(self.monitor_starting_point.elapsed()); + + // Set heartbeat timestamp and update counter. + let _ = self.heartbeat_state.update(|mut state| { + state.set_heartbeat_timestamp(now); + state.increment_counter(); + Some(state) + }); + } + + pub fn evaluate( + &self, + start_timestamp: u64, + hmon_starting_point: Instant, + on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError), + ) -> Option { + // Get current timestamp, with offset to HMON time. + let offset = hmon_time_offset(hmon_starting_point, self.monitor_starting_point); + let now = offset + duration_to_int::(hmon_starting_point.elapsed()); + + // Load current monitor state. 
+ let snapshot = self.heartbeat_state.snapshot(); + + // Get and recalculate snapshot timestamps. + // IMPORTANT: first heartbeat is obtained when HMON time is unknown. + // It is necessary to: + // - use offset as cycle starting point. + // - get heartbeat snapshot in relation to zero point. + let start_timestamp = if snapshot.post_init() { start_timestamp } else { offset }; + let heartbeat_timestamp = snapshot.heartbeat_timestamp(); + + // Get allowed time range as absolute values. + let range = self.range.offset(start_timestamp); + + // Check current counter state. + let counter = snapshot.counter(); + // Disallow multiple heartbeats in same heartbeat cycle. + if counter > 1 { + warn!("Multiple heartbeats detected"); + on_error(&self.monitor_tag, HeartbeatEvaluationError::MultipleHeartbeats.into()); + return None; + } + // Handle no heartbeats. + else if counter == 0 { + // Disallow no heartbeats when already out of time range. + // Stop execution if still in range. + if now > range.max { + let offset = now - range.max; + warn!("No heartbeat detected, observed after range: {}", offset); + on_error(&self.monitor_tag, HeartbeatEvaluationError::TooLate.into()); + } + // Either way - execution is stopped here. + return None; + } + + // Check current heartbeat state. + // Heartbeat before allowed range. + if heartbeat_timestamp < range.min { + let offset = range.min - heartbeat_timestamp; + warn!("Heartbeat occurred too early, offset to range: {}", offset); + on_error(&self.monitor_tag, HeartbeatEvaluationError::TooEarly.into()); + None + } + // Heartbeat after allowed range. + else if heartbeat_timestamp > range.max { + let offset = heartbeat_timestamp - range.max; + warn!("Heartbeat occurred too late, offset to range: {}", offset); + on_error(&self.monitor_tag, HeartbeatEvaluationError::TooLate.into()); + None + } + // Heartbeat in allowed state. 
+ else { + let _ = self.heartbeat_state.update(|_| { + let mut snapshot = HeartbeatStateSnapshot::new(); + snapshot.set_post_init(true); + Some(snapshot) + }); + // Update heartbeat monitor state with a current heartbeat as a beginning of a new cycle. + Some(heartbeat_timestamp) + } + } +} + +#[cfg(test)] +mod test_common { + use crate::TimeRange; + use core::time::Duration; + use std::thread::sleep; + use std::time::Instant; + + pub(super) const TAG: &str = "heartbeat_monitor"; + + pub(super) fn sleep_until(target: Duration, start: Instant) { + let elapsed = start.elapsed(); + let diff = target.saturating_sub(elapsed); + sleep(diff) + } + + pub(super) fn range_from_ms(min: u64, max: u64) -> TimeRange { + TimeRange::new(Duration::from_millis(min), Duration::from_millis(max)) + } +} + +#[score_testing_macros::test_mod_with_log] +#[cfg(all(test, not(loom)))] +mod tests { + use crate::common::{Monitor, MonitorEvaluationError, MonitorEvaluator, TimeRange}; + use crate::heartbeat::heartbeat_monitor::test_common::{range_from_ms, sleep_until, TAG}; + use crate::heartbeat::{HeartbeatEvaluationError, HeartbeatMonitor, HeartbeatMonitorBuilder}; + use crate::protected_memory::ProtectedMemoryAllocator; + use crate::tag::MonitorTag; + use crate::HealthMonitorError; + use core::sync::atomic::{AtomicBool, Ordering}; + use core::time::Duration; + use std::sync::Arc; + use std::thread::{sleep, spawn}; + use std::time::Instant; + + #[test] + fn heartbeat_monitor_builder_build_ok() { + let range = TimeRange::new(Duration::from_millis(500), Duration::from_millis(1000)); + let monitor_tag = MonitorTag::from("heartbeat_monitor"); + let internal_processing_cycle = Duration::from_millis(100); + let allocator = ProtectedMemoryAllocator {}; + let result = HeartbeatMonitorBuilder::new(range).build(monitor_tag, internal_processing_cycle, &allocator); + assert!(result.is_ok()); + } + + #[test] + fn heartbeat_monitor_builder_build_invalid_internal_processing_cycle() { + let range = 
TimeRange::new(Duration::from_millis(500), Duration::from_millis(1000)); + let monitor_tag = MonitorTag::from("heartbeat_monitor"); + let internal_processing_cycle = Duration::from_millis(1000); + let allocator = ProtectedMemoryAllocator {}; + let result = HeartbeatMonitorBuilder::new(range).build(monitor_tag, internal_processing_cycle, &allocator); + assert!(result.is_err_and(|e| e == HealthMonitorError::InvalidArgument)); + } + + fn create_monitor_single_cycle(range: TimeRange) -> HeartbeatMonitor { + let monitor_tag = MonitorTag::from(TAG); + let internal_processing_cycle = Duration::from_millis(1); + let allocator = ProtectedMemoryAllocator {}; + HeartbeatMonitorBuilder::new(range) + .build(monitor_tag, internal_processing_cycle, &allocator) + .unwrap() + } + + #[test] + fn heartbeat_monitor_no_beat_evaluate_early() { + let range = range_from_ms(80, 120); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // No beat happened, no error is expected. + monitor + .get_eval_handle() + .evaluate(hmon_starting_point, &mut |monitor_tag, error| { + panic!("error happened, tag: {monitor_tag:?}, error: {error:?}") + }); + } + + #[test] + #[cfg_attr(miri, ignore)] + fn heartbeat_monitor_no_beat_evaluate_in_range() { + let range = range_from_ms(80, 120); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // Wait until middle of range. + sleep_until(Duration::from_millis(100), hmon_starting_point); + + // No beat happened, no error is expected. + monitor + .get_eval_handle() + .evaluate(hmon_starting_point, &mut |monitor_tag, error| { + panic!("error happened, tag: {monitor_tag:?}, error: {error:?}") + }); + } + + #[test] + fn heartbeat_monitor_no_beat_evaluate_late() { + let range = range_from_ms(80, 120); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // Wait until late. 
+ sleep_until(Duration::from_millis(150), hmon_starting_point); + + // No beat happened, too late error is expected. + monitor + .get_eval_handle() + .evaluate(hmon_starting_point, &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::TooLate.into()); + }); + } + + fn beat_eval_test( + beat_time: Duration, + eval_time: Duration, + on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError), + ) { + let range = range_from_ms(80, 120); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // Wait and beat. + sleep_until(beat_time, hmon_starting_point); + monitor.heartbeat(); + + // Wait and evaluate. + sleep_until(eval_time, hmon_starting_point); + monitor.get_eval_handle().evaluate(hmon_starting_point, on_error); + } + + fn beat_early_test(eval_time: Duration) { + beat_eval_test(Duration::from_millis(25), eval_time, &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::TooEarly.into()); + }); + } + + #[test] + fn heartbeat_monitor_beat_early_evaluate_early() { + beat_early_test(Duration::from_millis(50)); + } + + #[test] + fn heartbeat_monitor_beat_early_evaluate_in_range() { + beat_early_test(Duration::from_millis(100)); + } + + #[test] + fn heartbeat_monitor_beat_early_evaluate_late() { + beat_early_test(Duration::from_millis(150)); + } + + fn beat_in_range_test(eval_time: Duration) { + beat_eval_test(Duration::from_millis(90), eval_time, &mut |monitor_tag, error| { + panic!("error happened, tag: {monitor_tag:?}, error: {error:?}") + }); + } + + #[test] + fn heartbeat_monitor_beat_in_range_evaluate_in_range() { + beat_in_range_test(Duration::from_millis(100)); + } + + #[test] + fn heartbeat_monitor_beat_in_range_evaluate_late() { + beat_in_range_test(Duration::from_millis(150)); + } + + #[test] + fn heartbeat_monitor_beat_late_evaluate_late() { + beat_eval_test( + 
Duration::from_millis(150), + Duration::from_millis(200), + &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::TooLate.into()); + }, + ) + } + + fn multiple_beats_eval_test(beat_time: Duration, eval_time: Duration) { + let range = range_from_ms(80, 120); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // Wait and beat. + sleep_until(beat_time, hmon_starting_point); + const NUM_BEATS: usize = 10; + for _ in 0..NUM_BEATS { + monitor.heartbeat(); + } + + // Wait and evaluate. + sleep_until(eval_time, hmon_starting_point); + monitor + .get_eval_handle() + .evaluate(hmon_starting_point, &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::MultipleHeartbeats.into()); + }); + } + + #[test] + fn heartbeat_monitor_multiple_beats_early_evaluate_early() { + multiple_beats_eval_test(Duration::from_millis(25), Duration::from_millis(50)) + } + + #[test] + fn heartbeat_monitor_multiple_beats_early_evaluate_in_range() { + multiple_beats_eval_test(Duration::from_millis(25), Duration::from_millis(100)) + } + + #[test] + fn heartbeat_monitor_multiple_beats_early_evaluate_late() { + multiple_beats_eval_test(Duration::from_millis(25), Duration::from_millis(150)) + } + + #[test] + fn heartbeat_monitor_multiple_beats_in_range_evaluate_in_range() { + multiple_beats_eval_test(Duration::from_millis(90), Duration::from_millis(100)) + } + + #[test] + fn heartbeat_monitor_multiple_beats_in_range_evaluate_late() { + multiple_beats_eval_test(Duration::from_millis(90), Duration::from_millis(150)) + } + + #[test] + fn heartbeat_monitor_multiple_beats_late_evaluate_late() { + multiple_beats_eval_test(Duration::from_millis(150), Duration::from_millis(200)) + } + + fn create_monitor_multiple_cycles(cycle: Duration) -> Arc { + let range = range_from_ms(80, 120); + let monitor_tag = MonitorTag::from(TAG); + 
let allocator = ProtectedMemoryAllocator {}; + let monitor = HeartbeatMonitorBuilder::new(range) + .build(monitor_tag, cycle, &allocator) + .unwrap(); + Arc::new(monitor) + } + + #[test] + #[cfg_attr(miri, ignore)] + fn heartbeat_monitor_cycle_early() { + let cycle = Duration::from_millis(20); + let monitor = create_monitor_multiple_cycles(cycle); + let hmon_starting_point = Instant::now(); + + // Run heartbeat thread. + let monitor_clone = monitor.clone(); + let heartbeat_finished = Arc::new(AtomicBool::new(false)); + let heartbeat_finished_clone = heartbeat_finished.clone(); + let heartbeat_thread = spawn(move || { + const NUM_BEATS: u32 = 3; + const BEAT_INTERVAL: Duration = Duration::from_millis(100); + for i in 1..NUM_BEATS { + sleep_until(i * BEAT_INTERVAL, hmon_starting_point); + monitor_clone.heartbeat(); + } + + // Perform a last heartbeat in shorter interval. + sleep_until( + NUM_BEATS * BEAT_INTERVAL - Duration::from_millis(40), + hmon_starting_point, + ); + monitor_clone.heartbeat(); + + heartbeat_finished_clone.store(true, Ordering::Release); + }); + + // Run evaluation thread. + let eval_handle = monitor.get_eval_handle(); + while !heartbeat_finished.load(Ordering::Acquire) { + sleep(cycle); + // Too early error is expected. + eval_handle.evaluate(hmon_starting_point, &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::TooEarly.into()); + }); + } + + heartbeat_thread.join().unwrap(); + } + + #[test] + #[cfg_attr(miri, ignore)] + fn heartbeat_monitor_cycle_in_range() { + let cycle = Duration::from_millis(20); + let monitor = create_monitor_multiple_cycles(cycle); + let hmon_starting_point = Instant::now(); + + // Run heartbeat thread. 
+ let monitor_clone = monitor.clone(); + let heartbeat_finished = Arc::new(AtomicBool::new(false)); + let heartbeat_finished_clone = heartbeat_finished.clone(); + let heartbeat_thread = spawn(move || { + const NUM_BEATS: u32 = 3; + const BEAT_INTERVAL: Duration = Duration::from_millis(100); + for i in 1..=NUM_BEATS { + sleep_until(i * BEAT_INTERVAL, hmon_starting_point); + monitor_clone.heartbeat(); + } + heartbeat_finished_clone.store(true, Ordering::Release); + }); + + // Run evaluation thread. + let eval_handle = monitor.get_eval_handle(); + while !heartbeat_finished.load(Ordering::Acquire) { + sleep(cycle); + // No error is expected. + eval_handle.evaluate(hmon_starting_point, &mut |monitor_tag, error| { + panic!("error happened, tag: {monitor_tag:?}, error: {error:?}") + }); + } + + heartbeat_thread.join().unwrap(); + } + + #[test] + #[cfg_attr(miri, ignore)] + fn heartbeat_monitor_cycle_late() { + let cycle = Duration::from_millis(20); + let monitor = create_monitor_multiple_cycles(cycle); + let hmon_starting_point = Instant::now(); + + // Run heartbeat thread. + let monitor_clone = monitor.clone(); + let heartbeat_finished = Arc::new(AtomicBool::new(false)); + let heartbeat_finished_clone = heartbeat_finished.clone(); + let heartbeat_thread = spawn(move || { + const NUM_BEATS: u32 = 3; + const BEAT_INTERVAL: Duration = Duration::from_millis(100); + for i in 1..NUM_BEATS { + sleep_until(i * BEAT_INTERVAL, hmon_starting_point); + monitor_clone.heartbeat(); + } + + // Perform a last heartbeat in longer interval. + sleep_until( + NUM_BEATS * BEAT_INTERVAL + Duration::from_millis(40), + hmon_starting_point, + ); + monitor_clone.heartbeat(); + + heartbeat_finished_clone.store(true, Ordering::Release); + }); + + // Run evaluation thread. + let eval_handle = monitor.get_eval_handle(); + while !heartbeat_finished.load(Ordering::Acquire) { + sleep(cycle); + // No heartbeat or too late error is expected.
+ eval_handle.evaluate(hmon_starting_point, &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::TooLate.into()); + }); + } + + heartbeat_thread.join().unwrap(); + } + + #[test] + fn heartbeat_monitor_timestamp_offset() { + let range = range_from_ms(80, 120); + let monitor = create_monitor_single_cycle(range); + + // Move away monitor creation and HMON starting point. + sleep(Duration::from_millis(300)); + let hmon_starting_point = Instant::now(); + + // Wait and beat. + sleep_until(Duration::from_millis(90), hmon_starting_point); + monitor.heartbeat(); + + // Wait and evaluate. + sleep_until(Duration::from_millis(100), hmon_starting_point); + monitor + .get_eval_handle() + .evaluate(hmon_starting_point, &mut |monitor_tag, error| { + panic!("error happened, tag: {monitor_tag:?}, error: {error:?}") + }); + } +} + +#[cfg(all(test, loom))] +mod loom_tests { + use crate::common::{Monitor, MonitorEvaluator}; + use crate::heartbeat::heartbeat_monitor::test_common::{range_from_ms, sleep_until, TAG}; + use crate::heartbeat::{HeartbeatEvaluationError, HeartbeatMonitor, HeartbeatMonitorBuilder}; + use crate::protected_memory::ProtectedMemoryAllocator; + use crate::tag::MonitorTag; + use crate::TimeRange; + use core::time::Duration; + use loom::thread::spawn; + use std::sync::Arc; + use std::time::Instant; + + fn create_monitor_single_cycle(range: TimeRange) -> Arc { + let monitor_tag = MonitorTag::from(TAG); + let internal_processing_cycle = Duration::from_millis(1); + let allocator = ProtectedMemoryAllocator {}; + let monitor = HeartbeatMonitorBuilder::new(range) + .build(monitor_tag, internal_processing_cycle, &allocator) + .unwrap(); + Arc::new(monitor) + } + + #[test] + fn heartbeat_monitor_heartbeat_evaluate_too_early() { + loom::model(|| { + let range = range_from_ms(30, 70); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // Perform heartbeat in 
a separate thread. + let monitor_clone = monitor.clone(); + let heartbeat_thread = spawn(move || monitor_clone.heartbeat()); + + // Evaluate. + monitor + .get_eval_handle() + .evaluate(hmon_starting_point, &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::TooEarly.into()); + }); + + heartbeat_thread.join().unwrap(); + }); + } + + #[test] + fn heartbeat_monitor_heartbeat_evaluate_in_range() { + loom::model(|| { + let range = range_from_ms(30, 70); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // Wait until in range. + sleep_until(Duration::from_millis(50), hmon_starting_point); + + // Perform heartbeat in a separate thread. + let monitor_clone = monitor.clone(); + let heartbeat_thread = spawn(move || monitor_clone.heartbeat()); + + // Evaluate. + monitor + .get_eval_handle() + .evaluate(hmon_starting_point, &mut |monitor_tag, error| { + panic!("error happened, tag: {monitor_tag:?}, error: {error:?}"); + }); + + heartbeat_thread.join().unwrap(); + }); + } + + #[test] + fn heartbeat_monitor_heartbeat_evaluate_too_late() { + loom::model(|| { + let range = range_from_ms(30, 70); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // Wait until too late. + sleep_until(Duration::from_millis(100), hmon_starting_point); + + // Perform heartbeat in a separate thread. + let monitor_clone = monitor.clone(); + let heartbeat_thread = spawn(move || monitor_clone.heartbeat()); + + // Evaluate. 
+ let mut error_detected = false; + monitor + .get_eval_handle() + .evaluate(hmon_starting_point, &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::TooLate.into()); + error_detected = true; + }); + + heartbeat_thread.join().unwrap(); + assert!(error_detected); + }); + } +} diff --git a/src/health_monitoring_lib/rust/heartbeat/heartbeat_state.rs b/src/health_monitoring_lib/rust/heartbeat/heartbeat_state.rs new file mode 100644 index 00000000..f6a34746 --- /dev/null +++ b/src/health_monitoring_lib/rust/heartbeat/heartbeat_state.rs @@ -0,0 +1,260 @@ +// ******************************************************************************* +// Copyright (c) 2026 Contributors to the Eclipse Foundation +// +// See the NOTICE file(s) distributed with this work for additional +// information regarding copyright ownership. +// +// This program and the accompanying materials are made available under the +// terms of the Apache License Version 2.0 which is available at +// +// +// SPDX-License-Identifier: Apache-2.0 +// ******************************************************************************* + +use core::cmp::min; + +#[cfg(not(loom))] +use core::sync::atomic::{AtomicU64, Ordering}; +#[cfg(loom)] +use loom::sync::atomic::{AtomicU64, Ordering}; + +/// Snapshot of a heartbeat state. +/// Data layout: +/// - heartbeat timestamp: 61 bits +/// - heartbeat counter: 2 bits +/// - post-init flag: 1 bit +#[derive(Clone, Copy, Default)] +pub struct HeartbeatStateSnapshot(u64); + +const BEAT_MASK: u64 = 0xFFFFFFFF_FFFFFFF8; +const BEAT_OFFSET: u32 = 3; +const COUNT_MASK: u64 = 0b0110; +const COUNT_OFFSET: u32 = 1; +const POST_INIT_MASK: u64 = 0b0001; + +impl HeartbeatStateSnapshot { + /// Create a new snapshot. + pub fn new() -> Self { + Self(0) + } + + /// Return underlying data. + pub fn as_u64(&self) -> u64 { + self.0 + } + + /// Heartbeat timestamp. 
+ pub fn heartbeat_timestamp(&self) -> u64 { + (self.0 & BEAT_MASK) >> BEAT_OFFSET + } + + /// Set heartbeat timestamp. + /// Value is 61-bit, must not exceed 0x1FFFFFFF_FFFFFFFF. + pub fn set_heartbeat_timestamp(&mut self, value: u64) { + assert!(value < 1 << 61, "provided heartbeat offset is out of range"); + self.0 = (value << BEAT_OFFSET) | (self.0 & !BEAT_MASK); + } + + /// Heartbeat counter. + pub fn counter(&self) -> u8 { + ((self.0 & COUNT_MASK) >> COUNT_OFFSET) as u8 + } + + /// Increment heartbeat counter. + /// Value is 2-bit, larger values are saturated to max value (3). + pub fn increment_counter(&mut self) { + let value = min(self.counter() + 1, 3); + self.0 = ((value as u64) << COUNT_OFFSET) | (self.0 & !COUNT_MASK); + } + + /// Post-init state. + /// This should be `false` only before first cycle is concluded. + pub fn post_init(&self) -> bool { + let value = self.0 & POST_INIT_MASK; + value != 0 + } + + /// Set post-init state. + pub fn set_post_init(&mut self, value: bool) { + self.0 = (value as u64) | (self.0 & !POST_INIT_MASK); + } +} + +impl From<u64> for HeartbeatStateSnapshot { + fn from(value: u64) -> Self { + Self(value) + } +} + +/// Atomic representation of [`HeartbeatStateSnapshot`]. +pub struct HeartbeatState(AtomicU64); + +impl HeartbeatState { + /// Create a new [`HeartbeatState`] using provided [`HeartbeatStateSnapshot`]. + pub fn new(snapshot: HeartbeatStateSnapshot) -> Self { + Self(AtomicU64::new(snapshot.as_u64())) + } + + /// Return a snapshot of the current heartbeat state. + pub fn snapshot(&self) -> HeartbeatStateSnapshot { + HeartbeatStateSnapshot::from(self.0.load(Ordering::Relaxed)) + } + + /// Update the heartbeat state using the provided closure. + /// Closure receives the current state and should return an [`Option`] containing a new state. + /// If [`None`] is returned then the state was not updated.
+ pub fn update<F: FnMut(HeartbeatStateSnapshot) -> Option<HeartbeatStateSnapshot>>( + &self, + mut f: F, + ) -> Result<HeartbeatStateSnapshot, HeartbeatStateSnapshot> { + // `fetch_update` yields the previous value in both `Ok` and `Err`. + self.0 + .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |prev| { + let snapshot = HeartbeatStateSnapshot::from(prev); + f(snapshot).map(|new_snapshot| new_snapshot.as_u64()) + }) + .map(HeartbeatStateSnapshot::from) + .map_err(HeartbeatStateSnapshot::from) + } +} + +#[cfg(all(test, not(loom)))] +mod tests { + use crate::heartbeat::heartbeat_state::{HeartbeatState, HeartbeatStateSnapshot}; + use core::cmp::min; + use core::sync::atomic::Ordering; + + #[test] + fn snapshot_new_succeeds() { + let state = HeartbeatStateSnapshot::new(); + + assert_eq!(state.as_u64(), 0x00); + assert_eq!(state.heartbeat_timestamp(), 0); + assert_eq!(state.counter(), 0); + assert!(!state.post_init()); + } + + #[test] + fn snapshot_from_u64_zero() { + let state = HeartbeatStateSnapshot::from(0); + + assert_eq!(state.as_u64(), 0x00); + assert_eq!(state.heartbeat_timestamp(), 0); + assert_eq!(state.counter(), 0); + assert!(!state.post_init()); + } + + #[test] + fn snapshot_from_u64_valid() { + let state = HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF); + + assert_eq!(state.as_u64(), 0xDEADBEEF_DEADBEEF); + assert_eq!(state.heartbeat_timestamp(), 0xDEADBEEF_DEADBEEF >> 3); + assert_eq!(state.counter(), 3); + assert!(state.post_init()); + } + + #[test] + fn snapshot_from_u64_max() { + let state = HeartbeatStateSnapshot::from(u64::MAX); + + assert_eq!(state.as_u64(), u64::MAX); + assert_eq!(state.heartbeat_timestamp(), u64::MAX >> 3); + assert_eq!(state.counter(), 3); + assert!(state.post_init()); + } + + #[test] + fn snapshot_default() { + let state = HeartbeatStateSnapshot::default(); + + assert_eq!(state.as_u64(), 0x00); + assert_eq!(state.heartbeat_timestamp(), 0); + assert_eq!(state.counter(), 0); + assert!(!state.post_init()); + } + + #[test] + fn snapshot_set_heartbeat_timestamp_valid() { + let mut state = HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF); +
state.set_heartbeat_timestamp(0x1CAFEBAD_CAFEBAAD); + + assert_eq!(state.heartbeat_timestamp(), 0x1CAFEBAD_CAFEBAAD); + + // Check other parameters unchanged. + assert_eq!(state.counter(), 3); + assert!(state.post_init()); + } + + #[test] + #[should_panic(expected = "provided heartbeat offset is out of range")] + fn snapshot_set_heartbeat_timestamp_out_of_range() { + let mut state = HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF); + state.set_heartbeat_timestamp(0x20000000_00000000); + } + + #[test] + fn snapshot_counter_increment() { + let mut state = HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEE9); + + // Max value is 3, check if saturates. + for i in 1..=4 { + state.increment_counter(); + assert_eq!(state.counter(), min(i, 3)); + } + + // Check other parameters unchanged. + assert_eq!(state.heartbeat_timestamp(), 0xDEADBEEF_DEADBEE9 >> 3); + assert!(state.post_init()); + } + + #[test] + fn snapshot_set_post_init() { + let mut state = HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF); + + state.set_post_init(false); + assert!(!state.post_init()); + state.set_post_init(true); + assert!(state.post_init()); + + // Check other parameters unchanged. + assert_eq!(state.heartbeat_timestamp(), 0xDEADBEEF_DEADBEEF >> 3); + assert_eq!(state.counter(), 3); + } + + #[test] + fn state_new() { + let state = HeartbeatState::new(HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF)); + assert_eq!(state.0.load(Ordering::Relaxed), 0xDEADBEEF_DEADBEEF); + } + + #[test] + fn state_snapshot() { + let state = HeartbeatState::new(HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF)); + assert_eq!(state.snapshot().as_u64(), 0xDEADBEEF_DEADBEEF); + } + + #[test] + fn state_update_some() { + let state = HeartbeatState::new(HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF)); + let _ = state.update(|prev_snapshot| { + // Make sure state is as expected. 
+ assert_eq!(prev_snapshot.as_u64(), 0xDEADBEEF_DEADBEEF); + assert_eq!(prev_snapshot.heartbeat_timestamp(), 0xDEADBEEF_DEADBEEF >> 3); + assert_eq!(prev_snapshot.counter(), 3); + assert!(prev_snapshot.post_init()); + + Some(HeartbeatStateSnapshot::from(0)) + }); + + assert_eq!(state.snapshot().as_u64(), 0); + } + + #[test] + fn state_update_none() { + let state = HeartbeatState::new(HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF)); + let _ = state.update(|_| None); + + assert_eq!(state.snapshot().as_u64(), 0xDEADBEEF_DEADBEEF); + } +} diff --git a/src/health_monitoring_lib/rust/heartbeat/mod.rs b/src/health_monitoring_lib/rust/heartbeat/mod.rs new file mode 100644 index 00000000..4afe2ab5 --- /dev/null +++ b/src/health_monitoring_lib/rust/heartbeat/mod.rs @@ -0,0 +1,18 @@ +// ******************************************************************************* +// Copyright (c) 2026 Contributors to the Eclipse Foundation +// +// See the NOTICE file(s) distributed with this work for additional +// information regarding copyright ownership. 
+// +// This program and the accompanying materials are made available under the +// terms of the Apache License Version 2.0 which is available at +// +// +// SPDX-License-Identifier: Apache-2.0 +// ******************************************************************************* + +mod heartbeat_monitor; +mod heartbeat_state; + +pub(crate) use heartbeat_monitor::HeartbeatEvaluationError; +pub use heartbeat_monitor::{HeartbeatMonitor, HeartbeatMonitorBuilder}; diff --git a/src/health_monitoring_lib/rust/lib.rs b/src/health_monitoring_lib/rust/lib.rs index fcbcb945..24187565 100644 --- a/src/health_monitoring_lib/rust/lib.rs +++ b/src/health_monitoring_lib/rust/lib.rs @@ -20,9 +20,11 @@ mod tag; mod worker; pub mod deadline; +pub mod heartbeat; use crate::common::{Monitor, MonitorEvalHandle}; use crate::deadline::{DeadlineMonitor, DeadlineMonitorBuilder}; +use crate::heartbeat::{HeartbeatMonitor, HeartbeatMonitorBuilder}; use crate::log::{error, ScoreDebug}; pub use common::TimeRange; use containers::fixed_capacity::FixedCapacityVec; @@ -45,6 +47,7 @@ pub enum HealthMonitorError { #[derive(Default)] pub struct HealthMonitorBuilder { deadline_monitor_builders: HashMap, + heartbeat_monitor_builders: HashMap, supervisor_api_cycle: Duration, internal_processing_cycle: Duration, } @@ -54,6 +57,7 @@ impl HealthMonitorBuilder { pub fn new() -> Self { Self { deadline_monitor_builders: HashMap::new(), + heartbeat_monitor_builders: HashMap::new(), supervisor_api_cycle: Duration::from_millis(500), internal_processing_cycle: Duration::from_millis(100), } @@ -72,6 +76,19 @@ impl HealthMonitorBuilder { self } + /// Add a [`HeartbeatMonitor`] for the given [`MonitorTag`]. + /// + /// - `monitor_tag` - unique tag for the [`HeartbeatMonitor`]. + /// - `monitor_builder` - monitor builder to finalize. + /// + /// # Note + /// + /// If a heartbeat monitor with the same tag already exists, it will be overwritten. 
+ pub fn add_heartbeat_monitor(mut self, monitor_tag: MonitorTag, monitor_builder: HeartbeatMonitorBuilder) -> Self { + self.add_heartbeat_monitor_internal(monitor_tag, monitor_builder); + self + } + /// Set the interval between supervisor API notifications. /// This duration determines how often the health monitor notifies the supervisor about system liveness. /// @@ -104,7 +121,7 @@ impl HealthMonitorBuilder { } // Check number of monitors. - let num_monitors = self.deadline_monitor_builders.len(); + let num_monitors = self.deadline_monitor_builders.len() + self.heartbeat_monitor_builders.len(); if num_monitors == 0 { error!("No monitors have been added. HealthMonitor cannot be created."); return Err(HealthMonitorError::WrongState); @@ -120,8 +137,16 @@ impl HealthMonitorBuilder { deadline_monitors.insert(tag, Some(MonitorState::Available(monitor))); } + // Create heartbeat monitors. + let mut heartbeat_monitors = HashMap::new(); + for (tag, builder) in self.heartbeat_monitor_builders { + let monitor = builder.build(tag, self.internal_processing_cycle, &allocator)?; + heartbeat_monitors.insert(tag, Some(MonitorState::Available(monitor))); + } + Ok(HealthMonitor { deadline_monitors, + heartbeat_monitors, worker: worker::UniqueThreadRunner::new(self.internal_processing_cycle), supervisor_api_cycle: self.supervisor_api_cycle, }) @@ -137,6 +162,14 @@ impl HealthMonitorBuilder { self.deadline_monitor_builders.insert(monitor_tag, monitor_builder); } + pub(crate) fn add_heartbeat_monitor_internal( + &mut self, + monitor_tag: MonitorTag, + monitor_builder: HeartbeatMonitorBuilder, + ) { + self.heartbeat_monitor_builders.insert(monitor_tag, monitor_builder); + } + pub(crate) fn with_supervisor_api_cycle_internal(&mut self, cycle_duration: Duration) { self.supervisor_api_cycle = cycle_duration; } @@ -162,6 +195,7 @@ type MonitorContainer = Option>; /// Health monitor. 
pub struct HealthMonitor { deadline_monitors: HashMap>, + heartbeat_monitors: HashMap>, worker: worker::UniqueThreadRunner, supervisor_api_cycle: Duration, } @@ -197,6 +231,16 @@ impl HealthMonitor { Self::get_monitor(&mut self.deadline_monitors, monitor_tag) } + /// Get and pass ownership of a [`HeartbeatMonitor`] for the given [`MonitorTag`]. + /// + /// - `monitor_tag` - unique tag for the [`HeartbeatMonitor`]. + /// + /// Returns [`Some`] containing [`HeartbeatMonitor`] if found and not taken. + /// Otherwise returns [`None`]. + pub fn get_heartbeat_monitor(&mut self, monitor_tag: MonitorTag) -> Option { + Self::get_monitor(&mut self.heartbeat_monitors, monitor_tag) + } + fn collect_given_monitors( monitors_to_collect: &mut HashMap>, collected_monitors: &mut FixedCapacityVec, @@ -243,9 +287,10 @@ impl HealthMonitor { /// Health monitoring logic stops when the [`HealthMonitor`] is dropped. pub fn start(&mut self) -> Result<(), HealthMonitorError> { // Collect all monitors. - let num_monitors = self.deadline_monitors.len(); + let num_monitors = self.deadline_monitors.len() + self.heartbeat_monitors.len(); let mut collected_monitors = FixedCapacityVec::new(num_monitors); Self::collect_given_monitors(&mut self.deadline_monitors, &mut collected_monitors)?; + Self::collect_given_monitors(&mut self.heartbeat_monitors, &mut collected_monitors)?; // Start monitoring logic. 
let monitoring_logic = worker::MonitoringLogic::new( @@ -267,15 +312,23 @@ impl HealthMonitor { #[score_testing_macros::test_mod_with_log] #[cfg(all(test, not(loom)))] mod tests { + use crate::common::TimeRange; use crate::deadline::DeadlineMonitorBuilder; + use crate::heartbeat::HeartbeatMonitorBuilder; use crate::tag::MonitorTag; use crate::{HealthMonitorBuilder, HealthMonitorError}; use core::time::Duration; + fn def_heartbeat_monitor_builder() -> HeartbeatMonitorBuilder { + let range = TimeRange::new(Duration::from_millis(100), Duration::from_millis(200)); + HeartbeatMonitorBuilder::new(range) + } + #[test] fn health_monitor_builder_new_succeeds() { let health_monitor_builder = HealthMonitorBuilder::new(); assert!(health_monitor_builder.deadline_monitor_builders.is_empty()); + assert!(health_monitor_builder.heartbeat_monitor_builders.is_empty()); assert_eq!(health_monitor_builder.supervisor_api_cycle, Duration::from_millis(500)); assert_eq!( health_monitor_builder.internal_processing_cycle, @@ -287,9 +340,12 @@ mod tests { fn health_monitor_builder_build_succeeds() { let deadline_monitor_tag = MonitorTag::from("deadline_monitor"); let deadline_monitor_builder = DeadlineMonitorBuilder::new(); + let heartbeat_monitor_tag = MonitorTag::from("heartbeat_monitor"); + let heartbeat_monitor_builder = def_heartbeat_monitor_builder(); let result = HealthMonitorBuilder::new() .add_deadline_monitor(deadline_monitor_tag, deadline_monitor_builder) + .add_heartbeat_monitor(heartbeat_monitor_tag, heartbeat_monitor_builder) .build(); assert!(result.is_ok()); } @@ -364,17 +420,76 @@ mod tests { assert!(result.is_none()); } + #[test] + fn health_monitor_get_heartbeat_monitor_available() { + let heartbeat_monitor_tag = MonitorTag::from("heartbeat_monitor"); + let heartbeat_monitor_builder = def_heartbeat_monitor_builder(); + let mut health_monitor = HealthMonitorBuilder::new() + .add_heartbeat_monitor(heartbeat_monitor_tag, heartbeat_monitor_builder) + .build() + .unwrap(); + + 
let result = health_monitor.get_heartbeat_monitor(heartbeat_monitor_tag); + assert!(result.is_some()); + } + + #[test] + fn health_monitor_get_heartbeat_monitor_taken() { + let heartbeat_monitor_tag = MonitorTag::from("heartbeat_monitor"); + let heartbeat_monitor_builder = def_heartbeat_monitor_builder(); + let mut health_monitor = HealthMonitorBuilder::new() + .add_heartbeat_monitor(heartbeat_monitor_tag, heartbeat_monitor_builder) + .build() + .unwrap(); + + let _ = health_monitor.get_heartbeat_monitor(heartbeat_monitor_tag); + let result = health_monitor.get_heartbeat_monitor(heartbeat_monitor_tag); + assert!(result.is_none()); + } + + #[test] + fn health_monitor_get_heartbeat_monitor_unknown() { + let heartbeat_monitor_builder = def_heartbeat_monitor_builder(); + let mut health_monitor = HealthMonitorBuilder::new() + .add_heartbeat_monitor(MonitorTag::from("heartbeat_monitor"), heartbeat_monitor_builder) + .build() + .unwrap(); + + let result = health_monitor.get_heartbeat_monitor(MonitorTag::from("undefined_monitor")); + assert!(result.is_none()); + } + + #[test] + fn health_monitor_get_heartbeat_monitor_invalid_state() { + let heartbeat_monitor_tag = MonitorTag::from("heartbeat_monitor"); + let heartbeat_monitor_builder = def_heartbeat_monitor_builder(); + let mut health_monitor = HealthMonitorBuilder::new() + .add_heartbeat_monitor(heartbeat_monitor_tag, heartbeat_monitor_builder) + .build() + .unwrap(); + + // Inject broken state - unreachable otherwise. 
+ health_monitor.heartbeat_monitors.insert(heartbeat_monitor_tag, None); + + let result = health_monitor.get_heartbeat_monitor(heartbeat_monitor_tag); + assert!(result.is_none()); + } + #[test] fn health_monitor_start_succeeds() { let deadline_monitor_tag = MonitorTag::from("deadline_monitor"); let deadline_monitor_builder = DeadlineMonitorBuilder::new(); + let heartbeat_monitor_tag = MonitorTag::from("heartbeat_monitor"); + let heartbeat_monitor_builder = def_heartbeat_monitor_builder(); let mut health_monitor = HealthMonitorBuilder::new() .add_deadline_monitor(deadline_monitor_tag, deadline_monitor_builder) + .add_heartbeat_monitor(heartbeat_monitor_tag, heartbeat_monitor_builder) .build() .unwrap(); let _deadline_monitor = health_monitor.get_deadline_monitor(deadline_monitor_tag).unwrap(); + let _heartbeat_monitor = health_monitor.get_heartbeat_monitor(heartbeat_monitor_tag).unwrap(); let result = health_monitor.start(); assert!(result.is_ok()); @@ -383,9 +498,11 @@ mod tests { #[test] fn health_monitor_start_monitors_not_taken() { let deadline_monitor_builder = DeadlineMonitorBuilder::new(); + let heartbeat_monitor_builder = def_heartbeat_monitor_builder(); let mut health_monitor = HealthMonitorBuilder::new() .add_deadline_monitor(MonitorTag::from("deadline_monitor"), deadline_monitor_builder) + .add_heartbeat_monitor(MonitorTag::from("heartbeat_monitor"), heartbeat_monitor_builder) .build() .unwrap(); @@ -397,9 +514,12 @@ mod tests { fn health_monitor_start_not_taken_then_restart() { let deadline_monitor_tag = MonitorTag::from("deadline_monitor"); let deadline_monitor_builder = DeadlineMonitorBuilder::new(); + let heartbeat_monitor_tag = MonitorTag::from("heartbeat_monitor"); + let heartbeat_monitor_builder = def_heartbeat_monitor_builder(); let mut health_monitor = HealthMonitorBuilder::new() .add_deadline_monitor(deadline_monitor_tag, deadline_monitor_builder) + .add_heartbeat_monitor(heartbeat_monitor_tag, heartbeat_monitor_builder) .build() .unwrap(); @@ 
-407,9 +527,11 @@ mod tests { let start_result = health_monitor.start(); assert!(start_result.is_err_and(|e| e == HealthMonitorError::WrongState)); - // Take monitor. + // Take monitors. let get_deadline_monitor_result = health_monitor.get_deadline_monitor(deadline_monitor_tag); assert!(get_deadline_monitor_result.is_some()); + let get_heartbeat_monitor_result = health_monitor.get_heartbeat_monitor(heartbeat_monitor_tag); + assert!(get_heartbeat_monitor_result.is_some()); // Try to start again, this time should be successful. let start_result = health_monitor.start(); diff --git a/src/health_monitoring_lib/rust/worker.rs b/src/health_monitoring_lib/rust/worker.rs index 508f9732..d5f82c3c 100644 --- a/src/health_monitoring_lib/rust/worker.rs +++ b/src/health_monitoring_lib/rust/worker.rs @@ -45,11 +45,11 @@ impl MonitoringLogic { } } - fn run(&mut self) -> bool { + fn run(&mut self, hmon_starting_point: Instant) -> bool { let mut has_any_error = false; for monitor in self.monitors.iter() { - monitor.evaluate(&mut |monitor_tag, error| { + monitor.evaluate(hmon_starting_point, &mut |monitor_tag, error| { has_any_error = true; match error { @@ -59,7 +59,12 @@ impl MonitoringLogic { monitor_tag, deadline_evaluation_error ) }, - MonitorEvaluationError::Heartbeat => unimplemented!(), + MonitorEvaluationError::Heartbeat(heartbeat_evaluation_error) => { + warn!( + "Heartbeat monitor with tag {:?} reported error: {:?}.", + monitor_tag, heartbeat_evaluation_error + ) + }, MonitorEvaluationError::Logic => unimplemented!(), } }); @@ -105,6 +110,7 @@ impl UniqueThreadRunner { std::thread::spawn(move || { info!("Monitoring thread started."); + let hmon_starting_point = Instant::now(); let mut next_sleep_time = interval; // TODO Add some checks and log if cyclicly here is not met. 
@@ -113,7 +119,7 @@ impl UniqueThreadRunner { let now = Instant::now(); - if !monitoring_logic.run() { + if !monitoring_logic.run(hmon_starting_point) { info!("Monitoring logic failed, stopping thread."); break; } @@ -168,6 +174,7 @@ mod tests { use core::sync::atomic::{AtomicUsize, Ordering}; use core::time::Duration; use std::sync::Arc; + use std::time::Instant; #[derive(Clone)] struct MockSupervisorAPIClient { @@ -211,6 +218,7 @@ mod tests { fn monitoring_logic_report_error_when_deadline_failed() { let deadline_monitor = create_monitor_with_deadlines(); let alive_mock = MockSupervisorAPIClient::new(); + let hmon_starting_point = Instant::now(); let mut logic = MonitoringLogic::new( { @@ -229,7 +237,7 @@ mod tests { drop(handle); - assert!(!logic.run()); + assert!(!logic.run(hmon_starting_point)); assert_eq!(alive_mock.get_notify_count(), 0); } @@ -237,6 +245,7 @@ mod tests { fn monitoring_logic_report_alive_on_each_call_when_no_error() { let deadline_monitor = create_monitor_with_deadlines(); let alive_mock = MockSupervisorAPIClient::new(); + let hmon_starting_point = Instant::now(); let mut logic = MonitoringLogic::new( { @@ -253,11 +262,11 @@ mod tests { .unwrap(); let _handle = deadline.start().unwrap(); - assert!(logic.run()); - assert!(logic.run()); - assert!(logic.run()); - assert!(logic.run()); - assert!(logic.run()); + assert!(logic.run(hmon_starting_point)); + assert!(logic.run(hmon_starting_point)); + assert!(logic.run(hmon_starting_point)); + assert!(logic.run(hmon_starting_point)); + assert!(logic.run(hmon_starting_point)); assert_eq!(alive_mock.get_notify_count(), 5); } @@ -266,6 +275,7 @@ mod tests { fn monitoring_logic_report_alive_respect_cycle() { let deadline_monitor = create_monitor_with_deadlines(); let alive_mock = MockSupervisorAPIClient::new(); + let hmon_starting_point = Instant::now(); let mut logic = MonitoringLogic::new( { @@ -283,19 +293,19 @@ mod tests { let _handle = deadline.start().unwrap(); 
std::thread::sleep(Duration::from_millis(30)); - assert!(logic.run()); + assert!(logic.run(hmon_starting_point)); std::thread::sleep(Duration::from_millis(30)); - assert!(logic.run()); + assert!(logic.run(hmon_starting_point)); std::thread::sleep(Duration::from_millis(30)); - assert!(logic.run()); + assert!(logic.run(hmon_starting_point)); std::thread::sleep(Duration::from_millis(30)); - assert!(logic.run()); + assert!(logic.run(hmon_starting_point)); std::thread::sleep(Duration::from_millis(30)); - assert!(logic.run()); + assert!(logic.run(hmon_starting_point)); assert_eq!(alive_mock.get_notify_count(), 5); }