diff --git a/Cargo.lock b/Cargo.lock index d2d342a1..aae4276a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + [[package]] name = "anstream" version = "0.6.21" @@ -52,6 +61,22 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "cc" +version = "1.2.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + [[package]] name = "clap" version = "4.5.52" @@ -103,11 +128,33 @@ name = "containers" version = "0.1.0" source = "git+https://github.com/eclipse-score/baselibs_rust.git?tag=v0.0.4#d36362e03664f65117145d6fc90e38505d54a900" +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "generator" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f04ae4152da20c76fe800fa48659201d5cf627c5149ca0b707b69d7eef6cf9" +dependencies = [ + "cc", + "cfg-if", + "libc", + "log", + "rustversion", + "windows-link", + "windows-result", +] + [[package]] name = "health_monitoring_lib" version = "0.0.1" dependencies = [ "containers", + "loom", "monitor_rs", "score_log", "score_testing_macros", @@ -126,6 +173,18 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itoa" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "libc" version = "0.2.177" @@ -139,6 +198,42 @@ dependencies = [ "libc", ] +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "loom" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca" +dependencies = [ + "cfg-if", + "generator", + "scoped-tls", + "serde", + "serde_json", + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + [[package]] name = "monitor_rs" version = "0.0.1" @@ -146,12 +241,33 @@ dependencies = [ "libc", ] +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + [[package]] name = "once_cell_polyfill" version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + [[package]] name = "proc-macro2" version = "1.0.103" @@ -170,6 +286,23 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + [[package]] name = "rust_supervised_app" version = "0.0.1" @@ -184,6 +317,18 @@ dependencies = [ "stdout_logger", ] +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "scoped-tls" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" + [[package]] name = "score_log" version = "0.0.1" @@ -219,6 +364,64 @@ dependencies = [ "syn", ] +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "signal-hook" version = "0.3.18" @@ -238,6 +441,12 @@ dependencies = [ "libc", ] +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + [[package]] name = "stdout_logger" version = "0.0.1" @@ -263,6 +472,64 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" 
+dependencies = [ + "pin-project-lite", + "tracing-core", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + [[package]] name = "unicode-ident" version = "1.0.22" @@ -275,12 +542,27 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-sys" version = "0.61.2" @@ -289,3 +571,9 @@ checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" 
dependencies = [ "windows-link", ] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml index 4c19b6ec..e9e7c923 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ members = [ "src/health_monitoring_lib", "examples/rust_supervised_app", ] +default-members = ["src/health_monitoring_lib"] [workspace.package] edition = "2021" @@ -28,3 +29,6 @@ containers = { git = "https://github.com/eclipse-score/baselibs_rust.git", tag = [workspace.lints.clippy] std_instead_of_core = "warn" alloc_instead_of_core = "warn" + +[workspace.lints.rust] +unexpected_cfgs = { level = "warn", check-cfg = ['cfg(loom)'] } diff --git a/src/health_monitoring_lib/BUILD b/src/health_monitoring_lib/BUILD index c473903d..69533160 100644 --- a/src/health_monitoring_lib/BUILD +++ b/src/health_monitoring_lib/BUILD @@ -42,6 +42,7 @@ CC_HDRS = [ rust_library( name = "health_monitoring_lib", srcs = glob(["rust/**/*.rs"]), + crate_features = ["score_supervisor_api_client"], crate_root = "rust/lib.rs", proc_macro_deps = PROC_MACRO_DEPS, visibility = ["//visibility:public"], @@ -65,6 +66,7 @@ cc_library( rust_static_library( name = "health_monitoring_lib_ffi", srcs = glob(["rust/**/*.rs"]), + crate_features = ["score_supervisor_api_client"], crate_name = "health_monitoring_lib", crate_root = "rust/lib.rs", proc_macro_deps = [ @@ -100,6 +102,7 @@ cc_library( rust_test( name = "tests", crate = ":health_monitoring_lib", + crate_features = ["stub_supervisor_api_client"], rustc_flags = [ "-C", "link-arg=-lm", diff --git a/src/health_monitoring_lib/Cargo.toml b/src/health_monitoring_lib/Cargo.toml index 71b38d46..1531b353 100644 --- a/src/health_monitoring_lib/Cargo.toml +++ b/src/health_monitoring_lib/Cargo.toml @@ -7,7 +7,6 @@ edition.workspace = true authors.workspace = true license-file.workspace = true - [lib] path = 
"rust/lib.rs" @@ -18,11 +17,15 @@ workspace = true score_log.workspace = true score_testing_macros.workspace = true containers.workspace = true -monitor_rs.workspace = true +monitor_rs = { workspace = true, optional = true } [dev-dependencies] stdout_logger.workspace = true +[target.'cfg(loom)'.dependencies] +loom = { version = "0.7", features = ["checkpoint"] } + [features] -default = [] +default = ["stub_supervisor_api_client"] stub_supervisor_api_client = [] +score_supervisor_api_client = ["monitor_rs"] diff --git a/src/health_monitoring_lib/rust/common.rs b/src/health_monitoring_lib/rust/common.rs index 35a8ffa5..d86003cf 100644 --- a/src/health_monitoring_lib/rust/common.rs +++ b/src/health_monitoring_lib/rust/common.rs @@ -11,10 +11,14 @@ // SPDX-License-Identifier: Apache-2.0 // ******************************************************************************* +use crate::deadline::DeadlineEvaluationError; +use crate::heartbeat::HeartbeatEvaluationError; +use crate::log::ScoreDebug; use crate::tag::MonitorTag; use core::hash::Hash; use core::time::Duration; use std::sync::Arc; +use std::time::Instant; /// Range of accepted time. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] @@ -30,16 +34,45 @@ impl TimeRange { } } +/// The monitor has an evaluation handle available. +pub(crate) trait HasEvalHandle { + /// Get an evaluation handle for this monitor. + /// + /// # NOTE + /// + /// This method is intended to be called from a background thread periodically. + fn get_eval_handle(&self) -> MonitorEvalHandle; +} + /// Errors that can occur during monitor evaluation. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, crate::log::ScoreDebug)] +/// Contains failing monitor type. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, ScoreDebug)] +#[allow(dead_code)] pub(crate) enum MonitorEvaluationError { - TooEarly, - TooLate, + Deadline(DeadlineEvaluationError), + Heartbeat(HeartbeatEvaluationError), + Logic, +} + +impl From for MonitorEvaluationError { + fn from(value: DeadlineEvaluationError) -> Self { + MonitorEvaluationError::Deadline(value) + } +} + +impl From for MonitorEvaluationError { + fn from(value: HeartbeatEvaluationError) -> Self { + MonitorEvaluationError::Heartbeat(value) + } } /// Trait for evaluating monitors and reporting errors to be used by HealthMonitor. pub(crate) trait MonitorEvaluator { - fn evaluate(&self, on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError)); + /// Run monitor evaluation. + /// + /// - `hmon_starting_point` - starting point of all monitors. + /// - `on_error` - error handling, containing tag of failing object and error code. + fn evaluate(&self, hmon_starting_point: Instant, on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError)); } /// Handle to a monitor evaluator, allowing for dynamic dispatch. @@ -54,7 +87,68 @@ impl MonitorEvalHandle { } impl MonitorEvaluator for MonitorEvalHandle { - fn evaluate(&self, on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError)) { - self.inner.evaluate(on_error) + fn evaluate(&self, hmon_starting_point: Instant, on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError)) { + self.inner.evaluate(hmon_starting_point, on_error) + } +} + +/// Get offset between HMON and monitor starting time points as [`u32`]. +pub(crate) fn hmon_time_offset(hmon_starting_point: Instant, monitor_starting_point: Instant) -> u32 { + let result = hmon_starting_point.checked_duration_since(monitor_starting_point); + let duration_since = result.expect("HMON starting point is earlier than monitor starting point"); + duration_to_u32(duration_since) +} + +/// Get duration as [`u32`]. 
+pub(crate) fn duration_to_u32(duration: Duration) -> u32 { + let millis = duration.as_millis(); + u32::try_from(millis).expect("Monitor running for too long") +} + +#[cfg(test)] +mod tests { + use crate::common::{duration_to_u32, hmon_time_offset}; + use core::time::Duration; + use std::time::Instant; + + #[test] + fn hmon_time_offset_valid() { + let monitor_starting_point = Instant::now(); + let hmon_starting_point = Instant::now(); + let offset = hmon_time_offset(hmon_starting_point, monitor_starting_point); + // Allow small offset. + assert!(offset < 10); + } + + #[test] + #[should_panic(expected = "HMON starting point is earlier than monitor starting point")] + fn hmon_time_offset_wrong_order() { + let hmon_starting_point = Instant::now(); + let monitor_starting_point = Instant::now(); + let _offset = hmon_time_offset(hmon_starting_point, monitor_starting_point); + } + + #[test] + #[should_panic(expected = "Monitor running for too long")] + fn hmon_time_offset_diff_too_large() { + const HUNDRED_DAYS_AS_SECS: u64 = 100 * 24 * 60 * 60; + let monitor_starting_point = Instant::now(); + let hmon_starting_point = Instant::now() + .checked_add(Duration::from_secs(HUNDRED_DAYS_AS_SECS)) + .unwrap(); + let _offset = hmon_time_offset(hmon_starting_point, monitor_starting_point); + } + + #[test] + fn duration_to_u32_valid() { + let result = duration_to_u32(Duration::from_millis(1234)); + assert_eq!(result, 1234); + } + + #[test] + #[should_panic(expected = "Monitor running for too long")] + fn duration_to_u32_too_large() { + const HUNDRED_DAYS_AS_SECS: u64 = 100 * 24 * 60 * 60; + let _result = duration_to_u32(Duration::from_secs(HUNDRED_DAYS_AS_SECS)); } } diff --git a/src/health_monitoring_lib/rust/deadline/deadline_monitor.rs b/src/health_monitoring_lib/rust/deadline/deadline_monitor.rs index 995b851d..4a690889 100644 --- a/src/health_monitoring_lib/rust/deadline/deadline_monitor.rs +++ b/src/health_monitoring_lib/rust/deadline/deadline_monitor.rs @@ -10,20 +10,27 @@ 
// // SPDX-License-Identifier: Apache-2.0 // ******************************************************************************* -use super::common::DeadlineTemplate; -use crate::common::{MonitorEvalHandle, MonitorEvaluationError, MonitorEvaluator, TimeRange}; -use crate::tag::{DeadlineTag, MonitorTag}; -use crate::{ - deadline::{ - common::StateIndex, - deadline_state::{DeadlineState, DeadlineStateSnapshot}, - }, - protected_memory::ProtectedMemoryAllocator, +use crate::common::{ + duration_to_u32, HasEvalHandle, MonitorEvalHandle, MonitorEvaluationError, MonitorEvaluator, TimeRange, }; +use crate::deadline::common::{DeadlineTemplate, StateIndex}; +use crate::deadline::deadline_state::{DeadlineState, DeadlineStateSnapshot}; +use crate::log::{error, warn, ScoreDebug}; +use crate::protected_memory::ProtectedMemoryAllocator; +use crate::tag::{DeadlineTag, MonitorTag}; use core::hash::Hash; -use std::{collections::HashMap, sync::Arc, time::Instant}; - -use crate::log::*; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Instant; + +/// Deadline evaluation errors. +#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash, ScoreDebug)] +pub(crate) enum DeadlineEvaluationError { + /// Finished too early. + TooEarly, + /// Finished too late. + TooLate, +} /// /// Errors that can occur when working with DeadlineMonitor @@ -65,7 +72,8 @@ impl DeadlineMonitorBuilder { /// Builds the DeadlineMonitor with the configured deadlines. 
pub(crate) fn build(self, monitor_tag: MonitorTag, _allocator: &ProtectedMemoryAllocator) -> DeadlineMonitor { - DeadlineMonitor::new(monitor_tag, self.deadlines) + let inner = Arc::new(DeadlineMonitorInner::new(monitor_tag, self.deadlines)); + DeadlineMonitor::new(inner) } // Used by FFI and config parsing code which prefer not to move builder instance @@ -80,27 +88,9 @@ pub struct DeadlineMonitor { } impl DeadlineMonitor { - fn new(monitor_tag: MonitorTag, deadlines: HashMap) -> Self { - let mut active_deadlines = vec![]; - - let deadlines = deadlines - .into_iter() - .enumerate() - .map(|(index, (deadline_tag, range))| { - active_deadlines.push((deadline_tag, DeadlineState::new())); - (deadline_tag, DeadlineTemplate::new(range, StateIndex::new(index))) - }) - .collect(); - - Self { - #[allow(clippy::arc_with_non_send_sync)] // This will be fixed once we add background thread - inner: Arc::new(DeadlineMonitorInner { - monitor_tag, - deadlines, - active_deadlines: active_deadlines.into(), - monitor_starting_point: Instant::now(), - }), - } + /// Create a new [`DeadlineMonitor`] instance. + fn new(inner: Arc) -> Self { + Self { inner } } /// Acquires a deadline instance for the given tag. 
@@ -109,26 +99,12 @@ impl DeadlineMonitor { /// - Err(DeadlineMonitorError::DeadlineInUse) - if the deadline is already in use /// - Err(DeadlineMonitorError::DeadlineNotFound) - if the deadline tag is not registered pub fn get_deadline(&self, deadline_tag: DeadlineTag) -> Result { - if let Some(template) = self.inner.deadlines.get(&deadline_tag) { - match template.acquire_deadline() { - Some(range) => Ok(Deadline { - range, - deadline_tag, - monitor: Arc::clone(&self.inner), - state_index: template.assigned_state_index, - }), - None => Err(DeadlineMonitorError::DeadlineInUse), - } - } else { - Err(DeadlineMonitorError::DeadlineNotFound) - } + self.inner.get_deadline(deadline_tag) } +} - /// Handle for evaluation of all active deadlines and reporting any missed deadlines or underruns. - /// - /// # NOTE - /// This function is intended to be called from a background thread periodically. - pub(crate) fn get_eval_handle(&self) -> MonitorEvalHandle { +impl HasEvalHandle for DeadlineMonitor { + fn get_eval_handle(&self) -> MonitorEvalHandle { MonitorEvalHandle::new(Arc::clone(&self.inner)) } } @@ -178,7 +154,7 @@ impl Deadline { /// Caller must ensure that deadline is not used until it's stopped. /// After this call You shall assure there's only a single owner of the `Deadline` instance and it does not call start before stopping. 
pub(super) unsafe fn start_internal(&mut self) -> Result<(), DeadlineError> { - let now = self.monitor.now(); + let now = duration_to_u32(self.monitor.monitor_starting_point.elapsed()); let max_time = now + self.range.max.as_millis() as u32; let mut is_broken = false; @@ -203,7 +179,7 @@ impl Deadline { } pub(super) fn stop_internal(&mut self) { - let now = self.monitor.now(); + let now = duration_to_u32(self.monitor.monitor_starting_point.elapsed()); let max = self.range.max.as_millis() as u32; let min = self.range.min.as_millis() as u32; @@ -220,7 +196,7 @@ impl Deadline { let expected = current.timestamp_ms(); if expected < now { - possible_err = (Some(MonitorEvaluationError::TooLate), now - expected); + possible_err = (Some(DeadlineEvaluationError::TooLate), now - expected); return None; // Deadline missed, let state as is for BG thread to report } @@ -231,7 +207,7 @@ impl Deadline { // Finished too early, leave it for reporting by BG thread current.set_underrun(); - possible_err = (Some(MonitorEvaluationError::TooEarly), earliest_time - now); + possible_err = (Some(DeadlineEvaluationError::TooEarly), earliest_time - now); return Some(current); } @@ -239,10 +215,10 @@ impl Deadline { }); match possible_err { - (Some(MonitorEvaluationError::TooEarly), val) => { + (Some(DeadlineEvaluationError::TooEarly), val) => { error!("Deadline {:?} stopped too early by {} ms", self.deadline_tag, val); }, - (Some(MonitorEvaluationError::TooLate), val) => { + (Some(DeadlineEvaluationError::TooLate), val) => { error!("Deadline {:?} stopped too late by {} ms", self.deadline_tag, val); }, (None, _) => {}, @@ -285,28 +261,7 @@ struct DeadlineMonitorInner { } impl MonitorEvaluator for DeadlineMonitorInner { - fn evaluate(&self, on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError)) { - self.evaluate(on_error); - } -} - -impl DeadlineMonitorInner { - fn release_deadline(&self, deadline_tag: DeadlineTag) { - if let Some(template) = self.deadlines.get(&deadline_tag) { - 
template.release_deadline(); - } else { - unreachable!("Releasing unknown deadline tag: {:?}", deadline_tag); - } - } - - fn now(&self) -> u32 { - let duration = self.monitor_starting_point.elapsed(); - // As u32 can hold up to ~49 days in milliseconds, this should be sufficient for our use case - // We still have a room up to 60bits timestamp if needed in future - u32::try_from(duration.as_millis()).expect("Monitor running for too long") - } - - fn evaluate(&self, mut on_failed: impl FnMut(&MonitorTag, MonitorEvaluationError)) { + fn evaluate(&self, _hmon_starting_point: Instant, on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError)) { for (deadline_tag, deadline) in self.active_deadlines.iter() { let snapshot = deadline.snapshot(); if snapshot.is_underrun() { @@ -314,14 +269,14 @@ impl DeadlineMonitorInner { warn!("Deadline ({:?}) finished too early!", deadline_tag); // Here we would normally report the underrun to the monitoring system - on_failed(&self.monitor_tag, MonitorEvaluationError::TooEarly); + on_error(&self.monitor_tag, DeadlineEvaluationError::TooEarly.into()); } else if snapshot.is_running() { debug_assert!( snapshot.is_stopped(), "Deadline snapshot cannot be both running and stopped" ); - let now = self.now(); + let now = duration_to_u32(self.monitor_starting_point.elapsed()); let expected = snapshot.timestamp_ms(); if now > expected { // Deadline missed, report @@ -331,13 +286,60 @@ impl DeadlineMonitorInner { ); // Here we would normally report the missed deadline to the monitoring system - on_failed(&self.monitor_tag, MonitorEvaluationError::TooLate); + on_error(&self.monitor_tag, DeadlineEvaluationError::TooLate.into()); } } } } } +impl DeadlineMonitorInner { + fn new(monitor_tag: MonitorTag, deadlines: HashMap) -> Self { + let mut active_deadlines = vec![]; + + let deadlines = deadlines + .into_iter() + .enumerate() + .map(|(index, (deadline_tag, range))| { + active_deadlines.push((deadline_tag, DeadlineState::new())); + (deadline_tag, 
DeadlineTemplate::new(range, StateIndex::new(index))) + }) + .collect(); + + #[allow(clippy::arc_with_non_send_sync)] // This will be fixed once we add background thread + Self { + monitor_tag, + deadlines, + active_deadlines: active_deadlines.into(), + monitor_starting_point: Instant::now(), + } + } + + fn release_deadline(&self, deadline_tag: DeadlineTag) { + if let Some(template) = self.deadlines.get(&deadline_tag) { + template.release_deadline(); + } else { + unreachable!("Releasing unknown deadline tag: {:?}", deadline_tag); + } + } + + pub(crate) fn get_deadline(self: &Arc, deadline_tag: DeadlineTag) -> Result { + if let Some(template) = self.deadlines.get(&deadline_tag) { + match template.acquire_deadline() { + Some(range) => Ok(Deadline { + range, + deadline_tag, + monitor: self.clone(), + state_index: template.assigned_state_index, + }), + None => Err(DeadlineMonitorError::DeadlineInUse), + } + } else { + Err(DeadlineMonitorError::DeadlineNotFound) + } + } +} + #[score_testing_macros::test_mod_with_log] #[cfg(test)] mod tests { @@ -403,6 +405,7 @@ mod tests { #[test] fn start_stop_deadline_within_range_works() { let monitor = create_monitor_with_deadlines(); + let hmon_starting_point = Instant::now(); let mut deadline = monitor.get_deadline(DeadlineTag::from("deadline_long")).unwrap(); let handle = deadline.start().unwrap(); @@ -410,49 +413,57 @@ mod tests { drop(handle); // stop the deadline - monitor.inner.evaluate(|monitor_tag, deadline_failure| { - panic!( - "Deadline {:?} should not have failed or underrun({:?})", - monitor_tag, deadline_failure - ); - }); + monitor + .inner + .evaluate(hmon_starting_point, &mut |monitor_tag, deadline_failure| { + panic!( + "Deadline {:?} should not have failed or underrun({:?})", + monitor_tag, deadline_failure + ); + }); } #[test] fn start_stop_deadline_outside_ranges_is_error_when_dropped_before_evaluate() { let monitor = create_monitor_with_deadlines(); + let hmon_starting_point = Instant::now(); let mut deadline 
= monitor.get_deadline(DeadlineTag::from("deadline_long")).unwrap(); let handle = deadline.start().unwrap(); drop(handle); // stop the deadline - monitor.inner.evaluate(|monitor_tag, deadline_failure| { - assert_eq!( - deadline_failure, - MonitorEvaluationError::TooEarly, - "Deadline {:?} should not have failed({:?})", - monitor_tag, - deadline_failure - ); - }); + monitor + .inner + .evaluate(hmon_starting_point, &mut |monitor_tag, deadline_failure| { + assert_eq!( + deadline_failure, + DeadlineEvaluationError::TooEarly.into(), + "Deadline {:?} should not have failed({:?})", + monitor_tag, + deadline_failure + ); + }); } #[test] fn deadline_outside_time_range_is_error_when_dropped_after_evaluate() { let monitor = create_monitor_with_deadlines(); + let hmon_starting_point = Instant::now(); let mut deadline = monitor.get_deadline(DeadlineTag::from("deadline_long")).unwrap(); let handle = deadline.start().unwrap(); // So deadline stop happens after evaluate, still it should be reported as failed - monitor.inner.evaluate(|monitor_tag, deadline_failure| { - assert_eq!( - deadline_failure, - MonitorEvaluationError::TooEarly, - "Deadline {:?} should not have failed({:?})", - monitor_tag, - deadline_failure - ); - }); + monitor + .inner + .evaluate(hmon_starting_point, &mut |monitor_tag, deadline_failure| { + assert_eq!( + deadline_failure, + DeadlineEvaluationError::TooEarly.into(), + "Deadline {:?} should not have failed({:?})", + monitor_tag, + deadline_failure + ); + }); drop(handle); // stop the deadline } @@ -460,6 +471,7 @@ mod tests { #[test] fn deadline_failed_on_first_run_and_then_restarted_is_evaluated_as_error() { let monitor = create_monitor_with_deadlines(); + let hmon_starting_point = Instant::now(); let mut deadline = monitor.get_deadline(DeadlineTag::from("deadline_long")).unwrap(); let handle = deadline.start().unwrap(); @@ -470,39 +482,45 @@ mod tests { let handle = deadline.start(); assert_eq!(handle.err(), Some(DeadlineError::DeadlineAlreadyFailed)); 
- monitor.inner.evaluate(|monitor_tag, deadline_failure| { - assert_eq!( - deadline_failure, - MonitorEvaluationError::TooEarly, - "Deadline {:?} should not have failed ({:?})", - monitor_tag, - deadline_failure - ); - }); + monitor + .inner + .evaluate(hmon_starting_point, &mut |monitor_tag, deadline_failure| { + assert_eq!( + deadline_failure, + DeadlineEvaluationError::TooEarly.into(), + "Deadline {:?} should not have failed ({:?})", + monitor_tag, + deadline_failure + ); + }); } #[test] fn start_stop_deadline_outside_ranges_is_evaluated_as_error() { let monitor = create_monitor_with_deadlines(); + let hmon_starting_point = Instant::now(); let mut deadline = monitor.get_deadline(DeadlineTag::from("deadline_fast")).unwrap(); let handle = deadline.start().unwrap(); drop(handle); // stop the deadline - monitor.inner.evaluate(|monitor_tag, deadline_failure| { - assert_eq!( - deadline_failure, - MonitorEvaluationError::TooLate, - "Deadline {:?} should not have failed({:?})", - monitor_tag, - deadline_failure - ); - }); + monitor + .inner + .evaluate(hmon_starting_point, &mut |monitor_tag, deadline_failure| { + assert_eq!( + deadline_failure, + DeadlineEvaluationError::TooLate.into(), + "Deadline {:?} should not have failed({:?})", + monitor_tag, + deadline_failure + ); + }); } #[test] fn monitor_with_multiple_running_deadlines() { let monitor = create_monitor_with_multiple_running_deadlines(); + let hmon_starting_point = Instant::now(); let mut deadline = monitor.get_deadline(DeadlineTag::from("deadline_fast1")).unwrap(); let _handle1 = deadline.start().unwrap(); @@ -517,16 +535,18 @@ mod tests { let mut cnt = 0; - monitor.inner.evaluate(|monitor_tag, deadline_failure| { - cnt += 1; - assert_eq!( - deadline_failure, - MonitorEvaluationError::TooLate, - "Deadline {:?} should not have failed({:?})", - monitor_tag, - deadline_failure - ); - }); + monitor + .inner + .evaluate(hmon_starting_point, &mut |monitor_tag, deadline_failure| { + cnt += 1; + assert_eq!( + 
deadline_failure, + DeadlineEvaluationError::TooLate.into(), + "Deadline {:?} should not have failed({:?})", + monitor_tag, + deadline_failure + ); + }); assert_eq!(cnt, 3, "All three deadlines should have been evaluated"); } diff --git a/src/health_monitoring_lib/rust/deadline/mod.rs b/src/health_monitoring_lib/rust/deadline/mod.rs index d903c412..7444ce3b 100644 --- a/src/health_monitoring_lib/rust/deadline/mod.rs +++ b/src/health_monitoring_lib/rust/deadline/mod.rs @@ -15,6 +15,7 @@ mod common; mod deadline_monitor; mod deadline_state; +pub(crate) use deadline_monitor::DeadlineEvaluationError; pub use deadline_monitor::{ DeadlineError, DeadlineHandle, DeadlineMonitor, DeadlineMonitorBuilder, DeadlineMonitorError, }; diff --git a/src/health_monitoring_lib/rust/heartbeat/heartbeat_monitor.rs b/src/health_monitoring_lib/rust/heartbeat/heartbeat_monitor.rs new file mode 100644 index 00000000..9eb3460d --- /dev/null +++ b/src/health_monitoring_lib/rust/heartbeat/heartbeat_monitor.rs @@ -0,0 +1,675 @@ +// ******************************************************************************* +// Copyright (c) 2026 Contributors to the Eclipse Foundation +// +// See the NOTICE file(s) distributed with this work for additional +// information regarding copyright ownership. 
+// +// This program and the accompanying materials are made available under the +// terms of the Apache License Version 2.0 which is available at +// +// +// SPDX-License-Identifier: Apache-2.0 +// ******************************************************************************* + +use crate::common::{ + duration_to_u32, hmon_time_offset, HasEvalHandle, MonitorEvalHandle, MonitorEvaluationError, MonitorEvaluator, + TimeRange, +}; +use crate::heartbeat::heartbeat_state::{HeartbeatState, HeartbeatStateSnapshot}; +use crate::log::warn; +use crate::protected_memory::ProtectedMemoryAllocator; +use crate::tag::MonitorTag; +use core::time::Duration; +use score_log::ScoreDebug; +use std::sync::Arc; +use std::time::Instant; + +/// Heartbeat evaluation errors. +#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash, ScoreDebug)] +pub(crate) enum HeartbeatEvaluationError { + /// Finished too early. + TooEarly, + /// Finished too late. + TooLate, + /// Multiple heartbeats observed. + MultipleHeartbeats, +} + +/// Builder for [`HeartbeatMonitor`]. +#[derive(Debug)] +pub struct HeartbeatMonitorBuilder { + /// Time range between heartbeats. + range: TimeRange, +} + +impl HeartbeatMonitorBuilder { + /// Create a new [`HeartbeatMonitorBuilder`]. + /// + /// - `range` - time range between heartbeats. + pub fn new(range: TimeRange) -> Self { + Self { range } + } + + /// Build the [`HeartbeatMonitor`]. + /// + /// - `monitor_tag` - tag of this monitor. + /// - `internal_processing_cycle` - health monitor processing cycle. + /// - `_allocator` - protected memory allocator. + /// + /// # Panics + /// + /// Internal processing cycle must be shorter than doubled minimum time range. 
+ pub(crate) fn build( + self, + monitor_tag: MonitorTag, + internal_processing_cycle: Duration, + _allocator: &ProtectedMemoryAllocator, + ) -> HeartbeatMonitor { + assert!(self.range.min * 2 > internal_processing_cycle); + let inner = Arc::new(HeartbeatMonitorInner::new(monitor_tag, self.range)); + HeartbeatMonitor::new(inner) + } +} + +/// Heartbeat monitor. +pub struct HeartbeatMonitor { + inner: Arc, +} + +impl HeartbeatMonitor { + /// Create a new [`HeartbeatMonitor`] instance. + pub(crate) fn new(inner: Arc) -> Self { + Self { inner } + } + + /// Provide a heartbeat. + pub fn heartbeat(&self) { + self.inner.heartbeat() + } +} + +impl HasEvalHandle for HeartbeatMonitor { + fn get_eval_handle(&self) -> crate::common::MonitorEvalHandle { + MonitorEvalHandle::new(Arc::clone(&self.inner)) + } +} + +/// Time range using [`u32`]. +#[derive(ScoreDebug)] +struct InternalRange { + min: u32, + max: u32, +} + +impl InternalRange { + /// Create range using provided values. + fn new(min: u32, max: u32) -> Self { + assert!(min <= max, "provided min is greater than provided max"); + Self { min, max } + } + + /// Create range with values offset by timestamp. + fn offset(&self, timestamp: u32) -> Self { + Self::new(self.min + timestamp, self.max + timestamp) + } +} + +impl From for InternalRange { + fn from(value: TimeRange) -> Self { + let min = duration_to_u32(value.min); + let max = duration_to_u32(value.max); + Self::new(min, max) + } +} + +pub(crate) struct HeartbeatMonitorInner { + /// Tag of this monitor. + monitor_tag: MonitorTag, + + /// Time range between heartbeats. + range: InternalRange, + + /// Monitor starting point. + /// Offset is calculated during evaluation in relation to provided health monitor starting point. + monitor_starting_point: Instant, + + /// Current heartbeat state. + /// Contains data in relation to [`Self::monitor_starting_point`]. 
+ heartbeat_state: HeartbeatState, +} + +impl MonitorEvaluator for HeartbeatMonitorInner { + fn evaluate(&self, hmon_starting_point: Instant, on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError)) { + // Get current timestamp, with offset to HMON time. + let offset = hmon_time_offset(hmon_starting_point, self.monitor_starting_point); + let now = offset + duration_to_u32(hmon_starting_point.elapsed()); + + // Load current monitor state. + let snapshot = self.heartbeat_state.snapshot(); + + // Get and recalculate snapshot timestamps. + // IMPORTANT: first heartbeat is obtained when HMON time is unknown. + // It is necessary to: + // - use offset as cycle starting point. + // - get heartbeat snapshot in relation to zero point. + let (start_timestamp, heartbeat_timestamp) = if snapshot.post_init() { + let start_timestamp = snapshot.start_timestamp(); + let heartbeat_timestamp = start_timestamp + snapshot.heartbeat_timestamp_offset(); + (start_timestamp, heartbeat_timestamp) + } else { + let start_timestamp = offset; + let heartbeat_timestamp = snapshot.heartbeat_timestamp_offset(); + (start_timestamp, heartbeat_timestamp) + }; + + // Get allowed time range as absolute values. + let range = self.range.offset(start_timestamp); + + // Check current counter state. + let counter = snapshot.counter(); + // Disallow multiple heartbeats in same heartbeat cycle. + if counter > 1 { + warn!("Multiple heartbeats detected"); + on_error(&self.monitor_tag, HeartbeatEvaluationError::MultipleHeartbeats.into()); + return; + } + // Handle no heartbeats. + else if counter == 0 { + // Disallow no heartbeats when already out of time range. + // Stop execution if still in range. + if now > range.max { + let offset = now - range.max; + warn!("No heartbeat detected, observed after range: {}", offset); + on_error(&self.monitor_tag, HeartbeatEvaluationError::TooLate.into()); + } + // Either way - execution is stopped here. + return; + } + + // Check current heartbeat state. 
+ // Heartbeat before allowed range. + if heartbeat_timestamp < range.min { + let offset = range.min - heartbeat_timestamp; + warn!("Heartbeat occurred too early, offset to range: {}", offset); + on_error(&self.monitor_tag, HeartbeatEvaluationError::TooEarly.into()); + } + // Heartbeat after allowed range. + else if heartbeat_timestamp > range.max { + let offset = heartbeat_timestamp - range.max; + warn!("Heartbeat occurred too late, offset to range: {}", offset); + on_error(&self.monitor_tag, HeartbeatEvaluationError::TooLate.into()); + } + // Heartbeat in allowed range. + else { + // Update heartbeat monitor state with a current heartbeat as a beginning of a new cycle. + let _ = self + .heartbeat_state + .update(|_| Some(HeartbeatStateSnapshot::new(heartbeat_timestamp))); + } + } +} + +impl HeartbeatMonitorInner { + fn new(monitor_tag: MonitorTag, range: TimeRange) -> Self { + let monitor_starting_point = Instant::now(); + let heartbeat_state_snapshot = HeartbeatStateSnapshot::default(); + let heartbeat_state = HeartbeatState::new(heartbeat_state_snapshot); + Self { + monitor_tag, + range: InternalRange::from(range), + monitor_starting_point, + heartbeat_state, + } + } + + /// Provide a heartbeat. + fn heartbeat(&self) { + // Get current timestamp. + let now = duration_to_u32(self.monitor_starting_point.elapsed()); + + // Set heartbeat timestamp and update counter.
+ let _ = self.heartbeat_state.update(|mut state| { + let start_ts = state.start_timestamp(); + state.set_heartbeat_timestamp_offset(now - start_ts); + state.increment_counter(); + Some(state) + }); + } +} + +#[cfg(test)] +mod test_common { + use crate::TimeRange; + use core::time::Duration; + use std::thread::sleep; + use std::time::Instant; + + pub(super) const TAG: &str = "heartbeat_monitor"; + + pub(super) fn sleep_until(target: Duration, start: Instant) { + let elapsed = start.elapsed(); + let diff = target.saturating_sub(elapsed); + sleep(diff) + } + + pub(super) fn range_from_ms(min: u64, max: u64) -> TimeRange { + TimeRange::new(Duration::from_millis(min), Duration::from_millis(max)) + } +} + +#[score_testing_macros::test_mod_with_log] +#[cfg(all(test, not(loom)))] +mod tests { + use crate::common::{MonitorEvaluationError, MonitorEvaluator, TimeRange}; + use crate::heartbeat::heartbeat_monitor::test_common::{range_from_ms, sleep_until, TAG}; + use crate::heartbeat::heartbeat_monitor::HeartbeatEvaluationError; + use crate::heartbeat::{HeartbeatMonitor, HeartbeatMonitorBuilder}; + use crate::protected_memory::ProtectedMemoryAllocator; + use crate::tag::MonitorTag; + use core::sync::atomic::{AtomicBool, Ordering}; + use core::time::Duration; + use std::sync::Arc; + use std::thread::{sleep, spawn}; + use std::time::Instant; + + fn create_monitor_single_cycle(range: TimeRange) -> HeartbeatMonitor { + let monitor_tag = MonitorTag::from(TAG); + let internal_processing_cycle = Duration::from_millis(1); + let allocator = ProtectedMemoryAllocator {}; + HeartbeatMonitorBuilder::new(range).build(monitor_tag, internal_processing_cycle, &allocator) + } + + #[test] + fn test_no_beat_evaluate_early() { + let range = range_from_ms(80, 120); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // No beat happened, no error is expected. 
+ monitor.inner.evaluate(hmon_starting_point, &mut |monitor_tag, error| { + panic!("error happened, tag: {monitor_tag:?}, error: {error:?}") + }); + } + + #[test] + fn test_no_beat_evaluate_in_range() { + let range = range_from_ms(80, 120); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // Wait until middle of range. + sleep_until(Duration::from_millis(100), hmon_starting_point); + + // No beat happened, no error is expected. + monitor.inner.evaluate(hmon_starting_point, &mut |monitor_tag, error| { + panic!("error happened, tag: {monitor_tag:?}, error: {error:?}") + }); + } + #[test] + fn test_no_beat_evaluate_late() { + let range = range_from_ms(80, 120); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // Wait until late. + sleep_until(Duration::from_millis(150), hmon_starting_point); + + // No beat happened, too late error is expected. + monitor.inner.evaluate(hmon_starting_point, &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::TooLate.into()); + }); + } + + fn beat_eval_test( + beat_time: Duration, + eval_time: Duration, + on_error: &mut dyn FnMut(&MonitorTag, MonitorEvaluationError), + ) { + let range = range_from_ms(80, 120); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // Wait and beat. + sleep_until(beat_time, hmon_starting_point); + monitor.heartbeat(); + + // Wait and evaluate. 
+ sleep_until(eval_time, hmon_starting_point); + monitor.inner.evaluate(hmon_starting_point, on_error); + } + + fn beat_early_test(eval_time: Duration) { + beat_eval_test(Duration::from_millis(25), eval_time, &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::TooEarly.into()); + }); + } + + #[test] + fn test_beat_early_evaluate_early() { + beat_early_test(Duration::from_millis(50)); + } + + #[test] + fn test_beat_early_evaluate_in_range() { + beat_early_test(Duration::from_millis(100)); + } + + #[test] + fn test_beat_early_evaluate_late() { + beat_early_test(Duration::from_millis(150)); + } + + fn beat_in_range_test(eval_time: Duration) { + beat_eval_test(Duration::from_millis(90), eval_time, &mut |monitor_tag, error| { + panic!("error happened, tag: {monitor_tag:?}, error: {error:?}") + }); + } + + #[test] + fn test_beat_in_range_evaluate_in_range() { + beat_in_range_test(Duration::from_millis(100)); + } + + #[test] + fn test_beat_in_range_evaluate_late() { + beat_in_range_test(Duration::from_millis(150)); + } + + #[test] + fn test_beat_late_evaluate_late() { + beat_eval_test( + Duration::from_millis(150), + Duration::from_millis(200), + &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::TooLate.into()); + }, + ) + } + + fn multiple_beats_eval_test(beat_time: Duration, eval_time: Duration) { + let range = range_from_ms(80, 120); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // Wait and beat. + sleep_until(beat_time, hmon_starting_point); + const NUM_BEATS: usize = 10; + for _ in 0..NUM_BEATS { + monitor.heartbeat(); + } + + // Wait and evaluate. 
+ sleep_until(eval_time, hmon_starting_point); + monitor.inner.evaluate(hmon_starting_point, &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::MultipleHeartbeats.into()); + }); + } + + #[test] + fn test_multiple_beats_early_evaluate_early() { + multiple_beats_eval_test(Duration::from_millis(25), Duration::from_millis(50)) + } + + #[test] + fn test_multiple_beats_early_evaluate_in_range() { + multiple_beats_eval_test(Duration::from_millis(25), Duration::from_millis(100)) + } + + #[test] + fn test_multiple_beats_early_evaluate_late() { + multiple_beats_eval_test(Duration::from_millis(25), Duration::from_millis(150)) + } + + #[test] + fn test_multiple_beats_in_range_evaluate_in_range() { + multiple_beats_eval_test(Duration::from_millis(90), Duration::from_millis(100)) + } + + #[test] + fn test_multiple_beats_in_range_evaluate_late() { + multiple_beats_eval_test(Duration::from_millis(90), Duration::from_millis(150)) + } + + #[test] + fn test_multiple_beats_late_evaluate_late() { + multiple_beats_eval_test(Duration::from_millis(150), Duration::from_millis(200)) + } + + fn create_monitor_multiple_cycles(cycle: Duration) -> Arc { + let range = range_from_ms(80, 120); + let monitor_tag = MonitorTag::from(TAG); + let allocator = ProtectedMemoryAllocator {}; + let monitor = HeartbeatMonitorBuilder::new(range).build(monitor_tag, cycle, &allocator); + Arc::new(monitor) + } + + #[test] + fn test_cycle_early() { + let cycle = Duration::from_millis(20); + let monitor = create_monitor_multiple_cycles(cycle); + let hmon_starting_point = Instant::now(); + + // Run heartbeat thread. 
+ let monitor_clone = monitor.clone(); + let heartbeat_finished = Arc::new(AtomicBool::new(false)); + let heartbeat_finished_clone = heartbeat_finished.clone(); + let heartbeat_thread = spawn(move || { + const NUM_BEATS: u32 = 3; + const BEAT_INTERVAL: Duration = Duration::from_millis(100); + for i in 1..NUM_BEATS { + sleep_until(i * BEAT_INTERVAL, hmon_starting_point); + monitor_clone.heartbeat(); + } + + // Perform a last heartbeat in shorter interval. + sleep_until( + NUM_BEATS * BEAT_INTERVAL - Duration::from_millis(40), + hmon_starting_point, + ); + monitor_clone.heartbeat(); + + heartbeat_finished_clone.store(true, Ordering::Release); + }); + + // Run evaluation thread. + while !heartbeat_finished.load(Ordering::Acquire) { + sleep(cycle); + // Too early error is expected. + monitor.inner.evaluate(hmon_starting_point, &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::TooEarly.into()); + }); + } + + heartbeat_thread.join().unwrap(); + } + + #[test] + fn test_cycle_in_range() { + let cycle = Duration::from_millis(20); + let monitor = create_monitor_multiple_cycles(cycle); + let hmon_starting_point = Instant::now(); + + // Run heartbeat thread. + let monitor_clone = monitor.clone(); + let heartbeat_finished = Arc::new(AtomicBool::new(false)); + let heartbeat_finished_clone = heartbeat_finished.clone(); + let heartbeat_thread = spawn(move || { + const NUM_BEATS: u32 = 3; + const BEAT_INTERVAL: Duration = Duration::from_millis(100); + for i in 1..=NUM_BEATS { + sleep_until(i * BEAT_INTERVAL, hmon_starting_point); + monitor_clone.heartbeat(); + } + heartbeat_finished_clone.store(true, Ordering::Release); + }); + + // Run evaluation thread. + while !heartbeat_finished.load(Ordering::Acquire) { + sleep(cycle); + // No error is expected. 
+ monitor.inner.evaluate(hmon_starting_point, &mut |monitor_tag, error| { + panic!("error happened, tag: {monitor_tag:?}, error: {error:?}") + }); + } + + heartbeat_thread.join().unwrap(); + } + + #[test] + fn test_cycle_late() { + let cycle = Duration::from_millis(20); + let monitor = create_monitor_multiple_cycles(cycle); + let hmon_starting_point = Instant::now(); + + // Run heartbeat thread. + let monitor_clone = monitor.clone(); + let heartbeat_finished = Arc::new(AtomicBool::new(false)); + let heartbeat_finished_clone = heartbeat_finished.clone(); + let heartbeat_thread = spawn(move || { + const NUM_BEATS: u32 = 3; + const BEAT_INTERVAL: Duration = Duration::from_millis(100); + for i in 1..NUM_BEATS { + sleep_until(i * BEAT_INTERVAL, hmon_starting_point); + monitor_clone.heartbeat(); + } + + // Perform the last heartbeat after a longer interval. + sleep_until( + NUM_BEATS * BEAT_INTERVAL + Duration::from_millis(40), + hmon_starting_point, + ); + monitor_clone.heartbeat(); + + heartbeat_finished_clone.store(true, Ordering::Release); + }); + + // Run evaluation thread. + while !heartbeat_finished.load(Ordering::Acquire) { + sleep(cycle); + // No heartbeat or too late error is expected. + monitor.inner.evaluate(hmon_starting_point, &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::TooLate.into()); + }); + } + + heartbeat_thread.join().unwrap(); + } + + #[test] + fn test_timestamp_offset() { + let range = range_from_ms(80, 120); + let monitor = create_monitor_single_cycle(range); + + // Move away monitor creation and HMON starting point. + sleep(Duration::from_millis(300)); + let hmon_starting_point = Instant::now(); + + // Wait and beat. + sleep_until(Duration::from_millis(90), hmon_starting_point); + monitor.heartbeat(); + + // Wait and evaluate.
+ sleep_until(Duration::from_millis(100), hmon_starting_point); + monitor.inner.evaluate(hmon_starting_point, &mut |monitor_tag, error| { + panic!("error happened, tag: {monitor_tag:?}, error: {error:?}") + }); + } +} + +#[cfg(all(test, loom))] +mod loom_tests { + use crate::common::MonitorEvaluator; + use crate::heartbeat::heartbeat_monitor::test_common::{range_from_ms, sleep_until, TAG}; + use crate::heartbeat::{HeartbeatEvaluationError, HeartbeatMonitor, HeartbeatMonitorBuilder}; + use crate::protected_memory::ProtectedMemoryAllocator; + use crate::tag::MonitorTag; + use crate::TimeRange; + use core::time::Duration; + use loom::thread::spawn; + use std::sync::Arc; + use std::time::Instant; + + fn create_monitor_single_cycle(range: TimeRange) -> Arc { + let monitor_tag = MonitorTag::from(TAG); + let internal_processing_cycle = Duration::from_millis(1); + let allocator = ProtectedMemoryAllocator {}; + Arc::new(HeartbeatMonitorBuilder::new(range).build(monitor_tag, internal_processing_cycle, &allocator)) + } + + #[test] + fn test_heartbeat_evaluate_too_early() { + loom::model(|| { + let range = range_from_ms(30, 70); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // Perform heartbeat in a separate thread. + let monitor_clone = monitor.clone(); + let heartbeat_thread = spawn(move || monitor_clone.heartbeat()); + + // Evaluate. + monitor.inner.evaluate(hmon_starting_point, &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::TooEarly.into()); + }); + + heartbeat_thread.join().unwrap(); + }); + } + + #[test] + fn test_heartbeat_evaluate_in_range() { + loom::model(|| { + let range = range_from_ms(30, 70); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // Wait until in range. + sleep_until(Duration::from_millis(50), hmon_starting_point); + + // Perform heartbeat in a separate thread. 
+ let monitor_clone = monitor.clone(); + let heartbeat_thread = spawn(move || monitor_clone.heartbeat()); + + // Evaluate. + monitor.inner.evaluate(hmon_starting_point, &mut |monitor_tag, error| { + panic!("error happened, tag: {monitor_tag:?}, error: {error:?}"); + }); + + heartbeat_thread.join().unwrap(); + }); + } + + #[test] + fn test_heartbeat_evaluate_too_late() { + loom::model(|| { + let range = range_from_ms(30, 70); + let monitor = create_monitor_single_cycle(range); + let hmon_starting_point = Instant::now(); + + // Wait until too late. + sleep_until(Duration::from_millis(100), hmon_starting_point); + + // Perform heartbeat in a separate thread. + let monitor_clone = monitor.clone(); + let heartbeat_thread = spawn(move || monitor_clone.heartbeat()); + + // Evaluate. + let mut error_detected = false; + monitor.inner.evaluate(hmon_starting_point, &mut |monitor_tag, error| { + assert_eq!(*monitor_tag, MonitorTag::from(TAG)); + assert_eq!(error, HeartbeatEvaluationError::TooLate.into()); + error_detected = true; + }); + + heartbeat_thread.join().unwrap(); + assert!(error_detected); + }); + } +} diff --git a/src/health_monitoring_lib/rust/heartbeat/heartbeat_state.rs b/src/health_monitoring_lib/rust/heartbeat/heartbeat_state.rs new file mode 100644 index 00000000..4f8d8d96 --- /dev/null +++ b/src/health_monitoring_lib/rust/heartbeat/heartbeat_state.rs @@ -0,0 +1,321 @@ +// ******************************************************************************* +// Copyright (c) 2026 Contributors to the Eclipse Foundation +// +// See the NOTICE file(s) distributed with this work for additional +// information regarding copyright ownership. 
+// +// This program and the accompanying materials are made available under the +// terms of the Apache License Version 2.0 which is available at +// +// +// SPDX-License-Identifier: Apache-2.0 +// ******************************************************************************* + +use core::cmp::min; + +#[cfg(not(loom))] +use core::sync::atomic::{AtomicU64, Ordering}; +#[cfg(loom)] +use loom::sync::atomic::{AtomicU64, Ordering}; + +/// Snapshot of a heartbeat state. +/// Data layout: +/// - cycle start timestamp: 32 bits +/// - heartbeat timestamp offset: 29 bits +/// - heartbeat counter: 2 bits +/// - post-init flag: 1 bit +#[derive(Clone, Copy, Default)] +pub struct HeartbeatStateSnapshot(u64); + +const START_MASK: u64 = 0xFFFFFFFF_00000000; +const START_OFFSET: u32 = u32::BITS; +const BEAT_MASK: u64 = 0x00000000_FFFFFFF8; +const BEAT_OFFSET: u32 = 3; +const COUNT_MASK: u64 = 0b0110; +const COUNT_OFFSET: u32 = 1; +const POST_INIT_MASK: u64 = 0b0001; + +impl HeartbeatStateSnapshot { + /// Create a new snapshot with known starting point. + /// `post_init` flag is implicitly set to 1. + pub fn new(start_timestamp: u32) -> Self { + let mut snapshot = Self::default(); + snapshot.set_start_timestamp(start_timestamp); + snapshot.set_post_init(true); + snapshot + } + + /// Return underlying data. + pub fn as_u64(&self) -> u64 { + self.0 + } + + /// Cycle start timestamp. + pub fn start_timestamp(&self) -> u32 { + ((self.0 & START_MASK) >> START_OFFSET) as u32 + } + + /// Set cycle start timestamp. + pub fn set_start_timestamp(&mut self, value: u32) { + self.0 = ((value as u64) << START_OFFSET) | (self.0 & !START_MASK); + } + + /// Heartbeat timestamp offset. + pub fn heartbeat_timestamp_offset(&self) -> u32 { + ((self.0 & BEAT_MASK) >> BEAT_OFFSET) as u32 + } + + /// Set heartbeat timestamp offset. + /// Value is 29-bit, must not exceed 0x1FFFFFFF.
+ pub fn set_heartbeat_timestamp_offset(&mut self, value: u32) { + assert!(value < 1 << 29, "provided heartbeat offset is out of range"); + self.0 = ((value as u64) << BEAT_OFFSET) | (self.0 & !BEAT_MASK); + } + + /// Heartbeat counter. + pub fn counter(&self) -> u8 { + ((self.0 & COUNT_MASK) >> COUNT_OFFSET) as u8 + } + + /// Increment heartbeat counter. + /// Value is 2-bit, larger values are saturated to max value (3). + pub fn increment_counter(&mut self) { + let value = min(self.counter() + 1, 3); + self.0 = ((value as u64) << COUNT_OFFSET) | (self.0 & !COUNT_MASK); + } + + /// Post-init state. + /// This should be `false` only before first cycle is concluded. + pub fn post_init(&self) -> bool { + let value = self.0 & POST_INIT_MASK; + value != 0 + } + + /// Set post-init state. + pub fn set_post_init(&mut self, value: bool) { + self.0 = (value as u64) | (self.0 & !POST_INIT_MASK); + } +} + +impl From for HeartbeatStateSnapshot { + fn from(value: u64) -> Self { + Self(value) + } +} + +/// Atomic representation of [`HeartbeatStateSnapshot`]. +pub struct HeartbeatState(AtomicU64); + +impl HeartbeatState { + /// Create a new [`HeartbeatState`] using provided [`HeartbeatStateSnapshot`]. + pub fn new(snapshot: HeartbeatStateSnapshot) -> Self { + Self(AtomicU64::new(snapshot.as_u64())) + } + + /// Return a snapshot of the current heartbeat state. + pub fn snapshot(&self) -> HeartbeatStateSnapshot { + HeartbeatStateSnapshot::from(self.0.load(Ordering::Relaxed)) + } + + /// Update the heartbeat state using the provided closure. + /// Closure receives the current state and should return an [`Option`] containing a new state. + /// If [`None`] is returned then the state was not updated. 
+ pub fn update Option>( + &self, + mut f: F, + ) -> Result { + // Prev values returned + self.0 + .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |prev| { + let snapshot = HeartbeatStateSnapshot::from(prev); + f(snapshot).map(|new_snapshot| new_snapshot.as_u64()) + }) + .map(HeartbeatStateSnapshot::from) + .map_err(HeartbeatStateSnapshot::from) + } +} + +#[cfg(all(test, not(loom)))] +mod tests { + use crate::heartbeat::heartbeat_state::{HeartbeatState, HeartbeatStateSnapshot}; + use core::cmp::min; + use core::sync::atomic::Ordering; + + #[test] + fn test_snapshot_new_zero() { + let state = HeartbeatStateSnapshot::new(0); + + assert_eq!(state.as_u64(), 0x01); + assert_eq!(state.start_timestamp(), 0); + assert_eq!(state.heartbeat_timestamp_offset(), 0); + assert_eq!(state.counter(), 0); + assert!(state.post_init()); + } + + #[test] + fn test_snapshot_new_valid() { + let state = HeartbeatStateSnapshot::new(0xDEADBEEF); + + assert_eq!(state.as_u64(), (0xDEADBEEF << u32::BITS) + 0x01); + assert_eq!(state.start_timestamp(), 0xDEADBEEF); + assert_eq!(state.heartbeat_timestamp_offset(), 0); + assert_eq!(state.counter(), 0); + assert!(state.post_init()); + } + + #[test] + fn test_snapshot_new_max() { + let state = HeartbeatStateSnapshot::new(u32::MAX); + + assert_eq!(state.as_u64(), ((u32::MAX as u64) << u32::BITS) + 0x01); + assert_eq!(state.start_timestamp(), u32::MAX); + assert_eq!(state.heartbeat_timestamp_offset(), 0); + assert_eq!(state.counter(), 0); + assert!(state.post_init()); + } + + #[test] + fn test_snapshot_from_u64_zero() { + let state = HeartbeatStateSnapshot::from(0); + + assert_eq!(state.as_u64(), 0); + assert_eq!(state.start_timestamp(), 0); + assert_eq!(state.heartbeat_timestamp_offset(), 0); + assert_eq!(state.counter(), 0); + assert!(!state.post_init()); + } + + #[test] + fn test_snapshot_from_u64_valid() { + let state = HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF); + + assert_eq!(state.as_u64(), 0xDEADBEEF_DEADBEEF); + 
assert_eq!(state.start_timestamp(), 0xDEADBEEF); + assert_eq!(state.heartbeat_timestamp_offset(), 0xDEADBEEF >> 3); + assert_eq!(state.counter(), 3); + assert!(state.post_init()); + } + + #[test] + fn test_snapshot_from_u64_max() { + let state = HeartbeatStateSnapshot::from(u64::MAX); + + assert_eq!(state.as_u64(), u64::MAX); + assert_eq!(state.start_timestamp(), u32::MAX); + assert_eq!(state.heartbeat_timestamp_offset(), u32::MAX >> 3); + assert_eq!(state.counter(), 3); + assert!(state.post_init()); + } + + #[test] + fn test_snapshot_default() { + let state = HeartbeatStateSnapshot::default(); + + assert_eq!(state.as_u64(), 0); + assert_eq!(state.start_timestamp(), 0); + assert_eq!(state.heartbeat_timestamp_offset(), 0); + assert_eq!(state.counter(), 0); + assert!(!state.post_init()); + } + + #[test] + fn test_snapshot_set_start_timestamp() { + let mut state = HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF); + state.set_start_timestamp(0xCAFEBAAD); + + assert_eq!(state.start_timestamp(), 0xCAFEBAAD); + + // Check other parameters unchanged. + assert_eq!(state.heartbeat_timestamp_offset(), 0xDEADBEEF >> 3); + assert_eq!(state.counter(), 3); + assert!(state.post_init()); + } + + #[test] + fn test_snapshot_set_heartbeat_timestamp_valid() { + let mut state = HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF); + state.set_heartbeat_timestamp_offset(0x1CAFEBAD); + + assert_eq!(state.heartbeat_timestamp_offset(), 0x1CAFEBAD); + + // Check other parameters unchanged. 
+ assert_eq!(state.start_timestamp(), 0xDEADBEEF); + assert_eq!(state.counter(), 3); + assert!(state.post_init()); + } + + #[test] + #[should_panic(expected = "provided heartbeat offset is out of range")] + fn test_snapshot_set_heartbeat_timestamp_out_of_range() { + let mut state = HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF); + state.set_heartbeat_timestamp_offset(0x20000000); + } + + #[test] + fn test_snapshot_counter_increment() { + let mut state = HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEE9); + + // Max value is 3, check if saturates. + for i in 1..=4 { + state.increment_counter(); + assert_eq!(state.counter(), min(i, 3)); + } + + // Check other parameters unchanged. + assert_eq!(state.start_timestamp(), 0xDEADBEEF); + assert_eq!(state.heartbeat_timestamp_offset(), 0xDEADBEEF >> 3); + assert!(state.post_init()); + } + + #[test] + fn test_snapshot_set_post_init() { + let mut state = HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF); + + state.set_post_init(false); + assert!(!state.post_init()); + state.set_post_init(true); + assert!(state.post_init()); + + // Check other parameters unchanged. + assert_eq!(state.start_timestamp(), 0xDEADBEEF); + assert_eq!(state.heartbeat_timestamp_offset(), 0xDEADBEEF >> 3); + assert_eq!(state.counter(), 3); + } + + #[test] + fn test_state_new() { + let state = HeartbeatState::new(HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF)); + assert_eq!(state.0.load(Ordering::Relaxed), 0xDEADBEEF_DEADBEEF); + } + + #[test] + fn test_state_snapshot() { + let state = HeartbeatState::new(HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF)); + assert_eq!(state.snapshot().as_u64(), 0xDEADBEEF_DEADBEEF); + } + + #[test] + fn test_state_update_some() { + let state = HeartbeatState::new(HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF)); + let _ = state.update(|prev_snapshot| { + // Make sure state is as expected. 
+ assert_eq!(prev_snapshot.as_u64(), 0xDEADBEEF_DEADBEEF); + assert_eq!(prev_snapshot.start_timestamp(), 0xDEADBEEF); + assert_eq!(prev_snapshot.heartbeat_timestamp_offset(), 0xDEADBEEF >> 3); + assert_eq!(prev_snapshot.counter(), 3); + assert!(prev_snapshot.post_init()); + + Some(HeartbeatStateSnapshot::from(0)) + }); + + assert_eq!(state.snapshot().as_u64(), 0); + } + + #[test] + fn test_state_update_none() { + let state = HeartbeatState::new(HeartbeatStateSnapshot::from(0xDEADBEEF_DEADBEEF)); + let _ = state.update(|_| None); + + assert_eq!(state.snapshot().as_u64(), 0xDEADBEEF_DEADBEEF); + } +} diff --git a/src/health_monitoring_lib/rust/heartbeat/mod.rs b/src/health_monitoring_lib/rust/heartbeat/mod.rs new file mode 100644 index 00000000..4afe2ab5 --- /dev/null +++ b/src/health_monitoring_lib/rust/heartbeat/mod.rs @@ -0,0 +1,18 @@ +// ******************************************************************************* +// Copyright (c) 2026 Contributors to the Eclipse Foundation +// +// See the NOTICE file(s) distributed with this work for additional +// information regarding copyright ownership. 
+// +// This program and the accompanying materials are made available under the +// terms of the Apache License Version 2.0 which is available at +// +// +// SPDX-License-Identifier: Apache-2.0 +// ******************************************************************************* + +mod heartbeat_monitor; +mod heartbeat_state; + +pub(crate) use heartbeat_monitor::HeartbeatEvaluationError; +pub use heartbeat_monitor::{HeartbeatMonitor, HeartbeatMonitorBuilder}; diff --git a/src/health_monitoring_lib/rust/lib.rs b/src/health_monitoring_lib/rust/lib.rs index 492e6378..845a0fd0 100644 --- a/src/health_monitoring_lib/rust/lib.rs +++ b/src/health_monitoring_lib/rust/lib.rs @@ -15,21 +15,27 @@ mod common; mod ffi; mod log; mod protected_memory; +mod supervisor_api_client; mod tag; mod worker; pub mod deadline; +pub mod heartbeat; -use crate::common::MonitorEvalHandle; +use crate::common::{HasEvalHandle, MonitorEvalHandle}; +use crate::deadline::{DeadlineMonitor, DeadlineMonitorBuilder}; +use crate::heartbeat::{HeartbeatMonitor, HeartbeatMonitorBuilder}; pub use common::TimeRange; use containers::fixed_capacity::FixedCapacityVec; use core::time::Duration; use std::collections::HashMap; pub use tag::{DeadlineTag, MonitorTag}; +/// Builder for the [`HealthMonitor`]. #[derive(Default)] pub struct HealthMonitorBuilder { - deadline_monitor_builders: HashMap, + deadline_monitor_builders: HashMap, + heartbeat_monitor_builders: HashMap, supervisor_api_cycle: Duration, internal_processing_cycle: Duration, } @@ -39,31 +45,50 @@ impl HealthMonitorBuilder { pub fn new() -> Self { Self { deadline_monitor_builders: HashMap::new(), + heartbeat_monitor_builders: HashMap::new(), supervisor_api_cycle: Duration::from_millis(500), internal_processing_cycle: Duration::from_millis(100), } } - /// Adds a deadline monitor for a specific identifier tag. - /// # Arguments - /// * `monitor_tag` - The unique identifier for the deadline monitor. - /// * `monitor` - The builder for the deadline monitor. 
+ /// Add a [`DeadlineMonitor`] for the given [`MonitorTag`]. + /// + /// - `monitor_tag` - unique tag for the [`DeadlineMonitor`]. + /// - `monitor_builder` - monitor builder to finalize. + /// + /// # Note + /// + /// If a deadline monitor with the same tag already exists, it will be overwritten. + pub fn add_deadline_monitor(mut self, monitor_tag: MonitorTag, monitor_builder: DeadlineMonitorBuilder) -> Self { + self.add_deadline_monitor_internal(monitor_tag, monitor_builder); + self + } + + /// Add a [`HeartbeatMonitor`] for the given [`MonitorTag`]. + /// + /// - `monitor_tag` - unique tag for the [`HeartbeatMonitor`]. + /// - `monitor_builder` - monitor builder to finalize. + /// /// # Note - /// If a monitor with the same tag already exists, it will be overwritten. - pub fn add_deadline_monitor(mut self, monitor_tag: MonitorTag, monitor: deadline::DeadlineMonitorBuilder) -> Self { - self.add_deadline_monitor_internal(monitor_tag, monitor); + /// + /// If a heartbeat monitor with the same tag already exists, it will be overwritten. + pub fn add_heartbeat_monitor(mut self, monitor_tag: MonitorTag, monitor_builder: HeartbeatMonitorBuilder) -> Self { + self.add_heartbeat_monitor_internal(monitor_tag, monitor_builder); self } - /// Sets the cycle duration for supervisor API notifications. - /// This duration determines how often the health monitor notifies the supervisor that the system is alive. + /// Set the interval between supervisor API notifications. + /// This duration determines how often the health monitor notifies the supervisor about system liveness. + /// + /// - `cycle_duration` - interval between notifications. pub fn with_supervisor_api_cycle(mut self, cycle_duration: Duration) -> Self { self.with_supervisor_api_cycle_internal(cycle_duration); self } - /// Sets the internal processing cycle duration. - /// This duration determines how often the health monitor checks deadlines. + /// Set the internal interval between health monitor evaluations. 
+ /// + /// - `cycle_duration` - interval between evaluations. pub fn with_internal_processing_cycle(mut self, cycle_duration: Duration) -> Self { self.with_internal_processing_cycle_internal(cycle_duration); self @@ -84,9 +109,17 @@ impl HealthMonitorBuilder { pub(crate) fn add_deadline_monitor_internal( &mut self, monitor_tag: MonitorTag, - monitor: deadline::DeadlineMonitorBuilder, + monitor_builder: DeadlineMonitorBuilder, + ) { + self.deadline_monitor_builders.insert(monitor_tag, monitor_builder); + } + + pub(crate) fn add_heartbeat_monitor_internal( + &mut self, + monitor_tag: MonitorTag, + monitor_builder: HeartbeatMonitorBuilder, ) { - self.deadline_monitor_builders.insert(monitor_tag, monitor); + self.heartbeat_monitor_builders.insert(monitor_tag, monitor_builder); } pub(crate) fn with_supervisor_api_cycle_internal(&mut self, cycle_duration: Duration) { @@ -109,69 +142,111 @@ impl HealthMonitorBuilder { // Create deadline monitors. let mut deadline_monitors = HashMap::new(); for (tag, builder) in self.deadline_monitor_builders { - deadline_monitors.insert( + deadline_monitors.insert(tag, Some(MonitorState::Available(builder.build(tag, &allocator)))); + } + + // Create heartbeat monitors. + let mut heartbeat_monitors = HashMap::new(); + for (tag, builder) in self.heartbeat_monitor_builders { + heartbeat_monitors.insert( tag, - Some(DeadlineMonitorState::Available(builder.build(tag, &allocator))), + Some(MonitorState::Available(builder.build( + tag, + self.internal_processing_cycle, + &allocator, + ))), ); } HealthMonitor { deadline_monitors, + heartbeat_monitors, worker: worker::UniqueThreadRunner::new(self.internal_processing_cycle), supervisor_api_cycle: self.supervisor_api_cycle, } } } -enum DeadlineMonitorState { - Available(deadline::DeadlineMonitor), - Taken(common::MonitorEvalHandle), +/// Monitor ownership state in the [`HealthMonitor`]. +enum MonitorState { + /// Monitor is available. + Available(Monitor), + /// Monitor is already taken. 
+ Taken(MonitorEvalHandle), } +/// Monitor container. +/// - Must be an option to ensure monitor can be taken out (not referenced). +/// - Must be an enum to ensure evaluation handle is still available for HMON after monitor is taken. +type MonitorContainer = Option>; + +/// Health monitor. pub struct HealthMonitor { - deadline_monitors: HashMap>, + deadline_monitors: HashMap>, + heartbeat_monitors: HashMap>, worker: worker::UniqueThreadRunner, supervisor_api_cycle: Duration, } impl HealthMonitor { - /// Retrieves and removes (hand over to user) a deadline monitor associated with the given identifier tag. - /// # Arguments - /// * `monitor_tag` - The unique identifier for the deadline monitor. - /// # Returns - /// An Option containing the DeadlineMonitor if found, or None if - /// - no monitor exists for the given tag or was already obtained - /// - pub fn get_deadline_monitor(&mut self, monitor_tag: MonitorTag) -> Option { - let monitor = self.deadline_monitors.get_mut(&monitor_tag)?; - - match monitor.take() { - Some(DeadlineMonitorState::Available(deadline_monitor)) => { - monitor.replace(DeadlineMonitorState::Taken(deadline_monitor.get_eval_handle())); + fn get_monitor( + monitors: &mut HashMap>, + monitor_tag: MonitorTag, + ) -> Option { + let monitor_state = monitors.get_mut(&monitor_tag)?; - Some(deadline_monitor) + match monitor_state.take() { + Some(MonitorState::Available(monitor)) => { + monitor_state.replace(MonitorState::Taken(monitor.get_eval_handle())); + Some(monitor) }, - Some(DeadlineMonitorState::Taken(v)) => { - monitor.replace(DeadlineMonitorState::Taken(v)); // Insert back + Some(MonitorState::Taken(handle)) => { + // Taken handle is inserted back. + monitor_state.replace(MonitorState::Taken(handle)); None }, None => None, } } - /// Starts the health monitoring logic in a separate thread. + /// Get and pass ownership of a [`DeadlineMonitor`] for the given [`MonitorTag`]. + /// + /// - `monitor_tag` - unique tag for the [`DeadlineMonitor`]. 
+ /// + /// Returns [`Some`] containing [`DeadlineMonitor`] if found and not taken. + /// Otherwise returns [`None`]. + pub fn get_deadline_monitor(&mut self, monitor_tag: MonitorTag) -> Option { + Self::get_monitor(&mut self.deadline_monitors, monitor_tag) + } + + /// Get and pass ownership of a [`HeartbeatMonitor`] for the given [`MonitorTag`]. + /// + /// - `monitor_tag` - unique tag for the [`HeartbeatMonitor`]. + /// + /// Returns [`Some`] containing [`HeartbeatMonitor`] if found and not taken. + /// Otherwise returns [`None`]. + pub fn get_heartbeat_monitor(&mut self, monitor_tag: MonitorTag) -> Option { + Self::get_monitor(&mut self.heartbeat_monitors, monitor_tag) + } + + /// Start the health monitoring logic in a separate thread. /// /// From this point, the health monitor will periodically check monitors and notify the supervisor about system liveness. /// - /// # Note - /// - This function shall be called before Lifecycle.running() otherwise the supervisor might consider the process not alive. - /// - Stops when the HealthMonitor instance is dropped. + /// # Notes + /// + /// This method shall be called before `Lifecycle.running()`. + /// Otherwise the supervisor might consider the process not alive. + /// + /// Health monitoring logic stops when the [`HealthMonitor`] is dropped. + /// + /// # Panics /// - /// Panics if no monitors have been added. + /// Method panics if no monitors have been added. pub fn start(&mut self) { assert!( self.check_monitors_exist_internal(), - "No deadline monitors have been added. HealthMonitor cannot start without any monitors." + "No monitors have been added. HealthMonitor cannot start without any monitors." 
); let monitors = match self.collect_monitors_internal() { @@ -183,20 +258,22 @@ impl HealthMonitor { } pub(crate) fn check_monitors_exist_internal(&self) -> bool { - !self.deadline_monitors.is_empty() + !self.deadline_monitors.is_empty() || !self.heartbeat_monitors.is_empty() } - pub(crate) fn collect_monitors_internal(&mut self) -> Result, String> { - let mut monitors = FixedCapacityVec::new(self.deadline_monitors.len()); - for (tag, monitor) in self.deadline_monitors.iter_mut() { + fn collect_given_monitors( + monitors_to_collect: &mut HashMap>, + collected_monitors: &mut FixedCapacityVec, + ) -> Result<(), String> { + for (tag, monitor) in monitors_to_collect.iter_mut() { match monitor.take() { - Some(DeadlineMonitorState::Taken(handle)) => { - if monitors.push(handle).is_err() { - // Should not fail since we preallocated enough capacity + Some(MonitorState::Taken(handle)) => { + if collected_monitors.push(handle).is_err() { + // Should not fail - capacity was preallocated. return Err("Failed to push monitor handle".to_string()); } }, - Some(DeadlineMonitorState::Available(_)) => { + Some(MonitorState::Available(_)) => { return Err(format!( "All monitors must be taken before starting HealthMonitor but {:?} is not taken.", tag @@ -210,19 +287,28 @@ impl HealthMonitor { }, } } - Ok(monitors) + Ok(()) + } + + pub(crate) fn collect_monitors_internal(&mut self) -> Result, String> { + let num_monitors = self.deadline_monitors.len() + self.heartbeat_monitors.len(); + let mut collected_monitors = FixedCapacityVec::new(num_monitors); + Self::collect_given_monitors(&mut self.deadline_monitors, &mut collected_monitors)?; + Self::collect_given_monitors(&mut self.heartbeat_monitors, &mut collected_monitors)?; + Ok(collected_monitors) } pub(crate) fn start_internal(&mut self, monitors: FixedCapacityVec) { let monitoring_logic = worker::MonitoringLogic::new( monitors, self.supervisor_api_cycle, - // Currently only `ScoreSupervisorAPIClient` and `StubSupervisorAPIClient` are 
supported. - // The later is meant to be used for testing purposes. - #[cfg(not(any(test, feature = "stub_supervisor_api_client")))] - worker::ScoreSupervisorAPIClient::new(), - #[cfg(any(test, feature = "stub_supervisor_api_client"))] - worker::StubSupervisorAPIClient {}, + #[cfg(all(not(test), feature = "score_supervisor_api_client"))] + supervisor_api_client::score_supervisor_api_client::ScoreSupervisorAPIClient::new(), + #[cfg(any( + test, + all(feature = "stub_supervisor_api_client", not(feature = "score_supervisor_api_client")) + ))] + supervisor_api_client::stub_supervisor_api_client::StubSupervisorAPIClient::new(), ); self.worker.start(monitoring_logic) @@ -237,7 +323,7 @@ mod tests { use super::*; #[test] - #[should_panic(expected = "No deadline monitors have been added. HealthMonitor cannot start without any monitors.")] + #[should_panic(expected = "No monitors have been added. HealthMonitor cannot start without any monitors.")] fn hm_with_no_monitors_shall_panic_on_start() { let health_monitor_builder = super::HealthMonitorBuilder::new(); health_monitor_builder.build().start(); @@ -254,10 +340,7 @@ mod tests { #[test] fn hm_with_taken_monitors_starts() { let mut health_monitor = HealthMonitorBuilder::new() - .add_deadline_monitor( - MonitorTag::from("test_monitor"), - deadline::DeadlineMonitorBuilder::new(), - ) + .add_deadline_monitor(MonitorTag::from("test_monitor"), DeadlineMonitorBuilder::new()) .build(); let _monitor = health_monitor.get_deadline_monitor(MonitorTag::from("test_monitor")); @@ -270,10 +353,7 @@ mod tests { )] fn hm_with_monitors_shall_not_start_with_not_taken_monitors() { let mut health_monitor = HealthMonitorBuilder::new() - .add_deadline_monitor( - MonitorTag::from("test_monitor"), - deadline::DeadlineMonitorBuilder::new(), - ) + .add_deadline_monitor(MonitorTag::from("test_monitor"), DeadlineMonitorBuilder::new()) .build(); health_monitor.start(); @@ -282,10 +362,7 @@ mod tests { #[test] fn hm_get_deadline_monitor_works() { let mut 
health_monitor = HealthMonitorBuilder::new() - .add_deadline_monitor( - MonitorTag::from("test_monitor"), - deadline::DeadlineMonitorBuilder::new(), - ) + .add_deadline_monitor(MonitorTag::from("test_monitor"), DeadlineMonitorBuilder::new()) .build(); { diff --git a/src/health_monitoring_lib/rust/supervisor_api_client/mod.rs b/src/health_monitoring_lib/rust/supervisor_api_client/mod.rs new file mode 100644 index 00000000..195d167a --- /dev/null +++ b/src/health_monitoring_lib/rust/supervisor_api_client/mod.rs @@ -0,0 +1,28 @@ +// ******************************************************************************* +// Copyright (c) 2026 Contributors to the Eclipse Foundation +// +// See the NOTICE file(s) distributed with this work for additional +// information regarding copyright ownership. +// +// This program and the accompanying materials are made available under the +// terms of the Apache License Version 2.0 which is available at +// +// +// SPDX-License-Identifier: Apache-2.0 +// ******************************************************************************* + +//! Module providing [`SupervisorAPIClient`] implementations. +//! Currently `ScoreSupervisorAPIClient` and `StubSupervisorAPIClient` are supported. +//! The latter is meant for testing purposes. + +/// An abstraction over the API used to notify the supervisor about process liveness. +pub trait SupervisorAPIClient { + fn notify_alive(&self); +} + +// NOTE: various implementations are not mutually exclusive. 
+ +#[cfg(feature = "score_supervisor_api_client")] +pub mod score_supervisor_api_client; +#[cfg(feature = "stub_supervisor_api_client")] +pub mod stub_supervisor_api_client; diff --git a/src/health_monitoring_lib/rust/supervisor_api_client/score_supervisor_api_client.rs b/src/health_monitoring_lib/rust/supervisor_api_client/score_supervisor_api_client.rs new file mode 100644 index 00000000..a198f9ad --- /dev/null +++ b/src/health_monitoring_lib/rust/supervisor_api_client/score_supervisor_api_client.rs @@ -0,0 +1,40 @@ +// ******************************************************************************* +// Copyright (c) 2026 Contributors to the Eclipse Foundation +// +// See the NOTICE file(s) distributed with this work for additional +// information regarding copyright ownership. +// +// This program and the accompanying materials are made available under the +// terms of the Apache License Version 2.0 which is available at +// +// +// SPDX-License-Identifier: Apache-2.0 +// ******************************************************************************* + +#![allow(dead_code)] + +use crate::log::debug; +use crate::supervisor_api_client::SupervisorAPIClient; +use crate::worker::Checks; + +pub struct ScoreSupervisorAPIClient { + supervisor_link: monitor_rs::Monitor, +} + +unsafe impl Send for ScoreSupervisorAPIClient {} // Just assuming it's safe to send across threads, this is a temporary solution + +impl ScoreSupervisorAPIClient { + pub fn new() -> Self { + let value = std::env::var("IDENTIFIER").expect("IDENTIFIER env not set"); + debug!("ScoreSupervisorAPIClient: Creating with IDENTIFIER={}", value); + // This is only temporary usage so unwrap is fine here. 
+ let supervisor_link = monitor_rs::Monitor::::new(&value).expect("Failed to create supervisor_link"); + Self { supervisor_link } + } +} + +impl SupervisorAPIClient for ScoreSupervisorAPIClient { + fn notify_alive(&self) { + self.supervisor_link.report_checkpoint(Checks::WorkerCheckpoint); + } +} diff --git a/src/health_monitoring_lib/rust/supervisor_api_client/stub_supervisor_api_client.rs b/src/health_monitoring_lib/rust/supervisor_api_client/stub_supervisor_api_client.rs new file mode 100644 index 00000000..e98f4909 --- /dev/null +++ b/src/health_monitoring_lib/rust/supervisor_api_client/stub_supervisor_api_client.rs @@ -0,0 +1,32 @@ +// ******************************************************************************* +// Copyright (c) 2026 Contributors to the Eclipse Foundation +// +// See the NOTICE file(s) distributed with this work for additional +// information regarding copyright ownership. +// +// This program and the accompanying materials are made available under the +// terms of the Apache License Version 2.0 which is available at +// +// +// SPDX-License-Identifier: Apache-2.0 +// ******************************************************************************* + +#![allow(dead_code)] + +use crate::log::warn; +use crate::supervisor_api_client::SupervisorAPIClient; + +/// A stub implementation of the SupervisorAPIClient that logs alive notifications. 
+pub struct StubSupervisorAPIClient; + +impl StubSupervisorAPIClient { + pub fn new() -> Self { + Self + } +} + +impl SupervisorAPIClient for StubSupervisorAPIClient { + fn notify_alive(&self) { + warn!("StubSupervisorAPIClient: notify_alive called"); + } +} diff --git a/src/health_monitoring_lib/rust/tag.rs b/src/health_monitoring_lib/rust/tag.rs index eca868df..204902b7 100644 --- a/src/health_monitoring_lib/rust/tag.rs +++ b/src/health_monitoring_lib/rust/tag.rs @@ -247,13 +247,41 @@ mod tests { } #[test] - fn tag_hash() { - let example_str = "EXAMPLE"; - let tag = Tag::from(example_str.to_string()); - let mut hasher = DefaultHasher::new(); - tag.hash(&mut hasher); - let hash = hasher.finish(); - assert_eq!(hash, 14738755424381306335); + fn tag_hash_is_eq() { + let tag1 = Tag::from("same"); + let hash1 = { + let mut hasher = DefaultHasher::new(); + tag1.hash(&mut hasher); + hasher.finish() + }; + + let tag2 = Tag::from("same"); + let hash2 = { + let mut hasher = DefaultHasher::new(); + tag2.hash(&mut hasher); + hasher.finish() + }; + + assert_eq!(hash1, hash2); + } + + #[test] + fn tag_hash_is_ne() { + let tag1 = Tag::from("first"); + let hash1 = { + let mut hasher = DefaultHasher::new(); + tag1.hash(&mut hasher); + hasher.finish() + }; + + let tag2 = Tag::from("second"); + let hash2 = { + let mut hasher = DefaultHasher::new(); + tag2.hash(&mut hasher); + hasher.finish() + }; + + assert_ne!(hash1, hash2); } #[test] diff --git a/src/health_monitoring_lib/rust/worker.rs b/src/health_monitoring_lib/rust/worker.rs index 8830e153..c494595d 100644 --- a/src/health_monitoring_lib/rust/worker.rs +++ b/src/health_monitoring_lib/rust/worker.rs @@ -10,20 +10,20 @@ // // SPDX-License-Identifier: Apache-2.0 // ******************************************************************************* -use crate::common::{MonitorEvalHandle, MonitorEvaluator}; -use crate::log::{debug, info, warn}; +use crate::common::{MonitorEvalHandle, MonitorEvaluationError, MonitorEvaluator}; +use 
crate::log::{info, warn}; +use crate::supervisor_api_client::SupervisorAPIClient; use containers::fixed_capacity::FixedCapacityVec; - -/// An abstraction over the API used to notify the supervisor about process liveness. -pub(super) trait SupervisorAPIClient { - fn notify_alive(&self); -} +use core::sync::atomic::{AtomicBool, Ordering}; +use core::time::Duration; +use std::sync::Arc; +use std::time::Instant; pub(super) struct MonitoringLogic { monitors: FixedCapacityVec, client: T, - last_notification: std::time::Instant, - supervisor_api_cycle: core::time::Duration, + last_notification: Instant, + supervisor_api_cycle: Duration, } impl MonitoringLogic { @@ -34,31 +34,45 @@ impl MonitoringLogic { /// * `client` - An implementation of the SupervisorAPIClient trait. pub(super) fn new( monitors: FixedCapacityVec, - supervisor_api_cycle: core::time::Duration, + supervisor_api_cycle: Duration, client: T, ) -> Self { Self { monitors, client, supervisor_api_cycle, - last_notification: std::time::Instant::now(), + last_notification: Instant::now(), } } - fn run(&mut self) -> bool { + fn run(&mut self, hmon_starting_point: Instant) -> bool { let mut has_any_error = false; for monitor in self.monitors.iter() { - monitor.evaluate(&mut |monitor_tag, error| { + monitor.evaluate(hmon_starting_point, &mut |monitor_tag, error| { has_any_error = true; - // TODO: monitor type should be mentioned. 
- warn!("Monitor with tag {:?} reported error: {:?}.", monitor_tag, error); + + match error { + MonitorEvaluationError::Deadline(deadline_evaluation_error) => { + warn!( + "Deadline monitor with tag {:?} reported error: {:?}.", + monitor_tag, deadline_evaluation_error + ) + }, + MonitorEvaluationError::Heartbeat(heartbeat_evaluation_error) => { + warn!( + "Heartbeat monitor with tag {:?} reported error: {:?}.", + monitor_tag, heartbeat_evaluation_error + ) + }, + MonitorEvaluationError::Logic => unimplemented!(), + } }); } if !has_any_error { if self.last_notification.elapsed() > self.supervisor_api_cycle { - self.last_notification = std::time::Instant::now(); + self.last_notification = Instant::now(); self.client.notify_alive(); } } else { @@ -73,15 +87,15 @@ impl MonitoringLogic { /// A struct that manages a unique thread for running monitoring logic periodically. pub struct UniqueThreadRunner { handle: Option>, - should_stop: std::sync::Arc, - internal_duration_cycle: core::time::Duration, + should_stop: Arc, + internal_duration_cycle: Duration, } impl UniqueThreadRunner { - pub(super) fn new(internal_duration_cycle: core::time::Duration) -> Self { + pub(super) fn new(internal_duration_cycle: Duration) -> Self { Self { handle: None, - should_stop: std::sync::Arc::new(core::sync::atomic::AtomicBool::new(false)), + should_stop: Arc::new(AtomicBool::new(false)), internal_duration_cycle, } } @@ -96,15 +110,16 @@ impl UniqueThreadRunner { std::thread::spawn(move || { info!("Monitoring thread started."); + let hmon_starting_point = Instant::now(); let mut next_sleep_time = interval; // TODO Add some checks and log if cyclicly here is not met. 
- while !should_stop.load(core::sync::atomic::Ordering::Relaxed) { + while !should_stop.load(Ordering::Relaxed) { std::thread::sleep(next_sleep_time); - let now = std::time::Instant::now(); + let now = Instant::now(); - if !monitoring_logic.run() { + if !monitoring_logic.run(hmon_starting_point) { info!("Monitoring logic failed, stopping thread."); break; } @@ -118,7 +133,7 @@ impl UniqueThreadRunner { } pub fn join(&mut self) { - self.should_stop.store(true, core::sync::atomic::Ordering::Relaxed); + self.should_stop.store(true, Ordering::Relaxed); if let Some(handle) = self.handle.take() { let _ = handle.join(); } @@ -131,20 +146,9 @@ impl Drop for UniqueThreadRunner { } } -/// A stub implementation of the SupervisorAPIClient that logs alive notifications. -#[allow(dead_code)] -pub(super) struct StubSupervisorAPIClient; - -#[allow(dead_code)] -impl SupervisorAPIClient for StubSupervisorAPIClient { - fn notify_alive(&self) { - warn!("StubSupervisorAPIClient: notify_alive called"); - } -} - #[allow(dead_code)] #[derive(Copy, Clone)] -enum Checks { +pub(crate) enum Checks { WorkerCheckpoint, } @@ -156,59 +160,42 @@ impl From for u32 { } } -#[allow(dead_code)] -pub(super) struct ScoreSupervisorAPIClient { - supervisor_link: monitor_rs::Monitor, -} - -unsafe impl Send for ScoreSupervisorAPIClient {} // Just assuming it's safe to send across threads, this is a temporary solution - -#[allow(dead_code)] -impl ScoreSupervisorAPIClient { - pub fn new() -> Self { - let value = std::env::var("IDENTIFIER").expect("IDENTIFIER env not set"); - debug!("ScoreSupervisorAPIClient: Creating with IDENTIFIER={}", value); - // This is only temporary usage so unwrap is fine here. 
- let supervisor_link = monitor_rs::Monitor::::new(&value).expect("Failed to create supervisor_link"); - Self { supervisor_link } - } -} -impl SupervisorAPIClient for ScoreSupervisorAPIClient { - fn notify_alive(&self) { - self.supervisor_link.report_checkpoint(Checks::WorkerCheckpoint); - } -} - #[score_testing_macros::test_mod_with_log] #[cfg(test)] mod tests { + use crate::common::HasEvalHandle; use crate::deadline::{DeadlineMonitor, DeadlineMonitorBuilder}; use crate::protected_memory::ProtectedMemoryAllocator; + use crate::supervisor_api_client::SupervisorAPIClient; use crate::tag::{DeadlineTag, MonitorTag}; + use crate::worker::{MonitoringLogic, UniqueThreadRunner}; use crate::TimeRange; - - use super::*; + use containers::fixed_capacity::FixedCapacityVec; + use core::sync::atomic::{AtomicUsize, Ordering}; + use core::time::Duration; + use std::sync::Arc; + use std::time::Instant; #[derive(Clone)] struct MockSupervisorAPIClient { - pub notify_called: std::sync::Arc, + pub notify_called: Arc, } impl MockSupervisorAPIClient { pub fn new() -> Self { Self { - notify_called: std::sync::Arc::new(core::sync::atomic::AtomicUsize::new(0)), + notify_called: Arc::new(AtomicUsize::new(0)), } } fn get_notify_count(&self) -> usize { - self.notify_called.load(core::sync::atomic::Ordering::Acquire) + self.notify_called.load(Ordering::Acquire) } } impl SupervisorAPIClient for MockSupervisorAPIClient { fn notify_alive(&self) { - self.notify_called.fetch_add(1, core::sync::atomic::Ordering::AcqRel); + self.notify_called.fetch_add(1, Ordering::AcqRel); } } @@ -218,14 +205,11 @@ mod tests { DeadlineMonitorBuilder::new() .add_deadline( DeadlineTag::from("deadline_long"), - TimeRange::new(core::time::Duration::from_secs(1), core::time::Duration::from_secs(50)), + TimeRange::new(Duration::from_secs(1), Duration::from_secs(50)), ) .add_deadline( DeadlineTag::from("deadline_fast"), - TimeRange::new( - core::time::Duration::from_millis(0), - core::time::Duration::from_millis(50), - ), 
+ TimeRange::new(Duration::from_millis(0), Duration::from_millis(50)), ) .build(monitor_tag, &allocator) } @@ -234,6 +218,7 @@ mod tests { fn monitoring_logic_report_error_when_deadline_failed() { let deadline_monitor = create_monitor_with_deadlines(); let alive_mock = MockSupervisorAPIClient::new(); + let hmon_starting_point = Instant::now(); let mut logic = MonitoringLogic::new( { @@ -241,7 +226,7 @@ mod tests { vec.push(deadline_monitor.get_eval_handle()).unwrap(); vec }, - core::time::Duration::from_secs(1), + Duration::from_secs(1), alive_mock.clone(), ); @@ -252,7 +237,7 @@ mod tests { drop(handle); - assert!(!logic.run()); + assert!(!logic.run(hmon_starting_point)); assert_eq!(alive_mock.get_notify_count(), 0); } @@ -260,6 +245,7 @@ mod tests { fn monitoring_logic_report_alive_on_each_call_when_no_error() { let deadline_monitor = create_monitor_with_deadlines(); let alive_mock = MockSupervisorAPIClient::new(); + let hmon_starting_point = Instant::now(); let mut logic = MonitoringLogic::new( { @@ -267,7 +253,7 @@ mod tests { vec.push(deadline_monitor.get_eval_handle()).unwrap(); vec }, - core::time::Duration::from_nanos(0), // Make sure each call notifies alive + Duration::from_nanos(0), // Make sure each call notifies alive alive_mock.clone(), ); @@ -276,11 +262,11 @@ mod tests { .unwrap(); let _handle = deadline.start().unwrap(); - assert!(logic.run()); - assert!(logic.run()); - assert!(logic.run()); - assert!(logic.run()); - assert!(logic.run()); + assert!(logic.run(hmon_starting_point)); + assert!(logic.run(hmon_starting_point)); + assert!(logic.run(hmon_starting_point)); + assert!(logic.run(hmon_starting_point)); + assert!(logic.run(hmon_starting_point)); assert_eq!(alive_mock.get_notify_count(), 5); } @@ -289,6 +275,7 @@ mod tests { fn monitoring_logic_report_alive_respect_cycle() { let deadline_monitor = create_monitor_with_deadlines(); let alive_mock = MockSupervisorAPIClient::new(); + let hmon_starting_point = Instant::now(); let mut logic = 
MonitoringLogic::new( { @@ -296,7 +283,7 @@ mod tests { vec.push(deadline_monitor.get_eval_handle()).unwrap(); vec }, - core::time::Duration::from_millis(30), + Duration::from_millis(30), alive_mock.clone(), ); @@ -305,20 +292,20 @@ mod tests { .unwrap(); let _handle = deadline.start().unwrap(); - std::thread::sleep(core::time::Duration::from_millis(30)); - assert!(logic.run()); + std::thread::sleep(Duration::from_millis(30)); + assert!(logic.run(hmon_starting_point)); - std::thread::sleep(core::time::Duration::from_millis(30)); - assert!(logic.run()); + std::thread::sleep(Duration::from_millis(30)); + assert!(logic.run(hmon_starting_point)); - std::thread::sleep(core::time::Duration::from_millis(30)); - assert!(logic.run()); + std::thread::sleep(Duration::from_millis(30)); + assert!(logic.run(hmon_starting_point)); - std::thread::sleep(core::time::Duration::from_millis(30)); - assert!(logic.run()); + std::thread::sleep(Duration::from_millis(30)); + assert!(logic.run(hmon_starting_point)); - std::thread::sleep(core::time::Duration::from_millis(30)); - assert!(logic.run()); + std::thread::sleep(Duration::from_millis(30)); + assert!(logic.run(hmon_starting_point)); assert_eq!(alive_mock.get_notify_count(), 5); } @@ -335,11 +322,11 @@ mod tests { vec.push(deadline_monitor.get_eval_handle()).unwrap(); vec }, - core::time::Duration::from_nanos(0), // Make sure each call notifies alive + Duration::from_nanos(0), // Make sure each call notifies alive alive_mock.clone(), ); - let mut worker = UniqueThreadRunner::new(core::time::Duration::from_millis(10)); + let mut worker = UniqueThreadRunner::new(Duration::from_millis(10)); worker.start(logic); let mut deadline = deadline_monitor @@ -348,7 +335,7 @@ mod tests { let handle = deadline.start().unwrap(); - std::thread::sleep(core::time::Duration::from_millis(70)); + std::thread::sleep(Duration::from_millis(70)); let current_count = alive_mock.get_notify_count(); assert!( @@ -358,7 +345,7 @@ mod tests { ); // We shall not get 
any new alive calls. - std::thread::sleep(core::time::Duration::from_millis(50)); + std::thread::sleep(Duration::from_millis(50)); assert_eq!(alive_mock.get_notify_count(), current_count); handle.stop(); }