279 lines
9.1 KiB
Rust
279 lines
9.1 KiB
Rust
#![deny(warnings)]
|
|
#![allow(dead_code)]
|
|
//! [TSM.ID].[11031972] -- Platform X Ecosystem
|
|
//! xcu-sentinel -- System Watchdog with Resource Monitoring
|
|
//! CPU/RAM/disk monitoring, threshold alerts, SLA enforcement
|
|
|
|
use std::collections::VecDeque;
|
|
use std::sync::{Arc, Mutex};
|
|
use std::time::SystemTime;
|
|
|
|
/// Errors reported by the sentinel watchdog.
///
/// Every variant carries a free-form description that is appended to a fixed
/// prefix when the error is rendered through `Display`.
#[derive(Debug)]
pub enum SentinelError {
    /// A monitored value crossed a configured limit.
    ///
    /// NOTE(review): nothing in this file constructs this variant; it is
    /// presumably reserved for callers -- confirm before removing.
    ThresholdExceeded(String),

    /// The monitor itself could not do its job (used for a poisoned lock).
    MonitorFailed(String),

    /// The inputs or configuration handed to the sentinel are unusable
    /// (used when an uptime is requested with zero recorded checks).
    ConfigError(String),
}
|
|
|
|
impl std::fmt::Display for SentinelError {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
match self {
|
|
Self::ThresholdExceeded(e) => write!(f, "Threshold exceeded: {e}"),
|
|
Self::MonitorFailed(e) => write!(f, "Monitor failed: {e}"),
|
|
Self::ConfigError(e) => write!(f, "Config error: {e}"),
|
|
}
|
|
}
|
|
}
|
|
|
|
// Marker impl with no overridden methods: together with the derived `Debug`
// and the hand-written `Display`, it lets `SentinelError` be used wherever a
// `std::error::Error` is expected (for example as `Box<dyn std::error::Error>`).
impl std::error::Error for SentinelError {}
|
|
|
|
/// One sample of system resource usage handed to `Sentinel::record`.
#[derive(Debug, Clone)]
pub struct ResourceSnapshot {
    /// CPU utilisation in percent; compared against the CPU thresholds.
    pub cpu_percent: f64,

    /// Memory currently in use, in MB.
    pub memory_used_mb: u64,

    /// Total memory, in MB. `Sentinel::record` derives a usage percentage from
    /// `memory_used_mb / memory_total_mb` and treats a total of `0` as 0 %.
    pub memory_total_mb: u64,

    /// Disk usage in percent; compared against the disk critical threshold.
    pub disk_used_percent: f64,

    /// Number of open connections. Stored with the snapshot; no threshold in
    /// this file is evaluated against it.
    pub open_connections: u32,

    /// Caller-supplied sample time, copied into every alert raised for this
    /// snapshot. Presumably Unix epoch seconds (cf. `Sentinel::now_epoch`) --
    /// TODO confirm with the producers of these snapshots.
    pub timestamp: u64,
}
|
|
|
|
/// Limits that `Sentinel::record` checks each snapshot against.
///
/// All comparisons are inclusive (`value >= limit`). For CPU and memory the
/// critical limit is tested first, so at most one alert per resource is raised.
#[derive(Debug, Clone)]
pub struct AlertThreshold {
    /// CPU percentage at or above which a critical alert is raised.
    pub cpu_critical: f64,

    /// CPU percentage at or above which a warning is raised (when the critical
    /// limit is not reached).
    pub cpu_warning: f64,

    /// Memory usage percentage at or above which a critical alert is raised.
    pub memory_critical_percent: f64,

    /// Memory usage percentage at or above which a warning is raised (when the
    /// critical limit is not reached).
    pub memory_warning_percent: f64,

    /// Disk usage percentage at or above which a critical alert is raised.
    /// There is no disk warning tier.
    pub disk_critical_percent: f64,

    /// Response-time limit in milliseconds.
    ///
    /// NOTE(review): no check in this file reads this field -- confirm whether
    /// a response-time check lives elsewhere or is still to be written.
    pub response_time_ms_critical: u64,
}
|
|
|
|
impl Default for AlertThreshold {
|
|
fn default() -> Self {
|
|
Self {
|
|
cpu_critical: 90.0,
|
|
cpu_warning: 70.0,
|
|
memory_critical_percent: 85.0,
|
|
memory_warning_percent: 70.0,
|
|
disk_critical_percent: 90.0,
|
|
response_time_ms_critical: 5000,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Severity attached to an `Alert`.
///
/// The enum is a plain tag, so it is `Copy` and comparable with `==`; this
/// lets callers filter or assert on levels without pattern matching.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AlertLevel {
    /// Informational. Not produced by any check in this file.
    Info,
    /// A warning limit was reached but the critical limit was not.
    Warning,
    /// A critical limit was reached.
    Critical,
    /// Not produced by any check in this file.
    Fatal,
}
|
|
|
|
#[derive(Debug, Clone)]
|
|
pub struct Alert {
|
|
pub level: AlertLevel,
|
|
pub resource: String,
|
|
pub message: String,
|
|
pub value: f64,
|
|
pub threshold: f64,
|
|
pub timestamp: u64,
|
|
}
|
|
|
|
pub struct Sentinel {
|
|
thresholds: AlertThreshold,
|
|
history: Arc<Mutex<VecDeque<ResourceSnapshot>>>,
|
|
alerts: Arc<Mutex<Vec<Alert>>>,
|
|
max_history: usize,
|
|
}
|
|
|
|
impl Sentinel {
|
|
pub fn new(thresholds: AlertThreshold, max_history: usize) -> Self {
|
|
Self {
|
|
thresholds,
|
|
history: Arc::new(Mutex::new(VecDeque::with_capacity(max_history))),
|
|
alerts: Arc::new(Mutex::new(Vec::new())),
|
|
max_history,
|
|
}
|
|
}
|
|
|
|
/// Record a resource snapshot and check thresholds
|
|
pub fn record(&self, snapshot: ResourceSnapshot) -> Result<Vec<Alert>, SentinelError> {
|
|
let mut new_alerts = Vec::new();
|
|
let ts = snapshot.timestamp;
|
|
|
|
// CPU check
|
|
if snapshot.cpu_percent >= self.thresholds.cpu_critical {
|
|
new_alerts.push(Alert {
|
|
level: AlertLevel::Critical,
|
|
resource: "cpu".into(),
|
|
message: format!("CPU {}% >= {}%", snapshot.cpu_percent, self.thresholds.cpu_critical),
|
|
value: snapshot.cpu_percent,
|
|
threshold: self.thresholds.cpu_critical,
|
|
timestamp: ts,
|
|
});
|
|
} else if snapshot.cpu_percent >= self.thresholds.cpu_warning {
|
|
new_alerts.push(Alert {
|
|
level: AlertLevel::Warning,
|
|
resource: "cpu".into(),
|
|
message: format!("CPU {}% >= {}%", snapshot.cpu_percent, self.thresholds.cpu_warning),
|
|
value: snapshot.cpu_percent,
|
|
threshold: self.thresholds.cpu_warning,
|
|
timestamp: ts,
|
|
});
|
|
}
|
|
|
|
// Memory check
|
|
let mem_percent = if snapshot.memory_total_mb > 0 {
|
|
(snapshot.memory_used_mb as f64 / snapshot.memory_total_mb as f64) * 100.0
|
|
} else {
|
|
0.0
|
|
};
|
|
if mem_percent >= self.thresholds.memory_critical_percent {
|
|
new_alerts.push(Alert {
|
|
level: AlertLevel::Critical,
|
|
resource: "memory".into(),
|
|
message: format!("Memory {:.1}% >= {}%", mem_percent, self.thresholds.memory_critical_percent),
|
|
value: mem_percent,
|
|
threshold: self.thresholds.memory_critical_percent,
|
|
timestamp: ts,
|
|
});
|
|
} else if mem_percent >= self.thresholds.memory_warning_percent {
|
|
new_alerts.push(Alert {
|
|
level: AlertLevel::Warning,
|
|
resource: "memory".into(),
|
|
message: format!("Memory {:.1}% >= {}%", mem_percent, self.thresholds.memory_warning_percent),
|
|
value: mem_percent,
|
|
threshold: self.thresholds.memory_warning_percent,
|
|
timestamp: ts,
|
|
});
|
|
}
|
|
|
|
// Disk check
|
|
if snapshot.disk_used_percent >= self.thresholds.disk_critical_percent {
|
|
new_alerts.push(Alert {
|
|
level: AlertLevel::Critical,
|
|
resource: "disk".into(),
|
|
message: format!("Disk {:.1}% >= {}%", snapshot.disk_used_percent, self.thresholds.disk_critical_percent),
|
|
value: snapshot.disk_used_percent,
|
|
threshold: self.thresholds.disk_critical_percent,
|
|
timestamp: ts,
|
|
});
|
|
}
|
|
|
|
// Store history
|
|
if let Ok(mut hist) = self.history.lock() {
|
|
if hist.len() >= self.max_history {
|
|
hist.pop_front();
|
|
}
|
|
hist.push_back(snapshot);
|
|
}
|
|
|
|
// Store alerts
|
|
if let Ok(mut alert_log) = self.alerts.lock() {
|
|
for a in &new_alerts {
|
|
alert_log.push(a.clone());
|
|
}
|
|
}
|
|
|
|
Ok(new_alerts)
|
|
}
|
|
|
|
/// Calculate moving average of CPU over last N samples
|
|
pub fn cpu_moving_average(&self, window: usize) -> Result<f64, SentinelError> {
|
|
if let Ok(hist) = self.history.lock() {
|
|
let samples: Vec<f64> = hist.iter().rev().take(window).map(|s| s.cpu_percent).collect();
|
|
if samples.is_empty() {
|
|
return Ok(0.0);
|
|
}
|
|
let sum: f64 = samples.iter().sum();
|
|
Ok(sum / samples.len() as f64)
|
|
} else {
|
|
Err(SentinelError::MonitorFailed("Lock poisoned".into()))
|
|
}
|
|
}
|
|
|
|
/// Detect anomaly: sudden spike compared to moving average
|
|
pub fn detect_anomaly(&self, current_cpu: f64, window: usize) -> Result<bool, SentinelError> {
|
|
let avg = self.cpu_moving_average(window)?;
|
|
if avg > 0.0 {
|
|
let deviation = (current_cpu - avg).abs() / avg;
|
|
Ok(deviation > 0.5) // 50% deviation = anomaly
|
|
} else {
|
|
Ok(false)
|
|
}
|
|
}
|
|
|
|
/// SLA check: uptime percentage
|
|
pub fn calculate_uptime(&self, total_checks: u64, failed_checks: u64) -> Result<f64, SentinelError> {
|
|
if total_checks == 0 {
|
|
return Err(SentinelError::ConfigError("No checks recorded".into()));
|
|
}
|
|
let uptime = ((total_checks - failed_checks) as f64 / total_checks as f64) * 100.0;
|
|
Ok(uptime)
|
|
}
|
|
|
|
/// Get current epoch timestamp
|
|
pub fn now_epoch() -> u64 {
|
|
SystemTime::now()
|
|
.duration_since(SystemTime::UNIX_EPOCH)
|
|
.map(|d| d.as_secs())
|
|
.unwrap_or(0)
|
|
}
|
|
|
|
pub fn get_alerts(&self) -> Vec<Alert> {
|
|
self.alerts.lock().map(|a| a.clone()).unwrap_or_default()
|
|
}
|
|
|
|
pub fn get_history(&self) -> Vec<ResourceSnapshot> {
|
|
self.history.lock().map(|h| h.iter().cloned().collect()).unwrap_or_default()
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Sentinel with the default thresholds and room for 100 snapshots.
    fn default_sentinel() -> Sentinel {
        Sentinel::new(AlertThreshold::default(), 100)
    }

    /// Snapshot on a machine with 8000 MB of memory.
    fn snapshot(
        cpu_percent: f64,
        memory_used_mb: u64,
        disk_used_percent: f64,
        open_connections: u32,
        timestamp: u64,
    ) -> ResourceSnapshot {
        ResourceSnapshot {
            cpu_percent,
            memory_used_mb,
            memory_total_mb: 8000,
            disk_used_percent,
            open_connections,
            timestamp,
        }
    }

    /// 95 % CPU is above the default critical limit, so the first alert
    /// returned must be critical.
    #[test]
    fn test_cpu_critical_alert() {
        let watchdog = default_sentinel();
        let raised = watchdog
            .record(snapshot(95.0, 4000, 50.0, 100, 1000))
            .unwrap();
        assert!(!raised.is_empty());
        assert!(matches!(raised[0].level, AlertLevel::Critical));
    }

    /// Ten samples rising from 30 % to 39 %: the mean of the last five lies
    /// strictly between 30 and 40.
    #[test]
    fn test_moving_average() {
        let watchdog = default_sentinel();
        for step in 0..10u64 {
            let _ = watchdog.record(snapshot(30.0 + step as f64, 2000, 40.0, 50, step));
        }
        let mean = watchdog.cpu_moving_average(5).unwrap();
        assert!(mean > 30.0 && mean < 40.0);
    }

    /// Against a flat 30 % baseline, a reading of 80 % deviates by far more
    /// than half of the average and must be flagged.
    #[test]
    fn test_anomaly_detection() {
        let watchdog = default_sentinel();
        for tick in 0..20u64 {
            let _ = watchdog.record(snapshot(30.0, 2000, 40.0, 50, tick));
        }
        let flagged = watchdog.detect_anomaly(80.0, 10).unwrap();
        assert!(flagged);
    }

    /// One failure in a thousand checks leaves the uptime above 99.8 %.
    #[test]
    fn test_sla_uptime() {
        let watchdog = default_sentinel();
        let percent_up = watchdog.calculate_uptime(1000, 1).unwrap();
        assert!(percent_up > 99.8);
    }
}
|