// Source file: multiverse/xcom-ultra/xcu-sentinel/src/lib.rs
// (Rust, 279 lines, 9.1 KiB)

#![deny(warnings)]
#![allow(dead_code)]
//! [TSM.ID].[11031972] -- Platform X Ecosystem
//! xcu-sentinel -- System Watchdog with Resource Monitoring
//! CPU/RAM/disk monitoring, threshold alerts, SLA enforcement
use std::collections::VecDeque;
use std::sync::{Arc, Mutex};
use std::time::SystemTime;
/// Errors reported by the sentinel watchdog.
///
/// Every variant carries a free-form, human-readable detail string.
#[derive(Debug)]
pub enum SentinelError {
    /// A monitored value went past a configured limit.
    ThresholdExceeded(String),
    /// Monitoring itself could not be carried out (for example, a poisoned lock).
    MonitorFailed(String),
    /// The caller supplied unusable input or configuration.
    ConfigError(String),
}
impl std::fmt::Display for SentinelError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::ThresholdExceeded(e) => write!(f, "Threshold exceeded: {e}"),
Self::MonitorFailed(e) => write!(f, "Monitor failed: {e}"),
Self::ConfigError(e) => write!(f, "Config error: {e}"),
}
}
}
// Marker impl so `SentinelError` can be used wherever `std::error::Error` is
// expected (e.g. `Box<dyn Error>`). The body is empty, so no trait method is
// overridden here.
impl std::error::Error for SentinelError {}
/// One point-in-time reading of the monitored resources.
#[derive(Debug, Clone)]
pub struct ResourceSnapshot {
    /// CPU load; compared directly against the `cpu_*` thresholds.
    pub cpu_percent: f64,
    /// Memory in use, in megabytes.
    pub memory_used_mb: u64,
    /// Total memory, in megabytes. A value of 0 makes the derived memory
    /// percentage 0 in `Sentinel::record`.
    pub memory_total_mb: u64,
    /// Disk usage; compared directly against `disk_critical_percent`.
    pub disk_used_percent: f64,
    /// Number of open connections at sampling time.
    pub open_connections: u32,
    /// Sampling time; copied verbatim into any alert raised for this snapshot.
    pub timestamp: u64,
}
/// Limits that decide when a snapshot produces an alert.
#[derive(Debug, Clone)]
pub struct AlertThreshold {
    /// CPU value at or above which a critical alert is raised.
    pub cpu_critical: f64,
    /// CPU value at or above which a warning is raised (when below critical).
    pub cpu_warning: f64,
    /// Memory usage percentage at or above which a critical alert is raised.
    pub memory_critical_percent: f64,
    /// Memory usage percentage at or above which a warning is raised
    /// (when below critical).
    pub memory_warning_percent: f64,
    /// Disk usage percentage at or above which a critical alert is raised.
    pub disk_critical_percent: f64,
    /// Critical response-time limit in milliseconds.
    pub response_time_ms_critical: u64,
}
impl Default for AlertThreshold {
fn default() -> Self {
Self {
cpu_critical: 90.0,
cpu_warning: 70.0,
memory_critical_percent: 85.0,
memory_warning_percent: 70.0,
disk_critical_percent: 90.0,
response_time_ms_critical: 5000,
}
}
}
/// Severity attached to an alert, declared from least to most severe.
///
/// The derived ordering follows declaration order, so
/// `Info < Warning < Critical < Fatal`. This lets callers compare levels
/// with `==` or rank them (e.g. "at least `Critical`") instead of
/// pattern-matching every variant.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum AlertLevel {
    /// Informational only.
    Info,
    /// A warning limit was reached.
    Warning,
    /// A critical limit was reached.
    Critical,
    /// Most severe level.
    Fatal,
}
/// A single threshold breach detected while recording a snapshot.
#[derive(Debug, Clone)]
pub struct Alert {
    /// Severity of the breach.
    pub level: AlertLevel,
    /// Name of the resource that breached (`"cpu"`, `"memory"` or `"disk"`
    /// for alerts built by `Sentinel::record`).
    pub resource: String,
    /// Human-readable description of the breach.
    pub message: String,
    /// Observed value that triggered the alert.
    pub value: f64,
    /// Limit the observed value was compared against.
    pub threshold: f64,
    /// Timestamp of the snapshot that triggered the alert.
    pub timestamp: u64,
}
/// Watchdog state: configured limits plus the recorded snapshots and alerts.
///
/// Both collections sit behind `Arc<Mutex<..>>`, so every method can work
/// through `&self`.
pub struct Sentinel {
    /// Limits each recorded snapshot is checked against.
    thresholds: AlertThreshold,
    /// Recorded snapshots, oldest at the front.
    history: Arc<Mutex<VecDeque<ResourceSnapshot>>>,
    /// Log of every alert produced by `record`.
    alerts: Arc<Mutex<Vec<Alert>>>,
    /// Bound used by `record` when evicting the oldest snapshot.
    max_history: usize,
}
impl Sentinel {
pub fn new(thresholds: AlertThreshold, max_history: usize) -> Self {
Self {
thresholds,
history: Arc::new(Mutex::new(VecDeque::with_capacity(max_history))),
alerts: Arc::new(Mutex::new(Vec::new())),
max_history,
}
}
/// Record a resource snapshot and check thresholds
pub fn record(&self, snapshot: ResourceSnapshot) -> Result<Vec<Alert>, SentinelError> {
let mut new_alerts = Vec::new();
let ts = snapshot.timestamp;
// CPU check
if snapshot.cpu_percent >= self.thresholds.cpu_critical {
new_alerts.push(Alert {
level: AlertLevel::Critical,
resource: "cpu".into(),
message: format!("CPU {}% >= {}%", snapshot.cpu_percent, self.thresholds.cpu_critical),
value: snapshot.cpu_percent,
threshold: self.thresholds.cpu_critical,
timestamp: ts,
});
} else if snapshot.cpu_percent >= self.thresholds.cpu_warning {
new_alerts.push(Alert {
level: AlertLevel::Warning,
resource: "cpu".into(),
message: format!("CPU {}% >= {}%", snapshot.cpu_percent, self.thresholds.cpu_warning),
value: snapshot.cpu_percent,
threshold: self.thresholds.cpu_warning,
timestamp: ts,
});
}
// Memory check
let mem_percent = if snapshot.memory_total_mb > 0 {
(snapshot.memory_used_mb as f64 / snapshot.memory_total_mb as f64) * 100.0
} else {
0.0
};
if mem_percent >= self.thresholds.memory_critical_percent {
new_alerts.push(Alert {
level: AlertLevel::Critical,
resource: "memory".into(),
message: format!("Memory {:.1}% >= {}%", mem_percent, self.thresholds.memory_critical_percent),
value: mem_percent,
threshold: self.thresholds.memory_critical_percent,
timestamp: ts,
});
} else if mem_percent >= self.thresholds.memory_warning_percent {
new_alerts.push(Alert {
level: AlertLevel::Warning,
resource: "memory".into(),
message: format!("Memory {:.1}% >= {}%", mem_percent, self.thresholds.memory_warning_percent),
value: mem_percent,
threshold: self.thresholds.memory_warning_percent,
timestamp: ts,
});
}
// Disk check
if snapshot.disk_used_percent >= self.thresholds.disk_critical_percent {
new_alerts.push(Alert {
level: AlertLevel::Critical,
resource: "disk".into(),
message: format!("Disk {:.1}% >= {}%", snapshot.disk_used_percent, self.thresholds.disk_critical_percent),
value: snapshot.disk_used_percent,
threshold: self.thresholds.disk_critical_percent,
timestamp: ts,
});
}
// Store history
if let Ok(mut hist) = self.history.lock() {
if hist.len() >= self.max_history {
hist.pop_front();
}
hist.push_back(snapshot);
}
// Store alerts
if let Ok(mut alert_log) = self.alerts.lock() {
for a in &new_alerts {
alert_log.push(a.clone());
}
}
Ok(new_alerts)
}
/// Calculate moving average of CPU over last N samples
pub fn cpu_moving_average(&self, window: usize) -> Result<f64, SentinelError> {
if let Ok(hist) = self.history.lock() {
let samples: Vec<f64> = hist.iter().rev().take(window).map(|s| s.cpu_percent).collect();
if samples.is_empty() {
return Ok(0.0);
}
let sum: f64 = samples.iter().sum();
Ok(sum / samples.len() as f64)
} else {
Err(SentinelError::MonitorFailed("Lock poisoned".into()))
}
}
/// Detect anomaly: sudden spike compared to moving average
pub fn detect_anomaly(&self, current_cpu: f64, window: usize) -> Result<bool, SentinelError> {
let avg = self.cpu_moving_average(window)?;
if avg > 0.0 {
let deviation = (current_cpu - avg).abs() / avg;
Ok(deviation > 0.5) // 50% deviation = anomaly
} else {
Ok(false)
}
}
/// SLA check: uptime percentage
pub fn calculate_uptime(&self, total_checks: u64, failed_checks: u64) -> Result<f64, SentinelError> {
if total_checks == 0 {
return Err(SentinelError::ConfigError("No checks recorded".into()));
}
let uptime = ((total_checks - failed_checks) as f64 / total_checks as f64) * 100.0;
Ok(uptime)
}
/// Get current epoch timestamp
pub fn now_epoch() -> u64 {
SystemTime::now()
.duration_since(SystemTime::UNIX_EPOCH)
.map(|d| d.as_secs())
.unwrap_or(0)
}
pub fn get_alerts(&self) -> Vec<Alert> {
self.alerts.lock().map(|a| a.clone()).unwrap_or_default()
}
pub fn get_history(&self) -> Vec<ResourceSnapshot> {
self.history.lock().map(|h| h.iter().cloned().collect()).unwrap_or_default()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a snapshot whose memory (2000 of 8000 MB) and disk (40%) stay
    /// below every default threshold, so only `cpu_percent` can raise alerts.
    fn quiet_snapshot(cpu_percent: f64, timestamp: u64) -> ResourceSnapshot {
        ResourceSnapshot {
            cpu_percent,
            memory_used_mb: 2000,
            memory_total_mb: 8000,
            disk_used_percent: 40.0,
            open_connections: 50,
            timestamp,
        }
    }

    /// A CPU reading above the default critical limit yields a critical
    /// alert as the first returned alert.
    #[test]
    fn test_cpu_critical_alert() {
        let watchdog = Sentinel::new(AlertThreshold::default(), 100);
        let overloaded = ResourceSnapshot {
            cpu_percent: 95.0,
            memory_used_mb: 4000,
            memory_total_mb: 8000,
            disk_used_percent: 50.0,
            open_connections: 100,
            timestamp: 1000,
        };
        let raised = watchdog.record(overloaded).unwrap();
        assert!(!raised.is_empty());
        assert!(matches!(raised[0].level, AlertLevel::Critical));
    }

    /// With readings 30..=39 recorded, the average of the last five lies
    /// strictly between 30 and 40.
    #[test]
    fn test_moving_average() {
        let watchdog = Sentinel::new(AlertThreshold::default(), 100);
        for step in 0..10u64 {
            let _ = watchdog.record(quiet_snapshot(30.0 + step as f64, step));
        }
        let mean = watchdog.cpu_moving_average(5).unwrap();
        assert!(mean > 30.0 && mean < 40.0);
    }

    /// Against a flat 30% baseline, a reading of 80% is flagged as an anomaly.
    #[test]
    fn test_anomaly_detection() {
        let watchdog = Sentinel::new(AlertThreshold::default(), 100);
        for ts in 0..20 {
            let _ = watchdog.record(quiet_snapshot(30.0, ts));
        }
        let spiked = watchdog.detect_anomaly(80.0, 10).unwrap();
        assert!(spiked);
    }

    /// One failure in a thousand checks keeps uptime above 99.8%.
    #[test]
    fn test_sla_uptime() {
        let watchdog = Sentinel::new(AlertThreshold::default(), 100);
        let percent_up = watchdog.calculate_uptime(1000, 1).unwrap();
        assert!(percent_up > 99.8);
    }
}