parent
77cb687263
commit
599776736b
@ -1 +1,174 @@
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import psutil
|
||||
import threading
|
||||
import traceback
|
||||
from typing import Callable, Dict, List, Optional
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
|
||||
from swarms.utils.terminal_output import terminal
|
||||
|
||||
@dataclass
class SystemMetrics:
    """One point-in-time sample of host resource utilisation.

    Instances are created by ``HealthCheck.collect_metrics`` and kept in its
    ``metrics_history`` list.
    """

    # CPU utilisation as a percentage, compared against the "cpu_percent" threshold.
    cpu_percent: float
    # Memory utilisation as a percentage.
    memory_percent: float
    # Disk utilisation as a percentage.
    disk_usage_percent: float
    # Moment at which the sample was taken.
    timestamp: datetime
|
||||
|
||||
class HealthCheck:
    """System health monitoring and self-healing capabilities.

    The class samples CPU, memory and disk utilisation through ``psutil``,
    compares each sample with configurable thresholds, counts errors per
    exception type, and runs a registered recovery callback once an error
    type has been seen ``thresholds["error_threshold"]`` times.

    Attributes:
        metrics_history: Most recent samples, oldest first, capped at
            ``MAX_HISTORY`` entries.
        error_count: Number of handled errors per exception class name.
        recovery_actions: Recovery callback per exception class name.
        monitoring_thread: Background monitoring thread, or ``None`` if
            monitoring was never started.
        thresholds: Percentage limits for each metric plus the error count
            that triggers a recovery attempt.
    """

    # Maximum number of samples retained in ``metrics_history``.
    MAX_HISTORY = 100

    def __init__(self):
        self.metrics_history: List["SystemMetrics"] = []
        self.error_count: Dict[str, int] = {}
        self.recovery_actions: Dict[str, Callable] = {}
        self.monitoring_thread: Optional[threading.Thread] = None
        # Stored under a private name on purpose: an instance attribute called
        # ``stop_monitoring`` would shadow the ``stop_monitoring()`` method and
        # make it impossible to call.
        self._stop_event = threading.Event()

        # Default thresholds
        self.thresholds = {
            "cpu_percent": 90.0,
            "memory_percent": 85.0,
            "disk_usage_percent": 90.0,
            "error_threshold": 3,
        }

    def register_recovery_action(self, error_type: str, action: Callable):
        """Register a recovery action for a specific error type.

        Args:
            error_type: Exception class name (``type(exc).__name__``) the
                action applies to. An existing registration is replaced.
            action: Callable invoked with the triggering exception as its
                only argument.
        """
        self.recovery_actions[error_type] = action
        terminal.status_panel(f"Registered recovery action for {error_type}", "info")

    def collect_metrics(self) -> "Optional[SystemMetrics]":
        """Collect current system metrics and record them in the history.

        ``psutil.cpu_percent`` is called with ``interval=1``, so this call
        blocks while the CPU sample is taken.

        Returns:
            The new sample, or ``None`` if sampling failed (the failure is
            reported through the terminal and not raised).
        """
        try:
            metrics = SystemMetrics(
                cpu_percent=psutil.cpu_percent(interval=1),
                memory_percent=psutil.virtual_memory().percent,
                disk_usage_percent=psutil.disk_usage("/").percent,
                timestamp=datetime.now(),
            )
        except Exception as e:
            # Best effort: a failed sample must not take the caller down.
            terminal.status_panel(f"Error collecting metrics: {str(e)}", "error")
            return None

        self.metrics_history.append(metrics)
        # Keep only the newest MAX_HISTORY readings (no-op while below the cap).
        del self.metrics_history[:-self.MAX_HISTORY]
        return metrics

    def _find_issues(self, metrics: "SystemMetrics") -> List[str]:
        """Return a message for every metric in *metrics* above its threshold."""
        checks = (
            ("cpu_percent", "High CPU usage"),
            ("memory_percent", "High memory usage"),
            ("disk_usage_percent", "High disk usage"),
        )
        return [
            f"{label}: {getattr(metrics, key)}%"
            for key, label in checks
            if getattr(metrics, key) > self.thresholds[key]
        ]

    def check_system_health(self) -> bool:
        """Take a fresh sample and check it against the thresholds.

        Returns:
            ``True`` when a sample was collected and every metric is within
            its threshold; ``False`` when sampling failed or any threshold is
            exceeded (exceeded metrics are reported as a warning).
        """
        metrics = self.collect_metrics()
        if metrics is None:
            return False

        issues = self._find_issues(metrics)
        if issues:
            terminal.status_panel("\n".join(issues), "warning")
            return False

        return True

    def handle_error(self, error: Exception, context: str = ""):
        """Record an error and attempt recovery once it keeps recurring.

        Args:
            error: The exception to record; counted under its class name.
            context: Free-form description of where the error occurred, used
                only in the reported message.
        """
        error_type = type(error).__name__

        # Increment error count
        self.error_count[error_type] = self.error_count.get(error_type, 0) + 1

        # Format the traceback from the exception object itself so the report
        # is meaningful even when this method is called outside an ``except``
        # block (where ``traceback.format_exc()`` only yields "NoneType: None").
        details = "".join(
            traceback.format_exception(type(error), error, error.__traceback__)
        )
        terminal.status_panel(
            f"Error occurred in {context}: {str(error)}\n{details}",
            "error",
        )

        # Check if we need to take recovery action
        if self.error_count[error_type] >= self.thresholds["error_threshold"]:
            self.attempt_recovery(error_type, error)

    def attempt_recovery(self, error_type: str, error: Exception):
        """Run the recovery action registered for *error_type*, if any.

        The error counter for *error_type* is reset only when the action
        completes without raising. A failing action is reported, not raised.

        Args:
            error_type: Exception class name used to look up the action.
            error: The triggering exception, passed to the action.
        """
        terminal.status_panel(f"Attempting recovery for {error_type}", "info")

        action = self.recovery_actions.get(error_type)
        if action is None:
            terminal.status_panel(
                f"No recovery action registered for {error_type}",
                "warning",
            )
            return

        try:
            action(error)
        except Exception as e:
            terminal.status_panel(
                f"Recovery action failed for {error_type}: {str(e)}",
                "error",
            )
        else:
            terminal.status_panel(f"Recovery action completed for {error_type}", "success")
            self.error_count[error_type] = 0  # Reset error count after successful recovery

    def start_monitoring(self, interval: float = 60.0):
        """Start continuous system monitoring in a daemon thread.

        Does nothing if a monitoring thread is already running.

        Args:
            interval: Seconds to wait between health checks (default: one
                minute).

        Raises:
            ValueError: If *interval* is not positive.
        """
        if self.monitoring_thread and self.monitoring_thread.is_alive():
            return
        if interval <= 0:
            raise ValueError("interval must be positive")

        # A previous stop leaves the event set; clear it so a restart works.
        self._stop_event.clear()

        def monitor():
            while not self._stop_event.is_set():
                if self.check_system_health():
                    terminal.status_panel("System health check passed", "success")
                # Wait on the event instead of sleeping so that a stop request
                # wakes the thread immediately.
                self._stop_event.wait(interval)

        self.monitoring_thread = threading.Thread(target=monitor, daemon=True)
        self.monitoring_thread.start()
        terminal.status_panel("System monitoring started", "info")

    def stop_monitoring(self):
        """Stop system monitoring and wait for the monitoring thread to exit.

        Does nothing if monitoring is not currently running.
        """
        thread = self.monitoring_thread
        if thread and thread.is_alive():
            self._stop_event.set()
            thread.join()
            terminal.status_panel("System monitoring stopped", "info")

    def get_health_report(self) -> dict:
        """Generate a health report from the recorded samples.

        The report is built from ``metrics_history`` only: no new sample is
        taken, so the call does not block and ``status`` describes the same
        sample that is returned as ``current_metrics``.

        Returns:
            ``{"status": "No metrics collected yet"}`` when the history is
            empty; otherwise a dict with ``current_metrics`` (latest sample),
            ``average_metrics`` (mean over the history), ``error_counts`` and
            ``status`` (``"Healthy"`` or ``"Issues Detected"``).
        """
        # Work on a snapshot so the monitoring thread appending a sample
        # cannot change the data halfway through the computation.
        history = list(self.metrics_history)
        if not history:
            return {"status": "No metrics collected yet"}

        latest = history[-1]
        sample_count = len(history)
        avg_metrics = {
            name: sum(getattr(m, name) for m in history) / sample_count
            for name in ("cpu_percent", "memory_percent", "disk_usage_percent")
        }

        return {
            "current_metrics": {
                "cpu_percent": latest.cpu_percent,
                "memory_percent": latest.memory_percent,
                "disk_usage_percent": latest.disk_usage_percent,
                "timestamp": latest.timestamp.isoformat(),
            },
            "average_metrics": avg_metrics,
            "error_counts": dict(self.error_count),
            "status": "Issues Detected" if self._find_issues(latest) else "Healthy",
        }
|
||||
|
||||
# Module-level singleton: import ``health_monitor`` to share a single
# HealthCheck instance instead of constructing a new one.
health_monitor = HealthCheck()
|
Loading…
Reference in new issue