Self-healing structured output

1 year ago · 599776736b
parent 77cb687263
commit 599776736b
2 changed files with 436 additions and 93 deletions
--- a/swarms/agents/self_healing_agent.py
+++ b/swarms/agents/self_healing_agent.py
@ -1,15 +1,19 @@
 import os
 import sys
 import ast
 import json
 import traceback
 from typing import Optional, Dict, List, Any
 from dataclasses import dataclass
 from pathlib import Path
 from datetime import datetime
 from swarms.utils.terminal_output import terminal
 from swarms.structs.agent import Agent
@dataclass
 class ErrorContext:
    """Context about an error that occurred"""
    error_type: str
    error_message: str
    traceback: str
@ -21,43 +25,106 @@ class CodeFixerAgent(Agent):
    """An agent specialized in analyzing and fixing code errors"""
    def __init__(self, *args, **kwargs):
-        system_prompt = """You are an expert code debugging and fixing agent. Your role is to:
+        system_prompt = """You are an expert code debugging and fixing agent. Your role is to analyze errors and propose fixes.
-        1. Analyze error messages and stack traces to understand the root cause
+
-        2. Examine the code context where the error occurred
+        When analyzing errors, follow these steps:
-        3. Propose specific fixes with clear explanations
+        1. Examine the error message and stack trace carefully
-        4. Consider multiple potential solutions and their trade-offs
+        2. Look at the code context where the error occurred
-        5. Ensure fixes maintain code quality and follow best practices
+        3. Consider multiple potential causes and solutions
-        
+        4. Choose the most appropriate fix
-        When proposing fixes:
+        5. Explain your reasoning clearly
-        - Explain why the error occurred
+
-        - Detail what changes need to be made and in which files
+        Your output must follow this exact format:
-        - Consider potential side effects of the changes
+
-        - Suggest any additional improvements or preventive measures
+        {
-        
+            "error_analysis": {
-        Format your response as:
+                "root_cause": "Brief explanation of what caused the error",
-        ERROR ANALYSIS:
+                "impact": "What effects this error has on the system",
-        <explanation of what caused the error>
+                "severity": "high|medium|low"
-        
+            },
-        PROPOSED FIX:
+            "proposed_fix": {
-        File: <file_path>
+                "file": "Path to the file that needs changes",
-        Lines: <line_numbers>
+                "line_range": "start-end or single line number",
-        ```<language>
+                "code_changes": "The actual code changes to make",
-        <code changes>
+                "type": "syntax|import|permission|memory|other"
-        ```
+            },
-        
+            "explanation": {
-        EXPLANATION:
+                "why_it_works": "Why this fix will solve the problem",
-        <why this fix works and any considerations>
+                "side_effects": "Any potential side effects to consider",
                "alternatives": "Other possible solutions that were considered"
            },
            "prevention": {
                "recommendations": ["List of recommendations to prevent similar errors"],
                "best_practices": ["Relevant best practices to follow"]
            }
        }
        Always ensure your response is valid JSON and includes all the required fields.
        Be specific about file paths and line numbers.
        Include complete code snippets that can be directly applied.
        """
        kwargs["system_prompt"] = system_prompt
        kwargs["agent_name"] = kwargs.get("agent_name", "Code-Fixer-Agent")
        kwargs["output_type"] = "json"  # Ensure JSON output
        super().__init__(*args, **kwargs)
        self.error_history: List[ErrorContext] = []
 class SelfHealingAgent(Agent):
-    """An agent that can diagnose and fix runtime errors using LLM-based analysis"""
+    """An agent that can diagnose and fix runtime errors using LLM-based analysis
    This agent uses a specialized CodeFixerAgent to analyze errors and propose fixes.
    It can handle various types of errors including:
    - Syntax errors
    - Import errors
    - Permission errors
    - Memory errors
    - General runtime errors
    The agent maintains a history of errors and fixes, and can provide detailed reports
    of its healing activities.
    Attributes:
        error_history (List[ErrorContext]): History of errors encountered
        fixer_agent (CodeFixerAgent): Specialized agent for analyzing and fixing errors
        max_fix_attempts (int): Maximum number of fix attempts per error
    """
    def __init__(self, *args, **kwargs):
        system_prompt = """You are a self-healing agent capable of detecting, analyzing, and fixing runtime errors.
        Your responses should follow this format:
        {
            "status": {
                "state": "running|error|fixed|failed",
                "message": "Current status message",
                "timestamp": "ISO timestamp"
            },
            "error_details": {
                "type": "Error type if applicable",
                "message": "Error message if applicable",
                "location": "File and line number where error occurred"
            },
            "healing_actions": {
                "attempted_fixes": ["List of fixes attempted"],
                "successful_fixes": ["List of successful fixes"],
                "failed_fixes": ["List of failed fixes"]
            },
            "system_health": {
                "memory_usage": "Current memory usage",
                "cpu_usage": "Current CPU usage",
                "disk_usage": "Current disk usage"
            },
            "recommendations": {
                "immediate": ["Immediate actions needed"],
                "long_term": ["Long-term improvements suggested"]
            }
        }
        """
        kwargs["system_prompt"] = system_prompt
        kwargs["agent_name"] = kwargs.get("agent_name", "Self-Healing-Agent")
        kwargs["output_type"] = "json"  # Ensure JSON output
        super().__init__(*args, **kwargs)
        # Initialize the code fixer agent
@ -67,24 +134,32 @@ class SelfHealingAgent(Agent):
            verbose=True
        )
        self.error_history = []
        self.max_fix_attempts = 3
    def diagnose_error(self, error: Exception) -> ErrorContext:
-        """Gather context about an error"""
+        """Gather detailed context about an error
        Args:
            error (Exception): The error that occurred
        Returns:
            ErrorContext: Detailed context about the error
        """
        tb = traceback.extract_tb(sys.exc_info()[2])
        file_path = None
        line_number = None
        code_snippet = ""
        # Get the last frame from traceback which is usually where the error occurred
        if tb:
            last_frame = tb[-1]
            file_path = last_frame.filename
            line_number = last_frame.lineno
            # Try to get code context
            if file_path and os.path.exists(file_path):
                with open(file_path, 'r') as f:
                    lines = f.readlines()
-                    start = max(0, line_number - 5)  # Get more context
+                    start = max(0, line_number - 5)
                    end = min(len(lines), line_number + 5)
                    code_snippet = ''.join(lines[start:end])
@ -98,9 +173,16 @@ class SelfHealingAgent(Agent):
        )
    def get_fix_prompt(self, error_context: ErrorContext) -> str:
-        """Create a detailed prompt for the fixer agent"""
+        """Create a detailed prompt for the fixer agent
        Args:
            error_context (ErrorContext): Context about the error
        Returns:
            str: Prompt for the fixer agent
        """
        return f"""
-        An error occurred in the code. Please analyze it and propose a fix.
+        Analyze this error and propose a fix following the required JSON format.
        ERROR TYPE: {error_context.error_type}
        ERROR MESSAGE: {error_context.error_message}
@ -115,80 +197,79 @@ class SelfHealingAgent(Agent):
        FULL TRACEBACK:
        {error_context.traceback}
        Please analyze this error and propose a specific fix. Include:
        1. What caused the error
        2. Exact changes needed (file paths and line numbers)
        3. The code that needs to be changed
        4. Why the fix will work
        5. Any potential side effects to consider
        """
-    def apply_fix(self, fix_proposal: str) -> bool:
+    def apply_fix(self, fix_proposal: Dict[str, Any]) -> bool:
-        """Apply the fix proposed by the fixer agent"""
+        """Apply a fix proposed by the fixer agent
-        try:
+        
-            # Parse the fix proposal to extract file and changes
+        Args:
-            import re
+            fix_proposal (Dict[str, Any]): The fix proposal in JSON format
            # Extract file path
            file_match = re.search(r"File: (.+)", fix_proposal)
            if not file_match:
                terminal.status_panel("Could not find file path in fix proposal", "error")
                return False
            file_path = file_match.group(1).strip()
-            # Extract code changes
+        Returns:
-            code_match = re.search(r"```(?:python)?\n(.*?)\n```", fix_proposal, re.DOTALL)
+            bool: Whether the fix was successfully applied
-            if not code_match:
+        """
-                terminal.status_panel("Could not find code changes in fix proposal", "error")
+        try:
-                return False
+            # Extract fix details
-                
+            file_path = fix_proposal["proposed_fix"]["file"]
-            new_code = code_match.group(1).strip()
+            line_range = fix_proposal["proposed_fix"]["line_range"]
            new_code = fix_proposal["proposed_fix"]["code_changes"]
-            # Extract line numbers if specified
+            # Parse line range
-            lines_match = re.search(r"Lines: (.+)", fix_proposal)
+            if '-' in line_range:
-            line_range = None
+                start, end = map(int, line_range.split('-'))
-            if lines_match:
+            else:
-                try:
+                start = end = int(line_range)
                    # Parse line range (e.g., "5-10" or "5")
                    line_spec = lines_match.group(1).strip()
                    if '-' in line_spec:
                        start, end = map(int, line_spec.split('-'))
                        line_range = (start, end)
                    else:
                        line_num = int(line_spec)
                        line_range = (line_num, line_num)
                except:
                    terminal.status_panel("Could not parse line numbers", "warning")
            # Apply the changes
            with open(file_path, 'r') as f:
                lines = f.readlines()
-            if line_range:
+            # Convert to 0-based indexing
-                # Replace specific lines
+            start -= 1
-                start, end = line_range
+            lines[start:end] = new_code.splitlines(True)
                start -= 1  # Convert to 0-based index
                lines[start:end] = new_code.splitlines(True)
            else:
                # Replace the entire file
                lines = new_code.splitlines(True)
            with open(file_path, 'w') as f:
                f.writelines(lines)
-            terminal.status_panel(f"Applied fix to {file_path}", "success")
+            terminal.status_panel(
                f"Applied fix to {file_path} lines {start+1}-{end}", 
                "success"
            )
            return True
        except Exception as e:
            terminal.status_panel(f"Failed to apply fix: {str(e)}", "error")
            return False
-    def run(self, *args, **kwargs) -> Any:
+    def run(self, *args, **kwargs) -> Dict[str, Any]:
-        """Run with self-healing capabilities using LLM-based analysis"""
+        """Run with self-healing capabilities using LLM-based analysis
        Returns:
            Dict[str, Any]: Structured output about the run and any healing actions
        """
        try:
-            return super().run(*args, **kwargs)
+            result = super().run(*args, **kwargs)
            # Return success status
            return {
                "status": {
                    "state": "running",
                    "message": "Operation completed successfully",
                    "timestamp": datetime.now().isoformat()
                },
                "error_details": None,
                "healing_actions": {
                    "attempted_fixes": [],
                    "successful_fixes": [],
                    "failed_fixes": []
                },
                "system_health": self.get_system_health(),
                "recommendations": {
                    "immediate": [],
                    "long_term": ["Monitor system health regularly"]
                }
            }
        except Exception as error:
            terminal.status_panel("Error detected, analyzing with LLM...", "warning")
@ -201,13 +282,102 @@ class SelfHealingAgent(Agent):
            fix_proposal = self.fixer_agent.run(fix_prompt)
            terminal.status_panel("Fix proposed by LLM:", "info")
-            terminal.status_panel(fix_proposal, "info")
+            terminal.status_panel(json.dumps(fix_proposal, indent=2), "info")
            # Track healing actions
            attempted_fixes = [fix_proposal["proposed_fix"]["type"]]
            successful_fixes = []
            failed_fixes = []
            # Apply the fix
            if self.apply_fix(fix_proposal):
                terminal.status_panel("Fix applied, retrying operation...", "info")
-                # Retry the operation
+                successful_fixes.append(fix_proposal["proposed_fix"]["type"])
-                return super().run(*args, **kwargs)
+                
                try:
                    # Retry the operation
                    result = super().run(*args, **kwargs)
                    return {
                        "status": {
                            "state": "fixed",
                            "message": "Error fixed and operation completed",
                            "timestamp": datetime.now().isoformat()
                        },
                        "error_details": {
                            "type": error_context.error_type,
                            "message": error_context.error_message,
                            "location": f"{error_context.file_path}:{error_context.line_number}"
                        },
                        "healing_actions": {
                            "attempted_fixes": attempted_fixes,
                            "successful_fixes": successful_fixes,
                            "failed_fixes": failed_fixes
                        },
                        "system_health": self.get_system_health(),
                        "recommendations": fix_proposal["prevention"]
                    }
                except Exception as e:
                    failed_fixes.append(fix_proposal["proposed_fix"]["type"])
                    return {
                        "status": {
                            "state": "failed",
                            "message": "Fix applied but error persists",
                            "timestamp": datetime.now().isoformat()
                        },
                        "error_details": {
                            "type": type(e).__name__,
                            "message": str(e),
                            "location": f"{error_context.file_path}:{error_context.line_number}"
                        },
                        "healing_actions": {
                            "attempted_fixes": attempted_fixes,
                            "successful_fixes": successful_fixes,
                            "failed_fixes": failed_fixes
                        },
                        "system_health": self.get_system_health(),
                        "recommendations": {
                            "immediate": ["Manual intervention required"],
                            "long_term": fix_proposal["prevention"]["recommendations"]
                        }
                    }
            else:
-                terminal.status_panel("Unable to apply fix automatically", "error")
+                failed_fixes.append(fix_proposal["proposed_fix"]["type"])
-                raise 
+                return {
                    "status": {
                        "state": "error",
                        "message": "Unable to apply fix",
                        "timestamp": datetime.now().isoformat()
                    },
                    "error_details": {
                        "type": error_context.error_type,
                        "message": error_context.error_message,
                        "location": f"{error_context.file_path}:{error_context.line_number}"
                    },
                    "healing_actions": {
                        "attempted_fixes": attempted_fixes,
                        "successful_fixes": successful_fixes,
                        "failed_fixes": failed_fixes
                    },
                    "system_health": self.get_system_health(),
                    "recommendations": {
                        "immediate": ["Manual intervention required"],
                        "long_term": fix_proposal["prevention"]["recommendations"]
                    }
                }
    def get_system_health(self) -> Dict[str, str]:
        """Report memory, CPU and disk utilisation as percentage strings.

        Each value is read from ``psutil`` at call time and formatted as
        ``"<number>%"``. Disk usage is measured for the ``/`` path.

        Returns:
            Dict[str, str]: Mapping with the keys ``memory_usage``,
            ``cpu_usage`` and ``disk_usage``.
        """
        # Imported lazily, so psutil is only needed when this method is called.
        import psutil

        # Sample in the same order the keys appear in the returned mapping.
        memory_pct = psutil.virtual_memory().percent
        cpu_pct = psutil.cpu_percent()
        disk_pct = psutil.disk_usage('/').percent

        health: Dict[str, str] = {}
        health["memory_usage"] = f"{memory_pct}%"
        health["cpu_usage"] = f"{cpu_pct}%"
        health["disk_usage"] = f"{disk_pct}%"
        return health
--- a/swarms/utils/self_healing.py
+++ b/swarms/utils/self_healing.py
@ -1 +1,174 @@
- 
+import os
 import sys
 import time
 import psutil
 import threading
 import traceback
 from typing import Callable, Dict, List, Optional
 from dataclasses import dataclass
 from datetime import datetime
 from swarms.utils.terminal_output import terminal
@dataclass
 class SystemMetrics:
    """One point-in-time sample of system resource usage."""
    # CPU utilisation, in percent.
    cpu_percent: float
    # Memory utilisation, in percent.
    memory_percent: float
    # Disk utilisation, in percent.
    disk_usage_percent: float
    # When the sample was taken.
    timestamp: datetime
 class HealthCheck:
     """Collect system metrics, count errors and run registered recovery actions.

     Metrics are sampled with ``psutil`` and compared against ``thresholds``;
     status messages are emitted through ``terminal.status_panel``.

     Attributes:
         metrics_history: Most recent samples, oldest first (at most
             ``MAX_HISTORY`` entries).
         error_count: Number of handled errors per exception class name.
         recovery_actions: Callables keyed by exception class name; each one
             is invoked with the triggering exception.
         monitoring_thread: Background monitor thread, or ``None`` if
             monitoring was never started.
         thresholds: Percentage limits for CPU, memory and disk usage, plus
             ``error_threshold`` (errors of one type before recovery is tried).
     """

     # Maximum number of samples kept in ``metrics_history``.
     MAX_HISTORY = 100
     # Seconds the background monitor waits between two health checks.
     MONITOR_INTERVAL_SECONDS = 60

     def __init__(self):
         """Create a monitor with empty history and the default thresholds."""
         self.metrics_history: List[SystemMetrics] = []
         self.error_count: Dict[str, int] = {}
         self.recovery_actions: Dict[str, Callable] = {}
         self.monitoring_thread: Optional[threading.Thread] = None
         # Stored under a private name on purpose: an instance attribute named
         # ``stop_monitoring`` would shadow the stop_monitoring() method and
         # make it uncallable ("'Event' object is not callable").
         self._stop_event = threading.Event()
         # Default thresholds
         self.thresholds = {
             "cpu_percent": 90.0,
             "memory_percent": 85.0,
             "disk_usage_percent": 90.0,
             "error_threshold": 3
         }

     def register_recovery_action(self, error_type: str, action: Callable):
         """Register a recovery action for a specific error type.

         Args:
             error_type: Exception class name (``type(error).__name__``).
             action: Callable invoked with the exception instance when
                 recovery is attempted. Replaces any earlier registration.
         """
         self.recovery_actions[error_type] = action
         terminal.status_panel(f"Registered recovery action for {error_type}", "info")

     def collect_metrics(self) -> Optional["SystemMetrics"]:
         """Sample CPU, memory and disk usage and append the sample to the history.

         The CPU reading is taken with ``interval=1``.

         Returns:
             The new sample, or ``None`` if sampling raised (the failure is
             reported through ``terminal.status_panel``).
         """
         try:
             metrics = SystemMetrics(
                 cpu_percent=psutil.cpu_percent(interval=1),
                 memory_percent=psutil.virtual_memory().percent,
                 disk_usage_percent=psutil.disk_usage('/').percent,
                 timestamp=datetime.now()
             )
         except Exception as e:
             terminal.status_panel(f"Error collecting metrics: {str(e)}", "error")
             return None

         self.metrics_history.append(metrics)
         # Trim in place so external references to the list stay valid; the
         # slice is empty while the history is within the limit.
         del self.metrics_history[:-self.MAX_HISTORY]
         return metrics

     def check_system_health(self) -> bool:
         """Collect a fresh sample and compare it against the thresholds.

         Returns:
             ``True`` when every metric is within its threshold; ``False``
             when a threshold is exceeded or the sample could not be
             collected. Exceeded thresholds are reported as a warning.
         """
         metrics = self.collect_metrics()
         if metrics is None:
             return False

         checks = (
             ("High CPU usage", metrics.cpu_percent, "cpu_percent"),
             ("High memory usage", metrics.memory_percent, "memory_percent"),
             ("High disk usage", metrics.disk_usage_percent, "disk_usage_percent"),
         )
         issues = [
             f"{label}: {value}%"
             for label, value, key in checks
             if value > self.thresholds[key]
         ]
         if issues:
             terminal.status_panel("\n".join(issues), "warning")
             return False
         return True

     def handle_error(self, error: Exception, context: str = ""):
         """Record an error and attempt recovery once its threshold is reached.

         Args:
             error: The exception to record.
             context: Free-form description of where the error occurred;
                 only used in the status message.
         """
         error_type = type(error).__name__
         # Increment error count
         self.error_count[error_type] = self.error_count.get(error_type, 0) + 1

         # Format the traceback of *this* exception. traceback.format_exc()
         # only knows the exception currently being handled and yields
         # "NoneType: None" when called outside an ``except`` block.
         formatted_tb = "".join(
             traceback.format_exception(type(error), error, error.__traceback__)
         )
         terminal.status_panel(
             f"Error occurred in {context}: {str(error)}\n{formatted_tb}",
             "error"
         )

         # Check if we need to take recovery action
         if self.error_count[error_type] >= self.thresholds["error_threshold"]:
             self.attempt_recovery(error_type, error)

     def attempt_recovery(self, error_type: str, error: Exception):
         """Run the recovery action registered for ``error_type``, if any.

         A successful action resets the error counter for that type. A
         failing action is reported and the counter is left unchanged.

         Args:
             error_type: Exception class name used as the registry key.
             error: The exception passed to the recovery action.
         """
         terminal.status_panel(f"Attempting recovery for {error_type}", "info")

         action = self.recovery_actions.get(error_type)
         if action is None:
             terminal.status_panel(
                 f"No recovery action registered for {error_type}",
                 "warning"
             )
             return

         try:
             action(error)
         except Exception as e:
             terminal.status_panel(
                 f"Recovery action failed for {error_type}: {str(e)}",
                 "error"
             )
         else:
             terminal.status_panel(f"Recovery action completed for {error_type}", "success")
             self.error_count[error_type] = 0  # Reset error count after successful recovery

     def start_monitoring(self):
         """Start the background monitoring thread (no-op if already running)."""
         if self.monitoring_thread and self.monitoring_thread.is_alive():
             return

         # Drop a stop request left over from an earlier stop_monitoring()
         # call; otherwise a restarted monitor would exit immediately.
         self._stop_event.clear()

         def monitor():
             while not self._stop_event.is_set():
                 if self.check_system_health():
                     terminal.status_panel("System health check passed", "success")
                 # Event.wait() returns as soon as a stop is requested, so
                 # stop_monitoring() does not have to sit out a full interval.
                 self._stop_event.wait(self.MONITOR_INTERVAL_SECONDS)

         self.monitoring_thread = threading.Thread(target=monitor, daemon=True)
         self.monitoring_thread.start()
         terminal.status_panel("System monitoring started", "info")

     def stop_monitoring(self):
         """Stop the background monitoring thread and wait for it to finish.

         Does nothing when no monitoring thread is running.
         """
         if self.monitoring_thread and self.monitoring_thread.is_alive():
             self._stop_event.set()
             self.monitoring_thread.join()
             terminal.status_panel("System monitoring stopped", "info")

     def get_health_report(self) -> dict:
         """Generate a health report from the collected metrics.

         Calling this on a non-empty history triggers one fresh health check
         (and therefore one new sample).

         Returns:
             ``{"status": "No metrics collected yet"}`` when the history is
             empty; otherwise a dict with ``current_metrics``,
             ``average_metrics``, ``error_counts`` and ``status``.
         """
         if not self.metrics_history:
             return {"status": "No metrics collected yet"}

         # Run the health check first: it appends a fresh sample, so the
         # "current" metrics below are the ones the status was derived from.
         healthy = self.check_system_health()

         history = self.metrics_history
         latest = history[-1]
         count = len(history)
         avg_metrics = {
             "cpu_percent": sum(m.cpu_percent for m in history) / count,
             "memory_percent": sum(m.memory_percent for m in history) / count,
             "disk_usage_percent": sum(m.disk_usage_percent for m in history) / count
         }

         return {
             "current_metrics": {
                 "cpu_percent": latest.cpu_percent,
                 "memory_percent": latest.memory_percent,
                 "disk_usage_percent": latest.disk_usage_percent,
                 "timestamp": latest.timestamp.isoformat()
             },
             "average_metrics": avg_metrics,
             "error_counts": self.error_count,
             "status": "Healthy" if healthy else "Issues Detected"
         }
 # Module-level shared HealthCheck instance, created when this module is imported.
 health_monitor = HealthCheck()