import traceback
import uuid
from typing import Dict, List, Optional, Union

from loguru import logger

from swarms.prompts.agent_judge_prompt import AGENT_JUDGE_PROMPT
from swarms.structs.agent import Agent
from swarms.structs.conversation import Conversation
from swarms.utils.any_to_str import any_to_str


class AgentJudgeInitializationError(Exception):
    """
    Exception raised when there is an error initializing the AgentJudge.
    """

    pass


class AgentJudgeExecutionError(Exception):
    """
    Exception raised when there is an error executing the AgentJudge.
    """

    pass


class AgentJudgeFeedbackCycleError(Exception):
    """
    Exception raised when there is an error in the feedback cycle.
    """

    pass


class AgentJudge:
    """
    A specialized agent designed to evaluate and judge outputs from other agents or systems.

    The AgentJudge acts as a quality control mechanism, providing objective assessments
    and feedback on various types of content, decisions, or outputs. It is based on research
    in LLM-based evaluation systems and can maintain context across multiple evaluations.
    This implementation supports both single-task evaluation and batch processing, with
    iterative refinement capabilities.

    Attributes:
        id (str): Unique identifier for the judge agent instance.
        agent_name (str): The name of the agent judge.
        system_prompt (str): The system prompt for the agent, containing the evaluation instructions.
        model_name (str): The model name used for generating evaluations (e.g., "openai/o1", "gpt-4").
        conversation (Conversation): An instance of the Conversation class that manages conversation history.
        max_loops (int): The maximum number of evaluation iterations to run.
        verbose (bool): Whether to enable verbose logging.
        agent (Agent): An instance of the Agent class that performs the evaluation execution.
        evaluation_criteria (Dict[str, float]): Dictionary of evaluation criteria and their weights.

    Example:
        Basic usage for evaluating agent outputs:

        ```python
        from swarms import AgentJudge

        # Initialize the judge
        judge = AgentJudge(
            agent_name="quality-judge",
            model_name="gpt-4",
            max_loops=1
        )

        # Evaluate a single output
        output = "The capital of France is Paris."
        evaluation = judge.step(task=output)
        print(evaluation)

        # Evaluate multiple outputs with context building
        outputs = [
            "Agent response 1: The calculation is 2+2=4",
            "Agent response 2: The weather is sunny today"
        ]
        evaluations = judge.run(tasks=outputs)
        ```

    Methods:
        step(task: str = None, tasks: List[str] = None, img: str = None) -> str:
            Processes a single task or list of tasks and returns the agent's evaluation.
        run(task: str = None, tasks: List[str] = None, img: str = None) -> List[str]:
            Executes evaluation in a loop with context building, collecting responses.
        run_batched(tasks: List[str] = None, imgs: List[str] = None) -> List[List[str]]:
            Executes batch evaluation of tasks with corresponding images.
    """

    def __init__(
        self,
        id: Optional[str] = None,
        agent_name: str = "Agent Judge",
        description: str = "You're an expert AI agent judge. Carefully review the following output(s) generated by another agent. Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance.",
        system_prompt: str = AGENT_JUDGE_PROMPT,
        model_name: str = "openai/o1",
        max_loops: int = 1,
        verbose: bool = False,
        evaluation_criteria: Optional[Dict[str, float]] = None,
        *args,
        **kwargs,
    ):
        """
        Initializes the AgentJudge with the specified parameters.

        Args:
            id (str, optional): Unique identifier for the judge agent instance. Generated if not provided.
            agent_name (str): The name of the agent judge.
            description (str): A short description of the judge's role, passed to the underlying agent.
            system_prompt (str): The system prompt for the agent.
            model_name (str): The model name used for generating evaluations.
            max_loops (int): The maximum number of evaluation iterations to run.
            verbose (bool): Whether to enable verbose logging.
            evaluation_criteria (Optional[Dict[str, float]]): Dictionary of evaluation criteria
                and their weights. Keys are criteria names, values are weights.
                Example: {"correctness": 0.4, "efficiency": 0.3, "clarity": 0.3}
        """
        # Generate a fresh id per instance rather than evaluating a default once at definition time
        self.id = id or str(uuid.uuid4())
        self.agent_name = agent_name
        self.system_prompt = system_prompt
        self.model_name = model_name
        self.conversation = Conversation(time_enabled=False)
        self.max_loops = max_loops
        self.verbose = verbose
        self.evaluation_criteria = evaluation_criteria or {}

        # Enhance system prompt with evaluation criteria if provided
        enhanced_prompt = system_prompt
        if self.evaluation_criteria:
            criteria_str = "\n\nEvaluation Criteria:\n"
            for criterion, weight in self.evaluation_criteria.items():
                criteria_str += f"- {criterion}: weight = {weight}\n"
            enhanced_prompt += criteria_str

        self.agent = Agent(
            agent_name=agent_name,
            agent_description=description,
            system_prompt=enhanced_prompt,
            model_name=model_name,
            max_loops=1,
            *args,
            **kwargs,
        )
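
    # Illustrative sketch (comments only, not executed): constructing a judge with
    # weighted evaluation criteria. The criteria names and weights below are the
    # example values from the docstring above; each pair is appended to the system
    # prompt as a "- <criterion>: weight = <weight>" line.
    #
    #     judge = AgentJudge(
    #         model_name="gpt-4",
    #         evaluation_criteria={
    #             "correctness": 0.4,
    #             "efficiency": 0.3,
    #             "clarity": 0.3,
    #         },
    #     )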

    def feedback_cycle_step(
        self,
        agent: Union[Agent, callable],
        task: str,
        img: Optional[str] = None,
    ):
        """
        Runs a single feedback cycle step: execute the agent, judge its output, then
        re-run the agent with the judge's critique so it can revise its answer.

        Args:
            agent (Union[Agent, callable]): The agent whose output is being evaluated and refined.
            task (str): The original task given to the agent.
            img (str, optional): Path to an image file for multimodal tasks.

        Returns:
            str: The agent's revised output after incorporating the judge's feedback.

        Raises:
            AgentJudgeFeedbackCycleError: If any step of the feedback cycle fails.
        """
        try:
            # First, run the main agent on the original task
            agent_output = agent.run(task=task, img=img)

            # Then run the judge agent on that output
            judge_output = self.run(task=agent_output, img=img)

            # Re-run the main agent with the judge's feedback, using an improved prompt
            improved_prompt = (
                f"You have received the following detailed feedback from the expert agent judge ({self.agent_name}):\n\n"
                f"--- FEEDBACK START ---\n{judge_output}\n--- FEEDBACK END ---\n\n"
                f"Your task is to thoughtfully revise and enhance your previous output based on this critique. "
                f"Carefully address all identified weaknesses, incorporate the suggestions, and strive to maximize the strengths noted. "
                f"Be specific, accurate, and actionable in your improvements. "
                f"Here is the original task for reference:\n\n"
                f"--- TASK ---\n{task}\n--- END TASK ---\n\n"
                f"Please provide your improved and fully revised output below."
            )

            return agent.run(task=improved_prompt, img=img)
        except Exception as e:
            raise AgentJudgeFeedbackCycleError(
                f"Error in agent judge feedback cycle: {e} Traceback: {traceback.format_exc()}"
            )
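
    # Conceptual flow of a single feedback cycle step (comments only; `worker` is an
    # assumed Agent instance used for illustration, not part of this module):
    #
    #     draft = worker.run(task=task)      # 1. the worker produces an answer
    #     critique = judge.run(task=draft)   # 2. the judge critiques that answer
    #     revised = worker.run(task=...)     # 3. the worker revises using the critique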

    def feedback_cycle(
        self,
        agent: Union[Agent, callable],
        task: str,
        img: Optional[str] = None,
        loops: int = 1,
    ):
        """
        Runs the feedback cycle `loops` times against the original task and collects
        the revised output from each iteration.

        Args:
            agent (Union[Agent, callable]): The agent whose output is being refined.
            task (str): The original task; it is preserved across iterations.
            img (str, optional): Path to an image file for multimodal tasks.
            loops (int): The number of feedback iterations to run.

        Returns:
            List[str]: The revised output produced by each iteration.
        """
        loop = 0
        original_task = task  # Preserve the original task
        current_output = None  # Track the current output
        all_outputs = []  # Collect all outputs from each iteration

        while loop < loops:
            # Run the standard feedback cycle step against the original task
            current_output = self.feedback_cycle_step(
                agent, original_task, img
            )

            # Add the current output to our collection
            all_outputs.append(current_output)
            loop += 1

        return all_outputs
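
    # Usage sketch (comments only; the `worker` agent below is an assumption for
    # illustration, not part of this module): run two refinement passes and keep
    # every revision.
    #
    #     worker = Agent(agent_name="worker", model_name="gpt-4", max_loops=1)
    #     judge = AgentJudge(model_name="gpt-4")
    #     revisions = judge.feedback_cycle(
    #         agent=worker,
    #         task="Write a one-paragraph abstract for the attached report.",
    #         loops=2,
    #     )
    #     # len(revisions) == 2; each entry is a fully revised output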

    def step(
        self,
        task: Optional[str] = None,
        tasks: Optional[List[str]] = None,
        img: Optional[str] = None,
    ) -> str:
        """
        Processes a single task or list of tasks and returns the agent's evaluation.

        This method performs a one-shot evaluation of the provided content. It takes
        either a single task string or a list of tasks and generates a comprehensive
        evaluation with strengths, weaknesses, and improvement suggestions.

        Args:
            task (str, optional): A single task/output to be evaluated.
            tasks (List[str], optional): A list of tasks/outputs to be evaluated.
            img (str, optional): Path to an image file for multimodal evaluation.

        Returns:
            str: A detailed evaluation response from the agent including:
                - Strengths: What the agent/output did well
                - Weaknesses: Areas that need improvement
                - Suggestions: Specific recommendations for improvement
                - Factual accuracy assessment

        Raises:
            ValueError: If neither task nor tasks are provided.

        Example:
            ```python
            # Single task evaluation
            evaluation = judge.step(task="The answer is 42.")

            # Multiple tasks evaluation
            evaluation = judge.step(tasks=[
                "Response 1: Paris is the capital of France",
                "Response 2: 2 + 2 = 5"  # Incorrect
            ])

            # Multimodal evaluation
            evaluation = judge.step(
                task="Describe this image",
                img="path/to/image.jpg"
            )
            ```
        """
        try:
            prompt = ""
            if tasks:
                prompt = any_to_str(tasks)
            elif task:
                prompt = task
            else:
                raise ValueError("No tasks or task provided")

            # Build the evaluation instruction for the judge agent
            task_instruction = "You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. "
            task_instruction += "Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. "
            task_instruction += "Your feedback should address the following points:\n"
            task_instruction += "1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.\n"
            task_instruction += "2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.\n"
            task_instruction += "3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. "
            task_instruction += "This may include advice on reasoning, structure, completeness, or style.\n"
            task_instruction += "4. If relevant, point out any factual inaccuracies or logical inconsistencies.\n"

            # Add the evaluation criteria and their weights, if provided
            if self.evaluation_criteria:
                task_instruction += "\nPlease use these specific evaluation criteria with their respective weights:\n"
                for criterion, weight in self.evaluation_criteria.items():
                    task_instruction += f"- {criterion}: weight = {weight}\n"

            task_instruction += "Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.\n\n"
            task_instruction += f"Output(s) to evaluate:\n{prompt}\n"

            response = self.agent.run(
                task=task_instruction,
                img=img,
            )

            return response
        except Exception as e:
            error_message = (
                f"AgentJudge encountered an error: {e}\n"
                f"Traceback:\n{traceback.format_exc()}\n\n"
                "If this issue persists, please:\n"
                "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n"
                "- Join our Discord for real-time support: swarms.ai\n"
                "- Or book a call: https://cal.com/swarms\n"
            )
            raise AgentJudgeExecutionError(error_message)

    def run(
        self,
        task: Optional[str] = None,
        tasks: Optional[List[str]] = None,
        img: Optional[str] = None,
    ):
        """
        Executes evaluation in multiple iterations with context building and refinement.

        This method runs the evaluation process for the specified number of max_loops,
        where each iteration builds upon the previous context. This allows for iterative
        refinement of evaluations and deeper analysis over multiple passes.

        Args:
            task (str, optional): A single task/output to be evaluated.
            tasks (List[str], optional): A list of tasks/outputs to be evaluated.
            img (str, optional): Path to an image file for multimodal evaluation.

        Returns:
            List[str]: A list of evaluation responses, one for each iteration.
                Each subsequent evaluation includes context from previous iterations.

        Example:
            ```python
            # Single task with iterative refinement
            judge = AgentJudge(max_loops=3)
            evaluations = judge.run(task="Agent output to evaluate")
            # Returns 3 evaluations, each building on the previous

            # Multiple tasks with context building
            evaluations = judge.run(tasks=[
                "First agent response",
                "Second agent response"
            ])

            # With image analysis
            evaluations = judge.run(
                task="Analyze this chart",
                img="chart.png"
            )
            ```

        Note:
            - The first iteration evaluates the original task(s)
            - Subsequent iterations include context from previous evaluations
            - This enables deeper analysis and refinement of judgments
            - Useful for complex evaluations requiring multiple perspectives
        """
        try:
            responses = []
            context = ""

            # Convert a single task to a list for consistent processing
            if task and not tasks:
                tasks = [task]
                task = None  # Clear to avoid confusion in the step method

            for _ in range(self.max_loops):
                # Add context to the tasks if available
                if context and tasks:
                    contextualized_tasks = [
                        f"Previous context: {context}\nTask: {t}"
                        for t in tasks
                    ]
                else:
                    contextualized_tasks = tasks

                # Get the response for the current iteration
                current_response = self.step(
                    task=task,
                    tasks=contextualized_tasks,
                    img=img,
                )
                responses.append(current_response)

                # Update the context for the next iteration
                context = current_response

            return responses
        except Exception as e:
            error_message = (
                f"AgentJudge encountered an error: {e}\n"
                f"Traceback:\n{traceback.format_exc()}\n\n"
                "If this issue persists, please:\n"
                "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n"
                "- Join our Discord for real-time support: swarms.ai\n"
                "- Or book a call: https://cal.com/swarms\n"
            )
            raise AgentJudgeExecutionError(error_message)
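
    # For reference, from the second loop onward each task is wrapped using the
    # format built above (placeholder text shown between angle brackets):
    #
    #     "Previous context: <previous evaluation>\nTask: <original task>"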

    def run_batched(
        self,
        tasks: Optional[List[str]] = None,
        imgs: Optional[List[str]] = None,
    ):
        """
        Executes batch evaluation of multiple tasks with corresponding images.

        This method processes multiple task-image pairs independently, where each
        task can be evaluated with its corresponding image. Unlike the run() method,
        this doesn't build context between different tasks; each is evaluated
        independently.

        Args:
            tasks (List[str], optional): A list of tasks/outputs to be evaluated.
            imgs (List[str], optional): A list of image paths corresponding to each task.
                Must be the same length as tasks if provided.

        Returns:
            List[List[str]]: A list of evaluation responses for each task. Each inner
                list contains the responses from all iterations (max_loops)
                for that particular task.

        Example:
            ```python
            # Batch evaluation with images
            tasks = [
                "Describe what you see in this image",
                "What's wrong with this chart?",
                "Analyze the trends shown"
            ]
            images = [
                "photo1.jpg",
                "chart1.png",
                "graph1.png"
            ]
            evaluations = judge.run_batched(tasks=tasks, imgs=images)
            # Returns evaluations for each task-image pair

            # Batch evaluation without images
            evaluations = judge.run_batched(tasks=[
                "Agent response 1",
                "Agent response 2",
                "Agent response 3"
            ])
            ```

        Note:
            - Each task is processed independently
            - If imgs is provided, it must have the same length as tasks
            - Each task goes through max_loops iterations independently
            - No context is shared between different tasks in the batch
        """
        responses = []

        # Pair each task with its image; fall back to None when no images are given
        imgs = imgs if imgs is not None else [None] * len(tasks)

        for task, img in zip(tasks, imgs):
            response = self.run(task=task, img=img)
            responses.append(response)

        return responses
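

# Minimal end-to-end sketch (assumes `swarms` is installed and credentials for the
# chosen model_name are configured; the names and tasks below are illustrative only).
if __name__ == "__main__":
    judge = AgentJudge(
        agent_name="quality-judge",
        model_name="gpt-4",
        max_loops=1,
    )

    # One-shot evaluation of a single output
    print(judge.step(task="The capital of France is Paris."))

    # Independent batch evaluation of several outputs (no images)
    batch_results = judge.run_batched(
        tasks=[
            "Agent response 1: The calculation is 2+2=4",
            "Agent response 2: The weather is sunny today",
        ]
    )
    print(batch_results)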