diff --git a/swarms/agents/agent_judge.py b/swarms/agents/agent_judge.py
index 09dc9e34..9a5c1e3a 100644
--- a/swarms/agents/agent_judge.py
+++ b/swarms/agents/agent_judge.py
@@ -1,54 +1,105 @@
-from typing import List, Dict, Optional
+import traceback
+from typing import List, Optional, Union, Dict
+import uuid
 from swarms.prompts.agent_judge_prompt import AGENT_JUDGE_PROMPT
-
 from swarms.structs.agent import Agent
-
 from swarms.structs.conversation import Conversation
-
 from swarms.utils.any_to_str import any_to_str
-from loguru import logger
+
+
+class AgentJudgeInitializationError(Exception):
+    """
+    Exception raised when there is an error initializing the AgentJudge.
+    """
+
+    pass
+
+
+class AgentJudgeExecutionError(Exception):
+    """
+    Exception raised when there is an error executing the AgentJudge.
+    """
+
+    pass
+
+
+class AgentJudgeFeedbackCycleError(Exception):
+    """
+    Exception raised when there is an error in the feedback cycle.
+    """
+
+    pass
 
 
 class AgentJudge:
     """
-    A class to represent an agent judge that processes tasks and generates responses.
+    A specialized agent designed to evaluate and judge outputs from other agents or systems.
+
+    The AgentJudge acts as a quality control mechanism, providing objective assessments
+    and feedback on various types of content, decisions, or outputs. It's based on research
+    in LLM-based evaluation systems and can maintain context across multiple evaluations.
+
+    This implementation supports both single task evaluation and batch processing with
+    iterative refinement capabilities.
 
     Attributes:
+        id (str): Unique identifier for the judge agent instance.
         agent_name (str): The name of the agent judge.
-        system_prompt (str): The system prompt for the agent.
-        model_name (str): The model name used for generating responses.
+        system_prompt (str): The system prompt for the agent containing evaluation instructions.
+        model_name (str): The model name used for generating evaluations (e.g., "openai/o1", "gpt-4").
         conversation (Conversation): An instance of the Conversation class to manage conversation history.
-        max_loops (int): The maximum number of iterations to run the tasks.
-        agent (Agent): An instance of the Agent class that performs the task execution.
+        max_loops (int): The maximum number of evaluation iterations to run.
+        verbose (bool): Whether to enable verbose logging.
+        agent (Agent): An instance of the Agent class that performs the evaluation execution.
         evaluation_criteria (Dict[str, float]): Dictionary of evaluation criteria and their weights.
+
+    Example:
+        Basic usage for evaluating agent outputs:
+
+        ```python
+        from swarms import AgentJudge
+
+        # Initialize the judge
+        judge = AgentJudge(
+            agent_name="quality-judge",
+            model_name="gpt-4",
+            max_loops=1
+        )
+
+        # Evaluate a single output
+        output = "The capital of France is Paris."
+        evaluation = judge.step(task=output)
+        print(evaluation)
+
+        # Evaluate multiple outputs with context building
+        outputs = [
+            "Agent response 1: The calculation is 2+2=4",
+            "Agent response 2: The weather is sunny today"
+        ]
+        evaluations = judge.run(tasks=outputs)
+        ```
+
+    Methods:
+        step(task: str = None, tasks: List[str] = None, img: str = None) -> str:
+            Processes a single task or list of tasks and returns the agent's evaluation.
+
+        run(task: str = None, tasks: List[str] = None, img: str = None) -> List[str]:
+            Executes evaluation in a loop with context building, collecting responses.
+
+        run_batched(tasks: List[str] = None, imgs: List[str] = None) -> List[str]:
+            Executes batch evaluation of tasks with corresponding images.
+
+        feedback_cycle_step(agent, task, img=None):
+            Runs an agent on a task, judges the output, and returns the agent's revised output.
+
+        feedback_cycle(agent, task, img=None, loops=1):
+            Repeats the feedback cycle for the given number of loops and returns all revised outputs.
     """
 
     def __init__(
         self,
-        agent_name: str = "agent-judge-01",
+        id: Optional[str] = None,
+        agent_name: str = "Agent Judge",
+        description: str = "You're an expert AI agent judge. Carefully review the following output(s) generated by another agent. Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance.",
         system_prompt: str = AGENT_JUDGE_PROMPT,
         model_name: str = "openai/o1",
         max_loops: int = 1,
+        verbose: bool = False,
         evaluation_criteria: Optional[Dict[str, float]] = None,
-    ) -> None:
-        """
-        Initializes the AgentJudge with the specified parameters.
-
-        Args:
-            agent_name (str): The name of the agent judge.
-            system_prompt (str): The system prompt for the agent.
-            model_name (str): The model name used for generating responses.
-            max_loops (int): The maximum number of iterations to run the tasks.
-            evaluation_criteria (Optional[Dict[str, float]]): Dictionary of evaluation criteria
-                and their weights. Keys are criteria names, values are weights.
-                Example: {"correctness": 0.4, "efficiency": 0.3, "clarity": 0.3}
-        """
+        *args,
+        **kwargs,
+    ):
+        # Generate a fresh id per instance; a uuid4() call placed in the default
+        # argument would be evaluated only once, at class definition time.
+        self.id = id or str(uuid.uuid4())
         self.agent_name = agent_name
         self.system_prompt = system_prompt
         self.model_name = model_name
         self.conversation = Conversation(time_enabled=False)
         self.max_loops = max_loops
+        self.verbose = verbose
         self.evaluation_criteria = evaluation_criteria or {}
 
         # Enhance system prompt with evaluation criteria if provided
@@ -58,78 +109,307 @@ class AgentJudge:
             for criterion, weight in self.evaluation_criteria.items():
                 criteria_str += f"- {criterion}: weight = {weight}\n"
             enhanced_prompt += criteria_str
-
+
         self.agent = Agent(
             agent_name=agent_name,
-            agent_description="You're the agent judge",
+            agent_description=description,
             system_prompt=enhanced_prompt,
             model_name=model_name,
             max_loops=1,
+            *args,
+            **kwargs,
         )
 
-    def step(self, tasks: List[str]) -> str:
+    def feedback_cycle_step(
+        self,
+        agent: Union[Agent, callable],
+        task: str,
+        img: Optional[str] = None,
+    ):
+        """
+        Run `agent` on `task`, have this judge critique the result, then re-run the
+        agent with that critique and return its revised output.
+        """
+        try:
+            # First run the main agent
+            agent_output = agent.run(task=task, img=img)
+
+            # Then run the judge agent
+            judge_output = self.run(task=agent_output, img=img)
+
+            # Run the main agent again with the judge's feedback, using a much improved prompt
+            improved_prompt = (
+                f"You have received the following detailed feedback from the expert agent judge ({self.agent_name}):\n\n"
+                f"--- FEEDBACK START ---\n{judge_output}\n--- FEEDBACK END ---\n\n"
+                f"Your task is to thoughtfully revise and enhance your previous output based on this critique. "
+                f"Carefully address all identified weaknesses, incorporate the suggestions, and strive to maximize the strengths noted. "
+                f"Be specific, accurate, and actionable in your improvements. "
+                f"Here is the original task for reference:\n\n"
+                f"--- TASK ---\n{task}\n--- END TASK ---\n\n"
+                f"Please provide your improved and fully revised output below."
+            )
+
+            return agent.run(task=improved_prompt, img=img)
+        except Exception as e:
+            raise AgentJudgeFeedbackCycleError(
+                f"Error In Agent Judge Feedback Cycle: {e} Traceback: {traceback.format_exc()}"
+            )
+
+    def feedback_cycle(
+        self,
+        agent: Union[Agent, callable],
+        task: str,
+        img: Optional[str] = None,
+        loops: int = 1,
+    ):
+        """
+        Repeat feedback_cycle_step() on the original task for `loops` iterations and
+        return the list of revised outputs, one per iteration.
+        """
+        loop = 0
+        original_task = task  # Preserve the original task
+        current_output = None  # Track the current output
+        all_outputs = []  # Collect all outputs from each iteration
+
+        while loop < loops:
+            # Each iteration runs a full feedback cycle step on the original task
+            current_output = self.feedback_cycle_step(
+                agent, original_task, img
+            )
+
+            # Add the current output to our collection
+            all_outputs.append(current_output)
+            loop += 1
+
+        return all_outputs
+
+    def step(
+        self,
+        task: str = None,
+        tasks: Optional[List[str]] = None,
+        img: Optional[str] = None,
+    ) -> str:
         """
-        Processes a list of tasks and returns the agent's response.
+        Processes a single task or list of tasks and returns the agent's evaluation.
+
+        This method performs a one-shot evaluation of the provided content. It takes
+        either a single task string or a list of tasks and generates a comprehensive
+        evaluation with strengths, weaknesses, and improvement suggestions.
 
         Args:
-            tasks (List[str]): A list of tasks to be processed.
+            task (str, optional): A single task/output to be evaluated.
+            tasks (List[str], optional): A list of tasks/outputs to be evaluated.
+            img (str, optional): Path to an image file for multimodal evaluation.
 
         Returns:
-            str: The response generated by the agent.
+            str: A detailed evaluation response from the agent including:
+                - Strengths: What the agent/output did well
+                - Weaknesses: Areas that need improvement
+                - Suggestions: Specific recommendations for improvement
+                - Factual accuracy assessment
+
+        Raises:
+            ValueError: If neither task nor tasks are provided.
+
+        Example:
+            ```python
+            # Single task evaluation
+            evaluation = judge.step(task="The answer is 42.")
+
+            # Multiple tasks evaluation
+            evaluation = judge.step(tasks=[
+                "Response 1: Paris is the capital of France",
+                "Response 2: 2 + 2 = 5"  # Incorrect
+            ])
+
+            # Multimodal evaluation
+            evaluation = judge.step(
+                task="Describe this image",
+                img="path/to/image.jpg"
+            )
+            ```
         """
-        prompt = any_to_str(tasks)
-        logger.debug(f"Running step with prompt: {prompt}")
-        print(prompt)
-
-        task_instruction = "Evaluate the following output or outputs"
-        if self.evaluation_criteria:
-            criteria_names = list(self.evaluation_criteria.keys())
-            if len(criteria_names) == 1:
-                task_instruction += f" based on {criteria_names[0]}"
+        try:
+            prompt = ""
+            if tasks:
+                prompt = any_to_str(tasks)
+            elif task:
+                prompt = task
             else:
-                formatted_criteria = ", ".join(criteria_names[:-1]) + f" and {criteria_names[-1]}"
-                task_instruction += f" based on the criteria: {formatted_criteria}"
-
-        response = self.agent.run(
-            task=f"{task_instruction}: {prompt}"
-        )
+                raise ValueError("No tasks or task provided")
+
+            # Add the evaluation instructions to the task description
+            task_instruction = "You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. "
+            task_instruction += "Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. "
+            task_instruction += "Your feedback should address the following points:\n"
+            task_instruction += "1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.\n"
+            task_instruction += "2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.\n"
+            task_instruction += "3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. "
+            task_instruction += "This may include advice on reasoning, structure, completeness, or style.\n"
+            task_instruction += "4. If relevant, point out any factual inaccuracies or logical inconsistencies.\n"
+
+            # Add the evaluation criteria to the task instruction
+            if self.evaluation_criteria:
+                criteria_names = list(self.evaluation_criteria.keys())
+                task_instruction += "\nPlease use these specific evaluation criteria with their respective weights:\n"
+                for criterion, weight in self.evaluation_criteria.items():
+                    task_instruction += f"- {criterion}: weight = {weight}\n"
+
+            task_instruction += "Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.\n\n"
+            task_instruction += f"Output(s) to evaluate:\n{prompt}\n"
 
-        logger.debug(f"Received response: {response}")
-        return response
+            response = self.agent.run(
+                task=task_instruction,
+                img=img,
+            )
 
-    def run(self, tasks: List[str]) -> List[str]:
+            return response
+        except Exception as e:
+            error_message = (
+                f"AgentJudge encountered an error: {e}\n"
+                f"Traceback:\n{traceback.format_exc()}\n\n"
+                "If this issue persists, please:\n"
+                "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n"
+                "- Join our Discord for real-time support: swarms.ai\n"
+                "- Or book a call: https://cal.com/swarms\n"
+            )
+            raise AgentJudgeExecutionError(error_message)
+
+    def run(
+        self,
+        task: str = None,
+        tasks: Optional[List[str]] = None,
+        img: Optional[str] = None,
+    ):
         """
-        Executes the tasks in a loop, updating context and collecting responses.
+        Executes evaluation in multiple iterations with context building and refinement.
+
+        This method runs the evaluation process for the specified number of max_loops,
+        where each iteration builds upon the previous context. This allows for iterative
+        refinement of evaluations and deeper analysis over multiple passes.
 
         Args:
-            tasks (List[str]): A list of tasks to be executed.
+            task (str, optional): A single task/output to be evaluated.
+            tasks (List[str], optional): A list of tasks/outputs to be evaluated.
+            img (str, optional): Path to an image file for multimodal evaluation.
 
         Returns:
-            List[str]: A list of responses generated by the agent for each iteration.
+            List[str]: A list of evaluation responses, one for each iteration.
+                Each subsequent evaluation includes context from previous iterations.
+
+        Example:
+            ```python
+            # Single task with iterative refinement
+            judge = AgentJudge(max_loops=3)
+            evaluations = judge.run(task="Agent output to evaluate")
+            # Returns 3 evaluations, each building on the previous
+
+            # Multiple tasks with context building
+            evaluations = judge.run(tasks=[
+                "First agent response",
+                "Second agent response"
+            ])
+
+            # With image analysis
+            evaluations = judge.run(
+                task="Analyze this chart",
+                img="chart.png"
+            )
+            ```
+
+        Note:
+            - The first iteration evaluates the original task(s)
+            - Subsequent iterations include context from previous evaluations
+            - This enables deeper analysis and refinement of judgments
+            - Useful for complex evaluations requiring multiple perspectives
         """
-        responses = []
-        context = ""
-
-        for _ in range(self.max_loops):
-            # Add context to the tasks if available
-            if context:
-                contextualized_tasks = [
-                    f"Previous context: {context}\nTask: {task}"
-                    for task in tasks
-                ]
-            else:
-                contextualized_tasks = tasks
+        try:
+            responses = []
+            context = ""
+
+            # Convert single task to list for consistent processing
+            if task and not tasks:
+                tasks = [task]
+                task = None  # Clear to avoid confusion in step method
+
+            for _ in range(self.max_loops):
+                # Add context to the tasks if available
+                if context and tasks:
+                    contextualized_tasks = [
+                        f"Previous context: {context}\nTask: {t}"
+                        for t in tasks
+                    ]
+                else:
+                    contextualized_tasks = tasks
 
-            # Get response for current iteration
-            current_response = self.step(contextualized_tasks)
-            responses.append(current_response)
-            logger.debug(
-                f"Current response added: {current_response}"
+                # Get response for current iteration
+                current_response = self.step(
+                    task=task,
+                    tasks=contextualized_tasks,
+                    img=img,
+                )
+
+                responses.append(current_response)
+
+                # Update context for next iteration
+                context = current_response
+
+            return responses
+        except Exception as e:
+            error_message = (
+                f"AgentJudge encountered an error: {e}\n"
+                f"Traceback:\n{traceback.format_exc()}\n\n"
+                "If this issue persists, please:\n"
+                "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n"
+                "- Join our Discord for real-time support: swarms.ai\n"
+                "- Or book a call: https://cal.com/swarms\n"
             )
+            raise AgentJudgeExecutionError(error_message)
+
+    def run_batched(
+        self,
+        tasks: Optional[List[str]] = None,
+        imgs: Optional[List[str]] = None,
+    ):
+        """
+        Executes batch evaluation of multiple tasks with corresponding images.
+
+        This method processes multiple task-image pairs independently, where each
+        task can be evaluated with its corresponding image. Unlike the run() method,
+        this doesn't build context between different tasks; each is evaluated
+        independently.
 
-            # Update context for next iteration
-            context = current_response
-            # Add to conversation history
-            logger.debug("Added message to conversation history.")
+        Args:
+            tasks (List[str], optional): A list of tasks/outputs to be evaluated.
+            imgs (List[str], optional): A list of image paths corresponding to each task.
+                Must be the same length as tasks if provided.
+
+        Returns:
+            List[List[str]]: A list of evaluation responses for each task. Each inner
+                list contains the responses from all iterations (max_loops)
+                for that particular task.
+
+        Example:
+            ```python
+            # Batch evaluation with images
+            tasks = [
+                "Describe what you see in this image",
+                "What's wrong with this chart?",
+                "Analyze the trends shown"
+            ]
+            images = [
+                "photo1.jpg",
+                "chart1.png",
+                "graph1.png"
+            ]
+            evaluations = judge.run_batched(tasks=tasks, imgs=images)
+            # Returns evaluations for each task-image pair
+
+            # Batch evaluation without images
+            evaluations = judge.run_batched(tasks=[
+                "Agent response 1",
+                "Agent response 2",
+                "Agent response 3"
+            ])
+            ```
+
+        Note:
+            - Each task is processed independently
+            - If imgs is provided, it must have the same length as tasks
+            - Each task goes through max_loops iterations independently
+            - No context is shared between different tasks in the batch
+        """
+        # Allow batch evaluation without images, as documented above
+        if imgs is None:
+            imgs = [None] * len(tasks)
+
+        responses = []
+        for task, img in zip(tasks, imgs):
+            response = self.run(task=task, img=img)
+            responses.append(response)
         return responses
\ No newline at end of file
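
For reference, a minimal usage sketch of the feedback cycle introduced in this diff. The worker agent's name, model, and task text below are illustrative assumptions rather than part of the change; the evaluation criteria reuse the example from the docstring this diff removes.

```python
from swarms import Agent, AgentJudge

# Hypothetical worker agent whose output will be judged and revised.
# The agent_name and model_name here are assumptions for illustration only.
worker = Agent(
    agent_name="research-agent",
    model_name="gpt-4",
    max_loops=1,
)

# Judge configured with weighted criteria (example weights from the old docstring).
judge = AgentJudge(
    agent_name="quality-judge",
    model_name="gpt-4",
    max_loops=1,
    evaluation_criteria={"correctness": 0.4, "efficiency": 0.3, "clarity": 0.3},
)

# Run the worker, critique its output, and re-run it with the feedback, twice.
# feedback_cycle returns one revised output per loop.
revisions = judge.feedback_cycle(
    agent=worker,
    task="Summarize the key findings of the attached report.",
    loops=2,
)

for i, output in enumerate(revisions, start=1):
    print(f"--- revision {i} ---\n{output}\n")
```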