diff --git a/examples/single_agent/reasoning_agent_examples/agent_judge_evaluation_criteria_example.py b/examples/single_agent/reasoning_agent_examples/agent_judge_evaluation_criteria_example.py deleted file mode 100644 index f8a1b044..00000000 --- a/examples/single_agent/reasoning_agent_examples/agent_judge_evaluation_criteria_example.py +++ /dev/null @@ -1,100 +0,0 @@ -""" -Agent Judge with Evaluation Criteria Example - -This example demonstrates how to use the AgentJudge with custom evaluation criteria. -The evaluation_criteria parameter allows specifying different criteria with weights -for more targeted and customizable evaluation of agent outputs. -""" - -from swarms.agents.agent_judge import AgentJudge -import os -from dotenv import load_dotenv - -load_dotenv() - -# Example 1: Basic usage with evaluation criteria -print("\n=== Example 1: Using Custom Evaluation Criteria ===\n") - -# Create an AgentJudge with custom evaluation criteria -judge = AgentJudge( - model_name="claude-3-7-sonnet-20250219", # Use any available model - evaluation_criteria={ - "correctness": 0.5, - "problem_solving_approach": 0.3, - "explanation_clarity": 0.2 - } -) - -# Sample output to evaluate -task_response = [ - "Task: Determine the time complexity of a binary search algorithm and explain your reasoning.\n\n" - "Agent response: The time complexity of binary search is O(log n). In each step, " - "we divide the search space in half, resulting in a logarithmic relationship between " - "the input size and the number of operations. This can be proven by solving the " - "recurrence relation T(n) = T(n/2) + O(1), which gives us T(n) = O(log n)." -] - -# Run evaluation -evaluation = judge.run(task_response) -print(evaluation[0]) - -# Example 2: Specialized criteria for code evaluation -print("\n=== Example 2: Code Evaluation with Specialized Criteria ===\n") - -code_judge = AgentJudge( - model_name="claude-3-7-sonnet-20250219", - agent_name="code_judge", - evaluation_criteria={ - "code_correctness": 0.4, - "code_efficiency": 0.3, - "code_readability": 0.3 - } -) - -# Sample code to evaluate -code_response = [ - "Task: Write a function to find the maximum subarray sum in an array of integers.\n\n" - "Agent response:\n```python\n" - "def max_subarray_sum(arr):\n" - " current_sum = max_sum = arr[0]\n" - " for i in range(1, len(arr)):\n" - " current_sum = max(arr[i], current_sum + arr[i])\n" - " max_sum = max(max_sum, current_sum)\n" - " return max_sum\n\n" - "# Example usage\n" - "print(max_subarray_sum([-2, 1, -3, 4, -1, 2, 1, -5, 4])) # Output: 6 (subarray [4, -1, 2, 1])\n" - "```\n" - "This implementation uses Kadane's algorithm which has O(n) time complexity and " - "O(1) space complexity, making it optimal for this problem." -] - -code_evaluation = code_judge.run(code_response) -print(code_evaluation[0]) - -# Example 3: Comparing multiple responses -print("\n=== Example 3: Comparing Multiple Agent Responses ===\n") - -comparison_judge = AgentJudge( - model_name="claude-3-7-sonnet-20250219", - evaluation_criteria={ - "accuracy": 0.6, - "completeness": 0.4 - } -) - -multiple_responses = comparison_judge.run([ - "Task: Explain the CAP theorem in distributed systems.\n\n" - "Agent A response: CAP theorem states that a distributed system cannot simultaneously " - "provide Consistency, Availability, and Partition tolerance. In practice, you must " - "choose two out of these three properties.", - - "Task: Explain the CAP theorem in distributed systems.\n\n" - "Agent B response: The CAP theorem, formulated by Eric Brewer, states that in a " - "distributed data store, you can only guarantee two of the following three properties: " - "Consistency (all nodes see the same data at the same time), Availability (every request " - "receives a response), and Partition tolerance (the system continues to operate despite " - "network failures). Most modern distributed systems choose to sacrifice consistency in " - "favor of availability and partition tolerance, implementing eventual consistency models instead." -]) - -print(multiple_responses[0]) \ No newline at end of file diff --git a/swarms/agents/agent_judge.py b/swarms/agents/agent_judge.py index 9a5c1e3a..cb33bd87 100644 --- a/swarms/agents/agent_judge.py +++ b/swarms/agents/agent_judge.py @@ -1,415 +1,119 @@ -import traceback -from typing import List, Optional, Union, Dict -import uuid +from typing import List from swarms.prompts.agent_judge_prompt import AGENT_JUDGE_PROMPT from swarms.structs.agent import Agent from swarms.structs.conversation import Conversation from swarms.utils.any_to_str import any_to_str -class AgentJudgeInitializationError(Exception): - """ - Exception raised when there is an error initializing the AgentJudge. - """ - pass - -class AgentJudgeExecutionError(Exception): - """ - Exception raised when there is an error executing the AgentJudge. - """ - pass +from loguru import logger -class AgentJudgeFeedbackCycleError(Exception): - """ - Exception raised when there is an error in the feedback cycle. - """ - pass class AgentJudge: """ - A specialized agent designed to evaluate and judge outputs from other agents or systems. - The AgentJudge acts as a quality control mechanism, providing objective assessments - and feedback on various types of content, decisions, or outputs. It's based on research - in LLM-based evaluation systems and can maintain context across multiple evaluations. - This implementation supports both single task evaluation and batch processing with - iterative refinement capabilities. + A class to represent an agent judge that processes tasks and generates responses. Attributes: - id (str): Unique identifier for the judge agent instance. agent_name (str): The name of the agent judge. - system_prompt (str): The system prompt for the agent containing evaluation instructions. - model_name (str): The model name used for generating evaluations (e.g., "openai/o1", "gpt-4"). + system_prompt (str): The system prompt for the agent. + model_name (str): The model name used for generating responses. conversation (Conversation): An instance of the Conversation class to manage conversation history. - max_loops (int): The maximum number of evaluation iterations to run. - verbose (bool): Whether to enable verbose logging. - agent (Agent): An instance of the Agent class that performs the evaluation execution. - evaluation_criteria (Dict[str, float]): Dictionary of evaluation criteria and their weights. - - Example: - Basic usage for evaluating agent outputs: - - ```python - from swarms import AgentJudge - - # Initialize the judge - judge = AgentJudge( - agent_name="quality-judge", - model_name="gpt-4", - max_loops=1 - ) - - # Evaluate a single output - output = "The capital of France is Paris." - evaluation = judge.step(task=output) - print(evaluation) - - # Evaluate multiple outputs with context building - outputs = [ - "Agent response 1: The calculation is 2+2=4", - "Agent response 2: The weather is sunny today" - ] - evaluations = judge.run(tasks=outputs) - ``` + max_loops (int): The maximum number of iterations to run the tasks. + agent (Agent): An instance of the Agent class that performs the task execution. Methods: - step(task: str = None, tasks: List[str] = None, img: str = None) -> str: - Processes a single task or list of tasks and returns the agent's evaluation. - run(task: str = None, tasks: List[str] = None, img: str = None) -> List[str]: - Executes evaluation in a loop with context building, collecting responses. - run_batched(tasks: List[str] = None, imgs: List[str] = None) -> List[str]: - Executes batch evaluation of tasks with corresponding images. + step(tasks: List[str]) -> str: + Processes a list of tasks and returns the agent's response. + + run(tasks: List[str]) -> List[str]: + Executes the tasks in a loop, updating context and collecting responses. """ def __init__( self, - id: str = str(uuid.uuid4()), - agent_name: str = "Agent Judge", - description: str = "You're an expert AI agent judge. Carefully review the following output(s) generated by another agent. Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance.", + agent_name: str = "agent-judge-01", system_prompt: str = AGENT_JUDGE_PROMPT, model_name: str = "openai/o1", max_loops: int = 1, - verbose: bool = False, - evaluation_criteria: Optional[Dict[str, float]] = None, - *args, - **kwargs, - ): - self.id = id + ) -> None: + """ + Initializes the AgentJudge with the specified parameters. + + Args: + agent_name (str): The name of the agent judge. + system_prompt (str): The system prompt for the agent. + model_name (str): The model name used for generating responses. + max_loops (int): The maximum number of iterations to run the tasks. + """ self.agent_name = agent_name self.system_prompt = system_prompt self.model_name = model_name self.conversation = Conversation(time_enabled=False) self.max_loops = max_loops - self.verbose = verbose - self.evaluation_criteria = evaluation_criteria or {} - - # Enhance system prompt with evaluation criteria if provided - enhanced_prompt = system_prompt - if self.evaluation_criteria: - criteria_str = "\n\nEvaluation Criteria:\n" - for criterion, weight in self.evaluation_criteria.items(): - criteria_str += f"- {criterion}: weight = {weight}\n" - enhanced_prompt += criteria_str self.agent = Agent( agent_name=agent_name, - agent_description=description, - system_prompt=enhanced_prompt, + agent_description="You're the agent judge", + system_prompt=AGENT_JUDGE_PROMPT, model_name=model_name, max_loops=1, - *args, - **kwargs, ) - def feedback_cycle_step( - self, - agent: Union[Agent, callable], - task: str, - img: Optional[str] = None, - ): - try: - # First run the main agent - agent_output = agent.run(task=task, img=img) - - # Then run the judge agent - judge_output = self.run(task=agent_output, img=img) - - # Run the main agent again with the judge's feedback, using a much improved prompt - improved_prompt = ( - f"You have received the following detailed feedback from the expert agent judge ({self.agent_name}):\n\n" - f"--- FEEDBACK START ---\n{judge_output}\n--- FEEDBACK END ---\n\n" - f"Your task is to thoughtfully revise and enhance your previous output based on this critique. " - f"Carefully address all identified weaknesses, incorporate the suggestions, and strive to maximize the strengths noted. " - f"Be specific, accurate, and actionable in your improvements. " - f"Here is the original task for reference:\n\n" - f"--- TASK ---\n{task}\n--- END TASK ---\n\n" - f"Please provide your improved and fully revised output below." - ) - - return agent.run(task=improved_prompt, img=img) - except Exception as e: - raise AgentJudgeFeedbackCycleError( - f"Error In Agent Judge Feedback Cycle: {e} Traceback: {traceback.format_exc()}" - ) - - def feedback_cycle( - self, - agent: Union[Agent, callable], - task: str, - img: Optional[str] = None, - loops: int = 1, - ): - loop = 0 - original_task = task # Preserve the original task - current_output = None # Track the current output - all_outputs = [] # Collect all outputs from each iteration - - while loop < loops: - # First iteration: run the standard feedback cycle step - current_output = self.feedback_cycle_step( - agent, original_task, img - ) - - # Add the current output to our collection - all_outputs.append(current_output) - loop += 1 - - return all_outputs - - def step( - self, - task: str = None, - tasks: Optional[List[str]] = None, - img: Optional[str] = None, - ) -> str: + def step(self, tasks: List[str]) -> str: """ - Processes a single task or list of tasks and returns the agent's evaluation. - This method performs a one-shot evaluation of the provided content. It takes - either a single task string or a list of tasks and generates a comprehensive - evaluation with strengths, weaknesses, and improvement suggestions. + Processes a list of tasks and returns the agent's response. Args: - task (str, optional): A single task/output to be evaluated. - tasks (List[str], optional): A list of tasks/outputs to be evaluated. - img (str, optional): Path to an image file for multimodal evaluation. + tasks (List[str]): A list of tasks to be processed. Returns: - str: A detailed evaluation response from the agent including: - - Strengths: What the agent/output did well - - Weaknesses: Areas that need improvement - - Suggestions: Specific recommendations for improvement - - Factual accuracy assessment - - Raises: - ValueError: If neither task nor tasks are provided. - - Example: - ```python - # Single task evaluation - evaluation = judge.step(task="The answer is 42.") - - # Multiple tasks evaluation - evaluation = judge.step(tasks=[ - "Response 1: Paris is the capital of France", - "Response 2: 2 + 2 = 5" # Incorrect - ]) - - # Multimodal evaluation - evaluation = judge.step( - task="Describe this image", - img="path/to/image.jpg" - ) - ``` + str: The response generated by the agent. """ - try: - prompt = "" - if tasks: - prompt = any_to_str(tasks) - elif task: - prompt = task - else: - raise ValueError("No tasks or task provided") - - # 添加评估标准到任务描述中 - task_instruction = "You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. " - task_instruction += "Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. " - task_instruction += "Your feedback should address the following points:\n" - task_instruction += "1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.\n" - task_instruction += "2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.\n" - task_instruction += "3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. " - task_instruction += "This may include advice on reasoning, structure, completeness, or style.\n" - task_instruction += "4. If relevant, point out any factual inaccuracies or logical inconsistencies.\n" - - # 在任务说明中添加评估标准 - if self.evaluation_criteria: - criteria_names = list(self.evaluation_criteria.keys()) - task_instruction += "\nPlease use these specific evaluation criteria with their respective weights:\n" - for criterion, weight in self.evaluation_criteria.items(): - task_instruction += f"- {criterion}: weight = {weight}\n" - - task_instruction += "Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.\n\n" - task_instruction += f"Output(s) to evaluate:\n{prompt}\n" + prompt = any_to_str(tasks) + logger.debug(f"Running step with prompt: {prompt}") - response = self.agent.run( - task=task_instruction, - img=img, - ) + print(prompt) - return response - except Exception as e: - error_message = ( - f"AgentJudge encountered an error: {e}\n" - f"Traceback:\n{traceback.format_exc()}\n\n" - "If this issue persists, please:\n" - "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n" - "- Join our Discord for real-time support: swarms.ai\n" - "- Or book a call: https://cal.com/swarms\n" - ) - raise AgentJudgeExecutionError(error_message) + response = self.agent.run( + task=f"Evaluate the following output or outputs: {prompt}" + ) + logger.debug(f"Received response: {response}") - def run( - self, - task: str = None, - tasks: Optional[List[str]] = None, - img: Optional[str] = None, - ): + return response + + def run(self, tasks: List[str]) -> List[str]: """ - Executes evaluation in multiple iterations with context building and refinement. - This method runs the evaluation process for the specified number of max_loops, - where each iteration builds upon the previous context. This allows for iterative - refinement of evaluations and deeper analysis over multiple passes. + Executes the tasks in a loop, updating context and collecting responses. Args: - task (str, optional): A single task/output to be evaluated. - tasks (List[str], optional): A list of tasks/outputs to be evaluated. - img (str, optional): Path to an image file for multimodal evaluation. + tasks (List[str]): A list of tasks to be executed. Returns: - List[str]: A list of evaluation responses, one for each iteration. - Each subsequent evaluation includes context from previous iterations. - - Example: - ```python - # Single task with iterative refinement - judge = AgentJudge(max_loops=3) - evaluations = judge.run(task="Agent output to evaluate") - # Returns 3 evaluations, each building on the previous - - # Multiple tasks with context building - evaluations = judge.run(tasks=[ - "First agent response", - "Second agent response" - ]) - - # With image analysis - evaluations = judge.run( - task="Analyze this chart", - img="chart.png" - ) - ``` - - Note: - - The first iteration evaluates the original task(s) - - Subsequent iterations include context from previous evaluations - - This enables deeper analysis and refinement of judgments - - Useful for complex evaluations requiring multiple perspectives + List[str]: A list of responses generated by the agent for each iteration. """ - try: - responses = [] - context = "" - - # Convert single task to list for consistent processing - if task and not tasks: - tasks = [task] - task = None # Clear to avoid confusion in step method - - for _ in range(self.max_loops): - # Add context to the tasks if available - if context and tasks: - contextualized_tasks = [ - f"Previous context: {context}\nTask: {t}" - for t in tasks - ] - else: - contextualized_tasks = tasks - - # Get response for current iteration - current_response = self.step( - task=task, - tasks=contextualized_tasks, - img=img, - ) - - responses.append(current_response) - - # Update context for next iteration - context = current_response + responses = [] + context = "" + + for _ in range(self.max_loops): + # Add context to the tasks if available + if context: + contextualized_tasks = [ + f"Previous context: {context}\nTask: {task}" + for task in tasks + ] + else: + contextualized_tasks = tasks - return responses - except Exception as e: - error_message = ( - f"AgentJudge encountered an error: {e}\n" - f"Traceback:\n{traceback.format_exc()}\n\n" - "If this issue persists, please:\n" - "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n" - "- Join our Discord for real-time support: swarms.ai\n" - "- Or book a call: https://cal.com/swarms\n" + # Get response for current iteration + current_response = self.step(contextualized_tasks) + responses.append(current_response) + logger.debug( + f"Current response added: {current_response}" ) - raise AgentJudgeExecutionError(error_message) - - def run_batched( - self, - tasks: Optional[List[str]] = None, - imgs: Optional[List[str]] = None, - ): - """ - Executes batch evaluation of multiple tasks with corresponding images. - This method processes multiple task-image pairs independently, where each - task can be evaluated with its corresponding image. Unlike the run() method, - this doesn't build context between different tasks - each is evaluated - independently. - - Args: - tasks (List[str], optional): A list of tasks/outputs to be evaluated. - imgs (List[str], optional): A list of image paths corresponding to each task. - Must be the same length as tasks if provided. - Returns: - List[List[str]]: A list of evaluation responses for each task. Each inner - list contains the responses from all iterations (max_loops) - for that particular task. - - Example: - ```python - # Batch evaluation with images - tasks = [ - "Describe what you see in this image", - "What's wrong with this chart?", - "Analyze the trends shown" - ] - images = [ - "photo1.jpg", - "chart1.png", - "graph1.png" - ] - evaluations = judge.run_batched(tasks=tasks, imgs=images) - # Returns evaluations for each task-image pair + # Update context for next iteration + context = current_response - # Batch evaluation without images - evaluations = judge.run_batched(tasks=[ - "Agent response 1", - "Agent response 2", - "Agent response 3" - ]) - ``` + # Add to conversation history + logger.debug("Added message to conversation history.") - Note: - - Each task is processed independently - - If imgs is provided, it must have the same length as tasks - - Each task goes through max_loops iterations independently - - No context is shared between different tasks in the batch - """ - responses = [] - for task, img in zip(tasks, imgs): - response = self.run(task=task, img=img) - responses.append(response) - return responses \ No newline at end of file + return responses