From 78bd1def942b1774d011e8aa23bad416575ae0bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E7=A5=A5=E5=AE=87?= <625024108@qq.com> Date: Thu, 17 Jul 2025 00:29:38 +0800 Subject: [PATCH 1/3] Added `evaluation_criteria` parameter to AgentJudge class --- swarms/agents/agent_judge.py | 50 ++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/swarms/agents/agent_judge.py b/swarms/agents/agent_judge.py index cb33bd87..09dc9e34 100644 --- a/swarms/agents/agent_judge.py +++ b/swarms/agents/agent_judge.py @@ -1,13 +1,15 @@ -from typing import List +from typing import List, Dict, Optional from swarms.prompts.agent_judge_prompt import AGENT_JUDGE_PROMPT + from swarms.structs.agent import Agent + from swarms.structs.conversation import Conversation + from swarms.utils.any_to_str import any_to_str from loguru import logger - class AgentJudge: """ A class to represent an agent judge that processes tasks and generates responses. @@ -19,13 +21,7 @@ class AgentJudge: conversation (Conversation): An instance of the Conversation class to manage conversation history. max_loops (int): The maximum number of iterations to run the tasks. agent (Agent): An instance of the Agent class that performs the task execution. - - Methods: - step(tasks: List[str]) -> str: - Processes a list of tasks and returns the agent's response. - - run(tasks: List[str]) -> List[str]: - Executes the tasks in a loop, updating context and collecting responses. + evaluation_criteria (Dict[str, float]): Dictionary of evaluation criteria and their weights. """ def __init__( @@ -34,6 +30,7 @@ class AgentJudge: system_prompt: str = AGENT_JUDGE_PROMPT, model_name: str = "openai/o1", max_loops: int = 1, + evaluation_criteria: Optional[Dict[str, float]] = None, ) -> None: """ Initializes the AgentJudge with the specified parameters. @@ -43,17 +40,29 @@ class AgentJudge: system_prompt (str): The system prompt for the agent. model_name (str): The model name used for generating responses. max_loops (int): The maximum number of iterations to run the tasks. + evaluation_criteria (Optional[Dict[str, float]]): Dictionary of evaluation criteria + and their weights. Keys are criteria names, values are weights. 
+ Example: {"correctness": 0.4, "efficiency": 0.3, "clarity": 0.3} """ self.agent_name = agent_name self.system_prompt = system_prompt self.model_name = model_name self.conversation = Conversation(time_enabled=False) self.max_loops = max_loops - + self.evaluation_criteria = evaluation_criteria or {} + + # Enhance system prompt with evaluation criteria if provided + enhanced_prompt = system_prompt + if self.evaluation_criteria: + criteria_str = "\n\nEvaluation Criteria:\n" + for criterion, weight in self.evaluation_criteria.items(): + criteria_str += f"- {criterion}: weight = {weight}\n" + enhanced_prompt += criteria_str + self.agent = Agent( agent_name=agent_name, agent_description="You're the agent judge", - system_prompt=AGENT_JUDGE_PROMPT, + system_prompt=enhanced_prompt, model_name=model_name, max_loops=1, ) @@ -70,14 +79,22 @@ class AgentJudge: """ prompt = any_to_str(tasks) logger.debug(f"Running step with prompt: {prompt}") - print(prompt) - + + task_instruction = "Evaluate the following output or outputs" + if self.evaluation_criteria: + criteria_names = list(self.evaluation_criteria.keys()) + if len(criteria_names) == 1: + task_instruction += f" based on {criteria_names[0]}" + else: + formatted_criteria = ", ".join(criteria_names[:-1]) + f" and {criteria_names[-1]}" + task_instruction += f" based on the criteria: {formatted_criteria}" + response = self.agent.run( - task=f"Evaluate the following output or outputs: {prompt}" + task=f"{task_instruction}: {prompt}" ) - logger.debug(f"Received response: {response}") + logger.debug(f"Received response: {response}") return response def run(self, tasks: List[str]) -> List[str]: @@ -112,8 +129,7 @@ class AgentJudge: # Update context for next iteration context = current_response - # Add to conversation history logger.debug("Added message to conversation history.") - return responses + return responses \ No newline at end of file From 950184b5c57280aae0f939b6117aa8689374d0e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E7=A5=A5=E5=AE=87?= <625024108@qq.com> Date: Thu, 17 Jul 2025 11:10:48 +0800 Subject: [PATCH 2/3] Add example for AgentJudge with evaluation criteria --- ...agent_judge_evaluation_criteria_example.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 examples/single_agent/reasoning_agent_examples/agent_judge_evaluation_criteria_example.py diff --git a/examples/single_agent/reasoning_agent_examples/agent_judge_evaluation_criteria_example.py b/examples/single_agent/reasoning_agent_examples/agent_judge_evaluation_criteria_example.py new file mode 100644 index 00000000..f8a1b044 --- /dev/null +++ b/examples/single_agent/reasoning_agent_examples/agent_judge_evaluation_criteria_example.py @@ -0,0 +1,100 @@ +""" +Agent Judge with Evaluation Criteria Example + +This example demonstrates how to use the AgentJudge with custom evaluation criteria. +The evaluation_criteria parameter allows specifying different criteria with weights +for more targeted and customizable evaluation of agent outputs. 
+""" + +from swarms.agents.agent_judge import AgentJudge +import os +from dotenv import load_dotenv + +load_dotenv() + +# Example 1: Basic usage with evaluation criteria +print("\n=== Example 1: Using Custom Evaluation Criteria ===\n") + +# Create an AgentJudge with custom evaluation criteria +judge = AgentJudge( + model_name="claude-3-7-sonnet-20250219", # Use any available model + evaluation_criteria={ + "correctness": 0.5, + "problem_solving_approach": 0.3, + "explanation_clarity": 0.2 + } +) + +# Sample output to evaluate +task_response = [ + "Task: Determine the time complexity of a binary search algorithm and explain your reasoning.\n\n" + "Agent response: The time complexity of binary search is O(log n). In each step, " + "we divide the search space in half, resulting in a logarithmic relationship between " + "the input size and the number of operations. This can be proven by solving the " + "recurrence relation T(n) = T(n/2) + O(1), which gives us T(n) = O(log n)." +] + +# Run evaluation +evaluation = judge.run(task_response) +print(evaluation[0]) + +# Example 2: Specialized criteria for code evaluation +print("\n=== Example 2: Code Evaluation with Specialized Criteria ===\n") + +code_judge = AgentJudge( + model_name="claude-3-7-sonnet-20250219", + agent_name="code_judge", + evaluation_criteria={ + "code_correctness": 0.4, + "code_efficiency": 0.3, + "code_readability": 0.3 + } +) + +# Sample code to evaluate +code_response = [ + "Task: Write a function to find the maximum subarray sum in an array of integers.\n\n" + "Agent response:\n```python\n" + "def max_subarray_sum(arr):\n" + " current_sum = max_sum = arr[0]\n" + " for i in range(1, len(arr)):\n" + " current_sum = max(arr[i], current_sum + arr[i])\n" + " max_sum = max(max_sum, current_sum)\n" + " return max_sum\n\n" + "# Example usage\n" + "print(max_subarray_sum([-2, 1, -3, 4, -1, 2, 1, -5, 4])) # Output: 6 (subarray [4, -1, 2, 1])\n" + "```\n" + "This implementation uses Kadane's algorithm which has O(n) time complexity and " + "O(1) space complexity, making it optimal for this problem." +] + +code_evaluation = code_judge.run(code_response) +print(code_evaluation[0]) + +# Example 3: Comparing multiple responses +print("\n=== Example 3: Comparing Multiple Agent Responses ===\n") + +comparison_judge = AgentJudge( + model_name="claude-3-7-sonnet-20250219", + evaluation_criteria={ + "accuracy": 0.6, + "completeness": 0.4 + } +) + +multiple_responses = comparison_judge.run([ + "Task: Explain the CAP theorem in distributed systems.\n\n" + "Agent A response: CAP theorem states that a distributed system cannot simultaneously " + "provide Consistency, Availability, and Partition tolerance. In practice, you must " + "choose two out of these three properties.", + + "Task: Explain the CAP theorem in distributed systems.\n\n" + "Agent B response: The CAP theorem, formulated by Eric Brewer, states that in a " + "distributed data store, you can only guarantee two of the following three properties: " + "Consistency (all nodes see the same data at the same time), Availability (every request " + "receives a response), and Partition tolerance (the system continues to operate despite " + "network failures). Most modern distributed systems choose to sacrifice consistency in " + "favor of availability and partition tolerance, implementing eventual consistency models instead." 
+]) + +print(multiple_responses[0]) \ No newline at end of file From a0fadf2874365c37cddd288ece0eea42b4ef73bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E7=A5=A5=E5=AE=87?= <625024108@qq.com> Date: Fri, 18 Jul 2025 11:52:19 +0800 Subject: [PATCH 3/3] Update agent_judge.py to resolve conflicts --- swarms/agents/agent_judge.py | 424 +++++++++++++++++++++++++++++------ 1 file changed, 352 insertions(+), 72 deletions(-) diff --git a/swarms/agents/agent_judge.py b/swarms/agents/agent_judge.py index 09dc9e34..9a5c1e3a 100644 --- a/swarms/agents/agent_judge.py +++ b/swarms/agents/agent_judge.py @@ -1,54 +1,105 @@ -from typing import List, Dict, Optional +import traceback +from typing import List, Optional, Union, Dict +import uuid from swarms.prompts.agent_judge_prompt import AGENT_JUDGE_PROMPT - from swarms.structs.agent import Agent - from swarms.structs.conversation import Conversation - from swarms.utils.any_to_str import any_to_str -from loguru import logger +class AgentJudgeInitializationError(Exception): + """ + Exception raised when there is an error initializing the AgentJudge. + """ + pass + +class AgentJudgeExecutionError(Exception): + """ + Exception raised when there is an error executing the AgentJudge. + """ + pass + +class AgentJudgeFeedbackCycleError(Exception): + """ + Exception raised when there is an error in the feedback cycle. + """ + pass class AgentJudge: """ - A class to represent an agent judge that processes tasks and generates responses. + A specialized agent designed to evaluate and judge outputs from other agents or systems. + The AgentJudge acts as a quality control mechanism, providing objective assessments + and feedback on various types of content, decisions, or outputs. It's based on research + in LLM-based evaluation systems and can maintain context across multiple evaluations. + This implementation supports both single task evaluation and batch processing with + iterative refinement capabilities. Attributes: + id (str): Unique identifier for the judge agent instance. agent_name (str): The name of the agent judge. - system_prompt (str): The system prompt for the agent. - model_name (str): The model name used for generating responses. + system_prompt (str): The system prompt for the agent containing evaluation instructions. + model_name (str): The model name used for generating evaluations (e.g., "openai/o1", "gpt-4"). conversation (Conversation): An instance of the Conversation class to manage conversation history. - max_loops (int): The maximum number of iterations to run the tasks. - agent (Agent): An instance of the Agent class that performs the task execution. + max_loops (int): The maximum number of evaluation iterations to run. + verbose (bool): Whether to enable verbose logging. + agent (Agent): An instance of the Agent class that performs the evaluation execution. evaluation_criteria (Dict[str, float]): Dictionary of evaluation criteria and their weights. + + Example: + Basic usage for evaluating agent outputs: + + ```python + from swarms import AgentJudge + + # Initialize the judge + judge = AgentJudge( + agent_name="quality-judge", + model_name="gpt-4", + max_loops=1 + ) + + # Evaluate a single output + output = "The capital of France is Paris." 
+ evaluation = judge.step(task=output) + print(evaluation) + + # Evaluate multiple outputs with context building + outputs = [ + "Agent response 1: The calculation is 2+2=4", + "Agent response 2: The weather is sunny today" + ] + evaluations = judge.run(tasks=outputs) + ``` + + Methods: + step(task: str = None, tasks: List[str] = None, img: str = None) -> str: + Processes a single task or list of tasks and returns the agent's evaluation. + run(task: str = None, tasks: List[str] = None, img: str = None) -> List[str]: + Executes evaluation in a loop with context building, collecting responses. + run_batched(tasks: List[str] = None, imgs: List[str] = None) -> List[str]: + Executes batch evaluation of tasks with corresponding images. """ def __init__( self, - agent_name: str = "agent-judge-01", + id: str = str(uuid.uuid4()), + agent_name: str = "Agent Judge", + description: str = "You're an expert AI agent judge. Carefully review the following output(s) generated by another agent. Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance.", system_prompt: str = AGENT_JUDGE_PROMPT, model_name: str = "openai/o1", max_loops: int = 1, + verbose: bool = False, evaluation_criteria: Optional[Dict[str, float]] = None, - ) -> None: - """ - Initializes the AgentJudge with the specified parameters. - - Args: - agent_name (str): The name of the agent judge. - system_prompt (str): The system prompt for the agent. - model_name (str): The model name used for generating responses. - max_loops (int): The maximum number of iterations to run the tasks. - evaluation_criteria (Optional[Dict[str, float]]): Dictionary of evaluation criteria - and their weights. Keys are criteria names, values are weights. - Example: {"correctness": 0.4, "efficiency": 0.3, "clarity": 0.3} - """ + *args, + **kwargs, + ): + self.id = id self.agent_name = agent_name self.system_prompt = system_prompt self.model_name = model_name self.conversation = Conversation(time_enabled=False) self.max_loops = max_loops + self.verbose = verbose self.evaluation_criteria = evaluation_criteria or {} # Enhance system prompt with evaluation criteria if provided @@ -58,78 +109,307 @@ class AgentJudge: for criterion, weight in self.evaluation_criteria.items(): criteria_str += f"- {criterion}: weight = {weight}\n" enhanced_prompt += criteria_str - + self.agent = Agent( agent_name=agent_name, - agent_description="You're the agent judge", + agent_description=description, system_prompt=enhanced_prompt, model_name=model_name, max_loops=1, + *args, + **kwargs, ) - def step(self, tasks: List[str]) -> str: + def feedback_cycle_step( + self, + agent: Union[Agent, callable], + task: str, + img: Optional[str] = None, + ): + try: + # First run the main agent + agent_output = agent.run(task=task, img=img) + + # Then run the judge agent + judge_output = self.run(task=agent_output, img=img) + + # Run the main agent again with the judge's feedback, using a much improved prompt + improved_prompt = ( + f"You have received the following detailed feedback from the expert agent judge ({self.agent_name}):\n\n" + f"--- FEEDBACK START ---\n{judge_output}\n--- FEEDBACK END ---\n\n" + f"Your task is to thoughtfully revise and enhance your previous output based on this critique. " + f"Carefully address all identified weaknesses, incorporate the suggestions, and strive to maximize the strengths noted. " + f"Be specific, accurate, and actionable in your improvements. 
" + f"Here is the original task for reference:\n\n" + f"--- TASK ---\n{task}\n--- END TASK ---\n\n" + f"Please provide your improved and fully revised output below." + ) + + return agent.run(task=improved_prompt, img=img) + except Exception as e: + raise AgentJudgeFeedbackCycleError( + f"Error In Agent Judge Feedback Cycle: {e} Traceback: {traceback.format_exc()}" + ) + + def feedback_cycle( + self, + agent: Union[Agent, callable], + task: str, + img: Optional[str] = None, + loops: int = 1, + ): + loop = 0 + original_task = task # Preserve the original task + current_output = None # Track the current output + all_outputs = [] # Collect all outputs from each iteration + + while loop < loops: + # First iteration: run the standard feedback cycle step + current_output = self.feedback_cycle_step( + agent, original_task, img + ) + + # Add the current output to our collection + all_outputs.append(current_output) + loop += 1 + + return all_outputs + + def step( + self, + task: str = None, + tasks: Optional[List[str]] = None, + img: Optional[str] = None, + ) -> str: """ - Processes a list of tasks and returns the agent's response. + Processes a single task or list of tasks and returns the agent's evaluation. + This method performs a one-shot evaluation of the provided content. It takes + either a single task string or a list of tasks and generates a comprehensive + evaluation with strengths, weaknesses, and improvement suggestions. Args: - tasks (List[str]): A list of tasks to be processed. + task (str, optional): A single task/output to be evaluated. + tasks (List[str], optional): A list of tasks/outputs to be evaluated. + img (str, optional): Path to an image file for multimodal evaluation. Returns: - str: The response generated by the agent. + str: A detailed evaluation response from the agent including: + - Strengths: What the agent/output did well + - Weaknesses: Areas that need improvement + - Suggestions: Specific recommendations for improvement + - Factual accuracy assessment + + Raises: + ValueError: If neither task nor tasks are provided. + + Example: + ```python + # Single task evaluation + evaluation = judge.step(task="The answer is 42.") + + # Multiple tasks evaluation + evaluation = judge.step(tasks=[ + "Response 1: Paris is the capital of France", + "Response 2: 2 + 2 = 5" # Incorrect + ]) + + # Multimodal evaluation + evaluation = judge.step( + task="Describe this image", + img="path/to/image.jpg" + ) + ``` """ - prompt = any_to_str(tasks) - logger.debug(f"Running step with prompt: {prompt}") - print(prompt) - - task_instruction = "Evaluate the following output or outputs" - if self.evaluation_criteria: - criteria_names = list(self.evaluation_criteria.keys()) - if len(criteria_names) == 1: - task_instruction += f" based on {criteria_names[0]}" + try: + prompt = "" + if tasks: + prompt = any_to_str(tasks) + elif task: + prompt = task else: - formatted_criteria = ", ".join(criteria_names[:-1]) + f" and {criteria_names[-1]}" - task_instruction += f" based on the criteria: {formatted_criteria}" - - response = self.agent.run( - task=f"{task_instruction}: {prompt}" - ) + raise ValueError("No tasks or task provided") + + # 添加评估标准到任务描述中 + task_instruction = "You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. " + task_instruction += "Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. 
" + task_instruction += "Your feedback should address the following points:\n" + task_instruction += "1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.\n" + task_instruction += "2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.\n" + task_instruction += "3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. " + task_instruction += "This may include advice on reasoning, structure, completeness, or style.\n" + task_instruction += "4. If relevant, point out any factual inaccuracies or logical inconsistencies.\n" + + # 在任务说明中添加评估标准 + if self.evaluation_criteria: + criteria_names = list(self.evaluation_criteria.keys()) + task_instruction += "\nPlease use these specific evaluation criteria with their respective weights:\n" + for criterion, weight in self.evaluation_criteria.items(): + task_instruction += f"- {criterion}: weight = {weight}\n" + + task_instruction += "Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.\n\n" + task_instruction += f"Output(s) to evaluate:\n{prompt}\n" - logger.debug(f"Received response: {response}") - return response + response = self.agent.run( + task=task_instruction, + img=img, + ) - def run(self, tasks: List[str]) -> List[str]: + return response + except Exception as e: + error_message = ( + f"AgentJudge encountered an error: {e}\n" + f"Traceback:\n{traceback.format_exc()}\n\n" + "If this issue persists, please:\n" + "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n" + "- Join our Discord for real-time support: swarms.ai\n" + "- Or book a call: https://cal.com/swarms\n" + ) + raise AgentJudgeExecutionError(error_message) + + def run( + self, + task: str = None, + tasks: Optional[List[str]] = None, + img: Optional[str] = None, + ): """ - Executes the tasks in a loop, updating context and collecting responses. + Executes evaluation in multiple iterations with context building and refinement. + This method runs the evaluation process for the specified number of max_loops, + where each iteration builds upon the previous context. This allows for iterative + refinement of evaluations and deeper analysis over multiple passes. Args: - tasks (List[str]): A list of tasks to be executed. + task (str, optional): A single task/output to be evaluated. + tasks (List[str], optional): A list of tasks/outputs to be evaluated. + img (str, optional): Path to an image file for multimodal evaluation. Returns: - List[str]: A list of responses generated by the agent for each iteration. + List[str]: A list of evaluation responses, one for each iteration. + Each subsequent evaluation includes context from previous iterations. 
+ + Example: + ```python + # Single task with iterative refinement + judge = AgentJudge(max_loops=3) + evaluations = judge.run(task="Agent output to evaluate") + # Returns 3 evaluations, each building on the previous + + # Multiple tasks with context building + evaluations = judge.run(tasks=[ + "First agent response", + "Second agent response" + ]) + + # With image analysis + evaluations = judge.run( + task="Analyze this chart", + img="chart.png" + ) + ``` + + Note: + - The first iteration evaluates the original task(s) + - Subsequent iterations include context from previous evaluations + - This enables deeper analysis and refinement of judgments + - Useful for complex evaluations requiring multiple perspectives """ - responses = [] - context = "" - - for _ in range(self.max_loops): - # Add context to the tasks if available - if context: - contextualized_tasks = [ - f"Previous context: {context}\nTask: {task}" - for task in tasks - ] - else: - contextualized_tasks = tasks + try: + responses = [] + context = "" + + # Convert single task to list for consistent processing + if task and not tasks: + tasks = [task] + task = None # Clear to avoid confusion in step method + + for _ in range(self.max_loops): + # Add context to the tasks if available + if context and tasks: + contextualized_tasks = [ + f"Previous context: {context}\nTask: {t}" + for t in tasks + ] + else: + contextualized_tasks = tasks - # Get response for current iteration - current_response = self.step(contextualized_tasks) - responses.append(current_response) - logger.debug( - f"Current response added: {current_response}" + # Get response for current iteration + current_response = self.step( + task=task, + tasks=contextualized_tasks, + img=img, + ) + + responses.append(current_response) + + # Update context for next iteration + context = current_response + + return responses + except Exception as e: + error_message = ( + f"AgentJudge encountered an error: {e}\n" + f"Traceback:\n{traceback.format_exc()}\n\n" + "If this issue persists, please:\n" + "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n" + "- Join our Discord for real-time support: swarms.ai\n" + "- Or book a call: https://cal.com/swarms\n" ) + raise AgentJudgeExecutionError(error_message) + + def run_batched( + self, + tasks: Optional[List[str]] = None, + imgs: Optional[List[str]] = None, + ): + """ + Executes batch evaluation of multiple tasks with corresponding images. + This method processes multiple task-image pairs independently, where each + task can be evaluated with its corresponding image. Unlike the run() method, + this doesn't build context between different tasks - each is evaluated + independently. - # Update context for next iteration - context = current_response - # Add to conversation history - logger.debug("Added message to conversation history.") + Args: + tasks (List[str], optional): A list of tasks/outputs to be evaluated. + imgs (List[str], optional): A list of image paths corresponding to each task. + Must be the same length as tasks if provided. + + Returns: + List[List[str]]: A list of evaluation responses for each task. Each inner + list contains the responses from all iterations (max_loops) + for that particular task. 
+        Example:
+            ```python
+            # Batch evaluation with images
+            tasks = [
+                "Describe what you see in this image",
+                "What's wrong with this chart?",
+                "Analyze the trends shown"
+            ]
+            images = [
+                "photo1.jpg",
+                "chart1.png",
+                "graph1.png"
+            ]
+            evaluations = judge.run_batched(tasks=tasks, imgs=images)
+            # Returns evaluations for each task-image pair
+
+            # Batch evaluation without images
+            evaluations = judge.run_batched(tasks=[
+                "Agent response 1",
+                "Agent response 2",
+                "Agent response 3"
+            ])
+            ```
+
+        Note:
+            - Each task is processed independently
+            - If imgs is provided, it must have the same length as tasks
+            - Each task goes through max_loops iterations independently
+            - No context is shared between different tasks in the batch
+        """
+        responses = []
+        for task, img in zip(tasks, imgs or [None] * len(tasks)):  # imgs is optional
+            response = self.run(task=task, img=img)
+            responses.append(response)
         return responses
\ No newline at end of file
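
Reviewer note: a minimal end-to-end sketch of the API added by these patches. It assumes the `swarms` package layout used above; the agent names, prompts, task text, and criteria weights are illustrative placeholders, not values defined in the patches.

```python
from swarms.structs.agent import Agent
from swarms.agents.agent_judge import AgentJudge

# Judge with weighted evaluation criteria (weights are illustrative)
judge = AgentJudge(
    agent_name="code-review-judge",
    model_name="openai/o1",
    max_loops=1,
    evaluation_criteria={"correctness": 0.6, "clarity": 0.4},
)

# Worker agent whose output will be critiqued and revised
worker = Agent(
    agent_name="python-helper",
    agent_description="Writes short Python solutions",
    system_prompt="You are a careful Python assistant.",
    model_name="openai/o1",
    max_loops=1,
)

# One critique-and-revise pass: the judge reviews the worker's answer and the
# worker retries with that feedback; one revised output is returned per loop.
revised = judge.feedback_cycle(
    agent=worker,
    task="Write a function that reverses a singly linked list.",
    loops=1,
)
print(revised[-1])

# Standalone evaluation of existing outputs, with context carried across loops.
reports = judge.run(tasks=["Agent response: 2 + 2 = 4"])
print(reports[0])
```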