diff --git a/examples/single_agent/reasoning_agent_examples/agent_judge_evaluation_criteria_example.py b/examples/single_agent/reasoning_agent_examples/agent_judge_evaluation_criteria_example.py
new file mode 100644
index 00000000..f8a1b044
--- /dev/null
+++ b/examples/single_agent/reasoning_agent_examples/agent_judge_evaluation_criteria_example.py
@@ -0,0 +1,100 @@
+"""
+Agent Judge with Evaluation Criteria Example
+
+This example demonstrates how to use the AgentJudge with custom evaluation criteria.
+The evaluation_criteria parameter allows specifying different criteria with weights
+for more targeted and customizable evaluation of agent outputs.
+"""
+
+from swarms.agents.agent_judge import AgentJudge
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Example 1: Basic usage with evaluation criteria
+print("\n=== Example 1: Using Custom Evaluation Criteria ===\n")
+
+# Create an AgentJudge with custom evaluation criteria
+judge = AgentJudge(
+    model_name="claude-3-7-sonnet-20250219",  # Use any available model
+    evaluation_criteria={
+        "correctness": 0.5,
+        "problem_solving_approach": 0.3,
+        "explanation_clarity": 0.2
+    }
+)
+
+# Sample output to evaluate
+task_response = [
+    "Task: Determine the time complexity of a binary search algorithm and explain your reasoning.\n\n"
+    "Agent response: The time complexity of binary search is O(log n). In each step, "
+    "we divide the search space in half, resulting in a logarithmic relationship between "
+    "the input size and the number of operations. This can be proven by solving the "
+    "recurrence relation T(n) = T(n/2) + O(1), which gives us T(n) = O(log n)."
+]
+
+# Run evaluation
+evaluation = judge.run(task_response)
+print(evaluation[0])
+
+# Example 2: Specialized criteria for code evaluation
+print("\n=== Example 2: Code Evaluation with Specialized Criteria ===\n")
+
+code_judge = AgentJudge(
+    model_name="claude-3-7-sonnet-20250219",
+    agent_name="code_judge",
+    evaluation_criteria={
+        "code_correctness": 0.4,
+        "code_efficiency": 0.3,
+        "code_readability": 0.3
+    }
+)
+
+# Sample code to evaluate
+code_response = [
+    "Task: Write a function to find the maximum subarray sum in an array of integers.\n\n"
+    "Agent response:\n```python\n"
+    "def max_subarray_sum(arr):\n"
+    "    current_sum = max_sum = arr[0]\n"
+    "    for i in range(1, len(arr)):\n"
+    "        current_sum = max(arr[i], current_sum + arr[i])\n"
+    "        max_sum = max(max_sum, current_sum)\n"
+    "    return max_sum\n\n"
+    "# Example usage\n"
+    "print(max_subarray_sum([-2, 1, -3, 4, -1, 2, 1, -5, 4]))  # Output: 6 (subarray [4, -1, 2, 1])\n"
+    "```\n"
+    "This implementation uses Kadane's algorithm which has O(n) time complexity and "
+    "O(1) space complexity, making it optimal for this problem."
+]
+
+code_evaluation = code_judge.run(code_response)
+print(code_evaluation[0])
+
+# Example 3: Comparing multiple responses
+print("\n=== Example 3: Comparing Multiple Agent Responses ===\n")
+
+comparison_judge = AgentJudge(
+    model_name="claude-3-7-sonnet-20250219",
+    evaluation_criteria={
+        "accuracy": 0.6,
+        "completeness": 0.4
+    }
+)
+
+multiple_responses = comparison_judge.run([
+    "Task: Explain the CAP theorem in distributed systems.\n\n"
+    "Agent A response: CAP theorem states that a distributed system cannot simultaneously "
+    "provide Consistency, Availability, and Partition tolerance. In practice, you must "
+    "choose two out of these three properties.",
+
+    "Task: Explain the CAP theorem in distributed systems.\n\n"
+    "Agent B response: The CAP theorem, formulated by Eric Brewer, states that in a "
+    "distributed data store, you can only guarantee two of the following three properties: "
+    "Consistency (all nodes see the same data at the same time), Availability (every request "
+    "receives a response), and Partition tolerance (the system continues to operate despite "
+    "network failures). Most modern distributed systems choose to sacrifice consistency in "
+    "favor of availability and partition tolerance, implementing eventual consistency models instead."
+])
+
+print(multiple_responses[0])
\ No newline at end of file
diff --git a/swarms/agents/agent_judge.py b/swarms/agents/agent_judge.py
index 566125d1..5a8742e8 100644
--- a/swarms/agents/agent_judge.py
+++ b/swarms/agents/agent_judge.py
@@ -1,5 +1,7 @@
 import traceback
-from typing import List, Optional, Union
+
+from typing import List, Optional, Union, Dict
+
 import uuid
 
 from swarms.prompts.agent_judge_prompt import AGENT_JUDGE_PROMPT
@@ -15,23 +17,20 @@ class AgentJudgeInitializationError(Exception):
 
     pass
 
-
 class AgentJudgeExecutionError(Exception):
     """
     Exception raised when there is an error executing the AgentJudge.
     """
 
-    pass
+    pass
 
 
 class AgentJudgeFeedbackCycleError(Exception):
     """
     Exception raised when there is an error in the feedback cycle.
    """
 
-    pass
-
 
 class AgentJudge:
     """
     A specialized agent designed to evaluate and judge outputs from other agents or systems.
@@ -53,6 +52,8 @@ class AgentJudge:
         verbose (bool): Whether to enable verbose logging.
         agent (Agent): An instance of the Agent class that performs the evaluation execution.
+        evaluation_criteria (Dict[str, float]): Dictionary of evaluation criteria and their weights.
+
 
     Example:
         Basic usage for evaluating agent outputs:
 
@@ -82,7 +83,6 @@ class AgentJudge:
     Methods:
         step(task: str = None, tasks: List[str] = None, img: str = None) -> str:
             Processes a single task or list of tasks and returns the agent's evaluation.
-
         run(task: str = None, tasks: List[str] = None, img: str = None) -> List[str]:
             Executes evaluation in a loop with context building, collecting responses.
 
@@ -99,6 +99,9 @@ class AgentJudge:
         model_name: str = "openai/o1",
         max_loops: int = 1,
         verbose: bool = False,
+
+        evaluation_criteria: Optional[Dict[str, float]] = None,
+
         *args,
         **kwargs,
     ):
@@ -110,10 +113,23 @@ class AgentJudge:
         self.max_loops = max_loops
         self.verbose = verbose
 
+        self.evaluation_criteria = evaluation_criteria or {}
+
+        # Enhance system prompt with evaluation criteria if provided
+        enhanced_prompt = system_prompt
+        if self.evaluation_criteria:
+            criteria_str = "\n\nEvaluation Criteria:\n"
+            for criterion, weight in self.evaluation_criteria.items():
+                criteria_str += f"- {criterion}: weight = {weight}\n"
+            enhanced_prompt += criteria_str
+
+
         self.agent = Agent(
             agent_name=agent_name,
             agent_description=description,
-            system_prompt=AGENT_JUDGE_PROMPT,
+
+            system_prompt=enhanced_prompt,
+
             model_name=model_name,
             max_loops=1,
             *args,
@@ -144,6 +160,7 @@ class AgentJudge:
                 f"--- TASK ---\n{task}\n--- END TASK ---\n\n"
                 f"Please provide your improved and fully revised output below."
             )
+
             return agent.run(task=improved_prompt, img=img)
         except Exception as e:
             raise AgentJudgeFeedbackCycleError(
@@ -207,6 +224,7 @@ class AgentJudge:
 
             # Single task evaluation
             evaluation = judge.step(task="The answer is 42.")
+
             # Multiple tasks evaluation
             evaluation = judge.step(tasks=[
                 "Response 1: Paris is the capital of France",
@@ -228,20 +246,29 @@ class AgentJudge:
             prompt = task
         else:
             raise ValueError("No tasks or task provided")
+
+        # Add the evaluation criteria to the task description
+        task_instruction = "You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. "
+        task_instruction += "Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. "
+        task_instruction += "Your feedback should address the following points:\n"
+        task_instruction += "1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.\n"
+        task_instruction += "2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.\n"
+        task_instruction += "3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. "
+        task_instruction += "This may include advice on reasoning, structure, completeness, or style.\n"
+        task_instruction += "4. If relevant, point out any factual inaccuracies or logical inconsistencies.\n"
+
+        # Append the evaluation criteria to the task instruction
+        if self.evaluation_criteria:
+            criteria_names = list(self.evaluation_criteria.keys())
+            task_instruction += "\nPlease use these specific evaluation criteria with their respective weights:\n"
+            for criterion, weight in self.evaluation_criteria.items():
+                task_instruction += f"- {criterion}: weight = {weight}\n"
+
+        task_instruction += "Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.\n\n"
+        task_instruction += f"Output(s) to evaluate:\n{prompt}\n"
 
         response = self.agent.run(
-            task=(
-                "You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. "
-                "Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. "
-                "Your feedback should address the following points:\n"
-                "1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.\n"
-                "2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.\n"
-                "3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. "
-                "This may include advice on reasoning, structure, completeness, or style.\n"
-                "4. If relevant, point out any factual inaccuracies or logical inconsistencies.\n"
-                "Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.\n\n"
-                f"Output(s) to evaluate:\n{prompt}\n"
-            ),
+            task=task_instruction,
             img=img,
         )
 
@@ -330,6 +357,7 @@ class AgentJudge:
                 tasks=contextualized_tasks,
                 img=img,
             )
+
            responses.append(current_response)
 
             # Update context for next iteration
@@ -360,6 +388,7 @@ class AgentJudge:
         this doesn't build context between different tasks - each is evaluated
         independently.
+
 
         Args:
             tasks (List[str], optional): A list of tasks/outputs to be evaluated.
             imgs (List[str], optional): A list of image paths corresponding to each task.
@@ -370,6 +399,7 @@ class AgentJudge:
                 list contains the responses from all iterations (max_loops) for that particular task.
+
 
         Example:
             ```python
             # Batch evaluation with images
             evaluations = judge.run_batched(
@@ -383,7 +413,6 @@ class AgentJudge:
                 "chart1.png",
                 "graph1.png"
             ]
-
             evaluations = judge.run_batched(tasks=tasks, imgs=images)
             # Returns evaluations for each task-image pair
 
@@ -395,6 +424,7 @@ class AgentJudge:
             ])
             ```
 
+
         Note:
             - Each task is processed independently
            - If imgs is provided, it must have the same length as tasks
@@ -405,4 +435,6 @@ class AgentJudge:
         for task, img in zip(tasks, imgs):
             response = self.run(task=task, img=img)
             responses.append(response)
+
         return responses
+
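
Reviewer note: the sketch below isolates the criteria-weighting behaviour added in __init__ above so it can be exercised without instantiating AgentJudge. The helper name build_judge_prompt and the base prompt string are illustrative stand-ins (the class itself appends to AGENT_JUDGE_PROMPT), not part of the patch.

from typing import Dict

def build_judge_prompt(system_prompt: str, evaluation_criteria: Dict[str, float]) -> str:
    """Append weighted evaluation criteria to a judge system prompt, mirroring __init__."""
    if not evaluation_criteria:
        return system_prompt
    criteria_str = "\n\nEvaluation Criteria:\n"
    for criterion, weight in evaluation_criteria.items():
        criteria_str += f"- {criterion}: weight = {weight}\n"
    return system_prompt + criteria_str

# Prompt the judge agent would receive for Example 1 in the new example file:
print(build_judge_prompt(
    "You are an expert AI agent judge.",  # stand-in for AGENT_JUDGE_PROMPT
    {"correctness": 0.5, "problem_solving_approach": 0.3, "explanation_clarity": 0.2},
))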