Merge pull request #958 from Wxysnx/agent_judge0717

Added `evaluation_criteria` parameter to AgentJudge class
Kye Gomez 4 days ago committed by GitHub
commit f916c89cc1

@ -0,0 +1,100 @@
"""
Agent Judge with Evaluation Criteria Example
This example demonstrates how to use the AgentJudge with custom evaluation criteria.
The evaluation_criteria parameter lets you specify weighted criteria for more
targeted, customizable evaluation of agent outputs.
"""
from swarms.agents.agent_judge import AgentJudge
import os
from dotenv import load_dotenv
load_dotenv()
# Example 1: Basic usage with evaluation criteria
print("\n=== Example 1: Using Custom Evaluation Criteria ===\n")
# Create an AgentJudge with custom evaluation criteria
judge = AgentJudge(
model_name="claude-3-7-sonnet-20250219", # Use any available model
evaluation_criteria={
"correctness": 0.5,
"problem_solving_approach": 0.3,
"explanation_clarity": 0.2
}
)
# Sample output to evaluate
task_response = [
"Task: Determine the time complexity of a binary search algorithm and explain your reasoning.\n\n"
"Agent response: The time complexity of binary search is O(log n). In each step, "
"we divide the search space in half, resulting in a logarithmic relationship between "
"the input size and the number of operations. This can be proven by solving the "
"recurrence relation T(n) = T(n/2) + O(1), which gives us T(n) = O(log n)."
]
# Run evaluation
evaluation = judge.run(task_response)
print(evaluation[0])
# Example 2: Specialized criteria for code evaluation
print("\n=== Example 2: Code Evaluation with Specialized Criteria ===\n")
code_judge = AgentJudge(
model_name="claude-3-7-sonnet-20250219",
agent_name="code_judge",
evaluation_criteria={
"code_correctness": 0.4,
"code_efficiency": 0.3,
"code_readability": 0.3
}
)
# Sample code to evaluate
code_response = [
"Task: Write a function to find the maximum subarray sum in an array of integers.\n\n"
"Agent response:\n```python\n"
"def max_subarray_sum(arr):\n"
" current_sum = max_sum = arr[0]\n"
" for i in range(1, len(arr)):\n"
" current_sum = max(arr[i], current_sum + arr[i])\n"
" max_sum = max(max_sum, current_sum)\n"
" return max_sum\n\n"
"# Example usage\n"
"print(max_subarray_sum([-2, 1, -3, 4, -1, 2, 1, -5, 4])) # Output: 6 (subarray [4, -1, 2, 1])\n"
"```\n"
"This implementation uses Kadane's algorithm which has O(n) time complexity and "
"O(1) space complexity, making it optimal for this problem."
]
code_evaluation = code_judge.run(code_response)
print(code_evaluation[0])
# Example 3: Comparing multiple responses
print("\n=== Example 3: Comparing Multiple Agent Responses ===\n")
comparison_judge = AgentJudge(
model_name="claude-3-7-sonnet-20250219",
evaluation_criteria={
"accuracy": 0.6,
"completeness": 0.4
}
)
multiple_responses = comparison_judge.run([
"Task: Explain the CAP theorem in distributed systems.\n\n"
"Agent A response: CAP theorem states that a distributed system cannot simultaneously "
"provide Consistency, Availability, and Partition tolerance. In practice, you must "
"choose two out of these three properties.",
"Task: Explain the CAP theorem in distributed systems.\n\n"
"Agent B response: The CAP theorem, formulated by Eric Brewer, states that in a "
"distributed data store, you can only guarantee two of the following three properties: "
"Consistency (all nodes see the same data at the same time), Availability (every request "
"receives a response), and Partition tolerance (the system continues to operate despite "
"network failures). Most modern distributed systems choose to sacrifice consistency in "
"favor of availability and partition tolerance, implementing eventual consistency models instead."
])
print(multiple_responses[0])
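# Example 4 (illustrative sketch, not part of the committed example file):
# combining evaluation_criteria with max_loops for iterative evaluation.
# This assumes, per the AgentJudge docstring in the diff below, that run()
# executes max_loops iterations with context building and returns one
# response per iteration.
print("\n=== Example 4: Iterative Evaluation with Weighted Criteria ===\n")
iterative_judge = AgentJudge(
    model_name="claude-3-7-sonnet-20250219",
    max_loops=2,
    evaluation_criteria={
        "accuracy": 0.7,
        "completeness": 0.3,
    },
)
# Reuses the task_response defined in Example 1 above
iterative_evaluations = iterative_judge.run(task_response)
# One response per loop; later rounds build on the earlier critique
for round_number, round_evaluation in enumerate(iterative_evaluations, start=1):
    print(f"--- Evaluation round {round_number} ---\n{round_evaluation}\n")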

@ -1,5 +1,7 @@
import traceback
from typing import List, Optional, Union
from typing import List, Optional, Union, Dict
import uuid
from swarms.prompts.agent_judge_prompt import AGENT_JUDGE_PROMPT
@ -15,23 +17,20 @@ class AgentJudgeInitializationError(Exception):
pass
class AgentJudgeExecutionError(Exception):
"""
Exception raised when there is an error executing the AgentJudge.
"""
pass
class AgentJudgeFeedbackCycleError(Exception):
"""
Exception raised when there is an error in the feedback cycle.
"""
pass
class AgentJudge:
"""
A specialized agent designed to evaluate and judge outputs from other agents or systems.
@ -53,6 +52,8 @@ class AgentJudge:
verbose (bool): Whether to enable verbose logging.
agent (Agent): An instance of the Agent class that performs the evaluation execution.
evaluation_criteria (Dict[str, float]): Dictionary of evaluation criteria and their weights.
Example:
Basic usage for evaluating agent outputs:
@ -82,7 +83,6 @@ class AgentJudge:
Methods:
step(task: str = None, tasks: List[str] = None, img: str = None) -> str:
Processes a single task or list of tasks and returns the agent's evaluation.
run(task: str = None, tasks: List[str] = None, img: str = None) -> List[str]:
Executes evaluation in a loop with context building, collecting responses.
@ -99,6 +99,9 @@ class AgentJudge:
model_name: str = "openai/o1",
max_loops: int = 1,
verbose: bool = False,
evaluation_criteria: Optional[Dict[str, float]] = None,
*args,
**kwargs,
):
@ -110,10 +113,23 @@ class AgentJudge:
self.max_loops = max_loops
self.verbose = verbose
self.evaluation_criteria = evaluation_criteria or {}
# Enhance system prompt with evaluation criteria if provided
enhanced_prompt = system_prompt
if self.evaluation_criteria:
criteria_str = "\n\nEvaluation Criteria:\n"
for criterion, weight in self.evaluation_criteria.items():
criteria_str += f"- {criterion}: weight = {weight}\n"
enhanced_prompt += criteria_str
self.agent = Agent(
agent_name=agent_name,
agent_description=description,
system_prompt=AGENT_JUDGE_PROMPT,
system_prompt=enhanced_prompt,
model_name=model_name,
max_loops=1,
*args,
@ -144,6 +160,7 @@ class AgentJudge:
f"--- TASK ---\n{task}\n--- END TASK ---\n\n"
f"Please provide your improved and fully revised output below."
)
return agent.run(task=improved_prompt, img=img)
except Exception as e:
raise AgentJudgeFeedbackCycleError(
@ -207,6 +224,7 @@ class AgentJudge:
# Single task evaluation
evaluation = judge.step(task="The answer is 42.")
# Multiple tasks evaluation
evaluation = judge.step(tasks=[
"Response 1: Paris is the capital of France",
@ -228,20 +246,29 @@ class AgentJudge:
prompt = task
else:
raise ValueError("No tasks or task provided")
# Add the evaluation criteria to the task description
task_instruction = "You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. "
task_instruction += "Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. "
task_instruction += "Your feedback should address the following points:\n"
task_instruction += "1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.\n"
task_instruction += "2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.\n"
task_instruction += "3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. "
task_instruction += "This may include advice on reasoning, structure, completeness, or style.\n"
task_instruction += "4. If relevant, point out any factual inaccuracies or logical inconsistencies.\n"
# Append the evaluation criteria to the task instruction
if self.evaluation_criteria:
criteria_names = list(self.evaluation_criteria.keys())
task_instruction += "\nPlease use these specific evaluation criteria with their respective weights:\n"
for criterion, weight in self.evaluation_criteria.items():
task_instruction += f"- {criterion}: weight = {weight}\n"
task_instruction += "Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.\n\n"
task_instruction += f"Output(s) to evaluate:\n{prompt}\n"
response = self.agent.run(
task=(
"You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. "
"Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. "
"Your feedback should address the following points:\n"
"1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.\n"
"2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.\n"
"3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. "
"This may include advice on reasoning, structure, completeness, or style.\n"
"4. If relevant, point out any factual inaccuracies or logical inconsistencies.\n"
"Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.\n\n"
f"Output(s) to evaluate:\n{prompt}\n"
),
task=task_instruction,
img=img,
)
@ -330,6 +357,7 @@ class AgentJudge:
tasks=contextualized_tasks,
img=img,
)
responses.append(current_response)
# Update context for next iteration
@ -360,6 +388,7 @@ class AgentJudge:
this doesn't build context between different tasks - each is evaluated
independently.
Args:
tasks (List[str], optional): A list of tasks/outputs to be evaluated.
imgs (List[str], optional): A list of image paths corresponding to each task.
@ -370,6 +399,7 @@ class AgentJudge:
list contains the responses from all iterations (max_loops)
for that particular task.
Example:
```python
# Batch evaluation with images
@ -383,7 +413,6 @@ class AgentJudge:
"chart1.png",
"graph1.png"
]
evaluations = judge.run_batched(tasks=tasks, imgs=images)
# Returns evaluations for each task-image pair
@ -395,6 +424,7 @@ class AgentJudge:
])
```
Note:
- Each task is processed independently
- If imgs is provided, it must have the same length as tasks
@ -405,4 +435,6 @@ class AgentJudge:
for task, img in zip(tasks, imgs):
response = self.run(task=task, img=img)
responses.append(response)
return responses
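
For reference, below is a minimal, hypothetical sketch of combining the new evaluation_criteria parameter with run_batched. It is not part of this commit; the task strings are placeholders (the image paths mirror the docstring example above), and, as the docstring notes, imgs must have the same length as tasks.
```python
# Hypothetical sketch (not part of this commit): batch evaluation with weighted criteria
from swarms.agents.agent_judge import AgentJudge

batch_judge = AgentJudge(
    model_name="claude-3-7-sonnet-20250219",
    evaluation_criteria={"accuracy": 0.6, "completeness": 0.4},
)

tasks = [
    "Analyze the chart and identify trends",  # placeholder task
    "Summarize the graph's main finding",     # placeholder task
]
images = ["chart1.png", "graph1.png"]  # placeholder image paths

# Each task/image pair is evaluated independently (no shared context),
# so the result is one list of per-loop responses per task.
batch_evaluations = batch_judge.run_batched(tasks=tasks, imgs=images)
for task_evaluations in batch_evaluations:
    print(task_evaluations[0])
```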
