Update agent_judge.py to resolve conflicts

pull/958/head
王祥宇 5 days ago
parent 950184b5c5
commit a0fadf2874

@@ -1,54 +1,105 @@
import traceback
from typing import List, Optional, Union, Dict
import uuid
from swarms.prompts.agent_judge_prompt import AGENT_JUDGE_PROMPT
from swarms.structs.agent import Agent
from swarms.structs.conversation import Conversation
from swarms.utils.any_to_str import any_to_str
from loguru import logger
class AgentJudgeInitializationError(Exception):
"""
Exception raised when there is an error initializing the AgentJudge.
"""
pass
class AgentJudgeExecutionError(Exception):
"""
Exception raised when there is an error executing the AgentJudge.
"""
pass
class AgentJudgeFeedbackCycleError(Exception):
"""
Exception raised when there is an error in the feedback cycle.
"""
pass
class AgentJudge:
"""
A specialized agent designed to evaluate and judge outputs from other agents or systems.
The AgentJudge acts as a quality control mechanism, providing objective assessments
and feedback on various types of content, decisions, or outputs. It's based on research
in LLM-based evaluation systems and can maintain context across multiple evaluations.
This implementation supports both single task evaluation and batch processing with
iterative refinement capabilities.
Attributes:
id (str): Unique identifier for the judge agent instance.
agent_name (str): The name of the agent judge.
system_prompt (str): The system prompt for the agent containing evaluation instructions.
model_name (str): The model name used for generating evaluations (e.g., "openai/o1", "gpt-4").
conversation (Conversation): An instance of the Conversation class to manage conversation history.
max_loops (int): The maximum number of evaluation iterations to run.
verbose (bool): Whether to enable verbose logging.
agent (Agent): An instance of the Agent class that performs the evaluation execution.
evaluation_criteria (Dict[str, float]): Dictionary of evaluation criteria and their weights.
Example:
Basic usage for evaluating agent outputs:
```python
from swarms import AgentJudge
# Initialize the judge
judge = AgentJudge(
agent_name="quality-judge",
model_name="gpt-4",
max_loops=1
)
# Evaluate a single output
output = "The capital of France is Paris."
evaluation = judge.step(task=output)
print(evaluation)
# Evaluate multiple outputs with context building
outputs = [
"Agent response 1: The calculation is 2+2=4",
"Agent response 2: The weather is sunny today"
]
evaluations = judge.run(tasks=outputs)
```
Methods:
step(task: str = None, tasks: List[str] = None, img: str = None) -> str:
Processes a single task or list of tasks and returns the agent's evaluation.
run(task: str = None, tasks: List[str] = None, img: str = None) -> List[str]:
Executes evaluation in a loop with context building, collecting responses.
run_batched(tasks: List[str] = None, imgs: List[str] = None) -> List[str]:
Executes batch evaluation of tasks with corresponding images.
"""
def __init__(
self,
agent_name: str = "agent-judge-01",
id: str = str(uuid.uuid4()),
agent_name: str = "Agent Judge",
description: str = "You're an expert AI agent judge. Carefully review the following output(s) generated by another agent. Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance.",
system_prompt: str = AGENT_JUDGE_PROMPT,
model_name: str = "openai/o1",
max_loops: int = 1,
verbose: bool = False,
evaluation_criteria: Optional[Dict[str, float]] = None,
) -> None:
"""
Initializes the AgentJudge with the specified parameters.
Args:
agent_name (str): The name of the agent judge.
system_prompt (str): The system prompt for the agent.
model_name (str): The model name used for generating responses.
max_loops (int): The maximum number of iterations to run the tasks.
evaluation_criteria (Optional[Dict[str, float]]): Dictionary of evaluation criteria
and their weights. Keys are criteria names, values are weights.
Example: {"correctness": 0.4, "efficiency": 0.3, "clarity": 0.3}
"""
*args,
**kwargs,
):
self.id = id
self.agent_name = agent_name
self.system_prompt = system_prompt
self.model_name = model_name
self.conversation = Conversation(time_enabled=False)
self.max_loops = max_loops
self.verbose = verbose
self.evaluation_criteria = evaluation_criteria or {}
# Enhance system prompt with evaluation criteria if provided
@@ -58,78 +109,307 @@ class AgentJudge:
for criterion, weight in self.evaluation_criteria.items():
criteria_str += f"- {criterion}: weight = {weight}\n"
enhanced_prompt += criteria_str
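# e.g. evaluation_criteria={"correctness": 0.5, "clarity": 0.5} adds lines like:
# - correctness: weight = 0.5
# - clarity: weight = 0.5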
self.agent = Agent(
agent_name=agent_name,
agent_description="You're the agent judge",
agent_description=description,
system_prompt=enhanced_prompt,
model_name=model_name,
max_loops=1,
*args,
**kwargs,
)
def feedback_cycle_step(
self,
agent: Union[Agent, callable],
task: str,
img: Optional[str] = None,
):
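"""
Runs a single agent-judge feedback round: the worker agent produces an output for the task, this judge critiques that output, and the agent is run again with the critique to produce a revised output.
Args:
agent (Union[Agent, callable]): The agent whose output is evaluated and revised.
task (str): The original task given to the agent.
img (str, optional): Path to an image file for multimodal tasks.
Returns:
The agent's revised output after incorporating the judge's feedback.
"""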
try:
# First run the main agent
agent_output = agent.run(task=task, img=img)
# Then run the judge agent
judge_output = self.run(task=agent_output, img=img)
# Run the main agent again with the judge's feedback, using a much improved prompt
improved_prompt = (
f"You have received the following detailed feedback from the expert agent judge ({self.agent_name}):\n\n"
f"--- FEEDBACK START ---\n{judge_output}\n--- FEEDBACK END ---\n\n"
f"Your task is to thoughtfully revise and enhance your previous output based on this critique. "
f"Carefully address all identified weaknesses, incorporate the suggestions, and strive to maximize the strengths noted. "
f"Be specific, accurate, and actionable in your improvements. "
f"Here is the original task for reference:\n\n"
f"--- TASK ---\n{task}\n--- END TASK ---\n\n"
f"Please provide your improved and fully revised output below."
)
return agent.run(task=improved_prompt, img=img)
except Exception as e:
raise AgentJudgeFeedbackCycleError(
f"Error In Agent Judge Feedback Cycle: {e} Traceback: {traceback.format_exc()}"
)
def feedback_cycle(
self,
agent: Union[Agent, callable],
task: str,
img: Optional[str] = None,
loops: int = 1,
):
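"""
Runs feedback_cycle_step for the requested number of loops, always starting from the original task, and collects the revised output produced in each iteration.
Args:
agent (Union[Agent, callable]): The agent to run and refine.
task (str): The original task to evaluate and improve.
img (str, optional): Path to an image file for multimodal tasks.
loops (int): Number of feedback iterations to perform.
Returns:
A list containing the revised output from each iteration.
"""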
loop = 0
original_task = task # Preserve the original task
current_output = None # Track the current output
all_outputs = [] # Collect all outputs from each iteration
while loop < loops:
# Run the standard feedback cycle step, always starting from the original task
current_output = self.feedback_cycle_step(
agent, original_task, img
)
# Add the current output to our collection
all_outputs.append(current_output)
loop += 1
return all_outputs
def step(
self,
task: str = None,
tasks: Optional[List[str]] = None,
img: Optional[str] = None,
) -> str:
"""
Processes a single task or list of tasks and returns the agent's evaluation.
This method performs a one-shot evaluation of the provided content. It takes
either a single task string or a list of tasks and generates a comprehensive
evaluation with strengths, weaknesses, and improvement suggestions.
Args:
task (str, optional): A single task/output to be evaluated.
tasks (List[str], optional): A list of tasks/outputs to be evaluated.
img (str, optional): Path to an image file for multimodal evaluation.
Returns:
str: A detailed evaluation response from the agent including:
- Strengths: What the agent/output did well
- Weaknesses: Areas that need improvement
- Suggestions: Specific recommendations for improvement
- Factual accuracy assessment
Raises:
ValueError: If neither task nor tasks are provided.
Example:
```python
# Single task evaluation
evaluation = judge.step(task="The answer is 42.")
# Multiple tasks evaluation
evaluation = judge.step(tasks=[
"Response 1: Paris is the capital of France",
"Response 2: 2 + 2 = 5" # Incorrect
])
# Multimodal evaluation
evaluation = judge.step(
task="Describe this image",
img="path/to/image.jpg"
)
```
"""
try:
prompt = ""
if tasks:
prompt = any_to_str(tasks)
elif task:
prompt = task
else:
raise ValueError("No tasks or task provided")
# Add the evaluation criteria to the task description
task_instruction = "You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. "
task_instruction += "Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. "
task_instruction += "Your feedback should address the following points:\n"
task_instruction += "1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.\n"
task_instruction += "2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.\n"
task_instruction += "3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. "
task_instruction += "This may include advice on reasoning, structure, completeness, or style.\n"
task_instruction += "4. If relevant, point out any factual inaccuracies or logical inconsistencies.\n"
# Add the evaluation criteria to the task instruction
if self.evaluation_criteria:
criteria_names = list(self.evaluation_criteria.keys())
task_instruction += "\nPlease use these specific evaluation criteria with their respective weights:\n"
for criterion, weight in self.evaluation_criteria.items():
task_instruction += f"- {criterion}: weight = {weight}\n"
task_instruction += "Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.\n\n"
task_instruction += f"Output(s) to evaluate:\n{prompt}\n"
logger.debug(f"Received response: {response}")
return response
response = self.agent.run(
task=task_instruction,
img=img,
)
return response
except Exception as e:
error_message = (
f"AgentJudge encountered an error: {e}\n"
f"Traceback:\n{traceback.format_exc()}\n\n"
"If this issue persists, please:\n"
"- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n"
"- Join our Discord for real-time support: swarms.ai\n"
"- Or book a call: https://cal.com/swarms\n"
)
raise AgentJudgeExecutionError(error_message)
def run(
self,
task: str = None,
tasks: Optional[List[str]] = None,
img: Optional[str] = None,
):
"""
Executes evaluation in multiple iterations with context building and refinement.
This method runs the evaluation process for the specified number of max_loops,
where each iteration builds upon the previous context. This allows for iterative
refinement of evaluations and deeper analysis over multiple passes.
Args:
task (str, optional): A single task/output to be evaluated.
tasks (List[str], optional): A list of tasks/outputs to be evaluated.
img (str, optional): Path to an image file for multimodal evaluation.
Returns:
List[str]: A list of evaluation responses, one for each iteration.
Each subsequent evaluation includes context from previous iterations.
Example:
```python
# Single task with iterative refinement
judge = AgentJudge(max_loops=3)
evaluations = judge.run(task="Agent output to evaluate")
# Returns 3 evaluations, each building on the previous
# Multiple tasks with context building
evaluations = judge.run(tasks=[
"First agent response",
"Second agent response"
])
# With image analysis
evaluations = judge.run(
task="Analyze this chart",
img="chart.png"
)
```
Note:
- The first iteration evaluates the original task(s)
- Subsequent iterations include context from previous evaluations
- This enables deeper analysis and refinement of judgments
- Useful for complex evaluations requiring multiple perspectives
"""
try:
responses = []
context = ""
# Convert single task to list for consistent processing
if task and not tasks:
tasks = [task]
task = None # Clear to avoid confusion in step method
for _ in range(self.max_loops):
# Add context to the tasks if available
if context and tasks:
contextualized_tasks = [
f"Previous context: {context}\nTask: {t}"
for t in tasks
]
else:
contextualized_tasks = tasks
# Get response for current iteration
current_response = self.step(
task=task,
tasks=contextualized_tasks,
img=img,
)
responses.append(current_response)
# Update context for next iteration
context = current_response
return responses
except Exception as e:
error_message = (
f"AgentJudge encountered an error: {e}\n"
f"Traceback:\n{traceback.format_exc()}\n\n"
"If this issue persists, please:\n"
"- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n"
"- Join our Discord for real-time support: swarms.ai\n"
"- Or book a call: https://cal.com/swarms\n"
)
raise AgentJudgeExecutionError(error_message)
def run_batched(
self,
tasks: Optional[List[str]] = None,
imgs: Optional[List[str]] = None,
):
"""
Executes batch evaluation of multiple tasks with corresponding images.
This method processes multiple task-image pairs independently, where each
task can be evaluated with its corresponding image. Unlike the run() method,
this doesn't build context between different tasks - each is evaluated
independently.
Args:
tasks (List[str], optional): A list of tasks/outputs to be evaluated.
imgs (List[str], optional): A list of image paths corresponding to each task.
Must be the same length as tasks if provided.
Returns:
List[List[str]]: A list of evaluation responses for each task. Each inner
list contains the responses from all iterations (max_loops)
for that particular task.
Example:
```python
# Batch evaluation with images
tasks = [
"Describe what you see in this image",
"What's wrong with this chart?",
"Analyze the trends shown"
]
images = [
"photo1.jpg",
"chart1.png",
"graph1.png"
]
evaluations = judge.run_batched(tasks=tasks, imgs=images)
# Returns evaluations for each task-image pair
# Batch evaluation without images
evaluations = judge.run_batched(tasks=[
"Agent response 1",
"Agent response 2",
"Agent response 3"
])
```
Note:
- Each task is processed independently
- If imgs is provided, it must have the same length as tasks
- Each task goes through max_loops iterations independently
- No context is shared between different tasks in the batch
"""
responses = []
# Default to no image per task when imgs is not provided
if imgs is None:
imgs = [None] * len(tasks)
for task, img in zip(tasks, imgs):
response = self.run(task=task, img=img)
responses.append(response)
return responses
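A minimal end-to-end sketch of the surface touched by this change (evaluation_criteria, step, feedback_cycle, run_batched); the worker agent configuration, criteria weights, and example tasks below are illustrative assumptions, not part of this commit:

```python
from swarms import Agent, AgentJudge

# Judge with weighted criteria (weights here are illustrative assumptions)
judge = AgentJudge(
    agent_name="quality-judge",
    model_name="gpt-4",
    max_loops=1,
    evaluation_criteria={"correctness": 0.5, "clarity": 0.3, "completeness": 0.2},
)

# Hypothetical worker agent to be critiqued and refined
worker = Agent(
    agent_name="research-worker",
    model_name="gpt-4",
    max_loops=1,
)

# One-shot evaluation of a single output
print(judge.step(task="The capital of France is Paris."))

# Critique-and-revise loop: returns one revised output per iteration
revised = judge.feedback_cycle(
    agent=worker,
    task="Summarize the causes of the 2008 financial crisis in 3 bullet points.",
    loops=2,
)
print(revised[-1])

# Independent batch evaluation (each task judged separately, no shared context)
print(judge.run_batched(
    tasks=["Agent response 1", "Agent response 2"],
    imgs=[None, None],
))
```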