parent e1149cbf02
commit ac212fe31a
@@ -1,100 +0,0 @@
-"""
-Agent Judge with Evaluation Criteria Example
-
-This example demonstrates how to use the AgentJudge with custom evaluation criteria.
-The evaluation_criteria parameter allows specifying different criteria with weights
-for more targeted and customizable evaluation of agent outputs.
-"""
-
-from swarms.agents.agent_judge import AgentJudge
-import os
-from dotenv import load_dotenv
-
-load_dotenv()
-
-# Example 1: Basic usage with evaluation criteria
-print("\n=== Example 1: Using Custom Evaluation Criteria ===\n")
-
-# Create an AgentJudge with custom evaluation criteria
-judge = AgentJudge(
-    model_name="claude-3-7-sonnet-20250219",  # Use any available model
-    evaluation_criteria={
-        "correctness": 0.5,
-        "problem_solving_approach": 0.3,
-        "explanation_clarity": 0.2
-    }
-)
-
-# Sample output to evaluate
-task_response = [
-    "Task: Determine the time complexity of a binary search algorithm and explain your reasoning.\n\n"
-    "Agent response: The time complexity of binary search is O(log n). In each step, "
-    "we divide the search space in half, resulting in a logarithmic relationship between "
-    "the input size and the number of operations. This can be proven by solving the "
-    "recurrence relation T(n) = T(n/2) + O(1), which gives us T(n) = O(log n)."
-]
-
-# Run evaluation
-evaluation = judge.run(task_response)
-print(evaluation[0])
-
-# Example 2: Specialized criteria for code evaluation
-print("\n=== Example 2: Code Evaluation with Specialized Criteria ===\n")
-
-code_judge = AgentJudge(
-    model_name="claude-3-7-sonnet-20250219",
-    agent_name="code_judge",
-    evaluation_criteria={
-        "code_correctness": 0.4,
-        "code_efficiency": 0.3,
-        "code_readability": 0.3
-    }
-)
-
-# Sample code to evaluate
-code_response = [
-    "Task: Write a function to find the maximum subarray sum in an array of integers.\n\n"
-    "Agent response:\n```python\n"
-    "def max_subarray_sum(arr):\n"
-    "    current_sum = max_sum = arr[0]\n"
-    "    for i in range(1, len(arr)):\n"
-    "        current_sum = max(arr[i], current_sum + arr[i])\n"
-    "        max_sum = max(max_sum, current_sum)\n"
-    "    return max_sum\n\n"
-    "# Example usage\n"
-    "print(max_subarray_sum([-2, 1, -3, 4, -1, 2, 1, -5, 4]))  # Output: 6 (subarray [4, -1, 2, 1])\n"
-    "```\n"
-    "This implementation uses Kadane's algorithm which has O(n) time complexity and "
-    "O(1) space complexity, making it optimal for this problem."
-]
-
-code_evaluation = code_judge.run(code_response)
-print(code_evaluation[0])
-
-# Example 3: Comparing multiple responses
-print("\n=== Example 3: Comparing Multiple Agent Responses ===\n")
-
-comparison_judge = AgentJudge(
-    model_name="claude-3-7-sonnet-20250219",
-    evaluation_criteria={
-        "accuracy": 0.6,
-        "completeness": 0.4
-    }
-)
-
-multiple_responses = comparison_judge.run([
-    "Task: Explain the CAP theorem in distributed systems.\n\n"
-    "Agent A response: CAP theorem states that a distributed system cannot simultaneously "
-    "provide Consistency, Availability, and Partition tolerance. In practice, you must "
-    "choose two out of these three properties.",
-
-    "Task: Explain the CAP theorem in distributed systems.\n\n"
-    "Agent B response: The CAP theorem, formulated by Eric Brewer, states that in a "
-    "distributed data store, you can only guarantee two of the following three properties: "
-    "Consistency (all nodes see the same data at the same time), Availability (every request "
-    "receives a response), and Partition tolerance (the system continues to operate despite "
-    "network failures). Most modern distributed systems choose to sacrifice consistency in "
-    "favor of availability and partition tolerance, implementing eventual consistency models instead."
-])
-
-print(multiple_responses[0])
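Since both this example and the `evaluation_criteria` constructor argument are removed by this commit, the sketch below shows one way a caller could approximate the same weighted-criteria evaluation with the simplified `AgentJudge` that survives in the next hunk. This is a minimal sketch, not part of the commit: the constructor arguments and the `run(tasks: List[str])` signature are taken from the new version shown below, the criteria names and model name come from the deleted example, and the weighting is expressed only as prompt text for the judge model to follow.

```python
from swarms.agents.agent_judge import AgentJudge

# Criteria and weights reused from the deleted Example 1 above.
criteria = {
    "correctness": 0.5,
    "problem_solving_approach": 0.3,
    "explanation_clarity": 0.2,
}
criteria_block = "\n".join(
    f"- {name}: weight = {weight}" for name, weight in criteria.items()
)

# Simplified constructor retained by this commit (agent_name, system_prompt,
# model_name, max_loops); there is no evaluation_criteria parameter anymore.
judge = AgentJudge(
    agent_name="agent-judge-01",
    model_name="claude-3-7-sonnet-20250219",
    max_loops=1,
)

task_response = (
    "Task: Determine the time complexity of a binary search algorithm.\n\n"
    "Agent response: O(log n), because the search space is halved at each step."
)

# Fold the weighted criteria into the task text instead of the constructor.
evaluation = judge.run(
    [f"Use these weighted evaluation criteria:\n{criteria_block}\n\n{task_response}"]
)
print(evaluation[0])
```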
@@ -1,415 +1,119 @@
-import traceback
-from typing import List, Optional, Union, Dict
-import uuid
+from typing import List
 
 from swarms.prompts.agent_judge_prompt import AGENT_JUDGE_PROMPT
 from swarms.structs.agent import Agent
 from swarms.structs.conversation import Conversation
 from swarms.utils.any_to_str import any_to_str
 
+from loguru import logger
+
-class AgentJudgeInitializationError(Exception):
-    """
-    Exception raised when there is an error initializing the AgentJudge.
-    """
-    pass
-
-class AgentJudgeExecutionError(Exception):
-    """
-    Exception raised when there is an error executing the AgentJudge.
-    """
-    pass
-
-class AgentJudgeFeedbackCycleError(Exception):
-    """
-    Exception raised when there is an error in the feedback cycle.
-    """
-    pass
 
 class AgentJudge:
     """
-    A specialized agent designed to evaluate and judge outputs from other agents or systems.
-    The AgentJudge acts as a quality control mechanism, providing objective assessments
-    and feedback on various types of content, decisions, or outputs. It's based on research
-    in LLM-based evaluation systems and can maintain context across multiple evaluations.
-    This implementation supports both single task evaluation and batch processing with
-    iterative refinement capabilities.
+    A class to represent an agent judge that processes tasks and generates responses.
 
     Attributes:
-        id (str): Unique identifier for the judge agent instance.
         agent_name (str): The name of the agent judge.
-        system_prompt (str): The system prompt for the agent containing evaluation instructions.
-        model_name (str): The model name used for generating evaluations (e.g., "openai/o1", "gpt-4").
+        system_prompt (str): The system prompt for the agent.
+        model_name (str): The model name used for generating responses.
         conversation (Conversation): An instance of the Conversation class to manage conversation history.
-        max_loops (int): The maximum number of evaluation iterations to run.
-        verbose (bool): Whether to enable verbose logging.
-        agent (Agent): An instance of the Agent class that performs the evaluation execution.
-        evaluation_criteria (Dict[str, float]): Dictionary of evaluation criteria and their weights.
-
-    Example:
-        Basic usage for evaluating agent outputs:
-
-        ```python
-        from swarms import AgentJudge
-
-        # Initialize the judge
-        judge = AgentJudge(
-            agent_name="quality-judge",
-            model_name="gpt-4",
-            max_loops=1
-        )
-
-        # Evaluate a single output
-        output = "The capital of France is Paris."
-        evaluation = judge.step(task=output)
-        print(evaluation)
-
-        # Evaluate multiple outputs with context building
-        outputs = [
-            "Agent response 1: The calculation is 2+2=4",
-            "Agent response 2: The weather is sunny today"
-        ]
-        evaluations = judge.run(tasks=outputs)
-        ```
+        max_loops (int): The maximum number of iterations to run the tasks.
+        agent (Agent): An instance of the Agent class that performs the task execution.
 
     Methods:
-        step(task: str = None, tasks: List[str] = None, img: str = None) -> str:
-            Processes a single task or list of tasks and returns the agent's evaluation.
-        run(task: str = None, tasks: List[str] = None, img: str = None) -> List[str]:
-            Executes evaluation in a loop with context building, collecting responses.
-        run_batched(tasks: List[str] = None, imgs: List[str] = None) -> List[str]:
-            Executes batch evaluation of tasks with corresponding images.
+        step(tasks: List[str]) -> str:
+            Processes a list of tasks and returns the agent's response.
+        run(tasks: List[str]) -> List[str]:
+            Executes the tasks in a loop, updating context and collecting responses.
     """
 
     def __init__(
         self,
-        id: str = str(uuid.uuid4()),
-        agent_name: str = "Agent Judge",
-        description: str = "You're an expert AI agent judge. Carefully review the following output(s) generated by another agent. Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance.",
+        agent_name: str = "agent-judge-01",
         system_prompt: str = AGENT_JUDGE_PROMPT,
         model_name: str = "openai/o1",
         max_loops: int = 1,
-        verbose: bool = False,
-        evaluation_criteria: Optional[Dict[str, float]] = None,
-        *args,
-        **kwargs,
-    ):
-        self.id = id
+    ) -> None:
+        """
+        Initializes the AgentJudge with the specified parameters.
+
+        Args:
+            agent_name (str): The name of the agent judge.
+            system_prompt (str): The system prompt for the agent.
+            model_name (str): The model name used for generating responses.
+            max_loops (int): The maximum number of iterations to run the tasks.
+        """
         self.agent_name = agent_name
         self.system_prompt = system_prompt
         self.model_name = model_name
         self.conversation = Conversation(time_enabled=False)
         self.max_loops = max_loops
-        self.verbose = verbose
-        self.evaluation_criteria = evaluation_criteria or {}
-
-        # Enhance system prompt with evaluation criteria if provided
-        enhanced_prompt = system_prompt
-        if self.evaluation_criteria:
-            criteria_str = "\n\nEvaluation Criteria:\n"
-            for criterion, weight in self.evaluation_criteria.items():
-                criteria_str += f"- {criterion}: weight = {weight}\n"
-            enhanced_prompt += criteria_str
 
         self.agent = Agent(
             agent_name=agent_name,
-            agent_description=description,
-            system_prompt=enhanced_prompt,
+            agent_description="You're the agent judge",
+            system_prompt=AGENT_JUDGE_PROMPT,
             model_name=model_name,
             max_loops=1,
-            *args,
-            **kwargs,
         )
 
-    def feedback_cycle_step(
-        self,
-        agent: Union[Agent, callable],
-        task: str,
-        img: Optional[str] = None,
-    ):
-        try:
-            # First run the main agent
-            agent_output = agent.run(task=task, img=img)
-
-            # Then run the judge agent
-            judge_output = self.run(task=agent_output, img=img)
-
-            # Run the main agent again with the judge's feedback, using a much improved prompt
-            improved_prompt = (
-                f"You have received the following detailed feedback from the expert agent judge ({self.agent_name}):\n\n"
-                f"--- FEEDBACK START ---\n{judge_output}\n--- FEEDBACK END ---\n\n"
-                f"Your task is to thoughtfully revise and enhance your previous output based on this critique. "
-                f"Carefully address all identified weaknesses, incorporate the suggestions, and strive to maximize the strengths noted. "
-                f"Be specific, accurate, and actionable in your improvements. "
-                f"Here is the original task for reference:\n\n"
-                f"--- TASK ---\n{task}\n--- END TASK ---\n\n"
-                f"Please provide your improved and fully revised output below."
-            )
-
-            return agent.run(task=improved_prompt, img=img)
-        except Exception as e:
-            raise AgentJudgeFeedbackCycleError(
-                f"Error In Agent Judge Feedback Cycle: {e} Traceback: {traceback.format_exc()}"
-            )
-
-    def feedback_cycle(
-        self,
-        agent: Union[Agent, callable],
-        task: str,
-        img: Optional[str] = None,
-        loops: int = 1,
-    ):
-        loop = 0
-        original_task = task  # Preserve the original task
-        current_output = None  # Track the current output
-        all_outputs = []  # Collect all outputs from each iteration
-
-        while loop < loops:
-            # First iteration: run the standard feedback cycle step
-            current_output = self.feedback_cycle_step(
-                agent, original_task, img
-            )
-
-            # Add the current output to our collection
-            all_outputs.append(current_output)
-            loop += 1
-
-        return all_outputs
-
-    def step(
-        self,
-        task: str = None,
-        tasks: Optional[List[str]] = None,
-        img: Optional[str] = None,
-    ) -> str:
+    def step(self, tasks: List[str]) -> str:
"""
|
"""
|
||||||
Processes a single task or list of tasks and returns the agent's evaluation.
|
Processes a list of tasks and returns the agent's response.
|
||||||
This method performs a one-shot evaluation of the provided content. It takes
|
|
||||||
either a single task string or a list of tasks and generates a comprehensive
|
|
||||||
evaluation with strengths, weaknesses, and improvement suggestions.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
task (str, optional): A single task/output to be evaluated.
|
tasks (List[str]): A list of tasks to be processed.
|
||||||
tasks (List[str], optional): A list of tasks/outputs to be evaluated.
|
|
||||||
img (str, optional): Path to an image file for multimodal evaluation.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: A detailed evaluation response from the agent including:
|
str: The response generated by the agent.
|
||||||
- Strengths: What the agent/output did well
|
|
||||||
- Weaknesses: Areas that need improvement
|
|
||||||
- Suggestions: Specific recommendations for improvement
|
|
||||||
- Factual accuracy assessment
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ValueError: If neither task nor tasks are provided.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
```python
|
|
||||||
# Single task evaluation
|
|
||||||
evaluation = judge.step(task="The answer is 42.")
|
|
||||||
|
|
||||||
# Multiple tasks evaluation
|
|
||||||
evaluation = judge.step(tasks=[
|
|
||||||
"Response 1: Paris is the capital of France",
|
|
||||||
"Response 2: 2 + 2 = 5" # Incorrect
|
|
||||||
])
|
|
||||||
|
|
||||||
# Multimodal evaluation
|
|
||||||
evaluation = judge.step(
|
|
||||||
task="Describe this image",
|
|
||||||
img="path/to/image.jpg"
|
|
||||||
)
|
|
||||||
```
|
|
||||||
"""
|
"""
|
||||||
-        try:
-            prompt = ""
-            if tasks:
-                prompt = any_to_str(tasks)
-            elif task:
-                prompt = task
-            else:
-                raise ValueError("No tasks or task provided")
-
-            # Add the evaluation criteria to the task description
-            task_instruction = "You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. "
-            task_instruction += "Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. "
-            task_instruction += "Your feedback should address the following points:\n"
-            task_instruction += "1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.\n"
-            task_instruction += "2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.\n"
-            task_instruction += "3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. "
-            task_instruction += "This may include advice on reasoning, structure, completeness, or style.\n"
-            task_instruction += "4. If relevant, point out any factual inaccuracies or logical inconsistencies.\n"
-
-            # Add the evaluation criteria to the task instruction
-            if self.evaluation_criteria:
-                criteria_names = list(self.evaluation_criteria.keys())
-                task_instruction += "\nPlease use these specific evaluation criteria with their respective weights:\n"
-                for criterion, weight in self.evaluation_criteria.items():
-                    task_instruction += f"- {criterion}: weight = {weight}\n"
-
-            task_instruction += "Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.\n\n"
-            task_instruction += f"Output(s) to evaluate:\n{prompt}\n"
-
-            response = self.agent.run(
-                task=task_instruction,
-                img=img,
-            )
-
-            return response
-        except Exception as e:
-            error_message = (
-                f"AgentJudge encountered an error: {e}\n"
-                f"Traceback:\n{traceback.format_exc()}\n\n"
-                "If this issue persists, please:\n"
-                "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n"
-                "- Join our Discord for real-time support: swarms.ai\n"
-                "- Or book a call: https://cal.com/swarms\n"
-            )
-            raise AgentJudgeExecutionError(error_message)
+        prompt = any_to_str(tasks)
+        logger.debug(f"Running step with prompt: {prompt}")
+
+        print(prompt)
+
+        response = self.agent.run(
+            task=f"Evaluate the following output or outputs: {prompt}"
+        )
+        logger.debug(f"Received response: {response}")
+
+        return response
 
-    def run(
-        self,
-        task: str = None,
-        tasks: Optional[List[str]] = None,
-        img: Optional[str] = None,
-    ):
+    def run(self, tasks: List[str]) -> List[str]:
         """
-        Executes evaluation in multiple iterations with context building and refinement.
-
-        This method runs the evaluation process for the specified number of max_loops,
-        where each iteration builds upon the previous context. This allows for iterative
-        refinement of evaluations and deeper analysis over multiple passes.
+        Executes the tasks in a loop, updating context and collecting responses.
 
         Args:
-            task (str, optional): A single task/output to be evaluated.
-            tasks (List[str], optional): A list of tasks/outputs to be evaluated.
-            img (str, optional): Path to an image file for multimodal evaluation.
+            tasks (List[str]): A list of tasks to be executed.
 
         Returns:
-            List[str]: A list of evaluation responses, one for each iteration.
-                Each subsequent evaluation includes context from previous iterations.
-
-        Example:
-            ```python
-            # Single task with iterative refinement
-            judge = AgentJudge(max_loops=3)
-            evaluations = judge.run(task="Agent output to evaluate")
-            # Returns 3 evaluations, each building on the previous
-
-            # Multiple tasks with context building
-            evaluations = judge.run(tasks=[
-                "First agent response",
-                "Second agent response"
-            ])
-
-            # With image analysis
-            evaluations = judge.run(
-                task="Analyze this chart",
-                img="chart.png"
-            )
-            ```
-
-        Note:
-            - The first iteration evaluates the original task(s)
-            - Subsequent iterations include context from previous evaluations
-            - This enables deeper analysis and refinement of judgments
-            - Useful for complex evaluations requiring multiple perspectives
+            List[str]: A list of responses generated by the agent for each iteration.
         """
-        try:
-            responses = []
-            context = ""
-
-            # Convert single task to list for consistent processing
-            if task and not tasks:
-                tasks = [task]
-                task = None  # Clear to avoid confusion in step method
-
-            for _ in range(self.max_loops):
-                # Add context to the tasks if available
-                if context and tasks:
-                    contextualized_tasks = [
-                        f"Previous context: {context}\nTask: {t}"
-                        for t in tasks
-                    ]
-                else:
-                    contextualized_tasks = tasks
-
-                # Get response for current iteration
-                current_response = self.step(
-                    task=task,
-                    tasks=contextualized_tasks,
-                    img=img,
-                )
-
-                responses.append(current_response)
-
-                # Update context for next iteration
-                context = current_response
-
-            return responses
-        except Exception as e:
-            error_message = (
-                f"AgentJudge encountered an error: {e}\n"
-                f"Traceback:\n{traceback.format_exc()}\n\n"
-                "If this issue persists, please:\n"
-                "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n"
-                "- Join our Discord for real-time support: swarms.ai\n"
-                "- Or book a call: https://cal.com/swarms\n"
-            )
-            raise AgentJudgeExecutionError(error_message)
+        responses = []
+        context = ""
+
+        for _ in range(self.max_loops):
+            # Add context to the tasks if available
+            if context:
+                contextualized_tasks = [
+                    f"Previous context: {context}\nTask: {task}"
+                    for task in tasks
+                ]
+            else:
+                contextualized_tasks = tasks
+
+            # Get response for current iteration
+            current_response = self.step(contextualized_tasks)
+            responses.append(current_response)
+            logger.debug(
+                f"Current response added: {current_response}"
+            )
+
+            # Add to conversation history
+            logger.debug("Added message to conversation history.")
+
+            # Update context for next iteration
+            context = current_response
+
+        return responses
 
-    def run_batched(
-        self,
-        tasks: Optional[List[str]] = None,
-        imgs: Optional[List[str]] = None,
-    ):
-        """
-        Executes batch evaluation of multiple tasks with corresponding images.
-
-        This method processes multiple task-image pairs independently, where each
-        task can be evaluated with its corresponding image. Unlike the run() method,
-        this doesn't build context between different tasks - each is evaluated
-        independently.
-
-        Args:
-            tasks (List[str], optional): A list of tasks/outputs to be evaluated.
-            imgs (List[str], optional): A list of image paths corresponding to each task.
-                Must be the same length as tasks if provided.
-
-        Returns:
-            List[List[str]]: A list of evaluation responses for each task. Each inner
-                list contains the responses from all iterations (max_loops)
-                for that particular task.
-
-        Example:
-            ```python
-            # Batch evaluation with images
-            tasks = [
-                "Describe what you see in this image",
-                "What's wrong with this chart?",
-                "Analyze the trends shown"
-            ]
-            images = [
-                "photo1.jpg",
-                "chart1.png",
-                "graph1.png"
-            ]
-            evaluations = judge.run_batched(tasks=tasks, imgs=images)
-            # Returns evaluations for each task-image pair
-
-            # Batch evaluation without images
-            evaluations = judge.run_batched(tasks=[
-                "Agent response 1",
-                "Agent response 2",
-                "Agent response 3"
-            ])
-            ```
-
-        Note:
-            - Each task is processed independently
-            - If imgs is provided, it must have the same length as tasks
-            - Each task goes through max_loops iterations independently
-            - No context is shared between different tasks in the batch
-        """
-        responses = []
-        for task, img in zip(tasks, imgs):
-            response = self.run(task=task, img=img)
-            responses.append(response)
-        return responses
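For reference, here is a minimal usage sketch of the simplified class as it stands after this commit. It assumes the import path used by the deleted example (`swarms.agents.agent_judge`) and relies only on behavior visible in the new code: `step()` and `run()` take a list of task strings (a single output must be wrapped in a list, and the old `task=`/`img=` keywords are gone), and with `max_loops > 1` each later pass sees the previous response prefixed as `Previous context: ...`.

```python
from swarms.agents.agent_judge import AgentJudge

# Two passes: run() feeds the first critique back in as
# "Previous context: ..." before the second pass.
judge = AgentJudge(model_name="openai/o1", max_loops=2)

outputs_to_review = [
    "Agent response: The capital of France is Paris.",
    "Agent response: 2 + 2 = 5",
]

# Old style: judge.run(task="...", img="...")
# New style: pass a list of strings.
evaluations = judge.run(outputs_to_review)

for i, evaluation in enumerate(evaluations, start=1):
    print(f"--- pass {i} ---")
    print(evaluation)
```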