import traceback
import uuid
from typing import Dict, List, Optional, Union

from loguru import logger

from swarms.prompts.agent_judge_prompt import AGENT_JUDGE_PROMPT
from swarms.structs.agent import Agent
from swarms.structs.conversation import Conversation
from swarms.utils.any_to_str import any_to_str


class AgentJudgeInitializationError(Exception):
    """
    Exception raised when there is an error initializing the AgentJudge.
    """

    pass


class AgentJudgeExecutionError(Exception):
    """
    Exception raised when there is an error executing the AgentJudge.
    """

    pass


class AgentJudgeFeedbackCycleError(Exception):
    """
    Exception raised when there is an error in the feedback cycle.
    """

    pass


class AgentJudge:
    """
    A specialized agent designed to evaluate and judge outputs from other agents or systems.

    The AgentJudge acts as a quality control mechanism, providing objective assessments
    and feedback on various types of content, decisions, or outputs. It is based on research
    in LLM-based evaluation systems and can maintain context across multiple evaluations.
    This implementation supports both single-task evaluation and batch processing, with
    iterative refinement capabilities.

    Attributes:
        id (str): Unique identifier for the judge agent instance.
        agent_name (str): The name of the agent judge.
        system_prompt (str): The system prompt for the agent, containing the evaluation instructions.
        model_name (str): The model name used for generating evaluations (e.g., "openai/o1", "gpt-4").
        conversation (Conversation): An instance of the Conversation class that manages conversation history.
        max_loops (int): The maximum number of evaluation iterations to run.
        verbose (bool): Whether to enable verbose logging.
        agent (Agent): An instance of the Agent class that performs the evaluation execution.
        evaluation_criteria (Dict[str, float]): Dictionary of evaluation criteria and their weights.

    Example:
        Basic usage for evaluating agent outputs:

        ```python
        from swarms import AgentJudge

        # Initialize the judge
        judge = AgentJudge(
            agent_name="quality-judge",
            model_name="gpt-4",
            max_loops=1
        )

        # Evaluate a single output
        output = "The capital of France is Paris."
        evaluation = judge.step(task=output)
        print(evaluation)

        # Evaluate multiple outputs with context building
        outputs = [
            "Agent response 1: The calculation is 2+2=4",
            "Agent response 2: The weather is sunny today"
        ]
        evaluations = judge.run(tasks=outputs)
        ```

    Methods:
        step(task: str = None, tasks: List[str] = None, img: str = None) -> str:
            Processes a single task or list of tasks and returns the agent's evaluation.
        run(task: str = None, tasks: List[str] = None, img: str = None) -> List[str]:
            Executes evaluation in a loop with context building, collecting responses.
        run_batched(tasks: List[str] = None, imgs: List[str] = None) -> List[List[str]]:
            Executes batch evaluation of tasks with corresponding images.
    """

    def __init__(
        self,
        id: Optional[str] = None,
        agent_name: str = "Agent Judge",
        description: str = "You're an expert AI agent judge. Carefully review the following output(s) generated by another agent. Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance.",
        system_prompt: str = AGENT_JUDGE_PROMPT,
        model_name: str = "openai/o1",
        max_loops: int = 1,
        verbose: bool = False,
        evaluation_criteria: Optional[Dict[str, float]] = None,
        *args,
        **kwargs,
    ):
        """
        Initializes the AgentJudge with the specified parameters.

        Args:
            id (str, optional): Unique identifier for the judge agent instance. Generated if not provided.
            agent_name (str): The name of the agent judge.
            description (str): A short description of the judge's role, passed to the underlying agent.
            system_prompt (str): The system prompt for the agent.
            model_name (str): The model name used for generating evaluations.
            max_loops (int): The maximum number of evaluation iterations to run.
            verbose (bool): Whether to enable verbose logging.
            evaluation_criteria (Optional[Dict[str, float]]): Dictionary of evaluation criteria
                and their weights. Keys are criteria names, values are weights.
                Example: {"correctness": 0.4, "efficiency": 0.3, "clarity": 0.3}
        """
        # Generate a fresh id per instance rather than evaluating a default once at definition time
        self.id = id or str(uuid.uuid4())
        self.agent_name = agent_name
        self.system_prompt = system_prompt
        self.model_name = model_name
        self.conversation = Conversation(time_enabled=False)
        self.max_loops = max_loops
        self.verbose = verbose
        self.evaluation_criteria = evaluation_criteria or {}

        # Enhance system prompt with evaluation criteria if provided
        enhanced_prompt = system_prompt
        if self.evaluation_criteria:
            criteria_str = "\n\nEvaluation Criteria:\n"
            for criterion, weight in self.evaluation_criteria.items():
                criteria_str += f"- {criterion}: weight = {weight}\n"
            enhanced_prompt += criteria_str

        self.agent = Agent(
            agent_name=agent_name,
            agent_description=description,
            system_prompt=enhanced_prompt,
            model_name=model_name,
            max_loops=1,
            *args,
            **kwargs,
        )
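
    # Illustrative sketch (comments only, not executed): constructing a judge with
    # weighted evaluation criteria. The criteria names and weights below are the
    # example values from the docstring above; each pair is appended to the system
    # prompt as a "- <criterion>: weight = <weight>" line.
    #
    #     judge = AgentJudge(
    #         model_name="gpt-4",
    #         evaluation_criteria={
    #             "correctness": 0.4,
    #             "efficiency": 0.3,
    #             "clarity": 0.3,
    #         },
    #     )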

    def feedback_cycle_step(
        self,
        agent: Union[Agent, callable],
        task: str,
        img: Optional[str] = None,
    ):
        """
        Runs a single feedback cycle step: execute the agent, judge its output, then
        re-run the agent with the judge's critique so it can revise its answer.

        Args:
            agent (Union[Agent, callable]): The agent whose output is being evaluated and refined.
            task (str): The original task given to the agent.
            img (str, optional): Path to an image file for multimodal tasks.

        Returns:
            str: The agent's revised output after incorporating the judge's feedback.

        Raises:
            AgentJudgeFeedbackCycleError: If any step of the feedback cycle fails.
        """
        try:
            # First, run the main agent on the original task
            agent_output = agent.run(task=task, img=img)

            # Then run the judge agent on that output
            judge_output = self.run(task=agent_output, img=img)

            # Re-run the main agent with the judge's feedback, using an improved prompt
            improved_prompt = (
                f"You have received the following detailed feedback from the expert agent judge ({self.agent_name}):\n\n"
                f"--- FEEDBACK START ---\n{judge_output}\n--- FEEDBACK END ---\n\n"
                f"Your task is to thoughtfully revise and enhance your previous output based on this critique. "
                f"Carefully address all identified weaknesses, incorporate the suggestions, and strive to maximize the strengths noted. "
                f"Be specific, accurate, and actionable in your improvements. "
                f"Here is the original task for reference:\n\n"
                f"--- TASK ---\n{task}\n--- END TASK ---\n\n"
                f"Please provide your improved and fully revised output below."
            )

            return agent.run(task=improved_prompt, img=img)
        except Exception as e:
            raise AgentJudgeFeedbackCycleError(
                f"Error in agent judge feedback cycle: {e} Traceback: {traceback.format_exc()}"
            )
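
    # Conceptual flow of a single feedback cycle step (comments only; `worker` is an
    # assumed Agent instance used for illustration, not part of this module):
    #
    #     draft = worker.run(task=task)      # 1. the worker produces an answer
    #     critique = judge.run(task=draft)   # 2. the judge critiques that answer
    #     revised = worker.run(task=...)     # 3. the worker revises using the critique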

    def feedback_cycle(
        self,
        agent: Union[Agent, callable],
        task: str,
        img: Optional[str] = None,
        loops: int = 1,
    ):
        """
        Runs the feedback cycle `loops` times against the original task and collects
        the revised output from each iteration.

        Args:
            agent (Union[Agent, callable]): The agent whose output is being refined.
            task (str): The original task; it is preserved across iterations.
            img (str, optional): Path to an image file for multimodal tasks.
            loops (int): The number of feedback iterations to run.

        Returns:
            List[str]: The revised output produced by each iteration.
        """
        loop = 0
        original_task = task  # Preserve the original task
        current_output = None  # Track the current output
        all_outputs = []  # Collect all outputs from each iteration

        while loop < loops:
            # Run the standard feedback cycle step against the original task
            current_output = self.feedback_cycle_step(
                agent, original_task, img
            )

            # Add the current output to our collection
            all_outputs.append(current_output)
            loop += 1

        return all_outputs
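
    # Usage sketch (comments only; the `worker` agent below is an assumption for
    # illustration, not part of this module): run two refinement passes and keep
    # every revision.
    #
    #     worker = Agent(agent_name="worker", model_name="gpt-4", max_loops=1)
    #     judge = AgentJudge(model_name="gpt-4")
    #     revisions = judge.feedback_cycle(
    #         agent=worker,
    #         task="Write a one-paragraph abstract for the attached report.",
    #         loops=2,
    #     )
    #     # len(revisions) == 2; each entry is a fully revised output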

    def step(
        self,
        task: Optional[str] = None,
        tasks: Optional[List[str]] = None,
        img: Optional[str] = None,
    ) -> str:
        """
        Processes a single task or list of tasks and returns the agent's evaluation.

        This method performs a one-shot evaluation of the provided content. It takes
        either a single task string or a list of tasks and generates a comprehensive
        evaluation with strengths, weaknesses, and improvement suggestions.

        Args:
            task (str, optional): A single task/output to be evaluated.
            tasks (List[str], optional): A list of tasks/outputs to be evaluated.
            img (str, optional): Path to an image file for multimodal evaluation.

        Returns:
            str: A detailed evaluation response from the agent including:
                - Strengths: What the agent/output did well
                - Weaknesses: Areas that need improvement
                - Suggestions: Specific recommendations for improvement
                - Factual accuracy assessment

        Raises:
            ValueError: If neither task nor tasks are provided.

        Example:
            ```python
            # Single task evaluation
            evaluation = judge.step(task="The answer is 42.")

            # Multiple tasks evaluation
            evaluation = judge.step(tasks=[
                "Response 1: Paris is the capital of France",
                "Response 2: 2 + 2 = 5"  # Incorrect
            ])

            # Multimodal evaluation
            evaluation = judge.step(
                task="Describe this image",
                img="path/to/image.jpg"
            )
            ```
        """
        try:
            prompt = ""
            if tasks:
                prompt = any_to_str(tasks)
            elif task:
                prompt = task
            else:
                raise ValueError("No tasks or task provided")

            # Build the evaluation instruction for the judge agent
            task_instruction = "You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. "
            task_instruction += "Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. "
            task_instruction += "Your feedback should address the following points:\n"
            task_instruction += "1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.\n"
            task_instruction += "2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.\n"
            task_instruction += "3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. "
            task_instruction += "This may include advice on reasoning, structure, completeness, or style.\n"
            task_instruction += "4. If relevant, point out any factual inaccuracies or logical inconsistencies.\n"

            # Add the evaluation criteria and their weights, if provided
            if self.evaluation_criteria:
                task_instruction += "\nPlease use these specific evaluation criteria with their respective weights:\n"
                for criterion, weight in self.evaluation_criteria.items():
                    task_instruction += f"- {criterion}: weight = {weight}\n"

            task_instruction += "Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.\n\n"
            task_instruction += f"Output(s) to evaluate:\n{prompt}\n"

            response = self.agent.run(
                task=task_instruction,
                img=img,
            )

            return response
        except Exception as e:
            error_message = (
                f"AgentJudge encountered an error: {e}\n"
                f"Traceback:\n{traceback.format_exc()}\n\n"
                "If this issue persists, please:\n"
                "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n"
                "- Join our Discord for real-time support: swarms.ai\n"
                "- Or book a call: https://cal.com/swarms\n"
            )
            raise AgentJudgeExecutionError(error_message)

    def run(
        self,
        task: Optional[str] = None,
        tasks: Optional[List[str]] = None,
        img: Optional[str] = None,
    ):
        """
        Executes evaluation in multiple iterations with context building and refinement.

        This method runs the evaluation process for the specified number of max_loops,
        where each iteration builds upon the previous context. This allows for iterative
        refinement of evaluations and deeper analysis over multiple passes.

        Args:
            task (str, optional): A single task/output to be evaluated.
            tasks (List[str], optional): A list of tasks/outputs to be evaluated.
            img (str, optional): Path to an image file for multimodal evaluation.

        Returns:
            List[str]: A list of evaluation responses, one for each iteration.
                Each subsequent evaluation includes context from previous iterations.

        Example:
            ```python
            # Single task with iterative refinement
            judge = AgentJudge(max_loops=3)
            evaluations = judge.run(task="Agent output to evaluate")
            # Returns 3 evaluations, each building on the previous

            # Multiple tasks with context building
            evaluations = judge.run(tasks=[
                "First agent response",
                "Second agent response"
            ])

            # With image analysis
            evaluations = judge.run(
                task="Analyze this chart",
                img="chart.png"
            )
            ```

        Note:
            - The first iteration evaluates the original task(s)
            - Subsequent iterations include context from previous evaluations
            - This enables deeper analysis and refinement of judgments
            - Useful for complex evaluations requiring multiple perspectives
        """
        try:
            responses = []
            context = ""

            # Convert a single task to a list for consistent processing
            if task and not tasks:
                tasks = [task]
                task = None  # Clear to avoid confusion in the step method

            for _ in range(self.max_loops):
                # Add context to the tasks if available
                if context and tasks:
                    contextualized_tasks = [
                        f"Previous context: {context}\nTask: {t}"
                        for t in tasks
                    ]
                else:
                    contextualized_tasks = tasks

                # Get the response for the current iteration
                current_response = self.step(
                    task=task,
                    tasks=contextualized_tasks,
                    img=img,
                )
                responses.append(current_response)

                # Update the context for the next iteration
                context = current_response

            return responses
        except Exception as e:
            error_message = (
                f"AgentJudge encountered an error: {e}\n"
                f"Traceback:\n{traceback.format_exc()}\n\n"
                "If this issue persists, please:\n"
                "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n"
                "- Join our Discord for real-time support: swarms.ai\n"
                "- Or book a call: https://cal.com/swarms\n"
            )
            raise AgentJudgeExecutionError(error_message)
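
    # For reference, from the second loop onward each task is wrapped using the
    # format built above (placeholder text shown between angle brackets):
    #
    #     "Previous context: <previous evaluation>\nTask: <original task>"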

    def run_batched(
        self,
        tasks: Optional[List[str]] = None,
        imgs: Optional[List[str]] = None,
    ):
        """
        Executes batch evaluation of multiple tasks with corresponding images.

        This method processes multiple task-image pairs independently, where each
        task can be evaluated with its corresponding image. Unlike the run() method,
        this doesn't build context between different tasks; each is evaluated
        independently.

        Args:
            tasks (List[str], optional): A list of tasks/outputs to be evaluated.
            imgs (List[str], optional): A list of image paths corresponding to each task.
                Must be the same length as tasks if provided.

        Returns:
            List[List[str]]: A list of evaluation responses for each task. Each inner
                list contains the responses from all iterations (max_loops)
                for that particular task.

        Example:
            ```python
            # Batch evaluation with images
            tasks = [
                "Describe what you see in this image",
                "What's wrong with this chart?",
                "Analyze the trends shown"
            ]
            images = [
                "photo1.jpg",
                "chart1.png",
                "graph1.png"
            ]
            evaluations = judge.run_batched(tasks=tasks, imgs=images)
            # Returns evaluations for each task-image pair

            # Batch evaluation without images
            evaluations = judge.run_batched(tasks=[
                "Agent response 1",
                "Agent response 2",
                "Agent response 3"
            ])
            ```

        Note:
            - Each task is processed independently
            - If imgs is provided, it must have the same length as tasks
            - Each task goes through max_loops iterations independently
            - No context is shared between different tasks in the batch
        """
        responses = []

        # Pair each task with its image; fall back to None when no images are given
        imgs = imgs if imgs is not None else [None] * len(tasks)

        for task, img in zip(tasks, imgs):
            response = self.run(task=task, img=img)
            responses.append(response)

        return responses
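

# Minimal end-to-end sketch (assumes `swarms` is installed and credentials for the
# chosen model_name are configured; the names and tasks below are illustrative only).
if __name__ == "__main__":
    judge = AgentJudge(
        agent_name="quality-judge",
        model_name="gpt-4",
        max_loops=1,
    )

    # One-shot evaluation of a single output
    print(judge.step(task="The capital of France is Paris."))

    # Independent batch evaluation of several outputs (no images)
    batch_results = judge.run_batched(
        tasks=[
            "Agent response 1: The calculation is 2+2=4",
            "Agent response 2: The weather is sunny today",
        ]
    )
    print(batch_results)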