From 78bd1def942b1774d011e8aa23bad416575ae0bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E7=A5=A5=E5=AE=87?= <625024108@qq.com> Date: Thu, 17 Jul 2025 00:29:38 +0800 Subject: [PATCH 1/3] Added `evaluation_criteria` parameter to AgentJudge class --- swarms/agents/agent_judge.py | 50 ++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/swarms/agents/agent_judge.py b/swarms/agents/agent_judge.py index cb33bd87..09dc9e34 100644 --- a/swarms/agents/agent_judge.py +++ b/swarms/agents/agent_judge.py @@ -1,13 +1,15 @@ -from typing import List +from typing import List, Dict, Optional from swarms.prompts.agent_judge_prompt import AGENT_JUDGE_PROMPT + from swarms.structs.agent import Agent + from swarms.structs.conversation import Conversation + from swarms.utils.any_to_str import any_to_str from loguru import logger - class AgentJudge: """ A class to represent an agent judge that processes tasks and generates responses. @@ -19,13 +21,7 @@ class AgentJudge: conversation (Conversation): An instance of the Conversation class to manage conversation history. max_loops (int): The maximum number of iterations to run the tasks. agent (Agent): An instance of the Agent class that performs the task execution. - - Methods: - step(tasks: List[str]) -> str: - Processes a list of tasks and returns the agent's response. - - run(tasks: List[str]) -> List[str]: - Executes the tasks in a loop, updating context and collecting responses. + evaluation_criteria (Dict[str, float]): Dictionary of evaluation criteria and their weights. """ def __init__( @@ -34,6 +30,7 @@ class AgentJudge: system_prompt: str = AGENT_JUDGE_PROMPT, model_name: str = "openai/o1", max_loops: int = 1, + evaluation_criteria: Optional[Dict[str, float]] = None, ) -> None: """ Initializes the AgentJudge with the specified parameters. @@ -43,17 +40,29 @@ class AgentJudge: system_prompt (str): The system prompt for the agent. model_name (str): The model name used for generating responses. max_loops (int): The maximum number of iterations to run the tasks. + evaluation_criteria (Optional[Dict[str, float]]): Dictionary of evaluation criteria + and their weights. Keys are criteria names, values are weights. 
+ Example: {"correctness": 0.4, "efficiency": 0.3, "clarity": 0.3} """ self.agent_name = agent_name self.system_prompt = system_prompt self.model_name = model_name self.conversation = Conversation(time_enabled=False) self.max_loops = max_loops - + self.evaluation_criteria = evaluation_criteria or {} + + # Enhance system prompt with evaluation criteria if provided + enhanced_prompt = system_prompt + if self.evaluation_criteria: + criteria_str = "\n\nEvaluation Criteria:\n" + for criterion, weight in self.evaluation_criteria.items(): + criteria_str += f"- {criterion}: weight = {weight}\n" + enhanced_prompt += criteria_str + self.agent = Agent( agent_name=agent_name, agent_description="You're the agent judge", - system_prompt=AGENT_JUDGE_PROMPT, + system_prompt=enhanced_prompt, model_name=model_name, max_loops=1, ) @@ -70,14 +79,22 @@ class AgentJudge: """ prompt = any_to_str(tasks) logger.debug(f"Running step with prompt: {prompt}") - print(prompt) - + + task_instruction = "Evaluate the following output or outputs" + if self.evaluation_criteria: + criteria_names = list(self.evaluation_criteria.keys()) + if len(criteria_names) == 1: + task_instruction += f" based on {criteria_names[0]}" + else: + formatted_criteria = ", ".join(criteria_names[:-1]) + f" and {criteria_names[-1]}" + task_instruction += f" based on the criteria: {formatted_criteria}" + response = self.agent.run( - task=f"Evaluate the following output or outputs: {prompt}" + task=f"{task_instruction}: {prompt}" ) - logger.debug(f"Received response: {response}") + logger.debug(f"Received response: {response}") return response def run(self, tasks: List[str]) -> List[str]: @@ -112,8 +129,7 @@ class AgentJudge: # Update context for next iteration context = current_response - # Add to conversation history logger.debug("Added message to conversation history.") - return responses + return responses \ No newline at end of file From 950184b5c57280aae0f939b6117aa8689374d0e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E7=A5=A5=E5=AE=87?= <625024108@qq.com> Date: Thu, 17 Jul 2025 11:10:48 +0800 Subject: [PATCH 2/3] Add example for AgentJudge with evaluation criteria --- ...agent_judge_evaluation_criteria_example.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 examples/single_agent/reasoning_agent_examples/agent_judge_evaluation_criteria_example.py diff --git a/examples/single_agent/reasoning_agent_examples/agent_judge_evaluation_criteria_example.py b/examples/single_agent/reasoning_agent_examples/agent_judge_evaluation_criteria_example.py new file mode 100644 index 00000000..f8a1b044 --- /dev/null +++ b/examples/single_agent/reasoning_agent_examples/agent_judge_evaluation_criteria_example.py @@ -0,0 +1,100 @@ +""" +Agent Judge with Evaluation Criteria Example + +This example demonstrates how to use the AgentJudge with custom evaluation criteria. +The evaluation_criteria parameter allows specifying different criteria with weights +for more targeted and customizable evaluation of agent outputs. 
+""" + +from swarms.agents.agent_judge import AgentJudge +import os +from dotenv import load_dotenv + +load_dotenv() + +# Example 1: Basic usage with evaluation criteria +print("\n=== Example 1: Using Custom Evaluation Criteria ===\n") + +# Create an AgentJudge with custom evaluation criteria +judge = AgentJudge( + model_name="claude-3-7-sonnet-20250219", # Use any available model + evaluation_criteria={ + "correctness": 0.5, + "problem_solving_approach": 0.3, + "explanation_clarity": 0.2 + } +) + +# Sample output to evaluate +task_response = [ + "Task: Determine the time complexity of a binary search algorithm and explain your reasoning.\n\n" + "Agent response: The time complexity of binary search is O(log n). In each step, " + "we divide the search space in half, resulting in a logarithmic relationship between " + "the input size and the number of operations. This can be proven by solving the " + "recurrence relation T(n) = T(n/2) + O(1), which gives us T(n) = O(log n)." +] + +# Run evaluation +evaluation = judge.run(task_response) +print(evaluation[0]) + +# Example 2: Specialized criteria for code evaluation +print("\n=== Example 2: Code Evaluation with Specialized Criteria ===\n") + +code_judge = AgentJudge( + model_name="claude-3-7-sonnet-20250219", + agent_name="code_judge", + evaluation_criteria={ + "code_correctness": 0.4, + "code_efficiency": 0.3, + "code_readability": 0.3 + } +) + +# Sample code to evaluate +code_response = [ + "Task: Write a function to find the maximum subarray sum in an array of integers.\n\n" + "Agent response:\n```python\n" + "def max_subarray_sum(arr):\n" + " current_sum = max_sum = arr[0]\n" + " for i in range(1, len(arr)):\n" + " current_sum = max(arr[i], current_sum + arr[i])\n" + " max_sum = max(max_sum, current_sum)\n" + " return max_sum\n\n" + "# Example usage\n" + "print(max_subarray_sum([-2, 1, -3, 4, -1, 2, 1, -5, 4])) # Output: 6 (subarray [4, -1, 2, 1])\n" + "```\n" + "This implementation uses Kadane's algorithm which has O(n) time complexity and " + "O(1) space complexity, making it optimal for this problem." +] + +code_evaluation = code_judge.run(code_response) +print(code_evaluation[0]) + +# Example 3: Comparing multiple responses +print("\n=== Example 3: Comparing Multiple Agent Responses ===\n") + +comparison_judge = AgentJudge( + model_name="claude-3-7-sonnet-20250219", + evaluation_criteria={ + "accuracy": 0.6, + "completeness": 0.4 + } +) + +multiple_responses = comparison_judge.run([ + "Task: Explain the CAP theorem in distributed systems.\n\n" + "Agent A response: CAP theorem states that a distributed system cannot simultaneously " + "provide Consistency, Availability, and Partition tolerance. In practice, you must " + "choose two out of these three properties.", + + "Task: Explain the CAP theorem in distributed systems.\n\n" + "Agent B response: The CAP theorem, formulated by Eric Brewer, states that in a " + "distributed data store, you can only guarantee two of the following three properties: " + "Consistency (all nodes see the same data at the same time), Availability (every request " + "receives a response), and Partition tolerance (the system continues to operate despite " + "network failures). Most modern distributed systems choose to sacrifice consistency in " + "favor of availability and partition tolerance, implementing eventual consistency models instead." 
+]) + +print(multiple_responses[0]) \ No newline at end of file From a0fadf2874365c37cddd288ece0eea42b4ef73bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E7=A5=A5=E5=AE=87?= <625024108@qq.com> Date: Fri, 18 Jul 2025 11:52:19 +0800 Subject: [PATCH 3/3] Update agent_judge.py to resolve conflicts --- swarms/agents/agent_judge.py | 424 +++++++++++++++++++++++++++++------ 1 file changed, 352 insertions(+), 72 deletions(-) diff --git a/swarms/agents/agent_judge.py b/swarms/agents/agent_judge.py index 09dc9e34..9a5c1e3a 100644 --- a/swarms/agents/agent_judge.py +++ b/swarms/agents/agent_judge.py @@ -1,54 +1,105 @@ -from typing import List, Dict, Optional +import traceback +from typing import List, Optional, Union, Dict +import uuid from swarms.prompts.agent_judge_prompt import AGENT_JUDGE_PROMPT - from swarms.structs.agent import Agent - from swarms.structs.conversation import Conversation - from swarms.utils.any_to_str import any_to_str -from loguru import logger +class AgentJudgeInitializationError(Exception): + """ + Exception raised when there is an error initializing the AgentJudge. + """ + pass + +class AgentJudgeExecutionError(Exception): + """ + Exception raised when there is an error executing the AgentJudge. + """ + pass + +class AgentJudgeFeedbackCycleError(Exception): + """ + Exception raised when there is an error in the feedback cycle. + """ + pass class AgentJudge: """ - A class to represent an agent judge that processes tasks and generates responses. + A specialized agent designed to evaluate and judge outputs from other agents or systems. + The AgentJudge acts as a quality control mechanism, providing objective assessments + and feedback on various types of content, decisions, or outputs. It's based on research + in LLM-based evaluation systems and can maintain context across multiple evaluations. + This implementation supports both single task evaluation and batch processing with + iterative refinement capabilities. Attributes: + id (str): Unique identifier for the judge agent instance. agent_name (str): The name of the agent judge. - system_prompt (str): The system prompt for the agent. - model_name (str): The model name used for generating responses. + system_prompt (str): The system prompt for the agent containing evaluation instructions. + model_name (str): The model name used for generating evaluations (e.g., "openai/o1", "gpt-4"). conversation (Conversation): An instance of the Conversation class to manage conversation history. - max_loops (int): The maximum number of iterations to run the tasks. - agent (Agent): An instance of the Agent class that performs the task execution. + max_loops (int): The maximum number of evaluation iterations to run. + verbose (bool): Whether to enable verbose logging. + agent (Agent): An instance of the Agent class that performs the evaluation execution. evaluation_criteria (Dict[str, float]): Dictionary of evaluation criteria and their weights. + + Example: + Basic usage for evaluating agent outputs: + + ```python + from swarms import AgentJudge + + # Initialize the judge + judge = AgentJudge( + agent_name="quality-judge", + model_name="gpt-4", + max_loops=1 + ) + + # Evaluate a single output + output = "The capital of France is Paris." 
+ evaluation = judge.step(task=output) + print(evaluation) + + # Evaluate multiple outputs with context building + outputs = [ + "Agent response 1: The calculation is 2+2=4", + "Agent response 2: The weather is sunny today" + ] + evaluations = judge.run(tasks=outputs) + ``` + + Methods: + step(task: str = None, tasks: List[str] = None, img: str = None) -> str: + Processes a single task or list of tasks and returns the agent's evaluation. + run(task: str = None, tasks: List[str] = None, img: str = None) -> List[str]: + Executes evaluation in a loop with context building, collecting responses. + run_batched(tasks: List[str] = None, imgs: List[str] = None) -> List[str]: + Executes batch evaluation of tasks with corresponding images. """ def __init__( self, - agent_name: str = "agent-judge-01", + id: str = str(uuid.uuid4()), + agent_name: str = "Agent Judge", + description: str = "You're an expert AI agent judge. Carefully review the following output(s) generated by another agent. Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance.", system_prompt: str = AGENT_JUDGE_PROMPT, model_name: str = "openai/o1", max_loops: int = 1, + verbose: bool = False, evaluation_criteria: Optional[Dict[str, float]] = None, - ) -> None: - """ - Initializes the AgentJudge with the specified parameters. - - Args: - agent_name (str): The name of the agent judge. - system_prompt (str): The system prompt for the agent. - model_name (str): The model name used for generating responses. - max_loops (int): The maximum number of iterations to run the tasks. - evaluation_criteria (Optional[Dict[str, float]]): Dictionary of evaluation criteria - and their weights. Keys are criteria names, values are weights. - Example: {"correctness": 0.4, "efficiency": 0.3, "clarity": 0.3} - """ + *args, + **kwargs, + ): + self.id = id self.agent_name = agent_name self.system_prompt = system_prompt self.model_name = model_name self.conversation = Conversation(time_enabled=False) self.max_loops = max_loops + self.verbose = verbose self.evaluation_criteria = evaluation_criteria or {} # Enhance system prompt with evaluation criteria if provided @@ -58,78 +109,307 @@ class AgentJudge: for criterion, weight in self.evaluation_criteria.items(): criteria_str += f"- {criterion}: weight = {weight}\n" enhanced_prompt += criteria_str - + self.agent = Agent( agent_name=agent_name, - agent_description="You're the agent judge", + agent_description=description, system_prompt=enhanced_prompt, model_name=model_name, max_loops=1, + *args, + **kwargs, ) - def step(self, tasks: List[str]) -> str: + def feedback_cycle_step( + self, + agent: Union[Agent, callable], + task: str, + img: Optional[str] = None, + ): + try: + # First run the main agent + agent_output = agent.run(task=task, img=img) + + # Then run the judge agent + judge_output = self.run(task=agent_output, img=img) + + # Run the main agent again with the judge's feedback, using a much improved prompt + improved_prompt = ( + f"You have received the following detailed feedback from the expert agent judge ({self.agent_name}):\n\n" + f"--- FEEDBACK START ---\n{judge_output}\n--- FEEDBACK END ---\n\n" + f"Your task is to thoughtfully revise and enhance your previous output based on this critique. " + f"Carefully address all identified weaknesses, incorporate the suggestions, and strive to maximize the strengths noted. " + f"Be specific, accurate, and actionable in your improvements. 
" + f"Here is the original task for reference:\n\n" + f"--- TASK ---\n{task}\n--- END TASK ---\n\n" + f"Please provide your improved and fully revised output below." + ) + + return agent.run(task=improved_prompt, img=img) + except Exception as e: + raise AgentJudgeFeedbackCycleError( + f"Error In Agent Judge Feedback Cycle: {e} Traceback: {traceback.format_exc()}" + ) + + def feedback_cycle( + self, + agent: Union[Agent, callable], + task: str, + img: Optional[str] = None, + loops: int = 1, + ): + loop = 0 + original_task = task # Preserve the original task + current_output = None # Track the current output + all_outputs = [] # Collect all outputs from each iteration + + while loop < loops: + # First iteration: run the standard feedback cycle step + current_output = self.feedback_cycle_step( + agent, original_task, img + ) + + # Add the current output to our collection + all_outputs.append(current_output) + loop += 1 + + return all_outputs + + def step( + self, + task: str = None, + tasks: Optional[List[str]] = None, + img: Optional[str] = None, + ) -> str: """ - Processes a list of tasks and returns the agent's response. + Processes a single task or list of tasks and returns the agent's evaluation. + This method performs a one-shot evaluation of the provided content. It takes + either a single task string or a list of tasks and generates a comprehensive + evaluation with strengths, weaknesses, and improvement suggestions. Args: - tasks (List[str]): A list of tasks to be processed. + task (str, optional): A single task/output to be evaluated. + tasks (List[str], optional): A list of tasks/outputs to be evaluated. + img (str, optional): Path to an image file for multimodal evaluation. Returns: - str: The response generated by the agent. + str: A detailed evaluation response from the agent including: + - Strengths: What the agent/output did well + - Weaknesses: Areas that need improvement + - Suggestions: Specific recommendations for improvement + - Factual accuracy assessment + + Raises: + ValueError: If neither task nor tasks are provided. + + Example: + ```python + # Single task evaluation + evaluation = judge.step(task="The answer is 42.") + + # Multiple tasks evaluation + evaluation = judge.step(tasks=[ + "Response 1: Paris is the capital of France", + "Response 2: 2 + 2 = 5" # Incorrect + ]) + + # Multimodal evaluation + evaluation = judge.step( + task="Describe this image", + img="path/to/image.jpg" + ) + ``` """ - prompt = any_to_str(tasks) - logger.debug(f"Running step with prompt: {prompt}") - print(prompt) - - task_instruction = "Evaluate the following output or outputs" - if self.evaluation_criteria: - criteria_names = list(self.evaluation_criteria.keys()) - if len(criteria_names) == 1: - task_instruction += f" based on {criteria_names[0]}" + try: + prompt = "" + if tasks: + prompt = any_to_str(tasks) + elif task: + prompt = task else: - formatted_criteria = ", ".join(criteria_names[:-1]) + f" and {criteria_names[-1]}" - task_instruction += f" based on the criteria: {formatted_criteria}" - - response = self.agent.run( - task=f"{task_instruction}: {prompt}" - ) + raise ValueError("No tasks or task provided") + + # 添加评估标准到任务描述中 + task_instruction = "You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. " + task_instruction += "Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. 
" + task_instruction += "Your feedback should address the following points:\n" + task_instruction += "1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.\n" + task_instruction += "2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.\n" + task_instruction += "3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. " + task_instruction += "This may include advice on reasoning, structure, completeness, or style.\n" + task_instruction += "4. If relevant, point out any factual inaccuracies or logical inconsistencies.\n" + + # 在任务说明中添加评估标准 + if self.evaluation_criteria: + criteria_names = list(self.evaluation_criteria.keys()) + task_instruction += "\nPlease use these specific evaluation criteria with their respective weights:\n" + for criterion, weight in self.evaluation_criteria.items(): + task_instruction += f"- {criterion}: weight = {weight}\n" + + task_instruction += "Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.\n\n" + task_instruction += f"Output(s) to evaluate:\n{prompt}\n" - logger.debug(f"Received response: {response}") - return response + response = self.agent.run( + task=task_instruction, + img=img, + ) - def run(self, tasks: List[str]) -> List[str]: + return response + except Exception as e: + error_message = ( + f"AgentJudge encountered an error: {e}\n" + f"Traceback:\n{traceback.format_exc()}\n\n" + "If this issue persists, please:\n" + "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n" + "- Join our Discord for real-time support: swarms.ai\n" + "- Or book a call: https://cal.com/swarms\n" + ) + raise AgentJudgeExecutionError(error_message) + + def run( + self, + task: str = None, + tasks: Optional[List[str]] = None, + img: Optional[str] = None, + ): """ - Executes the tasks in a loop, updating context and collecting responses. + Executes evaluation in multiple iterations with context building and refinement. + This method runs the evaluation process for the specified number of max_loops, + where each iteration builds upon the previous context. This allows for iterative + refinement of evaluations and deeper analysis over multiple passes. Args: - tasks (List[str]): A list of tasks to be executed. + task (str, optional): A single task/output to be evaluated. + tasks (List[str], optional): A list of tasks/outputs to be evaluated. + img (str, optional): Path to an image file for multimodal evaluation. Returns: - List[str]: A list of responses generated by the agent for each iteration. + List[str]: A list of evaluation responses, one for each iteration. + Each subsequent evaluation includes context from previous iterations. 
+ + Example: + ```python + # Single task with iterative refinement + judge = AgentJudge(max_loops=3) + evaluations = judge.run(task="Agent output to evaluate") + # Returns 3 evaluations, each building on the previous + + # Multiple tasks with context building + evaluations = judge.run(tasks=[ + "First agent response", + "Second agent response" + ]) + + # With image analysis + evaluations = judge.run( + task="Analyze this chart", + img="chart.png" + ) + ``` + + Note: + - The first iteration evaluates the original task(s) + - Subsequent iterations include context from previous evaluations + - This enables deeper analysis and refinement of judgments + - Useful for complex evaluations requiring multiple perspectives """ - responses = [] - context = "" - - for _ in range(self.max_loops): - # Add context to the tasks if available - if context: - contextualized_tasks = [ - f"Previous context: {context}\nTask: {task}" - for task in tasks - ] - else: - contextualized_tasks = tasks + try: + responses = [] + context = "" + + # Convert single task to list for consistent processing + if task and not tasks: + tasks = [task] + task = None # Clear to avoid confusion in step method + + for _ in range(self.max_loops): + # Add context to the tasks if available + if context and tasks: + contextualized_tasks = [ + f"Previous context: {context}\nTask: {t}" + for t in tasks + ] + else: + contextualized_tasks = tasks - # Get response for current iteration - current_response = self.step(contextualized_tasks) - responses.append(current_response) - logger.debug( - f"Current response added: {current_response}" + # Get response for current iteration + current_response = self.step( + task=task, + tasks=contextualized_tasks, + img=img, + ) + + responses.append(current_response) + + # Update context for next iteration + context = current_response + + return responses + except Exception as e: + error_message = ( + f"AgentJudge encountered an error: {e}\n" + f"Traceback:\n{traceback.format_exc()}\n\n" + "If this issue persists, please:\n" + "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n" + "- Join our Discord for real-time support: swarms.ai\n" + "- Or book a call: https://cal.com/swarms\n" ) + raise AgentJudgeExecutionError(error_message) + + def run_batched( + self, + tasks: Optional[List[str]] = None, + imgs: Optional[List[str]] = None, + ): + """ + Executes batch evaluation of multiple tasks with corresponding images. + This method processes multiple task-image pairs independently, where each + task can be evaluated with its corresponding image. Unlike the run() method, + this doesn't build context between different tasks - each is evaluated + independently. - # Update context for next iteration - context = current_response - # Add to conversation history - logger.debug("Added message to conversation history.") + Args: + tasks (List[str], optional): A list of tasks/outputs to be evaluated. + imgs (List[str], optional): A list of image paths corresponding to each task. + Must be the same length as tasks if provided. + + Returns: + List[List[str]]: A list of evaluation responses for each task. Each inner + list contains the responses from all iterations (max_loops) + for that particular task. 
+        Example:
+            ```python
+            # Batch evaluation with images
+            tasks = [
+                "Describe what you see in this image",
+                "What's wrong with this chart?",
+                "Analyze the trends shown"
+            ]
+            images = [
+                "photo1.jpg",
+                "chart1.png",
+                "graph1.png"
+            ]
+            evaluations = judge.run_batched(tasks=tasks, imgs=images)
+            # Returns evaluations for each task-image pair
+
+            # Batch evaluation without images
+            evaluations = judge.run_batched(tasks=[
+                "Agent response 1",
+                "Agent response 2",
+                "Agent response 3"
+            ])
+            ```
+
+        Note:
+            - Each task is processed independently
+            - If imgs is provided, it must have the same length as tasks
+            - Each task goes through max_loops iterations independently
+            - No context is shared between different tasks in the batch
+        """
+        responses = []
+        for task, img in zip(tasks, imgs or [None] * len(tasks)):  # imgs is optional
+            response = self.run(task=task, img=img)
+            responses.append(response)
         return responses
\ No newline at end of file
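
Reviewer note: a minimal end-to-end sketch of the API added by these patches. It assumes the `swarms` package layout used above; the agent names, prompts, task text, and criteria weights are illustrative placeholders, not values defined in the patches.

```python
from swarms.structs.agent import Agent
from swarms.agents.agent_judge import AgentJudge

# Judge with weighted evaluation criteria (weights are illustrative)
judge = AgentJudge(
    agent_name="code-review-judge",
    model_name="openai/o1",
    max_loops=1,
    evaluation_criteria={"correctness": 0.6, "clarity": 0.4},
)

# Worker agent whose output will be critiqued and revised
worker = Agent(
    agent_name="python-helper",
    agent_description="Writes short Python solutions",
    system_prompt="You are a careful Python assistant.",
    model_name="openai/o1",
    max_loops=1,
)

# One critique-and-revise pass: the judge reviews the worker's answer and the
# worker retries with that feedback; one revised output is returned per loop.
revised = judge.feedback_cycle(
    agent=worker,
    task="Write a function that reverses a singly linked list.",
    loops=1,
)
print(revised[-1])

# Standalone evaluation of existing outputs, with context carried across loops.
reports = judge.run(tasks=["Agent response: 2 + 2 = 4"])
print(reports[0])
```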