Merge pull request #958 from Wxysnx/agent_judge0717

Added `evaluation_criteria` parameter to AgentJudge class
Kye Gomez 4 days ago committed by GitHub
commit f916c89cc1

@ -0,0 +1,100 @@
"""
Agent Judge with Evaluation Criteria Example
This example demonstrates how to use the AgentJudge with custom evaluation criteria.
The evaluation_criteria parameter lets you specify weighted criteria for more
targeted, customizable evaluation of agent outputs.
"""
from swarms.agents.agent_judge import AgentJudge
import os
from dotenv import load_dotenv
load_dotenv()
# Example 1: Basic usage with evaluation criteria
print("\n=== Example 1: Using Custom Evaluation Criteria ===\n")
# Create an AgentJudge with custom evaluation criteria
judge = AgentJudge(
model_name="claude-3-7-sonnet-20250219", # Use any available model
evaluation_criteria={
"correctness": 0.5,
"problem_solving_approach": 0.3,
"explanation_clarity": 0.2
}
)
# Sample output to evaluate
task_response = [
"Task: Determine the time complexity of a binary search algorithm and explain your reasoning.\n\n"
"Agent response: The time complexity of binary search is O(log n). In each step, "
"we divide the search space in half, resulting in a logarithmic relationship between "
"the input size and the number of operations. This can be proven by solving the "
"recurrence relation T(n) = T(n/2) + O(1), which gives us T(n) = O(log n)."
]
# Run evaluation
evaluation = judge.run(task_response)
print(evaluation[0])
# Example 2: Specialized criteria for code evaluation
print("\n=== Example 2: Code Evaluation with Specialized Criteria ===\n")
code_judge = AgentJudge(
model_name="claude-3-7-sonnet-20250219",
agent_name="code_judge",
evaluation_criteria={
"code_correctness": 0.4,
"code_efficiency": 0.3,
"code_readability": 0.3
}
)
# Sample code to evaluate
code_response = [
"Task: Write a function to find the maximum subarray sum in an array of integers.\n\n"
"Agent response:\n```python\n"
"def max_subarray_sum(arr):\n"
" current_sum = max_sum = arr[0]\n"
" for i in range(1, len(arr)):\n"
" current_sum = max(arr[i], current_sum + arr[i])\n"
" max_sum = max(max_sum, current_sum)\n"
" return max_sum\n\n"
"# Example usage\n"
"print(max_subarray_sum([-2, 1, -3, 4, -1, 2, 1, -5, 4])) # Output: 6 (subarray [4, -1, 2, 1])\n"
"```\n"
"This implementation uses Kadane's algorithm which has O(n) time complexity and "
"O(1) space complexity, making it optimal for this problem."
]
code_evaluation = code_judge.run(code_response)
print(code_evaluation[0])
# Example 3: Comparing multiple responses
print("\n=== Example 3: Comparing Multiple Agent Responses ===\n")
comparison_judge = AgentJudge(
model_name="claude-3-7-sonnet-20250219",
evaluation_criteria={
"accuracy": 0.6,
"completeness": 0.4
}
)
multiple_responses = comparison_judge.run([
"Task: Explain the CAP theorem in distributed systems.\n\n"
"Agent A response: CAP theorem states that a distributed system cannot simultaneously "
"provide Consistency, Availability, and Partition tolerance. In practice, you must "
"choose two out of these three properties.",
"Task: Explain the CAP theorem in distributed systems.\n\n"
"Agent B response: The CAP theorem, formulated by Eric Brewer, states that in a "
"distributed data store, you can only guarantee two of the following three properties: "
"Consistency (all nodes see the same data at the same time), Availability (every request "
"receives a response), and Partition tolerance (the system continues to operate despite "
"network failures). Most modern distributed systems choose to sacrifice consistency in "
"favor of availability and partition tolerance, implementing eventual consistency models instead."
])
print(multiple_responses[0])
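# Example 4 (illustrative sketch, not part of the committed example file):
# combining evaluation_criteria with max_loops for iterative evaluation.
# This assumes, per the AgentJudge docstring in the diff below, that run()
# executes max_loops iterations with context building and returns one
# response per iteration.
print("\n=== Example 4: Iterative Evaluation with Weighted Criteria ===\n")
iterative_judge = AgentJudge(
    model_name="claude-3-7-sonnet-20250219",
    max_loops=2,
    evaluation_criteria={
        "accuracy": 0.7,
        "completeness": 0.3,
    },
)
# Reuses the task_response defined in Example 1 above
iterative_evaluations = iterative_judge.run(task_response)
# One response per loop; later rounds build on the earlier critique
for round_number, round_evaluation in enumerate(iterative_evaluations, start=1):
    print(f"--- Evaluation round {round_number} ---\n{round_evaluation}\n")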

@ -1,5 +1,7 @@
import traceback
from typing import List, Optional, Union
from typing import List, Optional, Union, Dict
import uuid
from swarms.prompts.agent_judge_prompt import AGENT_JUDGE_PROMPT
@ -15,23 +17,20 @@ class AgentJudgeInitializationError(Exception):
pass
class AgentJudgeExecutionError(Exception):
"""
Exception raised when there is an error executing the AgentJudge.
"""
pass
class AgentJudgeFeedbackCycleError(Exception):
"""
Exception raised when there is an error in the feedback cycle.
"""
pass
class AgentJudge:
"""
A specialized agent designed to evaluate and judge outputs from other agents or systems.
@ -53,6 +52,8 @@ class AgentJudge:
verbose (bool): Whether to enable verbose logging.
agent (Agent): An instance of the Agent class that performs the evaluation execution.
evaluation_criteria (Dict[str, float]): Dictionary of evaluation criteria and their weights.
Example:
Basic usage for evaluating agent outputs:
@ -82,7 +83,6 @@ class AgentJudge:
Methods:
step(task: str = None, tasks: List[str] = None, img: str = None) -> str:
Processes a single task or list of tasks and returns the agent's evaluation.
run(task: str = None, tasks: List[str] = None, img: str = None) -> List[str]:
Executes evaluation in a loop with context building, collecting responses.
@ -99,6 +99,9 @@ class AgentJudge:
model_name: str = "openai/o1",
max_loops: int = 1,
verbose: bool = False,
evaluation_criteria: Optional[Dict[str, float]] = None,
*args,
**kwargs,
):
@ -110,10 +113,23 @@ class AgentJudge:
self.max_loops = max_loops
self.verbose = verbose
self.evaluation_criteria = evaluation_criteria or {}
# Enhance system prompt with evaluation criteria if provided
enhanced_prompt = system_prompt
if self.evaluation_criteria:
criteria_str = "\n\nEvaluation Criteria:\n"
for criterion, weight in self.evaluation_criteria.items():
criteria_str += f"- {criterion}: weight = {weight}\n"
enhanced_prompt += criteria_str
self.agent = Agent(
agent_name=agent_name,
agent_description=description,
system_prompt=AGENT_JUDGE_PROMPT,
system_prompt=enhanced_prompt,
model_name=model_name,
max_loops=1,
*args,
@ -144,6 +160,7 @@ class AgentJudge:
f"--- TASK ---\n{task}\n--- END TASK ---\n\n"
f"Please provide your improved and fully revised output below."
)
return agent.run(task=improved_prompt, img=img)
except Exception as e:
raise AgentJudgeFeedbackCycleError(
@ -207,6 +224,7 @@ class AgentJudge:
# Single task evaluation
evaluation = judge.step(task="The answer is 42.")
# Multiple tasks evaluation
evaluation = judge.step(tasks=[
"Response 1: Paris is the capital of France",
@ -228,20 +246,29 @@ class AgentJudge:
prompt = task
else:
raise ValueError("No tasks or task provided")
# Add the evaluation criteria to the task description
task_instruction = "You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. "
task_instruction += "Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. "
task_instruction += "Your feedback should address the following points:\n"
task_instruction += "1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.\n"
task_instruction += "2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.\n"
task_instruction += "3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. "
task_instruction += "This may include advice on reasoning, structure, completeness, or style.\n"
task_instruction += "4. If relevant, point out any factual inaccuracies or logical inconsistencies.\n"
# Append the evaluation criteria to the task instruction
if self.evaluation_criteria:
criteria_names = list(self.evaluation_criteria.keys())
task_instruction += "\nPlease use these specific evaluation criteria with their respective weights:\n"
for criterion, weight in self.evaluation_criteria.items():
task_instruction += f"- {criterion}: weight = {weight}\n"
task_instruction += "Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.\n\n"
task_instruction += f"Output(s) to evaluate:\n{prompt}\n"
response = self.agent.run(
task=(
"You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. "
"Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. "
"Your feedback should address the following points:\n"
"1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.\n"
"2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.\n"
"3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. "
"This may include advice on reasoning, structure, completeness, or style.\n"
"4. If relevant, point out any factual inaccuracies or logical inconsistencies.\n"
"Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.\n\n"
f"Output(s) to evaluate:\n{prompt}\n"
),
task=task_instruction,
img=img,
)
@ -330,6 +357,7 @@ class AgentJudge:
tasks=contextualized_tasks,
img=img,
)
responses.append(current_response)
# Update context for next iteration
@ -360,6 +388,7 @@ class AgentJudge:
this doesn't build context between different tasks - each is evaluated
independently.
Args:
tasks (List[str], optional): A list of tasks/outputs to be evaluated.
imgs (List[str], optional): A list of image paths corresponding to each task.
@ -370,6 +399,7 @@ class AgentJudge:
list contains the responses from all iterations (max_loops)
for that particular task.
Example:
```python
# Batch evaluation with images
@ -383,7 +413,6 @@ class AgentJudge:
"chart1.png",
"graph1.png"
]
evaluations = judge.run_batched(tasks=tasks, imgs=images)
# Returns evaluations for each task-image pair
@ -395,6 +424,7 @@ class AgentJudge:
])
```
Note:
- Each task is processed independently
- If imgs is provided, it must have the same length as tasks
@ -405,4 +435,6 @@ class AgentJudge:
for task, img in zip(tasks, imgs):
response = self.run(task=task, img=img)
responses.append(response)
return responses
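
For reference, below is a minimal, hypothetical sketch of combining the new evaluation_criteria parameter with run_batched. It is not part of this commit; the task strings are placeholders (the image paths mirror the docstring example above), and, as the docstring notes, imgs must have the same length as tasks.
```python
# Hypothetical sketch (not part of this commit): batch evaluation with weighted criteria
from swarms.agents.agent_judge import AgentJudge

batch_judge = AgentJudge(
    model_name="claude-3-7-sonnet-20250219",
    evaluation_criteria={"accuracy": 0.6, "completeness": 0.4},
)

tasks = [
    "Analyze the chart and identify trends",  # placeholder task
    "Summarize the graph's main finding",     # placeholder task
]
images = ["chart1.png", "graph1.png"]  # placeholder image paths

# Each task/image pair is evaluated independently (no shared context),
# so the result is one list of per-loop responses per task.
batch_evaluations = batch_judge.run_batched(tasks=tasks, imgs=images)
for task_evaluations in batch_evaluations:
    print(task_evaluations[0])
```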
