parent e1149cbf02
commit ac212fe31a
@@ -1,100 +0,0 @@
-"""
-Agent Judge with Evaluation Criteria Example
-
-This example demonstrates how to use the AgentJudge with custom evaluation criteria.
-The evaluation_criteria parameter allows specifying different criteria with weights
-for more targeted and customizable evaluation of agent outputs.
-"""
-
-from swarms.agents.agent_judge import AgentJudge
-import os
-from dotenv import load_dotenv
-
-load_dotenv()
-
-# Example 1: Basic usage with evaluation criteria
-print("\n=== Example 1: Using Custom Evaluation Criteria ===\n")
-
-# Create an AgentJudge with custom evaluation criteria
-judge = AgentJudge(
-    model_name="claude-3-7-sonnet-20250219",  # Use any available model
-    evaluation_criteria={
-        "correctness": 0.5,
-        "problem_solving_approach": 0.3,
-        "explanation_clarity": 0.2
-    }
-)
-
-# Sample output to evaluate
-task_response = [
-    "Task: Determine the time complexity of a binary search algorithm and explain your reasoning.\n\n"
-    "Agent response: The time complexity of binary search is O(log n). In each step, "
-    "we divide the search space in half, resulting in a logarithmic relationship between "
-    "the input size and the number of operations. This can be proven by solving the "
-    "recurrence relation T(n) = T(n/2) + O(1), which gives us T(n) = O(log n)."
-]
-
-# Run evaluation
-evaluation = judge.run(task_response)
-print(evaluation[0])
-
-# Example 2: Specialized criteria for code evaluation
-print("\n=== Example 2: Code Evaluation with Specialized Criteria ===\n")
-
-code_judge = AgentJudge(
-    model_name="claude-3-7-sonnet-20250219",
-    agent_name="code_judge",
-    evaluation_criteria={
-        "code_correctness": 0.4,
-        "code_efficiency": 0.3,
-        "code_readability": 0.3
-    }
-)
-
-# Sample code to evaluate
-code_response = [
-    "Task: Write a function to find the maximum subarray sum in an array of integers.\n\n"
-    "Agent response:\n```python\n"
-    "def max_subarray_sum(arr):\n"
-    "    current_sum = max_sum = arr[0]\n"
-    "    for i in range(1, len(arr)):\n"
-    "        current_sum = max(arr[i], current_sum + arr[i])\n"
-    "        max_sum = max(max_sum, current_sum)\n"
-    "    return max_sum\n\n"
-    "# Example usage\n"
-    "print(max_subarray_sum([-2, 1, -3, 4, -1, 2, 1, -5, 4]))  # Output: 6 (subarray [4, -1, 2, 1])\n"
-    "```\n"
-    "This implementation uses Kadane's algorithm which has O(n) time complexity and "
-    "O(1) space complexity, making it optimal for this problem."
-]
-
-code_evaluation = code_judge.run(code_response)
-print(code_evaluation[0])
-
-# Example 3: Comparing multiple responses
-print("\n=== Example 3: Comparing Multiple Agent Responses ===\n")
-
-comparison_judge = AgentJudge(
-    model_name="claude-3-7-sonnet-20250219",
-    evaluation_criteria={
-        "accuracy": 0.6,
-        "completeness": 0.4
-    }
-)
-
-multiple_responses = comparison_judge.run([
-    "Task: Explain the CAP theorem in distributed systems.\n\n"
-    "Agent A response: CAP theorem states that a distributed system cannot simultaneously "
-    "provide Consistency, Availability, and Partition tolerance. In practice, you must "
-    "choose two out of these three properties.",
-
-    "Task: Explain the CAP theorem in distributed systems.\n\n"
-    "Agent B response: The CAP theorem, formulated by Eric Brewer, states that in a "
-    "distributed data store, you can only guarantee two of the following three properties: "
-    "Consistency (all nodes see the same data at the same time), Availability (every request "
-    "receives a response), and Partition tolerance (the system continues to operate despite "
-    "network failures). Most modern distributed systems choose to sacrifice consistency in "
-    "favor of availability and partition tolerance, implementing eventual consistency models instead."
-])
-
-print(multiple_responses[0])
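Since both this example and the `evaluation_criteria` constructor argument are removed by this commit, the sketch below shows one way a caller could approximate the same weighted-criteria evaluation with the simplified `AgentJudge` that survives in the next hunk. This is a minimal sketch, not part of the commit: the constructor arguments and the `run(tasks: List[str])` signature are taken from the new version shown below, the criteria names and model name come from the deleted example, and the weighting is expressed only as prompt text for the judge model to follow.

```python
from swarms.agents.agent_judge import AgentJudge

# Criteria and weights reused from the deleted Example 1 above.
criteria = {
    "correctness": 0.5,
    "problem_solving_approach": 0.3,
    "explanation_clarity": 0.2,
}
criteria_block = "\n".join(
    f"- {name}: weight = {weight}" for name, weight in criteria.items()
)

# Simplified constructor retained by this commit (agent_name, system_prompt,
# model_name, max_loops); there is no evaluation_criteria parameter anymore.
judge = AgentJudge(
    agent_name="agent-judge-01",
    model_name="claude-3-7-sonnet-20250219",
    max_loops=1,
)

task_response = (
    "Task: Determine the time complexity of a binary search algorithm.\n\n"
    "Agent response: O(log n), because the search space is halved at each step."
)

# Fold the weighted criteria into the task text instead of the constructor.
evaluation = judge.run(
    [f"Use these weighted evaluation criteria:\n{criteria_block}\n\n{task_response}"]
)
print(evaluation[0])
```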
@@ -1,415 +1,119 @@
-import traceback
-from typing import List, Optional, Union, Dict
-import uuid
+from typing import List
 
 from swarms.prompts.agent_judge_prompt import AGENT_JUDGE_PROMPT
 from swarms.structs.agent import Agent
 from swarms.structs.conversation import Conversation
 from swarms.utils.any_to_str import any_to_str
 
+from loguru import logger
+
-class AgentJudgeInitializationError(Exception):
-    """
-    Exception raised when there is an error initializing the AgentJudge.
-    """
-    pass
-
-class AgentJudgeExecutionError(Exception):
-    """
-    Exception raised when there is an error executing the AgentJudge.
-    """
-    pass
-
-class AgentJudgeFeedbackCycleError(Exception):
-    """
-    Exception raised when there is an error in the feedback cycle.
-    """
-    pass
 
 class AgentJudge:
     """
-    A specialized agent designed to evaluate and judge outputs from other agents or systems.
-    The AgentJudge acts as a quality control mechanism, providing objective assessments
-    and feedback on various types of content, decisions, or outputs. It's based on research
-    in LLM-based evaluation systems and can maintain context across multiple evaluations.
-    This implementation supports both single task evaluation and batch processing with
-    iterative refinement capabilities.
+    A class to represent an agent judge that processes tasks and generates responses.
 
     Attributes:
-        id (str): Unique identifier for the judge agent instance.
         agent_name (str): The name of the agent judge.
-        system_prompt (str): The system prompt for the agent containing evaluation instructions.
-        model_name (str): The model name used for generating evaluations (e.g., "openai/o1", "gpt-4").
+        system_prompt (str): The system prompt for the agent.
+        model_name (str): The model name used for generating responses.
         conversation (Conversation): An instance of the Conversation class to manage conversation history.
-        max_loops (int): The maximum number of evaluation iterations to run.
-        verbose (bool): Whether to enable verbose logging.
-        agent (Agent): An instance of the Agent class that performs the evaluation execution.
-        evaluation_criteria (Dict[str, float]): Dictionary of evaluation criteria and their weights.
-
-    Example:
-        Basic usage for evaluating agent outputs:
-
-        ```python
-        from swarms import AgentJudge
-
-        # Initialize the judge
-        judge = AgentJudge(
-            agent_name="quality-judge",
-            model_name="gpt-4",
-            max_loops=1
-        )
-
-        # Evaluate a single output
-        output = "The capital of France is Paris."
-        evaluation = judge.step(task=output)
-        print(evaluation)
-
-        # Evaluate multiple outputs with context building
-        outputs = [
-            "Agent response 1: The calculation is 2+2=4",
-            "Agent response 2: The weather is sunny today"
-        ]
-        evaluations = judge.run(tasks=outputs)
-        ```
+        max_loops (int): The maximum number of iterations to run the tasks.
+        agent (Agent): An instance of the Agent class that performs the task execution.
 
     Methods:
-        step(task: str = None, tasks: List[str] = None, img: str = None) -> str:
-            Processes a single task or list of tasks and returns the agent's evaluation.
-        run(task: str = None, tasks: List[str] = None, img: str = None) -> List[str]:
-            Executes evaluation in a loop with context building, collecting responses.
-        run_batched(tasks: List[str] = None, imgs: List[str] = None) -> List[str]:
-            Executes batch evaluation of tasks with corresponding images.
+        step(tasks: List[str]) -> str:
+            Processes a list of tasks and returns the agent's response.
+        run(tasks: List[str]) -> List[str]:
+            Executes the tasks in a loop, updating context and collecting responses.
     """
 
     def __init__(
         self,
-        id: str = str(uuid.uuid4()),
-        agent_name: str = "Agent Judge",
-        description: str = "You're an expert AI agent judge. Carefully review the following output(s) generated by another agent. Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance.",
+        agent_name: str = "agent-judge-01",
         system_prompt: str = AGENT_JUDGE_PROMPT,
         model_name: str = "openai/o1",
         max_loops: int = 1,
-        verbose: bool = False,
-        evaluation_criteria: Optional[Dict[str, float]] = None,
-        *args,
-        **kwargs,
-    ):
-        self.id = id
+    ) -> None:
+        """
+        Initializes the AgentJudge with the specified parameters.
+
+        Args:
+            agent_name (str): The name of the agent judge.
+            system_prompt (str): The system prompt for the agent.
+            model_name (str): The model name used for generating responses.
+            max_loops (int): The maximum number of iterations to run the tasks.
+        """
         self.agent_name = agent_name
         self.system_prompt = system_prompt
         self.model_name = model_name
         self.conversation = Conversation(time_enabled=False)
         self.max_loops = max_loops
-        self.verbose = verbose
-        self.evaluation_criteria = evaluation_criteria or {}
-
-        # Enhance system prompt with evaluation criteria if provided
-        enhanced_prompt = system_prompt
-        if self.evaluation_criteria:
-            criteria_str = "\n\nEvaluation Criteria:\n"
-            for criterion, weight in self.evaluation_criteria.items():
-                criteria_str += f"- {criterion}: weight = {weight}\n"
-            enhanced_prompt += criteria_str
 
         self.agent = Agent(
             agent_name=agent_name,
-            agent_description=description,
-            system_prompt=enhanced_prompt,
+            agent_description="You're the agent judge",
+            system_prompt=AGENT_JUDGE_PROMPT,
             model_name=model_name,
             max_loops=1,
-            *args,
-            **kwargs,
         )
 
-    def feedback_cycle_step(
-        self,
-        agent: Union[Agent, callable],
-        task: str,
-        img: Optional[str] = None,
-    ):
-        try:
-            # First run the main agent
-            agent_output = agent.run(task=task, img=img)
-
-            # Then run the judge agent
-            judge_output = self.run(task=agent_output, img=img)
-
-            # Run the main agent again with the judge's feedback, using a much improved prompt
-            improved_prompt = (
-                f"You have received the following detailed feedback from the expert agent judge ({self.agent_name}):\n\n"
-                f"--- FEEDBACK START ---\n{judge_output}\n--- FEEDBACK END ---\n\n"
-                f"Your task is to thoughtfully revise and enhance your previous output based on this critique. "
-                f"Carefully address all identified weaknesses, incorporate the suggestions, and strive to maximize the strengths noted. "
-                f"Be specific, accurate, and actionable in your improvements. "
-                f"Here is the original task for reference:\n\n"
-                f"--- TASK ---\n{task}\n--- END TASK ---\n\n"
-                f"Please provide your improved and fully revised output below."
-            )
-
-            return agent.run(task=improved_prompt, img=img)
-        except Exception as e:
-            raise AgentJudgeFeedbackCycleError(
-                f"Error In Agent Judge Feedback Cycle: {e} Traceback: {traceback.format_exc()}"
-            )
-
-    def feedback_cycle(
-        self,
-        agent: Union[Agent, callable],
-        task: str,
-        img: Optional[str] = None,
-        loops: int = 1,
-    ):
-        loop = 0
-        original_task = task  # Preserve the original task
-        current_output = None  # Track the current output
-        all_outputs = []  # Collect all outputs from each iteration
-
-        while loop < loops:
-            # First iteration: run the standard feedback cycle step
-            current_output = self.feedback_cycle_step(
-                agent, original_task, img
-            )
-
-            # Add the current output to our collection
-            all_outputs.append(current_output)
-            loop += 1
-
-        return all_outputs
-
-    def step(
-        self,
-        task: str = None,
-        tasks: Optional[List[str]] = None,
-        img: Optional[str] = None,
-    ) -> str:
+    def step(self, tasks: List[str]) -> str:
"""
|
"""
|
||||||
Processes a single task or list of tasks and returns the agent's evaluation.
|
Processes a list of tasks and returns the agent's response.
|
||||||
This method performs a one-shot evaluation of the provided content. It takes
|
|
||||||
either a single task string or a list of tasks and generates a comprehensive
|
|
||||||
evaluation with strengths, weaknesses, and improvement suggestions.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
task (str, optional): A single task/output to be evaluated.
|
tasks (List[str]): A list of tasks to be processed.
|
||||||
tasks (List[str], optional): A list of tasks/outputs to be evaluated.
|
|
||||||
img (str, optional): Path to an image file for multimodal evaluation.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: A detailed evaluation response from the agent including:
|
str: The response generated by the agent.
|
||||||
- Strengths: What the agent/output did well
|
|
||||||
- Weaknesses: Areas that need improvement
|
|
||||||
- Suggestions: Specific recommendations for improvement
|
|
||||||
- Factual accuracy assessment
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ValueError: If neither task nor tasks are provided.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
```python
|
|
||||||
# Single task evaluation
|
|
||||||
evaluation = judge.step(task="The answer is 42.")
|
|
||||||
|
|
||||||
# Multiple tasks evaluation
|
|
||||||
evaluation = judge.step(tasks=[
|
|
||||||
"Response 1: Paris is the capital of France",
|
|
||||||
"Response 2: 2 + 2 = 5" # Incorrect
|
|
||||||
])
|
|
||||||
|
|
||||||
# Multimodal evaluation
|
|
||||||
evaluation = judge.step(
|
|
||||||
task="Describe this image",
|
|
||||||
img="path/to/image.jpg"
|
|
||||||
)
|
|
||||||
```
|
|
||||||
"""
|
"""
|
||||||
-        try:
-            prompt = ""
-            if tasks:
-                prompt = any_to_str(tasks)
-            elif task:
-                prompt = task
-            else:
-                raise ValueError("No tasks or task provided")
-
-            # Add the evaluation criteria to the task description
-            task_instruction = "You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. "
-            task_instruction += "Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. "
-            task_instruction += "Your feedback should address the following points:\n"
-            task_instruction += "1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.\n"
-            task_instruction += "2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.\n"
-            task_instruction += "3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. "
-            task_instruction += "This may include advice on reasoning, structure, completeness, or style.\n"
-            task_instruction += "4. If relevant, point out any factual inaccuracies or logical inconsistencies.\n"
-
-            # Add the evaluation criteria to the task instruction
-            if self.evaluation_criteria:
-                criteria_names = list(self.evaluation_criteria.keys())
-                task_instruction += "\nPlease use these specific evaluation criteria with their respective weights:\n"
-                for criterion, weight in self.evaluation_criteria.items():
-                    task_instruction += f"- {criterion}: weight = {weight}\n"
-
-            task_instruction += "Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.\n\n"
-            task_instruction += f"Output(s) to evaluate:\n{prompt}\n"
-
-            response = self.agent.run(
-                task=task_instruction,
-                img=img,
-            )
-
-            return response
-        except Exception as e:
-            error_message = (
-                f"AgentJudge encountered an error: {e}\n"
-                f"Traceback:\n{traceback.format_exc()}\n\n"
-                "If this issue persists, please:\n"
-                "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n"
-                "- Join our Discord for real-time support: swarms.ai\n"
-                "- Or book a call: https://cal.com/swarms\n"
-            )
-            raise AgentJudgeExecutionError(error_message)
+        prompt = any_to_str(tasks)
+        logger.debug(f"Running step with prompt: {prompt}")
+
+        print(prompt)
+
+        response = self.agent.run(
+            task=f"Evaluate the following output or outputs: {prompt}"
+        )
+        logger.debug(f"Received response: {response}")
+
+        return response
 
-    def run(
-        self,
-        task: str = None,
-        tasks: Optional[List[str]] = None,
-        img: Optional[str] = None,
-    ):
+    def run(self, tasks: List[str]) -> List[str]:
         """
-        Executes evaluation in multiple iterations with context building and refinement.
-
-        This method runs the evaluation process for the specified number of max_loops,
-        where each iteration builds upon the previous context. This allows for iterative
-        refinement of evaluations and deeper analysis over multiple passes.
+        Executes the tasks in a loop, updating context and collecting responses.
 
         Args:
-            task (str, optional): A single task/output to be evaluated.
-            tasks (List[str], optional): A list of tasks/outputs to be evaluated.
-            img (str, optional): Path to an image file for multimodal evaluation.
+            tasks (List[str]): A list of tasks to be executed.
 
         Returns:
-            List[str]: A list of evaluation responses, one for each iteration.
-                Each subsequent evaluation includes context from previous iterations.
-
-        Example:
-            ```python
-            # Single task with iterative refinement
-            judge = AgentJudge(max_loops=3)
-            evaluations = judge.run(task="Agent output to evaluate")
-            # Returns 3 evaluations, each building on the previous
-
-            # Multiple tasks with context building
-            evaluations = judge.run(tasks=[
-                "First agent response",
-                "Second agent response"
-            ])
-
-            # With image analysis
-            evaluations = judge.run(
-                task="Analyze this chart",
-                img="chart.png"
-            )
-            ```
-
-        Note:
-            - The first iteration evaluates the original task(s)
-            - Subsequent iterations include context from previous evaluations
-            - This enables deeper analysis and refinement of judgments
-            - Useful for complex evaluations requiring multiple perspectives
+            List[str]: A list of responses generated by the agent for each iteration.
         """
-        try:
-            responses = []
-            context = ""
-
-            # Convert single task to list for consistent processing
-            if task and not tasks:
-                tasks = [task]
-                task = None  # Clear to avoid confusion in step method
-
-            for _ in range(self.max_loops):
-                # Add context to the tasks if available
-                if context and tasks:
-                    contextualized_tasks = [
-                        f"Previous context: {context}\nTask: {t}"
-                        for t in tasks
-                    ]
-                else:
-                    contextualized_tasks = tasks
-
-                # Get response for current iteration
-                current_response = self.step(
-                    task=task,
-                    tasks=contextualized_tasks,
-                    img=img,
-                )
-
-                responses.append(current_response)
-
-                # Update context for next iteration
-                context = current_response
-
-            return responses
-        except Exception as e:
-            error_message = (
-                f"AgentJudge encountered an error: {e}\n"
-                f"Traceback:\n{traceback.format_exc()}\n\n"
-                "If this issue persists, please:\n"
-                "- Open a GitHub issue: https://github.com/swarms-ai/swarms/issues\n"
-                "- Join our Discord for real-time support: swarms.ai\n"
-                "- Or book a call: https://cal.com/swarms\n"
-            )
-            raise AgentJudgeExecutionError(error_message)
+        responses = []
+        context = ""
+
+        for _ in range(self.max_loops):
+            # Add context to the tasks if available
+            if context:
+                contextualized_tasks = [
+                    f"Previous context: {context}\nTask: {task}"
+                    for task in tasks
+                ]
+            else:
+                contextualized_tasks = tasks
+
+            # Get response for current iteration
+            current_response = self.step(contextualized_tasks)
+            responses.append(current_response)
+            logger.debug(
+                f"Current response added: {current_response}"
+            )
+
+            # Add to conversation history
+            logger.debug("Added message to conversation history.")
+
+            # Update context for next iteration
+            context = current_response
+
+        return responses
 
-    def run_batched(
-        self,
-        tasks: Optional[List[str]] = None,
-        imgs: Optional[List[str]] = None,
-    ):
-        """
-        Executes batch evaluation of multiple tasks with corresponding images.
-
-        This method processes multiple task-image pairs independently, where each
-        task can be evaluated with its corresponding image. Unlike the run() method,
-        this doesn't build context between different tasks - each is evaluated
-        independently.
-
-        Args:
-            tasks (List[str], optional): A list of tasks/outputs to be evaluated.
-            imgs (List[str], optional): A list of image paths corresponding to each task.
-                Must be the same length as tasks if provided.
-
-        Returns:
-            List[List[str]]: A list of evaluation responses for each task. Each inner
-                list contains the responses from all iterations (max_loops)
-                for that particular task.
-
-        Example:
-            ```python
-            # Batch evaluation with images
-            tasks = [
-                "Describe what you see in this image",
-                "What's wrong with this chart?",
-                "Analyze the trends shown"
-            ]
-            images = [
-                "photo1.jpg",
-                "chart1.png",
-                "graph1.png"
-            ]
-            evaluations = judge.run_batched(tasks=tasks, imgs=images)
-            # Returns evaluations for each task-image pair
-
-            # Batch evaluation without images
-            evaluations = judge.run_batched(tasks=[
-                "Agent response 1",
-                "Agent response 2",
-                "Agent response 3"
-            ])
-            ```
-
-        Note:
-            - Each task is processed independently
-            - If imgs is provided, it must have the same length as tasks
-            - Each task goes through max_loops iterations independently
-            - No context is shared between different tasks in the batch
-        """
-        responses = []
-        for task, img in zip(tasks, imgs):
-            response = self.run(task=task, img=img)
-            responses.append(response)
-        return responses
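For reference, here is a minimal usage sketch of the simplified class as it stands after this commit. It assumes the import path used by the deleted example (`swarms.agents.agent_judge`) and relies only on behavior visible in the new code: `step()` and `run()` take a list of task strings (a single output must be wrapped in a list, and the old `task=`/`img=` keywords are gone), and with `max_loops > 1` each later pass sees the previous response prefixed as `Previous context: ...`.

```python
from swarms.agents.agent_judge import AgentJudge

# Two passes: run() feeds the first critique back in as
# "Previous context: ..." before the second pass.
judge = AgentJudge(model_name="openai/o1", max_loops=2)

outputs_to_review = [
    "Agent response: The capital of France is Paris.",
    "Agent response: 2 + 2 = 5",
]

# Old style: judge.run(task="...", img="...")
# New style: pass a list of strings.
evaluations = judge.run(outputs_to_review)

for i, evaluation in enumerate(evaluations, start=1):
    print(f"--- pass {i} ---")
    print(evaluation)
```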