[FEAT][CouncilAsAJudge]

2 weeks ago · 5d5ac53c9e
parent 3edf5553f7
commit 5d5ac53c9e
10 changed files with 292 additions and 342 deletions
--- a/council_judge_example.py
+++ b/council_judge_example.py
@ -9,7 +9,7 @@ if __name__ == "__main__":
    base_agent = Agent(
        agent_name="Financial-Analysis-Agent",
        system_prompt="You are a financial expert helping users understand and establish ROTH IRAs.",
-        model_name="gpt-4o-mini",
+        model_name="claude-opus-4-20250514",
        max_loops=1,
    )

--- a/docs/swarms/structs/council_of_judges.md
+++ b/docs/swarms/structs/council_of_judges.md
@ -0,0 +1,283 @@
+# CouncilAsAJudge
+
+The `CouncilAsAJudge` is a sophisticated evaluation system that employs multiple AI agents to assess model responses across various dimensions. It provides comprehensive, multi-dimensional analysis of AI model outputs through parallel evaluation and aggregation.
+
+## Overview
+
+The `CouncilAsAJudge` implements a council of specialized AI agents that evaluate different aspects of a model's response. Each agent focuses on a specific dimension of evaluation, and their findings are aggregated into a comprehensive report.
+
+## Key Features
+
+- Parallel evaluation across multiple dimensions
+- Caching system for improved performance
+- Dynamic model selection
+- Comprehensive evaluation metrics
+- Thread-safe execution
+- Detailed technical analysis
+
+## Installation
+
+```bash
+pip install swarms
+```
+
+## Basic Usage
+
+```python
+from swarms import Agent, CouncilAsAJudge
+
+# Create a base agent
+base_agent = Agent(
+    agent_name="Financial-Analysis-Agent",
+    system_prompt="You are a financial expert helping users understand and establish ROTH IRAs.",
+    model_name="claude-opus-4-20250514",
+    max_loops=1,
+)
+
+# Run the base agent
+user_query = "How can I establish a ROTH IRA to buy stocks and get a tax break?"
+model_output = base_agent.run(user_query)
+
+# Create and run the council
+panel = CouncilAsAJudge()
+results = panel.run(user_query, model_output)
+print(results)
+```
+
+## Advanced Usage
+
+### Custom Model Configuration
+
+```python
+from swarms import CouncilAsAJudge
+
+# Initialize with custom model
+council = CouncilAsAJudge(
+    model_name="anthropic/claude-3-sonnet-20240229",
+    output_type="all",
+    cache_size=256,
+    max_workers=4,
+    random_model_name=False
+)
+```
+
+### Parallel Processing Configuration
+
+```python
+from swarms import CouncilAsAJudge
+
+# Configure parallel processing
+council = CouncilAsAJudge(
+    max_workers=8,  # Custom number of worker threads
+    random_model_name=True  # Enable dynamic model selection
+)
+```
+
+## Evaluation Dimensions
+
+The council evaluates responses across six key dimensions:
+
+1. **Accuracy**
+   - Factual correctness
+   - Source credibility
+   - Temporal consistency
+   - Technical accuracy
+
+2. **Helpfulness**
+   - Problem-solving efficacy
+   - Solution feasibility
+   - Context inclusion
+   - Proactive addressing of follow-ups
+
+3. **Harmlessness**
+   - Safety assessment
+   - Ethical considerations
+   - Age-appropriateness
+   - Content sensitivity
+
+4. **Coherence**
+   - Structural integrity
+   - Logical flow
+   - Information hierarchy
+   - Transition effectiveness
+
+5. **Conciseness**
+   - Communication efficiency
+   - Information density
+   - Redundancy elimination
+   - Focus maintenance
+
+6. **Instruction Adherence**
+   - Requirement coverage
+   - Constraint compliance
+   - Format matching
+   - Scope appropriateness
+
+## API Reference
+
+### CouncilAsAJudge
+
+```python
+class CouncilAsAJudge:
+    def __init__(
+        self,
+        id: str = swarm_id(),
+        name: str = "CouncilAsAJudge",
+        description: str = "Evaluates the model's response across multiple dimensions",
+        model_name: str = "gpt-4o-mini",
+        output_type: str = "all",
+        cache_size: int = 128,
+        max_workers: int = None,
+        random_model_name: bool = True,
+    )
+```
+
+#### Parameters
+
+- `id` (str): Unique identifier for the council
+- `name` (str): Display name of the council
+- `description` (str): Description of the council's purpose
+- `model_name` (str): Name of the model to use for evaluations
+- `output_type` (str): Type of output to return
+- `cache_size` (int): Size of the LRU cache for prompts
+- `max_workers` (int): Maximum number of worker threads
+- `random_model_name` (bool): Whether to use random model selection
+
+### Methods
+
+#### run
+
+```python
+def run(self, task: str, model_response: str) -> None
+```
+
+Evaluates a model response across all dimensions.
+
+##### Parameters
+
+- `task` (str): Original user prompt
+- `model_response` (str): Model's response to evaluate
+
+##### Returns
+
+- Comprehensive evaluation report
+
+## Examples
+
+### Financial Analysis Example
+
+```python
+from swarms import Agent, CouncilAsAJudge
+
+# Create financial analysis agent
+financial_agent = Agent(
+    agent_name="Financial-Analysis-Agent",
+    system_prompt="You are a financial expert helping users understand and establish ROTH IRAs.",
+    model_name="claude-opus-4-20250514",
+    max_loops=1,
+)
+
+# Run analysis
+query = "How can I establish a ROTH IRA to buy stocks and get a tax break?"
+response = financial_agent.run(query)
+
+# Evaluate response
+council = CouncilAsAJudge()
+evaluation = council.run(query, response)
+print(evaluation)
+```
+
+### Technical Documentation Example
+
+```python
+from swarms import Agent, CouncilAsAJudge
+
+# Create documentation agent
+doc_agent = Agent(
+    agent_name="Documentation-Agent",
+    system_prompt="You are a technical documentation expert.",
+    model_name="gpt-4",
+    max_loops=1,
+)
+
+# Generate documentation
+query = "Explain how to implement a REST API using FastAPI"
+response = doc_agent.run(query)
+
+# Evaluate documentation quality
+council = CouncilAsAJudge(
+    model_name="anthropic/claude-3-sonnet-20240229",
+    output_type="all"
+)
+evaluation = council.run(query, response)
+print(evaluation)
+```
+
+## Best Practices
+
+### Model Selection
+
+!!! tip "Model Selection Best Practices"
+    - Choose appropriate models for your use case
+    - Consider using random model selection for diverse evaluations
+    - Match model capabilities to evaluation requirements
+
+### Performance Optimization
+
+!!! note "Performance Tips"
+    - Adjust cache size based on memory constraints
+    - Configure worker threads based on CPU cores
+    - Monitor memory usage with large responses
+
+### Error Handling
+
+!!! warning "Error Handling Guidelines"
+    - Implement proper exception handling
+    - Monitor evaluation failures
+    - Log evaluation results for analysis
+
+### Resource Management
+
+!!! info "Resource Management"
+    - Clean up resources after evaluation
+    - Monitor thread pool usage
+    - Implement proper shutdown procedures
+
+## Troubleshooting
+
+### Memory Issues
+
+!!! danger "Memory Problems"
+    If you encounter memory-related problems:
+
+    - Reduce cache size
+    - Decrease number of worker threads
+    - Process smaller chunks of text
+
+### Performance Problems
+
+!!! warning "Performance Issues"
+    To improve performance:
+
+    - Increase cache size
+    - Adjust worker thread count
+    - Use more efficient models
+
+### Evaluation Failures
+
+!!! danger "Evaluation Issues"
+    When evaluations fail:
+
+    - Check model availability
+    - Verify input format
+    - Monitor error logs
+
+## Contributing
+
+!!! success "Contributing"
+    Contributions are welcome! Please feel free to submit a Pull Request.
+
+## License
+
+!!! info "License"
+    This project is licensed under the MIT License - see the LICENSE file for details.
--- a/example_concurrent.py
+++ b/example_concurrent.py
@ -1,92 +0,0 @@
-from swarms.structs.agent import Agent
-from te import run_concurrently_greenlets, with_retries
-from typing import Callable, List, Tuple
-
-
-# Define some example agent tasks
-@with_retries(max_retries=2)
-def financial_analysis_task(query: str) -> str:
-    agent = Agent(
-        agent_name="Financial-Analysis-Agent",
-        agent_description="Personal finance advisor agent",
-        system_prompt="You are a personal finance advisor agent",
-        max_loops=2,
-        model_name="gpt-4o-mini",
-        dynamic_temperature_enabled=True,
-        interactive=False,
-        output_type="final",
-        safety_prompt_on=True,
-    )
-    return agent.run(query)
-
-
-@with_retries(max_retries=2)
-def investment_advice_task(query: str) -> str:
-    agent = Agent(
-        agent_name="Investment-Advisor-Agent",
-        agent_description="Investment strategy advisor agent",
-        system_prompt="You are an investment strategy advisor agent",
-        max_loops=2,
-        model_name="gpt-4o-mini",
-        dynamic_temperature_enabled=True,
-        interactive=False,
-        output_type="final",
-        safety_prompt_on=True,
-    )
-    return agent.run(query)
-
-
-async def market_analysis_task(query: str) -> str:
-    agent = Agent(
-        agent_name="Market-Analysis-Agent",
-        agent_description="Market analysis agent",
-        system_prompt="You are a market analysis agent",
-        max_loops=2,
-        model_name="gpt-4o-mini",
-        dynamic_temperature_enabled=True,
-        interactive=False,
-        output_type="final",
-        safety_prompt_on=True,
-    )
-    return agent.run(query)
-
-
-def main():
-    # Define the tasks to run concurrently
-    tasks: List[Tuple[Callable, tuple, dict]] = [
-        (
-            financial_analysis_task,
-            ("What are the best practices for saving money?",),
-            {},
-        ),
-        (
-            investment_advice_task,
-            ("What are the current market trends?",),
-            {},
-        ),
-        (
-            market_analysis_task,
-            ("Analyze the current market conditions",),
-            {},
-        ),
-    ]
-
-    # Run the tasks concurrently
-    results = run_concurrently_greenlets(
-        tasks,
-        timeout=30,  # 30 seconds global timeout
-        max_concurrency=3,  # Run 3 tasks concurrently
-        max_retries=2,
-        task_timeout=10,  # 10 seconds per task timeout
-    )
-
-    # Process and display results
-    for i, result in enumerate(results):
-        if isinstance(result, Exception):
-            print(f"Task {i} failed with error: {result}")
-        else:
-            print(f"Task {i} succeeded with result: {result}")
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/models/claude_4.py
+++ b/examples/models/claude_4.py
--- a/examples/models/claude_4_example.py
+++ b/examples/models/claude_4_example.py
--- a/examples/redis_conversation.py
+++ b/examples/redis_conversation.py
--- a/examples/single_agent/rag/pinecone_example.py
+++ b/examples/single_agent/rag/pinecone_example.py
--- a/swarms/structs/init.py
+++ b/swarms/structs/init.py
@ -78,6 +78,7 @@ from swarms.structs.swarming_architectures import (
    star_swarm,
 )
 from swarms.structs.auto_swarm_builder import AutoSwarmBuilder
+from swarms.structs.council_judge import CouncilAsAJudge

 __all__ = [
    "Agent",
@ -146,4 +147,5 @@ __all__ = [
    "get_agents_info",
    "get_swarms_info",
    "AutoSwarmBuilder",
+    "CouncilAsAJudge",
 ]
--- a/swarms/structs/council_judge.py
+++ b/swarms/structs/council_judge.py
@ -1,11 +1,13 @@
-from typing import Dict, Tuple
-from functools import lru_cache
 import multiprocessing
+import uuid
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from functools import lru_cache
+from typing import Dict, Tuple
+
+from loguru import logger
+
 from swarms.structs.agent import Agent
 from swarms.structs.conversation import Conversation
-from loguru import logger
-import uuid
 from swarms.structs.ma_utils import set_random_models_for_agents
 from swarms.utils.history_output_formatter import (
    history_output_formatter,
--- a/te.py
+++ b/te.py
@ -1,245 +0,0 @@
-import gevent
-from gevent import monkey, pool
-import asyncio
-from functools import wraps
-from typing import Callable, List, Tuple, Union, Optional, Any, Dict
-import time
-from contextlib import contextmanager
-from dataclasses import dataclass
-from datetime import datetime
-from loguru import logger
-
-# Move monkey patching to the top and be more specific about what we patch
-monkey.patch_all(thread=False, select=False, ssl=False)
-
-
-@dataclass
-class TaskMetrics:
-    start_time: datetime
-    end_time: Optional[datetime] = None
-    success: bool = False
-    error: Optional[Exception] = None
-    retries: int = 0
-
-
-class TaskExecutionError(Exception):
-    """Custom exception for task execution errors"""
-
-    def __init__(self, task_name: str, error: Exception):
-        self.task_name = task_name
-        self.original_error = error
-        super().__init__(
-            f"Task {task_name} failed with error: {str(error)}"
-        )
-
-
-@contextmanager
-def task_timer(task_name: str):
-    """Context manager for timing task execution"""
-    start_time = datetime.now()
-    try:
-        yield
-    finally:
-        end_time = datetime.now()
-        duration = (end_time - start_time).total_seconds()
-        logger.debug(
-            f"Task {task_name} completed in {duration:.2f} seconds"
-        )
-
-
-def with_retries(max_retries: int = 3, delay: float = 1.0):
-    def decorator(func):
-        @wraps(func)
-        def wrapper(*args, **kwargs):
-            last_exception = None
-            for attempt in range(max_retries):
-                try:
-                    return func(*args, **kwargs)
-                except Exception as e:
-                    last_exception = e
-                    if attempt < max_retries - 1:
-                        time.sleep(
-                            delay * (attempt + 1)
-                        )  # Exponential backoff
-                        logger.warning(
-                            f"Retry {attempt + 1}/{max_retries} for {func.__name__}"
-                        )
-                    else:
-                        logger.error(
-                            f"All {max_retries} retries failed for {func.__name__}"
-                        )
-                        return last_exception  # Return the exception instead of raising it
-            return last_exception
-
-        return wrapper
-
-    return decorator
-
-
-def run_concurrently_greenlets(
-    tasks: List[Union[Callable, Tuple[Callable, tuple, dict]]],
-    timeout: Optional[float] = None,
-    max_concurrency: int = 100,
-    max_retries: int = 3,
-    task_timeout: Optional[float] = None,
-    metrics: Optional[Dict[str, TaskMetrics]] = None,
-) -> List[Any]:
-    """
-    Execute multiple tasks concurrently using gevent greenlets.
-
-    Args:
-        tasks: List of tasks to execute. Each task can be a callable or a tuple of (callable, args, kwargs)
-        timeout: Global timeout for all tasks in seconds
-        max_concurrency: Maximum number of concurrent tasks
-        max_retries: Maximum number of retries per task
-        task_timeout: Individual task timeout in seconds
-        metrics: Optional dictionary to store task execution metrics
-
-    Returns:
-        List of results from all tasks. Failed tasks will return their exception.
-    """
-    if metrics is None:
-        metrics = {}
-
-    pool_obj = pool.Pool(max_concurrency)
-    jobs = []
-    start_time = datetime.now()
-
-    def wrapper(task_info):
-        if isinstance(task_info, tuple):
-            fn, args, kwargs = task_info
-        else:
-            fn, args, kwargs = task_info, (), {}
-
-        task_name = (
-            fn.__name__ if hasattr(fn, "__name__") else str(fn)
-        )
-        metrics[task_name] = TaskMetrics(start_time=datetime.now())
-
-        with task_timer(task_name):
-            try:
-                if asyncio.iscoroutinefunction(fn):
-                    # Handle async functions
-                    loop = asyncio.new_event_loop()
-                    asyncio.set_event_loop(loop)
-                    try:
-                        if task_timeout:
-                            result = asyncio.wait_for(
-                                fn(*args, **kwargs),
-                                timeout=task_timeout,
-                            )
-                        else:
-                            result = loop.run_until_complete(
-                                fn(*args, **kwargs)
-                            )
-                        metrics[task_name].success = True
-                        return result
-                    finally:
-                        loop.close()
-                else:
-                    if task_timeout:
-                        with gevent.Timeout(
-                            task_timeout,
-                            TimeoutError(
-                                f"Task {task_name} timed out after {task_timeout} seconds"
-                            ),
-                        ):
-                            result = fn(*args, **kwargs)
-                    else:
-                        result = fn(*args, **kwargs)
-
-                    if isinstance(result, Exception):
-                        metrics[task_name].error = result
-                        return result
-
-                    metrics[task_name].success = True
-                    return result
-            except Exception as e:
-                metrics[task_name].error = e
-                logger.exception(
-                    f"Task {task_name} failed with error: {str(e)}"
-                )
-                return TaskExecutionError(task_name, e)
-            finally:
-                metrics[task_name].end_time = datetime.now()
-
-    try:
-        for task in tasks:
-            jobs.append(pool_obj.spawn(wrapper, task))
-
-        gevent.joinall(jobs, timeout=timeout)
-
-        results = []
-        for job in jobs:
-            if job.ready():
-                results.append(job.value)
-            else:
-                timeout_error = TimeoutError("Task timed out")
-                results.append(timeout_error)
-                if hasattr(job, "value") and hasattr(
-                    job.value, "__name__"
-                ):
-                    metrics[job.value.__name__].error = timeout_error
-                    metrics[job.value.__name__].end_time = (
-                        datetime.now()
-                    )
-
-        return results
-    except Exception:
-        logger.exception("Fatal error in task execution")
-        raise
-    finally:
-        # Cleanup
-        pool_obj.kill()
-        execution_time = (datetime.now() - start_time).total_seconds()
-        logger.info(
-            f"Total execution time: {execution_time:.2f} seconds"
-        )
-
-        # Log metrics summary
-        success_count = sum(1 for m in metrics.values() if m.success)
-        failure_count = len(metrics) - success_count
-        logger.info(
-            f"Task execution summary: {success_count} succeeded, {failure_count} failed"
-        )
-
-
-# # Example tasks
-# @with_retries(max_retries=3)
-# def task_1(x: int, y: int):
-#     import time
-
-#     time.sleep(1)
-#     return f"task 1 done with {x + y}"
-
-
-# @with_retries(max_retries=3)
-# def task_3():
-#     import time
-
-#     time.sleep(0.5)
-#     return "task 3 done"
-
-
-# async def async_task(x: int):
-#     await asyncio.sleep(1)
-#     return f"async task done with {x}"
-
-
-# if __name__ == "__main__":
-#     # Example usage with different types of tasks
-#     tasks = [
-#         (task_1, (1, 2), {}),  # Function with args
-#         (task_3, (), {}),  # Function without args (explicit)
-#         (async_task, (42,), {}),  # Async function
-#     ]
-
-#     results = run_concurrently_greenlets(
-#         tasks, timeout=5, max_concurrency=10, max_retries=3
-#     )
-
-#     for i, result in enumerate(results):
-#         if isinstance(result, Exception):
-#             print(f"Task {i} failed with {result}")
-#         else:
-#             print(f"Task {i} succeeded with result: {result}")