"""
|
|
Tests for the autonomous evaluation feature in AutoSwarmBuilder.
|
|
|
|
This test suite validates the iterative improvement functionality and evaluation system.
|
|
"""
|
|
|
|
import pytest
|
|
from unittest.mock import patch, MagicMock
|
|
|
|
from swarms.structs.auto_swarm_builder import (
|
|
AutoSwarmBuilder,
|
|
IterativeImprovementConfig,
|
|
EvaluationResult,
|
|
)
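
# Illustrative usage sketch (comments only, not executed): the workflow these
# tests exercise. The constructor arguments are taken from the tests below and
# may differ in other versions of the library.
#
#   config = IterativeImprovementConfig(max_iterations=3)
#   swarm = AutoSwarmBuilder(
#       name="MySwarm",
#       enable_evaluation=True,
#       evaluation_config=config,
#   )
#   output = swarm.run("some task")
#   best = swarm.get_best_iteration()  # highest-scoring EvaluationResult, or None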


class TestAutonomousEvaluation:
    """Test suite for autonomous evaluation features"""

    def test_iterative_improvement_config_defaults(self):
        """Test default configuration values"""
        config = IterativeImprovementConfig()

        assert config.max_iterations == 3
        assert config.improvement_threshold == 0.1
        assert "accuracy" in config.evaluation_dimensions
        assert "helpfulness" in config.evaluation_dimensions
        assert config.use_judge_agent is True
        assert config.store_all_iterations is True

    def test_iterative_improvement_config_custom(self):
        """Test custom configuration values"""
        config = IterativeImprovementConfig(
            max_iterations=5,
            improvement_threshold=0.2,
            evaluation_dimensions=["accuracy", "coherence"],
            use_judge_agent=False,
            store_all_iterations=False,
        )

        assert config.max_iterations == 5
        assert config.improvement_threshold == 0.2
        assert len(config.evaluation_dimensions) == 2
        assert config.use_judge_agent is False
        assert config.store_all_iterations is False

    def test_evaluation_result_model(self):
        """Test EvaluationResult model creation and validation"""
        result = EvaluationResult(
            iteration=1,
            task="Test task",
            output="Test output",
            evaluation_scores={"accuracy": 0.8, "helpfulness": 0.7},
            feedback="Good performance",
            strengths=["Clear response"],
            weaknesses=["Could be more detailed"],
            suggestions=["Add more examples"],
        )

        assert result.iteration == 1
        assert result.task == "Test task"
        assert result.evaluation_scores["accuracy"] == 0.8
        assert len(result.strengths) == 1
        assert len(result.weaknesses) == 1
        assert len(result.suggestions) == 1

    def test_auto_swarm_builder_init_with_evaluation(self):
        """Test AutoSwarmBuilder initialization with evaluation enabled"""
        config = IterativeImprovementConfig(max_iterations=2)

        with patch('swarms.structs.auto_swarm_builder.CouncilAsAJudge'):
            with patch('swarms.structs.auto_swarm_builder.Agent'):
                swarm = AutoSwarmBuilder(
                    name="TestSwarm",
                    description="Test swarm with evaluation",
                    enable_evaluation=True,
                    evaluation_config=config,
                )

                assert swarm.enable_evaluation is True
                assert swarm.evaluation_config.max_iterations == 2
                assert swarm.current_iteration == 0
                assert len(swarm.evaluation_history) == 0

    def test_auto_swarm_builder_init_without_evaluation(self):
        """Test AutoSwarmBuilder initialization with evaluation disabled"""
        swarm = AutoSwarmBuilder(
            name="TestSwarm",
            description="Test swarm without evaluation",
            enable_evaluation=False,
        )

        assert swarm.enable_evaluation is False
        assert swarm.current_iteration == 0
        assert len(swarm.evaluation_history) == 0
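
    # Note: stacked @patch decorators are applied bottom-up, so the lowest
    # decorator supplies the first mock argument after self.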
    @patch('swarms.structs.auto_swarm_builder.CouncilAsAJudge')
    @patch('swarms.structs.auto_swarm_builder.Agent')
    def test_evaluation_system_initialization(self, mock_agent, mock_council):
        """Test evaluation system initialization"""
        config = IterativeImprovementConfig()

        swarm = AutoSwarmBuilder(
            name="TestSwarm",
            enable_evaluation=True,
            evaluation_config=config,
        )

        # Verify CouncilAsAJudge was initialized
        mock_council.assert_called_once()

        # Verify improvement agent was created
        mock_agent.assert_called_once()
        assert mock_agent.call_args[1]['agent_name'] == 'ImprovementStrategist'

    def test_get_improvement_agent_prompt(self):
        """Test improvement agent prompt generation"""
        swarm = AutoSwarmBuilder(enable_evaluation=False)
        prompt = swarm._get_improvement_agent_prompt()

        assert "improvement strategist" in prompt.lower()
        assert "evaluation feedback" in prompt.lower()
        assert "recommendations" in prompt.lower()

    def test_extract_dimension_score(self):
        """Test dimension score extraction from feedback"""
        swarm = AutoSwarmBuilder(enable_evaluation=False)

        # Test positive feedback
        positive_feedback = "The response is accurate and helpful"
        accuracy_score = swarm._extract_dimension_score(positive_feedback, "accuracy")
        helpfulness_score = swarm._extract_dimension_score(positive_feedback, "helpfulness")

        assert accuracy_score > 0.5
        assert helpfulness_score > 0.5

        # Test negative feedback
        negative_feedback = "The response is inaccurate and unhelpful"
        accuracy_score_neg = swarm._extract_dimension_score(negative_feedback, "accuracy")
        helpfulness_score_neg = swarm._extract_dimension_score(negative_feedback, "helpfulness")

        assert accuracy_score_neg < 0.5
        assert helpfulness_score_neg < 0.5

        # Test neutral feedback
        neutral_feedback = "The response exists"
        neutral_score = swarm._extract_dimension_score(neutral_feedback, "accuracy")
        assert neutral_score == 0.5

    def test_parse_feedback(self):
        """Test feedback parsing into strengths, weaknesses, and suggestions"""
        swarm = AutoSwarmBuilder(enable_evaluation=False)

        feedback = """
        The response shows good understanding of the topic.
        However, there are some issues with clarity.
        I suggest adding more examples to improve comprehension.
        The strength is in the factual accuracy.
        The weakness is the lack of structure.
        Recommend reorganizing the content.
        """

        strengths, weaknesses, suggestions = swarm._parse_feedback(feedback)

        assert len(strengths) > 0
        assert len(weaknesses) > 0
        assert len(suggestions) > 0

    def test_get_evaluation_results(self):
        """Test getting evaluation results"""
        swarm = AutoSwarmBuilder(enable_evaluation=False)

        # Initially empty
        assert len(swarm.get_evaluation_results()) == 0

        # Add mock evaluation result
        mock_result = EvaluationResult(
            iteration=1,
            task="test",
            output="test output",
            evaluation_scores={"accuracy": 0.8},
            feedback="good",
            strengths=["clear"],
            weaknesses=["brief"],
            suggestions=["expand"],
        )
        swarm.evaluation_history.append(mock_result)

        results = swarm.get_evaluation_results()
        assert len(results) == 1
        assert results[0].iteration == 1

    def test_get_best_iteration(self):
        """Test getting the best performing iteration"""
        swarm = AutoSwarmBuilder(enable_evaluation=False)

        # No iterations initially
        assert swarm.get_best_iteration() is None

        # Add mock evaluation results
        result1 = EvaluationResult(
            iteration=1,
            task="test",
            output="output1",
            evaluation_scores={"accuracy": 0.6, "helpfulness": 0.5},
            feedback="ok",
            strengths=[],
            weaknesses=[],
            suggestions=[],
        )

        result2 = EvaluationResult(
            iteration=2,
            task="test",
            output="output2",
            evaluation_scores={"accuracy": 0.8, "helpfulness": 0.7},
            feedback="better",
            strengths=[],
            weaknesses=[],
            suggestions=[],
        )

        swarm.evaluation_history.extend([result1, result2])

        best = swarm.get_best_iteration()
        assert best.iteration == 2  # Second iteration has higher scores

    @patch('swarms.structs.auto_swarm_builder.OpenAIFunctionCaller')
    def test_create_agents_with_feedback_first_iteration(self, mock_function_caller):
        """Test agent creation for first iteration (no feedback)"""
        swarm = AutoSwarmBuilder(enable_evaluation=False)

        # Mock the function caller
        mock_instance = MagicMock()
        mock_function_caller.return_value = mock_instance
        mock_instance.run.return_value.model_dump.return_value = {
            "agents": [
                {
                    "name": "TestAgent",
                    "description": "A test agent",
                    "system_prompt": "You are a test agent",
                }
            ]
        }

        # Mock build_agent method
        with patch.object(swarm, 'build_agent') as mock_build_agent:
            mock_agent = MagicMock()
            mock_build_agent.return_value = mock_agent

            agents = swarm.create_agents_with_feedback("test task")

            assert len(agents) == 1
            mock_build_agent.assert_called_once()

    def test_run_single_iteration_mode(self):
        """Test running in single iteration mode (evaluation disabled)"""
        swarm = AutoSwarmBuilder(enable_evaluation=False)

        with patch.object(swarm, 'create_agents') as mock_create:
            with patch.object(swarm, 'initialize_swarm_router') as mock_router:
                mock_create.return_value = []
                mock_router.return_value = "test result"

                result = swarm.run("test task")

                assert result == "test result"
                mock_create.assert_called_once_with("test task")
                mock_router.assert_called_once()


class TestEvaluationIntegration:
    """Integration tests for the evaluation system"""
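
    # All LLM-backed collaborators (CouncilAsAJudge, Agent, OpenAIFunctionCaller)
    # are mocked below, so the workflow runs without any external API calls.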
    @patch('swarms.structs.auto_swarm_builder.CouncilAsAJudge')
    @patch('swarms.structs.auto_swarm_builder.Agent')
    @patch('swarms.structs.auto_swarm_builder.OpenAIFunctionCaller')
    def test_evaluation_workflow(self, mock_function_caller, mock_agent, mock_council):
        """Test the complete evaluation workflow"""
        # Set up mocks
        mock_council_instance = MagicMock()
        mock_council.return_value = mock_council_instance
        mock_council_instance.run.return_value = "Evaluation feedback"

        mock_agent_instance = MagicMock()
        mock_agent.return_value = mock_agent_instance
        mock_agent_instance.run.return_value = "Improvement suggestions"

        mock_function_caller_instance = MagicMock()
        mock_function_caller.return_value = mock_function_caller_instance
        mock_function_caller_instance.run.return_value.model_dump.return_value = {
            "agents": [
                {
                    "name": "TestAgent",
                    "description": "Test",
                    "system_prompt": "Test prompt",
                }
            ]
        }

        # Configure the swarm
        config = IterativeImprovementConfig(max_iterations=1)
        swarm = AutoSwarmBuilder(
            name="TestSwarm",
            enable_evaluation=True,
            evaluation_config=config,
        )

        # Mock additional methods
        with patch.object(swarm, 'build_agent') as mock_build:
            with patch.object(swarm, 'initialize_swarm_router') as mock_router:
                mock_build.return_value = mock_agent_instance
                mock_router.return_value = "Task output"

                # Run the swarm
                result = swarm.run("test task")

                # Verify evaluation was performed
                assert len(swarm.evaluation_history) == 1
                assert result == "Task output"


if __name__ == "__main__":
    pytest.main([__file__])