swarms/tests/structs/test_council_as_judge_funct...

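"""Functional tests for swarms.structs.council_as_judge.

Covers the exception hierarchy (EvaluationError, DimensionEvaluationError,
AggregationError), the EVAL_DIMENSIONS registry, and the judge/aggregator
prompt builders.
"""
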
import pytest

from swarms.structs.council_as_judge import (
    EvaluationError,
    DimensionEvaluationError,
    AggregationError,
    EVAL_DIMENSIONS,
    judge_system_prompt,
    build_judge_prompt,
    aggregator_system_prompt,
    build_aggregation_prompt,
)


def test_evaluation_error_is_exception():
    """Test that EvaluationError is an Exception subclass"""
    assert issubclass(EvaluationError, Exception)


def test_dimension_evaluation_error_is_evaluation_error():
    """Test that DimensionEvaluationError is an EvaluationError subclass"""
    assert issubclass(DimensionEvaluationError, EvaluationError)


def test_aggregation_error_is_evaluation_error():
    """Test that AggregationError is an EvaluationError subclass"""
    assert issubclass(AggregationError, EvaluationError)


def test_eval_dimensions_exists():
    """Test that the EVAL_DIMENSIONS dictionary exists and is non-empty"""
    assert isinstance(EVAL_DIMENSIONS, dict)
    assert len(EVAL_DIMENSIONS) > 0


def test_eval_dimensions_contains_expected_keys():
    """Test that EVAL_DIMENSIONS contains the expected evaluation dimensions"""
    expected_dimensions = [
        "accuracy",
        "helpfulness",
        "harmlessness",
        "coherence",
        "conciseness",
        "instruction_adherence",
    ]
    for dimension in expected_dimensions:
        assert dimension in EVAL_DIMENSIONS


def test_eval_dimensions_values_are_strings():
    """Test that all EVAL_DIMENSIONS values are non-empty strings"""
    for dimension, description in EVAL_DIMENSIONS.items():
        assert isinstance(description, str)
        assert len(description) > 0


def test_judge_system_prompt_returns_string():
    """Test that judge_system_prompt returns a non-empty string"""
    result = judge_system_prompt()
    assert isinstance(result, str)
    assert len(result) > 0


def test_judge_system_prompt_contains_key_phrases():
    """Test that judge_system_prompt contains expected content"""
    result = judge_system_prompt()
    assert "evaluator" in result.lower()
    assert "feedback" in result.lower()


def test_build_judge_prompt_valid_dimension():
    """Test build_judge_prompt with a valid dimension"""
    result = build_judge_prompt(
        dimension_name="accuracy",
        task="Test task",
        task_response="Test response",
    )
    assert isinstance(result, str)
    assert "accuracy" in result.lower()
    assert "Test task" in result
    assert "Test response" in result


def test_build_judge_prompt_invalid_dimension_raises_error():
    """Test that build_judge_prompt raises KeyError for an invalid dimension"""
    with pytest.raises(KeyError, match="Unknown evaluation dimension"):
        build_judge_prompt(
            dimension_name="invalid_dimension",
            task="Test task",
            task_response="Test response",
        )


def test_build_judge_prompt_includes_evaluation_focus():
    """Test that build_judge_prompt includes the evaluation focus from EVAL_DIMENSIONS"""
    result = build_judge_prompt(
        dimension_name="helpfulness",
        task="Test task",
        task_response="Test response",
    )
    # Should contain some content from EVAL_DIMENSIONS["helpfulness"]
    assert "helpfulness" in result.lower()


def test_aggregator_system_prompt_returns_string():
    """Test that aggregator_system_prompt returns a non-empty string"""
    result = aggregator_system_prompt()
    assert isinstance(result, str)
    assert len(result) > 0


def test_aggregator_system_prompt_contains_key_phrases():
    """Test that aggregator_system_prompt contains expected content"""
    result = aggregator_system_prompt()
    assert "synthesizing" in result.lower() or "synthesis" in result.lower()
    assert "report" in result.lower()


def test_build_aggregation_prompt_basic():
    """Test build_aggregation_prompt with basic input"""
    rationales = {
        "accuracy": "This response is accurate",
        "helpfulness": "This response is helpful",
    }
    result = build_aggregation_prompt(rationales)
    assert isinstance(result, str)
    assert "accuracy" in result.lower()
    assert "helpfulness" in result.lower()
    assert "This response is accurate" in result
    assert "This response is helpful" in result


def test_build_aggregation_prompt_empty_dict():
    """Test build_aggregation_prompt with an empty dictionary"""
    result = build_aggregation_prompt({})
    assert isinstance(result, str)
    assert len(result) > 0


def test_build_aggregation_prompt_single_dimension():
    """Test build_aggregation_prompt with a single dimension"""
    rationales = {"accuracy": "Accuracy analysis"}
    result = build_aggregation_prompt(rationales)
    assert "accuracy" in result.lower()
    assert "Accuracy analysis" in result


def test_build_aggregation_prompt_multiple_dimensions():
    """Test build_aggregation_prompt with multiple dimensions"""
    rationales = {
        "accuracy": "Accuracy text",
        "helpfulness": "Helpfulness text",
        "coherence": "Coherence text",
    }
    result = build_aggregation_prompt(rationales)
    for dim, text in rationales.items():
        assert dim.upper() in result
        assert text in result


def test_evaluation_error_can_be_raised():
    """Test that EvaluationError can be raised and caught"""
    with pytest.raises(EvaluationError, match="Test error"):
        raise EvaluationError("Test error")


def test_dimension_evaluation_error_can_be_raised():
    """Test that DimensionEvaluationError can be raised and caught"""
    with pytest.raises(DimensionEvaluationError, match="Dimension error"):
        raise DimensionEvaluationError("Dimension error")


def test_aggregation_error_can_be_raised():
    """Test that AggregationError can be raised and caught"""
    with pytest.raises(AggregationError, match="Aggregation error"):
        raise AggregationError("Aggregation error")


def test_judge_system_prompt_is_cacheable():
    """Test that judge_system_prompt returns the same string across repeated calls"""
    result1 = judge_system_prompt()
    result2 = judge_system_prompt()
    assert result1 == result2


def test_aggregator_system_prompt_is_cacheable():
    """Test that aggregator_system_prompt returns the same string across repeated calls"""
    result1 = aggregator_system_prompt()
    result2 = aggregator_system_prompt()
    assert result1 == result2