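"""Tests for the prompt-building helpers and error types in
``swarms.structs.council_as_judge``."""
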
import pytest

from swarms.structs.council_as_judge import (
    EvaluationError,
    DimensionEvaluationError,
    AggregationError,
    EVAL_DIMENSIONS,
    judge_system_prompt,
    build_judge_prompt,
    aggregator_system_prompt,
    build_aggregation_prompt,
)


def test_evaluation_error_is_exception():
    """Test that EvaluationError is an Exception subclass"""
    assert issubclass(EvaluationError, Exception)


def test_dimension_evaluation_error_is_evaluation_error():
    """Test that DimensionEvaluationError is an EvaluationError subclass"""
    assert issubclass(DimensionEvaluationError, EvaluationError)


def test_aggregation_error_is_evaluation_error():
    """Test that AggregationError is an EvaluationError subclass"""
    assert issubclass(AggregationError, EvaluationError)


def test_eval_dimensions_exists():
    """Test that the EVAL_DIMENSIONS dictionary exists and is non-empty"""
    assert isinstance(EVAL_DIMENSIONS, dict)
    assert len(EVAL_DIMENSIONS) > 0


def test_eval_dimensions_contains_expected_keys():
    """Test that EVAL_DIMENSIONS contains the expected evaluation dimensions"""
    expected_dimensions = [
        "accuracy",
        "helpfulness",
        "harmlessness",
        "coherence",
        "conciseness",
        "instruction_adherence",
    ]
    for dimension in expected_dimensions:
        assert dimension in EVAL_DIMENSIONS


def test_eval_dimensions_values_are_strings():
    """Test that all EVAL_DIMENSIONS values are non-empty strings"""
    for description in EVAL_DIMENSIONS.values():
        assert isinstance(description, str)
        assert len(description) > 0


def test_judge_system_prompt_returns_string():
    """Test that judge_system_prompt returns a non-empty string"""
    result = judge_system_prompt()
    assert isinstance(result, str)
    assert len(result) > 0


def test_judge_system_prompt_contains_key_phrases():
    """Test that judge_system_prompt contains expected content"""
    result = judge_system_prompt()
    assert "evaluator" in result.lower()
    assert "feedback" in result.lower()


def test_build_judge_prompt_valid_dimension():
    """Test build_judge_prompt with a valid dimension"""
    result = build_judge_prompt(
        dimension_name="accuracy",
        task="Test task",
        task_response="Test response",
    )
    assert isinstance(result, str)
    assert "accuracy" in result.lower()
    assert "Test task" in result
    assert "Test response" in result


def test_build_judge_prompt_invalid_dimension_raises_error():
    """Test that build_judge_prompt raises KeyError for an unknown dimension"""
    with pytest.raises(KeyError, match="Unknown evaluation dimension"):
        build_judge_prompt(
            dimension_name="invalid_dimension",
            task="Test task",
            task_response="Test response",
        )


def test_build_judge_prompt_includes_evaluation_focus():
    """Test that build_judge_prompt includes the evaluation focus from EVAL_DIMENSIONS"""
    result = build_judge_prompt(
        dimension_name="helpfulness",
        task="Test task",
        task_response="Test response",
    )
    # Should contain some content from EVAL_DIMENSIONS["helpfulness"]
    assert "helpfulness" in result.lower()


def test_aggregator_system_prompt_returns_string():
    """Test that aggregator_system_prompt returns a non-empty string"""
    result = aggregator_system_prompt()
    assert isinstance(result, str)
    assert len(result) > 0


def test_aggregator_system_prompt_contains_key_phrases():
    """Test that aggregator_system_prompt contains expected content"""
    result = aggregator_system_prompt()
    assert "synthesizing" in result.lower() or "synthesis" in result.lower()
    assert "report" in result.lower()


def test_build_aggregation_prompt_basic():
    """Test build_aggregation_prompt with basic input"""
    rationales = {
        "accuracy": "This response is accurate",
        "helpfulness": "This response is helpful",
    }
    result = build_aggregation_prompt(rationales)
    assert isinstance(result, str)
    assert "accuracy" in result.lower()
    assert "helpfulness" in result.lower()
    assert "This response is accurate" in result
    assert "This response is helpful" in result


def test_build_aggregation_prompt_empty_dict():
    """Test build_aggregation_prompt with an empty dictionary"""
    result = build_aggregation_prompt({})
    assert isinstance(result, str)
    assert len(result) > 0


def test_build_aggregation_prompt_single_dimension():
    """Test build_aggregation_prompt with a single dimension"""
    rationales = {"accuracy": "Accuracy analysis"}
    result = build_aggregation_prompt(rationales)
    assert "accuracy" in result.lower()
    assert "Accuracy analysis" in result


def test_build_aggregation_prompt_multiple_dimensions():
    """Test build_aggregation_prompt with multiple dimensions"""
    rationales = {
        "accuracy": "Accuracy text",
        "helpfulness": "Helpfulness text",
        "coherence": "Coherence text",
    }
    result = build_aggregation_prompt(rationales)
    # Dimension names are expected to appear upper-cased in the prompt.
    for dim, text in rationales.items():
        assert dim.upper() in result
        assert text in result


def test_evaluation_error_can_be_raised():
    """Test that EvaluationError can be raised and matched"""
    with pytest.raises(EvaluationError, match="Test error"):
        raise EvaluationError("Test error")


def test_dimension_evaluation_error_can_be_raised():
    """Test that DimensionEvaluationError can be raised and matched"""
    with pytest.raises(DimensionEvaluationError, match="Dimension error"):
        raise DimensionEvaluationError("Dimension error")


def test_aggregation_error_can_be_raised():
    """Test that AggregationError can be raised and matched"""
    with pytest.raises(AggregationError, match="Aggregation error"):
        raise AggregationError("Aggregation error")


def test_judge_system_prompt_is_cacheable():
    """Test that judge_system_prompt returns identical output across repeated calls"""
    result1 = judge_system_prompt()
    result2 = judge_system_prompt()
    assert result1 == result2


def test_aggregator_system_prompt_is_cacheable():
    """Test that aggregator_system_prompt returns identical output across repeated calls"""
    result1 = aggregator_system_prompt()
    result2 = aggregator_system_prompt()
    assert result1 == result2
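

# A possible extension, sketched rather than asserted as a guarantee:
# parametrizing over EVAL_DIMENSIONS exercises build_judge_prompt for every
# documented dimension. This assumes each key in EVAL_DIMENSIONS is a valid
# dimension_name; the tests above only confirm "accuracy" and "helpfulness"
# directly. The test name below is hypothetical.
@pytest.mark.parametrize("dimension", sorted(EVAL_DIMENSIONS))
def test_build_judge_prompt_every_dimension(dimension):
    """Sketch: build_judge_prompt should accept every key in EVAL_DIMENSIONS."""
    result = build_judge_prompt(
        dimension_name=dimension,
        task="Test task",
        task_response="Test response",
    )
    assert isinstance(result, str)
    assert "Test task" in result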