You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
swarms/tests/chunkers/basechunker.py

84 lines
2.2 KiB

1 year ago
import pytest
from swarms.chunkers.base import (
BaseChunker,
TextArtifact,
ChunkSeparator,
OpenAiTokenizer,
) # adjust the import paths accordingly
1 year ago
# 1. Test Initialization
def test_chunker_initialization():
    """Check a default BaseChunker's type and that its token limit matches its tokenizer's."""
    default_chunker = BaseChunker()
    assert isinstance(default_chunker, BaseChunker)

    # With no explicit max_tokens, the two limits are expected to agree.
    configured_limit = default_chunker.max_tokens
    tokenizer_limit = default_chunker.tokenizer.max_tokens
    assert configured_limit == tokenizer_limit
1 year ago
def test_default_separators():
    """Check that a chunker built with no arguments uses DEFAULT_SEPARATORS."""
    active_separators = BaseChunker().separators
    assert active_separators == BaseChunker.DEFAULT_SEPARATORS
1 year ago
def test_default_tokenizer():
    """Check that a chunker built with no arguments holds an OpenAiTokenizer."""
    default_tokenizer = BaseChunker().tokenizer
    assert isinstance(default_tokenizer, OpenAiTokenizer)
1 year ago
# 2. Test Basic Chunking
@pytest.mark.parametrize(
    "input_text, expected_output",
    # Each case pairs a short text with the single artifact wrapping that
    # same text. Add more simple cases to the tuple below.
    [
        (text, [TextArtifact(text)])
        for text in ("This is a test.", "Hello World!")
    ],
)
def test_basic_chunk(input_text, expected_output):
    """Check that chunking each parametrized text yields the expected artifact list."""
    produced = BaseChunker().chunk(input_text)
    assert produced == expected_output
1 year ago
# 3. Test Chunking with Different Separators
def test_custom_separators():
    """Check chunking of "Hello;World!" when ";" is the only configured separator."""
    semicolon = ChunkSeparator(";")
    splitter = BaseChunker(separators=[semicolon])
    text = "Hello;World!"

    # The expected first chunk still carries the ";" separator.
    wanted = [TextArtifact(piece) for piece in ("Hello;", "World!")]
    assert splitter.chunk(text) == wanted
1 year ago
# 4. Test Recursive Chunking
def test_recursive_chunking():
    """Check how a sentence is split when the chunker is limited to max_tokens=5."""
    tight_chunker = BaseChunker(max_tokens=5)
    sentence = "This is a more complex text."

    # Expected pieces, in order of appearance in the sentence.
    pieces = ("This", "is a", "more", "complex", "text.")
    wanted = [TextArtifact(piece) for piece in pieces]

    produced = tight_chunker.chunk(sentence)
    assert produced == wanted
1 year ago
# 5. Test Edge Cases and Special Scenarios
def test_empty_text():
    """Check that chunking the empty string returns an empty list."""
    produced = BaseChunker().chunk("")
    assert produced == []
1 year ago
def test_whitespace_text():
    """Check that whitespace-only text comes back as one artifact holding that text."""
    blank = " "
    produced = BaseChunker().chunk(blank)
    assert produced == [TextArtifact(" ")]
1 year ago
def test_single_word():
    """Check that a single word comes back as one artifact holding that word."""
    produced = BaseChunker().chunk("Hello")
    wanted = [TextArtifact("Hello")]
    assert produced == wanted