import pytest from swarms.chunkers.base import ( BaseChunker, TextArtifact, ChunkSeparator, OpenAiTokenizer, ) # adjust the import paths accordingly # 1. Test Initialization def test_chunker_initialization(): chunker = BaseChunker() assert isinstance(chunker, BaseChunker) assert chunker.max_tokens == chunker.tokenizer.max_tokens def test_default_separators(): chunker = BaseChunker() assert chunker.separators == BaseChunker.DEFAULT_SEPARATORS def test_default_tokenizer(): chunker = BaseChunker() assert isinstance(chunker.tokenizer, OpenAiTokenizer) # 2. Test Basic Chunking @pytest.mark.parametrize( "input_text, expected_output", [ ("This is a test.", [TextArtifact("This is a test.")]), ("Hello World!", [TextArtifact("Hello World!")]), # Add more simple cases ], ) def test_basic_chunk(input_text, expected_output): chunker = BaseChunker() result = chunker.chunk(input_text) assert result == expected_output # 3. Test Chunking with Different Separators def test_custom_separators(): custom_separator = ChunkSeparator(";") chunker = BaseChunker(separators=[custom_separator]) input_text = "Hello;World!" expected_output = [TextArtifact("Hello;"), TextArtifact("World!")] result = chunker.chunk(input_text) assert result == expected_output # 4. Test Recursive Chunking def test_recursive_chunking(): chunker = BaseChunker(max_tokens=5) input_text = "This is a more complex text." expected_output = [ TextArtifact("This"), TextArtifact("is a"), TextArtifact("more"), TextArtifact("complex"), TextArtifact("text."), ] result = chunker.chunk(input_text) assert result == expected_output # 5. Test Edge Cases and Special Scenarios def test_empty_text(): chunker = BaseChunker() result = chunker.chunk("") assert result == [] def test_whitespace_text(): chunker = BaseChunker() result = chunker.chunk(" ") assert result == [TextArtifact(" ")] def test_single_word(): chunker = BaseChunker() result = chunker.chunk("Hello") assert result == [TextArtifact("Hello")]