From 997fd1e1430429127fd977f9628d006e47f329a6 Mon Sep 17 00:00:00 2001 From: Kye Date: Tue, 26 Mar 2024 10:14:00 -0700 Subject: [PATCH] [CLEANUP] --- swarms/chunkers/__init__.py | 5 - swarms/chunkers/base_chunker.py | 163 ------------------ swarms/chunkers/chunk_seperator.py | 7 - swarms/chunkers/text_chunker.py | 13 -- swarms/memory/__init__.py | 2 +- swarms/models/__init__.py | 5 +- swarms/models/petals.py | 1 + swarms/schedulers/__init__.py | 6 - swarms/structs/__init__.py | 7 + .../{schedulers => structs}/agent_process.py | 0 10 files changed, 11 insertions(+), 198 deletions(-) delete mode 100644 swarms/chunkers/__init__.py delete mode 100644 swarms/chunkers/base_chunker.py delete mode 100644 swarms/chunkers/chunk_seperator.py delete mode 100644 swarms/chunkers/text_chunker.py delete mode 100644 swarms/schedulers/__init__.py rename swarms/{schedulers => structs}/agent_process.py (100%) diff --git a/swarms/chunkers/__init__.py b/swarms/chunkers/__init__.py deleted file mode 100644 index b55d15c2..00000000 --- a/swarms/chunkers/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from swarms.chunkers.base_chunker import BaseChunker -from swarms.chunkers.chunk_seperator import ChunkSeparator -from swarms.chunkers.text_chunker import TextChunker - -__all__ = ["ChunkSeparator", "BaseChunker", "TextChunker"] diff --git a/swarms/chunkers/base_chunker.py b/swarms/chunkers/base_chunker.py deleted file mode 100644 index 47f73a4e..00000000 --- a/swarms/chunkers/base_chunker.py +++ /dev/null @@ -1,163 +0,0 @@ -from __future__ import annotations - -from abc import ABC -from dataclasses import dataclass, field - -from swarms.artifacts.text_artifact import TextArtifact -from swarms.chunkers.chunk_seperator import ChunkSeparator -from swarms.tokenizers.base_tokenizer import BaseTokenizer -from swarms.tokenizers.openai_tokenizers import OpenAITokenizer - - -@dataclass -class BaseChunker(ABC): - """ - Base class for chunking text into smaller chunks. - """ - - DEFAULT_SEPARATORS = [ChunkSeparator(" ")] - - separators: list[ChunkSeparator] = field( - default_factory=lambda: BaseChunker.DEFAULT_SEPARATORS - ) - tokenizer: BaseTokenizer = field( - default_factory=lambda: OpenAITokenizer( - model=OpenAITokenizer.DEFAULT_OPENAI_GPT_3_CHAT_MODEL - ) - ) - max_tokens: int = field( - default_factory=lambda: BaseChunker.tokenizer.max_tokens - ) - - def chunk(self, text: str | str) -> list[str]: - """ - Chunk the given text into smaller chunks. - - Args: - text (TextArtifact | str): The text to be chunked. - - Returns: - list[TextArtifact]: The list of chunked text artifacts. - """ - text = text.value if isinstance(text, str) else text - - return [ - TextArtifact(c) for c in self._chunk_recursively(text) - ] - - def _chunk_recursively( - self, - chunk: str, - current_separator: ChunkSeparator | None = None, - ) -> list[str]: - """ - Recursively chunk the given chunk into smaller subchunks. - - Args: - chunk (str): The chunk to be recursively chunked. - current_separator (Optional[ChunkSeparator], optional): The current separator to be used. Defaults to None. - - Returns: - list[str]: The list of recursively chunked subchunks. - """ - token_count = self.tokenizer.count_tokens(chunk) - - if token_count <= self.max_tokens: - return [chunk] - else: - balance_index = -1 - balance_diff = float("inf") - tokens_count = 0 - half_token_count = token_count // 2 - - # If a separator is provided, only use separators after it. - if current_separator: - separators = self.separators[ - self.separators.index(current_separator) : - ] - else: - separators = self.separators - - # Loop through available separators to find the best split. - for separator in separators: - # Split the chunk into subchunks using the current separator. - subchunks = list( - filter(None, chunk.split(separator.value)) - ) - - # Check if the split resulted in more than one subchunk. - if len(subchunks) > 1: - # Iterate through the subchunks and calculate token counts. - for index, subchunk in enumerate(subchunks): - if index < len(subchunks): - if separator.is_prefix: - subchunk = separator.value + subchunk - else: - subchunk = subchunk + separator.value - - tokens_count += self.tokenizer.count_tokens( - subchunk - ) - - # Update the best split if the current one is more balanced. - if ( - abs(tokens_count - half_token_count) - < balance_diff - ): - balance_index = index - balance_diff = abs( - tokens_count - half_token_count - ) - - # Create the two subchunks based on the best separator. - if separator.is_prefix: - # If the separator is a prefix, append it before this subchunk. - first_subchunk = ( - separator.value - + separator.value.join( - subchunks[: balance_index + 1] - ) - ) - second_subchunk = ( - separator.value - + separator.value.join( - subchunks[balance_index + 1 :] - ) - ) - else: - # If the separator is not a prefix, append it after this subchunk. - first_subchunk = ( - separator.value.join( - subchunks[: balance_index + 1] - ) - + separator.value - ) - second_subchunk = separator.value.join( - subchunks[balance_index + 1 :] - ) - - # Continue recursively chunking the subchunks. - first_subchunk_rec = self._chunk_recursively( - first_subchunk.strip(), separator - ) - second_subchunk_rec = self._chunk_recursively( - second_subchunk.strip(), separator - ) - - # Return the concatenated results of the subchunks if both are non-empty. - if first_subchunk_rec and second_subchunk_rec: - return ( - first_subchunk_rec + second_subchunk_rec - ) - # If only one subchunk is non-empty, return it. - elif first_subchunk_rec: - return first_subchunk_rec - elif second_subchunk_rec: - return second_subchunk_rec - else: - return [] - # If none of the separators result in a balanced split, split the chunk in half. - midpoint = len(chunk) // 2 - return self._chunk_recursively( - chunk[:midpoint] - ) + self._chunk_recursively(chunk[midpoint:]) diff --git a/swarms/chunkers/chunk_seperator.py b/swarms/chunkers/chunk_seperator.py deleted file mode 100644 index d554be48..00000000 --- a/swarms/chunkers/chunk_seperator.py +++ /dev/null @@ -1,7 +0,0 @@ -from dataclasses import dataclass - - -@dataclass -class ChunkSeparator: - value: str - is_prefix: bool = False diff --git a/swarms/chunkers/text_chunker.py b/swarms/chunkers/text_chunker.py deleted file mode 100644 index b8b17bf1..00000000 --- a/swarms/chunkers/text_chunker.py +++ /dev/null @@ -1,13 +0,0 @@ -from swarms.chunkers.base_chunker import BaseChunker -from swarms.chunkers.chunk_seperator import ChunkSeparator - - -class TextChunker(BaseChunker): - DEFAULT_SEPARATORS = [ - ChunkSeparator("\n\n"), - ChunkSeparator("\n"), - ChunkSeparator(". "), - ChunkSeparator("! "), - ChunkSeparator("? "), - ChunkSeparator(" "), - ] diff --git a/swarms/memory/__init__.py b/swarms/memory/__init__.py index b92e35e1..7b56e444 100644 --- a/swarms/memory/__init__.py +++ b/swarms/memory/__init__.py @@ -14,4 +14,4 @@ __all__ = [ "DictSharedMemory", "ShortTermMemory", "VisualShortTermMemory", -] \ No newline at end of file +] diff --git a/swarms/models/__init__.py b/swarms/models/__init__.py index f26a2eeb..18f25b53 100644 --- a/swarms/models/__init__.py +++ b/swarms/models/__init__.py @@ -38,8 +38,6 @@ from swarms.models.popular_llms import ( ReplicateLLM as Replicate, ) from swarms.models.qwen import QwenVLMultiModal # noqa: E402 - -# from swarms.models.roboflow_model import RoboflowMultiModal from swarms.models.sam_supervision import SegmentAnythingMarkGenerator from swarms.models.sampling_params import SamplingParams, SamplingType from swarms.models.together import TogetherLLM # noqa: E402 @@ -88,4 +86,5 @@ __all__ = [ "AudioModality", "ImageModality", "VideoModality", -] \ No newline at end of file + "MosaicML", +] diff --git a/swarms/models/petals.py b/swarms/models/petals.py index 699d7d9d..7a7823f2 100644 --- a/swarms/models/petals.py +++ b/swarms/models/petals.py @@ -1,6 +1,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from swarms.models.base_llm import AbstractLLM + class Petals(AbstractLLM): """Petals Bloom models.""" diff --git a/swarms/schedulers/__init__.py b/swarms/schedulers/__init__.py deleted file mode 100644 index 803b2278..00000000 --- a/swarms/schedulers/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from swarms.schedulers.agent_process import ( - AgentProcess, - AgentProcessQueue, -) - -__all__ = ["AgentProcess", "AgentProcessQueue"] diff --git a/swarms/structs/__init__.py b/swarms/structs/__init__.py index f1090b06..18999e9f 100644 --- a/swarms/structs/__init__.py +++ b/swarms/structs/__init__.py @@ -75,6 +75,11 @@ from swarms.structs.utils import ( parse_tasks, ) from swarms.structs.auto_swarm import AutoSwarm, AutoSwarmRouter +from swarms.structs.agent_process import ( + AgentProcess, + AgentProcessQueue, +) + __all__ = [ "Agent", @@ -142,4 +147,6 @@ __all__ = [ "AgentJob", "AutoSwarm", "AutoSwarmRouter", + "AgentProcess", + "AgentProcessQueue", ] diff --git a/swarms/schedulers/agent_process.py b/swarms/structs/agent_process.py similarity index 100% rename from swarms/schedulers/agent_process.py rename to swarms/structs/agent_process.py