pull/430/head
Kye 10 months ago
parent f4c6692671
commit 997fd1e143

@ -1,5 +0,0 @@
from swarms.chunkers.base_chunker import BaseChunker
from swarms.chunkers.chunk_seperator import ChunkSeparator
from swarms.chunkers.text_chunker import TextChunker
__all__ = ["ChunkSeparator", "BaseChunker", "TextChunker"]

@ -1,163 +0,0 @@
from __future__ import annotations
from abc import ABC
from dataclasses import dataclass, field
from swarms.artifacts.text_artifact import TextArtifact
from swarms.chunkers.chunk_seperator import ChunkSeparator
from swarms.tokenizers.base_tokenizer import BaseTokenizer
from swarms.tokenizers.openai_tokenizers import OpenAITokenizer
@dataclass
class BaseChunker(ABC):
"""
Base class for chunking text into smaller chunks.
"""
DEFAULT_SEPARATORS = [ChunkSeparator(" ")]
separators: list[ChunkSeparator] = field(
default_factory=lambda: BaseChunker.DEFAULT_SEPARATORS
)
tokenizer: BaseTokenizer = field(
default_factory=lambda: OpenAITokenizer(
model=OpenAITokenizer.DEFAULT_OPENAI_GPT_3_CHAT_MODEL
)
)
max_tokens: int = field(
default_factory=lambda: BaseChunker.tokenizer.max_tokens
)
def chunk(self, text: str | str) -> list[str]:
"""
Chunk the given text into smaller chunks.
Args:
text (TextArtifact | str): The text to be chunked.
Returns:
list[TextArtifact]: The list of chunked text artifacts.
"""
text = text.value if isinstance(text, str) else text
return [
TextArtifact(c) for c in self._chunk_recursively(text)
]
def _chunk_recursively(
self,
chunk: str,
current_separator: ChunkSeparator | None = None,
) -> list[str]:
"""
Recursively chunk the given chunk into smaller subchunks.
Args:
chunk (str): The chunk to be recursively chunked.
current_separator (Optional[ChunkSeparator], optional): The current separator to be used. Defaults to None.
Returns:
list[str]: The list of recursively chunked subchunks.
"""
token_count = self.tokenizer.count_tokens(chunk)
if token_count <= self.max_tokens:
return [chunk]
else:
balance_index = -1
balance_diff = float("inf")
tokens_count = 0
half_token_count = token_count // 2
# If a separator is provided, only use separators after it.
if current_separator:
separators = self.separators[
self.separators.index(current_separator) :
]
else:
separators = self.separators
# Loop through available separators to find the best split.
for separator in separators:
# Split the chunk into subchunks using the current separator.
subchunks = list(
filter(None, chunk.split(separator.value))
)
# Check if the split resulted in more than one subchunk.
if len(subchunks) > 1:
# Iterate through the subchunks and calculate token counts.
for index, subchunk in enumerate(subchunks):
if index < len(subchunks):
if separator.is_prefix:
subchunk = separator.value + subchunk
else:
subchunk = subchunk + separator.value
tokens_count += self.tokenizer.count_tokens(
subchunk
)
# Update the best split if the current one is more balanced.
if (
abs(tokens_count - half_token_count)
< balance_diff
):
balance_index = index
balance_diff = abs(
tokens_count - half_token_count
)
# Create the two subchunks based on the best separator.
if separator.is_prefix:
# If the separator is a prefix, append it before this subchunk.
first_subchunk = (
separator.value
+ separator.value.join(
subchunks[: balance_index + 1]
)
)
second_subchunk = (
separator.value
+ separator.value.join(
subchunks[balance_index + 1 :]
)
)
else:
# If the separator is not a prefix, append it after this subchunk.
first_subchunk = (
separator.value.join(
subchunks[: balance_index + 1]
)
+ separator.value
)
second_subchunk = separator.value.join(
subchunks[balance_index + 1 :]
)
# Continue recursively chunking the subchunks.
first_subchunk_rec = self._chunk_recursively(
first_subchunk.strip(), separator
)
second_subchunk_rec = self._chunk_recursively(
second_subchunk.strip(), separator
)
# Return the concatenated results of the subchunks if both are non-empty.
if first_subchunk_rec and second_subchunk_rec:
return (
first_subchunk_rec + second_subchunk_rec
)
# If only one subchunk is non-empty, return it.
elif first_subchunk_rec:
return first_subchunk_rec
elif second_subchunk_rec:
return second_subchunk_rec
else:
return []
# If none of the separators result in a balanced split, split the chunk in half.
midpoint = len(chunk) // 2
return self._chunk_recursively(
chunk[:midpoint]
) + self._chunk_recursively(chunk[midpoint:])

@ -1,7 +0,0 @@
from dataclasses import dataclass
@dataclass
class ChunkSeparator:
value: str
is_prefix: bool = False

@ -1,13 +0,0 @@
from swarms.chunkers.base_chunker import BaseChunker
from swarms.chunkers.chunk_seperator import ChunkSeparator
class TextChunker(BaseChunker):
DEFAULT_SEPARATORS = [
ChunkSeparator("\n\n"),
ChunkSeparator("\n"),
ChunkSeparator(". "),
ChunkSeparator("! "),
ChunkSeparator("? "),
ChunkSeparator(" "),
]

@ -38,8 +38,6 @@ from swarms.models.popular_llms import (
ReplicateLLM as Replicate,
)
from swarms.models.qwen import QwenVLMultiModal # noqa: E402
# from swarms.models.roboflow_model import RoboflowMultiModal
from swarms.models.sam_supervision import SegmentAnythingMarkGenerator
from swarms.models.sampling_params import SamplingParams, SamplingType
from swarms.models.together import TogetherLLM # noqa: E402
@ -88,4 +86,5 @@ __all__ = [
"AudioModality",
"ImageModality",
"VideoModality",
"MosaicML",
]

@ -1,6 +1,7 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
from swarms.models.base_llm import AbstractLLM
class Petals(AbstractLLM):
"""Petals Bloom models."""

@ -1,6 +0,0 @@
from swarms.schedulers.agent_process import (
AgentProcess,
AgentProcessQueue,
)
__all__ = ["AgentProcess", "AgentProcessQueue"]

@ -75,6 +75,11 @@ from swarms.structs.utils import (
parse_tasks,
)
from swarms.structs.auto_swarm import AutoSwarm, AutoSwarmRouter
from swarms.structs.agent_process import (
AgentProcess,
AgentProcessQueue,
)
__all__ = [
"Agent",
@ -142,4 +147,6 @@ __all__ = [
"AgentJob",
"AutoSwarm",
"AutoSwarmRouter",
"AgentProcess",
"AgentProcessQueue",
]

Loading…
Cancel
Save