# swarms/swarms/chunkers/omni_chunker.py

"""
OmniChunker reads a file, decodes its bytes as UTF-8 text, and splits the text
into consecutive string chunks of at most `chunk_size` characters.

Usage:
--------------
from swarms.chunkers.omni_chunker import OmniChunker

# Example (content is decoded as UTF-8; a file that is not valid UTF-8 yields no chunks)
pdf = "swarmdeck.pdf"
chunker = OmniChunker(chunk_size=1000, beautify=True)
chunks = chunker(pdf)
print(chunks)


"""
from dataclasses import dataclass
from typing import List, Optional, Callable
from termcolor import colored
import os


@dataclass
class OmniChunker:
    """
    Split a file's text into fixed-size string chunks.

    The file is read as raw bytes, decoded as UTF-8, and sliced into
    consecutive pieces of at most `chunk_size` characters. Failures are
    reported on stdout and result in an empty list rather than an exception
    when the chunker is invoked through `__call__`.

    Attributes:
        chunk_size (int): Maximum number of characters per chunk. Must be
            positive; `chunk_content` raises `ValueError` otherwise.
        beautify (bool): Stored and reported by `__str__` and `metrics`;
            not consulted by the chunking logic in this class.
        use_tokenizer (bool): Stored only; not consulted by the chunking
            logic in this class.
        tokenizer (Optional[Callable[[str], List[str]]]): Stored only; not
            consulted by the chunking logic in this class.
    """

    chunk_size: int = 1000
    beautify: bool = False
    use_tokenizer: bool = False
    tokenizer: Optional[Callable[[str], List[str]]] = None

    def __call__(self, file_path: str) -> List[str]:
        """
        Chunk the given file into parts of size `chunk_size`.

        Args:
            file_path (str): The path to the file to chunk.

        Returns:
            List[str]: A list of string chunks from the file. Empty when the
            file does not exist, cannot be read, cannot be decoded, is empty,
            or cannot be chunked (for example, a non-positive `chunk_size`).
        """
        if not os.path.isfile(file_path):
            print(colored("The file does not exist.", "red"))
            return []

        file_extension = os.path.splitext(file_path)[1]

        # Read first and release the handle before any processing happens.
        try:
            with open(file_path, "rb") as file:
                content = file.read()
        except OSError as e:
            print(colored(f"Error reading file: {e}", "red"))
            return []

        # Best-effort boundary: any decode/chunk failure is reported and
        # turned into an empty result, but is no longer mislabelled as a
        # read error.
        try:
            decoded_content = self.decode_content(content, file_extension)
            return self.chunk_content(decoded_content)
        except Exception as e:
            print(colored(f"Error processing file: {e}", "red"))
            return []

    def decode_content(self, content: bytes, file_extension: str) -> str:
        """
        Decode the raw bytes of a file as UTF-8 text.

        Args:
            content (bytes): The content of the file.
            file_extension (str): The file extension of the file. Only used
                in the warning printed when decoding fails.

        Returns:
            str: The decoded content of the file, without a leading byte
            order mark, or an empty string if the bytes are not valid UTF-8.
        """
        # No per-extension handling exists yet: every file is treated as
        # UTF-8 text. "utf-8-sig" behaves like "utf-8" but drops a leading
        # BOM so it does not leak into the first chunk.
        try:
            return content.decode("utf-8-sig")
        except UnicodeDecodeError as e:
            print(
                colored(
                    f"Could not decode file with extension {file_extension}: {e}",
                    "yellow",
                )
            )
            return ""

    def chunk_content(self, content: str) -> List[str]:
        """
        Split the content into chunks of size `chunk_size`.

        Args:
            content (str): The content to chunk.

        Returns:
            List[str]: The list of chunks, in order. Every chunk has exactly
            `chunk_size` characters except possibly the last, which may be
            shorter. Empty content yields an empty list.

        Raises:
            ValueError: If `chunk_size` is not positive.
        """
        size = self.chunk_size
        # A negative step would make range() empty and silently drop all of
        # the content; a zero step fails with an unrelated-looking message.
        if size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {size!r}")
        return [
            content[start : start + size]
            for start in range(0, len(content), size)
        ]

    def __str__(self) -> str:
        """Return a short description showing `chunk_size` and `beautify`."""
        return f"OmniChunker(chunk_size={self.chunk_size}, beautify={self.beautify})"

    def metrics(self) -> dict:
        """
        Report the chunker's configuration.

        Returns:
            dict: A mapping with the keys "chunk_size" and "beautify".
        """
        return {
            "chunk_size": self.chunk_size,
            "beautify": self.beautify,
        }

    def print_dashboard(self) -> None:
        """Print a cyan-coloured banner containing the output of `metrics`."""
        print(
            colored(
                f"""
            Omni Chunker
            ------------
            {self.metrics()}
            """,
                "cyan",
            )
        )