pull/378/head
Kye 11 months ago
parent c21af37f64
commit e8681b223c

@ -3,7 +3,7 @@ import os
from dotenv import load_dotenv
# Import the OpenAIChat model and the Agent struct
from swarms import OpenAIChat, Agent
from swarms import Agent, OpenAIChat
# Load the environment variables
load_dotenv()
@ -22,7 +22,7 @@ llm = OpenAIChat(
## Initialize the workflow
agent = Agent(
llm=llm,
max_loops=1,
max_loops=4,
autosave=True,
dashboard=True,
)

@ -0,0 +1,33 @@
import os
from dotenv import load_dotenv
# Import the HuggingfaceLLM model and the Agent struct
from swarms import Agent, HuggingfaceLLM
# Load the environment variables
load_dotenv()
# Get the API key from the environment (not used by HuggingfaceLLM, which runs locally)
api_key = os.environ.get("OPENAI_API_KEY")
# Initialize the language model
llm = HuggingfaceLLM(
model_id="codellama/CodeLlama-70b-hf",
max_length=4000,
quantize=True,
temperature=0.5,
)
## Initialize the workflow
agent = Agent(
llm=llm,
max_loops="auto",
system_prompt=None,
autosave=True,
dashboard=True,
tools=[],
)
# Run the workflow on a task
agent.run("Generate a 10,000 word blog on health and wellness.")

@ -0,0 +1,13 @@
from swarms import RoboflowMultiModal
# Initialize the model
model = RoboflowMultiModal(
api_key="api",
project_id="your project id",
hosted=False,
)
# Run the model on an image
out = model("img.png")
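If the underlying roboflow prediction object is returned unchanged, its usual accessors can inspect and save the result. A minimal sketch under that assumption:
# Sketch: assumes the roboflow prediction object is passed through unchanged
if out is not None:
    print(out.json())  # raw predictions as a dict (assumed accessor)
    out.save("annotated.jpg")  # save an annotated copy (assumed accessor)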

@ -1,7 +0,0 @@
from swarms import AutoScaler
auto_scaler = AutoScaler()
auto_scaler.start()
for i in range(100):
auto_scaler.add_task(f"Task {i}")

@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "swarms"
version = "3.9.4"
version = "3.9.5"
description = "Swarms - Pytorch"
license = "MIT"
authors = ["Kye Gomez <kye@apac.ai>"]
@ -70,6 +70,7 @@ timm = "*"
supervision = "*"
scikit-image = "*"
pinecone-client = "*"
roboflow = "*"

@ -1,4 +1,4 @@
gitorch==2.1.1
torch==2.1.1
transformers
pandas==1.5.3
langchain==0.0.333
@ -51,3 +51,4 @@ ultralytics
supervision
scikit-image
pinecone-client
roboflow

@ -0,0 +1,13 @@
#!/bin/bash
# Change to the root directory
cd /
# Iterate over all the .py files in the directory
for file in *.py; do
# Get the base name of the file without the .py
base_name=$(basename "$file" .py)
# Rename the file to remove .py from the end
mv "$file" "${base_name}"
done

@ -50,6 +50,8 @@ from swarms.models.qwen import QwenVLMultiModal # noqa: E402
from swarms.models.clipq import CLIPQ # noqa: E402
from swarms.models.kosmos_two import Kosmos # noqa: E402
from swarms.models.fuyu import Fuyu # noqa: E402
from swarms.models.roboflow_model import RoboflowMultiModal
from swarms.models.sam_supervision import SegmentAnythingMarkGenerator
# from swarms.models.dalle3 import Dalle3
# from swarms.models.distilled_whisperx import DistilWhisperModel # noqa: E402
@ -118,4 +120,6 @@ __all__ = [
"Kosmos",
"Fuyu",
"BaseEmbeddingModel",
"RoboflowMultiModal",
"SegmentAnythingMarkGenerator",
]

@ -0,0 +1,81 @@
import cv2
from swarms.models.base_multimodal_model import BaseMultiModalModel
from swarms.models.sam_supervision import SegmentAnythingMarkGenerator
from swarms.utils.supervision_masking import refine_marks
from swarms.utils.supervision_visualizer import MarkVisualizer
from typing import Any
class GPT4VSAM(BaseMultiModalModel):
"""
GPT4VSAM class represents a multi-modal model that combines the capabilities of GPT-4 and SegmentAnythingMarkGenerator.
It takes an instance of BaseMultiModalModel (vlm) and a device as input and provides methods for loading images and making predictions.
Args:
vlm (BaseMultiModalModel): An instance of BaseMultiModalModel representing the visual language model.
device (str, optional): The device to be used for computation. Defaults to "cuda".
Attributes:
vlm (BaseMultiModalModel): An instance of BaseMultiModalModel representing the visual language model.
device (str): The device to be used for computation.
sam (SegmentAnythingMarkGenerator): An instance of SegmentAnythingMarkGenerator for generating marks.
visualizer (MarkVisualizer): An instance of MarkVisualizer for visualizing marks.
Methods:
load_img(img: str) -> Any: Loads an image from the given file path.
__call__(task: str, img: str, *args, **kwargs) -> Any: Makes predictions using the visual language model.
"""
def __init__(
self,
vlm: BaseMultiModalModel,
device: str = "cuda",
return_related_marks: bool = False,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
self.vlm = vlm
self.device = device
self.return_related_marks = return_related_marks
self.sam = SegmentAnythingMarkGenerator(
device, *args, **kwargs
)
self.visualizer = MarkVisualizer(*args, **kwargs)
def load_img(self, img: str) -> Any:
"""
Loads an image from the given file path.
Args:
img (str): The file path of the image.
Returns:
Any: The loaded image.
"""
return cv2.imread(img)
def __call__(self, task: str, img: str, *args, **kwargs) -> Any:
"""
Makes predictions using the visual language model.
Args:
task (str): The task for which predictions are to be made.
img (str): The file path of the image.
*args: Additional positional arguments.
**kwargs: Additional keyword arguments.
Returns:
Any: The predictions made by the visual language model.
"""
img = self.load_img(img)
marks = self.sam(image=img)
marks = refine_marks(marks=marks)
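# Note: the refined marks are computed here but not yet forwarded to the VLM below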
return self.vlm(task, img, *args, **kwargs)
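A minimal usage sketch for GPT4VSAM. GPT4VisionAPI is one plausible BaseMultiModalModel to plug in; the choice of VLM and its constructor arguments are assumptions, not part of this commit:
from swarms.models.gpt4_vision_api import GPT4VisionAPI  # assumed import path

vlm = GPT4VisionAPI()  # assumed to read OPENAI_API_KEY from the environment
model = GPT4VSAM(vlm=vlm, device="cuda")
print(model("Describe the segmented regions.", "img.png"))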

@ -1,6 +1,6 @@
import os
import supervision as sv
from ultralytics import YOLO
from ultralytics_example import YOLO
from tqdm import tqdm
from swarms.models.base_llm import AbstractLLM
from swarms.utils.download_weights_from_url import (

@ -0,0 +1,64 @@
from typing import Union
from roboflow import Roboflow
from swarms.models.base_multimodal_model import BaseMultiModalModel
class RoboflowMultiModal(BaseMultiModalModel):
"""
Initializes the RoboflowMultiModal model with the given API key, project ID, and version.
Args:
api_key (str): The API key for Roboflow.
project_id (str): The ID of the project.
version (str): The version of the model.
confidence (int, optional): The confidence threshold. Defaults to 50.
overlap (int, optional): The overlap threshold. Defaults to 25.
"""
def __init__(
self,
api_key: str,
project_id: str,
version: str,
confidence: int = 50,
overlap: int = 25,
hosted: bool = False,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
self.api_key = api_key
self.project_id = project_id
self.version = version
self.confidence = confidence
self.overlap = overlap
self.hosted = hosted
try:
rf = Roboflow(api_key=api_key, *args, **kwargs)
project = rf.workspace().project(project_id)
self.model = project.version(version).model
self.model.confidence = confidence
self.model.overlap = overlap
except Exception as e:
print(f"Error initializing RoboflowModel: {str(e)}")
def __call__(self, img: Union[str, bytes]):
"""
Runs inference on an image and retrieves predictions.
Args:
img (Union[str, bytes]): The path to the image or the URL of the image.
Returns:
Optional[roboflow.Prediction]: The prediction or None if an error occurs.
"""
try:
prediction = self.model.predict(img, hosted=self.hosted)
return prediction
except Exception as e:
print(f"Error running inference: {str(e)}")
return None

@ -0,0 +1,116 @@
import cv2
import numpy as np
import supervision as sv
from PIL import Image
from transformers import (
pipeline,
SamModel,
SamProcessor,
SamImageProcessor,
)
from typing import Optional
from swarms.utils.supervision_masking import masks_to_marks
from swarms.models.base_multimodal_model import BaseMultiModalModel
class SegmentAnythingMarkGenerator(BaseMultiModalModel):
"""
A class for performing image segmentation using a specified model.
Parameters:
device (str): The device to run the model on (e.g., 'cpu', 'cuda').
model_name (str): The name of the model to be loaded. Defaults to
'facebook/sam-vit-huge'.
"""
def __init__(
self,
device: str = "cpu",
model_name: str = "facebook/sam-vit-huge",
visualize_marks: bool = False,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
self.device = device
self.model_name = model_name
self.visualize_marks = visualize_marks
self.model = SamModel.from_pretrained(
model_name, *args, **kwargs
).to(device)
self.processor = SamProcessor.from_pretrained(model_name)
self.image_processor = SamImageProcessor.from_pretrained(
model_name
)
self.pipeline = pipeline(
task="mask-generation",
model=self.model,
image_processor=self.image_processor,
device=self.device,
)
def __call__(
self, image: np.ndarray, mask: Optional[np.ndarray] = None
) -> sv.Detections:
"""
Generate image segmentation marks.
Parameters:
image (np.ndarray): The image to be marked in BGR format.
mask: (Optional[np.ndarray]): The mask to be used as a guide for
segmentation.
Returns:
sv.Detections: An object containing the segmentation masks and their
corresponding bounding box coordinates.
"""
image = Image.fromarray(
cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
)
if mask is None:
outputs = self.pipeline(image, points_per_batch=64)
masks = np.array(outputs["masks"])
return masks_to_marks(masks=masks)
else:
inputs = self.processor(image, return_tensors="pt").to(
self.device
)
image_embeddings = self.model.get_image_embeddings(
inputs.pixel_values
)
masks = []
for polygon in sv.mask_to_polygons(mask.astype(bool)):
indexes = np.random.choice(
a=polygon.shape[0], size=5, replace=True
)
input_points = polygon[indexes]
inputs = self.processor(
images=image,
input_points=[[input_points]],
return_tensors="pt",
).to(self.device)
del inputs["pixel_values"]
outputs = self.model(
image_embeddings=image_embeddings, **inputs
)
mask = (
self.processor.image_processor.post_process_masks(
masks=outputs.pred_masks.cpu().detach(),
original_sizes=inputs["original_sizes"]
.cpu()
.detach(),
reshaped_input_sizes=inputs[
"reshaped_input_sizes"
]
.cpu()
.detach(),
)[0][0][0].numpy()
)
masks.append(mask)
masks = np.array(masks)
return masks_to_marks(masks=masks)
# def visualize_img(self):
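A short end-to-end sketch, assuming a local scene.png; the image is loaded in BGR to match the docstring above:
import cv2

generator = SegmentAnythingMarkGenerator(device="cpu")
image = cv2.imread("scene.png")  # BGR, as the docstring expects
marks = generator(image=image)  # sv.Detections with masks and boxes
print(f"{len(marks)} masks generated")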

@ -1,5 +1,6 @@
from swarms.models.base_multimodal_model import BaseMultiModalModel
from ultralytics import YOLO
from typing import List
class UltralyticsModel(BaseMultiModalModel):
@ -12,13 +13,22 @@ class UltralyticsModel(BaseMultiModalModel):
**kwargs: Arbitrary keyword arguments.
"""
def __init__(self, model_name: str, *args, **kwargs):
def __init__(
self, model_name: str = "yolov8n.pt", *args, **kwargs
):
super().__init__(*args, **kwargs)
self.model_name = model_name
self.model = YOLO(model_name, *args, **kwargs)
try:
self.model = YOLO(model_name, *args, **kwargs)
except Exception as e:
raise ValueError(
f"Failed to initialize Ultralytics model: {str(e)}"
)
def __call__(self, task: str, *args, **kwargs):
def __call__(
self, task: str, tasks: List[str] = None, *args, **kwargs
):
"""
Calls the Ultralytics model.
@ -30,4 +40,13 @@ class UltralyticsModel(BaseMultiModalModel):
Returns:
The result of the model call.
"""
return self.model(task, *args, **kwargs)
try:
if tasks:
return self.model(tasks, *args, **kwargs)
else:
return self.model(task, *args, **kwargs)
except Exception as e:
raise ValueError(
f"Failed to perform task '{task}' with Ultralytics"
f" model: {str(e)}"
)
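A usage sketch covering both call paths, assuming the image files exist locally; the default weights are fetched by ultralytics on first use:
model = UltralyticsModel()  # defaults to yolov8n.pt
single = model("img.png")  # one source
batch = model(None, tasks=["img1.png", "img2.png"])  # list of sources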

@ -16,6 +16,7 @@ from swarms.prompts.agent_system_prompts import (
from swarms.prompts.multi_modal_autonomous_instruction_prompt import (
MULTI_MODAL_AUTO_AGENT_SYSTEM_PROMPT_1,
)
from swarms.tokenizers.base_tokenizer import BaseTokenizer
from swarms.tools.tool import BaseTool
from swarms.utils.code_interpreter import SubprocessCodeInterpreter
from swarms.utils.data_to_text import data_to_text
@ -151,7 +152,7 @@ class Agent:
dynamic_loops: Optional[bool] = False,
interactive: bool = False,
dashboard: bool = False,
agent_name: str = None,
agent_name: str = "swarm-worker-01",
agent_description: str = None,
system_prompt: str = AGENT_SYSTEM_PROMPT_3,
tools: List[BaseTool] = None,
@ -167,7 +168,7 @@ class Agent:
multi_modal: Optional[bool] = None,
pdf_path: Optional[str] = None,
list_of_pdf: Optional[str] = None,
tokenizer: Optional[Any] = None,
tokenizer: Optional[BaseTokenizer] = None,
long_term_memory: Optional[AbstractVectorDatabase] = None,
preset_stopping_token: Optional[bool] = False,
traceback: Any = None,
@ -187,7 +188,7 @@ class Agent:
self.retry_attempts = retry_attempts
self.retry_interval = retry_interval
self.task = None
self.stopping_token = stopping_token # or "<DONE>"
self.stopping_token = stopping_token
self.interactive = interactive
self.dashboard = dashboard
self.return_history = return_history
@ -248,9 +249,14 @@ class Agent:
if self.docs:
self.ingest_docs(self.docs)
# If docs folder exists then get the docs from docs folder
if self.docs_folder:
self.get_docs_from_doc_folders()
# If tokenizer and context length exists then:
if self.tokenizer and self.context_length:
self.truncate_history()
def set_system_prompt(self, system_prompt: str):
"""Set the system prompt"""
self.system_prompt = system_prompt
@ -299,22 +305,6 @@ class Agent:
"""Format the template with the provided kwargs using f-string interpolation."""
return template.format(**kwargs)
def truncate_history(self):
"""
Take the history and truncate it to fit into the model context length
"""
# truncated_history = self.short_memory[-1][-self.context_length :]
# self.short_memory[-1] = truncated_history
# out = limit_tokens_from_string(
# "\n".join(truncated_history), self.llm.model_name
# )
truncated_history = self.short_memory[-1][
-self.context_length :
]
text = "\n".join(truncated_history)
out = limit_tokens_from_string(text, "gpt-4")
return out
def add_task_to_memory(self, task: str):
"""Add the task to the memory"""
try:
@ -1155,6 +1145,27 @@ class Agent:
message = f"{agent_name}: {message}"
return self.run(message, *args, **kwargs)
def truncate_history(self):
"""
Truncates the short-term memory of the agent based on the count of tokens.
The method counts the tokens in the short-term memory using the tokenizer and
compares it with the length of the memory. If the length of the memory is greater
than the count, the memory is truncated to match the count.
Parameters:
None
Returns:
None
"""
# Count the short term history with the tokenizer
count = self.tokenizer.count_tokens(self.short_memory)
# Now the logic that truncates the memory if it's more than the count
if len(self.short_memory) > count:
self.short_memory = self.short_memory[:count]
def get_docs_from_doc_folders(self):
"""Get the docs from the files"""
# Get the list of files then extract them and add them to the memory

@ -5,6 +5,7 @@ from termcolor import colored
from swarms.memory.base_db import AbstractDatabase
from swarms.structs.base import BaseStructure
from swarms.tokenizers.base_tokenizer import BaseTokenizer
class Conversation(BaseStructure):
@ -65,6 +66,8 @@ class Conversation(BaseStructure):
database: AbstractDatabase = None,
autosave: bool = False,
save_filepath: str = None,
tokenizer: BaseTokenizer = None,
context_length: int = 8192,
*args,
**kwargs,
):
@ -75,11 +78,17 @@ class Conversation(BaseStructure):
self.autosave = autosave
self.save_filepath = save_filepath
self.conversation_history = []
self.tokenizer = tokenizer
self.context_length = context_length
# If system prompt is not None, add it to the conversation history
if self.system_prompt:
self.add("system", self.system_prompt)
# If tokenizer then truncate
if tokenizer:
self.truncate_memory_with_tokenizer()
def add(self, role: str, content: str, *args, **kwargs):
"""Add a message to the conversation history
@ -348,3 +357,40 @@ class Conversation(BaseStructure):
def fetch_one_from_database(self, *args, **kwargs):
"""Fetch one from the database"""
return self.database.fetch_one()
def truncate_memory_with_tokenizer(self):
"""
Truncates the conversation history based on the total number of tokens using a tokenizer.
Returns:
None
"""
total_tokens = 0
truncated_history = []
for message in self.conversation_history:
role = message.get("role")
content = message.get("content")
count = self.tokenizer.count_tokens(
text=content
)  # Count the number of tokens in this message
total_tokens += count
if total_tokens <= self.context_length:
truncated_history.append(message)
else:
remaining_tokens = self.context_length - (
total_tokens - count
)
truncated_content = content[
:remaining_tokens
]  # Character-level cut: a rough stand-in for token-level truncation
truncated_message = {
"role": role,
"content": truncated_content,
}
truncated_history.append(truncated_message)
break
self.conversation_history = truncated_history
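A sketch of the truncation hook, assuming BaseTokenizer can be subclassed by overriding count_tokens; the whitespace count below is illustrative, not a real tokenizer:
from swarms.structs.conversation import Conversation  # assumed import path
from swarms.tokenizers.base_tokenizer import BaseTokenizer

class WhitespaceTokenizer(BaseTokenizer):
    def count_tokens(self, text: str) -> int:
        return len(text.split())  # toy token count for illustration

convo = Conversation(
    system_prompt="You are terse.",
    tokenizer=WhitespaceTokenizer(),
    context_length=128,  # history beyond this many tokens is clipped
)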

@ -33,6 +33,16 @@ from swarms.utils.remove_json_whitespace import (
remove_whitespace_from_yaml,
)
from swarms.utils.exponential_backoff import ExponentialBackoffMixin
from swarms.utils.download_img import download_img_from_url
from swarms.utils.supervision_masking import (
FeatureType,
compute_mask_iou_vectorized,
mask_non_max_suppression,
filter_masks_by_relative_area,
masks_to_marks,
refine_marks,
)
from swarms.utils.supervision_visualizer import MarkVisualizer
__all__ = [
"SubprocessCodeInterpreter",
@ -59,4 +69,12 @@ __all__ = [
"remove_whitespace_from_json",
"remove_whitespace_from_yaml",
"ExponentialBackoffMixin",
"download_img_from_url",
"FeatureType",
"compute_mask_iou_vectorized",
"mask_non_max_suppression",
"filter_masks_by_relative_area",
"masks_to_marks",
"refine_marks",
"MarkVisualizer",
]

@ -0,0 +1,31 @@
from io import BytesIO
import requests
from PIL import Image
def download_img_from_url(url: str):
"""
Downloads an image from the given URL and saves it locally.
Args:
url (str): The URL of the image to download.
Raises:
ValueError: If the URL is empty or invalid.
IOError: If there is an error while downloading or saving the image.
"""
if not url:
raise ValueError("URL cannot be empty.")
try:
response = requests.get(url)
response.raise_for_status()
image = Image.open(BytesIO(response.content))
image.save("downloaded_image.jpg")
print("Image downloaded successfully.")
except requests.exceptions.RequestException as e:
raise IOError("Error while downloading the image.") from e
except IOError as e:
raise IOError("Error while saving the image.") from e

@ -1,4 +1,6 @@
import logging
import functools
logger = logging.getLogger()
formatter = logging.Formatter("%(message)s")
@ -10,3 +12,35 @@ ch.setFormatter(formatter)
logger.addHandler(ch)
logger.setLevel(logging.DEBUG)
def log_wrapper(func):
"""
A decorator that logs the inputs, outputs, and any exceptions of the function it wraps.
Args:
func (callable): The function to wrap.
Returns:
callable: The wrapped function.
"""
@functools.wraps(func)
def wrapper(*args, **kwargs):
logger.debug(
f"Calling function {func.__name__} with args {args} and"
f" kwargs {kwargs}"
)
try:
result = func(*args, **kwargs)
logger.debug(
f"Function {func.__name__} returned {result}"
)
return result
except Exception as e:
logger.error(
f"Function {func.__name__} raised an exception: {e}"
)
raise
return wrapper
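A quick sketch of the decorator in use:
@log_wrapper
def add(a: int, b: int) -> int:
    return a + b

add(2, 3)  # logs the call, the (2, 3) args, and the returned result 5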

@ -0,0 +1,259 @@
from enum import Enum
import cv2
import numpy as np
import supervision as sv
class FeatureType(Enum):
"""
An enumeration to represent the types of features for mask adjustment in image
segmentation.
"""
ISLAND = "ISLAND"
HOLE = "HOLE"
@classmethod
def list(cls):
return list(map(lambda c: c.value, cls))
def compute_mask_iou_vectorized(masks: np.ndarray) -> np.ndarray:
"""
Vectorized computation of the Intersection over Union (IoU) for all pairs of masks.
Parameters:
masks (np.ndarray): A 3D numpy array with shape `(N, H, W)`, where `N` is the
number of masks, `H` is the height, and `W` is the width.
Returns:
np.ndarray: A 2D numpy array of shape `(N, N)` where each element `[i, j]` is
the IoU between masks `i` and `j`.
Raises:
ValueError: If any of the masks is found to be empty.
"""
if np.any(masks.sum(axis=(1, 2)) == 0):
raise ValueError(
"One or more masks are empty. Please filter out empty"
" masks before using `compute_iou_vectorized` function."
)
masks_bool = masks.astype(bool)
masks_flat = masks_bool.reshape(masks.shape[0], -1)
intersection = np.logical_and(
masks_flat[:, None], masks_flat[None, :]
).sum(axis=2)
union = np.logical_or(
masks_flat[:, None], masks_flat[None, :]
).sum(axis=2)
iou_matrix = intersection / union
return iou_matrix
def mask_non_max_suppression(
masks: np.ndarray, iou_threshold: float = 0.6
) -> np.ndarray:
"""
Performs Non-Max Suppression on a set of masks by prioritizing larger masks and
removing smaller masks that overlap significantly.
When the IoU between two masks exceeds the specified threshold, the smaller mask
(in terms of area) is discarded. This process is repeated for each pair of masks,
effectively filtering out masks that are significantly overlapped by larger ones.
Parameters:
masks (np.ndarray): A 3D numpy array with shape `(N, H, W)`, where `N` is the
number of masks, `H` is the height, and `W` is the width.
iou_threshold (float): The IoU threshold for determining significant overlap.
Returns:
np.ndarray: A 3D numpy array of filtered masks.
"""
num_masks = masks.shape[0]
areas = masks.sum(axis=(1, 2))
sorted_idx = np.argsort(-areas)
keep_mask = np.ones(num_masks, dtype=bool)
iou_matrix = compute_mask_iou_vectorized(masks)
for i in range(num_masks):
if not keep_mask[sorted_idx[i]]:
continue
overlapping_masks = iou_matrix[sorted_idx[i]] > iou_threshold
overlapping_masks[sorted_idx[i]] = False
overlapping_indices = np.where(overlapping_masks)[0]
keep_mask[sorted_idx[overlapping_indices]] = False
return masks[keep_mask]
def filter_masks_by_relative_area(
masks: np.ndarray,
minimum_area: float = 0.01,
maximum_area: float = 1.0,
) -> np.ndarray:
"""
Filters masks based on their relative area within the total area of each mask.
Parameters:
masks (np.ndarray): A 3D numpy array with shape `(N, H, W)`, where `N` is the
number of masks, `H` is the height, and `W` is the width.
minimum_area (float): The minimum relative area threshold. Must be between `0`
and `1`.
maximum_area (float): The maximum relative area threshold. Must be between `0`
and `1`.
Returns:
np.ndarray: A 3D numpy array containing masks that fall within the specified
relative area range.
Raises:
ValueError: If `minimum_area` or `maximum_area` are outside the `0` to `1`
range, or if `minimum_area` is greater than `maximum_area`.
"""
if not (isinstance(masks, np.ndarray) and masks.ndim == 3):
raise ValueError("Input must be a 3D numpy array.")
if not (0 <= minimum_area <= 1) or not (0 <= maximum_area <= 1):
raise ValueError(
"`minimum_area` and `maximum_area` must be between 0"
" and 1."
)
if minimum_area > maximum_area:
raise ValueError(
"`minimum_area` must be less than or equal to"
" `maximum_area`."
)
total_area = masks.shape[1] * masks.shape[2]
relative_areas = masks.sum(axis=(1, 2)) / total_area
return masks[
(relative_areas >= minimum_area)
& (relative_areas <= maximum_area)
]
def adjust_mask_features_by_relative_area(
mask: np.ndarray,
area_threshold: float,
feature_type: FeatureType = FeatureType.ISLAND,
) -> np.ndarray:
"""
Adjusts a mask by removing small islands or filling small holes based on a relative
area threshold.
!!! warning
Running this function on a mask with small islands may result in empty masks.
Parameters:
mask (np.ndarray): A 2D numpy array with shape `(H, W)`, where `H` is the
height, and `W` is the width.
area_threshold (float): Threshold for relative area to remove or fill features.
feature_type (FeatureType): Type of feature to adjust (`ISLAND` for removing
islands, `HOLE` for filling holes).
Returns:
np.ndarray: A 2D numpy array containing mask.
"""
height, width = mask.shape
total_area = width * height
mask = np.uint8(mask * 255)
operation = (
cv2.RETR_EXTERNAL
if feature_type == FeatureType.ISLAND
else cv2.RETR_CCOMP
)
contours, _ = cv2.findContours(
mask, operation, cv2.CHAIN_APPROX_SIMPLE
)
for contour in contours:
area = cv2.contourArea(contour)
relative_area = area / total_area
if relative_area < area_threshold:
cv2.drawContours(
image=mask,
contours=[contour],
contourIdx=-1,
color=(
0 if feature_type == FeatureType.ISLAND else 255
),
thickness=-1,
)
return np.where(mask > 0, 1, 0).astype(bool)
def masks_to_marks(masks: np.ndarray) -> sv.Detections:
"""
Converts a set of masks to a marks (sv.Detections) object.
Parameters:
masks (np.ndarray): A 3D numpy array with shape `(N, H, W)`, where `N` is the
number of masks, `H` is the height, and `W` is the width.
Returns:
sv.Detections: An object containing the masks and their bounding box
coordinates.
"""
if len(masks) == 0:
marks = sv.Detections.empty()
marks.mask = np.empty((0, 0, 0), dtype=bool)
return marks
return sv.Detections(
mask=masks, xyxy=sv.mask_to_xyxy(masks=masks)
)
def refine_marks(
marks: sv.Detections,
maximum_hole_area: float = 0.01,
maximum_island_area: float = 0.01,
minimum_mask_area: float = 0.02,
maximum_mask_area: float = 1.0,
) -> sv.Detections:
"""
Refines a set of masks by removing small islands and holes, and filtering by mask
area.
Parameters:
marks (sv.Detections): An object containing the masks and their bounding box
coordinates.
maximum_hole_area (float): The maximum relative area of holes to be filled in
each mask.
maximum_island_area (float): The maximum relative area of islands to be removed
from each mask.
minimum_mask_area (float): The minimum relative area for a mask to be retained.
maximum_mask_area (float): The maximum relative area for a mask to be retained.
Returns:
sv.Detections: An object containing the masks and their bounding box
coordinates.
"""
result_masks = []
for mask in marks.mask:
mask = adjust_mask_features_by_relative_area(
mask=mask,
area_threshold=maximum_island_area,
feature_type=FeatureType.ISLAND,
)
mask = adjust_mask_features_by_relative_area(
mask=mask,
area_threshold=maximum_hole_area,
feature_type=FeatureType.HOLE,
)
if np.any(mask):
result_masks.append(mask)
result_masks = np.array(result_masks)
result_masks = filter_masks_by_relative_area(
masks=result_masks,
minimum_area=minimum_mask_area,
maximum_area=maximum_mask_area,
)
return sv.Detections(
mask=result_masks, xyxy=sv.mask_to_xyxy(masks=result_masks)
)
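A self-contained sketch of the pipeline on two synthetic overlapping masks; the shapes and thresholds here are arbitrary:
import numpy as np

masks = np.zeros((2, 64, 64), dtype=bool)
masks[0, 8:40, 8:40] = True  # larger square
masks[1, 10:38, 10:38] = True  # mostly contained in the first
kept = mask_non_max_suppression(masks, iou_threshold=0.6)  # drops the smaller mask
marks = refine_marks(masks_to_marks(kept))
print(kept.shape, len(marks))  # (1, 64, 64) and 1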

@ -0,0 +1,85 @@
import numpy as np
import supervision as sv
class MarkVisualizer:
"""
A class for visualizing different marks including bounding boxes, masks, polygons,
and labels.
Parameters:
line_thickness (int): The thickness of the lines for boxes and polygons.
mask_opacity (float): The opacity level for masks.
text_scale (float): The scale of the text for labels.
"""
def __init__(
self,
line_thickness: int = 2,
mask_opacity: float = 0.1,
text_scale: float = 0.6,
) -> None:
self.box_annotator = sv.BoundingBoxAnnotator(
color_lookup=sv.ColorLookup.INDEX,
thickness=line_thickness,
)
self.mask_annotator = sv.MaskAnnotator(
color_lookup=sv.ColorLookup.INDEX, opacity=mask_opacity
)
self.polygon_annotator = sv.PolygonAnnotator(
color_lookup=sv.ColorLookup.INDEX,
thickness=line_thickness,
)
self.label_annotator = sv.LabelAnnotator(
color=sv.Color.black(),
text_color=sv.Color.white(),
color_lookup=sv.ColorLookup.INDEX,
text_position=sv.Position.CENTER_OF_MASS,
text_scale=text_scale,
)
def visualize(
self,
image: np.ndarray,
marks: sv.Detections,
with_box: bool = False,
with_mask: bool = False,
with_polygon: bool = True,
with_label: bool = True,
) -> np.ndarray:
"""
Visualizes annotations on an image.
This method takes an image and an instance of sv.Detections, and overlays
the specified types of marks (boxes, masks, polygons, labels) on the image.
Parameters:
image (np.ndarray): The image on which to overlay annotations.
marks (sv.Detections): The detection results containing the annotations.
with_box (bool): Whether to draw bounding boxes. Defaults to False.
with_mask (bool): Whether to overlay masks. Defaults to False.
with_polygon (bool): Whether to draw polygons. Defaults to True.
with_label (bool): Whether to add labels. Defaults to True.
Returns:
np.ndarray: The annotated image.
"""
annotated_image = image.copy()
if with_box:
annotated_image = self.box_annotator.annotate(
scene=annotated_image, detections=marks
)
if with_mask:
annotated_image = self.mask_annotator.annotate(
scene=annotated_image, detections=marks
)
if with_polygon:
annotated_image = self.polygon_annotator.annotate(
scene=annotated_image, detections=marks
)
if with_label:
labels = list(map(str, range(len(marks))))
annotated_image = self.label_annotator.annotate(
scene=annotated_image, detections=marks, labels=labels
)
return annotated_image
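Tying the generator and visualizer together, a sketch that assumes scene.png exists and uses the sam_supervision module added in this same commit:
import cv2
from swarms.models.sam_supervision import SegmentAnythingMarkGenerator

image = cv2.imread("scene.png")
marks = SegmentAnythingMarkGenerator(device="cpu")(image=image)
annotated = MarkVisualizer().visualize(image=image, marks=marks)
cv2.imwrite("annotated.png", annotated)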