[FEAT][CLIPQ]

pull/362/head^2
Kye 1 year ago
parent b097e6016d
commit cdf68c9467

@@ -9,7 +9,6 @@ from langchain_experimental.autonomous_agents import AutoGPT
from swarms.utils.decorators import error_decorator, timing_decorator

class Worker:
    """
    The Worker class represents an autonomous agent that can perform tasks through
@@ -165,7 +164,7 @@ class Worker:
    # @log_decorator
    @error_decorator
    @timing_decorator
-    def run(self, task: str = None, *args, **kwargs):
+    def run(self, task: str = None, img = None, *args, **kwargs):
        """
        Run the autonomous agent on a given task.
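For reference, the widened signature lets callers pass an image alongside the task. A minimal call sketch against an already-constructed Worker instance (the image path is illustrative; construction is unchanged by this diff):

    out = worker.run(task="Describe this image", img="photo.jpg")  # new img parameter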

@@ -47,6 +47,7 @@ from swarms.models.ultralytics_model import (
from swarms.models.vip_llava import VipLlavaMultiModal  # noqa: E402
from swarms.models.llava import LavaMultiModal  # noqa: E402
from swarms.models.qwen import QwenVLMultiModal  # noqa: E402
+from swarms.models.clipq import CLIPQ  # noqa: E402

# from swarms.models.dalle3 import Dalle3
# from swarms.models.distilled_whisperx import DistilWhisperModel  # noqa: E402
@@ -110,4 +111,6 @@ __all__ = [
    "VipLlavaMultiModal",
    "LavaMultiModal",
    "QwenVLMultiModal",
+    "CLIPQ",
]
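With the import and __all__ entries above, the class should be importable from the package namespace; a minimal sketch assuming the re-export shown in this hunk:

    from swarms.models import CLIPQ

    clipq = CLIPQ(query_text="A photo of")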

@@ -0,0 +1,161 @@
from io import BytesIO
import requests
import torch
from PIL import Image
from torchvision.transforms import GaussianBlur
from transformers import CLIPModel, CLIPProcessor
class CLIPQ:
    """
    CLIPQ is a CLIP-based model that can be used to generate embeddings and captions for images.

    Attributes:
        model_name (str): The name of the model to be used.
        query_text (str): The query text to be used for the model.

    Args:
        model_name (str): The name of the model to be used.
        query_text (str): The query text to be used for the model.
    """
    def __init__(
        self,
        model_name: str = "openai/clip-vit-base-patch16",
        query_text: str = "A photo ",
        *args,
        **kwargs,
    ):
        self.model = CLIPModel.from_pretrained(model_name, *args, **kwargs)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.query_text = query_text
    def fetch_image_from_url(self, url="https://picsum.photos/800"):
        """Fetches an image from the given url"""
        response = requests.get(url)
        if response.status_code != 200:
            raise Exception("Failed to fetch an image")
        image = Image.open(BytesIO(response.content))
        return image
    def load_image_from_path(self, path):
        """Loads an image from the given path"""
        return Image.open(path)
    def split_image(self, image, h_splits: int = 2, v_splits: int = 2):
        """Splits the given image into h_splits x v_splits parts"""
        width, height = image.size
        w_step, h_step = width // h_splits, height // v_splits
        slices = []
        for i in range(v_splits):
            for j in range(h_splits):
                slice = image.crop(
                    (j * w_step, i * h_step, (j + 1) * w_step, (i + 1) * h_step)
                )
                slices.append(slice)
        return slices
    def get_vectors(
        self,
        image,
        h_splits: int = 2,
        v_splits: int = 2,
    ):
        """Gets the vectors for the given image"""
        slices = self.split_image(image, h_splits, v_splits)
        vectors = []
        for slice in slices:
            inputs = self.processor(
                text=self.query_text, images=slice, return_tensors="pt", padding=True
            )
            outputs = self.model(**inputs)
            vectors.append(outputs.image_embeds.squeeze().detach().numpy())
        return vectors
    def run_from_url(
        self,
        url: str = "https://picsum.photos/800",
        h_splits: int = 2,
        v_splits: int = 2,
    ):
        """Runs the model on the image fetched from the given url"""
        image = self.fetch_image_from_url(url)
        return self.get_vectors(image, h_splits, v_splits)
    def check_hard_chunking(self, quadrants):
        """Check if the chunking is hard by measuring variance along quadrant edges"""
        variances = []
        for quadrant in quadrants:
            # Concatenate the first and last rows of the quadrant; high variance
            # along these edges suggests a hard cut between tiles.
            edge_pixels = torch.cat(
                [
                    quadrant[0, :],
                    quadrant[-1, :],
                ]
            )
            variances.append(torch.var(edge_pixels).item())
        return variances
    def embed_whole_image(self, image):
        """Embed the entire image"""
        inputs = self.processor(
            images=image,
            return_tensors="pt",
        )
        with torch.no_grad():
            # Use the image tower only; the full forward pass would also
            # require text inputs.
            image_embeds = self.model.get_image_features(**inputs)
        return image_embeds.squeeze()
    def apply_noise_reduction(self, image, kernel_size: int = 5):
        """Apply a Gaussian blur to the image to reduce noise and soften tiling artifacts"""
        blur = GaussianBlur(kernel_size)
        return blur(image)
    def run_from_path(self, path: str = None, h_splits: int = 2, v_splits: int = 2):
        """Runs the model on the image loaded from the given path"""
        image = self.load_image_from_path(path)
        return self.get_vectors(image, h_splits, v_splits)
    def get_captions(self, image, candidate_captions):
        """Get the best caption for the given image"""
        inputs_image = self.processor(
            images=image,
            return_tensors="pt",
        )
        inputs_text = self.processor(
            text=candidate_captions,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )

        # Encode the image and the candidate captions separately
        image_embeds = self.model.get_image_features(
            pixel_values=inputs_image.pixel_values
        )
        text_embeds = self.model.get_text_features(
            input_ids=inputs_text.input_ids,
            attention_mask=inputs_text.attention_mask,
        )

        # Calculate similarity between the image and each candidate caption
        similarities = (image_embeds @ text_embeds.T).squeeze(0)
        best_caption_index = similarities.argmax().item()
        return candidate_captions[best_caption_index]
    def get_and_concat_captions(
        self, image, candidate_captions, h_splits=2, v_splits=2
    ):
        """Get the best caption for each slice of the image and concatenate them"""
        slices = self.split_image(image, h_splits, v_splits)
        captions = [
            self.get_captions(slice, candidate_captions) for slice in slices
        ]
        concated_captions = "".join(captions)
        return concated_captions
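A minimal usage sketch for the new class, assuming network access for the sample URL; the candidate captions below are illustrative:

    from swarms.models.clipq import CLIPQ

    clipq = CLIPQ(query_text="A photo of")

    # Patch-level embeddings from a remote image (2 x 2 grid -> 4 vectors)
    vectors = clipq.run_from_url("https://picsum.photos/800", h_splits=2, v_splits=2)
    print(len(vectors), vectors[0].shape)

    # Best-matching caption per quadrant, concatenated into one string
    image = clipq.fetch_image_from_url()
    caption = clipq.get_and_concat_captions(
        image, ["a city street", "a mountain lake", "a portrait of a person"]
    )
    print(caption)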