Jarvis demo, base multimodal model, whisperx -> whisperx_model

pull/187/head
Kye 1 year ago
parent 9390efb8aa
commit 51c82cf1f2

@@ -0,0 +1,20 @@
from swarms.structs import Flow
from swarms.models.gpt4_vision_api import GPT4VisionAPI
from swarms.prompts.multi_modal_autonomous_instruction_prompt import (
    MULTI_MODAL_AUTO_AGENT_SYSTEM_PROMPT_1,
)

llm = GPT4VisionAPI()

task = "What is the color of the object?"
img = "images/swarms.jpeg"

## Initialize the workflow
flow = Flow(
    llm=llm,
    sop=MULTI_MODAL_AUTO_AGENT_SYSTEM_PROMPT_1,
    max_loops="auto",
)

flow.run(task=task, img=img)
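
For a quick sanity check without the Flow wrapper, the underlying vision model can be called directly. A minimal sketch, assuming only the GPT4VisionAPI.__call__(task, img) signature shown later in this commit and an OPENAI_API_KEY in the environment:

from swarms.models.gpt4_vision_api import GPT4VisionAPI

llm = GPT4VisionAPI()
# Bypass Flow and query the model once; task and image match the demo above
response = llm("What is the color of the object?", "images/swarms.jpeg")
print(response)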

@@ -20,8 +20,6 @@ from swarms.models.mpt import MPT7B # noqa: E402
# MultiModal Models
from swarms.models.idefics import Idefics # noqa: E402
# from swarms.models.kosmos_two import Kosmos # noqa: E402
from swarms.models.vilt import Vilt # noqa: E402
from swarms.models.nougat import Nougat # noqa: E402
from swarms.models.layoutlm_document_qa import LayoutLMDocumentQA # noqa: E402
@@ -30,6 +28,8 @@ from swarms.models.gpt4_vision_api import GPT4VisionAPI # noqa: E402
# from swarms.models.gpt4v import GPT4Vision
# from swarms.models.dalle3 import Dalle3
# from swarms.models.distilled_whisperx import DistilWhisperModel # noqa: E402
# from swarms.models.whisperx_model import WhisperX # noqa: E402
# from swarms.models.kosmos_two import Kosmos # noqa: E402
__all__ = [
"Anthropic",

@@ -0,0 +1,209 @@
import asyncio
import base64
import concurrent.futures
import time
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO
from typing import List, Optional, Tuple

import requests
from PIL import Image


class BaseMultiModalModel(ABC):
    def __init__(
        self,
        model_name: Optional[str],
        temperature: Optional[float] = 0.5,
        max_tokens: Optional[int] = 500,
        max_workers: Optional[int] = 10,
        top_p: Optional[float] = 1,
        top_k: Optional[int] = 50,
        device: Optional[str] = "cuda",
        max_new_tokens: Optional[int] = 500,
        retries: Optional[int] = 3,
    ):
        self.model_name = model_name
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.max_workers = max_workers
        self.top_p = top_p
        self.top_k = top_k
        self.device = device
        self.max_new_tokens = max_new_tokens
        self.retries = retries
        self.chat_history = []
        # Timing attributes read by _tokens_per_second and get_generation_time
        self.start_time = None
        self.end_time = None

    @abstractmethod
    def __call__(self, text: str, img: str):
        """Run the model"""
        pass

    def run(self, task: str, img: str):
        """Run the model"""
        pass

    async def arun(self, task: str, img: str):
        """Run the model asynchronously"""
        pass

    def get_img_from_web(self, img: str):
        """Get the image from the web"""
        try:
            response = requests.get(img)
            response.raise_for_status()
            image_pil = Image.open(BytesIO(response.content))
            return image_pil
        except requests.RequestException as error:
            print(f"Error fetching image from {img} and error: {error}")
            return None

    def encode_img(self, img: str):
        """Encode the image to base64"""
        with open(img, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def get_img(self, img: str):
        """Get the image from the path"""
        image_pil = Image.open(img)
        return image_pil

    def clear_chat_history(self):
        """Clear the chat history"""
        self.chat_history = []

    def run_many(
        self,
        tasks: List[str],
        imgs: List[str],
    ):
        """
        Run the model on multiple tasks and images all at once using concurrent

        Args:
            tasks (List[str]): List of tasks
            imgs (List[str]): List of image paths

        Returns:
            List[str]: List of responses
        """
        # Instantiate the thread pool executor
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(executor.map(self.run, tasks, imgs))

        # Print the results for debugging
        for result in results:
            print(result)

        return results

    def run_batch(self, tasks_images: List[Tuple[str, str]]) -> List[str]:
        """Process a batch of tasks and images"""
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(self.run, task, img)
                for task, img in tasks_images
            ]
            results = [future.result() for future in futures]
        return results

    async def run_batch_async(
        self, tasks_images: List[Tuple[str, str]]
    ) -> List[str]:
        """Process a batch of tasks and images asynchronously"""
        loop = asyncio.get_event_loop()
        futures = [
            loop.run_in_executor(None, self.run, task, img)
            for task, img in tasks_images
        ]
        return await asyncio.gather(*futures)

    async def run_batch_async_with_retries(
        self, tasks_images: List[Tuple[str, str]]
    ) -> List[str]:
        """Process a batch of tasks and images asynchronously with retries"""
        loop = asyncio.get_event_loop()
        futures = [
            loop.run_in_executor(None, self.run_with_retries, task, img)
            for task, img in tasks_images
        ]
        return await asyncio.gather(*futures)

    def unique_chat_history(self):
        """Get the unique chat history"""
        return list(set(self.chat_history))

    def run_with_retries(self, task: str, img: str):
        """Run the model with retries"""
        for i in range(self.retries):
            try:
                return self.run(task, img)
            except Exception as error:
                print(f"Error with the request {error}")
                continue

    def run_batch_with_retries(self, tasks_images: List[Tuple[str, str]]):
        """Run the model with retries"""
        for i in range(self.retries):
            try:
                return self.run_batch(tasks_images)
            except Exception as error:
                print(f"Error with the request {error}")
                continue

    def _tokens_per_second(self) -> float:
        """Tokens per second"""
        elapsed_time = self.end_time - self.start_time
        if elapsed_time == 0:
            return float("inf")
        # _num_tokens() is not defined in this file; subclasses are expected to provide it
        return self._num_tokens() / elapsed_time

    def _time_for_generation(self, task: str) -> float:
        """Time for Generation"""
        self.start_time = time.time()
        # NOTE: calls run() with the task only; assumes the subclass accepts a lone task argument
        self.run(task)
        self.end_time = time.time()
        return self.end_time - self.start_time

    @abstractmethod
    def generate_summary(self, text: str) -> str:
        """Generate Summary"""
        pass

    def set_temperature(self, value: float):
        """Set Temperature"""
        self.temperature = value

    def set_max_tokens(self, value: int):
        """Set new max tokens"""
        self.max_tokens = value

    def get_generation_time(self) -> float:
        """Get generation time"""
        if self.start_time and self.end_time:
            return self.end_time - self.start_time
        return 0

    def get_chat_history(self):
        """Get the chat history"""
        return self.chat_history

    def get_unique_chat_history(self):
        """Get the unique chat history"""
        return list(set(self.chat_history))

    def get_chat_history_length(self):
        """Get the chat history length"""
        return len(self.chat_history)

    def get_unique_chat_history_length(self):
        """Get the unique chat history length"""
        return len(list(set(self.chat_history)))

    def get_chat_history_tokens(self):
        """Get the chat history tokens"""
        return self._num_tokens()
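
The two @abstractmethod hooks above (__call__ and generate_summary) define the contract a concrete model must fill in. A minimal sketch of a subclass, assuming only the interface in this file; EchoModel, its import path, and its trivial bodies are hypothetical, for illustration:

from swarms.models.base_multimodal_model import BaseMultiModalModel  # import path is an assumption


class EchoModel(BaseMultiModalModel):
    """Hypothetical stand-in for a real multimodal backend."""

    def __call__(self, text: str, img: str):
        # A real model would load/encode img (e.g. via self.encode_img) and query a backend
        return f"{self.model_name} saw {img} and was asked: {text}"

    def run(self, task: str, img: str):
        # run() is the entry point that run_many/run_batch dispatch to
        return self(task, img)

    def generate_summary(self, text: str) -> str:
        return text[:100]


model = EchoModel(model_name="echo")
print(model.run_batch([("Describe", "a.png"), ("Count objects", "b.png")]))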

@@ -63,9 +63,9 @@ class Fuyu:
def __call__(self, text: str, img: str):
"""Call the model with text and img paths"""
image_pil = Image.open(img)
img = self.get_img(img)
model_inputs = self.processor(
text=text, images=[image_pil], device=self.device_map
text=text, images=[img], device=self.device_map
)
for k, v in model_inputs.items():
@@ -79,13 +79,13 @@ class Fuyu:
)
return print(str(text))
def get_img_from_web(self, img_url: str):
def get_img_from_web(self, img: str):
"""Get the image from the web"""
try:
response = requests.get(img_url)
response = requests.get(img)
response.raise_for_status()
image_pil = Image.open(BytesIO(response.content))
return image_pil
except requests.RequestException as error:
print(f"Error fetching image from {img_url} and error: {error}")
print(f"Error fetching image from {img} and error: {error}")
return None
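
With image loading now routed through get_img, the caller-facing shape stays the same for local files. A minimal usage sketch; Fuyu's constructor arguments are not shown in this hunk, so the bare Fuyu() call is an assumption:

fuyu = Fuyu()  # constructor arguments are an assumption, not part of this hunk
# __call__ takes the prompt text plus a local image path
fuyu("Describe what is in this image", "images/swarms.jpeg")

# Remote images go through get_img_from_web, which returns a PIL image or None on failure
image = fuyu.get_img_from_web("https://example.com/cat.png")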

@@ -114,7 +114,6 @@ class GPT4VisionAPI:
except Exception as error:
print(f"Error with the request: {error}")
raise error
# Function to handle vision tasks
def __call__(self, task: str, img: str):
"""Run the model."""

@@ -18,38 +18,31 @@ def is_overlapping(rect1, rect2):
class Kosmos:
"""
Kosmos model by Yen-Chun Shieh
Parameters
----------
model_name : str
Path to the pretrained model
Examples
--------
>>> kosmos = Kosmos()
>>> kosmos("Hello, my name is", "path/to/image.png")
Usage:
# Initialize Kosmos
kosmos = Kosmos()
# Perform multimodal grounding
kosmos.multimodal_grounding("Find the red apple in the image.", "https://example.com/apple.jpg")
# Perform referring expression comprehension
kosmos.referring_expression_comprehension("Show me the green bottle.", "https://example.com/bottle.jpg")
# Generate referring expressions
kosmos.referring_expression_generation("It is on the table.", "https://example.com/table.jpg")
# Perform grounded visual question answering
kosmos.grounded_vqa("What is the color of the car?", "https://example.com/car.jpg")
# Generate grounded image caption
kosmos.grounded_image_captioning("https://example.com/beach.jpg")
"""
def __init__(
self,
model_name="ydshieh/kosmos-2-patch14-224",
*args,
**kwargs,
):
self.model = AutoModelForVision2Seq.from_pretrained(
model_name, trust_remote_code=True
model_name, trust_remote_code=True, *args, **kwargs
)
self.processor = AutoProcessor.from_pretrained(
model_name, trust_remote_code=True
model_name, trust_remote_code=True, *args, **kwargs
)
def get_image(self, url):

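
Because the constructor now forwards *args/**kwargs into both from_pretrained calls, Hugging Face loading options can be passed straight through. A minimal sketch; the revision keyword is a standard from_pretrained option used here purely as an illustration:

kosmos = Kosmos()  # default checkpoint "ydshieh/kosmos-2-patch14-224"
# Any extra keyword is forwarded to both the model and the processor loaders
kosmos_pinned = Kosmos(model_name="ydshieh/kosmos-2-patch14-224", revision="main")
kosmos_pinned.multimodal_grounding(
    "Find the red apple in the image.", "https://example.com/apple.jpg"
)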
@@ -2,7 +2,7 @@ import os
import subprocess
try:
import whisperx
import swarms.models.whisperx_model as whisperx_model
from pydub import AudioSegment
from pytube import YouTube
except Exception as error:
@@ -66,17 +66,17 @@ class WhisperX:
compute_type = "float16"
# 1. Transcribe with original Whisper (batched) 🗣️
model = whisperx.load_model(
model = whisperx_model.load_model(
"large-v2", device, compute_type=compute_type
)
audio = whisperx.load_audio(audio_file)
audio = whisperx_model.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)
# 2. Align Whisper output 🔍
model_a, metadata = whisperx.load_align_model(
model_a, metadata = whisperx_model.load_align_model(
language_code=result["language"], device=device
)
result = whisperx.align(
result = whisperx_model.align(
result["segments"],
model_a,
metadata,
@@ -86,7 +86,7 @@ class WhisperX:
)
# 3. Assign speaker labels 🏷️
diarize_model = whisperx.DiarizationPipeline(
diarize_model = whisperx_model.DiarizationPipeline(
use_auth_token=self.hf_api_key, device=device
)
diarize_model(audio_file)
@@ -99,16 +99,16 @@ class WhisperX:
print("The key 'segments' is not found in the result.")
def transcribe(self, audio_file):
model = whisperx.load_model("large-v2", self.device, self.compute_type)
audio = whisperx.load_audio(audio_file)
model = whisperx_model.load_model("large-v2", self.device, self.compute_type)
audio = whisperx_model.load_audio(audio_file)
result = model.transcribe(audio, batch_size=self.batch_size)
# 2. Align Whisper output 🔍
model_a, metadata = whisperx.load_align_model(
model_a, metadata = whisperx_model.load_align_model(
language_code=result["language"], device=self.device
)
result = whisperx.align(
result = whisperx_model.align(
result["segments"],
model_a,
metadata,
@@ -118,7 +118,7 @@ class WhisperX:
)
# 3. Assign speaker labels 🏷️
diarize_model = whisperx.DiarizationPipeline(
diarize_model = whisperx_model.DiarizationPipeline(
use_auth_token=self.hf_api_key, device=self.device
)

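
One caveat with this rename: import swarms.models.whisperx_model as whisperx_model binds the swarms module, not the upstream whisperx package, so calls like whisperx_model.load_model(...) only resolve if that module re-exports the whisperx API alongside the WhisperX class. A minimal sketch of such a shim; the module's contents are not shown in this commit, so this layout is an assumption:

# swarms/models/whisperx_model.py -- hypothetical re-export shim
# Expose the upstream whisperx entry points under the swarms import path
from whisperx import (
    DiarizationPipeline,
    align,
    load_align_model,
    load_audio,
    load_model,
)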
@@ -7,7 +7,7 @@ import pytest
import whisperx
from pydub import AudioSegment
from pytube import YouTube
from swarms.models.whisperx import WhisperX
from swarms.models.whisperx_model import WhisperX
# Fixture to create a temporary directory for testing
