[FEAT][Flow.run() img = None for conditional img inputs, BaseMultiModalModel, and multi modal swarms of manufacturing agents

2 years ago · a92a6a5c13
parent f895497f88
commit a92a6a5c13
10 changed files with 189 additions and 28 deletions
--- a/example.py
+++ b/example.py
@ -1,9 +1,21 @@
 import os
 from dotenv import load_dotenv
 # Import the OpenAIChat model and the Flow struct
 from swarms.models import OpenAIChat
 from swarms.structs import Flow
 # Load the environment variables
 load_dotenv()
 # Get the API key from the environment
 api_key = os.environ.get("OPENAI_API_KEY")
 # Initialize the language model
 llm = OpenAIChat(
    temperature=0.5,
    openai_api_key=api_key,
 )
--- a/multi_modal_auto_agent.py
+++ b/multi_modal_auto_agent.py
@ -1,5 +1,8 @@
 from swarms.structs import Flow
 from swarms.models.gpt4_vision_api import GPT4VisionAPI
 from swarms.prompts.multi_modal_autonomous_instruction_prompt import (
    MULTI_MODAL_AUTO_AGENT_SYSTEM_PROMPT_1,
 )
 llm = GPT4VisionAPI()
@ -10,6 +13,7 @@ img = "images/swarms.jpeg"
 ## Initialize the workflow
 flow = Flow(
    llm=llm,
    sop=MULTI_MODAL_AUTO_AGENT_SYSTEM_PROMPT_1,
    max_loops="auto",
 )
--- a/playground/demos/idea_2_img/main.py
+++ b/playground/demos/idea_2_img/main.py
@ -0,0 +1,7 @@
 """
 Idea 2 img
 task -> gpt4 text -> dalle3 img -> gpt4vision img + text analyze img -> dalle3 img -> loop    
 """
 from swarms.models.gpt4_vision_api import GPT4VisionAPI
--- a/playground/demos/swarm_of_mma_manufacturing/main.py
+++ b/playground/demos/swarm_of_mma_manufacturing/main.py
@ -0,0 +1,15 @@
 """
 Swarm of multi modal autonomous agents for manufacturing!
 ---------------------------------------------------------    
 Health Security agent: Agent that monitors the health of working conditions: input image of factory output: health safety index 0.0 - 1.0 being the highest
 Quality Control agent: Agent that monitors the quality of the product: input image of product output: quality index 0.0 - 1.0 being the highest
 Productivity agent: Agent that monitors the productivity of the factory: input image of factory output: productivity index 0.0 - 1.0 being the highest
 Safety agent: Agent that monitors the safety of the factory: input image of factory output: safety index 0.0 - 1.0 being the highest
 Security agent: Agent that monitors the security of the factory: input image of factory output: security index 0.0 - 1.0 being the highest
 Sustainability agent: Agent that monitors the sustainability of the factory: input image of factory output: sustainability index 0.0 - 1.0 being the highest
 Efficiency agent: Agent that monitors the efficiency of the factory: input image of factory output: efficiency index 0.0 - 1.0 being the highest    
 Flow:
 health security agent -> quality control agent -> productivity agent -> safety agent -> security agent -> sustainability agent -> efficiency agent 
 """
--- a/swarms/models/base_multimodal_model.py
+++ b/swarms/models/base_multimodal_model.py
@ -1,3 +1,4 @@
 from abc import abstractmethod
 import asyncio
 import base64
 import concurrent.futures
@ -7,8 +8,8 @@ from io import BytesIO
 from typing import List, Optional, Tuple
 import requests
 from ABC import abstractmethod
 from PIL import Image
 from termcolor import colored
 class BaseMultiModalModel:
@ -37,7 +38,6 @@ class BaseMultiModalModel:
        self.retries = retries
        self.chat_history = []
    @abstractmethod
    def __call__(self, text: str, img: str):
        """Run the model"""
@ -61,17 +61,17 @@ class BaseMultiModalModel:
        except requests.RequestException as error:
            print(f"Error fetching image from {img} and error: {error}")
            return None
-        
+
    def encode_img(self, img: str):
        """Encode the image to base64"""
        with open(img, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")
-    
+
    def get_img(self, img: str):
        """Get the image from the path"""
        image_pil = Image.open(img)
        return image_pil
-    
+
    def clear_chat_history(self):
        """Clear the chat history"""
        self.chat_history = []
@ -87,11 +87,11 @@ class BaseMultiModalModel:
        Args:
            tasks (List[str]): List of tasks
            imgs (List[str]): List of image paths
-        
+
        Returns:
            List[str]: List of responses
-            
+
-        
+
        """
        # Instantiate the thread pool executor
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
@ -101,7 +101,6 @@ class BaseMultiModalModel:
        for result in results:
            print(result)
    def run_batch(self, tasks_images: List[Tuple[str, str]]) -> List[str]:
        """Process a batch of tasks and images"""
        with concurrent.futures.ThreadPoolExecutor() as executor:
@ -133,11 +132,11 @@ class BaseMultiModalModel:
            for task, img in tasks_images
        ]
        return await asyncio.gather(*futures)
-    
+
    def unique_chat_history(self):
        """Get the unique chat history"""
        return list(set(self.chat_history))
-    
+
    def run_with_retries(self, task: str, img: str):
        """Run the model with retries"""
        for i in range(self.retries):
@ -146,7 +145,7 @@ class BaseMultiModalModel:
            except Exception as error:
                print(f"Error with the request {error}")
                continue
-    
+
    def run_batch_with_retries(self, tasks_images: List[Tuple[str, str]]):
        """Run the model with retries"""
        for i in range(self.retries):
@ -188,28 +187,37 @@ class BaseMultiModalModel:
        if self.start_time and self.end_time:
            return self.end_time - self.start_time
        return 0
-    
+
    def get_chat_history(self):
        """Get the chat history"""
        return self.chat_history
-    
+
    def get_unique_chat_history(self):
        """Get the unique chat history"""
        return list(set(self.chat_history))
-    
+
    def get_chat_history_length(self):
        """Get the chat history length"""
        return len(self.chat_history)
-    
+
    def get_unique_chat_history_length(self):
        """Get the unique chat history length"""
        return len(list(set(self.chat_history)))
-    
+
    def get_chat_history_tokens(self):
        """Get the chat history tokens"""
        return self._num_tokens()
-    
+
    def print_beautiful(self, content: str, color: str = "cyan"):
        """Print Beautifully with termcolor"""
        content = colored(content, color)
-        print(content)
+        print(content)
    def stream(self, content: str):
        """Stream the output
        Args:
            content (str): _description_
        """
        for chunk in content:
            print(chunk)
--- a/swarms/models/gpt4_vision_api.py
+++ b/swarms/models/gpt4_vision_api.py
@ -1,6 +1,7 @@
-import logging 
+import logging
 import asyncio
 import base64
 from typing import Optional
 import concurrent.futures
 from termcolor import colored
 import json
@ -12,6 +13,13 @@ import aiohttp
 import requests
 from dotenv import load_dotenv
 try:
    import cv2
 except ImportError:
    print("OpenCV not installed. Please install OpenCV to use this model.")
    raise ImportError
 # Load environment variables
 load_dotenv()
 openai_api_key = os.getenv("OPENAI_API_KEY")
@ -59,7 +67,8 @@ class GPT4VisionAPI:
        max_workers: int = 10,
        max_tokens: str = 300,
        openai_proxy: str = "https://api.openai.com/v1/chat/completions",
-        beautify: bool = False
+        beautify: bool = False,
        streaming_enabled: Optional[bool] = False,
    ):
        super().__init__()
        self.openai_api_key = openai_api_key
@ -69,6 +78,7 @@ class GPT4VisionAPI:
        self.max_tokens = max_tokens
        self.openai_proxy = openai_proxy
        self.beautify = beautify
        self.streaming_enabled = streaming_enabled
        if self.logging_enabled:
            logging.basicConfig(level=logging.DEBUG)
@ -123,14 +133,101 @@ class GPT4VisionAPI:
            out = response.json()
            content = out["choices"][0]["message"]["content"]
            if self.streaming_enabled:
                content = self.stream_response(content)
            else:
                pass
            if self.beautify:
                content = colored(content, "cyan")
                print(content)
            else:
                print(content)
        except Exception as error:
            print(f"Error with the request: {error}")
            raise error
    def video_prompt(self, frames):
        """
        SystemPrompt is a class that generates a prompt for the user to respond to.
        The prompt is generated based on the current state of the system.
        Parameters
        ----------
        frames : list
            A list of base64 frames
        Returns
        -------
        PROMPT : str
            The system prompt
        Examples
        --------
        >>> from swarms.models import GPT4VisionAPI
        >>> llm = GPT4VisionAPI()
        >>> video = "video.mp4"
        >>> base64_frames = llm.process_video(video)
        >>> prompt = llm.video_prompt(base64_frames)
        >>> print(prompt)
        """
        PROMPT = f"""
        These are frames from a video that I want to upload. Generate a compelling description that I can upload along with the video:
        {frames}
        """
        return PROMPT
    def stream_response(self, content: str):
        """Stream the response of the output
        Args:
            content (str): _description_
        """
        for chunk in content:
            print(chunk)
    def process_video(self, video: str):
        """
        Process a video into a list of base64 frames
        Parameters
        ----------
        video : str
            The path to the video file
        Returns
        -------
        base64_frames : list
            A list of base64 frames
        Examples
        --------
        >>> from swarms.models import GPT4VisionAPI
        >>> llm = GPT4VisionAPI()
        >>> video = "video.mp4"
        >>> base64_frames = llm.process_video(video)
        """
        video = cv2.VideoCapture(video)
        base64_frames = []
        while video.isOpened():
            success, frame = video.read()
            if not success:
                break
            _, buffer = cv2.imencode(".jpg", frame)
            base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
        video.release()
        print(len(base64_frames), "frames read.")
        for img in base64_frames:
            base64.b64decode(img.encode("utf-8"))
    def __call__(self, task: str, img: str):
        """Run the model."""
        try:
@ -168,10 +265,17 @@ class GPT4VisionAPI:
            out = response.json()
            content = out["choices"][0]["message"]["content"]
            if self.streaming_enabled:
                content = self.stream_response(content)
            else:
                pass
            if self.beautify:
                content = colored(content, "cyan")
                print(content)
            else:
                print(content)
        except Exception as error:
            print(f"Error with the request: {error}")
            raise error
--- a/swarms/models/kosmos_two.py
+++ b/swarms/models/kosmos_two.py
@ -24,7 +24,7 @@ class Kosmos:
    ----------
    model_name : str
        Path to the pretrained model
-    
+
    Examples
    --------
    >>> kosmos = Kosmos()
--- a/swarms/models/whisperx_model.py
+++ b/swarms/models/whisperx_model.py
@ -99,7 +99,9 @@ class WhisperX:
            print("The key 'segments' is not found in the result.")
    def transcribe(self, audio_file):
-        model = whisperx_model.load_model("large-v2", self.device, self.compute_type)
+        model = whisperx_model.load_model(
            "large-v2", self.device, self.compute_type
        )
        audio = whisperx_model.load_audio(audio_file)
        result = model.transcribe(audio, batch_size=self.batch_size)
--- a/swarms/structs/flow.py
+++ b/swarms/structs/flow.py
@ -498,7 +498,7 @@ class Flow:
            )
            print(error)
-    def run(self, task: str, img: Optional[str], **kwargs):
+    def run(self, task: Optional[str], img: Optional[str] = None, **kwargs):
        """
        Run the autonomous agent loop
@ -528,7 +528,11 @@ class Flow:
                self.print_dashboard(task)
            loop_count = 0
            # While the max_loops is auto or the loop count is less than the max_loops
            while self.max_loops == "auto" or loop_count < self.max_loops:
                # Loop count
                loop_count += 1
                print(
                    colored(f"\nLoop {loop_count} of {self.max_loops}", "blue")
--- a/swarms/utils/disable_logging.py
+++ b/swarms/utils/disable_logging.py
@ -1,15 +1,14 @@
 import logging
 import os
 import warnings
 def disable_logging():
    warnings.filterwarnings("ignore", category=UserWarning)
    # disable tensorflow warnings
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
    # Set the logging level for the entire module
    logging.basicConfig(level=logging.WARNING)
@ -20,6 +19,12 @@ def disable_logging():
    except Exception as error:
        print(f"Pytorch logging not disabled: {error}")
-    for logger_name in ['tensorflow', 'h5py', 'numexpr', 'git', 'wandb.docker.auth']:
+    for logger_name in [
        "tensorflow",
        "h5py",
        "numexpr",
        "git",
        "wandb.docker.auth",
    ]:
        logger = logging.getLogger(logger_name)
-        logger.setLevel(logging.WARNING) # Supress DEBUG and info logs
+        logger.setLevel(logging.WARNING)  # Supress DEBUG and info logs