[FEAT][Flow.run() img = None for conditional img inputs, BaseMultiModalModel, and multi modal swarms of manufacturing agents

pull/197/head
Kye 1 year ago
parent f895497f88
commit a92a6a5c13

@ -1,9 +1,21 @@
import os
from dotenv import load_dotenv
# Import the OpenAIChat model and the Flow struct
from swarms.models import OpenAIChat from swarms.models import OpenAIChat
from swarms.structs import Flow from swarms.structs import Flow
# Load the environment variables
load_dotenv()
# Get the API key from the environment
api_key = os.environ.get("OPENAI_API_KEY")
# Initialize the language model # Initialize the language model
llm = OpenAIChat( llm = OpenAIChat(
temperature=0.5, temperature=0.5,
openai_api_key=api_key,
) )

@ -1,5 +1,8 @@
from swarms.structs import Flow from swarms.structs import Flow
from swarms.models.gpt4_vision_api import GPT4VisionAPI from swarms.models.gpt4_vision_api import GPT4VisionAPI
from swarms.prompts.multi_modal_autonomous_instruction_prompt import (
MULTI_MODAL_AUTO_AGENT_SYSTEM_PROMPT_1,
)
llm = GPT4VisionAPI() llm = GPT4VisionAPI()
@ -10,6 +13,7 @@ img = "images/swarms.jpeg"
## Initialize the workflow ## Initialize the workflow
flow = Flow( flow = Flow(
llm=llm, llm=llm,
sop=MULTI_MODAL_AUTO_AGENT_SYSTEM_PROMPT_1,
max_loops="auto", max_loops="auto",
) )

@ -0,0 +1,7 @@
"""
Idea 2 img
task -> gpt4 text -> dalle3 img -> gpt4vision img + text analyze img -> dalle3 img -> loop
"""
from swarms.models.gpt4_vision_api import GPT4VisionAPI

@ -0,0 +1,15 @@
"""
Swarm of multi modal autonomous agents for manufacturing!
---------------------------------------------------------
Health Security agent: Agent that monitors the health of working conditions: input image of factory output: health safety index 0.0 - 1.0 being the highest
Quality Control agent: Agent that monitors the quality of the product: input image of product output: quality index 0.0 - 1.0 being the highest
Productivity agent: Agent that monitors the productivity of the factory: input image of factory output: productivity index 0.0 - 1.0 being the highest
Safety agent: Agent that monitors the safety of the factory: input image of factory output: safety index 0.0 - 1.0 being the highest
Security agent: Agent that monitors the security of the factory: input image of factory output: security index 0.0 - 1.0 being the highest
Sustainability agent: Agent that monitors the sustainability of the factory: input image of factory output: sustainability index 0.0 - 1.0 being the highest
Efficiency agent: Agent that monitors the efficiency of the factory: input image of factory output: efficiency index 0.0 - 1.0 being the highest
Flow:
health security agent -> quality control agent -> productivity agent -> safety agent -> security agent -> sustainability agent -> efficiency agent
"""

@ -1,3 +1,4 @@
from abc import abstractmethod
import asyncio import asyncio
import base64 import base64
import concurrent.futures import concurrent.futures
@ -7,8 +8,8 @@ from io import BytesIO
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
import requests import requests
from ABC import abstractmethod
from PIL import Image from PIL import Image
from termcolor import colored
class BaseMultiModalModel: class BaseMultiModalModel:
@ -37,7 +38,6 @@ class BaseMultiModalModel:
self.retries = retries self.retries = retries
self.chat_history = [] self.chat_history = []
@abstractmethod @abstractmethod
def __call__(self, text: str, img: str): def __call__(self, text: str, img: str):
"""Run the model""" """Run the model"""
@ -101,7 +101,6 @@ class BaseMultiModalModel:
for result in results: for result in results:
print(result) print(result)
def run_batch(self, tasks_images: List[Tuple[str, str]]) -> List[str]: def run_batch(self, tasks_images: List[Tuple[str, str]]) -> List[str]:
"""Process a batch of tasks and images""" """Process a batch of tasks and images"""
with concurrent.futures.ThreadPoolExecutor() as executor: with concurrent.futures.ThreadPoolExecutor() as executor:
@ -213,3 +212,12 @@ class BaseMultiModalModel:
"""Print Beautifully with termcolor""" """Print Beautifully with termcolor"""
content = colored(content, color) content = colored(content, color)
print(content) print(content)
def stream(self, content: str):
"""Stream the output
Args:
content (str): _description_
"""
for chunk in content:
print(chunk)

@ -1,6 +1,7 @@
import logging import logging
import asyncio import asyncio
import base64 import base64
from typing import Optional
import concurrent.futures import concurrent.futures
from termcolor import colored from termcolor import colored
import json import json
@ -12,6 +13,13 @@ import aiohttp
import requests import requests
from dotenv import load_dotenv from dotenv import load_dotenv
try:
import cv2
except ImportError:
print("OpenCV not installed. Please install OpenCV to use this model.")
raise ImportError
# Load environment variables # Load environment variables
load_dotenv() load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY") openai_api_key = os.getenv("OPENAI_API_KEY")
@ -59,7 +67,8 @@ class GPT4VisionAPI:
max_workers: int = 10, max_workers: int = 10,
max_tokens: str = 300, max_tokens: str = 300,
openai_proxy: str = "https://api.openai.com/v1/chat/completions", openai_proxy: str = "https://api.openai.com/v1/chat/completions",
beautify: bool = False beautify: bool = False,
streaming_enabled: Optional[bool] = False,
): ):
super().__init__() super().__init__()
self.openai_api_key = openai_api_key self.openai_api_key = openai_api_key
@ -69,6 +78,7 @@ class GPT4VisionAPI:
self.max_tokens = max_tokens self.max_tokens = max_tokens
self.openai_proxy = openai_proxy self.openai_proxy = openai_proxy
self.beautify = beautify self.beautify = beautify
self.streaming_enabled = streaming_enabled
if self.logging_enabled: if self.logging_enabled:
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
@ -123,14 +133,101 @@ class GPT4VisionAPI:
out = response.json() out = response.json()
content = out["choices"][0]["message"]["content"] content = out["choices"][0]["message"]["content"]
if self.streaming_enabled:
content = self.stream_response(content)
else:
pass
if self.beautify: if self.beautify:
content = colored(content, "cyan") content = colored(content, "cyan")
print(content)
else: else:
print(content) print(content)
except Exception as error: except Exception as error:
print(f"Error with the request: {error}") print(f"Error with the request: {error}")
raise error raise error
def video_prompt(self, frames):
"""
SystemPrompt is a class that generates a prompt for the user to respond to.
The prompt is generated based on the current state of the system.
Parameters
----------
frames : list
A list of base64 frames
Returns
-------
PROMPT : str
The system prompt
Examples
--------
>>> from swarms.models import GPT4VisionAPI
>>> llm = GPT4VisionAPI()
>>> video = "video.mp4"
>>> base64_frames = llm.process_video(video)
>>> prompt = llm.video_prompt(base64_frames)
>>> print(prompt)
"""
PROMPT = f"""
These are frames from a video that I want to upload. Generate a compelling description that I can upload along with the video:
{frames}
"""
return PROMPT
def stream_response(self, content: str):
"""Stream the response of the output
Args:
content (str): _description_
"""
for chunk in content:
print(chunk)
def process_video(self, video: str):
"""
Process a video into a list of base64 frames
Parameters
----------
video : str
The path to the video file
Returns
-------
base64_frames : list
A list of base64 frames
Examples
--------
>>> from swarms.models import GPT4VisionAPI
>>> llm = GPT4VisionAPI()
>>> video = "video.mp4"
>>> base64_frames = llm.process_video(video)
"""
video = cv2.VideoCapture(video)
base64_frames = []
while video.isOpened():
success, frame = video.read()
if not success:
break
_, buffer = cv2.imencode(".jpg", frame)
base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
video.release()
print(len(base64_frames), "frames read.")
for img in base64_frames:
base64.b64decode(img.encode("utf-8"))
def __call__(self, task: str, img: str): def __call__(self, task: str, img: str):
"""Run the model.""" """Run the model."""
try: try:
@ -168,10 +265,17 @@ class GPT4VisionAPI:
out = response.json() out = response.json()
content = out["choices"][0]["message"]["content"] content = out["choices"][0]["message"]["content"]
if self.streaming_enabled:
content = self.stream_response(content)
else:
pass
if self.beautify: if self.beautify:
content = colored(content, "cyan") content = colored(content, "cyan")
print(content)
else: else:
print(content) print(content)
except Exception as error: except Exception as error:
print(f"Error with the request: {error}") print(f"Error with the request: {error}")
raise error raise error

@ -99,7 +99,9 @@ class WhisperX:
print("The key 'segments' is not found in the result.") print("The key 'segments' is not found in the result.")
def transcribe(self, audio_file): def transcribe(self, audio_file):
model = whisperx_model.load_model("large-v2", self.device, self.compute_type) model = whisperx_model.load_model(
"large-v2", self.device, self.compute_type
)
audio = whisperx_model.load_audio(audio_file) audio = whisperx_model.load_audio(audio_file)
result = model.transcribe(audio, batch_size=self.batch_size) result = model.transcribe(audio, batch_size=self.batch_size)

@ -498,7 +498,7 @@ class Flow:
) )
print(error) print(error)
def run(self, task: str, img: Optional[str], **kwargs): def run(self, task: Optional[str], img: Optional[str] = None, **kwargs):
""" """
Run the autonomous agent loop Run the autonomous agent loop
@ -528,7 +528,11 @@ class Flow:
self.print_dashboard(task) self.print_dashboard(task)
loop_count = 0 loop_count = 0
# While the max_loops is auto or the loop count is less than the max_loops
while self.max_loops == "auto" or loop_count < self.max_loops: while self.max_loops == "auto" or loop_count < self.max_loops:
# Loop count
loop_count += 1 loop_count += 1
print( print(
colored(f"\nLoop {loop_count} of {self.max_loops}", "blue") colored(f"\nLoop {loop_count} of {self.max_loops}", "blue")

@ -1,15 +1,14 @@
import logging import logging
import os import os
import warnings import warnings
def disable_logging(): def disable_logging():
warnings.filterwarnings("ignore", category=UserWarning) warnings.filterwarnings("ignore", category=UserWarning)
# disable tensorflow warnings # disable tensorflow warnings
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# Set the logging level for the entire module # Set the logging level for the entire module
logging.basicConfig(level=logging.WARNING) logging.basicConfig(level=logging.WARNING)
@ -20,6 +19,12 @@ def disable_logging():
except Exception as error: except Exception as error:
print(f"Pytorch logging not disabled: {error}") print(f"Pytorch logging not disabled: {error}")
for logger_name in ['tensorflow', 'h5py', 'numexpr', 'git', 'wandb.docker.auth']: for logger_name in [
"tensorflow",
"h5py",
"numexpr",
"git",
"wandb.docker.auth",
]:
logger = logging.getLogger(logger_name) logger = logging.getLogger(logger_name)
logger.setLevel(logging.WARNING) # Supress DEBUG and info logs logger.setLevel(logging.WARNING) # Supress DEBUG and info logs

Loading…
Cancel
Save