clean up

2 years ago · 1808da08d5
parent e49d85b65c
commit 1808da08d5
8 changed files with 362 additions and 244 deletions
--- a/swarms/agents/__init__.py
+++ b/swarms/agents/__init__.py
@ -16,5 +16,13 @@
 #agents
 # from swarms.agents.profitpilot import ProfitPilot
 from swarms.agents.aot import AoTAgent
-# from swarms.agents.multi_modal_agent import MultiModalVisualAgent
+# from swarms.agents.omni_modal_agent import OmniModalAgent
-# from swarms.agents.omni_modal_agent import OmniModalAgent
+from swarms.agents.multi_modal_visual_agent import MultiModalAgent
 #utils
 from swarms.agents.message import Message
 from swarms.agents.stream_response import stream
--- a/swarms/agents/message.py
+++ b/swarms/agents/message.py
@ -21,4 +21,7 @@ class Message:
        self.metadata = metadata or {}
    def __repr__(self):
        """
        Render the message as "<timestamp> - <sender>: <content>".
        """
        template = "{} - {}: {}"
        return template.format(self.timestamp, self.sender, self.content)
--- a/swarms/agents/multi_modal_agent.py
+++ b/swarms/agents/multi_modal_agent.py
@ -1,175 +0,0 @@
 from swarms.agents.multi_modal_workers.multi_modal_agent import MultiModalVisualAgent
 from swarms.agents.message import Message
 class MultiModalAgent:
    """
    A user-friendly abstraction over the MultiModalVisualAgent that provides a simple interface
    to process both text and images.

    Parameters:
        load_dict (dict, optional): Dictionary of class names and devices to load.
            Defaults to {"ImageCaptioning": "default_device"}.
        temperature (float, optional): Temperature passed to MultiModalVisualAgent. Defaults to 0.
        language (str, optional): Default language for the agent. Defaults to "english".

    Attributes:
        agent: The wrapped MultiModalVisualAgent instance.
        history (list): Message objects appended by chat() for each user message and agent reply.

    Usage
    --------------
    For chats:
    ------------
    agent = MultiModalAgent()
    agent.chat("Hello")
    -----------
    Or just with text
    ------------
    agent = MultiModalAgent()
    agent.run_text("Hello")
    """

    def __init__(
        self,
        load_dict=None,
        temperature=0,
        language: str = "english"
    ):
        """Build the wrapped MultiModalVisualAgent and start with an empty chat history."""
        # Resolve the fallback first so self.load_dict records what the agent was built with.
        if load_dict is None:
            load_dict = {
                "ImageCaptioning": "default_device"
            }
        self.load_dict = load_dict
        self.temperature = temperature
        self.language = language
        self.agent = MultiModalVisualAgent(
            load_dict,
            temperature
        )
        self.history = []

    def _resolve_language(self, language):
        """Return `language`, falling back to the instance default when it is None."""
        return self.language if language is None else language

    def run_text(
        self,
        text: str = None,
        language=None
    ):
        """
        Run text through the model.
        Args:
            text (str, optional): Text to send to the agent.
            language (str, optional): Language to use. Defaults to the instance language.
        Returns:
            The agent's reply, or an "Error processing text: ..." string if the agent raised.
        """
        language = self._resolve_language(language)
        try:
            self.agent.init_agent(language)
            return self.agent.run_text(text)
        except Exception as e:
            return f"Error processing text: {str(e)}"

    def run_img(
        self,
        image_path: str,
        language=None
    ):
        """
        Run an image through the model.
        Args:
            image_path (str): Path of the image handed to the agent's run_image.
            language (str, optional): Language to use. Defaults to the instance language.
        Returns:
            The agent's result, or an "Error processing image: ..." string if the agent raised.
        """
        # The instance default lives in self.language; `default_language` was never assigned.
        language = self._resolve_language(language)
        try:
            return self.agent.run_image(
                image_path,
                language
            )
        except Exception as error:
            return f"Error processing image: {str(error)}"

    def chat(
        self,
        msg: str = None,
        language: str = None,
        streaming: bool = False
    ):
        """
        Run chat with the multi-modal agent
        Args:
            msg (str, optional): Message to send to the agent. Defaults to None.
            language (str, optional): Language to use. Defaults to the instance language.
            streaming (bool, optional): Whether to stream the response. Defaults to False.
        Returns:
            The agent's response; a word-by-word generator over it when streaming is True;
            or an "Error processing message: ..." string if the agent raised.
        Usage:
        --------------
        agent = MultiModalAgent()
        agent.chat("Hello")
        """
        language = self._resolve_language(language)
        # Record the user's message before processing so failures still leave a trace.
        self.history.append(
            Message(
                "User",
                msg
            )
        )
        try:
            self.agent.init_agent(language)
            response = self.agent.run_text(msg)
            # Record the agent's reply.
            self.history.append(
                Message(
                    "Agent",
                    response
                )
            )
            if streaming:
                return self._stream_response(response)
            # Previously a bare expression, which made chat() return None.
            return response
        except Exception as error:
            error_message = f"Error processing message: {str(error)}"
            # Record the failure in the history as the agent's turn.
            self.history.append(
                Message(
                    "Agent",
                    error_message
                )
            )
            return error_message

    def _stream_response(
        self,
        response: str = None
    ):
        """
        Yield the response token by token (word by word)
        A None or empty response yields nothing.
        Usage:
        --------------
        for token in _stream_response(response):
            print(token)
        """
        yield from (response or "").split()

    def clear(self):
        """
        Clear the wrapped agent's memory (self.history is left untouched).
        Returns:
            None on success, or an "Error cleaning memory: ..." string if the agent raised.
        """
        try:
            self.agent.clear_memory()
        except Exception as e:
            return f"Error cleaning memory: {str(e)}"
--- a/swarms/agents/multi_modal_workers/multi_modal_agent.py
+++ b/swarms/agents/multi_modal_workers/multi_modal_agent.py
@ -1578,4 +1578,191 @@ class MultiModalVisualAgent:
        return AI_prompt
    def clear_memory(self):
-        self.memory.clear()
+        self.memory.clear()
 ###### usage
 from swarms.agents.message import Message
 class MultiModalAgent:
    """
    A user-friendly abstraction over the MultiModalVisualAgent that provides a simple interface
    to process both text and images.

    Parameters:
        load_dict (dict, optional): Dictionary of class names and devices to load.
            Defaults to {"ImageCaptioning": "default_device"}.
        temperature (float, optional): Temperature passed to MultiModalVisualAgent. Defaults to 0.
        language (str, optional): Default language for the agent. Defaults to "english".

    Attributes:
        agent: The wrapped MultiModalVisualAgent instance.
        history (list): Message objects appended by chat() for each user message and agent reply.

    Usage
    --------------
    For chats:
    ------------
    agent = MultiModalAgent()
    agent.chat("Hello")
    -----------
    Or just with text
    ------------
    agent = MultiModalAgent()
    agent.run_text("Hello")
    """

    def __init__(
        self,
        load_dict=None,
        temperature=0,
        language: str = "english"
    ):
        """Build the wrapped MultiModalVisualAgent and start with an empty chat history."""
        # Resolve the fallback first so self.load_dict records what the agent was built with.
        if load_dict is None:
            load_dict = {
                "ImageCaptioning": "default_device"
            }
        self.load_dict = load_dict
        self.temperature = temperature
        self.language = language
        self.agent = MultiModalVisualAgent(
            load_dict,
            temperature
        )
        self.history = []

    def _resolve_language(self, language):
        """Return `language`, falling back to the instance default when it is None."""
        return self.language if language is None else language

    def run_text(
        self,
        text: str = None,
        language=None
    ):
        """
        Run text through the model.
        Args:
            text (str, optional): Text to send to the agent.
            language (str, optional): Language to use. Defaults to the instance language.
        Returns:
            The agent's reply, or an "Error processing text: ..." string if the agent raised.
        """
        language = self._resolve_language(language)
        try:
            self.agent.init_agent(language)
            return self.agent.run_text(text)
        except Exception as e:
            return f"Error processing text: {str(e)}"

    def run_img(
        self,
        image_path: str,
        language=None
    ):
        """
        Run an image through the model.
        Args:
            image_path (str): Path of the image handed to the agent's run_image.
            language (str, optional): Language to use. Defaults to the instance language.
        Returns:
            The agent's result, or an "Error processing image: ..." string if the agent raised.
        """
        # The instance default lives in self.language; `default_language` was never assigned.
        language = self._resolve_language(language)
        try:
            return self.agent.run_image(
                image_path,
                language
            )
        except Exception as error:
            return f"Error processing image: {str(error)}"

    def chat(
        self,
        msg: str = None,
        language: str = None,
        streaming: bool = False
    ):
        """
        Run chat with the multi-modal agent
        Args:
            msg (str, optional): Message to send to the agent. Defaults to None.
            language (str, optional): Language to use. Defaults to the instance language.
            streaming (bool, optional): Whether to stream the response. Defaults to False.
        Returns:
            The agent's response; a word-by-word generator over it when streaming is True;
            or an "Error processing message: ..." string if the agent raised.
        Usage:
        --------------
        agent = MultiModalAgent()
        agent.chat("Hello")
        """
        language = self._resolve_language(language)
        # Record the user's message before processing so failures still leave a trace.
        self.history.append(
            Message(
                "User",
                msg
            )
        )
        try:
            self.agent.init_agent(language)
            response = self.agent.run_text(msg)
            # Record the agent's reply.
            self.history.append(
                Message(
                    "Agent",
                    response
                )
            )
            if streaming:
                return self._stream_response(response)
            # Previously a bare expression, which made chat() return None.
            return response
        except Exception as error:
            error_message = f"Error processing message: {str(error)}"
            # Record the failure in the history as the agent's turn.
            self.history.append(
                Message(
                    "Agent",
                    error_message
                )
            )
            return error_message

    def _stream_response(
        self,
        response: str = None
    ):
        """
        Yield the response token by token (word by word)
        A None or empty response yields nothing.
        Usage:
        --------------
        for token in _stream_response(response):
            print(token)
        """
        yield from (response or "").split()

    def clear(self):
        """
        Clear the wrapped agent's memory (self.history is left untouched).
        Returns:
            None on success, or an "Error cleaning memory: ..." string if the agent raised.
        """
        try:
            self.agent.clear_memory()
        except Exception as e:
            return f"Error cleaning memory: {str(e)}"
--- a/swarms/agents/multi_modal_workers/omni_agent/model_server.py
+++ b/swarms/agents/multi_modal_workers/omni_agent/model_server.py
@ -1,45 +1,67 @@
 import argparse
 import logging
 import os
 import random
 import time
 import traceback
 import uuid
 import warnings
 import numpy as np
-from transformers import pipeline
+import soundfile as sf
 from diffusers import DiffusionPipeline, StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
 from diffusers.utils import load_image
 from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
 from diffusers.utils import export_to_video
 from transformers import SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForSpeechToSpeech
 from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
 from datasets import load_dataset
 from PIL import Image
 # import flask
 # from flask import request, jsonify
 import waitress
 # from flask_cors import CORS
 from torchvision import transforms
 import torch
 import torchaudio
-from transformers import MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation
+
-from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector, CannyDetector, MidasDetector
+# import flask
-from controlnet_aux.open_pose.body import Body
+from flask import request, jsonify
-from controlnet_aux.mlsd.models.mbv2_mlsd_large import MobileV2_MLSD_Large
+import waitress
 import yaml
 from asteroid.models import BaseModel
 from controlnet_aux import (
    CannyDetector,
    HEDdetector,
    MidasDetector,
    MLSDdetector,
    OpenposeDetector,
 )
 from controlnet_aux.hed import Network
-from transformers import DPTForDepthEstimation, DPTFeatureExtractor
+from controlnet_aux.mlsd.models.mbv2_mlsd_large import MobileV2_MLSD_Large
-import warnings
+from controlnet_aux.open_pose.body import Body
-import time
+from datasets import load_dataset
 from diffusers import (
    ControlNetModel,
    DiffusionPipeline,
    DPMSolverMultistepScheduler,
    StableDiffusionControlNetPipeline,
    UniPCMultistepScheduler,
 )
 from diffusers.utils import export_to_video, load_image
 from espnet2.bin.tts_inference import Text2Speech
-import soundfile as sf
+from PIL import Image
 from asteroid.models import BaseModel
 import traceback
 import os
 import yaml
 # from flask_cors import CORS
 from torchvision import transforms
 from transformers import (
    AutoTokenizer,
    DPTFeatureExtractor,
    DPTForDepthEstimation,
    MaskFormerFeatureExtractor,
    MaskFormerForInstanceSegmentation,
    SpeechT5ForSpeechToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
    VisionEncoderDecoderModel,
    ViTImageProcessor,
    pipeline,
 )
 # Warning suppression, CLI parsing and logger setup.
 # Silences every Python warning raised after this point.
 warnings.filterwarnings("ignore")
 # --config: path to the configuration file (default: configs/config.default.yaml).
 parser = argparse.ArgumentParser()
 parser.add_argument("--config", type=str, default="configs/config.default.yaml")
 # NOTE(review): this appears to run at module level, so importing the module parses
 # sys.argv and exits on unknown flags -- confirm this file is only run as a script.
 args = parser.parse_args()
 # Module logger at INFO level; the stream handler is created here and presumably
 # configured/attached in the lines that follow this hunk.
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 handler = logging.StreamHandler()
--- a/swarms/agents/multi_modal_workers/omni_agent/omni_chat.py
+++ b/swarms/agents/multi_modal_workers/omni_agent/omni_chat.py
@ -1,28 +1,87 @@
 import argparse
 import base64
 import copy
 from io import BytesIO
 import io
 import json
 import logging
 import os
 import random
 import re
 import threading
 import time
 import traceback
 import uuid
 from io import BytesIO
 from queue import Queue
 import requests
-import re
+import tiktoken
 import json
 import logging
 import argparse
 import yaml
 from PIL import Image, ImageDraw
 from diffusers.utils import load_image
 from pydub import AudioSegment
 import threading
 from queue import Queue
 # import flask
 # from flask import request, jsonify
 # from flask_cors import CORS, cross_origin
 from swarms.workers.multi_modal_workers.omni_agent.get_token_ids import get_token_ids_for_task_parsing, get_token_ids_for_choose_model, count_tokens, get_max_context_length
 from huggingface_hub.inference_api import InferenceApi
 from PIL import Image, ImageDraw
 from pydub import AudioSegment
 # Tokenizer lookup: model name -> tiktoken encoding, grouped by the encoding each model uses.
 encodings = {
    model_name: tiktoken.get_encoding(encoding_name)
    for encoding_name, model_names in (
        (
            "cl100k_base",
            ("gpt-4", "gpt-4-32k", "gpt-3.5-turbo", "gpt-3.5-turbo-0301"),
        ),
        (
            "p50k_base",
            ("text-davinci-003", "text-davinci-002"),
        ),
        (
            "r50k_base",
            (
                "text-davinci-001",
                "text-curie-001",
                "text-babbage-001",
                "text-ada-001",
                "davinci",
                "curie",
                "babbage",
                "ada",
            ),
        ),
    )
    for model_name in model_names
 }
 # Maximum context length per model name, grouped by limit (looked up by get_max_context_length).
 max_length = {
    model_name: limit
    for limit, model_names in (
        (8192, ("gpt-4",)),
        (32768, ("gpt-4-32k",)),
        (
            4096,
            ("gpt-3.5-turbo", "gpt-3.5-turbo-0301", "text-davinci-003", "text-davinci-002"),
        ),
        (
            2049,
            (
                "text-davinci-001",
                "text-curie-001",
                "text-babbage-001",
                "text-ada-001",
                "davinci",
                "curie",
                "babbage",
                "ada",
            ),
        ),
    )
    for model_name in model_names
 }
 def count_tokens(model_name, text):
    """Return the number of tokens `text` encodes to under the encoding registered for `model_name`."""
    encoder = encodings[model_name]
    token_ids = encoder.encode(text)
    return len(token_ids)
 def get_max_context_length(model_name):
    """Look up the `max_length` entry for `model_name` (KeyError if the model is not listed)."""
    limit = max_length[model_name]
    return limit
 def get_token_ids_for_task_parsing(model_name):
    """
    Return the distinct token ids produced by encoding the task-parsing vocabulary
    (task names and JSON field names) with the encoding registered for `model_name`.
    """
    vocabulary = '''{"task": "text-classification",  "token-classification", "text2text-generation", "summarization", "translation",  "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image", "args", "text", "path", "dep", "id", "<GENERATED>-"}'''
    token_ids = encodings[model_name].encode(vocabulary)
    # Collapse repeated ids; only the set of ids matters to the caller.
    unique_ids = set(token_ids)
    return list(unique_ids)
 def get_token_ids_for_choose_model(model_name):
    """
    Return the distinct token ids produced by encoding the model-choice template
    '{"id": "reason"}' with the encoding registered for `model_name`.
    """
    template = '''{"id": "reason"}'''
    token_ids = encodings[model_name].encode(template)
    # Collapse repeated ids; only the set of ids matters to the caller.
    unique_ids = set(token_ids)
    return list(unique_ids)
 #########
 # CLI: --config selects the configuration file to load.
 # NOTE(review): the default path contains a "workers/" segment, but this file's own path
 # in the diff header is swarms/agents/multi_modal_workers/omni_agent/ -- confirm the
 # default actually resolves to an existing config.yml.
 parser = argparse.ArgumentParser()
 parser.add_argument("--config", type=str, default="swarms/agents/workers/multi_modal_workers/omni_agent/config.yml")
--- a/swarms/agents/omni_modal_agent.py
+++ b/swarms/agents/omni_modal_agent.py
@ -1,30 +1,36 @@
 from swarms.agents.multi_modal_workers.omni_agent.omni_chat import chat_huggingface
-class OmniModalAgent:
+# class OmniModalAgent:
-    def __init__(
+#     def __init__(
-        self, 
+#         self, 
-        api_key, 
+#         api_key, 
-        api_endpoint, 
+#         api_endpoint, 
-        api_type
+#         api_type
-    ):
+#     ):
-        self.api_key = api_key
+#         self.api_key = api_key
-        self.api_endpoint = api_endpoint
+#         self.api_endpoint = api_endpoint
-        self.api_type = api_type
+#         self.api_type = api_type
-
+
-    def chat(
+#     def chat(
-        self, 
+#         self, 
-        data
+#         data
-    ):
+#     ):
-        """Chat with omni-modality model that uses huggingface to query for a specific model at run time. Translate text to speech, create images and more"""
+#         """Chat with omni-modality model that uses huggingface to query for a specific model at run time. Translate text to speech, create images and more"""
-        messages = data.get("messages")
+#         messages = data.get("messages")
-        api_key = data.get("api_key", self.api_key)
+#         api_key = data.get("api_key", self.api_key)
-        api_endpoint = data.get("api_endpoint", self.api_endpoint)
+#         api_endpoint = data.get("api_endpoint", self.api_endpoint)
-        api_type = data.get("api_type", self.api_type)
+#         api_type = data.get("api_type", self.api_type)
-
+
-        if not(api_key and api_type and api_endpoint):
+#         if not(api_key and api_type and api_endpoint):
-            raise ValueError("Please provide api_key, api_type, and api_endpoint")
+#             raise ValueError("Please provide api_key, api_type, and api_endpoint")
-        response = chat_huggingface(messages, api_key, api_type, api_endpoint)
+#         response = chat_huggingface(messages, api_key, api_type, api_endpoint)
-        return response
+#         return response
 # class OmniModalAgent:
 #     def __init__(
 #     )
--- a/swarms/agents/stream_response.py
+++ b/swarms/agents/stream_response.py
@ -0,0 +1,8 @@
 def stream(response):
    """
    Generate the whitespace-separated words of an LLM response, one at a time.
    """
    words = response.split()
    yield from words