diff --git a/swarms/agents/__init__.py b/swarms/agents/__init__.py
index 24c0ae9e..d3f76509 100644
--- a/swarms/agents/__init__.py
+++ b/swarms/agents/__init__.py
@@ -16,5 +16,13 @@
 #agents
 # from swarms.agents.profitpilot import ProfitPilot
 from swarms.agents.aot import AoTAgent
-# from swarms.agents.multi_modal_agent import MultiModalVisualAgent
-# from swarms.agents.omni_modal_agent import OmniModalAgent
\ No newline at end of file
+# from swarms.agents.omni_modal_agent import OmniModalAgent
+from swarms.agents.multi_modal_visual_agent import MultiModalAgent
+
+
+
+
+
+#utils
+from swarms.agents.message import Message
+from swarms.agents.stream_response import stream
diff --git a/swarms/agents/message.py b/swarms/agents/message.py
index f1eabab1..215f742d 100644
--- a/swarms/agents/message.py
+++ b/swarms/agents/message.py
@@ -21,4 +21,7 @@ class Message:
         self.metadata = metadata or {}
 
     def __repr__(self):
+        """
+        Return a human-readable form of the message: timestamp, sender, and content.
+        """
         return f"{self.timestamp} - {self.sender}: {self.content}"
diff --git a/swarms/agents/multi_modal_agent.py b/swarms/agents/multi_modal_agent.py
deleted file mode 100644
index 022c8f43..00000000
--- a/swarms/agents/multi_modal_agent.py
+++ /dev/null
@@ -1,175 +0,0 @@
-from swarms.agents.multi_modal_workers.multi_modal_agent import MultiModalVisualAgent
-from swarms.agents.message import Message
-
-class MultiModalAgent:
-    """
-    A user-friendly abstraction over the MultiModalVisualAgent that provides a simple interface
-    to process both text and images.
-
-    Initializes the MultiModalAgent.
-
-    Parameters:
-        load_dict (dict, optional): Dictionary of class names and devices to load. Defaults to a basic configuration.
-        temperature (float, optional): Temperature for the OpenAI model. Defaults to 0.
-        default_language (str, optional): Default language for the agent. Defaults to "English".
-
-    Usage
-    --------------
-    For chats:
-    ------------
-    agent = MultiModalAgent()
-    agent.chat("Hello")
-
-    -----------
-
-    Or just with text
-    ------------
-    agent = MultiModalAgent()
-    agent.run_text("Hello")
-
-
-    """
-    def __init__(
-        self,
-        load_dict,
-        temperature,
-        language: str = "english"
-    ):
-        self.load_dict = load_dict
-        self.temperature = temperature
-        self.langigage = language
-
-        if load_dict is None:
-            load_dict = {
-                "ImageCaptioning": "default_device"
-            }
-
-        self.agent = MultiModalVisualAgent(
-            load_dict,
-            temperature
-        )
-        self.language = language
-        self.history = []
-
-
-    def run_text(
-        self,
-        text: str = None,
-        language=None
-    ):
-        """Run text through the model"""
-
-        if language is None:
-            language = self.language
-
-        try:
-            self.agent.init_agent(language)
-            return self.agent.run_text(text)
-        except Exception as e:
-            return f"Error processing text: {str(e)}"
-
-    def run_img(
-        self,
-        image_path: str,
-        language=None
-    ):
-        """If language is None"""
-        if language is None:
-            language = self.default_language
-
-        try:
-            return self.agent.run_image(
-                image_path,
-                language
-            )
-        except Exception as error:
-            return f"Error processing image: {str(error)}"
-
-    def chat(
-        self,
-        msg: str = None,
-        language: str = None,
-        streaming: bool = False
-    ):
-        """
-        Run chat with the multi-modal agent
-
-        Args:
-            msg (str, optional): Message to send to the agent. Defaults to None.
-            language (str, optional): Language to use. Defaults to None.
-            streaming (bool, optional): Whether to stream the response. Defaults to False.
-
-        Returns:
-            str: Response from the agent
-
-        Usage:
-        --------------
-        agent = MultiModalAgent()
-        agent.chat("Hello")
-
-        """
-        if language is None:
-            language = self.default_language
-
-        #add users message to the history
-        self.history.append(
-            Message(
-                "User",
-                msg
-            )
-        )
-
-        #process msg
-        try:
-            self.agent.init_agent(language)
-            response = self.agent.run_text(msg)
-
-            #add agent's response to the history
-            self.history.append(
-                Message(
-                    "Agent",
-                    response
-                )
-            )
-
-            #if streaming is = True
-            if streaming:
-                return self._stream_response(response)
-            else:
-                response
-
-        except Exception as error:
-            error_message = f"Error processing message: {str(error)}"
-
-            #add error to history
-            self.history.append(
-                Message(
-                    "Agent",
-                    error_message
-                )
-            )
-            return error_message
-
-    def _stream_response(
-        self,
-        response: str = None
-    ):
-        """
-        Yield the response token by token (word by word)
-
-        Usage:
-        --------------
-        for token in _stream_response(response):
-            print(token)
-
-        """
-        for token in response.split():
-            yield token
-
-    def clear(self):
-        """Clear agent's memory"""
-        try:
-            self.agent.clear_memory()
-        except Exception as e:
-            return f"Error cleaning memory: {str(e)}"
-
diff --git a/swarms/agents/multi_modal_workers/multi_modal_agent.py b/swarms/agents/multi_modal_visual_agent.py
similarity index 94%
rename from swarms/agents/multi_modal_workers/multi_modal_agent.py
rename to swarms/agents/multi_modal_visual_agent.py
index a3db768d..871f90d0 100644
--- a/swarms/agents/multi_modal_workers/multi_modal_agent.py
+++ b/swarms/agents/multi_modal_visual_agent.py
@@ -1578,4 +1578,191 @@ class MultiModalVisualAgent:
         return AI_prompt
 
     def clear_memory(self):
-        self.memory.clear()
\ No newline at end of file
+        self.memory.clear()
+
+
+
+
+###### usage
+from swarms.agents.message import Message
+
+class MultiModalAgent:
+    """
+    A user-friendly abstraction over the MultiModalVisualAgent that provides a simple interface
+    to process both text and images.
+
+    Initializes the MultiModalAgent.
+
+    Parameters:
+        load_dict (dict, optional): Dictionary of class names and devices to load.
+            Defaults to a basic configuration.
+
+        temperature (float, optional): Temperature for the OpenAI model. Defaults to 0.
+
+        language (str, optional): Default language for the agent.
+            Defaults to "english".
+
+    Usage
+    --------------
+    For chats:
+    ------------
+    agent = MultiModalAgent()
+    agent.chat("Hello")
+
+    -----------
+
+    Or just with text
+    ------------
+    agent = MultiModalAgent()
+    agent.run_text("Hello")
+
+
+    """
+    def __init__(
+        self,
+        load_dict: dict = None,
+        temperature: float = 0,
+        language: str = "english"
+    ):
+        if load_dict is None:
+            load_dict = {
+                "ImageCaptioning": "default_device"
+            }
+
+        self.load_dict = load_dict
+        self.temperature = temperature
+        self.language = language
+
+        self.agent = MultiModalVisualAgent(
+            load_dict,
+            temperature
+        )
+        self.history = []
+
+
+    def run_text(
+        self,
+        text: str = None,
+        language=None
+    ):
+        """Run text through the model"""
+
+        if language is None:
+            language = self.language
+
+        try:
+            self.agent.init_agent(language)
+            return self.agent.run_text(text)
+        except Exception as e:
+            return f"Error processing text: {str(e)}"
+
+    def run_img(
+        self,
+        image_path: str,
+        language=None
+    ):
+        """Run an image through the visual agent"""
+        if language is None:
+            language = self.language
+
+        try:
+            return self.agent.run_image(
+                image_path,
+                language
+            )
+        except Exception as error:
+            return f"Error processing image: {str(error)}"
+
+    def chat(
+        self,
+        msg: str = None,
+        language: str = None,
+        streaming: bool = False
+    ):
+        """
+        Run chat with the multi-modal agent
+
+        Args:
+            msg (str, optional): Message to send to the agent. Defaults to None.
+            language (str, optional): Language to use. Defaults to None.
+            streaming (bool, optional): Whether to stream the response. Defaults to False.
+
+        Returns:
+            str: Response from the agent
+
+        Usage:
+        --------------
+        agent = MultiModalAgent()
+        agent.chat("Hello")
+
+        """
+        if language is None:
+            language = self.language
+
+        #add the user's message to the history
+        self.history.append(
+            Message(
+                "User",
+                msg
+            )
+        )
+
+        #process the message
+        try:
+            self.agent.init_agent(language)
+            response = self.agent.run_text(msg)
+
+            #add the agent's response to the history
+            self.history.append(
+                Message(
+                    "Agent",
+                    response
+                )
+            )
+
+            #stream the response word by word if streaming is enabled
+            if streaming:
+                return self._stream_response(response)
+            else:
+                return response
+
+        except Exception as error:
+            error_message = f"Error processing message: {str(error)}"
+
+            #add the error to the history
+            self.history.append(
+                Message(
+                    "Agent",
+                    error_message
+                )
+            )
+            return error_message
+
+    def _stream_response(
+        self,
+        response: str = None
+    ):
+        """
+        Yield the response token by token (word by word)
+
+        Usage:
+        --------------
+        for token in _stream_response(response):
+            print(token)
+
+        """
+        for token in response.split():
+            yield token
+
+    def clear(self):
+        """Clear the agent's memory"""
+        try:
+            self.agent.clear_memory()
+        except Exception as e:
+            return f"Error clearing memory: {str(e)}"
+
+
diff --git a/swarms/agents/multi_modal_workers/omni_agent/model_server.py b/swarms/agents/multi_modal_workers/omni_agent/model_server.py
index 11aaa811..a0481c89 100644
--- a/swarms/agents/multi_modal_workers/omni_agent/model_server.py
+++ b/swarms/agents/multi_modal_workers/omni_agent/model_server.py
@@ -1,45 +1,67 @@
 import argparse
 import logging
+import os
 import random
+import time
+import traceback
 import uuid
+import warnings
+
 import numpy as np
-from transformers import pipeline
-from diffusers import DiffusionPipeline, StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
-from diffusers.utils import load_image
-from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
-from diffusers.utils import export_to_video
-from transformers import SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForSpeechToSpeech
-from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
-from datasets import load_dataset
-from PIL import Image
-# import flask
-# from flask import request, jsonify
-import waitress
-# from flask_cors import CORS
-from torchvision import transforms
+import soundfile as sf
 import torch
 import torchaudio
-from transformers import MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation
-from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector, CannyDetector, MidasDetector
-from controlnet_aux.open_pose.body import Body
-from controlnet_aux.mlsd.models.mbv2_mlsd_large import MobileV2_MLSD_Large
+
+# import flask
+from flask import request, jsonify
+import waitress
+import yaml
+from asteroid.models import BaseModel
+from controlnet_aux import (
+    CannyDetector,
+    HEDdetector,
+    MidasDetector,
+    MLSDdetector,
+    OpenposeDetector,
+)
 from controlnet_aux.hed import Network
-from transformers import DPTForDepthEstimation, DPTFeatureExtractor
-import warnings
-import time
+from controlnet_aux.mlsd.models.mbv2_mlsd_large import MobileV2_MLSD_Large
+from controlnet_aux.open_pose.body import Body
+from datasets import load_dataset
+from diffusers import (
+    ControlNetModel,
+    DiffusionPipeline,
+    DPMSolverMultistepScheduler,
+    StableDiffusionControlNetPipeline,
+    UniPCMultistepScheduler,
+)
+from diffusers.utils import export_to_video, load_image
 from espnet2.bin.tts_inference import Text2Speech
-import soundfile as sf
-from asteroid.models import BaseModel
-import traceback
-import os
-import yaml
+from PIL import Image
+# from flask_cors import CORS
+from torchvision import transforms
+from transformers import (
+    AutoTokenizer,
+    DPTFeatureExtractor,
+    DPTForDepthEstimation,
+    MaskFormerFeatureExtractor,
+    MaskFormerForInstanceSegmentation,
+    SpeechT5ForSpeechToSpeech,
+    SpeechT5HifiGan,
+    SpeechT5Processor,
+    VisionEncoderDecoderModel,
+    ViTImageProcessor,
+    pipeline,
+)
+
+
+
+#logs
 warnings.filterwarnings("ignore")
-
 parser = argparse.ArgumentParser()
 parser.add_argument("--config", type=str, default="configs/config.default.yaml")
 args = parser.parse_args()
-
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 handler = logging.StreamHandler()
diff --git a/swarms/agents/multi_modal_workers/omni_agent/omni_chat.py b/swarms/agents/multi_modal_workers/omni_agent/omni_chat.py
index f0371eec..833f1ef2 100644
--- a/swarms/agents/multi_modal_workers/omni_agent/omni_chat.py
+++ b/swarms/agents/multi_modal_workers/omni_agent/omni_chat.py
@@ -1,28 +1,87 @@
+import argparse
 import base64
 import copy
-from io import BytesIO
 import io
+import json
+import logging
 import os
 import random
+import re
+import threading
 import time
 import traceback
 import uuid
+from io import BytesIO
+from queue import Queue
+
 import requests
-import re
-import json
-import logging
-import argparse
+import tiktoken
 import yaml
-from PIL import Image, ImageDraw
 from diffusers.utils import load_image
-from pydub import AudioSegment
-import threading
-from queue import Queue
-# import flask
-# from flask import request, jsonify
-# from flask_cors import CORS, cross_origin
-from swarms.workers.multi_modal_workers.omni_agent.get_token_ids import get_token_ids_for_task_parsing, get_token_ids_for_choose_model, count_tokens, get_max_context_length
 from huggingface_hub.inference_api import InferenceApi
+from PIL import Image, ImageDraw
+from pydub import AudioSegment
+
+#tokenizations
+encodings = {
+    "gpt-4": tiktoken.get_encoding("cl100k_base"),
+    "gpt-4-32k": tiktoken.get_encoding("cl100k_base"),
+    "gpt-3.5-turbo": tiktoken.get_encoding("cl100k_base"),
+    "gpt-3.5-turbo-0301": tiktoken.get_encoding("cl100k_base"),
+    "text-davinci-003": tiktoken.get_encoding("p50k_base"),
+    "text-davinci-002": tiktoken.get_encoding("p50k_base"),
+    "text-davinci-001": tiktoken.get_encoding("r50k_base"),
+    "text-curie-001": tiktoken.get_encoding("r50k_base"),
+    "text-babbage-001": tiktoken.get_encoding("r50k_base"),
+    "text-ada-001": tiktoken.get_encoding("r50k_base"),
+    "davinci": tiktoken.get_encoding("r50k_base"),
+    "curie": tiktoken.get_encoding("r50k_base"),
+    "babbage": tiktoken.get_encoding("r50k_base"),
+    "ada": tiktoken.get_encoding("r50k_base"),
+}
+
+max_length = {
+    "gpt-4": 8192,
+    "gpt-4-32k": 32768,
+    "gpt-3.5-turbo": 4096,
+    "gpt-3.5-turbo-0301": 4096,
+    "text-davinci-003": 4096,
+    "text-davinci-002": 4096,
+    "text-davinci-001": 2049,
+    "text-curie-001": 2049,
+    "text-babbage-001": 2049,
+    "text-ada-001": 2049,
+    "davinci": 2049,
+    "curie": 2049,
+    "babbage": 2049,
+    "ada": 2049
+}
+
+def count_tokens(model_name, text):
+    return len(encodings[model_name].encode(text))
+
+def get_max_context_length(model_name):
+    return max_length[model_name]
+
+def get_token_ids_for_task_parsing(model_name):
+    text = '''{"task": "text-classification", "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image", "args", "text", "path", "dep", "id", "-"}'''
+    res = encodings[model_name].encode(text)
+    res = list(set(res))
+    return res
+
+def get_token_ids_for_choose_model(model_name):
+    text = '''{"id": "reason"}'''
+    res = encodings[model_name].encode(text)
+    res = list(set(res))
+    return res
+
+
+
+
+
+
+
+#########
 parser = argparse.ArgumentParser()
 parser.add_argument("--config", type=str, default="swarms/agents/workers/multi_modal_workers/omni_agent/config.yml")
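A minimal sketch of what the count_tokens and get_max_context_length helpers added above compute, using tiktoken directly rather than importing omni_chat.py (which also runs argparse and config loading at import time); the sample string is illustrative and the exact token count depends on the installed tiktoken version.

import tiktoken

# same encoding the "gpt-3.5-turbo" / "gpt-4" entries of `encodings` resolve to
enc = tiktoken.get_encoding("cl100k_base")

prompt = "Translate this sentence to French."
used = len(enc.encode(prompt))   # what count_tokens("gpt-3.5-turbo", prompt) returns
budget = 4096 - used             # max_length["gpt-3.5-turbo"] minus the prompt tokens

print(used, budget)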
diff --git a/swarms/agents/omni_modal_agent.py b/swarms/agents/omni_modal_agent.py
index c9d28396..025e3517 100644
--- a/swarms/agents/omni_modal_agent.py
+++ b/swarms/agents/omni_modal_agent.py
@@ -1,30 +1,36 @@
 from swarms.agents.multi_modal_workers.omni_agent.omni_chat import chat_huggingface
-class OmniModalAgent:
-    def __init__(
-        self,
-        api_key,
-        api_endpoint,
-        api_type
-    ):
-        self.api_key = api_key
-        self.api_endpoint = api_endpoint
-        self.api_type = api_type
-
-    def chat(
-        self,
-        data
-    ):
-        """Chat with omni-modality model that uses huggingface to query for a specific model at run time. Translate text to speech, create images and more"""
-        messages = data.get("messages")
-        api_key = data.get("api_key", self.api_key)
-        api_endpoint = data.get("api_endpoint", self.api_endpoint)
-        api_type = data.get("api_type", self.api_type)
-
-        if not(api_key and api_type and api_endpoint):
-            raise ValueError("Please provide api_key, api_type, and api_endpoint")
+# class OmniModalAgent:
+#     def __init__(
+#         self,
+#         api_key,
+#         api_endpoint,
+#         api_type
+#     ):
+#         self.api_key = api_key
+#         self.api_endpoint = api_endpoint
+#         self.api_type = api_type
+
+#     def chat(
+#         self,
+#         data
+#     ):
+#         """Chat with omni-modality model that uses huggingface to query for a specific model at run time. Translate text to speech, create images and more"""
+#         messages = data.get("messages")
+#         api_key = data.get("api_key", self.api_key)
+#         api_endpoint = data.get("api_endpoint", self.api_endpoint)
+#         api_type = data.get("api_type", self.api_type)
+
+#         if not(api_key and api_type and api_endpoint):
+#             raise ValueError("Please provide api_key, api_type, and api_endpoint")
 
-        response = chat_huggingface(messages, api_key, api_type, api_endpoint)
-        return response
+#         response = chat_huggingface(messages, api_key, api_type, api_endpoint)
+#         return response
+
+
+
+# class OmniModalAgent:
+#     def __init__(
+#     )
\ No newline at end of file
diff --git a/swarms/agents/stream_response.py b/swarms/agents/stream_response.py
new file mode 100644
index 00000000..419c2081
--- /dev/null
+++ b/swarms/agents/stream_response.py
@@ -0,0 +1,8 @@
+
+
+def stream(response):
+    """
+    Yield the response token by token (word by word) from the LLM
+    """
+    for token in response.split():
+        yield token
\ No newline at end of file
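A minimal usage sketch of the reorganized package after this change, assuming the exports added to swarms/agents/__init__.py above (MultiModalAgent and stream) and that the underlying tool models can actually be loaded; the load_dict entry and device string are illustrative placeholders, not confirmed defaults.

from swarms.agents import MultiModalAgent, stream

# build the wrapper; load_dict maps tool class names to device placements (illustrative values)
agent = MultiModalAgent(
    load_dict={"ImageCaptioning": "cuda:0"},
    temperature=0,
    language="english",
)

# plain text goes straight through the underlying visual agent
print(agent.run_text("What can you do with images?"))

# chat() records history and can stream the reply word by word
for token in agent.chat("Hello", streaming=True):
    print(token, end=" ")

# the standalone stream() helper applies the same word-by-word splitting to any response string
for token in stream("This reply is yielded one word at a time"):
    print(token, end=" ")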