pull/58/head
Kye 1 year ago
parent e49d85b65c
commit 1808da08d5

@ -16,5 +16,13 @@
#agents #agents
# from swarms.agents.profitpilot import ProfitPilot # from swarms.agents.profitpilot import ProfitPilot
from swarms.agents.aot import AoTAgent from swarms.agents.aot import AoTAgent
# from swarms.agents.multi_modal_agent import MultiModalVisualAgent
# from swarms.agents.omni_modal_agent import OmniModalAgent # from swarms.agents.omni_modal_agent import OmniModalAgent
from swarms.agents.multi_modal_visual_agent import MultiModalAgent
#utils
from swarms.agents.message import Message
from swarms.agents.stream_response import stream

@ -21,4 +21,7 @@ class Message:
self.metadata = metadata or {} self.metadata = metadata or {}
def __repr__(self): def __repr__(self):
"""
__repr__ means
"""
return f"{self.timestamp} - {self.sender}: {self.content}" return f"{self.timestamp} - {self.sender}: {self.content}"

@ -1,175 +0,0 @@
from swarms.agents.multi_modal_workers.multi_modal_agent import MultiModalVisualAgent
from swarms.agents.message import Message
class MultiModalAgent:
    """
    A user-friendly abstraction over the MultiModalVisualAgent that provides a
    simple interface to process both text and images.

    Parameters:
        load_dict (dict, optional): Dictionary of class names and devices to load.
            Defaults to ``{"ImageCaptioning": "default_device"}`` when None.
        temperature (float, optional): Temperature for the OpenAI model. Defaults to 0.
        language (str, optional): Default language for the agent, used whenever a
            method is called without an explicit ``language``. Defaults to "english".

    Attributes:
        agent: The wrapped MultiModalVisualAgent instance.
        history (list): Message objects recorded by ``chat`` (user and agent turns).

    Usage
    --------------
    For chats:
    ------------
    agent = MultiModalAgent()
    agent.chat("Hello")

    -----------
    Or just with text
    ------------
    agent = MultiModalAgent()
    agent.run_text("Hello")
    """

    def __init__(
        self,
        load_dict=None,
        temperature=0,
        language: str = "english"
    ):
        # Resolve the fallback configuration *before* storing it, so that
        # self.load_dict always reflects what the wrapped agent was built with.
        if load_dict is None:
            load_dict = {
                "ImageCaptioning": "default_device"
            }

        self.load_dict = load_dict
        self.temperature = temperature
        # Single source of truth for the default language; every method
        # falls back to this attribute when no language is passed.
        self.language = language
        self.history = []

        self.agent = MultiModalVisualAgent(
            load_dict,
            temperature
        )

    def run_text(
        self,
        text: str = None,
        language=None
    ):
        """
        Run text through the model.

        Args:
            text (str, optional): Text to send to the agent.
            language (str, optional): Language to use. Falls back to the
                instance default when None.

        Returns:
            The wrapped agent's response, or an "Error processing text: ..."
            string if the wrapped agent raised.
        """
        if language is None:
            language = self.language

        try:
            self.agent.init_agent(language)
            return self.agent.run_text(text)
        except Exception as e:
            return f"Error processing text: {str(e)}"

    def run_img(
        self,
        image_path: str,
        language=None
    ):
        """
        Run an image through the model.

        Args:
            image_path (str): Path of the image, forwarded to the wrapped agent.
            language (str, optional): Language to use. Falls back to the
                instance default when None.

        Returns:
            The wrapped agent's response, or an "Error processing image: ..."
            string if the wrapped agent raised.
        """
        if language is None:
            language = self.language

        try:
            return self.agent.run_image(
                image_path,
                language
            )
        except Exception as error:
            return f"Error processing image: {str(error)}"

    def chat(
        self,
        msg: str = None,
        language: str = None,
        streaming: bool = False
    ):
        """
        Run chat with the multi-modal agent

        Args:
            msg (str, optional): Message to send to the agent. Defaults to None.
            language (str, optional): Language to use. Defaults to None, which
                selects the instance default language.
            streaming (bool, optional): Whether to stream the response. Defaults to False.

        Returns:
            The agent's response; a generator of whitespace-separated tokens
            when ``streaming`` is True; or an "Error processing message: ..."
            string if the wrapped agent raised.

        Usage:
        --------------
        agent = MultiModalAgent()
        agent.chat("Hello")
        """
        if language is None:
            language = self.language

        # add the user's message to the history
        self.history.append(
            Message(
                "User",
                msg
            )
        )

        # process msg
        try:
            self.agent.init_agent(language)
            response = self.agent.run_text(msg)

            # add the agent's response to the history
            self.history.append(
                Message(
                    "Agent",
                    response
                )
            )

            if streaming:
                return self._stream_response(response)
            # The non-streaming path must hand the response back to the caller.
            return response

        except Exception as error:
            error_message = f"Error processing message: {str(error)}"

            # add error to history
            self.history.append(
                Message(
                    "Agent",
                    error_message
                )
            )
            return error_message

    def _stream_response(
        self,
        response: str = None
    ):
        """
        Yield the response token by token (word by word)

        A missing or empty response yields nothing.

        Usage:
        --------------
        for token in _stream_response(response):
            print(token)
        """
        if not response:
            return
        for token in response.split():
            yield token

    def clear(self):
        """
        Clear agent's memory

        Returns:
            None on success, or an "Error cleaning memory: ..." string if the
            wrapped agent raised.
        """
        try:
            self.agent.clear_memory()
        except Exception as e:
            return f"Error cleaning memory: {str(e)}"

@ -1579,3 +1579,190 @@ class MultiModalVisualAgent:
def clear_memory(self): def clear_memory(self):
self.memory.clear() self.memory.clear()
###### usage
from swarms.agents.message import Message
class MultiModalAgent:
    """
    A user-friendly abstraction over the MultiModalVisualAgent that provides a
    simple interface to process both text and images.

    Parameters:
        load_dict (dict, optional): Dictionary of class names and devices to load.
            Defaults to ``{"ImageCaptioning": "default_device"}`` when None.
        temperature (float, optional): Temperature for the OpenAI model. Defaults to 0.
        language (str, optional): Default language for the agent, used whenever a
            method is called without an explicit ``language``. Defaults to "english".

    Attributes:
        agent: The wrapped MultiModalVisualAgent instance.
        history (list): Message objects recorded by ``chat`` (user and agent turns).

    Usage
    --------------
    For chats:
    ------------
    agent = MultiModalAgent()
    agent.chat("Hello")

    -----------
    Or just with text
    ------------
    agent = MultiModalAgent()
    agent.run_text("Hello")
    """

    def __init__(
        self,
        load_dict=None,
        temperature=0,
        language: str = "english"
    ):
        # Resolve the fallback configuration *before* storing it, so that
        # self.load_dict always reflects what the wrapped agent was built with.
        if load_dict is None:
            load_dict = {
                "ImageCaptioning": "default_device"
            }

        self.load_dict = load_dict
        self.temperature = temperature
        # Single source of truth for the default language; every method
        # falls back to this attribute when no language is passed.
        self.language = language
        self.history = []

        self.agent = MultiModalVisualAgent(
            load_dict,
            temperature
        )

    def run_text(
        self,
        text: str = None,
        language=None
    ):
        """
        Run text through the model.

        Args:
            text (str, optional): Text to send to the agent.
            language (str, optional): Language to use. Falls back to the
                instance default when None.

        Returns:
            The wrapped agent's response, or an "Error processing text: ..."
            string if the wrapped agent raised.
        """
        if language is None:
            language = self.language

        try:
            self.agent.init_agent(language)
            return self.agent.run_text(text)
        except Exception as e:
            return f"Error processing text: {str(e)}"

    def run_img(
        self,
        image_path: str,
        language=None
    ):
        """
        Run an image through the model.

        Args:
            image_path (str): Path of the image, forwarded to the wrapped agent.
            language (str, optional): Language to use. Falls back to the
                instance default when None.

        Returns:
            The wrapped agent's response, or an "Error processing image: ..."
            string if the wrapped agent raised.
        """
        if language is None:
            language = self.language

        try:
            return self.agent.run_image(
                image_path,
                language
            )
        except Exception as error:
            return f"Error processing image: {str(error)}"

    def chat(
        self,
        msg: str = None,
        language: str = None,
        streaming: bool = False
    ):
        """
        Run chat with the multi-modal agent

        Args:
            msg (str, optional): Message to send to the agent. Defaults to None.
            language (str, optional): Language to use. Defaults to None, which
                selects the instance default language.
            streaming (bool, optional): Whether to stream the response. Defaults to False.

        Returns:
            The agent's response; a generator of whitespace-separated tokens
            when ``streaming`` is True; or an "Error processing message: ..."
            string if the wrapped agent raised.

        Usage:
        --------------
        agent = MultiModalAgent()
        agent.chat("Hello")
        """
        if language is None:
            language = self.language

        # add the user's message to the history
        self.history.append(
            Message(
                "User",
                msg
            )
        )

        # process msg
        try:
            self.agent.init_agent(language)
            response = self.agent.run_text(msg)

            # add the agent's response to the history
            self.history.append(
                Message(
                    "Agent",
                    response
                )
            )

            if streaming:
                return self._stream_response(response)
            # The non-streaming path must hand the response back to the caller.
            return response

        except Exception as error:
            error_message = f"Error processing message: {str(error)}"

            # add error to history
            self.history.append(
                Message(
                    "Agent",
                    error_message
                )
            )
            return error_message

    def _stream_response(
        self,
        response: str = None
    ):
        """
        Yield the response token by token (word by word)

        A missing or empty response yields nothing.

        Usage:
        --------------
        for token in _stream_response(response):
            print(token)
        """
        if not response:
            return
        for token in response.split():
            yield token

    def clear(self):
        """
        Clear agent's memory

        Returns:
            None on success, or an "Error cleaning memory: ..." string if the
            wrapped agent raised.
        """
        try:
            self.agent.clear_memory()
        except Exception as e:
            return f"Error cleaning memory: {str(e)}"

@ -1,45 +1,67 @@
import argparse import argparse
import logging import logging
import os
import random import random
import time
import traceback
import uuid import uuid
import warnings
import numpy as np import numpy as np
from transformers import pipeline import soundfile as sf
from diffusers import DiffusionPipeline, StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
from diffusers.utils import load_image
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from diffusers.utils import export_to_video
from transformers import SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForSpeechToSpeech
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from datasets import load_dataset
from PIL import Image
# import flask
# from flask import request, jsonify
import waitress
# from flask_cors import CORS
from torchvision import transforms
import torch import torch
import torchaudio import torchaudio
from transformers import MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation
from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector, CannyDetector, MidasDetector # import flask
from controlnet_aux.open_pose.body import Body from flask import request, jsonify
from controlnet_aux.mlsd.models.mbv2_mlsd_large import MobileV2_MLSD_Large import waitress
import yaml
from asteroid.models import BaseModel
from controlnet_aux import (
CannyDetector,
HEDdetector,
MidasDetector,
MLSDdetector,
OpenposeDetector,
)
from controlnet_aux.hed import Network from controlnet_aux.hed import Network
from transformers import DPTForDepthEstimation, DPTFeatureExtractor from controlnet_aux.mlsd.models.mbv2_mlsd_large import MobileV2_MLSD_Large
import warnings from controlnet_aux.open_pose.body import Body
import time from datasets import load_dataset
from diffusers import (
ControlNetModel,
DiffusionPipeline,
DPMSolverMultistepScheduler,
StableDiffusionControlNetPipeline,
UniPCMultistepScheduler,
)
from diffusers.utils import export_to_video, load_image
from espnet2.bin.tts_inference import Text2Speech from espnet2.bin.tts_inference import Text2Speech
import soundfile as sf from PIL import Image
from asteroid.models import BaseModel
import traceback
import os
import yaml
# from flask_cors import CORS
from torchvision import transforms
from transformers import (
AutoTokenizer,
DPTFeatureExtractor,
DPTForDepthEstimation,
MaskFormerFeatureExtractor,
MaskFormerForInstanceSegmentation,
SpeechT5ForSpeechToSpeech,
SpeechT5HifiGan,
SpeechT5Processor,
VisionEncoderDecoderModel,
ViTImageProcessor,
pipeline,
)
#logs
warnings.filterwarnings("ignore") warnings.filterwarnings("ignore")
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default="configs/config.default.yaml") parser.add_argument("--config", type=str, default="configs/config.default.yaml")
args = parser.parse_args() args = parser.parse_args()
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
handler = logging.StreamHandler() handler = logging.StreamHandler()

@ -1,28 +1,87 @@
import argparse
import base64 import base64
import copy import copy
from io import BytesIO
import io import io
import json
import logging
import os import os
import random import random
import re
import threading
import time import time
import traceback import traceback
import uuid import uuid
from io import BytesIO
from queue import Queue
import requests import requests
import re import tiktoken
import json
import logging
import argparse
import yaml import yaml
from PIL import Image, ImageDraw
from diffusers.utils import load_image from diffusers.utils import load_image
from pydub import AudioSegment
import threading
from queue import Queue
# import flask
# from flask import request, jsonify
# from flask_cors import CORS, cross_origin
from swarms.workers.multi_modal_workers.omni_agent.get_token_ids import get_token_ids_for_task_parsing, get_token_ids_for_choose_model, count_tokens, get_max_context_length
from huggingface_hub.inference_api import InferenceApi from huggingface_hub.inference_api import InferenceApi
from PIL import Image, ImageDraw
from pydub import AudioSegment
#tokenizations
# Model name -> tiktoken encoding object used to tokenize text for that model.
# One get_encoding call is made per model, in the order listed below.
encodings = {
    model: tiktoken.get_encoding(encoding_name)
    for model, encoding_name in (
        ("gpt-4", "cl100k_base"),
        ("gpt-4-32k", "cl100k_base"),
        ("gpt-3.5-turbo", "cl100k_base"),
        ("gpt-3.5-turbo-0301", "cl100k_base"),
        ("text-davinci-003", "p50k_base"),
        ("text-davinci-002", "p50k_base"),
        ("text-davinci-001", "r50k_base"),
        ("text-curie-001", "r50k_base"),
        ("text-babbage-001", "r50k_base"),
        ("text-ada-001", "r50k_base"),
        ("davinci", "r50k_base"),
        ("curie", "r50k_base"),
        ("babbage", "r50k_base"),
        ("ada", "r50k_base"),
    )
}
# Model name -> maximum context length, in tokens.
# Models sharing a limit are grouped; insertion order matches the model list
# used for the tokenizer table.
max_length = {
    "gpt-4": 8192,
    "gpt-4-32k": 32768,
    **dict.fromkeys(
        (
            "gpt-3.5-turbo",
            "gpt-3.5-turbo-0301",
            "text-davinci-003",
            "text-davinci-002",
        ),
        4096,
    ),
    **dict.fromkeys(
        (
            "text-davinci-001",
            "text-curie-001",
            "text-babbage-001",
            "text-ada-001",
            "davinci",
            "curie",
            "babbage",
            "ada",
        ),
        2049,
    ),
}
def count_tokens(model_name, text):
    """Return how many tokens ``text`` encodes to with ``model_name``'s encoding.

    ``model_name`` is looked up in the module-level ``encodings`` table.
    """
    encoder = encodings[model_name]
    token_ids = encoder.encode(text)
    return len(token_ids)
def get_max_context_length(model_name):
    """Return the context-length limit registered for ``model_name``.

    ``model_name`` is looked up in the module-level ``max_length`` table.
    """
    limit = max_length[model_name]
    return limit
def get_token_ids_for_task_parsing(model_name):
    """Return the distinct token ids that make up the task-parsing vocabulary.

    The fixed vocabulary string below (task names plus the "args", "text",
    "path", "dep", "id" and "<GENERATED>-" markers) is encoded with
    ``model_name``'s encoding and the resulting ids are de-duplicated.
    """
    vocabulary = '''{"task": "text-classification", "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image", "args", "text", "path", "dep", "id", "<GENERATED>-"}'''
    token_ids = encodings[model_name].encode(vocabulary)
    unique_ids = set(token_ids)
    return list(unique_ids)
def get_token_ids_for_choose_model(model_name):
    """Return the distinct token ids of the model-choice template.

    The template ``{"id": "reason"}`` is encoded with ``model_name``'s
    encoding and the resulting ids are de-duplicated.
    """
    template = '''{"id": "reason"}'''
    token_ids = encodings[model_name].encode(template)
    unique_ids = set(token_ids)
    return list(unique_ids)
#########
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default="swarms/agents/workers/multi_modal_workers/omni_agent/config.yml") parser.add_argument("--config", type=str, default="swarms/agents/workers/multi_modal_workers/omni_agent/config.yml")

@ -1,30 +1,36 @@
from swarms.agents.multi_modal_workers.omni_agent.omni_chat import chat_huggingface from swarms.agents.multi_modal_workers.omni_agent.omni_chat import chat_huggingface
class OmniModalAgent:
    """Holds default API credentials and forwards chat requests to ``chat_huggingface``."""

    def __init__(self, api_key, api_endpoint, api_type):
        # Defaults used by chat() for any credential missing from the request.
        self.api_key = api_key
        self.api_endpoint = api_endpoint
        self.api_type = api_type

    def chat(self, data):
        """Chat with omni-modality model that uses huggingface to query for a specific model at run time. Translate text to speech, create images and more

        ``data`` is a mapping read with ``.get``: "messages" is required by the
        downstream call, while "api_key", "api_endpoint" and "api_type" fall
        back to the values given at construction time.

        Raises:
            ValueError: if any of the three credentials resolves to a falsy value.
        """
        messages = data.get("messages")
        api_key = data.get("api_key", self.api_key)
        api_endpoint = data.get("api_endpoint", self.api_endpoint)
        api_type = data.get("api_type", self.api_type)

        # Guard clause: all three credentials must be present before calling out.
        if not api_key or not api_type or not api_endpoint:
            raise ValueError("Please provide api_key, api_type, and api_endpoint")

        return chat_huggingface(messages, api_key, api_type, api_endpoint)
# def chat(
# self,
# data
# ):
# """Chat with omni-modality model that uses huggingface to query for a specific model at run time. Translate text to speech, create images and more"""
# messages = data.get("messages")
# api_key = data.get("api_key", self.api_key)
# api_endpoint = data.get("api_endpoint", self.api_endpoint)
# api_type = data.get("api_type", self.api_type)
# if not(api_key and api_type and api_endpoint):
# raise ValueError("Please provide api_key, api_type, and api_endpoint")
# response = chat_huggingface(messages, api_key, api_type, api_endpoint)
# return response
# class OmniModalAgent:
# def __init__(
# )

@ -0,0 +1,8 @@
def stream(response):
    """
    Lazily yield the llm response one whitespace-separated token (word) at a time.
    """
    yield from response.split()
Loading…
Cancel
Save