omni agent cleanup

pull/160/head
Kye 2 years ago
parent 5c3937339a
commit a89ec99299

@@ -1 +1 @@
from swarms.agents.workers.multi_modal_agents.omni_agent import chat_huggingface
from swarms.agents.workers.multi_modal_agents.omni_agent.omni_agent import chat_huggingface

@@ -0,0 +1,53 @@
import tiktoken
encodings = {
"gpt-4": tiktoken.get_encoding("cl100k_base"),
"gpt-4-32k": tiktoken.get_encoding("cl100k_base"),
"gpt-3.5-turbo": tiktoken.get_encoding("cl100k_base"),
"gpt-3.5-turbo-0301": tiktoken.get_encoding("cl100k_base"),
"text-davinci-003": tiktoken.get_encoding("p50k_base"),
"text-davinci-002": tiktoken.get_encoding("p50k_base"),
"text-davinci-001": tiktoken.get_encoding("r50k_base"),
"text-curie-001": tiktoken.get_encoding("r50k_base"),
"text-babbage-001": tiktoken.get_encoding("r50k_base"),
"text-ada-001": tiktoken.get_encoding("r50k_base"),
"davinci": tiktoken.get_encoding("r50k_base"),
"curie": tiktoken.get_encoding("r50k_base"),
"babbage": tiktoken.get_encoding("r50k_base"),
"ada": tiktoken.get_encoding("r50k_base"),
}
max_length = {
"gpt-4": 8192,
"gpt-4-32k": 32768,
"gpt-3.5-turbo": 4096,
"gpt-3.5-turbo-0301": 4096,
"text-davinci-003": 4096,
"text-davinci-002": 4096,
"text-davinci-001": 2049,
"text-curie-001": 2049,
"text-babbage-001": 2049,
"text-ada-001": 2049,
"davinci": 2049,
"curie": 2049,
"babbage": 2049,
"ada": 2049
}
def count_tokens(model_name, text):
return len(encodings[model_name].encode(text))
def get_max_context_length(model_name):
return max_length[model_name]
def get_token_ids_for_task_parsing(model_name):
text = '''{"task": "text-classification", "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image", "args", "text", "path", "dep", "id", "<GENERATED>-"}'''
res = encodings[model_name].encode(text)
res = list(set(res))
return res
def get_token_ids_for_choose_model(model_name):
text = '''{"id": "reason"}'''
res = encodings[model_name].encode(text)
res = list(set(res))
return res
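A quick usage sketch of these helpers, assuming the module path used elsewhere in this commit (the model name and prompt below are arbitrary examples, not part of the committed file):

from swarms.agents.workers.multi_modal_agents.omni_agent.get_token_ids import (
    count_tokens,
    get_max_context_length,
    get_token_ids_for_task_parsing,
)

prompt = "Caption the image and list the objects in it."    # arbitrary example text
used = count_tokens("gpt-3.5-turbo", prompt)                 # tokens the prompt consumes
budget = get_max_context_length("gpt-3.5-turbo") - used      # remaining context window (4096 - used)
task_ids = get_token_ids_for_task_parsing("gpt-3.5-turbo")   # token ids allowed in parsed task JSON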

@@ -0,0 +1,635 @@
import argparse
import logging
import random
import uuid
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline, StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler, DPMSolverMultistepScheduler
from diffusers.utils import load_image, export_to_video
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5ForSpeechToSpeech
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from datasets import load_dataset
from PIL import Image
import flask
from flask import request, jsonify
import waitress
from flask_cors import CORS
import io
from torchvision import transforms
import torch
import torchaudio
from speechbrain.pretrained import WaveformEnhancement
import joblib
from huggingface_hub import hf_hub_url, cached_download
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from transformers import MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation, AutoFeatureExtractor
from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector, CannyDetector, MidasDetector
from controlnet_aux.open_pose.body import Body
from controlnet_aux.mlsd.models.mbv2_mlsd_large import MobileV2_MLSD_Large
from controlnet_aux.hed import Network
from transformers import DPTForDepthEstimation, DPTFeatureExtractor
import warnings
import time
from espnet2.bin.tts_inference import Text2Speech
import soundfile as sf
from asteroid.models import BaseModel
import traceback
import os
import yaml
warnings.filterwarnings("ignore")
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default="configs/config.default.yaml")
args = parser.parse_args()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
with open(args.config, "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
# host = config["local_inference_endpoint"]["host"]
port = config["local_inference_endpoint"]["port"]
local_deployment = config["local_deployment"]
device = config.get("device", "cuda:0")
PROXY = None
if config["proxy"]:
PROXY = {
"https": config["proxy"],
}
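# A minimal sketch of the config this block expects (the port value is an
# assumption; the repository's configs/config.default.yaml may define more
# fields than the server reads). After loading the YAML, `config` needs roughly:
#
#   config = {
#       "local_inference_endpoint": {"port": 8005},
#       "local_deployment": "minimal",   # "minimal", "standard" or "full" (see load_pipes below)
#       "device": "cuda:0",              # optional; falls back to "cuda:0"
#       "proxy": None,                   # optional https proxy URL
#   }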
app = flask.Flask(__name__)
CORS(app)
start = time.time()
local_fold = "models"
# if args.config.endswith(".dev"):
# local_fold = "models_dev"
def load_pipes(local_deployment):
other_pipes = {}
standard_pipes = {}
controlnet_sd_pipes = {}
if local_deployment in ["full"]:
other_pipes = {
"nlpconnect/vit-gpt2-image-captioning":{
"model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
"feature_extractor": ViTImageProcessor.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
"tokenizer": AutoTokenizer.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
"device": device
},
# "Salesforce/blip-image-captioning-large": {
# "model": BlipForConditionalGeneration.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
# "processor": BlipProcessor.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
# "device": device
# },
"damo-vilab/text-to-video-ms-1.7b": {
"model": DiffusionPipeline.from_pretrained(f"{local_fold}/damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"),
"device": device
},
# "facebook/maskformer-swin-large-ade": {
# "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-large-ade"),
# "feature_extractor" : AutoFeatureExtractor.from_pretrained("facebook/maskformer-swin-large-ade"),
# "device": device
# },
# "microsoft/trocr-base-printed": {
# "processor": TrOCRProcessor.from_pretrained(f"{local_fold}/microsoft/trocr-base-printed"),
# "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/microsoft/trocr-base-printed"),
# "device": device
# },
# "microsoft/trocr-base-handwritten": {
# "processor": TrOCRProcessor.from_pretrained(f"{local_fold}/microsoft/trocr-base-handwritten"),
# "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/microsoft/trocr-base-handwritten"),
# "device": device
# },
"JorisCos/DCCRNet_Libri1Mix_enhsingle_16k": {
"model": BaseModel.from_pretrained("JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"),
"device": device
},
"espnet/kan-bayashi_ljspeech_vits": {
"model": Text2Speech.from_pretrained(f"espnet/kan-bayashi_ljspeech_vits"),
"device": device
},
"lambdalabs/sd-image-variations-diffusers": {
"model": DiffusionPipeline.from_pretrained(f"{local_fold}/lambdalabs/sd-image-variations-diffusers"), #torch_dtype=torch.float16
"device": device
},
# "CompVis/stable-diffusion-v1-4": {
# "model": DiffusionPipeline.from_pretrained(f"{local_fold}/CompVis/stable-diffusion-v1-4"),
# "device": device
# },
# "stabilityai/stable-diffusion-2-1": {
# "model": DiffusionPipeline.from_pretrained(f"{local_fold}/stabilityai/stable-diffusion-2-1"),
# "device": device
# },
"runwayml/stable-diffusion-v1-5": {
"model": DiffusionPipeline.from_pretrained(f"{local_fold}/runwayml/stable-diffusion-v1-5"),
"device": device
},
# "microsoft/speecht5_tts":{
# "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
# "model": SpeechT5ForTextToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
# "vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"),
# "embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"),
# "device": device
# },
# "speechbrain/mtl-mimic-voicebank": {
# "model": WaveformEnhancement.from_hparams(source="speechbrain/mtl-mimic-voicebank", savedir="models/mtl-mimic-voicebank"),
# "device": device
# },
"microsoft/speecht5_vc":{
"processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"),
"model": SpeechT5ForSpeechToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"),
"vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"),
"embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"),
"device": device
},
# "julien-c/wine-quality": {
# "model": joblib.load(cached_download(hf_hub_url("julien-c/wine-quality", "sklearn_model.joblib")))
# },
# "facebook/timesformer-base-finetuned-k400": {
# "processor": AutoImageProcessor.from_pretrained(f"{local_fold}/facebook/timesformer-base-finetuned-k400"),
# "model": TimesformerForVideoClassification.from_pretrained(f"{local_fold}/facebook/timesformer-base-finetuned-k400"),
# "device": device
# },
"facebook/maskformer-swin-base-coco": {
"feature_extractor": MaskFormerFeatureExtractor.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"),
"model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"),
"device": device
},
"Intel/dpt-hybrid-midas": {
"model": DPTForDepthEstimation.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas", low_cpu_mem_usage=True),
"feature_extractor": DPTFeatureExtractor.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas"),
"device": device
}
}
if local_deployment in ["full", "standard"]:
standard_pipes = {
# "superb/wav2vec2-base-superb-ks": {
# "model": pipeline(task="audio-classification", model=f"{local_fold}/superb/wav2vec2-base-superb-ks"),
# "device": device
# },
"openai/whisper-base": {
"model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/openai/whisper-base"),
"device": device
},
"microsoft/speecht5_asr": {
"model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/microsoft/speecht5_asr"),
"device": device
},
"Intel/dpt-large": {
"model": pipeline(task="depth-estimation", model=f"{local_fold}/Intel/dpt-large"),
"device": device
},
# "microsoft/beit-base-patch16-224-pt22k-ft22k": {
# "model": pipeline(task="image-classification", model=f"{local_fold}/microsoft/beit-base-patch16-224-pt22k-ft22k"),
# "device": device
# },
"facebook/detr-resnet-50-panoptic": {
"model": pipeline(task="image-segmentation", model=f"{local_fold}/facebook/detr-resnet-50-panoptic"),
"device": device
},
"facebook/detr-resnet-101": {
"model": pipeline(task="object-detection", model=f"{local_fold}/facebook/detr-resnet-101"),
"device": device
},
# "openai/clip-vit-large-patch14": {
# "model": pipeline(task="zero-shot-image-classification", model=f"{local_fold}/openai/clip-vit-large-patch14"),
# "device": device
# },
"google/owlvit-base-patch32": {
"model": pipeline(task="zero-shot-object-detection", model=f"{local_fold}/google/owlvit-base-patch32"),
"device": device
},
# "microsoft/DialoGPT-medium": {
# "model": pipeline(task="conversational", model=f"{local_fold}/microsoft/DialoGPT-medium"),
# "device": device
# },
# "bert-base-uncased": {
# "model": pipeline(task="fill-mask", model=f"{local_fold}/bert-base-uncased"),
# "device": device
# },
# "deepset/roberta-base-squad2": {
# "model": pipeline(task = "question-answering", model=f"{local_fold}/deepset/roberta-base-squad2"),
# "device": device
# },
# "facebook/bart-large-cnn": {
# "model": pipeline(task="summarization", model=f"{local_fold}/facebook/bart-large-cnn"),
# "device": device
# },
# "google/tapas-base-finetuned-wtq": {
# "model": pipeline(task="table-question-answering", model=f"{local_fold}/google/tapas-base-finetuned-wtq"),
# "device": device
# },
# "distilbert-base-uncased-finetuned-sst-2-english": {
# "model": pipeline(task="text-classification", model=f"{local_fold}/distilbert-base-uncased-finetuned-sst-2-english"),
# "device": device
# },
# "gpt2": {
# "model": pipeline(task="text-generation", model="gpt2"),
# "device": device
# },
# "mrm8488/t5-base-finetuned-question-generation-ap": {
# "model": pipeline(task="text2text-generation", model=f"{local_fold}/mrm8488/t5-base-finetuned-question-generation-ap"),
# "device": device
# },
# "Jean-Baptiste/camembert-ner": {
# "model": pipeline(task="token-classification", model=f"{local_fold}/Jean-Baptiste/camembert-ner", aggregation_strategy="simple"),
# "device": device
# },
# "t5-base": {
# "model": pipeline(task="translation", model=f"{local_fold}/t5-base"),
# "device": device
# },
"impira/layoutlm-document-qa": {
"model": pipeline(task="document-question-answering", model=f"{local_fold}/impira/layoutlm-document-qa"),
"device": device
},
"ydshieh/vit-gpt2-coco-en": {
"model": pipeline(task="image-to-text", model=f"{local_fold}/ydshieh/vit-gpt2-coco-en"),
"device": device
},
"dandelin/vilt-b32-finetuned-vqa": {
"model": pipeline(task="visual-question-answering", model=f"{local_fold}/dandelin/vilt-b32-finetuned-vqa"),
"device": device
}
}
if local_deployment in ["full", "standard", "minimal"]:
controlnet = ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
controlnetpipe = StableDiffusionControlNetPipeline.from_pretrained(
f"{local_fold}/runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
)
def mlsd_control_network():
model = MobileV2_MLSD_Large()
model.load_state_dict(torch.load(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/mlsd_large_512_fp32.pth"), strict=True)
return MLSDdetector(model)
hed_network = Network(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/network-bsds500.pth")
controlnet_sd_pipes = {
"openpose-control": {
"model": OpenposeDetector(Body(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/body_pose_model.pth"))
},
"mlsd-control": {
"model": mlsd_control_network()
},
"hed-control": {
"model": HEDdetector(hed_network)
},
"scribble-control": {
"model": HEDdetector(hed_network)
},
"midas-control": {
"model": MidasDetector(model_path=f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt")
},
"canny-control": {
"model": CannyDetector()
},
"lllyasviel/sd-controlnet-canny":{
"control": controlnet,
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-depth":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-hed":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-hed", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-mlsd":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-mlsd", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-openpose":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-scribble":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-scribble", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-seg":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-seg", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
}
}
pipes = {**standard_pipes, **other_pipes, **controlnet_sd_pipes}
return pipes
pipes = load_pipes(local_deployment)
end = time.time()
during = end - start
print(f"[ ready ] {during}s")
@app.route('/running', methods=['GET'])
def running():
return jsonify({"running": True})
@app.route('/status/<path:model_id>', methods=['GET'])
def status(model_id):
disabled_models = ["microsoft/trocr-base-printed", "microsoft/trocr-base-handwritten"]
if model_id in pipes.keys() and model_id not in disabled_models:
print(f"[ check {model_id} ] success")
return jsonify({"loaded": True})
else:
print(f"[ check {model_id} ] failed")
return jsonify({"loaded": False})
@app.route('/models/<path:model_id>', methods=['POST'])
def models(model_id):
while "using" in pipes[model_id] and pipes[model_id]["using"]:
print(f"[ inference {model_id} ] waiting")
time.sleep(0.1)
pipes[model_id]["using"] = True
print(f"[ inference {model_id} ] start")
start = time.time()
pipe = pipes[model_id]["model"]
if "device" in pipes[model_id]:
try:
pipe.to(pipes[model_id]["device"])
except Exception:
# fallback for pipes that do not implement .to(): record the device and move the wrapped model
pipe.device = torch.device(pipes[model_id]["device"])
pipe.model.to(pipes[model_id]["device"])
result = None
try:
# text to video
if model_id == "damo-vilab/text-to-video-ms-1.7b":
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
# pipe.enable_model_cpu_offload()
prompt = request.get_json()["text"]
video_frames = pipe(prompt, num_inference_steps=50, num_frames=40).frames
video_path = export_to_video(video_frames)
file_name = str(uuid.uuid4())[:4]
os.system(f"LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i {video_path} -vcodec libx264 public/videos/{file_name}.mp4")
result = {"path": f"/videos/{file_name}.mp4"}
# controlnet
if model_id.startswith("lllyasviel/sd-controlnet-"):
pipe.controlnet.to('cpu')
pipe.controlnet = pipes[model_id]["control"].to(pipes[model_id]["device"])
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
control_image = load_image(request.get_json()["img_url"])
# generator = torch.manual_seed(66)
out_image: Image.Image = pipe(request.get_json()["text"], num_inference_steps=20, image=control_image).images[0]
file_name = str(uuid.uuid4())[:4]
out_image.save(f"public/images/{file_name}.png")
result = {"path": f"/images/{file_name}.png"}
if model_id.endswith("-control"):
image = load_image(request.get_json()["img_url"])
if "scribble" in model_id:
control = pipe(image, scribble = True)
elif "canny" in model_id:
control = pipe(image, low_threshold=100, high_threshold=200)
else:
control = pipe(image)
file_name = str(uuid.uuid4())[:4]
control.save(f"public/images/{file_name}.png")
result = {"path": f"/images/{file_name}.png"}
# image to image
if model_id == "lambdalabs/sd-image-variations-diffusers":
im = load_image(request.get_json()["img_url"])
file_name = str(uuid.uuid4())[:4]
with open(f"public/images/{file_name}.png", "wb") as f:
f.write(request.data)
tform = transforms.Compose([
transforms.ToTensor(),
transforms.Resize(
(224, 224),
interpolation=transforms.InterpolationMode.BICUBIC,
antialias=False,
),
transforms.Normalize(
[0.48145466, 0.4578275, 0.40821073],
[0.26862954, 0.26130258, 0.27577711]),
])
inp = tform(im).to(pipes[model_id]["device"]).unsqueeze(0)
out = pipe(inp, guidance_scale=3)
out["images"][0].save(f"public/images/{file_name}.jpg")
result = {"path": f"/images/{file_name}.jpg"}
# image to text
if model_id == "Salesforce/blip-image-captioning-large":
raw_image = load_image(request.get_json()["img_url"]).convert('RGB')
text = request.get_json()["text"]
inputs = pipes[model_id]["processor"](raw_image, return_tensors="pt").to(pipes[model_id]["device"])
out = pipe.generate(**inputs)
caption = pipes[model_id]["processor"].decode(out[0], skip_special_tokens=True)
result = {"generated text": caption}
if model_id == "ydshieh/vit-gpt2-coco-en":
img_url = request.get_json()["img_url"]
generated_text = pipe(img_url)[0]['generated_text']
result = {"generated text": generated_text}
if model_id == "nlpconnect/vit-gpt2-image-captioning":
image = load_image(request.get_json()["img_url"]).convert("RGB")
pixel_values = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").pixel_values
pixel_values = pixel_values.to(pipes[model_id]["device"])
generated_ids = pipe.generate(pixel_values, **{"max_length": 200, "num_beams": 1})
generated_text = pipes[model_id]["tokenizer"].batch_decode(generated_ids, skip_special_tokens=True)[0]
result = {"generated text": generated_text}
# image to text: OCR
if model_id == "microsoft/trocr-base-printed" or model_id == "microsoft/trocr-base-handwritten":
image = load_image(request.get_json()["img_url"]).convert("RGB")
pixel_values = pipes[model_id]["processor"](image, return_tensors="pt").pixel_values
pixel_values = pixel_values.to(pipes[model_id]["device"])
generated_ids = pipe.generate(pixel_values)
generated_text = pipes[model_id]["processor"].batch_decode(generated_ids, skip_special_tokens=True)[0]
result = {"generated text": generated_text}
# text to image
if model_id == "runwayml/stable-diffusion-v1-5":
file_name = str(uuid.uuid4())[:4]
text = request.get_json()["text"]
out = pipe(prompt=text)
out["images"][0].save(f"public/images/{file_name}.jpg")
result = {"path": f"/images/{file_name}.jpg"}
# object detection
if model_id == "google/owlvit-base-patch32" or model_id == "facebook/detr-resnet-101":
img_url = request.get_json()["img_url"]
open_types = ["cat", "couch", "person", "car", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird"]
result = pipe(img_url, candidate_labels=open_types)
# VQA
if model_id == "dandelin/vilt-b32-finetuned-vqa":
question = request.get_json()["text"]
img_url = request.get_json()["img_url"]
result = pipe(question=question, image=img_url)
#DQA
if model_id == "impira/layoutlm-document-qa":
question = request.get_json()["text"]
img_url = request.get_json()["img_url"]
result = pipe(img_url, question)
# depth-estimation
if model_id == "Intel/dpt-large":
output = pipe(request.get_json()["img_url"])
image = output['depth']
name = str(uuid.uuid4())[:4]
image.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
if model_id == "Intel/dpt-hybrid-midas" and model_id == "Intel/dpt-large":
image = load_image(request.get_json()["img_url"])
inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt")
with torch.no_grad():
outputs = pipe(**inputs)
predicted_depth = outputs.predicted_depth
prediction = torch.nn.functional.interpolate(
predicted_depth.unsqueeze(1),
size=image.size[::-1],
mode="bicubic",
align_corners=False,
)
output = prediction.squeeze().cpu().numpy()
formatted = (output * 255 / np.max(output)).astype("uint8")
image = Image.fromarray(formatted)
name = str(uuid.uuid4())[:4]
image.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
# TTS
if model_id == "espnet/kan-bayashi_ljspeech_vits":
text = request.get_json()["text"]
wav = pipe(text)["wav"]
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", wav.cpu().numpy(), pipe.fs, "PCM_16")
result = {"path": f"/audios/{name}.wav"}
if model_id == "microsoft/speecht5_tts":
text = request.get_json()["text"]
inputs = pipes[model_id]["processor"](text=text, return_tensors="pt")
embeddings_dataset = pipes[model_id]["embeddings_dataset"]
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(pipes[model_id]["device"])
pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
result = {"path": f"/audios/{name}.wav"}
# ASR
if model_id == "openai/whisper-base" or model_id == "microsoft/speecht5_asr":
audio_url = request.get_json()["audio_url"]
result = { "text": pipe(audio_url)["text"]}
# audio to audio
if model_id == "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k":
audio_url = request.get_json()["audio_url"]
wav, sr = torchaudio.load(audio_url)
with torch.no_grad():
result_wav = pipe(wav.to(pipes[model_id]["device"]))
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", result_wav.cpu().squeeze().numpy(), sr)
result = {"path": f"/audios/{name}.wav"}
if model_id == "microsoft/speecht5_vc":
audio_url = request.get_json()["audio_url"]
wav, sr = torchaudio.load(audio_url)
inputs = pipes[model_id]["processor"](audio=wav, sampling_rate=sr, return_tensors="pt")
embeddings_dataset = pipes[model_id]["embeddings_dataset"]
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
result = {"path": f"/audios/{name}.wav"}
# segmentation
if model_id == "facebook/detr-resnet-50-panoptic":
result = []
segments = pipe(request.get_json()["img_url"])
image = load_image(request.get_json()["img_url"])
colors = []
for i in range(len(segments)):
colors.append((random.randint(100, 255), random.randint(100, 255), random.randint(100, 255), 50))
for i, segment in enumerate(segments):  # pair each segment with its randomly generated colour
mask = segment["mask"]
mask = mask.convert('L')
layer = Image.new('RGBA', mask.size, colors[i])
image.paste(layer, (0, 0), mask)
name = str(uuid.uuid4())[:4]
image.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
if model_id == "facebook/maskformer-swin-base-coco" or model_id == "facebook/maskformer-swin-large-ade":
image = load_image(request.get_json()["img_url"])
inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").to(pipes[model_id]["device"])
outputs = pipe(**inputs)
result = pipes[model_id]["feature_extractor"].post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
predicted_panoptic_map = result["segmentation"].cpu().numpy()
predicted_panoptic_map = Image.fromarray(predicted_panoptic_map.astype(np.uint8))
name = str(uuid.uuid4())[:4]
predicted_panoptic_map.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
except Exception as e:
print(e)
traceback.print_exc()
result = {"error": {"message": "Error when running the model inference."}}
if "device" in pipes[model_id]:
try:
pipe.to("cpu")
torch.cuda.empty_cache()
except Exception:
pipe.device = torch.device("cpu")
pipe.model.to("cpu")
torch.cuda.empty_cache()
pipes[model_id]["using"] = False
if result is None:
result = {"error": {"message": "model not found"}}
end = time.time()
during = end - start
print(f"[ complete {model_id} ] {during}s")
print(f"[ result {model_id} ] {result}")
return jsonify(result)
if __name__ == '__main__':
# temp folders
if not os.path.exists("public/audios"):
os.makedirs("public/audios")
if not os.path.exists("public/images"):
os.makedirs("public/images")
if not os.path.exists("public/videos"):
os.makedirs("public/videos")
waitress.serve(app, host="0.0.0.0", port=port)
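A minimal client sketch for the three routes defined above, assuming the server is reachable on the configured port (8005 is an assumption) and that the referenced models were loaded for the chosen deployment tier:

import requests

base = "http://localhost:8005"  # assumed host/port; must match the YAML config

print(requests.get(f"{base}/running").json())                  # {"running": true}
print(requests.get(f"{base}/status/Intel/dpt-large").json())   # {"loaded": true} once that pipe is loaded

# /models/<model_id> reads the JSON keys each handler expects ("text", "img_url" or "audio_url")
resp = requests.post(
    f"{base}/models/runwayml/stable-diffusion-v1-5",           # available in the "full" deployment tier
    json={"text": "a watercolor painting of a lighthouse"},
)
print(resp.json())                                             # e.g. {"path": "/images/ab12.jpg"}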

@@ -1,4 +1,3 @@
import base64
import copy
from io import BytesIO
@@ -23,7 +22,7 @@ import flask
from flask import request, jsonify
import waitress
from flask_cors import CORS, cross_origin
from get_token_ids import get_token_ids_for_task_parsing, get_token_ids_for_choose_model, count_tokens, get_max_context_length
from swarms.agents.workers.multi_modal_agents.omni_agent.get_token_ids import get_token_ids_for_task_parsing, get_token_ids_for_choose_model, count_tokens, get_max_context_length
from huggingface_hub.inference_api import InferenceApi
from huggingface_hub.inference_api import ALL_TASKS
@@ -1068,646 +1067,4 @@ if __name__ == "__main__":
elif args.mode == "server":
server()
elif args.mode == "cli":
cli()
########################## => awesome chat
########################## => models server
import argparse
import logging
import random
import uuid
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline, StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
from diffusers.utils import load_image
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from diffusers.utils import export_to_video
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5ForSpeechToSpeech
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from datasets import load_dataset
from PIL import Image
import flask
from flask import request, jsonify
import waitress
from flask_cors import CORS
import io
from torchvision import transforms
import torch
import torchaudio
from speechbrain.pretrained import WaveformEnhancement
import joblib
from huggingface_hub import hf_hub_url, cached_download
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from transformers import MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation, AutoFeatureExtractor
from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector, CannyDetector, MidasDetector
from controlnet_aux.open_pose.body import Body
from controlnet_aux.mlsd.models.mbv2_mlsd_large import MobileV2_MLSD_Large
from controlnet_aux.hed import Network
from transformers import DPTForDepthEstimation, DPTFeatureExtractor
import warnings
import time
from espnet2.bin.tts_inference import Text2Speech
import soundfile as sf
from asteroid.models import BaseModel
import traceback
import os
import yaml
warnings.filterwarnings("ignore")
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default="configs/config.default.yaml")
args = parser.parse_args()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
config = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader)
# host = config["local_inference_endpoint"]["host"]
port = config["local_inference_endpoint"]["port"]
local_deployment = config["local_deployment"]
device = config.get("device", "cuda:0")
PROXY = None
if config["proxy"]:
PROXY = {
"https": config["proxy"],
}
app = flask.Flask(__name__)
CORS(app)
start = time.time()
local_fold = "models"
# if args.config.endswith(".dev"):
# local_fold = "models_dev"
def load_pipes(local_deployment):
other_pipes = {}
standard_pipes = {}
controlnet_sd_pipes = {}
if local_deployment in ["full"]:
other_pipes = {
"nlpconnect/vit-gpt2-image-captioning":{
"model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
"feature_extractor": ViTImageProcessor.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
"tokenizer": AutoTokenizer.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
"device": device
},
# "Salesforce/blip-image-captioning-large": {
# "model": BlipForConditionalGeneration.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
# "processor": BlipProcessor.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
# "device": device
# },
"damo-vilab/text-to-video-ms-1.7b": {
"model": DiffusionPipeline.from_pretrained(f"{local_fold}/damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"),
"device": device
},
# "facebook/maskformer-swin-large-ade": {
# "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-large-ade"),
# "feature_extractor" : AutoFeatureExtractor.from_pretrained("facebook/maskformer-swin-large-ade"),
# "device": device
# },
# "microsoft/trocr-base-printed": {
# "processor": TrOCRProcessor.from_pretrained(f"{local_fold}/microsoft/trocr-base-printed"),
# "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/microsoft/trocr-base-printed"),
# "device": device
# },
# "microsoft/trocr-base-handwritten": {
# "processor": TrOCRProcessor.from_pretrained(f"{local_fold}/microsoft/trocr-base-handwritten"),
# "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/microsoft/trocr-base-handwritten"),
# "device": device
# },
"JorisCos/DCCRNet_Libri1Mix_enhsingle_16k": {
"model": BaseModel.from_pretrained("JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"),
"device": device
},
"espnet/kan-bayashi_ljspeech_vits": {
"model": Text2Speech.from_pretrained(f"espnet/kan-bayashi_ljspeech_vits"),
"device": device
},
"lambdalabs/sd-image-variations-diffusers": {
"model": DiffusionPipeline.from_pretrained(f"{local_fold}/lambdalabs/sd-image-variations-diffusers"), #torch_dtype=torch.float16
"device": device
},
# "CompVis/stable-diffusion-v1-4": {
# "model": DiffusionPipeline.from_pretrained(f"{local_fold}/CompVis/stable-diffusion-v1-4"),
# "device": device
# },
# "stabilityai/stable-diffusion-2-1": {
# "model": DiffusionPipeline.from_pretrained(f"{local_fold}/stabilityai/stable-diffusion-2-1"),
# "device": device
# },
"runwayml/stable-diffusion-v1-5": {
"model": DiffusionPipeline.from_pretrained(f"{local_fold}/runwayml/stable-diffusion-v1-5"),
"device": device
},
# "microsoft/speecht5_tts":{
# "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
# "model": SpeechT5ForTextToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
# "vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"),
# "embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"),
# "device": device
# },
# "speechbrain/mtl-mimic-voicebank": {
# "model": WaveformEnhancement.from_hparams(source="speechbrain/mtl-mimic-voicebank", savedir="models/mtl-mimic-voicebank"),
# "device": device
# },
"microsoft/speecht5_vc":{
"processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"),
"model": SpeechT5ForSpeechToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"),
"vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"),
"embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"),
"device": device
},
# "julien-c/wine-quality": {
# "model": joblib.load(cached_download(hf_hub_url("julien-c/wine-quality", "sklearn_model.joblib")))
# },
# "facebook/timesformer-base-finetuned-k400": {
# "processor": AutoImageProcessor.from_pretrained(f"{local_fold}/facebook/timesformer-base-finetuned-k400"),
# "model": TimesformerForVideoClassification.from_pretrained(f"{local_fold}/facebook/timesformer-base-finetuned-k400"),
# "device": device
# },
"facebook/maskformer-swin-base-coco": {
"feature_extractor": MaskFormerFeatureExtractor.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"),
"model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"),
"device": device
},
"Intel/dpt-hybrid-midas": {
"model": DPTForDepthEstimation.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas", low_cpu_mem_usage=True),
"feature_extractor": DPTFeatureExtractor.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas"),
"device": device
}
}
if local_deployment in ["full", "standard"]:
standard_pipes = {
# "superb/wav2vec2-base-superb-ks": {
# "model": pipeline(task="audio-classification", model=f"{local_fold}/superb/wav2vec2-base-superb-ks"),
# "device": device
# },
"openai/whisper-base": {
"model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/openai/whisper-base"),
"device": device
},
"microsoft/speecht5_asr": {
"model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/microsoft/speecht5_asr"),
"device": device
},
"Intel/dpt-large": {
"model": pipeline(task="depth-estimation", model=f"{local_fold}/Intel/dpt-large"),
"device": device
},
# "microsoft/beit-base-patch16-224-pt22k-ft22k": {
# "model": pipeline(task="image-classification", model=f"{local_fold}/microsoft/beit-base-patch16-224-pt22k-ft22k"),
# "device": device
# },
"facebook/detr-resnet-50-panoptic": {
"model": pipeline(task="image-segmentation", model=f"{local_fold}/facebook/detr-resnet-50-panoptic"),
"device": device
},
"facebook/detr-resnet-101": {
"model": pipeline(task="object-detection", model=f"{local_fold}/facebook/detr-resnet-101"),
"device": device
},
# "openai/clip-vit-large-patch14": {
# "model": pipeline(task="zero-shot-image-classification", model=f"{local_fold}/openai/clip-vit-large-patch14"),
# "device": device
# },
"google/owlvit-base-patch32": {
"model": pipeline(task="zero-shot-object-detection", model=f"{local_fold}/google/owlvit-base-patch32"),
"device": device
},
# "microsoft/DialoGPT-medium": {
# "model": pipeline(task="conversational", model=f"{local_fold}/microsoft/DialoGPT-medium"),
# "device": device
# },
# "bert-base-uncased": {
# "model": pipeline(task="fill-mask", model=f"{local_fold}/bert-base-uncased"),
# "device": device
# },
# "deepset/roberta-base-squad2": {
# "model": pipeline(task = "question-answering", model=f"{local_fold}/deepset/roberta-base-squad2"),
# "device": device
# },
# "facebook/bart-large-cnn": {
# "model": pipeline(task="summarization", model=f"{local_fold}/facebook/bart-large-cnn"),
# "device": device
# },
# "google/tapas-base-finetuned-wtq": {
# "model": pipeline(task="table-question-answering", model=f"{local_fold}/google/tapas-base-finetuned-wtq"),
# "device": device
# },
# "distilbert-base-uncased-finetuned-sst-2-english": {
# "model": pipeline(task="text-classification", model=f"{local_fold}/distilbert-base-uncased-finetuned-sst-2-english"),
# "device": device
# },
# "gpt2": {
# "model": pipeline(task="text-generation", model="gpt2"),
# "device": device
# },
# "mrm8488/t5-base-finetuned-question-generation-ap": {
# "model": pipeline(task="text2text-generation", model=f"{local_fold}/mrm8488/t5-base-finetuned-question-generation-ap"),
# "device": device
# },
# "Jean-Baptiste/camembert-ner": {
# "model": pipeline(task="token-classification", model=f"{local_fold}/Jean-Baptiste/camembert-ner", aggregation_strategy="simple"),
# "device": device
# },
# "t5-base": {
# "model": pipeline(task="translation", model=f"{local_fold}/t5-base"),
# "device": device
# },
"impira/layoutlm-document-qa": {
"model": pipeline(task="document-question-answering", model=f"{local_fold}/impira/layoutlm-document-qa"),
"device": device
},
"ydshieh/vit-gpt2-coco-en": {
"model": pipeline(task="image-to-text", model=f"{local_fold}/ydshieh/vit-gpt2-coco-en"),
"device": device
},
"dandelin/vilt-b32-finetuned-vqa": {
"model": pipeline(task="visual-question-answering", model=f"{local_fold}/dandelin/vilt-b32-finetuned-vqa"),
"device": device
}
}
if local_deployment in ["full", "standard", "minimal"]:
controlnet = ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
controlnetpipe = StableDiffusionControlNetPipeline.from_pretrained(
f"{local_fold}/runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
)
def mlsd_control_network():
model = MobileV2_MLSD_Large()
model.load_state_dict(torch.load(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/mlsd_large_512_fp32.pth"), strict=True)
return MLSDdetector(model)
hed_network = Network(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/network-bsds500.pth")
controlnet_sd_pipes = {
"openpose-control": {
"model": OpenposeDetector(Body(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/body_pose_model.pth"))
},
"mlsd-control": {
"model": mlsd_control_network()
},
"hed-control": {
"model": HEDdetector(hed_network)
},
"scribble-control": {
"model": HEDdetector(hed_network)
},
"midas-control": {
"model": MidasDetector(model_path=f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt")
},
"canny-control": {
"model": CannyDetector()
},
"lllyasviel/sd-controlnet-canny":{
"control": controlnet,
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-depth":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-hed":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-hed", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-mlsd":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-mlsd", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-openpose":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-scribble":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-scribble", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-seg":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-seg", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
}
}
pipes = {**standard_pipes, **other_pipes, **controlnet_sd_pipes}
return pipes
pipes = load_pipes(local_deployment)
end = time.time()
during = end - start
print(f"[ ready ] {during}s")
@app.route('/running', methods=['GET'])
def running():
return jsonify({"running": True})
@app.route('/status/<path:model_id>', methods=['GET'])
def status(model_id):
disabled_models = ["microsoft/trocr-base-printed", "microsoft/trocr-base-handwritten"]
if model_id in pipes.keys() and model_id not in disabled_models:
print(f"[ check {model_id} ] success")
return jsonify({"loaded": True})
else:
print(f"[ check {model_id} ] failed")
return jsonify({"loaded": False})
@app.route('/models/<path:model_id>', methods=['POST'])
def models(model_id):
while "using" in pipes[model_id] and pipes[model_id]["using"]:
print(f"[ inference {model_id} ] waiting")
time.sleep(0.1)
pipes[model_id]["using"] = True
print(f"[ inference {model_id} ] start")
start = time.time()
pipe = pipes[model_id]["model"]
if "device" in pipes[model_id]:
try:
pipe.to(pipes[model_id]["device"])
except:
pipe.device = torch.device(pipes[model_id]["device"])
pipe.model.to(pipes[model_id]["device"])
result = None
try:
# text to video
if model_id == "damo-vilab/text-to-video-ms-1.7b":
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
# pipe.enable_model_cpu_offload()
prompt = request.get_json()["text"]
video_frames = pipe(prompt, num_inference_steps=50, num_frames=40).frames
video_path = export_to_video(video_frames)
file_name = str(uuid.uuid4())[:4]
os.system(f"LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i {video_path} -vcodec libx264 public/videos/{file_name}.mp4")
result = {"path": f"/videos/{file_name}.mp4"}
# controlnet
if model_id.startswith("lllyasviel/sd-controlnet-"):
pipe.controlnet.to('cpu')
pipe.controlnet = pipes[model_id]["control"].to(pipes[model_id]["device"])
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
control_image = load_image(request.get_json()["img_url"])
# generator = torch.manual_seed(66)
out_image: Image = pipe(request.get_json()["text"], num_inference_steps=20, image=control_image).images[0]
file_name = str(uuid.uuid4())[:4]
out_image.save(f"public/images/{file_name}.png")
result = {"path": f"/images/{file_name}.png"}
if model_id.endswith("-control"):
image = load_image(request.get_json()["img_url"])
if "scribble" in model_id:
control = pipe(image, scribble = True)
elif "canny" in model_id:
control = pipe(image, low_threshold=100, high_threshold=200)
else:
control = pipe(image)
file_name = str(uuid.uuid4())[:4]
control.save(f"public/images/{file_name}.png")
result = {"path": f"/images/{file_name}.png"}
# image to image
if model_id == "lambdalabs/sd-image-variations-diffusers":
im = load_image(request.get_json()["img_url"])
file_name = str(uuid.uuid4())[:4]
with open(f"public/images/{file_name}.png", "wb") as f:
f.write(request.data)
tform = transforms.Compose([
transforms.ToTensor(),
transforms.Resize(
(224, 224),
interpolation=transforms.InterpolationMode.BICUBIC,
antialias=False,
),
transforms.Normalize(
[0.48145466, 0.4578275, 0.40821073],
[0.26862954, 0.26130258, 0.27577711]),
])
inp = tform(im).to(pipes[model_id]["device"]).unsqueeze(0)
out = pipe(inp, guidance_scale=3)
out["images"][0].save(f"public/images/{file_name}.jpg")
result = {"path": f"/images/{file_name}.jpg"}
# image to text
if model_id == "Salesforce/blip-image-captioning-large":
raw_image = load_image(request.get_json()["img_url"]).convert('RGB')
text = request.get_json()["text"]
inputs = pipes[model_id]["processor"](raw_image, return_tensors="pt").to(pipes[model_id]["device"])
out = pipe.generate(**inputs)
caption = pipes[model_id]["processor"].decode(out[0], skip_special_tokens=True)
result = {"generated text": caption}
if model_id == "ydshieh/vit-gpt2-coco-en":
img_url = request.get_json()["img_url"]
generated_text = pipe(img_url)[0]['generated_text']
result = {"generated text": generated_text}
if model_id == "nlpconnect/vit-gpt2-image-captioning":
image = load_image(request.get_json()["img_url"]).convert("RGB")
pixel_values = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").pixel_values
pixel_values = pixel_values.to(pipes[model_id]["device"])
generated_ids = pipe.generate(pixel_values, **{"max_length": 200, "num_beams": 1})
generated_text = pipes[model_id]["tokenizer"].batch_decode(generated_ids, skip_special_tokens=True)[0]
result = {"generated text": generated_text}
# image to text: OCR
if model_id == "microsoft/trocr-base-printed" or model_id == "microsoft/trocr-base-handwritten":
image = load_image(request.get_json()["img_url"]).convert("RGB")
pixel_values = pipes[model_id]["processor"](image, return_tensors="pt").pixel_values
pixel_values = pixel_values.to(pipes[model_id]["device"])
generated_ids = pipe.generate(pixel_values)
generated_text = pipes[model_id]["processor"].batch_decode(generated_ids, skip_special_tokens=True)[0]
result = {"generated text": generated_text}
# text to image
if model_id == "runwayml/stable-diffusion-v1-5":
file_name = str(uuid.uuid4())[:4]
text = request.get_json()["text"]
out = pipe(prompt=text)
out["images"][0].save(f"public/images/{file_name}.jpg")
result = {"path": f"/images/{file_name}.jpg"}
# object detection
if model_id == "google/owlvit-base-patch32" or model_id == "facebook/detr-resnet-101":
img_url = request.get_json()["img_url"]
open_types = ["cat", "couch", "person", "car", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird"]
result = pipe(img_url, candidate_labels=open_types)
# VQA
if model_id == "dandelin/vilt-b32-finetuned-vqa":
question = request.get_json()["text"]
img_url = request.get_json()["img_url"]
result = pipe(question=question, image=img_url)
#DQA
if model_id == "impira/layoutlm-document-qa":
question = request.get_json()["text"]
img_url = request.get_json()["img_url"]
result = pipe(img_url, question)
# depth-estimation
if model_id == "Intel/dpt-large":
output = pipe(request.get_json()["img_url"])
image = output['depth']
name = str(uuid.uuid4())[:4]
image.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
if model_id == "Intel/dpt-hybrid-midas" and model_id == "Intel/dpt-large":
image = load_image(request.get_json()["img_url"])
inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt")
with torch.no_grad():
outputs = pipe(**inputs)
predicted_depth = outputs.predicted_depth
prediction = torch.nn.functional.interpolate(
predicted_depth.unsqueeze(1),
size=image.size[::-1],
mode="bicubic",
align_corners=False,
)
output = prediction.squeeze().cpu().numpy()
formatted = (output * 255 / np.max(output)).astype("uint8")
image = Image.fromarray(formatted)
name = str(uuid.uuid4())[:4]
image.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
# TTS
if model_id == "espnet/kan-bayashi_ljspeech_vits":
text = request.get_json()["text"]
wav = pipe(text)["wav"]
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", wav.cpu().numpy(), pipe.fs, "PCM_16")
result = {"path": f"/audios/{name}.wav"}
if model_id == "microsoft/speecht5_tts":
text = request.get_json()["text"]
inputs = pipes[model_id]["processor"](text=text, return_tensors="pt")
embeddings_dataset = pipes[model_id]["embeddings_dataset"]
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(pipes[model_id]["device"])
pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
result = {"path": f"/audios/{name}.wav"}
# ASR
if model_id == "openai/whisper-base" or model_id == "microsoft/speecht5_asr":
audio_url = request.get_json()["audio_url"]
result = { "text": pipe(audio_url)["text"]}
# audio to audio
if model_id == "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k":
audio_url = request.get_json()["audio_url"]
wav, sr = torchaudio.load(audio_url)
with torch.no_grad():
result_wav = pipe(wav.to(pipes[model_id]["device"]))
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", result_wav.cpu().squeeze().numpy(), sr)
result = {"path": f"/audios/{name}.wav"}
if model_id == "microsoft/speecht5_vc":
audio_url = request.get_json()["audio_url"]
wav, sr = torchaudio.load(audio_url)
inputs = pipes[model_id]["processor"](audio=wav, sampling_rate=sr, return_tensors="pt")
embeddings_dataset = pipes[model_id]["embeddings_dataset"]
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
result = {"path": f"/audios/{name}.wav"}
# segmentation
if model_id == "facebook/detr-resnet-50-panoptic":
result = []
segments = pipe(request.get_json()["img_url"])
image = load_image(request.get_json()["img_url"])
colors = []
for i in range(len(segments)):
colors.append((random.randint(100, 255), random.randint(100, 255), random.randint(100, 255), 50))
for segment in segments:
mask = segment["mask"]
mask = mask.convert('L')
layer = Image.new('RGBA', mask.size, colors[i])
image.paste(layer, (0, 0), mask)
name = str(uuid.uuid4())[:4]
image.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
if model_id == "facebook/maskformer-swin-base-coco" or model_id == "facebook/maskformer-swin-large-ade":
image = load_image(request.get_json()["img_url"])
inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").to(pipes[model_id]["device"])
outputs = pipe(**inputs)
result = pipes[model_id]["feature_extractor"].post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
predicted_panoptic_map = result["segmentation"].cpu().numpy()
predicted_panoptic_map = Image.fromarray(predicted_panoptic_map.astype(np.uint8))
name = str(uuid.uuid4())[:4]
predicted_panoptic_map.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
except Exception as e:
print(e)
traceback.print_exc()
result = {"error": {"message": "Error when running the model inference."}}
if "device" in pipes[model_id]:
try:
pipe.to("cpu")
torch.cuda.empty_cache()
except:
pipe.device = torch.device("cpu")
pipe.model.to("cpu")
torch.cuda.empty_cache()
pipes[model_id]["using"] = False
if result is None:
result = {"error": {"message": "model not found"}}
end = time.time()
during = end - start
print(f"[ complete {model_id} ] {during}s")
print(f"[ result {model_id} ] {result}")
return jsonify(result)
if __name__ == '__main__':
# temp folders
if not os.path.exists("public/audios"):
os.makedirs("public/audios")
if not os.path.exists("public/images"):
os.makedirs("public/images")
if not os.path.exists("public/videos"):
os.makedirs("public/videos")
waitress.serve(app, host="0.0.0.0", port=port)
########################## => models server end
cli()

@@ -1,6 +1,6 @@
#boss node -> worker agent -> omni agent [worker of the worker]
from langchain.tools import tool
from swarms.agents.workers.multi_modal_agents.omni_agent import chat_huggingface
from swarms.agents.workers.multi_modal_agents.omni_agent.omni_chat import chat_huggingface
class OmniWorkerAgent:
def __init__(self, api_key, api_endpoint, api_type):
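A hypothetical instantiation matching the constructor shown above; all three values are placeholders, and the class presumably delegates inference to the imported chat_huggingface:

worker = OmniWorkerAgent(
    api_key="YOUR_API_KEY",                     # placeholder credential
    api_endpoint="https://api.example.com/v1",  # placeholder endpoint URL
    api_type="openai",                          # placeholder API type
)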

@@ -472,68 +472,6 @@ class ImageCaptioning(BaseHandler):
################# server/get token ids
import tiktoken
encodings = {
"gpt-4": tiktoken.get_encoding("cl100k_base"),
"gpt-4-32k": tiktoken.get_encoding("cl100k_base"),
"gpt-3.5-turbo": tiktoken.get_encoding("cl100k_base"),
"gpt-3.5-turbo-0301": tiktoken.get_encoding("cl100k_base"),
"text-davinci-003": tiktoken.get_encoding("p50k_base"),
"text-davinci-002": tiktoken.get_encoding("p50k_base"),
"text-davinci-001": tiktoken.get_encoding("r50k_base"),
"text-curie-001": tiktoken.get_encoding("r50k_base"),
"text-babbage-001": tiktoken.get_encoding("r50k_base"),
"text-ada-001": tiktoken.get_encoding("r50k_base"),
"davinci": tiktoken.get_encoding("r50k_base"),
"curie": tiktoken.get_encoding("r50k_base"),
"babbage": tiktoken.get_encoding("r50k_base"),
"ada": tiktoken.get_encoding("r50k_base"),
}
max_length = {
"gpt-4": 8192,
"gpt-4-32k": 32768,
"gpt-3.5-turbo": 4096,
"gpt-3.5-turbo-0301": 4096,
"text-davinci-003": 4096,
"text-davinci-002": 4096,
"text-davinci-001": 2049,
"text-curie-001": 2049,
"text-babbage-001": 2049,
"text-ada-001": 2049,
"davinci": 2049,
"curie": 2049,
"babbage": 2049,
"ada": 2049
}
def count_tokens(model_name, text):
return len(encodings[model_name].encode(text))
def get_max_context_length(model_name):
return max_length[model_name]
def get_token_ids_for_task_parsing(model_name):
text = '''{"task": "text-classification", "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image", "args", "text", "path", "dep", "id", "<GENERATED>-"}'''
res = encodings[model_name].encode(text)
res = list(set(res))
return res
def get_token_ids_for_choose_model(model_name):
text = '''{"id": "reason"}'''
res = encodings[model_name].encode(text)
res = list(set(res))
return res
################# END
# ################# MultiAgent
# from autogpt.agent import Agent
# from swarms.agents.swarms import worker_node
