omni agent cleanup

NewTools
Kye 2 years ago
parent 63640405ee
commit 97eeb7ca04

@ -1 +1 @@
from swarms.agents.workers.multi_modal_agents.omni_agent import chat_huggingface
from swarms.agents.workers.multi_modal_agents.omni_agent.omni_agent import chat_huggingface

@ -0,0 +1,53 @@
import tiktoken
encodings = {
"gpt-4": tiktoken.get_encoding("cl100k_base"),
"gpt-4-32k": tiktoken.get_encoding("cl100k_base"),
"gpt-3.5-turbo": tiktoken.get_encoding("cl100k_base"),
"gpt-3.5-turbo-0301": tiktoken.get_encoding("cl100k_base"),
"text-davinci-003": tiktoken.get_encoding("p50k_base"),
"text-davinci-002": tiktoken.get_encoding("p50k_base"),
"text-davinci-001": tiktoken.get_encoding("r50k_base"),
"text-curie-001": tiktoken.get_encoding("r50k_base"),
"text-babbage-001": tiktoken.get_encoding("r50k_base"),
"text-ada-001": tiktoken.get_encoding("r50k_base"),
"davinci": tiktoken.get_encoding("r50k_base"),
"curie": tiktoken.get_encoding("r50k_base"),
"babbage": tiktoken.get_encoding("r50k_base"),
"ada": tiktoken.get_encoding("r50k_base"),
}
max_length = {
"gpt-4": 8192,
"gpt-4-32k": 32768,
"gpt-3.5-turbo": 4096,
"gpt-3.5-turbo-0301": 4096,
"text-davinci-003": 4096,
"text-davinci-002": 4096,
"text-davinci-001": 2049,
"text-curie-001": 2049,
"text-babbage-001": 2049,
"text-ada-001": 2049,
"davinci": 2049,
"curie": 2049,
"babbage": 2049,
"ada": 2049
}
def count_tokens(model_name, text):
return len(encodings[model_name].encode(text))
def get_max_context_length(model_name):
return max_length[model_name]
def get_token_ids_for_task_parsing(model_name):
text = '''{"task": "text-classification", "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image", "args", "text", "path", "dep", "id", "<GENERATED>-"}'''
res = encodings[model_name].encode(text)
res = list(set(res))
return res
def get_token_ids_for_choose_model(model_name):
text = '''{"id": "reason"}'''
res = encodings[model_name].encode(text)
res = list(set(res))
return res

@ -0,0 +1,635 @@
import argparse
import logging
import random
import uuid
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline, StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
from diffusers.utils import load_image
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from diffusers.utils import export_to_video
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5ForSpeechToSpeech
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from datasets import load_dataset
from PIL import Image
import flask
from flask import request, jsonify
import waitress
from flask_cors import CORS
import io
from torchvision import transforms
import torch
import torchaudio
from speechbrain.pretrained import WaveformEnhancement
import joblib
from huggingface_hub import hf_hub_url, cached_download
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from transformers import MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation, AutoFeatureExtractor
from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector, CannyDetector, MidasDetector
from controlnet_aux.open_pose.body import Body
from controlnet_aux.mlsd.models.mbv2_mlsd_large import MobileV2_MLSD_Large
from controlnet_aux.hed import Network
from transformers import DPTForDepthEstimation, DPTFeatureExtractor
import warnings
import time
from espnet2.bin.tts_inference import Text2Speech
import soundfile as sf
from asteroid.models import BaseModel
import traceback
import os
import yaml
warnings.filterwarnings("ignore")
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default="configs/config.default.yaml")
args = parser.parse_args()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
config = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader)
# host = config["local_inference_endpoint"]["host"]
port = config["local_inference_endpoint"]["port"]
local_deployment = config["local_deployment"]
device = config.get("device", "cuda:0")
PROXY = None
if config["proxy"]:
PROXY = {
"https": config["proxy"],
}
app = flask.Flask(__name__)
CORS(app)
start = time.time()
local_fold = "models"
# if args.config.endswith(".dev"):
# local_fold = "models_dev"
def load_pipes(local_deployment):
other_pipes = {}
standard_pipes = {}
controlnet_sd_pipes = {}
if local_deployment in ["full"]:
other_pipes = {
"nlpconnect/vit-gpt2-image-captioning":{
"model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
"feature_extractor": ViTImageProcessor.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
"tokenizer": AutoTokenizer.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
"device": device
},
# "Salesforce/blip-image-captioning-large": {
# "model": BlipForConditionalGeneration.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
# "processor": BlipProcessor.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
# "device": device
# },
"damo-vilab/text-to-video-ms-1.7b": {
"model": DiffusionPipeline.from_pretrained(f"{local_fold}/damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"),
"device": device
},
# "facebook/maskformer-swin-large-ade": {
# "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-large-ade"),
# "feature_extractor" : AutoFeatureExtractor.from_pretrained("facebook/maskformer-swin-large-ade"),
# "device": device
# },
# "microsoft/trocr-base-printed": {
# "processor": TrOCRProcessor.from_pretrained(f"{local_fold}/microsoft/trocr-base-printed"),
# "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/microsoft/trocr-base-printed"),
# "device": device
# },
# "microsoft/trocr-base-handwritten": {
# "processor": TrOCRProcessor.from_pretrained(f"{local_fold}/microsoft/trocr-base-handwritten"),
# "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/microsoft/trocr-base-handwritten"),
# "device": device
# },
"JorisCos/DCCRNet_Libri1Mix_enhsingle_16k": {
"model": BaseModel.from_pretrained("JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"),
"device": device
},
"espnet/kan-bayashi_ljspeech_vits": {
"model": Text2Speech.from_pretrained(f"espnet/kan-bayashi_ljspeech_vits"),
"device": device
},
"lambdalabs/sd-image-variations-diffusers": {
"model": DiffusionPipeline.from_pretrained(f"{local_fold}/lambdalabs/sd-image-variations-diffusers"), #torch_dtype=torch.float16
"device": device
},
# "CompVis/stable-diffusion-v1-4": {
# "model": DiffusionPipeline.from_pretrained(f"{local_fold}/CompVis/stable-diffusion-v1-4"),
# "device": device
# },
# "stabilityai/stable-diffusion-2-1": {
# "model": DiffusionPipeline.from_pretrained(f"{local_fold}/stabilityai/stable-diffusion-2-1"),
# "device": device
# },
"runwayml/stable-diffusion-v1-5": {
"model": DiffusionPipeline.from_pretrained(f"{local_fold}/runwayml/stable-diffusion-v1-5"),
"device": device
},
# "microsoft/speecht5_tts":{
# "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
# "model": SpeechT5ForTextToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
# "vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"),
# "embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"),
# "device": device
# },
# "speechbrain/mtl-mimic-voicebank": {
# "model": WaveformEnhancement.from_hparams(source="speechbrain/mtl-mimic-voicebank", savedir="models/mtl-mimic-voicebank"),
# "device": device
# },
"microsoft/speecht5_vc":{
"processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"),
"model": SpeechT5ForSpeechToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"),
"vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"),
"embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"),
"device": device
},
# "julien-c/wine-quality": {
# "model": joblib.load(cached_download(hf_hub_url("julien-c/wine-quality", "sklearn_model.joblib")))
# },
# "facebook/timesformer-base-finetuned-k400": {
# "processor": AutoImageProcessor.from_pretrained(f"{local_fold}/facebook/timesformer-base-finetuned-k400"),
# "model": TimesformerForVideoClassification.from_pretrained(f"{local_fold}/facebook/timesformer-base-finetuned-k400"),
# "device": device
# },
"facebook/maskformer-swin-base-coco": {
"feature_extractor": MaskFormerFeatureExtractor.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"),
"model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"),
"device": device
},
"Intel/dpt-hybrid-midas": {
"model": DPTForDepthEstimation.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas", low_cpu_mem_usage=True),
"feature_extractor": DPTFeatureExtractor.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas"),
"device": device
}
}
if local_deployment in ["full", "standard"]:
standard_pipes = {
# "superb/wav2vec2-base-superb-ks": {
# "model": pipeline(task="audio-classification", model=f"{local_fold}/superb/wav2vec2-base-superb-ks"),
# "device": device
# },
"openai/whisper-base": {
"model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/openai/whisper-base"),
"device": device
},
"microsoft/speecht5_asr": {
"model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/microsoft/speecht5_asr"),
"device": device
},
"Intel/dpt-large": {
"model": pipeline(task="depth-estimation", model=f"{local_fold}/Intel/dpt-large"),
"device": device
},
# "microsoft/beit-base-patch16-224-pt22k-ft22k": {
# "model": pipeline(task="image-classification", model=f"{local_fold}/microsoft/beit-base-patch16-224-pt22k-ft22k"),
# "device": device
# },
"facebook/detr-resnet-50-panoptic": {
"model": pipeline(task="image-segmentation", model=f"{local_fold}/facebook/detr-resnet-50-panoptic"),
"device": device
},
"facebook/detr-resnet-101": {
"model": pipeline(task="object-detection", model=f"{local_fold}/facebook/detr-resnet-101"),
"device": device
},
# "openai/clip-vit-large-patch14": {
# "model": pipeline(task="zero-shot-image-classification", model=f"{local_fold}/openai/clip-vit-large-patch14"),
# "device": device
# },
"google/owlvit-base-patch32": {
"model": pipeline(task="zero-shot-object-detection", model=f"{local_fold}/google/owlvit-base-patch32"),
"device": device
},
# "microsoft/DialoGPT-medium": {
# "model": pipeline(task="conversational", model=f"{local_fold}/microsoft/DialoGPT-medium"),
# "device": device
# },
# "bert-base-uncased": {
# "model": pipeline(task="fill-mask", model=f"{local_fold}/bert-base-uncased"),
# "device": device
# },
# "deepset/roberta-base-squad2": {
# "model": pipeline(task = "question-answering", model=f"{local_fold}/deepset/roberta-base-squad2"),
# "device": device
# },
# "facebook/bart-large-cnn": {
# "model": pipeline(task="summarization", model=f"{local_fold}/facebook/bart-large-cnn"),
# "device": device
# },
# "google/tapas-base-finetuned-wtq": {
# "model": pipeline(task="table-question-answering", model=f"{local_fold}/google/tapas-base-finetuned-wtq"),
# "device": device
# },
# "distilbert-base-uncased-finetuned-sst-2-english": {
# "model": pipeline(task="text-classification", model=f"{local_fold}/distilbert-base-uncased-finetuned-sst-2-english"),
# "device": device
# },
# "gpt2": {
# "model": pipeline(task="text-generation", model="gpt2"),
# "device": device
# },
# "mrm8488/t5-base-finetuned-question-generation-ap": {
# "model": pipeline(task="text2text-generation", model=f"{local_fold}/mrm8488/t5-base-finetuned-question-generation-ap"),
# "device": device
# },
# "Jean-Baptiste/camembert-ner": {
# "model": pipeline(task="token-classification", model=f"{local_fold}/Jean-Baptiste/camembert-ner", aggregation_strategy="simple"),
# "device": device
# },
# "t5-base": {
# "model": pipeline(task="translation", model=f"{local_fold}/t5-base"),
# "device": device
# },
"impira/layoutlm-document-qa": {
"model": pipeline(task="document-question-answering", model=f"{local_fold}/impira/layoutlm-document-qa"),
"device": device
},
"ydshieh/vit-gpt2-coco-en": {
"model": pipeline(task="image-to-text", model=f"{local_fold}/ydshieh/vit-gpt2-coco-en"),
"device": device
},
"dandelin/vilt-b32-finetuned-vqa": {
"model": pipeline(task="visual-question-answering", model=f"{local_fold}/dandelin/vilt-b32-finetuned-vqa"),
"device": device
}
}
if local_deployment in ["full", "standard", "minimal"]:
controlnet = ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
controlnetpipe = StableDiffusionControlNetPipeline.from_pretrained(
f"{local_fold}/runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
)
def mlsd_control_network():
model = MobileV2_MLSD_Large()
model.load_state_dict(torch.load(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/mlsd_large_512_fp32.pth"), strict=True)
return MLSDdetector(model)
hed_network = Network(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/network-bsds500.pth")
controlnet_sd_pipes = {
"openpose-control": {
"model": OpenposeDetector(Body(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/body_pose_model.pth"))
},
"mlsd-control": {
"model": mlsd_control_network()
},
"hed-control": {
"model": HEDdetector(hed_network)
},
"scribble-control": {
"model": HEDdetector(hed_network)
},
"midas-control": {
"model": MidasDetector(model_path=f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt")
},
"canny-control": {
"model": CannyDetector()
},
"lllyasviel/sd-controlnet-canny":{
"control": controlnet,
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-depth":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-hed":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-hed", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-mlsd":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-mlsd", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-openpose":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-scribble":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-scribble", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-seg":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-seg", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
}
}
pipes = {**standard_pipes, **other_pipes, **controlnet_sd_pipes}
return pipes
pipes = load_pipes(local_deployment)
end = time.time()
during = end - start
print(f"[ ready ] {during}s")
@app.route('/running', methods=['GET'])
def running():
return jsonify({"running": True})
@app.route('/status/<path:model_id>', methods=['GET'])
def status(model_id):
disabled_models = ["microsoft/trocr-base-printed", "microsoft/trocr-base-handwritten"]
if model_id in pipes.keys() and model_id not in disabled_models:
print(f"[ check {model_id} ] success")
return jsonify({"loaded": True})
else:
print(f"[ check {model_id} ] failed")
return jsonify({"loaded": False})
@app.route('/models/<path:model_id>', methods=['POST'])
def models(model_id):
while "using" in pipes[model_id] and pipes[model_id]["using"]:
print(f"[ inference {model_id} ] waiting")
time.sleep(0.1)
pipes[model_id]["using"] = True
print(f"[ inference {model_id} ] start")
start = time.time()
pipe = pipes[model_id]["model"]
if "device" in pipes[model_id]:
try:
pipe.to(pipes[model_id]["device"])
except:
pipe.device = torch.device(pipes[model_id]["device"])
pipe.model.to(pipes[model_id]["device"])
result = None
try:
# text to video
if model_id == "damo-vilab/text-to-video-ms-1.7b":
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
# pipe.enable_model_cpu_offload()
prompt = request.get_json()["text"]
video_frames = pipe(prompt, num_inference_steps=50, num_frames=40).frames
video_path = export_to_video(video_frames)
file_name = str(uuid.uuid4())[:4]
os.system(f"LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i {video_path} -vcodec libx264 public/videos/{file_name}.mp4")
result = {"path": f"/videos/{file_name}.mp4"}
# controlnet
if model_id.startswith("lllyasviel/sd-controlnet-"):
pipe.controlnet.to('cpu')
pipe.controlnet = pipes[model_id]["control"].to(pipes[model_id]["device"])
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
control_image = load_image(request.get_json()["img_url"])
# generator = torch.manual_seed(66)
out_image: Image = pipe(request.get_json()["text"], num_inference_steps=20, image=control_image).images[0]
file_name = str(uuid.uuid4())[:4]
out_image.save(f"public/images/{file_name}.png")
result = {"path": f"/images/{file_name}.png"}
if model_id.endswith("-control"):
image = load_image(request.get_json()["img_url"])
if "scribble" in model_id:
control = pipe(image, scribble = True)
elif "canny" in model_id:
control = pipe(image, low_threshold=100, high_threshold=200)
else:
control = pipe(image)
file_name = str(uuid.uuid4())[:4]
control.save(f"public/images/{file_name}.png")
result = {"path": f"/images/{file_name}.png"}
# image to image
if model_id == "lambdalabs/sd-image-variations-diffusers":
im = load_image(request.get_json()["img_url"])
file_name = str(uuid.uuid4())[:4]
with open(f"public/images/{file_name}.png", "wb") as f:
f.write(request.data)
tform = transforms.Compose([
transforms.ToTensor(),
transforms.Resize(
(224, 224),
interpolation=transforms.InterpolationMode.BICUBIC,
antialias=False,
),
transforms.Normalize(
[0.48145466, 0.4578275, 0.40821073],
[0.26862954, 0.26130258, 0.27577711]),
])
inp = tform(im).to(pipes[model_id]["device"]).unsqueeze(0)
out = pipe(inp, guidance_scale=3)
out["images"][0].save(f"public/images/{file_name}.jpg")
result = {"path": f"/images/{file_name}.jpg"}
# image to text
if model_id == "Salesforce/blip-image-captioning-large":
raw_image = load_image(request.get_json()["img_url"]).convert('RGB')
text = request.get_json()["text"]
inputs = pipes[model_id]["processor"](raw_image, return_tensors="pt").to(pipes[model_id]["device"])
out = pipe.generate(**inputs)
caption = pipes[model_id]["processor"].decode(out[0], skip_special_tokens=True)
result = {"generated text": caption}
if model_id == "ydshieh/vit-gpt2-coco-en":
img_url = request.get_json()["img_url"]
generated_text = pipe(img_url)[0]['generated_text']
result = {"generated text": generated_text}
if model_id == "nlpconnect/vit-gpt2-image-captioning":
image = load_image(request.get_json()["img_url"]).convert("RGB")
pixel_values = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").pixel_values
pixel_values = pixel_values.to(pipes[model_id]["device"])
generated_ids = pipe.generate(pixel_values, **{"max_length": 200, "num_beams": 1})
generated_text = pipes[model_id]["tokenizer"].batch_decode(generated_ids, skip_special_tokens=True)[0]
result = {"generated text": generated_text}
# image to text: OCR
if model_id == "microsoft/trocr-base-printed" or model_id == "microsoft/trocr-base-handwritten":
image = load_image(request.get_json()["img_url"]).convert("RGB")
pixel_values = pipes[model_id]["processor"](image, return_tensors="pt").pixel_values
pixel_values = pixel_values.to(pipes[model_id]["device"])
generated_ids = pipe.generate(pixel_values)
generated_text = pipes[model_id]["processor"].batch_decode(generated_ids, skip_special_tokens=True)[0]
result = {"generated text": generated_text}
# text to image
if model_id == "runwayml/stable-diffusion-v1-5":
file_name = str(uuid.uuid4())[:4]
text = request.get_json()["text"]
out = pipe(prompt=text)
out["images"][0].save(f"public/images/{file_name}.jpg")
result = {"path": f"/images/{file_name}.jpg"}
# object detection
if model_id == "google/owlvit-base-patch32" or model_id == "facebook/detr-resnet-101":
img_url = request.get_json()["img_url"]
open_types = ["cat", "couch", "person", "car", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird"]
result = pipe(img_url, candidate_labels=open_types)
# VQA
if model_id == "dandelin/vilt-b32-finetuned-vqa":
question = request.get_json()["text"]
img_url = request.get_json()["img_url"]
result = pipe(question=question, image=img_url)
#DQA
if model_id == "impira/layoutlm-document-qa":
question = request.get_json()["text"]
img_url = request.get_json()["img_url"]
result = pipe(img_url, question)
# depth-estimation
if model_id == "Intel/dpt-large":
output = pipe(request.get_json()["img_url"])
image = output['depth']
name = str(uuid.uuid4())[:4]
image.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
if model_id == "Intel/dpt-hybrid-midas" and model_id == "Intel/dpt-large":
image = load_image(request.get_json()["img_url"])
inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt")
with torch.no_grad():
outputs = pipe(**inputs)
predicted_depth = outputs.predicted_depth
prediction = torch.nn.functional.interpolate(
predicted_depth.unsqueeze(1),
size=image.size[::-1],
mode="bicubic",
align_corners=False,
)
output = prediction.squeeze().cpu().numpy()
formatted = (output * 255 / np.max(output)).astype("uint8")
image = Image.fromarray(formatted)
name = str(uuid.uuid4())[:4]
image.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
# TTS
if model_id == "espnet/kan-bayashi_ljspeech_vits":
text = request.get_json()["text"]
wav = pipe(text)["wav"]
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", wav.cpu().numpy(), pipe.fs, "PCM_16")
result = {"path": f"/audios/{name}.wav"}
if model_id == "microsoft/speecht5_tts":
text = request.get_json()["text"]
inputs = pipes[model_id]["processor"](text=text, return_tensors="pt")
embeddings_dataset = pipes[model_id]["embeddings_dataset"]
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(pipes[model_id]["device"])
pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
result = {"path": f"/audios/{name}.wav"}
# ASR
if model_id == "openai/whisper-base" or model_id == "microsoft/speecht5_asr":
audio_url = request.get_json()["audio_url"]
result = { "text": pipe(audio_url)["text"]}
# audio to audio
if model_id == "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k":
audio_url = request.get_json()["audio_url"]
wav, sr = torchaudio.load(audio_url)
with torch.no_grad():
result_wav = pipe(wav.to(pipes[model_id]["device"]))
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", result_wav.cpu().squeeze().numpy(), sr)
result = {"path": f"/audios/{name}.wav"}
if model_id == "microsoft/speecht5_vc":
audio_url = request.get_json()["audio_url"]
wav, sr = torchaudio.load(audio_url)
inputs = pipes[model_id]["processor"](audio=wav, sampling_rate=sr, return_tensors="pt")
embeddings_dataset = pipes[model_id]["embeddings_dataset"]
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
result = {"path": f"/audios/{name}.wav"}
# segmentation
if model_id == "facebook/detr-resnet-50-panoptic":
result = []
segments = pipe(request.get_json()["img_url"])
image = load_image(request.get_json()["img_url"])
colors = []
for i in range(len(segments)):
colors.append((random.randint(100, 255), random.randint(100, 255), random.randint(100, 255), 50))
for segment in segments:
mask = segment["mask"]
mask = mask.convert('L')
layer = Image.new('RGBA', mask.size, colors[i])
image.paste(layer, (0, 0), mask)
name = str(uuid.uuid4())[:4]
image.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
if model_id == "facebook/maskformer-swin-base-coco" or model_id == "facebook/maskformer-swin-large-ade":
image = load_image(request.get_json()["img_url"])
inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").to(pipes[model_id]["device"])
outputs = pipe(**inputs)
result = pipes[model_id]["feature_extractor"].post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
predicted_panoptic_map = result["segmentation"].cpu().numpy()
predicted_panoptic_map = Image.fromarray(predicted_panoptic_map.astype(np.uint8))
name = str(uuid.uuid4())[:4]
predicted_panoptic_map.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
except Exception as e:
print(e)
traceback.print_exc()
result = {"error": {"message": "Error when running the model inference."}}
if "device" in pipes[model_id]:
try:
pipe.to("cpu")
torch.cuda.empty_cache()
except:
pipe.device = torch.device("cpu")
pipe.model.to("cpu")
torch.cuda.empty_cache()
pipes[model_id]["using"] = False
if result is None:
result = {"error": {"message": "model not found"}}
end = time.time()
during = end - start
print(f"[ complete {model_id} ] {during}s")
print(f"[ result {model_id} ] {result}")
return jsonify(result)
if __name__ == '__main__':
# temp folders
if not os.path.exists("public/audios"):
os.makedirs("public/audios")
if not os.path.exists("public/images"):
os.makedirs("public/images")
if not os.path.exists("public/videos"):
os.makedirs("public/videos")
waitress.serve(app, host="0.0.0.0", port=port)

@ -1,4 +1,3 @@
import base64
import copy
from io import BytesIO
@ -23,7 +22,7 @@ import flask
from flask import request, jsonify
import waitress
from flask_cors import CORS, cross_origin
from get_token_ids import get_token_ids_for_task_parsing, get_token_ids_for_choose_model, count_tokens, get_max_context_length
from swarms.agents.workers.multi_modal_agents.omni_agent.get_token_ids import get_token_ids_for_task_parsing, get_token_ids_for_choose_model, count_tokens, get_max_context_length
from huggingface_hub.inference_api import InferenceApi
from huggingface_hub.inference_api import ALL_TASKS
@ -1068,646 +1067,4 @@ if __name__ == "__main__":
elif args.mode == "server":
server()
elif args.mode == "cli":
cli()
########################## => awesome chat
########################## => models server
import argparse
import logging
import random
import uuid
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline, StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
from diffusers.utils import load_image
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from diffusers.utils import export_to_video
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5ForSpeechToSpeech
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from datasets import load_dataset
from PIL import Image
import flask
from flask import request, jsonify
import waitress
from flask_cors import CORS
import io
from torchvision import transforms
import torch
import torchaudio
from speechbrain.pretrained import WaveformEnhancement
import joblib
from huggingface_hub import hf_hub_url, cached_download
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from transformers import MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation, AutoFeatureExtractor
from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector, CannyDetector, MidasDetector
from controlnet_aux.open_pose.body import Body
from controlnet_aux.mlsd.models.mbv2_mlsd_large import MobileV2_MLSD_Large
from controlnet_aux.hed import Network
from transformers import DPTForDepthEstimation, DPTFeatureExtractor
import warnings
import time
from espnet2.bin.tts_inference import Text2Speech
import soundfile as sf
from asteroid.models import BaseModel
import traceback
import os
import yaml
warnings.filterwarnings("ignore")
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default="configs/config.default.yaml")
args = parser.parse_args()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
config = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader)
# host = config["local_inference_endpoint"]["host"]
port = config["local_inference_endpoint"]["port"]
local_deployment = config["local_deployment"]
device = config.get("device", "cuda:0")
PROXY = None
if config["proxy"]:
PROXY = {
"https": config["proxy"],
}
app = flask.Flask(__name__)
CORS(app)
start = time.time()
local_fold = "models"
# if args.config.endswith(".dev"):
# local_fold = "models_dev"
def load_pipes(local_deployment):
other_pipes = {}
standard_pipes = {}
controlnet_sd_pipes = {}
if local_deployment in ["full"]:
other_pipes = {
"nlpconnect/vit-gpt2-image-captioning":{
"model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
"feature_extractor": ViTImageProcessor.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
"tokenizer": AutoTokenizer.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
"device": device
},
# "Salesforce/blip-image-captioning-large": {
# "model": BlipForConditionalGeneration.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
# "processor": BlipProcessor.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
# "device": device
# },
"damo-vilab/text-to-video-ms-1.7b": {
"model": DiffusionPipeline.from_pretrained(f"{local_fold}/damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"),
"device": device
},
# "facebook/maskformer-swin-large-ade": {
# "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-large-ade"),
# "feature_extractor" : AutoFeatureExtractor.from_pretrained("facebook/maskformer-swin-large-ade"),
# "device": device
# },
# "microsoft/trocr-base-printed": {
# "processor": TrOCRProcessor.from_pretrained(f"{local_fold}/microsoft/trocr-base-printed"),
# "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/microsoft/trocr-base-printed"),
# "device": device
# },
# "microsoft/trocr-base-handwritten": {
# "processor": TrOCRProcessor.from_pretrained(f"{local_fold}/microsoft/trocr-base-handwritten"),
# "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/microsoft/trocr-base-handwritten"),
# "device": device
# },
"JorisCos/DCCRNet_Libri1Mix_enhsingle_16k": {
"model": BaseModel.from_pretrained("JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"),
"device": device
},
"espnet/kan-bayashi_ljspeech_vits": {
"model": Text2Speech.from_pretrained(f"espnet/kan-bayashi_ljspeech_vits"),
"device": device
},
"lambdalabs/sd-image-variations-diffusers": {
"model": DiffusionPipeline.from_pretrained(f"{local_fold}/lambdalabs/sd-image-variations-diffusers"), #torch_dtype=torch.float16
"device": device
},
# "CompVis/stable-diffusion-v1-4": {
# "model": DiffusionPipeline.from_pretrained(f"{local_fold}/CompVis/stable-diffusion-v1-4"),
# "device": device
# },
# "stabilityai/stable-diffusion-2-1": {
# "model": DiffusionPipeline.from_pretrained(f"{local_fold}/stabilityai/stable-diffusion-2-1"),
# "device": device
# },
"runwayml/stable-diffusion-v1-5": {
"model": DiffusionPipeline.from_pretrained(f"{local_fold}/runwayml/stable-diffusion-v1-5"),
"device": device
},
# "microsoft/speecht5_tts":{
# "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
# "model": SpeechT5ForTextToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
# "vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"),
# "embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"),
# "device": device
# },
# "speechbrain/mtl-mimic-voicebank": {
# "model": WaveformEnhancement.from_hparams(source="speechbrain/mtl-mimic-voicebank", savedir="models/mtl-mimic-voicebank"),
# "device": device
# },
"microsoft/speecht5_vc":{
"processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"),
"model": SpeechT5ForSpeechToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"),
"vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"),
"embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"),
"device": device
},
# "julien-c/wine-quality": {
# "model": joblib.load(cached_download(hf_hub_url("julien-c/wine-quality", "sklearn_model.joblib")))
# },
# "facebook/timesformer-base-finetuned-k400": {
# "processor": AutoImageProcessor.from_pretrained(f"{local_fold}/facebook/timesformer-base-finetuned-k400"),
# "model": TimesformerForVideoClassification.from_pretrained(f"{local_fold}/facebook/timesformer-base-finetuned-k400"),
# "device": device
# },
"facebook/maskformer-swin-base-coco": {
"feature_extractor": MaskFormerFeatureExtractor.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"),
"model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"),
"device": device
},
"Intel/dpt-hybrid-midas": {
"model": DPTForDepthEstimation.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas", low_cpu_mem_usage=True),
"feature_extractor": DPTFeatureExtractor.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas"),
"device": device
}
}
if local_deployment in ["full", "standard"]:
standard_pipes = {
# "superb/wav2vec2-base-superb-ks": {
# "model": pipeline(task="audio-classification", model=f"{local_fold}/superb/wav2vec2-base-superb-ks"),
# "device": device
# },
"openai/whisper-base": {
"model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/openai/whisper-base"),
"device": device
},
"microsoft/speecht5_asr": {
"model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/microsoft/speecht5_asr"),
"device": device
},
"Intel/dpt-large": {
"model": pipeline(task="depth-estimation", model=f"{local_fold}/Intel/dpt-large"),
"device": device
},
# "microsoft/beit-base-patch16-224-pt22k-ft22k": {
# "model": pipeline(task="image-classification", model=f"{local_fold}/microsoft/beit-base-patch16-224-pt22k-ft22k"),
# "device": device
# },
"facebook/detr-resnet-50-panoptic": {
"model": pipeline(task="image-segmentation", model=f"{local_fold}/facebook/detr-resnet-50-panoptic"),
"device": device
},
"facebook/detr-resnet-101": {
"model": pipeline(task="object-detection", model=f"{local_fold}/facebook/detr-resnet-101"),
"device": device
},
# "openai/clip-vit-large-patch14": {
# "model": pipeline(task="zero-shot-image-classification", model=f"{local_fold}/openai/clip-vit-large-patch14"),
# "device": device
# },
"google/owlvit-base-patch32": {
"model": pipeline(task="zero-shot-object-detection", model=f"{local_fold}/google/owlvit-base-patch32"),
"device": device
},
# "microsoft/DialoGPT-medium": {
# "model": pipeline(task="conversational", model=f"{local_fold}/microsoft/DialoGPT-medium"),
# "device": device
# },
# "bert-base-uncased": {
# "model": pipeline(task="fill-mask", model=f"{local_fold}/bert-base-uncased"),
# "device": device
# },
# "deepset/roberta-base-squad2": {
# "model": pipeline(task = "question-answering", model=f"{local_fold}/deepset/roberta-base-squad2"),
# "device": device
# },
# "facebook/bart-large-cnn": {
# "model": pipeline(task="summarization", model=f"{local_fold}/facebook/bart-large-cnn"),
# "device": device
# },
# "google/tapas-base-finetuned-wtq": {
# "model": pipeline(task="table-question-answering", model=f"{local_fold}/google/tapas-base-finetuned-wtq"),
# "device": device
# },
# "distilbert-base-uncased-finetuned-sst-2-english": {
# "model": pipeline(task="text-classification", model=f"{local_fold}/distilbert-base-uncased-finetuned-sst-2-english"),
# "device": device
# },
# "gpt2": {
# "model": pipeline(task="text-generation", model="gpt2"),
# "device": device
# },
# "mrm8488/t5-base-finetuned-question-generation-ap": {
# "model": pipeline(task="text2text-generation", model=f"{local_fold}/mrm8488/t5-base-finetuned-question-generation-ap"),
# "device": device
# },
# "Jean-Baptiste/camembert-ner": {
# "model": pipeline(task="token-classification", model=f"{local_fold}/Jean-Baptiste/camembert-ner", aggregation_strategy="simple"),
# "device": device
# },
# "t5-base": {
# "model": pipeline(task="translation", model=f"{local_fold}/t5-base"),
# "device": device
# },
"impira/layoutlm-document-qa": {
"model": pipeline(task="document-question-answering", model=f"{local_fold}/impira/layoutlm-document-qa"),
"device": device
},
"ydshieh/vit-gpt2-coco-en": {
"model": pipeline(task="image-to-text", model=f"{local_fold}/ydshieh/vit-gpt2-coco-en"),
"device": device
},
"dandelin/vilt-b32-finetuned-vqa": {
"model": pipeline(task="visual-question-answering", model=f"{local_fold}/dandelin/vilt-b32-finetuned-vqa"),
"device": device
}
}
if local_deployment in ["full", "standard", "minimal"]:
controlnet = ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
controlnetpipe = StableDiffusionControlNetPipeline.from_pretrained(
f"{local_fold}/runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
)
def mlsd_control_network():
model = MobileV2_MLSD_Large()
model.load_state_dict(torch.load(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/mlsd_large_512_fp32.pth"), strict=True)
return MLSDdetector(model)
hed_network = Network(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/network-bsds500.pth")
controlnet_sd_pipes = {
"openpose-control": {
"model": OpenposeDetector(Body(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/body_pose_model.pth"))
},
"mlsd-control": {
"model": mlsd_control_network()
},
"hed-control": {
"model": HEDdetector(hed_network)
},
"scribble-control": {
"model": HEDdetector(hed_network)
},
"midas-control": {
"model": MidasDetector(model_path=f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt")
},
"canny-control": {
"model": CannyDetector()
},
"lllyasviel/sd-controlnet-canny":{
"control": controlnet,
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-depth":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-hed":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-hed", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-mlsd":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-mlsd", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-openpose":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-scribble":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-scribble", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
},
"lllyasviel/sd-controlnet-seg":{
"control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-seg", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": device
}
}
pipes = {**standard_pipes, **other_pipes, **controlnet_sd_pipes}
return pipes
pipes = load_pipes(local_deployment)
end = time.time()
during = end - start
print(f"[ ready ] {during}s")
@app.route('/running', methods=['GET'])
def running():
return jsonify({"running": True})
@app.route('/status/<path:model_id>', methods=['GET'])
def status(model_id):
disabled_models = ["microsoft/trocr-base-printed", "microsoft/trocr-base-handwritten"]
if model_id in pipes.keys() and model_id not in disabled_models:
print(f"[ check {model_id} ] success")
return jsonify({"loaded": True})
else:
print(f"[ check {model_id} ] failed")
return jsonify({"loaded": False})
@app.route('/models/<path:model_id>', methods=['POST'])
def models(model_id):
while "using" in pipes[model_id] and pipes[model_id]["using"]:
print(f"[ inference {model_id} ] waiting")
time.sleep(0.1)
pipes[model_id]["using"] = True
print(f"[ inference {model_id} ] start")
start = time.time()
pipe = pipes[model_id]["model"]
if "device" in pipes[model_id]:
try:
pipe.to(pipes[model_id]["device"])
except:
pipe.device = torch.device(pipes[model_id]["device"])
pipe.model.to(pipes[model_id]["device"])
result = None
try:
# text to video
if model_id == "damo-vilab/text-to-video-ms-1.7b":
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
# pipe.enable_model_cpu_offload()
prompt = request.get_json()["text"]
video_frames = pipe(prompt, num_inference_steps=50, num_frames=40).frames
video_path = export_to_video(video_frames)
file_name = str(uuid.uuid4())[:4]
os.system(f"LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i {video_path} -vcodec libx264 public/videos/{file_name}.mp4")
result = {"path": f"/videos/{file_name}.mp4"}
# controlnet
if model_id.startswith("lllyasviel/sd-controlnet-"):
pipe.controlnet.to('cpu')
pipe.controlnet = pipes[model_id]["control"].to(pipes[model_id]["device"])
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
control_image = load_image(request.get_json()["img_url"])
# generator = torch.manual_seed(66)
out_image: Image = pipe(request.get_json()["text"], num_inference_steps=20, image=control_image).images[0]
file_name = str(uuid.uuid4())[:4]
out_image.save(f"public/images/{file_name}.png")
result = {"path": f"/images/{file_name}.png"}
if model_id.endswith("-control"):
image = load_image(request.get_json()["img_url"])
if "scribble" in model_id:
control = pipe(image, scribble = True)
elif "canny" in model_id:
control = pipe(image, low_threshold=100, high_threshold=200)
else:
control = pipe(image)
file_name = str(uuid.uuid4())[:4]
control.save(f"public/images/{file_name}.png")
result = {"path": f"/images/{file_name}.png"}
# image to image
if model_id == "lambdalabs/sd-image-variations-diffusers":
im = load_image(request.get_json()["img_url"])
file_name = str(uuid.uuid4())[:4]
with open(f"public/images/{file_name}.png", "wb") as f:
f.write(request.data)
tform = transforms.Compose([
transforms.ToTensor(),
transforms.Resize(
(224, 224),
interpolation=transforms.InterpolationMode.BICUBIC,
antialias=False,
),
transforms.Normalize(
[0.48145466, 0.4578275, 0.40821073],
[0.26862954, 0.26130258, 0.27577711]),
])
inp = tform(im).to(pipes[model_id]["device"]).unsqueeze(0)
out = pipe(inp, guidance_scale=3)
out["images"][0].save(f"public/images/{file_name}.jpg")
result = {"path": f"/images/{file_name}.jpg"}
# image to text
if model_id == "Salesforce/blip-image-captioning-large":
raw_image = load_image(request.get_json()["img_url"]).convert('RGB')
text = request.get_json()["text"]
inputs = pipes[model_id]["processor"](raw_image, return_tensors="pt").to(pipes[model_id]["device"])
out = pipe.generate(**inputs)
caption = pipes[model_id]["processor"].decode(out[0], skip_special_tokens=True)
result = {"generated text": caption}
if model_id == "ydshieh/vit-gpt2-coco-en":
img_url = request.get_json()["img_url"]
generated_text = pipe(img_url)[0]['generated_text']
result = {"generated text": generated_text}
if model_id == "nlpconnect/vit-gpt2-image-captioning":
image = load_image(request.get_json()["img_url"]).convert("RGB")
pixel_values = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").pixel_values
pixel_values = pixel_values.to(pipes[model_id]["device"])
generated_ids = pipe.generate(pixel_values, **{"max_length": 200, "num_beams": 1})
generated_text = pipes[model_id]["tokenizer"].batch_decode(generated_ids, skip_special_tokens=True)[0]
result = {"generated text": generated_text}
# image to text: OCR
if model_id == "microsoft/trocr-base-printed" or model_id == "microsoft/trocr-base-handwritten":
image = load_image(request.get_json()["img_url"]).convert("RGB")
pixel_values = pipes[model_id]["processor"](image, return_tensors="pt").pixel_values
pixel_values = pixel_values.to(pipes[model_id]["device"])
generated_ids = pipe.generate(pixel_values)
generated_text = pipes[model_id]["processor"].batch_decode(generated_ids, skip_special_tokens=True)[0]
result = {"generated text": generated_text}
# text to image
if model_id == "runwayml/stable-diffusion-v1-5":
file_name = str(uuid.uuid4())[:4]
text = request.get_json()["text"]
out = pipe(prompt=text)
out["images"][0].save(f"public/images/{file_name}.jpg")
result = {"path": f"/images/{file_name}.jpg"}
# object detection
if model_id == "google/owlvit-base-patch32" or model_id == "facebook/detr-resnet-101":
img_url = request.get_json()["img_url"]
open_types = ["cat", "couch", "person", "car", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird"]
result = pipe(img_url, candidate_labels=open_types)
# VQA
if model_id == "dandelin/vilt-b32-finetuned-vqa":
question = request.get_json()["text"]
img_url = request.get_json()["img_url"]
result = pipe(question=question, image=img_url)
#DQA
if model_id == "impira/layoutlm-document-qa":
question = request.get_json()["text"]
img_url = request.get_json()["img_url"]
result = pipe(img_url, question)
# depth-estimation
if model_id == "Intel/dpt-large":
output = pipe(request.get_json()["img_url"])
image = output['depth']
name = str(uuid.uuid4())[:4]
image.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
if model_id == "Intel/dpt-hybrid-midas" and model_id == "Intel/dpt-large":
image = load_image(request.get_json()["img_url"])
inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt")
with torch.no_grad():
outputs = pipe(**inputs)
predicted_depth = outputs.predicted_depth
prediction = torch.nn.functional.interpolate(
predicted_depth.unsqueeze(1),
size=image.size[::-1],
mode="bicubic",
align_corners=False,
)
output = prediction.squeeze().cpu().numpy()
formatted = (output * 255 / np.max(output)).astype("uint8")
image = Image.fromarray(formatted)
name = str(uuid.uuid4())[:4]
image.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
# TTS
if model_id == "espnet/kan-bayashi_ljspeech_vits":
text = request.get_json()["text"]
wav = pipe(text)["wav"]
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", wav.cpu().numpy(), pipe.fs, "PCM_16")
result = {"path": f"/audios/{name}.wav"}
if model_id == "microsoft/speecht5_tts":
text = request.get_json()["text"]
inputs = pipes[model_id]["processor"](text=text, return_tensors="pt")
embeddings_dataset = pipes[model_id]["embeddings_dataset"]
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(pipes[model_id]["device"])
pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
result = {"path": f"/audios/{name}.wav"}
# ASR
if model_id == "openai/whisper-base" or model_id == "microsoft/speecht5_asr":
audio_url = request.get_json()["audio_url"]
result = { "text": pipe(audio_url)["text"]}
# audio to audio
if model_id == "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k":
audio_url = request.get_json()["audio_url"]
wav, sr = torchaudio.load(audio_url)
with torch.no_grad():
result_wav = pipe(wav.to(pipes[model_id]["device"]))
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", result_wav.cpu().squeeze().numpy(), sr)
result = {"path": f"/audios/{name}.wav"}
if model_id == "microsoft/speecht5_vc":
audio_url = request.get_json()["audio_url"]
wav, sr = torchaudio.load(audio_url)
inputs = pipes[model_id]["processor"](audio=wav, sampling_rate=sr, return_tensors="pt")
embeddings_dataset = pipes[model_id]["embeddings_dataset"]
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
result = {"path": f"/audios/{name}.wav"}
# segmentation
if model_id == "facebook/detr-resnet-50-panoptic":
result = []
segments = pipe(request.get_json()["img_url"])
image = load_image(request.get_json()["img_url"])
colors = []
for i in range(len(segments)):
colors.append((random.randint(100, 255), random.randint(100, 255), random.randint(100, 255), 50))
for segment in segments:
mask = segment["mask"]
mask = mask.convert('L')
layer = Image.new('RGBA', mask.size, colors[i])
image.paste(layer, (0, 0), mask)
name = str(uuid.uuid4())[:4]
image.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
if model_id == "facebook/maskformer-swin-base-coco" or model_id == "facebook/maskformer-swin-large-ade":
image = load_image(request.get_json()["img_url"])
inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").to(pipes[model_id]["device"])
outputs = pipe(**inputs)
result = pipes[model_id]["feature_extractor"].post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
predicted_panoptic_map = result["segmentation"].cpu().numpy()
predicted_panoptic_map = Image.fromarray(predicted_panoptic_map.astype(np.uint8))
name = str(uuid.uuid4())[:4]
predicted_panoptic_map.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
except Exception as e:
print(e)
traceback.print_exc()
result = {"error": {"message": "Error when running the model inference."}}
if "device" in pipes[model_id]:
try:
pipe.to("cpu")
torch.cuda.empty_cache()
except:
pipe.device = torch.device("cpu")
pipe.model.to("cpu")
torch.cuda.empty_cache()
pipes[model_id]["using"] = False
if result is None:
result = {"error": {"message": "model not found"}}
end = time.time()
during = end - start
print(f"[ complete {model_id} ] {during}s")
print(f"[ result {model_id} ] {result}")
return jsonify(result)
if __name__ == '__main__':
# temp folders
if not os.path.exists("public/audios"):
os.makedirs("public/audios")
if not os.path.exists("public/images"):
os.makedirs("public/images")
if not os.path.exists("public/videos"):
os.makedirs("public/videos")
waitress.serve(app, host="0.0.0.0", port=port)
########################## => models server end
cli()

@ -1,6 +1,6 @@
#boss node -> worker agent -> omni agent [worker of the worker]
from langchain.tools import tool
from swarms.agents.workers.multi_modal_agents.omni_agent import chat_huggingface
from swarms.agents.workers.multi_modal_agents.omni_agent.omni_chat import chat_huggingface
class OmniWorkerAgent:
def __init__(self, api_key, api_endpoint, api_type):

@ -472,68 +472,6 @@ class ImageCaptioning(BaseHandler):
################# server/get token ids
import tiktoken
encodings = {
"gpt-4": tiktoken.get_encoding("cl100k_base"),
"gpt-4-32k": tiktoken.get_encoding("cl100k_base"),
"gpt-3.5-turbo": tiktoken.get_encoding("cl100k_base"),
"gpt-3.5-turbo-0301": tiktoken.get_encoding("cl100k_base"),
"text-davinci-003": tiktoken.get_encoding("p50k_base"),
"text-davinci-002": tiktoken.get_encoding("p50k_base"),
"text-davinci-001": tiktoken.get_encoding("r50k_base"),
"text-curie-001": tiktoken.get_encoding("r50k_base"),
"text-babbage-001": tiktoken.get_encoding("r50k_base"),
"text-ada-001": tiktoken.get_encoding("r50k_base"),
"davinci": tiktoken.get_encoding("r50k_base"),
"curie": tiktoken.get_encoding("r50k_base"),
"babbage": tiktoken.get_encoding("r50k_base"),
"ada": tiktoken.get_encoding("r50k_base"),
}
max_length = {
"gpt-4": 8192,
"gpt-4-32k": 32768,
"gpt-3.5-turbo": 4096,
"gpt-3.5-turbo-0301": 4096,
"text-davinci-003": 4096,
"text-davinci-002": 4096,
"text-davinci-001": 2049,
"text-curie-001": 2049,
"text-babbage-001": 2049,
"text-ada-001": 2049,
"davinci": 2049,
"curie": 2049,
"babbage": 2049,
"ada": 2049
}
def count_tokens(model_name, text):
return len(encodings[model_name].encode(text))
def get_max_context_length(model_name):
return max_length[model_name]
def get_token_ids_for_task_parsing(model_name):
text = '''{"task": "text-classification", "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image", "args", "text", "path", "dep", "id", "<GENERATED>-"}'''
res = encodings[model_name].encode(text)
res = list(set(res))
return res
def get_token_ids_for_choose_model(model_name):
text = '''{"id": "reason"}'''
res = encodings[model_name].encode(text)
res = list(set(res))
return res
################# END
# ################# MultiAgent
# from autogpt.agent import Agent
# from swarms.agents.swarms import worker_node

Loading…
Cancel
Save