diff --git a/swarms/agents/multi_modal_visual_agent.py b/swarms/agents/multi_modal_visual_agent.py index 616c67fb..83e9dcb0 100644 --- a/swarms/agents/multi_modal_visual_agent.py +++ b/swarms/agents/multi_modal_visual_agent.py @@ -1,58 +1,45 @@ -# coding: utf-8 -import argparse -import inspect -import math import os +import gradio as gr import random +import torch +import cv2 import re import uuid - -import cv2 -import gradio as gr -import matplotlib.pyplot as plt +from PIL import Image, ImageDraw, ImageOps, ImageFont +import math import numpy as np -import torch -import wget -from controlnet_aux import HEDdetector, MLSDdetector, OpenposeDetector -from diffusers import ( - ControlNetModel, - EulerAncestralDiscreteScheduler, - StableDiffusionControlNetPipeline, - StableDiffusionInpaintPipeline, - StableDiffusionInstructPix2PixPipeline, - StableDiffusionPipeline, - UniPCMultistepScheduler, -) +import argparse +import inspect +import tempfile +from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation +from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering +from transformers import AutoImageProcessor, UperNetForSemanticSegmentation + +from diffusers import StableDiffusionPipeline, StableDiffusionInpaintPipeline, StableDiffusionInstructPix2PixPipeline +from diffusers import EulerAncestralDiscreteScheduler +from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker + +from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector + from langchain.agents.initialize import initialize_agent from langchain.agents.tools import Tool from langchain.chains.conversation.memory import ConversationBufferMemory from langchain.llms.openai import OpenAI -from PIL import Image, ImageDraw, ImageFont, ImageOps -from transformers import ( - BlipForConditionalGeneration, - BlipForQuestionAnswering, - BlipProcessor, - pipeline, -) # Grounding DINO -# import groundingdino.datasets.transforms as T -from swarms.workers.models import ( - Compose, - Normalize, - RandomResize, - SLConfig, - ToTensor, - build_model, - clean_state_dict, - get_phrases_from_posmap, -) -from swarms.workers.models.segment_anything import ( - SamAutomaticMaskGenerator, - SamPredictor, - build_sam, -) +import groundingdino.datasets.transforms as T +from groundingdino.models import build_model +from groundingdino.util import box_ops +from groundingdino.util.slconfig import SLConfig +from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap + +# segment anything +from segment_anything import build_sam, SamPredictor, SamAutomaticMaskGenerator +import cv2 +import numpy as np +import matplotlib.pyplot as plt +import wget VISUAL_AGENT_PREFIX = """ Worker Multi-Modal Agent is designed to be able to assist with