diff --git a/mm_agent_example.py b/mm_agent_example.py
index 3177939c..8a7ca8c8 100644
--- a/mm_agent_example.py
+++ b/mm_agent_example.py
@@ -1,7 +1,7 @@
 from swarms.agents import MultiModalAgent
 
 load_dict = {
-    "ImageCaptioning": "cuda"
+    "ImageCaptioning": "cpu"
 }
 
 node = MultiModalAgent(load_dict)
diff --git a/requirements.txt b/requirements.txt
index 138b4c0c..61717a95 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,6 +28,32 @@ colored
+addict
+albumentations
+basicsr
+controlnet-aux
+diffusers
+einops
+imageio
+imageio-ffmpeg
+invisible-watermark
+kornia
+numpy
+omegaconf
+open_clip_torch
+openai
+opencv-python
+prettytable
+safetensors
+streamlit
+test-tube
+timm
+torchmetrics
+transformers
+webdataset
+yapf
+
+
 mkdocs
 mkdocs-material
 mkdocs-glightbox
diff --git a/swarms/agents/multi_modal_visual_agent.py b/swarms/agents/multi_modal_visual_agent.py
index 2ea3e5c3..0c9b4506 100644
--- a/swarms/agents/multi_modal_visual_agent.py
+++ b/swarms/agents/multi_modal_visual_agent.py
@@ -1,5 +1,4 @@
 import os
-import gradio as gr
 import random
 import torch
 import cv2
diff --git a/swarms/tools/autogpt.py b/swarms/tools/autogpt.py
index 2ed041c7..13b074ed 100644
--- a/swarms/tools/autogpt.py
+++ b/swarms/tools/autogpt.py
@@ -17,12 +17,11 @@ from langchain.chains.qa_with_sources.loading import BaseCombineDocumentsChain
 from langchain.chat_models import ChatOpenAI
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.tools import BaseTool
+from langchain.tools.file_management.read import ReadFileTool
+from langchain.tools.file_management.write import WriteFileTool
 from pydantic import Field
 
 from swarms.utils.logger import logger
-from langchain.tools.file_management.write import WriteFileTool
-from langchain.tools.file_management.read import ReadFileTool
-
 
 
 @contextmanager
@@ -141,6 +140,7 @@ query_website_tool = WebpageQATool(qa_chain=load_qa_with_sources_chain(llm))
 # code_intepret = CodeInterpreter()
 import interpreter
 
+
 @tool
 def compile(task: str):
     """
@@ -169,41 +169,42 @@
 
 
 # mm model workers
-import torch
-from PIL import Image
-from transformers import (
-    BlipForQuestionAnswering,
-    BlipProcessor,
-)
-
-@tool
-def VQAinference(self, inputs):
-    """
-    Answer Question About The Image, VQA Multi-Modal Worker agent
-    description="useful when you need an answer for a question based on an image. "
-    "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
-    "The input to this tool should be a comma separated string of two, representing the image_path and the question",
+# import torch
+# from PIL import Image
+# from transformers import (
+#     BlipForQuestionAnswering,
+#     BlipProcessor,
+# )
+
+
+# @tool
+# def VQAinference(self, inputs):
+#     """
+#     Answer Question About The Image, VQA Multi-Modal Worker agent
+#     description="useful when you need an answer for a question based on an image. "
+#     "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
+#     "The input to this tool should be a comma separated string of two, representing the image_path and the question",
 
-    """
-    device = "cuda:0"
-    torch_dtype = torch.float16 if "cuda" in device else torch.float32
-    processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
-    model = BlipForQuestionAnswering.from_pretrained(
-        "Salesforce/blip-vqa-base", torch_dtype=torch_dtype
-    ).to(device)
-
-    image_path, question = inputs.split(",")
-    raw_image = Image.open(image_path).convert("RGB")
-    inputs = processor(raw_image, question, return_tensors="pt").to(
-        device, torch_dtype
-    )
-    out = model.generate(**inputs)
-    answer = processor.decode(out[0], skip_special_tokens=True)
-
-    logger.debug(
-        f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
-        f"Output Answer: {answer}"
-    )
-
-    return answer
+#     """
+#     device = "cuda:0"
+#     torch_dtype = torch.float16 if "cuda" in device else torch.float32
+#     processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+#     model = BlipForQuestionAnswering.from_pretrained(
+#         "Salesforce/blip-vqa-base", torch_dtype=torch_dtype
+#     ).to(device)
+
+#     image_path, question = inputs.split(",")
+#     raw_image = Image.open(image_path).convert("RGB")
+#     inputs = processor(raw_image, question, return_tensors="pt").to(
+#         device, torch_dtype
+#     )
+#     out = model.generate(**inputs)
+#     answer = processor.decode(out[0], skip_special_tokens=True)
+
+#     logger.debug(
+#         f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
+#         f"Output Answer: {answer}"
+#     )
+
+#     return answer
diff --git a/swarms/workers/worker.py b/swarms/workers/worker.py
index f4651b9d..692b3302 100644
--- a/swarms/workers/worker.py
+++ b/swarms/workers/worker.py
@@ -174,7 +174,7 @@ class Worker:
             query_website_tool,
             HumanInputRun(),
             compile,
-            VQAinference
+            # VQAinference
         ]
         if external_tools is not None:
             self.tools.extend(external_tools)