pull/58/head
Kye 1 year ago
parent 9b5414fbc4
commit cc041dcf77

@ -1,7 +1,7 @@
from swarms.agents import MultiModalAgent
# Configuration dict passed to MultiModalAgent below; presumably maps a worker
# name to the device it should run on -- TODO confirm against MultiModalAgent.
# NOTE(review): the two "ImageCaptioning" entries look like the removed ("cuda")
# and added ("cpu") sides of a diff hunk. As written there is no comma between
# them, so only one of the two lines should exist in the real file.
load_dict = {
"ImageCaptioning": "cuda"
"ImageCaptioning": "cpu"
}
# Build the agent from the configuration above.
node = MultiModalAgent(load_dict)

@ -28,6 +28,32 @@ colored
addict
albumentations
basicsr
controlnet-aux
diffusers
einops
imageio
imageio-ffmpeg
invisible-watermark
kornia
numpy
omegaconf
open_clip_torch
openai
opencv-python
prettytable
safetensors
streamlit
test-tube
timm
torchmetrics
transformers
webdataset
yapf
mkdocs
mkdocs-material
mkdocs-glightbox

@ -1,5 +1,4 @@
import os
import gradio as gr
import random
import torch
import cv2

@ -17,12 +17,11 @@ from langchain.chains.qa_with_sources.loading import BaseCombineDocumentsChain
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools import BaseTool
from langchain.tools.file_management.read import ReadFileTool
from langchain.tools.file_management.write import WriteFileTool
from pydantic import Field
from swarms.utils.logger import logger
from langchain.tools.file_management.write import WriteFileTool
from langchain.tools.file_management.read import ReadFileTool
@contextmanager
@ -141,6 +140,7 @@ query_website_tool = WebpageQATool(qa_chain=load_qa_with_sources_chain(llm))
# code_intepret = CodeInterpreter()
import interpreter
@tool
def compile(task: str):
"""
@ -169,41 +169,42 @@ def compile(task: str):
# mm model workers
import torch
from PIL import Image
from transformers import (
BlipForQuestionAnswering,
BlipProcessor,
)
@tool
def VQAinference(self, inputs):
    """
    Answer Question About The Image, VQA Multi-Modal Worker agent
    description="useful when you need an answer for a question based on an image. "
    "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
    "The input to this tool should be a comma separated string of two, representing the image_path and the question",
    """
    # The docstring wording above is kept as-is: it is presumably what `@tool`
    # exposes as the tool description, so rewording it could change what the
    # agent is told.
    # NOTE(review): `self` is kept so the signature stays unchanged, but the
    # body never reads it -- confirm whether callers of this tool supply it.
    #
    # Parameters:
    #   inputs: "image_path,question" -- everything after the FIRST comma is
    #           treated as the question.
    # Returns:
    #   The decoded answer string produced by the BLIP VQA model.
    # Raises:
    #   ValueError: if `inputs` contains no comma.

    # Parse the request before loading any weights so malformed input fails
    # fast. Splitting on the first comma only keeps questions that themselves
    # contain commas intact (a plain split(",") would fail to unpack them).
    image_path, separator, question = inputs.partition(",")
    if not separator:
        raise ValueError(
            "VQAinference expects 'image_path,question' but found no comma in: "
            f"{inputs!r}"
        )
    image_path = image_path.strip()
    question = question.strip()

    # Prefer the first CUDA device, but fall back to CPU so the tool still
    # works on machines without a GPU. Half precision is only used on CUDA.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if "cuda" in device else torch.float32

    # NOTE: the processor and model are loaded on every call.
    processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
    model = BlipForQuestionAnswering.from_pretrained(
        "Salesforce/blip-vqa-base", torch_dtype=torch_dtype
    ).to(device)

    # Use a context manager so the underlying file handle is closed as soon
    # as the RGB copy has been made.
    with Image.open(image_path) as image_file:
        raw_image = image_file.convert("RGB")

    # Keep the processor output under its own name instead of rebinding the
    # `inputs` parameter.
    model_inputs = processor(raw_image, question, return_tensors="pt").to(
        device, torch_dtype
    )
    out = model.generate(**model_inputs)
    answer = processor.decode(out[0], skip_special_tokens=True)

    logger.debug(
        f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
        f"Output Answer: {answer}"
    )
    return answer
# import torch
# from PIL import Image
# from transformers import (
# BlipForQuestionAnswering,
# BlipProcessor,
# )
# @tool
# def VQAinference(self, inputs):
# """
# Answer Question About The Image, VQA Multi-Modal Worker agent
# description="useful when you need an answer for a question based on an image. "
# "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
# "The input to this tool should be a comma separated string of two, representing the image_path and the question",
# """
# device = "cuda:0"
# torch_dtype = torch.float16 if "cuda" in device else torch.float32
# processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
# model = BlipForQuestionAnswering.from_pretrained(
# "Salesforce/blip-vqa-base", torch_dtype=torch_dtype
# ).to(device)
# image_path, question = inputs.split(",")
# raw_image = Image.open(image_path).convert("RGB")
# inputs = processor(raw_image, question, return_tensors="pt").to(
# device, torch_dtype
# )
# out = model.generate(**inputs)
# answer = processor.decode(out[0], skip_special_tokens=True)
# logger.debug(
# f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
# f"Output Answer: {answer}"
# )
# return answer

@ -174,7 +174,7 @@ class Worker:
query_website_tool,
HumanInputRun(),
compile,
VQAinference
# VQAinference
]
if external_tools is not None:
self.tools.extend(external_tools)

Loading…
Cancel
Save