@@ -10,19 +10,17 @@ from langchain.agents import tool
from langchain.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.docstore.document import Document
from langchain.memory.chat_message_histories import FileChatMessageHistory
from langchain.tools.human.tool import HumanInputRun
ROOT_DIR = "./data/"
from langchain.chains.qa_with_sources.loading import BaseCombineDocumentsChain
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools import BaseTool, DuckDuckGoSearchRun
from langchain.tools.file_management.read import ReadFileTool
from langchain.tools.file_management.write import WriteFileTool
from langchain.tools import BaseTool
from pydantic import Field
from swarms.utils.logger import logger
llm = ChatOpenAI(model_name="gpt-4", temperature=1.0)
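# A minimal wiring sketch, assuming the classic LangChain `initialize_agent`
# API: how the search and file-management tools imported above could be
# combined with the GPT-4 model. The agent type and task string are
# illustrative assumptions, not code from this change.
from langchain.agents import AgentType, initialize_agent

tools = [
    DuckDuckGoSearchRun(),
    ReadFileTool(root_dir=ROOT_DIR),
    WriteFileTool(root_dir=ROOT_DIR),
]
agent = initialize_agent(
    tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True
)
agent.run("Search for the latest LangChain release and save a short summary to notes.txt")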
@@ -161,3 +159,48 @@ def compile(task: str):
    interpreter.chat()
    interpreter.reset()
# mm model workers
import os
import torch
from PIL import Image
from transformers import (
    BlipForQuestionAnswering,
    BlipProcessor,
)
@tool
def VQAinference(inputs: str) -> str:
    """
    Answer Question About The Image, VQA Multi-Modal Worker agent.

    Useful when you need an answer for a question based on an image,
    e.g. "what is the background color of the last image", "how many
    cats are in this figure", "what is in this figure".

    The input to this tool should be a comma separated string of two,
    representing the image_path and the question.
    """
    device = "cuda:0"
    torch_dtype = torch.float16 if "cuda" in device else torch.float32
    # The BLIP processor and model are reloaded on every call; caching them at
    # module level would avoid reloading the weights for each question.
    processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
    model = BlipForQuestionAnswering.from_pretrained(
        "Salesforce/blip-vqa-base", torch_dtype=torch_dtype
    ).to(device)

    image_path, question = inputs.split(",")
    raw_image = Image.open(image_path).convert("RGB")
    inputs = processor(raw_image, question, return_tensors="pt").to(device, torch_dtype)
    out = model.generate(**inputs)
    answer = processor.decode(out[0], skip_special_tokens=True)

    logger.debug(
        f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, "
        f"Input Question: {question}, Output Answer: {answer}"
    )

    return answer
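# A minimal usage sketch for the tool above, assuming LangChain's standard
# BaseTool.run() entry point; the image path is a hypothetical placeholder.
# The comma-separated string packs the image path and the question together,
# matching the input format the tool's docstring describes.
answer = VQAinference.run("./data/cats.jpg, how many cats are in this figure?")
print(answer)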