From 0eb927a4378dfc10a0c12d5267d71e65f6391dbc Mon Sep 17 00:00:00 2001
From: Kye
Date: Sat, 23 Sep 2023 15:54:41 -0400
Subject: [PATCH] vqa

Former-commit-id: 5bb5f1f23c901ed4c513bba8f4a68957251a6682
---
 swarms/tools/autogpt.py  | 53 ++++++++++++++++++++++++++++++++++++----
 swarms/workers/worker.py |  6 +++--
 2 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/swarms/tools/autogpt.py b/swarms/tools/autogpt.py
index bca03c16..a0451060 100644
--- a/swarms/tools/autogpt.py
+++ b/swarms/tools/autogpt.py
@@ -10,19 +10,17 @@ from langchain.agents import tool
 from langchain.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent
 from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
 from langchain.docstore.document import Document
-from langchain.memory.chat_message_histories import FileChatMessageHistory
-from langchain.tools.human.tool import HumanInputRun
 
 ROOT_DIR = "./data/"
 
 from langchain.chains.qa_with_sources.loading import BaseCombineDocumentsChain
 from langchain.chat_models import ChatOpenAI
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.tools import BaseTool, DuckDuckGoSearchRun
-from langchain.tools.file_management.read import ReadFileTool
-from langchain.tools.file_management.write import WriteFileTool
+from langchain.tools import BaseTool
 from pydantic import Field
 
+from swarms.utils.logger import logger
+
 llm = ChatOpenAI(model_name="gpt-4", temperature=1.0)
 
 
@@ -161,3 +159,48 @@ def compile(task: str):
     interpreter.chat()
     interpreter.reset()
 
+
+
+
+
+# mm model workers
+import os
+
+import torch
+from PIL import Image
+from transformers import (
+    BlipForQuestionAnswering,
+    BlipProcessor,
+)
+
+@tool
+def VQAinference(inputs):
+    """
+    Answer a question about an image (VQA multi-modal worker agent).
+
+    Useful when you need an answer to a question based on an image, e.g. "what is
+    the background color of the last image" or "how many cats are in this figure".
+    The input should be a comma-separated string of two values: the image path and the question.
+    """
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    torch_dtype = torch.float16 if "cuda" in device else torch.float32
+    processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+    model = BlipForQuestionAnswering.from_pretrained(
+        "Salesforce/blip-vqa-base", torch_dtype=torch_dtype
+    ).to(device)
+
+    image_path, question = inputs.split(",", 1)
+    raw_image = Image.open(image_path).convert("RGB")
+    inputs = processor(raw_image, question, return_tensors="pt").to(
+        device, torch_dtype
+    )
+    out = model.generate(**inputs)
+    answer = processor.decode(out[0], skip_special_tokens=True)
+
+    logger.debug(
+        f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
+        f"Output Answer: {answer}"
+    )
+
+    return answer
+
diff --git a/swarms/workers/worker.py b/swarms/workers/worker.py
index ed8a1b03..b983ac96 100644
--- a/swarms/workers/worker.py
+++ b/swarms/workers/worker.py
@@ -12,7 +12,8 @@ from swarms.tools.autogpt import (
     process_csv,
     # web_search,
     query_website_tool,
-    compile
+    compile,
+    VQAinference
 )
 
 from swarms.utils.decorators import error_decorator, log_decorator, timing_decorator
@@ -79,7 +80,8 @@ class Worker:
             #email
             #pdf
             # Tool(name="Goal Decomposition Tool", func=todo_chain.run, description="Use Case: Decompose ambitious goals into as many explicit and well defined tasks for an AI agent to follow. Rules and Regulations, don't use this tool too often only in the beginning when the user grants you a mission."),
-            compile
+            compile,
+            VQAinference
         ]
         if external_tools is not None:
             self.tools.extend(external_tools)
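Note: below is a minimal usage sketch for the new tool, not part of the patch itself. The image path and question are hypothetical placeholders, and a valid OPENAI_API_KEY is assumed to be set, since importing swarms.tools.autogpt instantiates ChatOpenAI at module load:

# Minimal sketch: calling the new VQA tool directly, outside an agent loop.
# Assumptions: ./data/test.jpg is a placeholder image that exists on disk,
# and OPENAI_API_KEY is set (the module builds a ChatOpenAI at import time).
from swarms.tools.autogpt import VQAinference

# The @tool decorator wraps the function as a LangChain tool, so it is
# invoked via .run() with a single "image_path,question" string.
answer = VQAinference.run("./data/test.jpg,how many cats are in this image?")
print(answer)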