diff --git a/mm_agent_example.py b/mm_agent_example.py
index 3177939c..8a7ca8c8 100644
--- a/mm_agent_example.py
+++ b/mm_agent_example.py
@@ -1,7 +1,7 @@
 from swarms.agents import MultiModalAgent
 
 load_dict = {
-    "ImageCaptioning": "cuda"
+    "ImageCaptioning": "cpu"
 }
 
 node = MultiModalAgent(load_dict)
diff --git a/requirements.txt b/requirements.txt
index 138b4c0c..61717a95 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,6 +28,32 @@ colored
+addict
+albumentations
+basicsr
+controlnet-aux
+diffusers
+einops
+imageio
+imageio-ffmpeg
+invisible-watermark
+kornia
+numpy
+omegaconf
+open_clip_torch
+openai
+opencv-python
+prettytable
+safetensors
+streamlit
+test-tube
+timm
+torchmetrics
+transformers
+webdataset
+yapf
+
+
 mkdocs
 mkdocs-material
 mkdocs-glightbox
diff --git a/swarms/agents/multi_modal_visual_agent.py b/swarms/agents/multi_modal_visual_agent.py
index 2ea3e5c3..0c9b4506 100644
--- a/swarms/agents/multi_modal_visual_agent.py
+++ b/swarms/agents/multi_modal_visual_agent.py
@@ -1,5 +1,4 @@
 import os
-import gradio as gr
 import random
 import torch
 import cv2
diff --git a/swarms/tools/autogpt.py b/swarms/tools/autogpt.py
index 2ed041c7..13b074ed 100644
--- a/swarms/tools/autogpt.py
+++ b/swarms/tools/autogpt.py
@@ -17,12 +17,11 @@ from langchain.chains.qa_with_sources.loading import BaseCombineDocumentsChain
 from langchain.chat_models import ChatOpenAI
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.tools import BaseTool
+from langchain.tools.file_management.read import ReadFileTool
+from langchain.tools.file_management.write import WriteFileTool
 from pydantic import Field
 
 from swarms.utils.logger import logger
-from langchain.tools.file_management.write import WriteFileTool
-from langchain.tools.file_management.read import ReadFileTool
-
 
 
 @contextmanager
@@ -141,6 +140,7 @@ query_website_tool = WebpageQATool(qa_chain=load_qa_with_sources_chain(llm))
 # code_intepret = CodeInterpreter()
 import interpreter
 
+
 @tool
 def compile(task: str):
     """
@@ -169,41 +169,42 @@
 
 
 # mm model workers
-import torch
-from PIL import Image
-from transformers import (
-    BlipForQuestionAnswering,
-    BlipProcessor,
-)
-
-@tool
-def VQAinference(self, inputs):
-    """
-    Answer Question About The Image, VQA Multi-Modal Worker agent
-    description="useful when you need an answer for a question based on an image. "
-    "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
-    "The input to this tool should be a comma separated string of two, representing the image_path and the question",
+# import torch
+# from PIL import Image
+# from transformers import (
+#     BlipForQuestionAnswering,
+#     BlipProcessor,
+# )
+
+
+# @tool
+# def VQAinference(self, inputs):
+#     """
+#     Answer Question About The Image, VQA Multi-Modal Worker agent
+#     description="useful when you need an answer for a question based on an image. "
+#     "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
+#     "The input to this tool should be a comma separated string of two, representing the image_path and the question",
 
-    """
-    device = "cuda:0"
-    torch_dtype = torch.float16 if "cuda" in device else torch.float32
-    processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
-    model = BlipForQuestionAnswering.from_pretrained(
-        "Salesforce/blip-vqa-base", torch_dtype=torch_dtype
-    ).to(device)
-
-    image_path, question = inputs.split(",")
-    raw_image = Image.open(image_path).convert("RGB")
-    inputs = processor(raw_image, question, return_tensors="pt").to(
-        device, torch_dtype
-    )
-    out = model.generate(**inputs)
-    answer = processor.decode(out[0], skip_special_tokens=True)
-
-    logger.debug(
-        f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
-        f"Output Answer: {answer}"
-    )
-
-    return answer
+#     """
+#     device = "cuda:0"
+#     torch_dtype = torch.float16 if "cuda" in device else torch.float32
+#     processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+#     model = BlipForQuestionAnswering.from_pretrained(
+#         "Salesforce/blip-vqa-base", torch_dtype=torch_dtype
+#     ).to(device)
+
+#     image_path, question = inputs.split(",")
+#     raw_image = Image.open(image_path).convert("RGB")
+#     inputs = processor(raw_image, question, return_tensors="pt").to(
+#         device, torch_dtype
+#     )
+#     out = model.generate(**inputs)
+#     answer = processor.decode(out[0], skip_special_tokens=True)
+
+#     logger.debug(
+#         f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
+#         f"Output Answer: {answer}"
+#     )
+
+#     return answer
diff --git a/swarms/workers/worker.py b/swarms/workers/worker.py
index f4651b9d..692b3302 100644
--- a/swarms/workers/worker.py
+++ b/swarms/workers/worker.py
@@ -174,7 +174,7 @@ class Worker:
             query_website_tool,
             HumanInputRun(),
             compile,
-            VQAinference
+            # VQAinference
         ]
         if external_tools is not None:
             self.tools.extend(external_tools)