@@ -65,7 +65,7 @@ logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 handler = logging.StreamHandler()
 handler.setLevel(logging.INFO)
-formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 handler.setFormatter(formatter)
 logger.addHandler(handler)
@@ -100,10 +100,16 @@ def load_pipes(local_deployment):
     if local_deployment in ["full"]:
         other_pipes = {
             "nlpconnect/vit-gpt2-image-captioning": {
-                "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
-                "feature_extractor": ViTImageProcessor.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
-                "tokenizer": AutoTokenizer.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
-                "device": device
+                "model": VisionEncoderDecoderModel.from_pretrained(
+                    f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"
+                ),
+                "feature_extractor": ViTImageProcessor.from_pretrained(
+                    f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"
+                ),
+                "tokenizer": AutoTokenizer.from_pretrained(
+                    f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"
+                ),
+                "device": device,
             },
             # "Salesforce/blip-image-captioning-large": {
             #     "model": BlipForConditionalGeneration.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
@@ -111,8 +117,12 @@ def load_pipes(local_deployment):
             #     "device": device
             # },
             "damo-vilab/text-to-video-ms-1.7b": {
-                "model": DiffusionPipeline.from_pretrained(f"{local_fold}/damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"),
-                "device": device
+                "model": DiffusionPipeline.from_pretrained(
+                    f"{local_fold}/damo-vilab/text-to-video-ms-1.7b",
+                    torch_dtype=torch.float16,
+                    variant="fp16",
+                ),
+                "device": device,
             },
             # "facebook/maskformer-swin-large-ade": {
             #     "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-large-ade"),
@@ -130,16 +140,22 @@ def load_pipes(local_deployment):
             #     "device": device
             # },
             "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k": {
-                "model": BaseModel.from_pretrained("JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"),
-                "device": device
+                "model": BaseModel.from_pretrained(
+                    "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"
+                ),
+                "device": device,
             },
             "espnet/kan-bayashi_ljspeech_vits": {
-                "model": Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits"),
-                "device": device
+                "model": Text2Speech.from_pretrained(
+                    "espnet/kan-bayashi_ljspeech_vits"
+                ),
+                "device": device,
             },
             "lambdalabs/sd-image-variations-diffusers": {
-                "model": DiffusionPipeline.from_pretrained(f"{local_fold}/lambdalabs/sd-image-variations-diffusers"), # torch_dtype=torch.float16
-                "device": device
+                "model": DiffusionPipeline.from_pretrained(
+                    f"{local_fold}/lambdalabs/sd-image-variations-diffusers"
+                ),  # torch_dtype=torch.float16
+                "device": device,
             },
             # "CompVis/stable-diffusion-v1-4": {
             #     "model": DiffusionPipeline.from_pretrained(f"{local_fold}/CompVis/stable-diffusion-v1-4"),
@@ -150,8 +166,10 @@ def load_pipes(local_deployment):
             #     "device": device
             # },
             "runwayml/stable-diffusion-v1-5": {
-                "model": DiffusionPipeline.from_pretrained(f"{local_fold}/runwayml/stable-diffusion-v1-5"),
-                "device": device
+                "model": DiffusionPipeline.from_pretrained(
+                    f"{local_fold}/runwayml/stable-diffusion-v1-5"
+                ),
+                "device": device,
             },
             # "microsoft/speecht5_tts":{
             #     "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
@@ -165,11 +183,19 @@ def load_pipes(local_deployment):
             #     "device": device
             # },
             "microsoft/speecht5_vc": {
-                "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"),
-                "model": SpeechT5ForSpeechToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"),
-                "vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"),
-                "embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"),
-                "device": device
+                "processor": SpeechT5Processor.from_pretrained(
+                    f"{local_fold}/microsoft/speecht5_vc"
+                ),
+                "model": SpeechT5ForSpeechToSpeech.from_pretrained(
+                    f"{local_fold}/microsoft/speecht5_vc"
+                ),
+                "vocoder": SpeechT5HifiGan.from_pretrained(
+                    f"{local_fold}/microsoft/speecht5_hifigan"
+                ),
+                "embeddings_dataset": load_dataset(
+                    f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"
+                ),
+                "device": device,
             },
             # "julien-c/wine-quality": {
             #     "model": joblib.load(cached_download(hf_hub_url("julien-c/wine-quality", "sklearn_model.joblib")))
@@ -180,15 +206,23 @@ def load_pipes(local_deployment):
             #     "device": device
             # },
             "facebook/maskformer-swin-base-coco": {
-                "feature_extractor": MaskFormerFeatureExtractor.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"),
-                "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"),
-                "device": device
+                "feature_extractor": MaskFormerFeatureExtractor.from_pretrained(
+                    f"{local_fold}/facebook/maskformer-swin-base-coco"
+                ),
+                "model": MaskFormerForInstanceSegmentation.from_pretrained(
+                    f"{local_fold}/facebook/maskformer-swin-base-coco"
+                ),
+                "device": device,
             },
             "Intel/dpt-hybrid-midas": {
-                "model": DPTForDepthEstimation.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas", low_cpu_mem_usage=True),
-                "feature_extractor": DPTFeatureExtractor.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas"),
-                "device": device
-            }
+                "model": DPTForDepthEstimation.from_pretrained(
+                    f"{local_fold}/Intel/dpt-hybrid-midas", low_cpu_mem_usage=True
+                ),
+                "feature_extractor": DPTFeatureExtractor.from_pretrained(
+                    f"{local_fold}/Intel/dpt-hybrid-midas"
+                ),
+                "device": device,
+            },
         }

     if local_deployment in ["full", "standard"]:
@@ -198,36 +232,53 @@ def load_pipes(local_deployment):
             #     "device": device
             # },
             "openai/whisper-base": {
-                "model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/openai/whisper-base"),
-                "device": device
+                "model": pipeline(
+                    task="automatic-speech-recognition",
+                    model=f"{local_fold}/openai/whisper-base",
+                ),
+                "device": device,
             },
             "microsoft/speecht5_asr": {
-                "model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/microsoft/speecht5_asr"),
-                "device": device
+                "model": pipeline(
+                    task="automatic-speech-recognition",
+                    model=f"{local_fold}/microsoft/speecht5_asr",
+                ),
+                "device": device,
             },
             "Intel/dpt-large": {
-                "model": pipeline(task="depth-estimation", model=f"{local_fold}/Intel/dpt-large"),
-                "device": device
+                "model": pipeline(
+                    task="depth-estimation", model=f"{local_fold}/Intel/dpt-large"
+                ),
+                "device": device,
             },
             # "microsoft/beit-base-patch16-224-pt22k-ft22k": {
             #     "model": pipeline(task="image-classification", model=f"{local_fold}/microsoft/beit-base-patch16-224-pt22k-ft22k"),
             #     "device": device
             # },
             "facebook/detr-resnet-50-panoptic": {
-                "model": pipeline(task="image-segmentation", model=f"{local_fold}/facebook/detr-resnet-50-panoptic"),
-                "device": device
+                "model": pipeline(
+                    task="image-segmentation",
+                    model=f"{local_fold}/facebook/detr-resnet-50-panoptic",
+                ),
+                "device": device,
             },
             "facebook/detr-resnet-101": {
-                "model": pipeline(task="object-detection", model=f"{local_fold}/facebook/detr-resnet-101"),
-                "device": device
+                "model": pipeline(
+                    task="object-detection",
+                    model=f"{local_fold}/facebook/detr-resnet-101",
+                ),
+                "device": device,
             },
             # "openai/clip-vit-large-patch14": {
             #     "model": pipeline(task="zero-shot-image-classification", model=f"{local_fold}/openai/clip-vit-large-patch14"),
             #     "device": device
             # },
             "google/owlvit-base-patch32": {
-                "model": pipeline(task="zero-shot-object-detection", model=f"{local_fold}/google/owlvit-base-patch32"),
-                "device": device
+                "model": pipeline(
+                    task="zero-shot-object-detection",
+                    model=f"{local_fold}/google/owlvit-base-patch32",
+                ),
+                "device": device,
             },
             # "microsoft/DialoGPT-medium": {
             #     "model": pipeline(task="conversational", model=f"{local_fold}/microsoft/DialoGPT-medium"),
@@ -270,86 +321,121 @@ def load_pipes(local_deployment):
             #     "device": device
             # },
             "impira/layoutlm-document-qa": {
-                "model": pipeline(task="document-question-answering", model=f"{local_fold}/impira/layoutlm-document-qa"),
-                "device": device
+                "model": pipeline(
+                    task="document-question-answering",
+                    model=f"{local_fold}/impira/layoutlm-document-qa",
+                ),
+                "device": device,
             },
             "ydshieh/vit-gpt2-coco-en": {
-                "model": pipeline(task="image-to-text", model=f"{local_fold}/ydshieh/vit-gpt2-coco-en"),
-                "device": device
+                "model": pipeline(
+                    task="image-to-text", model=f"{local_fold}/ydshieh/vit-gpt2-coco-en"
+                ),
+                "device": device,
             },
             "dandelin/vilt-b32-finetuned-vqa": {
-                "model": pipeline(task="visual-question-answering", model=f"{local_fold}/dandelin/vilt-b32-finetuned-vqa"),
-                "device": device
-            }
+                "model": pipeline(
+                    task="visual-question-answering",
+                    model=f"{local_fold}/dandelin/vilt-b32-finetuned-vqa",
+                ),
+                "device": device,
+            },
         }

     if local_deployment in ["full", "standard", "minimal"]:
-        controlnet = ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+        controlnet = ControlNetModel.from_pretrained(
+            f"{local_fold}/lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16
+        )
         controlnetpipe = StableDiffusionControlNetPipeline.from_pretrained(
-            f"{local_fold}/runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
+            f"{local_fold}/runwayml/stable-diffusion-v1-5",
+            controlnet=controlnet,
+            torch_dtype=torch.float16,
         )

         def mlsd_control_network():
             model = MobileV2_MLSD_Large()
-            model.load_state_dict(torch.load(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/mlsd_large_512_fp32.pth"), strict=True)
+            model.load_state_dict(
+                torch.load(
+                    f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/mlsd_large_512_fp32.pth"
+                ),
+                strict=True,
+            )
             return MLSDdetector(model)

-        hed_network = Network(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/network-bsds500.pth")
+        hed_network = Network(
+            f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/network-bsds500.pth"
+        )

         controlnet_sd_pipes = {
             "openpose-control": {
-                "model": OpenposeDetector(Body(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/body_pose_model.pth"))
-            },
-            "mlsd-control": {
-                "model": mlsd_control_network()
-            },
-            "hed-control": {
-                "model": HEDdetector(hed_network)
-            },
-            "scribble-control": {
-                "model": HEDdetector(hed_network)
-            },
+                "model": OpenposeDetector(
+                    Body(
+                        f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/body_pose_model.pth"
+                    )
+                )
+            },
+            "mlsd-control": {"model": mlsd_control_network()},
+            "hed-control": {"model": HEDdetector(hed_network)},
+            "scribble-control": {"model": HEDdetector(hed_network)},
             "midas-control": {
-                "model": MidasDetector(model_path=f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt")
-            },
-            "canny-control": {
-                "model": CannyDetector()
-            },
+                "model": MidasDetector(
+                    model_path=f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt"
+                )
+            },
+            "canny-control": {"model": CannyDetector()},
             "lllyasviel/sd-controlnet-canny": {
                 "control": controlnet,
                 "model": controlnetpipe,
-                "device": device
+                "device": device,
             },
             "lllyasviel/sd-controlnet-depth": {
-                "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16),
+                "control": ControlNetModel.from_pretrained(
+                    f"{local_fold}/lllyasviel/sd-controlnet-depth",
+                    torch_dtype=torch.float16,
+                ),
                 "model": controlnetpipe,
-                "device": device
+                "device": device,
             },
             "lllyasviel/sd-controlnet-hed": {
-                "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-hed", torch_dtype=torch.float16),
+                "control": ControlNetModel.from_pretrained(
+                    f"{local_fold}/lllyasviel/sd-controlnet-hed",
+                    torch_dtype=torch.float16,
+                ),
                 "model": controlnetpipe,
-                "device": device
+                "device": device,
             },
             "lllyasviel/sd-controlnet-mlsd": {
-                "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-mlsd", torch_dtype=torch.float16),
+                "control": ControlNetModel.from_pretrained(
+                    f"{local_fold}/lllyasviel/sd-controlnet-mlsd",
+                    torch_dtype=torch.float16,
+                ),
                 "model": controlnetpipe,
-                "device": device
+                "device": device,
             },
             "lllyasviel/sd-controlnet-openpose": {
-                "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16),
+                "control": ControlNetModel.from_pretrained(
+                    f"{local_fold}/lllyasviel/sd-controlnet-openpose",
+                    torch_dtype=torch.float16,
+                ),
                 "model": controlnetpipe,
-                "device": device
+                "device": device,
             },
             "lllyasviel/sd-controlnet-scribble": {
-                "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-scribble", torch_dtype=torch.float16),
+                "control": ControlNetModel.from_pretrained(
+                    f"{local_fold}/lllyasviel/sd-controlnet-scribble",
+                    torch_dtype=torch.float16,
+                ),
                 "model": controlnetpipe,
-                "device": device
+                "device": device,
             },
             "lllyasviel/sd-controlnet-seg": {
-                "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-seg", torch_dtype=torch.float16),
+                "control": ControlNetModel.from_pretrained(
+                    f"{local_fold}/lllyasviel/sd-controlnet-seg",
+                    torch_dtype=torch.float16,
+                ),
                 "model": controlnetpipe,
-                "device": device
-            }
+                "device": device,
+            },
         }
     pipes = {**standard_pipes, **other_pipes, **controlnet_sd_pipes}
     return pipes
@@ -363,14 +449,17 @@ during = end - start
 print(f"[ ready ] {during}s")


-@app.route('/running', methods=['GET'])
+@app.route("/running", methods=["GET"])
 def running():
     return jsonify({"running": True})


-@app.route('/status/<path:model_id>', methods=['GET'])
+@app.route("/status/<path:model_id>", methods=["GET"])
 def status(model_id):
-    disabled_models = ["microsoft/trocr-base-printed", "microsoft/trocr-base-handwritten"]
+    disabled_models = [
+        "microsoft/trocr-base-printed",
+        "microsoft/trocr-base-handwritten",
+    ]
     if model_id in pipes.keys() and model_id not in disabled_models:
         print(f"[ check {model_id} ] success")
         return jsonify({"loaded": True})
@@ -379,7 +468,7 @@ def status(model_id):
         return jsonify({"loaded": False})


-@app.route('/models/<path:model_id>', methods=['POST'])
+@app.route("/models/<path:model_id>", methods=["POST"])
 def models(model_id):
     while "using" in pipes[model_id] and pipes[model_id]["using"]:
         print(f"[ inference {model_id} ] waiting")
@@ -402,23 +491,29 @@ def models(model_id):
     try:
         # text to video
         if model_id == "damo-vilab/text-to-video-ms-1.7b":
-            pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+            pipe.scheduler = DPMSolverMultistepScheduler.from_config(
+                pipe.scheduler.config
+            )
             # pipe.enable_model_cpu_offload()
             prompt = request.get_json()["text"]
             video_frames = pipe(prompt, num_inference_steps=50, num_frames=40).frames
             video_path = export_to_video(video_frames)
             file_name = str(uuid.uuid4())[:4]
-            os.system(f"LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i {video_path} -vcodec libx264 public/videos/{file_name}.mp4")
+            os.system(
+                f"LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i {video_path} -vcodec libx264 public/videos/{file_name}.mp4"
+            )
             result = {"path": f"/videos/{file_name}.mp4"}

         # controlnet
         if model_id.startswith("lllyasviel/sd-controlnet-"):
-            pipe.controlnet.to('cpu')
+            pipe.controlnet.to("cpu")
             pipe.controlnet = pipes[model_id]["control"].to(pipes[model_id]["device"])
             pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
             control_image = load_image(request.get_json()["img_url"])
             # generator = torch.manual_seed(66)
-            out_image: Image = pipe(request.get_json()["text"], num_inference_steps=20, image=control_image).images[0]
+            out_image: Image = pipe(
+                request.get_json()["text"], num_inference_steps=20, image=control_image
+            ).images[0]
             file_name = str(uuid.uuid4())[:4]
             out_image.save(f"public/images/{file_name}.png")
             result = {"path": f"/images/{file_name}.png"}
@@ -441,17 +536,20 @@ def models(model_id):
             file_name = str(uuid.uuid4())[:4]
             with open(f"public/images/{file_name}.png", "wb") as f:
                 f.write(request.data)
-            tform = transforms.Compose([
-                transforms.ToTensor(),
-                transforms.Resize(
-                    (224, 224),
-                    interpolation=transforms.InterpolationMode.BICUBIC,
-                    antialias=False,
-                    ),
-                transforms.Normalize(
-                    [0.48145466, 0.4578275, 0.40821073],
-                    [0.26862954, 0.26130258, 0.27577711]),
-            ])
+            tform = transforms.Compose(
+                [
+                    transforms.ToTensor(),
+                    transforms.Resize(
+                        (224, 224),
+                        interpolation=transforms.InterpolationMode.BICUBIC,
+                        antialias=False,
+                    ),
+                    transforms.Normalize(
+                        [0.48145466, 0.4578275, 0.40821073],
+                        [0.26862954, 0.26130258, 0.27577711],
+                    ),
+                ]
+            )
             inp = tform(im).to(pipes[model_id]["device"]).unsqueeze(0)
             out = pipe(inp, guidance_scale=3)
             out["images"][0].save(f"public/images/{file_name}.jpg")
@@ -459,30 +557,47 @@ def models(model_id):

         # image to text
         if model_id == "Salesforce/blip-image-captioning-large":
-            raw_image = load_image(request.get_json()["img_url"]).convert('RGB')
+            raw_image = load_image(request.get_json()["img_url"]).convert("RGB")
             text = request.get_json()["text"]
-            inputs = pipes[model_id]["processor"](raw_image, return_tensors="pt").to(pipes[model_id]["device"])
+            inputs = pipes[model_id]["processor"](raw_image, return_tensors="pt").to(
+                pipes[model_id]["device"]
+            )
             out = pipe.generate(**inputs)
-            caption = pipes[model_id]["processor"].decode(out[0], skip_special_tokens=True)
+            caption = pipes[model_id]["processor"].decode(
+                out[0], skip_special_tokens=True
+            )
             result = {"generated text": caption}
         if model_id == "ydshieh/vit-gpt2-coco-en":
             img_url = request.get_json()["img_url"]
-            generated_text = pipe(img_url)[0]['generated_text']
+            generated_text = pipe(img_url)[0]["generated_text"]
             result = {"generated text": generated_text}
         if model_id == "nlpconnect/vit-gpt2-image-captioning":
             image = load_image(request.get_json()["img_url"]).convert("RGB")
-            pixel_values = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").pixel_values
+            pixel_values = pipes[model_id]["feature_extractor"](
+                images=image, return_tensors="pt"
+            ).pixel_values
             pixel_values = pixel_values.to(pipes[model_id]["device"])
-            generated_ids = pipe.generate(pixel_values, **{"max_length": 200, "num_beams": 1})
-            generated_text = pipes[model_id]["tokenizer"].batch_decode(generated_ids, skip_special_tokens=True)[0]
+            generated_ids = pipe.generate(
+                pixel_values, **{"max_length": 200, "num_beams": 1}
+            )
+            generated_text = pipes[model_id]["tokenizer"].batch_decode(
+                generated_ids, skip_special_tokens=True
+            )[0]
             result = {"generated text": generated_text}
         # image to text: OCR
-        if model_id == "microsoft/trocr-base-printed" or model_id == "microsoft/trocr-base-handwritten":
+        if (
+            model_id == "microsoft/trocr-base-printed"
+            or model_id == "microsoft/trocr-base-handwritten"
+        ):
             image = load_image(request.get_json()["img_url"]).convert("RGB")
-            pixel_values = pipes[model_id]["processor"](image, return_tensors="pt").pixel_values
+            pixel_values = pipes[model_id]["processor"](
+                image, return_tensors="pt"
+            ).pixel_values
             pixel_values = pixel_values.to(pipes[model_id]["device"])
             generated_ids = pipe.generate(pixel_values)
-            generated_text = pipes[model_id]["processor"].batch_decode(generated_ids, skip_special_tokens=True)[0]
+            generated_text = pipes[model_id]["processor"].batch_decode(
+                generated_ids, skip_special_tokens=True
+            )[0]
             result = {"generated text": generated_text}

         # text to image
@@ -494,9 +609,87 @@ def models(model_id):
             result = {"path": f"/images/{file_name}.jpg"}

         # object detection
-        if model_id == "google/owlvit-base-patch32" or model_id == "facebook/detr-resnet-101":
+        if (
+            model_id == "google/owlvit-base-patch32"
+            or model_id == "facebook/detr-resnet-101"
+        ):
             img_url = request.get_json()["img_url"]
-            open_types = ["cat", "couch", "person", "car", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird"]
+            open_types = [
+                "cat",
+                "couch",
+                "person",
+                "car",
+                "dog",
+                "horse",
+                "sheep",
+                "cow",
+                "elephant",
+                "bear",
+                "zebra",
+                "giraffe",
+                "backpack",
+                "umbrella",
+                "handbag",
+                "tie",
+                "suitcase",
+                "frisbee",
+                "skis",
+                "snowboard",
+                "sports ball",
+                "kite",
+                "baseball bat",
+                "baseball glove",
+                "skateboard",
+                "surfboard",
+                "tennis racket",
+                "bottle",
+                "wine glass",
+                "cup",
+                "fork",
+                "knife",
+                "spoon",
+                "bowl",
+                "banana",
+                "apple",
+                "sandwich",
+                "orange",
+                "broccoli",
+                "carrot",
+                "hot dog",
+                "pizza",
+                "donut",
+                "cake",
+                "chair",
+                "couch",
+                "potted plant",
+                "bed",
+                "dining table",
+                "toilet",
+                "tv",
+                "laptop",
+                "mouse",
+                "remote",
+                "keyboard",
+                "cell phone",
+                "microwave",
+                "oven",
+                "toaster",
+                "sink",
+                "refrigerator",
+                "book",
+                "clock",
+                "vase",
+                "scissors",
+                "teddy bear",
+                "hair drier",
+                "toothbrush",
+                "traffic light",
+                "fire hydrant",
+                "stop sign",
+                "parking meter",
+                "bench",
+                "bird",
+            ]
             result = pipe(img_url, candidate_labels=open_types)

         # VQA
@@ -514,14 +707,16 @@ def models(model_id):
         # depth-estimation
         if model_id == "Intel/dpt-large":
             output = pipe(request.get_json()["img_url"])
-            image = output['depth']
+            image = output["depth"]
             name = str(uuid.uuid4())[:4]
             image.save(f"public/images/{name}.jpg")
             result = {"path": f"/images/{name}.jpg"}

         if model_id == "Intel/dpt-hybrid-midas" and model_id == "Intel/dpt-large":
             image = load_image(request.get_json()["img_url"])
-            inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt")
+            inputs = pipes[model_id]["feature_extractor"](
+                images=image, return_tensors="pt"
+            )
             with torch.no_grad():
                 outputs = pipe(**inputs)
                 predicted_depth = outputs.predicted_depth
@@ -550,11 +745,21 @@ def models(model_id):
             text = request.get_json()["text"]
             inputs = pipes[model_id]["processor"](text=text, return_tensors="pt")
             embeddings_dataset = pipes[model_id]["embeddings_dataset"]
-            speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(pipes[model_id]["device"])
+            speaker_embeddings = (
+                torch.tensor(embeddings_dataset[7306]["xvector"])
+                .unsqueeze(0)
+                .to(pipes[model_id]["device"])
+            )
             pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
-            speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
+            speech = pipe.generate_speech(
+                inputs["input_ids"].to(pipes[model_id]["device"]),
+                speaker_embeddings,
+                vocoder=pipes[model_id]["vocoder"],
+            )
             name = str(uuid.uuid4())[:4]
-            sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
+            sf.write(
+                f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000
+            )
             result = {"path": f"/audios/{name}.wav"}

         # ASR
@@ -569,19 +774,31 @@ def models(model_id):
             with torch.no_grad():
                 result_wav = pipe(wav.to(pipes[model_id]["device"]))
             name = str(uuid.uuid4())[:4]
-            sf.write(f"public/audios/{name}.wav", result_wav.cpu().squeeze().numpy(), sr)
+            sf.write(
+                f"public/audios/{name}.wav", result_wav.cpu().squeeze().numpy(), sr
+            )
             result = {"path": f"/audios/{name}.wav"}

         if model_id == "microsoft/speecht5_vc":
             audio_url = request.get_json()["audio_url"]
             wav, sr = torchaudio.load(audio_url)
-            inputs = pipes[model_id]["processor"](audio=wav, sampling_rate=sr, return_tensors="pt")
+            inputs = pipes[model_id]["processor"](
+                audio=wav, sampling_rate=sr, return_tensors="pt"
+            )
             embeddings_dataset = pipes[model_id]["embeddings_dataset"]
-            speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+            speaker_embeddings = torch.tensor(
+                embeddings_dataset[7306]["xvector"]
+            ).unsqueeze(0)
             pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
-            speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
+            speech = pipe.generate_speech(
+                inputs["input_ids"].to(pipes[model_id]["device"]),
+                speaker_embeddings,
+                vocoder=pipes[model_id]["vocoder"],
+            )
             name = str(uuid.uuid4())[:4]
-            sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
+            sf.write(
+                f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000
+            )
             result = {"path": f"/audios/{name}.wav"}

         # segmentation
@@ -592,24 +809,44 @@ def models(model_id):

             colors = []
             for i in range(len(segments)):
-                colors.append((random.randint(100, 255), random.randint(100, 255), random.randint(100, 255), 50))
+                colors.append(
+                    (
+                        random.randint(100, 255),
+                        random.randint(100, 255),
+                        random.randint(100, 255),
+                        50,
+                    )
+                )

             for segment in segments:
                 mask = segment["mask"]
-                mask = mask.convert('L')
-                layer = Image.new('RGBA', mask.size, colors[i])
+                mask = mask.convert("L")
+                layer = Image.new("RGBA", mask.size, colors[i])
                 image.paste(layer, (0, 0), mask)
             name = str(uuid.uuid4())[:4]
             image.save(f"public/images/{name}.jpg")
             result = {"path": f"/images/{name}.jpg"}

-        if model_id == "facebook/maskformer-swin-base-coco" or model_id == "facebook/maskformer-swin-large-ade":
+        if (
+            model_id == "facebook/maskformer-swin-base-coco"
+            or model_id == "facebook/maskformer-swin-large-ade"
+        ):
             image = load_image(request.get_json()["img_url"])
-            inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").to(pipes[model_id]["device"])
+            inputs = pipes[model_id]["feature_extractor"](
+                images=image, return_tensors="pt"
+            ).to(pipes[model_id]["device"])
             outputs = pipe(**inputs)
-            result = pipes[model_id]["feature_extractor"].post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
+            result = pipes[model_id][
+                "feature_extractor"
+            ].post_process_panoptic_segmentation(
+                outputs, target_sizes=[image.size[::-1]]
+            )[
+                0
+            ]
             predicted_panoptic_map = result["segmentation"].cpu().numpy()
-            predicted_panoptic_map = Image.fromarray(predicted_panoptic_map.astype(np.uint8))
+            predicted_panoptic_map = Image.fromarray(
+                predicted_panoptic_map.astype(np.uint8)
+            )
             name = str(uuid.uuid4())[:4]
             predicted_panoptic_map.save(f"public/images/{name}.jpg")
             result = {"path": f"/images/{name}.jpg"}
@@ -641,7 +878,7 @@ def models(model_id):
     return jsonify(result)


-if __name__ == '__main__':
+if __name__ == "__main__":
    # temp folders
    if not os.path.exists("public/audios"):
        os.makedirs("public/audios")
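Usage sketch (not part of the diff above): the Flask routes keep the same paths and JSON keys after reformatting, so a client can poll /running and /status/<model_id> and POST inputs to /models/<model_id>. The snippet below is a minimal, assumption-based example; the base URL is a placeholder not taken from this diff, while the route paths and the "img_url", "running", "loaded", and "generated text" keys come from the handlers shown above.

# Hypothetical client sketch; BASE_URL is an assumed placeholder.
import requests

BASE_URL = "http://localhost:8005"  # assumption: point this at wherever models_server.py runs

def is_running() -> bool:
    # GET /running returns {"running": True} once the pipes are loaded
    return requests.get(f"{BASE_URL}/running").json().get("running", False)

def is_loaded(model_id: str) -> bool:
    # GET /status/<model_id> returns {"loaded": True} or {"loaded": False}
    return requests.get(f"{BASE_URL}/status/{model_id}").json().get("loaded", False)

def caption_image(img_url: str) -> dict:
    # POST /models/<model_id> with the JSON keys the handler reads ("img_url" here)
    resp = requests.post(
        f"{BASE_URL}/models/nlpconnect/vit-gpt2-image-captioning",
        json={"img_url": img_url},
    )
    return resp.json()  # e.g. {"generated text": "..."}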