From 9014683f9afadcc3c928ccbf7fe9c44faef2f997 Mon Sep 17 00:00:00 2001 From: Kye Date: Fri, 6 Oct 2023 22:10:46 -0400 Subject: [PATCH] code quality --- api/app.py | 4 + api/olds/container.py | 6 +- api/olds/main.py | 11 +- api/olds/worker.py | 2 +- apps/discord.py | 17 +- apps/omni_ui.py | 55 +- docs/old-docs/design/abstraction.py | 33 +- example.py | 13 +- omnimodal_agent_example.py | 2 +- playground/DIY/hierchical.py | 6 +- playground/agents/mm_agent_example.py | 6 +- playground/agents/omni_exa_example.py | 9 +- playground/models/mistral.py | 7 +- playground/structs/nonlinear_worfklow.py | 8 +- playground/swarms/autoscaler.py | 1 - playground/swarms/chat.py | 8 +- playground/swarms/debate.py | 11 +- playground/swarms/dialogue_simulator.py | 10 +- playground/swarms/easy_example.py | 1 - playground/swarms/godmode.py | 9 +- playground/swarms/group_chat.py | 1 - playground/swarms/groupchat.py | 30 +- playground/swarms/gui_app.py | 2 +- playground/swarms/multi_agent_collab.py | 2 + playground/swarms/multi_agent_debate.py | 8 +- playground/swarms/orchestrate.py | 5 +- playground/swarms/orchestrator.py | 5 +- playground/swarms/social_app.py | 2 +- playground/swarms/swarms_example.py | 2 +- playground/swarms/todo_app.py | 2 +- playground/worker/ultranode_example.py | 2 +- playground/worker/worker.py | 12 +- playground/worker/worker_ultra.py | 2 +- playground/workflow.py | 1 - setup.py | 92 +- swarms/__init__.py | 1 + swarms/agents/__init__.py | 1 - swarms/agents/aot.py | 101 +- swarms/agents/base.py | 7 +- swarms/agents/conversabe_agent.py | 139 +- .../datasets/cocogrounding_eval.py | 15 +- .../groundingdino/datasets/transforms.py | 19 +- .../models/GroundingDINO/backbone/backbone.py | 32 +- .../backbone/position_encoding.py | 19 +- .../backbone/swin_transformer.py | 100 +- .../models/GroundingDINO/bertwarper.py | 54 +- .../models/GroundingDINO/fuse_modules.py | 36 +- .../models/GroundingDINO/groundingdino.py | 55 +- .../models/GroundingDINO/ms_deform_attn.py | 36 +- .../models/GroundingDINO/transformer.py | 68 +- .../GroundingDINO/transformer_vanilla.py | 9 +- .../models/GroundingDINO/utils.py | 67 +- .../models/groundingdino/models/registry.py | 4 +- .../groundingdino/util/get_tokenlizer.py | 4 +- .../models/groundingdino/util/inference.py | 97 +- .../models/groundingdino/util/logger.py | 4 +- .../agents/models/groundingdino/util/misc.py | 61 +- .../models/groundingdino/util/slconfig.py | 23 +- .../agents/models/groundingdino/util/utils.py | 32 +- .../models/groundingdino/util/visualizer.py | 21 +- .../models/groundingdino/util/vl_utils.py | 4 +- .../models/segment_anything/scripts/amg.py | 8 +- .../scripts/export_onnx_model.py | 9 +- .../automatic_mask_generator.py | 20 +- .../segment_anything/build_sam.py | 8 +- .../modeling/image_encoder.py | 44 +- .../segment_anything/modeling/mask_decoder.py | 22 +- .../modeling/prompt_encoder.py | 21 +- .../segment_anything/modeling/sam.py | 12 +- .../segment_anything/modeling/transformer.py | 4 +- .../segment_anything/predictor.py | 32 +- .../segment_anything/utils/amg.py | 4 +- .../segment_anything/utils/onnx.py | 33 +- .../segment_anything/utils/transforms.py | 20 +- swarms/agents/multi_modal_visual_agent.py | 1433 ++++++++++------- .../omni_agent/get_token_ids.py | 6 +- .../omni_agent/model_server.py | 491 ++++-- .../omni_agent/omni_chat.py | 593 +++++-- swarms/agents/omni_modal_agent.py | 49 +- swarms/agents/profitpilot.py | 59 +- swarms/agents/stream_response.py | 2 - swarms/artifacts/base.py | 11 +- swarms/artifacts/main.py | 10 +- 
swarms/boss/boss_node.py | 26 +- swarms/embeddings/openai.py | 12 +- swarms/embeddings/pegasus.py | 9 +- swarms/hivemind/hivemind.py | 20 +- swarms/memory/embed.py | 3 +- swarms/memory/schemas.py | 9 +- swarms/models/__init__.py | 2 + swarms/models/anthropic.py | 32 +- swarms/models/chat_openai.py | 4 +- swarms/models/mistral.py | 46 +- swarms/models/petals.py | 2 +- swarms/models/prompts/agent_output_parser.py | 1 + swarms/models/prompts/agent_prompt_auto.py | 14 +- swarms/models/prompts/agent_prompts.py | 79 +- swarms/models/prompts/debate.py | 4 +- .../prompts/prebuild/project_manager.py | 4 +- .../models/prompts/prebuild/sales_prompts.py | 18 +- .../prompts/prebuild/summaries_prompts.py | 1 - swarms/structs/nonlinear_workflow.py | 24 +- swarms/structs/task.py | 39 +- swarms/structs/workflow.py | 9 +- swarms/swarms/autoscaler.py | 1 + swarms/swarms/dialogue_simulator.py | 11 +- swarms/swarms/god_mode.py | 11 +- swarms/swarms/groupchat.py | 18 +- swarms/swarms/multi_agent_collab.py | 11 +- swarms/swarms/multi_agent_debate.py | 15 +- swarms/swarms/orchestrate.py | 91 +- swarms/swarms/scable_groupchat.py | 62 +- swarms/swarms/simple_swarm.py | 6 +- swarms/tools/autogpt.py | 28 +- swarms/tools/base.py | 31 +- swarms/tools/developer.py | 18 +- swarms/tools/requests.py | 1 - swarms/tools/stt.py | 50 +- swarms/utils/decorators.py | 22 +- swarms/utils/main.py | 17 +- swarms/workers/base.py | 13 +- swarms/workers/worker.py | 61 +- tests/agents/agents.py | 118 +- tests/agents/omni_modal.py | 6 +- tests/boss/boss_node.py | 97 +- tests/models/LLM.py | 27 +- tests/models/hf.py | 47 +- tests/orchestrate.py | 11 +- tests/swarms.py | 84 +- tests/workers/multi_model_worker.py | 19 +- tests/workers/omni_worker.py | 62 +- tests/workers/worker_agent_ultra.py | 24 +- tests/workers/worker_node.py | 34 +- tests/workers/worker_ultra.py | 56 +- 134 files changed, 3572 insertions(+), 2125 deletions(-) diff --git a/api/app.py b/api/app.py index 9b58bb8f..fc2b0aec 100644 --- a/api/app.py +++ b/api/app.py @@ -17,12 +17,15 @@ from dotenv import load_dotenv load_dotenv() + class SwarmInput(BaseModel): api_key: str objective: str + app = FastAPI() + @app.on_event("startup") async def startup(): redis_host = os.getenv("REDIS_HOST", "localhost") @@ -31,6 +34,7 @@ async def startup(): FastAPICache.init(RedisBackend(redis), prefix="fastapi-cache", coder=JsonCoder()) await FastAPILimiter.init(f"redis://{redis_host}:{redis_port}") + @app.post("/chat", dependencies=[Depends(RateLimiter(times=2, minutes=1))]) @cache(expire=60) # Cache results for 1 minute async def run(swarm_input: SwarmInput): diff --git a/api/olds/container.py b/api/olds/container.py index a240e4c1..f90dfd31 100644 --- a/api/olds/container.py +++ b/api/olds/container.py @@ -55,8 +55,6 @@ file_handler = FileHandler(handlers=handlers, path=BASE_DIR) templates = Jinja2Templates(directory=BASE_DIR / "api" / "templates") -uploader = StaticUploader.from_settings( - path=BASE_DIR / "static", endpoint="static" -) +uploader = StaticUploader.from_settings(path=BASE_DIR / "static", endpoint="static") -reload_dirs = [BASE_DIR / "core", BASE_DIR / "api"] \ No newline at end of file +reload_dirs = [BASE_DIR / "core", BASE_DIR / "api"] diff --git a/api/olds/main.py b/api/olds/main.py index c2027a9e..0ba69379 100644 --- a/api/olds/main.py +++ b/api/olds/main.py @@ -11,8 +11,15 @@ from fastapi.responses import HTMLResponse from fastapi.staticfiles import StaticFiles from pydantic import BaseModel -from api.olds.container import agent_manager, file_handler, reload_dirs, 
templates, uploader +from api.olds.container import ( + agent_manager, + file_handler, + reload_dirs, + templates, + uploader, +) from api.olds.worker import get_task_result, start_worker, task_execute + # from env import settings app = FastAPI() @@ -127,4 +134,4 @@ def dev(): port=os.environ["EVAL_PORT"], reload=True, reload_dirs=reload_dirs, - ) \ No newline at end of file + ) diff --git a/api/olds/worker.py b/api/olds/worker.py index 9128928b..911101a5 100644 --- a/api/olds/worker.py +++ b/api/olds/worker.py @@ -41,4 +41,4 @@ def start_worker(): "worker", "--loglevel=INFO", ] - ) \ No newline at end of file + ) diff --git a/apps/discord.py b/apps/discord.py index e2c5ece5..84f0cace 100644 --- a/apps/discord.py +++ b/apps/discord.py @@ -3,28 +3,34 @@ from langchain.llms import OpenAIChat from swarms.agents import OmniModalAgent # Setup -TOKEN = 'YOUR_DISCORD_BOT_TOKEN' -bot = commands.Bot(command_prefix='!') +TOKEN = "YOUR_DISCORD_BOT_TOKEN" +bot = commands.Bot(command_prefix="!") # Initialize the OmniModalAgent llm = OpenAIChat(model_name="gpt-4") agent = OmniModalAgent(llm) + @bot.event async def on_ready(): - print(f'We have logged in as {bot.user}') + print(f"We have logged in as {bot.user}") + @bot.command() async def greet(ctx): """Greets the user.""" - await ctx.send(f'Hello, {ctx.author.name}!') + await ctx.send(f"Hello, {ctx.author.name}!") + @bot.command() async def run(ctx, *, description: str): """Generates a video based on the given description.""" - response = agent.run(description) # Assuming the response provides information or a link to the generated video + response = agent.run( + description + ) # Assuming the response provides information or a link to the generated video await ctx.send(response) + @bot.command() async def help_me(ctx): """Provides a list of commands and their descriptions.""" @@ -35,4 +41,5 @@ async def help_me(ctx): """ await ctx.send(help_text) + bot.run(TOKEN) diff --git a/apps/omni_ui.py b/apps/omni_ui.py index 05ac8194..73613ed5 100644 --- a/apps/omni_ui.py +++ b/apps/omni_ui.py @@ -1,38 +1,42 @@ -#Import required libraries +# Import required libraries from gradio import Interface, Textbox, HTML import threading import os import glob import base64 -from langchain.llms import OpenAIChat -from swarms.agents import OmniModalAgent +from langchain.llms import OpenAIChat +from swarms.agents import OmniModalAgent -#Function to convert image to base64 + +# Function to convert image to base64 def image_to_base64(image_path): with open(image_path, "rb") as image_file: return base64.b64encode(image_file.read()).decode() -#Function to get the most recently created image in the directory + +# Function to get the most recently created image in the directory def get_latest_image(): - list_of_files = glob.glob('./*.png') # Replace with your image file type + list_of_files = glob.glob("./*.png") # Replace with your image file type if not list_of_files: return None latest_file = max(list_of_files, key=os.path.getctime) return latest_file -#Initialize your OmniModalAgent + +# Initialize your OmniModalAgent llm = OpenAIChat(model_name="gpt-4") # Replace with your actual initialization agent = OmniModalAgent(llm) # Replace with your actual initialization -#Global variable to store chat history +# Global variable to store chat history chat_history = [] -#Function to update chat + +# Function to update chat def update_chat(user_input): global chat_history chat_history.append({"type": "user", "content": user_input}) - #Get agent response + # Get agent response 
agent_response = agent.run(user_input) # Handle the case where agent_response is not in the expected dictionary format @@ -48,38 +52,43 @@ def update_chat(user_input): return render_chat(chat_history) -#Function to render chat as HTML + +# Function to render chat as HTML + def render_chat(chat_history): chat_str = "
" for message in chat_history: - if message['type'] == 'user': + if message["type"] == "user": chat_str += f"
User: {message['content']}
" - elif message['type'] == 'text': + elif message["type"] == "text": chat_str += f"
Agent: {message['content']}
" - elif message['type'] == 'image': - img_path = os.path.join(".", message['content']) + elif message["type"] == "image": + img_path = os.path.join(".", message["content"]) base64_img = image_to_base64(img_path) chat_str += f"
Agent: image
" chat_str += "
" return chat_str -#Define Gradio interface + +# Define Gradio interface iface = Interface( - fn=update_chat, - inputs=Textbox(label="Your Message", type="text"), + fn=update_chat, + inputs=Textbox(label="Your Message", type="text"), outputs=HTML(label="Chat History"), - live=True + live=True, ) -#Function to update the chat display + +# Function to update the chat display def update_display(): global chat_history while True: iface.update(render_chat(chat_history)) -#Run the update_display function in a separate thread + +# Run the update_display function in a separate thread threading.Thread(target=update_display).start() -#Run Gradio interface -iface.launch() \ No newline at end of file +# Run Gradio interface +iface.launch() diff --git a/docs/old-docs/design/abstraction.py b/docs/old-docs/design/abstraction.py index c1d86c4d..75862e72 100644 --- a/docs/old-docs/design/abstraction.py +++ b/docs/old-docs/design/abstraction.py @@ -1,32 +1,19 @@ from swarms import Model, Agent, vectorstore, tools, orchestrator -#1 model +# 1 model Model(openai) -#2 agent level -Agent( - model, - vectorstore, - tools -) +# 2 agent level +Agent(model, vectorstore, tools) -#3 worker infrastructure level -worker_node( - Agent, - human_input, - tools -) +# 3 worker infrastructure level +worker_node(Agent, human_input, tools) -#4 swarm level basically handling infrastructure for multiple worker node -swarm = orchestrator( - worker_node, - 100 # nodes -) +# 4 swarm level basically handling infrastructure for multiple worker node +swarm = orchestrator(worker_node, 100) # nodes -#5 -hivemind = Hivemind( - swarm * 100 -) +# 5 +hivemind = Hivemind(swarm * 100) -#a market different pre built worker or boss agent that have access to different tools and memory, proompts \ No newline at end of file +# a market different pre built worker or boss agent that have access to different tools and memory, proompts diff --git a/example.py b/example.py index 0a0ae998..825f1617 100644 --- a/example.py +++ b/example.py @@ -1,22 +1,17 @@ from langchain.llms import OpenAIChat from swarms import Worker -llm = OpenAIChat( - model_name='gpt-4', - openai_api_key="api-key", - temperature=0.5 -) +llm = OpenAIChat(model_name="gpt-4", openai_api_key="api-key", temperature=0.5) node = Worker( llm=llm, ai_name="Optimus Prime", ai_role="Worker in a swarm", - external_tools = None, - human_in_the_loop = False, - temperature = 0.5, + external_tools=None, + human_in_the_loop=False, + temperature=0.5, ) task = "What were the winning boston marathon times for the past 5 years (ending in 2022)? Generate a table of the year, name, country of origin, and times." 
response = node.run(task) print(response) - diff --git a/omnimodal_agent_example.py b/omnimodal_agent_example.py index 904d9e9e..a9ab0995 100644 --- a/omnimodal_agent_example.py +++ b/omnimodal_agent_example.py @@ -6,4 +6,4 @@ llm = OpenAIChat(model_name="gpt-4") agent = OmniModalAgent(llm) -agent.run("Create a video of a swarm of fish") \ No newline at end of file +agent.run("Create a video of a swarm of fish") diff --git a/playground/DIY/hierchical.py b/playground/DIY/hierchical.py index 983c1875..0734c4f6 100644 --- a/playground/DIY/hierchical.py +++ b/playground/DIY/hierchical.py @@ -8,13 +8,13 @@ swarm = HierarchicalSwarm( use_vectorstore=False, use_async=False, human_in_the_loop=False, - logging_enabled=False + logging_enabled=False, ) -#run the swarm with an objective +# run the swarm with an objective result = swarm.run("Design a new car") -#or huggingface +# or huggingface swarm = HierarchicalSwarm( model_type="huggingface", model_id="tiaueu/falcon", diff --git a/playground/agents/mm_agent_example.py b/playground/agents/mm_agent_example.py index 3177939c..0da0d469 100644 --- a/playground/agents/mm_agent_example.py +++ b/playground/agents/mm_agent_example.py @@ -1,8 +1,6 @@ from swarms.agents import MultiModalAgent -load_dict = { - "ImageCaptioning": "cuda" -} +load_dict = {"ImageCaptioning": "cuda"} node = MultiModalAgent(load_dict) @@ -12,5 +10,5 @@ img = node.run_img("/image1", "What is this image about?") chat = node.chat( "What is your name? Generate a picture of yourself. What is this image about?", - streaming=True + streaming=True, ) diff --git a/playground/agents/omni_exa_example.py b/playground/agents/omni_exa_example.py index c5d87647..094b6413 100644 --- a/playground/agents/omni_exa_example.py +++ b/playground/agents/omni_exa_example.py @@ -1,12 +1,9 @@ -#pip3 install exxa +# pip3 install exxa from exa import Inference from swarms.agents import OmniModalAgent -llm = Inference( - model_id="mistralai/Mistral-7B-v0.1", - quantize=True -) +llm = Inference(model_id="mistralai/Mistral-7B-v0.1", quantize=True) agent = OmniModalAgent(llm) -agent.run("Create a video of a swarm of fish") \ No newline at end of file +agent.run("Create a video of a swarm of fish") diff --git a/playground/models/mistral.py b/playground/models/mistral.py index 8ae3c413..f1731aff 100644 --- a/playground/models/mistral.py +++ b/playground/models/mistral.py @@ -1,10 +1,7 @@ from swarms.models import Mistral -model = Mistral( - device="cuda", - use_flash_attention=True -) +model = Mistral(device="cuda", use_flash_attention=True) prompt = "My favourite condiment is" result = model.run(prompt) -print(result) \ No newline at end of file +print(result) diff --git a/playground/structs/nonlinear_worfklow.py b/playground/structs/nonlinear_worfklow.py index e53c8796..6c264d63 100644 --- a/playground/structs/nonlinear_worfklow.py +++ b/playground/structs/nonlinear_worfklow.py @@ -7,12 +7,12 @@ prompt2 = "Develop a self attention using pytorch" task1 = Task("task1", prompt) task2 = Task("task2", prompt2, parents=[task1]) -#add tasks to workflow +# add tasks to workflow workflow = NonLinearWorkflow(agent) -#add tasks to tree +# add tasks to tree workflow.add(task1) workflow.add(task2) -#run -workflow.run() \ No newline at end of file +# run +workflow.run() diff --git a/playground/swarms/autoscaler.py b/playground/swarms/autoscaler.py index 85b1dcbb..82bcadb6 100644 --- a/playground/swarms/autoscaler.py +++ b/playground/swarms/autoscaler.py @@ -5,4 +5,3 @@ auto_scaler.start() for i in range(100): 
auto_scaler.add_task(f"Task {i}") - diff --git a/playground/swarms/chat.py b/playground/swarms/chat.py index 134f7f16..b0ebc39a 100644 --- a/playground/swarms/chat.py +++ b/playground/swarms/chat.py @@ -1,11 +1,7 @@ from swarms import Orchestrator, Worker # Instantiate the Orchestrator with 10 agents -orchestrator = Orchestrator( - Worker, - agent_list=[Worker]*10, - task_queue=[] -) +orchestrator = Orchestrator(Worker, agent_list=[Worker] * 10, task_queue=[]) # Agent 1 sends a message to Agent 2 -orchestrator.chat(sender_id=1, receiver_id=2, message="Hello, Agent 2!") \ No newline at end of file +orchestrator.chat(sender_id=1, receiver_id=2, message="Hello, Agent 2!") diff --git a/playground/swarms/debate.py b/playground/swarms/debate.py index 60c799b4..c80e6f31 100644 --- a/playground/swarms/debate.py +++ b/playground/swarms/debate.py @@ -89,6 +89,7 @@ class DialogueSimulator: return speaker.name, message + class BiddingDialogueAgent(DialogueAgent): def __init__( self, @@ -114,6 +115,7 @@ class BiddingDialogueAgent(DialogueAgent): bid_string = self.model([SystemMessage(content=prompt)]).content return bid_string + character_names = ["Donald Trump", "Kanye West", "Elizabeth Warren"] topic = "transcontinental high speed rail" word_limit = 50 @@ -202,8 +204,6 @@ for ( print(f"\n{character_header}") print(f"\n{character_system_message.content}") - - class BidOutputParser(RegexParser): def get_format_instructions(self) -> str: @@ -214,6 +214,7 @@ bid_parser = BidOutputParser( regex=r"<(\d+)>", output_keys=["bid"], default_output_key="bid" ) + def generate_character_bidding_template(character_header): bidding_template = f"""{character_header} @@ -232,6 +233,7 @@ def generate_character_bidding_template(character_header): """ return bidding_template + character_bidding_templates = [ generate_character_bidding_template(character_header) for character_header in character_headers @@ -263,6 +265,7 @@ specified_topic = ChatOpenAI(temperature=1.0)(topic_specifier_prompt).content print(f"Original topic:\n{topic}\n") print(f"Detailed topic:\n{specified_topic}\n") + @tenacity.retry( stop=tenacity.stop_after_attempt(2), wait=tenacity.wait_none(), # No waiting time between retries @@ -280,6 +283,7 @@ def ask_for_bid(agent) -> str: bid = int(bid_parser.parse(bid_string)["bid"]) return bid + def select_next_speaker(step: int, agents: List[DialogueAgent]) -> int: bids = [] for agent in agents: @@ -300,6 +304,7 @@ def select_next_speaker(step: int, agents: List[DialogueAgent]) -> int: print("\n") return idx + characters = [] for character_name, character_system_message, bidding_template in zip( character_names, character_system_messages, character_bidding_templates @@ -326,4 +331,4 @@ while n < max_iters: name, message = simulator.step() print(f"({name}): {message}") print("\n") - n += 1 \ No newline at end of file + n += 1 diff --git a/playground/swarms/dialogue_simulator.py b/playground/swarms/dialogue_simulator.py index f02bdc82..a0800c49 100644 --- a/playground/swarms/dialogue_simulator.py +++ b/playground/swarms/dialogue_simulator.py @@ -4,12 +4,12 @@ worker1 = Worker(ai_name="Plinus", openai_api_key="") worker2 = Worker(ai_name="Optimus Prime", openai_api_key="") collab = DialogueSimulator( - [worker1, worker2], + [worker1, worker2], # DialogueSimulator.select_next_speaker ) collab.run( - max_iters = 4, - name = "plinus", - message = "how can we enable multi agent collaboration", -) \ No newline at end of file + max_iters=4, + name="plinus", + message="how can we enable multi agent collaboration", +) 
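The dialogue_simulator example above leaves the speaker-selection hook commented out; below is a minimal sketch of supplying it explicitly, assuming DialogueSimulator accepts a selection callback as its optional second constructor argument (the same pattern used by the multi_agent_collab.py example later in this patch):

from swarms import DialogueSimulator, Worker

worker1 = Worker(ai_name="Plinus", openai_api_key="")
worker2 = Worker(ai_name="Optimus Prime", openai_api_key="")


# Round-robin selection: step N hands the floor to agent N % len(agents)
def select_next_speaker(step: int, agents) -> int:
    return step % len(agents)


# Assumption: the callback is the optional second constructor argument
collab = DialogueSimulator([worker1, worker2], select_next_speaker)
collab.run(
    max_iters=4,
    name="plinus",
    message="how can we enable multi agent collaboration",
)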
diff --git a/playground/swarms/easy_example.py b/playground/swarms/easy_example.py index 8417623a..2a537c10 100644 --- a/playground/swarms/easy_example.py +++ b/playground/swarms/easy_example.py @@ -5,4 +5,3 @@ api_key = "APIKEY" objective = "What is the capital of the UK?" result = swarm(api_key, objective) print(result) # Prints: "The capital of the UK is London." - diff --git a/playground/swarms/godmode.py b/playground/swarms/godmode.py index c6b987ad..1031d94e 100644 --- a/playground/swarms/godmode.py +++ b/playground/swarms/godmode.py @@ -1,4 +1,3 @@ - from langchain.models import Anthropic, GooglePalm, OpenAIChat from swarms.swarms import GodMode @@ -7,14 +6,10 @@ palm = GooglePalm(google_api_key="") gpt = OpenAIChat(openai_api_key="") # Usage -llms = [ - claude, - palm, - gpt -] +llms = [claude, palm, gpt] god_mode = GodMode(llms) task = "What are the biggest risks facing humanity?" -god_mode.print_responses(task) \ No newline at end of file +god_mode.print_responses(task) diff --git a/playground/swarms/group_chat.py b/playground/swarms/group_chat.py index 8b01bd34..a78ad0e5 100644 --- a/playground/swarms/group_chat.py +++ b/playground/swarms/group_chat.py @@ -1,2 +1 @@ from swarms.swarms import GroupChat - diff --git a/playground/swarms/groupchat.py b/playground/swarms/groupchat.py index 5de21a2e..2af37d93 100644 --- a/playground/swarms/groupchat.py +++ b/playground/swarms/groupchat.py @@ -2,44 +2,36 @@ from langchain.llms import OpenAIChat from swarms.swarms import GroupChat, GroupChatManager from swarms.workers import Worker -llm = OpenAIChat( - model_name='gpt-4', - openai_api_key="api-key", - temperature=0.5 -) +llm = OpenAIChat(model_name="gpt-4", openai_api_key="api-key", temperature=0.5) node = Worker( llm=llm, ai_name="Optimus Prime", ai_role="Worker in a swarm", - external_tools = None, - human_in_the_loop = False, - temperature = 0.5, + external_tools=None, + human_in_the_loop=False, + temperature=0.5, ) node2 = Worker( llm=llm, ai_name="Optimus Prime", ai_role="Worker in a swarm", - external_tools = None, - human_in_the_loop = False, - temperature = 0.5, + external_tools=None, + human_in_the_loop=False, + temperature=0.5, ) node3 = Worker( llm=llm, ai_name="Optimus Prime", ai_role="Worker in a swarm", - external_tools = None, - human_in_the_loop = False, - temperature = 0.5, + external_tools=None, + human_in_the_loop=False, + temperature=0.5, ) -nodes = [ - node, - node2, - node3 -] +nodes = [node, node2, node3] messages = [ { diff --git a/playground/swarms/gui_app.py b/playground/swarms/gui_app.py index 18d66597..d4f3cdb4 100644 --- a/playground/swarms/gui_app.py +++ b/playground/swarms/gui_app.py @@ -20,4 +20,4 @@ I want it to have neumorphism-style. Serve it on port 4500. 
""" # Run HierarchicalSwarm -swarm.run(objective) \ No newline at end of file +swarm.run(objective) diff --git a/playground/swarms/multi_agent_collab.py b/playground/swarms/multi_agent_collab.py index 8caebb51..0d0b115f 100644 --- a/playground/swarms/multi_agent_collab.py +++ b/playground/swarms/multi_agent_collab.py @@ -1,9 +1,11 @@ from swarms import DialogueSimulator, Worker + def select_next_speaker(step: int, agents) -> int: idx = (step) % len(agents) return idx + debate = DialogueSimulator(Worker, select_next_speaker) debate.run() diff --git a/playground/swarms/multi_agent_debate.py b/playground/swarms/multi_agent_debate.py index 468e83ba..c6637413 100644 --- a/playground/swarms/multi_agent_debate.py +++ b/playground/swarms/multi_agent_debate.py @@ -5,11 +5,7 @@ worker1 = Worker(openai_api_key="", ai_name="Optimus Prime") worker2 = Worker(openai_api_key="", ai_name="Bumblebee") worker3 = Worker(openai_api_key="", ai_name="Megatron") -agents = [ - worker1, - worker2, - worker3 -] +agents = [worker1, worker2, worker3] # Initialize multi-agent debate with the selection function debate = MultiAgentDebate(agents, select_speaker) @@ -20,4 +16,4 @@ results = debate.run(task, max_iters=4) # Print results for result in results: - print(f"Agent {result['agent']} responded: {result['response']}") \ No newline at end of file + print(f"Agent {result['agent']} responded: {result['response']}") diff --git a/playground/swarms/orchestrate.py b/playground/swarms/orchestrate.py index c52c732a..e43b75e3 100644 --- a/playground/swarms/orchestrate.py +++ b/playground/swarms/orchestrate.py @@ -3,12 +3,11 @@ from swarms import Worker, Orchestrator node = Worker( openai_api_key="", ai_name="Optimus Prime", - ) # Instantiate the Orchestrator with 10 agents -orchestrator = Orchestrator(node, agent_list=[node]*10, task_queue=[]) +orchestrator = Orchestrator(node, agent_list=[node] * 10, task_queue=[]) # Agent 7 sends a message to Agent 9 -orchestrator.chat(sender_id=7, receiver_id=9, message="Can you help me with this task?") \ No newline at end of file +orchestrator.chat(sender_id=7, receiver_id=9, message="Can you help me with this task?") diff --git a/playground/swarms/orchestrator.py b/playground/swarms/orchestrator.py index c52c732a..e43b75e3 100644 --- a/playground/swarms/orchestrator.py +++ b/playground/swarms/orchestrator.py @@ -3,12 +3,11 @@ from swarms import Worker, Orchestrator node = Worker( openai_api_key="", ai_name="Optimus Prime", - ) # Instantiate the Orchestrator with 10 agents -orchestrator = Orchestrator(node, agent_list=[node]*10, task_queue=[]) +orchestrator = Orchestrator(node, agent_list=[node] * 10, task_queue=[]) # Agent 7 sends a message to Agent 9 -orchestrator.chat(sender_id=7, receiver_id=9, message="Can you help me with this task?") \ No newline at end of file +orchestrator.chat(sender_id=7, receiver_id=9, message="Can you help me with this task?") diff --git a/playground/swarms/social_app.py b/playground/swarms/social_app.py index 7e148e62..af67b9f0 100644 --- a/playground/swarms/social_app.py +++ b/playground/swarms/social_app.py @@ -16,4 +16,4 @@ The ports you can use are 4500 and 6500. 
""" # Run HierarchicalSwarm -swarm.run(objective) \ No newline at end of file +swarm.run(objective) diff --git a/playground/swarms/swarms_example.py b/playground/swarms/swarms_example.py index e2a1fa4e..6dabe4a1 100644 --- a/playground/swarms/swarms_example.py +++ b/playground/swarms/swarms_example.py @@ -10,4 +10,4 @@ swarm = HierarchicalSwarm(api_key) objective = "Find 20 potential customers for a HierarchicalSwarm based AI Agent automation infrastructure" # Run HierarchicalSwarm -swarm.run(objective) \ No newline at end of file +swarm.run(objective) diff --git a/playground/swarms/todo_app.py b/playground/swarms/todo_app.py index 0f7fc8fd..1b897a1c 100644 --- a/playground/swarms/todo_app.py +++ b/playground/swarms/todo_app.py @@ -17,4 +17,4 @@ The ports you can use are 4500 and 6500. """ # Run HierarchicalSwarm -swarm.run(objective) \ No newline at end of file +swarm.run(objective) diff --git a/playground/worker/ultranode_example.py b/playground/worker/ultranode_example.py index 6a5285d7..01654959 100644 --- a/playground/worker/ultranode_example.py +++ b/playground/worker/ultranode_example.py @@ -12,4 +12,4 @@ I want it to have neumorphism-style. Serve it on port 4500. """ node = WorkerUltraUltraNode(objective) -result = node.execute() \ No newline at end of file +result = node.execute() diff --git a/playground/worker/worker.py b/playground/worker/worker.py index a6e78dcc..00f15f1a 100644 --- a/playground/worker/worker.py +++ b/playground/worker/worker.py @@ -1,19 +1,15 @@ from langchain.models import OpenAIChat from swarms import Worker -llm = OpenAIChat( - model_name='gpt-4', - openai_api_key="api-key", - temperature=0.5 -) +llm = OpenAIChat(model_name="gpt-4", openai_api_key="api-key", temperature=0.5) node = Worker( llm=llm, ai_name="Optimus Prime", ai_role="Worker in a swarm", - external_tools = None, - human_in_the_loop = False, - temperature = 0.5, + external_tools=None, + human_in_the_loop=False, + temperature=0.5, ) task = "What were the winning boston marathon times for the past 5 years (ending in 2022)? Generate a table of the year, name, country of origin, and times." 
diff --git a/playground/worker/worker_ultra.py b/playground/worker/worker_ultra.py index 7d2b4e73..69da3f30 100644 --- a/playground/worker/worker_ultra.py +++ b/playground/worker/worker_ultra.py @@ -22,4 +22,4 @@ worker = WorkerUltra(objective, api_key) result = worker.execute() # Print the result -print(result) \ No newline at end of file +print(result) diff --git a/playground/workflow.py b/playground/workflow.py index a40fe605..a5d0ea03 100644 --- a/playground/workflow.py +++ b/playground/workflow.py @@ -1,4 +1,3 @@ - from swarms import Workflow from swarms.tools.autogpt import ChatOpenAI diff --git a/setup.py b/setup.py index 454438a1..7128a42c 100644 --- a/setup.py +++ b/setup.py @@ -1,50 +1,50 @@ from setuptools import setup, find_packages setup( - name = 'swarms', - packages = find_packages(exclude=[]), - version = '1.4.1', - license='MIT', - description = 'Swarms - Pytorch', - author = 'Kye Gomez', - author_email = 'kye@apac.ai', - long_description_content_type = 'text/markdown', - url = 'https://github.com/kyegomez/swarms', - keywords = [ - 'artificial intelligence', - 'deep learning', - 'optimizers', - "Prompt Engineering" - ], - install_requires=[ - 'transformers', - 'openai', - 'langchain==0.0.240', - 'asyncio', - 'nest_asyncio', - 'pegasusx', - 'google-generativeai', - 'oceandb', - 'langchain-experimental', - 'playwright', - 'duckduckgo_search', - 'faiss-cpu', - 'wget', - 'httpx', - 'ggl', - 'beautifulsoup4', - 'pydantic', - 'tenacity', - 'celery', - 'redis', - 'google-search-results==2.4.2', - 'Pillow', + name="swarms", + packages=find_packages(exclude=[]), + version="1.4.1", + license="MIT", + description="Swarms - Pytorch", + author="Kye Gomez", + author_email="kye@apac.ai", + long_description_content_type="text/markdown", + url="https://github.com/kyegomez/swarms", + keywords=[ + "artificial intelligence", + "deep learning", + "optimizers", + "Prompt Engineering", ], - classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'Topic :: Scientific/Engineering :: Artificial Intelligence', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.6', - ], -) \ No newline at end of file + install_requires=[ + "transformers", + "openai", + "langchain==0.0.240", + "asyncio", + "nest_asyncio", + "pegasusx", + "google-generativeai", + "oceandb", + "langchain-experimental", + "playwright", + "duckduckgo_search", + "faiss-cpu", + "wget", + "httpx", + "ggl", + "beautifulsoup4", + "pydantic", + "tenacity", + "celery", + "redis", + "google-search-results==2.4.2", + "Pillow", + ], + classifiers=[ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.6", + ], +) diff --git a/swarms/__init__.py b/swarms/__init__.py index 21bb7840..b9d8ccd4 100644 --- a/swarms/__init__.py +++ b/swarms/__init__.py @@ -7,6 +7,7 @@ from swarms import models from swarms.workers.worker import Worker from swarms import workers from swarms.logo import logo2 + print(logo2) # worker diff --git a/swarms/agents/__init__.py b/swarms/agents/__init__.py index 13e63890..7beaf170 100644 --- a/swarms/agents/__init__.py +++ b/swarms/agents/__init__.py @@ -1,4 +1,3 @@ - """Agent Infrastructure, models, memory, utils, tools""" # agents diff --git a/swarms/agents/aot.py b/swarms/agents/aot.py index dde4bdd6..48857406 100644 --- a/swarms/agents/aot.py +++ b/swarms/agents/aot.py @@ -4,7 +4,9 @@ import time 
import openai -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) @@ -25,11 +27,13 @@ class OpenAI: raise Exception("Please provide OpenAI API key") if api_base == "" or api_base is None: - api_base = os.environ.get("OPENAI_API_BASE", "") # if not set, use the default base path of "https://api.openai.com/v1" + api_base = os.environ.get( + "OPENAI_API_BASE", "" + ) # if not set, use the default base path of "https://api.openai.com/v1" if api_base != "": # e.g. https://api.openai.com/v1/ or your custom url openai.api_base = api_base - print(f'Using custom api_base {api_base}') + print(f"Using custom api_base {api_base}") if api_model == "" or api_model is None: api_model = os.environ.get("OPENAI_API_MODEL", "") @@ -37,29 +41,17 @@ class OpenAI: self.api_model = api_model else: self.api_model = "text-davinci-003" - print(f'Using api_model {self.api_model}') + print(f"Using api_model {self.api_model}") - self.use_chat_api = 'gpt' in self.api_model + self.use_chat_api = "gpt" in self.api_model self.strategy = strategy self.evaluation_strategy = evaluation_strategy - def run( - self, - prompt, - max_tokens, - temperature, - k=1, - stop=None - ): + def run(self, prompt, max_tokens, temperature, k=1, stop=None): while True: try: if self.use_chat_api: - messages = [ - { - "role": "user", - "content": prompt - } - ] + messages = [{"role": "user", "content": prompt}] response = openai.ChatCompletion.create( model=self.api_model, messages=messages, @@ -75,17 +67,21 @@ class OpenAI: stop=stop, temperature=temperature, ) - with open("openai.logs", 'a') as log_file: - log_file.write("\n" + "-----------" + '\n' + "Prompt : " + prompt + "\n") + with open("openai.logs", "a") as log_file: + log_file.write( + "\n" + "-----------" + "\n" + "Prompt : " + prompt + "\n" + ) return response except openai.error.RateLimitError as e: sleep_duratoin = os.environ.get("OPENAI_RATE_TIMEOUT", 30) - print(f'{str(e)}, sleep for {sleep_duratoin}s, set it by env OPENAI_RATE_TIMEOUT') + print( + f"{str(e)}, sleep for {sleep_duratoin}s, set it by env OPENAI_RATE_TIMEOUT" + ) time.sleep(sleep_duratoin) def openai_choice2text_handler(self, choice): if self.use_chat_api: - text = choice['message']['content'] + text = choice["message"]["content"] else: text = choice.text.strip() return text @@ -102,20 +98,16 @@ class OpenAI: else: response = self.run(prompt, 300, 0.5, k) - thoughts = [self.openai_choice2text_handler(choice) for choice in response.choices] + thoughts = [ + self.openai_choice2text_handler(choice) for choice in response.choices + ] return thoughts - def generate_thoughts( - self, - state, - k, - initial_prompt, - rejected_solutions=None - ): - if (isinstance(state, str)): + def generate_thoughts(self, state, k, initial_prompt, rejected_solutions=None): + if isinstance(state, str): state_text = state else: - state_text = '\n'.join(state) + state_text = "\n".join(state) print("New state generating thought:", state, "\n\n") prompt = f""" Accomplish the task below by decomposing it as many very explicit subtasks as possible, be very explicit and thorough denoted by @@ -135,14 +127,10 @@ class OpenAI: # print(f"Generated thoughts: {thoughts}") return thoughts - def generate_solution(self, - initial_prompt, - state, - rejected_solutions=None): + def generate_solution(self, initial_prompt, state, rejected_solutions=None): try: - if isinstance(state, 
list): - state_text = '\n'.join(state) + state_text = "\n".join(state) else: state_text = state @@ -156,7 +144,7 @@ class OpenAI: ###{rejected_solutions}###, complete the {initial_prompt} without making the same mistakes you did with the evaluated rejected solutions. Be simple. Be direct. Provide intuitive solutions as soon as you think of them.""" answer = self.generate_text(prompt, 1) - print(f'Generated Solution Summary {answer}') + print(f"Generated Solution Summary {answer}") return answer except Exception as e: logger.error(f"Error in generate_solutions: {e}") @@ -166,14 +154,20 @@ class OpenAI: if not states: return {} - if self.evaluation_strategy == 'value': + if self.evaluation_strategy == "value": state_values = {} for state in states: - if (isinstance(state, str)): + if isinstance(state, str): state_text = state else: - state_text = '\n'.join(state) - print("We receive a state of type", type(state), "For state: ", state, "\n\n") + state_text = "\n".join(state) + print( + "We receive a state of type", + type(state), + "For state: ", + state, + "\n\n", + ) prompt = f""" To achieve the following goal: '{initial_prompt}', pessimistically value the context of the past solutions and more importantly the latest generated solution you had AS A FLOAT BETWEEN 0 AND 1\n Past solutions:\n\n {state_text}\n @@ -244,7 +238,11 @@ class AoTAgent: for next_state in thoughts: state_value = self.evaluated_thoughts[next_state] if state_value > self.value_threshold: - child = (state, next_state) if isinstance(state, str) else (*state, next_state) + child = ( + (state, next_state) + if isinstance(state, str) + else (*state, next_state) + ) self.dfs(child, step + 1) # backtracking @@ -255,17 +253,18 @@ class AoTAgent: def generate_and_filter_thoughts(self, state): thoughts = self.model.generate_thoughts( - state, - self.num_thoughts, - self.initial_prompt + state, self.num_thoughts, self.initial_prompt ) self.evaluated_thoughts = self.model.evaluate_states( - thoughts, - self.initial_prompt + thoughts, self.initial_prompt ) - filtered_thoughts = [thought for thought in thoughts if self.evaluated_thoughts[thought] >= self.pruning_threshold] + filtered_thoughts = [ + thought + for thought in thoughts + if self.evaluated_thoughts[thought] >= self.pruning_threshold + ] print(f"filtered_thoughts: {filtered_thoughts}") return filtered_thoughts diff --git a/swarms/agents/base.py b/swarms/agents/base.py index 520437b7..5e0d3a98 100644 --- a/swarms/agents/base.py +++ b/swarms/agents/base.py @@ -18,7 +18,7 @@ class AbstractAgent: self, name: str, # tools: List[Tool], - #memory: Memory + # memory: Memory ): """ Args: @@ -51,10 +51,7 @@ class AbstractAgent: def chat(self, messages: List[Dict]): """Chat with the agent""" - def _achat( - self, - messages: List[Dict] - ): + def _achat(self, messages: List[Dict]): """Asynchronous Chat""" def step(self, message: str): diff --git a/swarms/agents/conversabe_agent.py b/swarms/agents/conversabe_agent.py index d4404604..35808c4b 100644 --- a/swarms/agents/conversabe_agent.py +++ b/swarms/agents/conversabe_agent.py @@ -43,7 +43,9 @@ class ConversableAgent(Agent): DEFAULT_CONFIG = { "model": DEFAULT_MODEL, } - MAX_CONSECUTIVE_AUTO_REPLY = 100 # maximum number of consecutive auto replies (subject to future change) + MAX_CONSECUTIVE_AUTO_REPLY = ( + 100 # maximum number of consecutive auto replies (subject to future change) + ) def __init__( self, @@ -103,7 +105,9 @@ class ConversableAgent(Agent): self._oai_messages = defaultdict(list) self._oai_system_message = [{"content": 
system_message, "role": "system"}] self._is_termination_msg = ( - is_termination_msg if is_termination_msg is not None else (lambda x: x.get("content") == "TERMINATE") + is_termination_msg + if is_termination_msg is not None + else (lambda x: x.get("content") == "TERMINATE") ) if llm_config is False: self.llm_config = False @@ -112,21 +116,33 @@ class ConversableAgent(Agent): if isinstance(llm_config, dict): self.llm_config.update(llm_config) - self._code_execution_config = {} if code_execution_config is None else code_execution_config + self._code_execution_config = ( + {} if code_execution_config is None else code_execution_config + ) self.human_input_mode = human_input_mode self._max_consecutive_auto_reply = ( - max_consecutive_auto_reply if max_consecutive_auto_reply is not None else self.MAX_CONSECUTIVE_AUTO_REPLY + max_consecutive_auto_reply + if max_consecutive_auto_reply is not None + else self.MAX_CONSECUTIVE_AUTO_REPLY ) self._consecutive_auto_reply_counter = defaultdict(int) - self._max_consecutive_auto_reply_dict = defaultdict(self.max_consecutive_auto_reply) + self._max_consecutive_auto_reply_dict = defaultdict( + self.max_consecutive_auto_reply + ) self._function_map = {} if function_map is None else function_map self._default_auto_reply = default_auto_reply self._reply_func_list = [] self.reply_at_receive = defaultdict(bool) self.register_reply([Agent, None], ConversableAgent.generate_oai_reply) - self.register_reply([Agent, None], ConversableAgent.generate_code_execution_reply) - self.register_reply([Agent, None], ConversableAgent.generate_function_call_reply) - self.register_reply([Agent, None], ConversableAgent.check_termination_and_human_reply) + self.register_reply( + [Agent, None], ConversableAgent.generate_code_execution_reply + ) + self.register_reply( + [Agent, None], ConversableAgent.generate_function_call_reply + ) + self.register_reply( + [Agent, None], ConversableAgent.check_termination_and_human_reply + ) def register_reply( self, @@ -170,7 +186,9 @@ class ConversableAgent(Agent): The function returns None. Signature: ```def reset_config(config: Any)``` """ if not isinstance(trigger, (type, str, Agent, Callable, list)): - raise ValueError("trigger must be a class, a string, an agent, a callable or a list.") + raise ValueError( + "trigger must be a class, a string, an agent, a callable or a list." + ) self._reply_func_list.insert( position, { @@ -195,7 +213,9 @@ class ConversableAgent(Agent): """ self._oai_system_message[0]["content"] = system_message - def update_max_consecutive_auto_reply(self, value: int, sender: Optional[Agent] = None): + def update_max_consecutive_auto_reply( + self, value: int, sender: Optional[Agent] = None + ): """Update the maximum number of consecutive auto replies. Args: @@ -211,7 +231,11 @@ class ConversableAgent(Agent): def max_consecutive_auto_reply(self, sender: Optional[Agent] = None) -> int: """The maximum number of consecutive auto replies.""" - return self._max_consecutive_auto_reply if sender is None else self._max_consecutive_auto_reply_dict[sender] + return ( + self._max_consecutive_auto_reply + if sender is None + else self._max_consecutive_auto_reply_dict[sender] + ) @property def chat_messages(self) -> Dict[Agent, List[Dict]]: @@ -236,7 +260,9 @@ class ConversableAgent(Agent): if n_conversations == 1: for conversation in self._oai_messages.values(): return conversation[-1] - raise ValueError("More than one conversation is found. 
Please specify the sender to get the last message.") + raise ValueError( + "More than one conversation is found. Please specify the sender to get the last message." + ) return self._oai_messages[agent][-1] @property @@ -244,7 +270,11 @@ class ConversableAgent(Agent): """Bool value of whether to use docker to execute the code, or str value of the docker image name to use, or None when code execution is disabled. """ - return None if self._code_execution_config is False else self._code_execution_config.get("use_docker") + return ( + None + if self._code_execution_config is False + else self._code_execution_config.get("use_docker") + ) @staticmethod def _message_to_dict(message: Union[Dict, str]): @@ -257,7 +287,9 @@ class ConversableAgent(Agent): else: return message - def _append_oai_message(self, message: Union[Dict, str], role, conversation_id: Agent) -> bool: + def _append_oai_message( + self, message: Union[Dict, str], role, conversation_id: Agent + ) -> bool: """Append a message to the ChatCompletion conversation. If the message received is a string, it will be put in the "content" field of the new dictionary. @@ -275,16 +307,24 @@ class ConversableAgent(Agent): """ message = self._message_to_dict(message) # create oai message to be appended to the oai conversation that can be passed to oai directly. - oai_message = {k: message[k] for k in ("content", "function_call", "name", "context") if k in message} + oai_message = { + k: message[k] + for k in ("content", "function_call", "name", "context") + if k in message + } if "content" not in oai_message: if "function_call" in oai_message: - oai_message["content"] = None # if only function_call is provided, content will be set to None. + oai_message[ + "content" + ] = None # if only function_call is provided, content will be set to None. else: return False oai_message["role"] = "function" if message.get("role") == "function" else role if "function_call" in oai_message: - oai_message["role"] = "assistant" # only messages with role 'assistant' can have a function call. + oai_message[ + "role" + ] = "assistant" # only messages with role 'assistant' can have a function call. self._oai_messages[conversation_id].append(oai_message) return True @@ -390,7 +430,9 @@ class ConversableAgent(Agent): # print the message received print(colored(sender.name, "yellow"), "(to", f"{self.name}):\n", flush=True) if message.get("role") == "function": - func_print = f"***** Response from calling function \"{message['name']}\" *****" + func_print = ( + f"***** Response from calling function \"{message['name']}\" *****" + ) print(colored(func_print, "green"), flush=True) print(message["content"], flush=True) print(colored("*" * len(func_print), "green"), flush=True) @@ -401,7 +443,8 @@ class ConversableAgent(Agent): content = oai.ChatCompletion.instantiate( content, message["context"], - self.llm_config and self.llm_config.get("allow_format_str_template", False), + self.llm_config + and self.llm_config.get("allow_format_str_template", False), ) print(content, flush=True) if "function_call" in message: @@ -457,7 +500,11 @@ class ConversableAgent(Agent): ValueError: if the message can't be converted into a valid ChatCompletion message. 
""" self._process_received_message(message, sender, silent) - if request_reply is False or request_reply is None and self.reply_at_receive[sender] is False: + if ( + request_reply is False + or request_reply is None + and self.reply_at_receive[sender] is False + ): return reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender) if reply is not None: @@ -493,7 +540,11 @@ class ConversableAgent(Agent): ValueError: if the message can't be converted into a valid ChatCompletion message. """ self._process_received_message(message, sender, silent) - if request_reply is False or request_reply is None and self.reply_at_receive[sender] is False: + if ( + request_reply is False + or request_reply is None + and self.reply_at_receive[sender] is False + ): return reply = await self.a_generate_reply(sender=sender) if reply is not None: @@ -551,7 +602,9 @@ class ConversableAgent(Agent): "message" needs to be provided if the `generate_init_message` method is not overridden. """ self._prepare_chat(recipient, clear_history) - await self.a_send(self.generate_init_message(**context), recipient, silent=silent) + await self.a_send( + self.generate_init_message(**context), recipient, silent=silent + ) def reset(self): """Reset the agent.""" @@ -604,7 +657,9 @@ class ConversableAgent(Agent): # TODO: #1143 handle token limit exceeded error response = oai.ChatCompletion.create( - context=messages[-1].pop("context", None), messages=self._oai_system_message + messages, **llm_config + context=messages[-1].pop("context", None), + messages=self._oai_system_message + messages, + **llm_config, ) return True, oai.ChatCompletion.extract_text_or_function_call(response)[0] @@ -615,7 +670,9 @@ class ConversableAgent(Agent): config: Optional[Any] = None, ): """Generate a reply using code execution.""" - code_execution_config = config if config is not None else self._code_execution_config + code_execution_config = ( + config if config is not None else self._code_execution_config + ) if code_execution_config is False: return False, None if messages is None: @@ -634,7 +691,9 @@ class ConversableAgent(Agent): # found code blocks, execute code and push "last_n_messages" back exitcode, logs = self.execute_code_blocks(code_blocks) code_execution_config["last_n_messages"] = last_n_messages - exitcode2str = "execution succeeded" if exitcode == 0 else "execution failed" + exitcode2str = ( + "execution succeeded" if exitcode == 0 else "execution failed" + ) return True, f"exitcode: {exitcode} ({exitcode2str})\nCode output: {logs}" # no code blocks are found, push last_n_messages back and return. 
@@ -681,7 +740,10 @@ class ConversableAgent(Agent): # if the human input is empty, and the message is a termination message, then we will terminate the conversation reply = reply if reply or not self._is_termination_msg(message) else "exit" else: - if self._consecutive_auto_reply_counter[sender] >= self._max_consecutive_auto_reply_dict[sender]: + if ( + self._consecutive_auto_reply_counter[sender] + >= self._max_consecutive_auto_reply_dict[sender] + ): if self.human_input_mode == "NEVER": reply = "exit" else: @@ -776,7 +838,12 @@ class ConversableAgent(Agent): if asyncio.coroutines.iscoroutinefunction(reply_func): continue if self._match_trigger(reply_func_tuple["trigger"], sender): - final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple["config"]) + final, reply = reply_func( + self, + messages=messages, + sender=sender, + config=reply_func_tuple["config"], + ) if final: return reply return self._default_auto_reply @@ -827,10 +894,18 @@ class ConversableAgent(Agent): if self._match_trigger(reply_func_tuple["trigger"], sender): if asyncio.coroutines.iscoroutinefunction(reply_func): final, reply = await reply_func( - self, messages=messages, sender=sender, config=reply_func_tuple["config"] + self, + messages=messages, + sender=sender, + config=reply_func_tuple["config"], ) else: - final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple["config"]) + final, reply = reply_func( + self, + messages=messages, + sender=sender, + config=reply_func_tuple["config"], + ) if final: return reply return self._default_auto_reply @@ -897,10 +972,12 @@ class ConversableAgent(Agent): flush=True, ) if lang in ["bash", "shell", "sh"]: - exitcode, logs, image = self.run_code(code, lang=lang, **self._code_execution_config) + exitcode, logs, image = self.run_code( + code, lang=lang, **self._code_execution_config + ) elif lang in ["python", "Python"]: if code.startswith("# filename: "): - filename = code[11: code.find("\n")].strip() + filename = code[11 : code.find("\n")].strip() else: filename = None exitcode, logs, image = self.run_code( diff --git a/swarms/agents/models/groundingdino/datasets/cocogrounding_eval.py b/swarms/agents/models/groundingdino/datasets/cocogrounding_eval.py index ecf62093..cbe5bc4b 100644 --- a/swarms/agents/models/groundingdino/datasets/cocogrounding_eval.py +++ b/swarms/agents/models/groundingdino/datasets/cocogrounding_eval.py @@ -66,7 +66,9 @@ class CocoGroundingEvaluator(object): def synchronize_between_processes(self): for iou_type in self.iou_types: self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) - create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) + create_common_coco_eval( + self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type] + ) def accumulate(self): for coco_eval in self.coco_eval.values(): @@ -127,7 +129,9 @@ class CocoGroundingEvaluator(object): labels = prediction["labels"].tolist() rles = [ - mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] + mask_util.encode( + np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F") + )[0] for mask in masks ] for rle in rles: @@ -227,7 +231,9 @@ def evaluate(self): # add backward compatibility if useSegm is specified in params if p.useSegm is not None: p.iouType = "segm" if p.useSegm == 1 else "bbox" - print("useSegm (deprecated) is not None. Running {} evaluation".format(p.iouType)) + print( + "useSegm (deprecated) is not None. 
Running {} evaluation".format(p.iouType) + ) # print('Evaluate annotation type *{}*'.format(p.iouType)) p.imgIds = list(np.unique(p.imgIds)) if p.useCats: @@ -246,7 +252,8 @@ def evaluate(self): self.ious = { (imgId, catId): computeIoU(imgId, catId) for imgId in p.imgIds - for catId in catIds} + for catId in catIds + } evaluateImg = self.evaluateImg maxDet = p.maxDets[-1] diff --git a/swarms/agents/models/groundingdino/datasets/transforms.py b/swarms/agents/models/groundingdino/datasets/transforms.py index d4d4dc57..c34a1453 100644 --- a/swarms/agents/models/groundingdino/datasets/transforms.py +++ b/swarms/agents/models/groundingdino/datasets/transforms.py @@ -38,7 +38,7 @@ def crop(image, target, region): if "masks" in target: # FIXME should we update the area here if there are no boxes? - target["masks"] = target["masks"][:, i: i + h, j: j + w] + target["masks"] = target["masks"][:, i : i + h, j : j + w] fields.append("masks") # remove elements for which the boxes or masks that have zero area @@ -73,9 +73,9 @@ def hflip(image, target): target = target.copy() if "boxes" in target: boxes = target["boxes"] - boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor( - [w, 0, w, 0] - ) + boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor( + [-1, 1, -1, 1] + ) + torch.as_tensor([w, 0, w, 0]) target["boxes"] = boxes if "masks" in target: @@ -119,7 +119,9 @@ def resize(image, target, size, max_size=None): if target is None: return rescaled_image, None - ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratios = tuple( + float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size) + ) ratio_width, ratio_height = ratios target = target.copy() @@ -140,7 +142,8 @@ def resize(image, target, size, max_size=None): if "masks" in target: target["masks"] = ( - interpolate(target["masks"][:, None].float(), size, mode="nearest")[:, 0] > 0.5 + interpolate(target["masks"][:, None].float(), size, mode="nearest")[:, 0] + > 0.5 ) return rescaled_image, target @@ -155,7 +158,9 @@ def pad(image, target, padding): # should we do something wrt the original size? 
target["size"] = torch.tensor(padded_image.size[::-1]) if "masks" in target: - target["masks"] = torch.nn.functional.pad(target["masks"], (0, padding[0], 0, padding[1])) + target["masks"] = torch.nn.functional.pad( + target["masks"], (0, padding[0], 0, padding[1]) + ) return padded_image, target diff --git a/swarms/agents/models/groundingdino/models/GroundingDINO/backbone/backbone.py b/swarms/agents/models/groundingdino/models/GroundingDINO/backbone/backbone.py index 4b7ad0e2..a56f369e 100644 --- a/swarms/agents/models/groundingdino/models/GroundingDINO/backbone/backbone.py +++ b/swarms/agents/models/groundingdino/models/GroundingDINO/backbone/backbone.py @@ -47,14 +47,27 @@ class FrozenBatchNorm2d(torch.nn.Module): self.register_buffer("running_var", torch.ones(n)) def _load_from_state_dict( - self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, ): num_batches_tracked_key = prefix + "num_batches_tracked" if num_batches_tracked_key in state_dict: del state_dict[num_batches_tracked_key] super(FrozenBatchNorm2d, self)._load_from_state_dict( - state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, ) def forward(self, x): @@ -91,7 +104,11 @@ class BackboneBase(nn.Module): return_layers = {} for idx, layer_index in enumerate(return_interm_indices): return_layers.update( - {"layer{}".format(5 - len(return_interm_indices) + idx): "{}".format(layer_index)} + { + "layer{}".format(5 - len(return_interm_indices) + idx): "{}".format( + layer_index + ) + } ) # if len: @@ -136,10 +153,13 @@ class Backbone(BackboneBase): else: raise NotImplementedError("Why you can get here with name {}".format(name)) # num_channels = 512 if name in ('resnet18', 'resnet34') else 2048 - assert name not in ("resnet18", "resnet34"), "Only resnet50 and resnet101 are available." + assert name not in ( + "resnet18", + "resnet34", + ), "Only resnet50 and resnet101 are available." assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]] num_channels_all = [256, 512, 1024, 2048] - num_channels = num_channels_all[4 - len(return_interm_indices):] + num_channels = num_channels_all[4 - len(return_interm_indices) :] super().__init__(backbone, train_backbone, num_channels, return_interm_indices) @@ -204,7 +224,7 @@ def build_backbone(args): use_checkpoint=use_checkpoint, ) - bb_num_channels = backbone.num_features[4 - len(return_interm_indices):] + bb_num_channels = backbone.num_features[4 - len(return_interm_indices) :] else: raise NotImplementedError("Unknown backbone {}".format(args.backbone)) diff --git a/swarms/agents/models/groundingdino/models/GroundingDINO/backbone/position_encoding.py b/swarms/agents/models/groundingdino/models/GroundingDINO/backbone/position_encoding.py index eac7e896..58dc7967 100644 --- a/swarms/agents/models/groundingdino/models/GroundingDINO/backbone/position_encoding.py +++ b/swarms/agents/models/groundingdino/models/GroundingDINO/backbone/position_encoding.py @@ -33,7 +33,9 @@ class PositionEmbeddingSine(nn.Module): used by the Attention is all you need paper, generalized to work on images. 
""" - def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): + def __init__( + self, num_pos_feats=64, temperature=10000, normalize=False, scale=None + ): super().__init__() self.num_pos_feats = num_pos_feats self.temperature = temperature @@ -82,7 +84,12 @@ class PositionEmbeddingSineHW(nn.Module): """ def __init__( - self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None + self, + num_pos_feats=64, + temperatureH=10000, + temperatureW=10000, + normalize=False, + scale=None, ): super().__init__() self.num_pos_feats = num_pos_feats @@ -111,11 +118,15 @@ class PositionEmbeddingSineHW(nn.Module): x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) - dim_tx = self.temperatureW ** (2 * (torch.div(dim_tx, 2, rounding_mode='floor')) / self.num_pos_feats) + dim_tx = self.temperatureW ** ( + 2 * (torch.div(dim_tx, 2, rounding_mode="floor")) / self.num_pos_feats + ) pos_x = x_embed[:, :, :, None] / dim_tx dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) - dim_ty = self.temperatureH ** (2 * (torch.div(dim_ty, 2, rounding_mode='floor')) / self.num_pos_feats) + dim_ty = self.temperatureH ** ( + 2 * (torch.div(dim_ty, 2, rounding_mode="floor")) / self.num_pos_feats + ) pos_y = y_embed[:, :, :, None] / dim_ty pos_x = torch.stack( diff --git a/swarms/agents/models/groundingdino/models/GroundingDINO/backbone/swin_transformer.py b/swarms/agents/models/groundingdino/models/GroundingDINO/backbone/swin_transformer.py index e4edbc5a..1a74ca36 100644 --- a/swarms/agents/models/groundingdino/models/GroundingDINO/backbone/swin_transformer.py +++ b/swarms/agents/models/groundingdino/models/GroundingDINO/backbone/swin_transformer.py @@ -25,7 +25,12 @@ class Mlp(nn.Module): """Multilayer perceptron.""" def __init__( - self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, ): super().__init__() out_features = out_features or in_features @@ -54,7 +59,9 @@ def window_partition(x, window_size): """ B, H, W, C = x.shape x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) - windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + windows = ( + x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + ) return windows @@ -69,7 +76,9 @@ def window_reverse(windows, window_size, H, W): x: (B, H, W, C) """ B = int(windows.shape[0] / (H * W / window_size / window_size)) - x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = windows.view( + B, H // window_size, W // window_size, window_size, window_size, -1 + ) x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) return x @@ -97,7 +106,6 @@ class WindowAttention(nn.Module): attn_drop=0.0, proj_drop=0.0, ): - super().__init__() self.dim = dim self.window_size = window_size # Wh, Ww @@ -115,8 +123,12 @@ class WindowAttention(nn.Module): coords_w = torch.arange(self.window_size[1]) coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords = ( + coords_flatten[:, :, None] - 
coords_flatten[:, None, :] + ) # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0 + ).contiguous() # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += self.window_size[1] - 1 relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 @@ -143,7 +155,11 @@ class WindowAttention(nn.Module): .reshape(B_, N, 3, self.num_heads, C // self.num_heads) .permute(2, 0, 3, 1, 4) ) - q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + q, k, v = ( + qkv[0], + qkv[1], + qkv[2], + ) # make torchscript happy (cannot use tensor as tuple) q = q * self.scale attn = q @ k.transpose(-2, -1) @@ -151,7 +167,9 @@ class WindowAttention(nn.Module): relative_position_bias = self.relative_position_bias_table[ self.relative_position_index.view(-1) ].view( - self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1, ) # Wh*Ww,Wh*Ww,nH relative_position_bias = relative_position_bias.permute( 2, 0, 1 @@ -160,7 +178,9 @@ class WindowAttention(nn.Module): if mask is not None: nW = mask.shape[0] - attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze( + 1 + ).unsqueeze(0) attn = attn.view(-1, self.num_heads, N, N) attn = self.softmax(attn) else: @@ -212,7 +232,9 @@ class SwinTransformerBlock(nn.Module): self.window_size = window_size self.shift_size = shift_size self.mlp_ratio = mlp_ratio - assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + assert ( + 0 <= self.shift_size < self.window_size + ), "shift_size must in 0-window_size" self.norm1 = norm_layer(dim) self.attn = WindowAttention( @@ -229,7 +251,10 @@ class SwinTransformerBlock(nn.Module): self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( - in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, ) self.H = None @@ -259,7 +284,9 @@ class SwinTransformerBlock(nn.Module): # cyclic shift if self.shift_size > 0: - shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + shifted_x = torch.roll( + x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2) + ) attn_mask = mask_matrix else: shifted_x = x @@ -274,7 +301,9 @@ class SwinTransformerBlock(nn.Module): ) # nW*B, window_size*window_size, C # W-MSA/SW-MSA - attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + attn_windows = self.attn( + x_windows, mask=attn_mask + ) # nW*B, window_size*window_size, C # merge windows attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) @@ -282,7 +311,9 @@ class SwinTransformerBlock(nn.Module): # reverse cyclic shift if self.shift_size > 0: - x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + x = torch.roll( + shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2) + ) else: x = shifted_x @@ -393,7 +424,9 @@ class BasicLayer(nn.Module): qk_scale=qk_scale, drop=drop, attn_drop=attn_drop, - drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + drop_path=drop_path[i] + if isinstance(drop_path, list) + else drop_path, norm_layer=norm_layer, ) for i in range(depth) @@ -473,7 +506,9 @@ class PatchEmbed(nn.Module): 
self.in_chans = in_chans self.embed_dim = embed_dim - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size + ) if norm_layer is not None: self.norm = norm_layer(embed_dim) else: @@ -614,7 +649,7 @@ class SwinTransformer(nn.Module): qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, - drop_path=dpr[sum(depths[:i_layer]): sum(depths[: i_layer + 1])], + drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], norm_layer=norm_layer, # downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, downsample=downsamplelist[i_layer], @@ -700,7 +735,11 @@ class SwinTransformer(nn.Module): norm_layer = getattr(self, f"norm{i}") x_out = norm_layer(x_out) - out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() + out = ( + x_out.view(-1, H, W, self.num_features[i]) + .permute(0, 3, 1, 2) + .contiguous() + ) outs.append(out) # in: # torch.Size([2, 3, 1024, 1024]) @@ -735,7 +774,11 @@ class SwinTransformer(nn.Module): norm_layer = getattr(self, f"norm{i}") x_out = norm_layer(x_out) - out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() + out = ( + x_out.view(-1, H, W, self.num_features[i]) + .permute(0, 3, 1, 2) + .contiguous() + ) outs.append(out) # in: # torch.Size([2, 3, 1024, 1024]) @@ -748,7 +791,9 @@ class SwinTransformer(nn.Module): for idx, out_i in enumerate(outs): m = tensor_list.mask assert m is not None - mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0] + mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[ + 0 + ] outs_dict[idx] = NestedTensor(out_i, mask) return outs_dict @@ -776,13 +821,22 @@ def build_swin_transformer(modelname, pretrain_img_size, **kw): embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=7 ), "swin_B_384_22k": dict( - embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=12 + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=12, ), "swin_L_224_22k": dict( - embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=7 + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=7, ), "swin_L_384_22k": dict( - embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12 + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, ), } kw_cgf = model_para_dict[modelname] diff --git a/swarms/agents/models/groundingdino/models/GroundingDINO/bertwarper.py b/swarms/agents/models/groundingdino/models/GroundingDINO/bertwarper.py index e985ac5f..2ad9c020 100644 --- a/swarms/agents/models/groundingdino/models/GroundingDINO/bertwarper.py +++ b/swarms/agents/models/groundingdino/models/GroundingDINO/bertwarper.py @@ -61,14 +61,18 @@ class BertModelWarper(nn.Module): decoding (see :obj:`past_key_values`). 
""" output_attentions = ( - output_attentions if output_attentions is not None else self.config.output_attentions + output_attentions + if output_attentions is not None + else self.config.output_attentions ) output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -76,7 +80,9 @@ class BertModelWarper(nn.Module): use_cache = False if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) elif input_ids is not None: input_shape = input_ids.size() batch_size, seq_length = input_shape @@ -109,11 +115,17 @@ class BertModelWarper(nn.Module): # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.size() encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) else: encoder_extended_attention_mask = None # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': @@ -147,7 +159,9 @@ class BertModelWarper(nn.Module): return_dict=return_dict, ) sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + pooled_output = ( + self.pooler(sequence_output) if self.pooler is not None else None + ) if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] @@ -193,7 +207,10 @@ def generate_masks_with_special_tokens(tokenized, special_tokens_list, tokenizer # generate attention mask and positional ids attention_mask = ( - torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) + torch.eye(num_token, device=input_ids.device) + .bool() + .unsqueeze(0) + .repeat(bs, 1, 1) ) position_ids = torch.zeros((bs, num_token), device=input_ids.device) previous_col = 0 @@ -203,8 +220,10 @@ def generate_masks_with_special_tokens(tokenized, special_tokens_list, tokenizer attention_mask[row, col, col] = True position_ids[row, col] = 0 else: - attention_mask[row, previous_col + 1: col + 1, previous_col + 1: col + 1] = True - position_ids[row, previous_col + 1: col + 1] = torch.arange( + attention_mask[ + row, previous_col + 1 : col + 1, previous_col + 1 : col + 1 + ] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( 0, col - previous_col, device=input_ids.device ) @@ -217,7 +236,9 @@ def generate_masks_with_special_tokens(tokenized, special_tokens_list, tokenizer return attention_mask, position_ids.to(torch.long) -def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list, tokenizer): +def 
generate_masks_with_special_tokens_and_transfer_map( + tokenized, special_tokens_list, tokenizer +): """Generate attention mask between each pair of special tokens Args: input_ids (torch.Tensor): input ids. Shape: [bs, num_token] @@ -237,7 +258,10 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token # generate attention mask and positional ids attention_mask = ( - torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) + torch.eye(num_token, device=input_ids.device) + .bool() + .unsqueeze(0) + .repeat(bs, 1, 1) ) position_ids = torch.zeros((bs, num_token), device=input_ids.device) cate_to_token_mask_list = [[] for _ in range(bs)] @@ -248,12 +272,14 @@ def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_token attention_mask[row, col, col] = True position_ids[row, col] = 0 else: - attention_mask[row, previous_col + 1: col + 1, previous_col + 1: col + 1] = True - position_ids[row, previous_col + 1: col + 1] = torch.arange( + attention_mask[ + row, previous_col + 1 : col + 1, previous_col + 1 : col + 1 + ] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( 0, col - previous_col, device=input_ids.device ) c2t_maski = torch.zeros((num_token), device=input_ids.device).bool() - c2t_maski[previous_col + 1: col] = True + c2t_maski[previous_col + 1 : col] = True cate_to_token_mask_list[row].append(c2t_maski) previous_col = col diff --git a/swarms/agents/models/groundingdino/models/GroundingDINO/fuse_modules.py b/swarms/agents/models/groundingdino/models/GroundingDINO/fuse_modules.py index 2753b3dd..350dc635 100644 --- a/swarms/agents/models/groundingdino/models/GroundingDINO/fuse_modules.py +++ b/swarms/agents/models/groundingdino/models/GroundingDINO/fuse_modules.py @@ -127,7 +127,11 @@ class BiMultiHeadAttention(nn.Module): self._reset_parameters() def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + return ( + tensor.view(bsz, seq_len, self.num_heads, self.head_dim) + .transpose(1, 2) + .contiguous() + ) def _reset_parameters(self): nn.init.xavier_uniform_(self.v_proj.weight) @@ -171,7 +175,9 @@ class BiMultiHeadAttention(nn.Module): value_l_states = value_l_states.view(*proj_shape) src_len = key_states.size(1) - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt + attn_weights = torch.bmm( + query_states, key_states.transpose(1, 2) + ) # bs*nhead, nimg, ntxt if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( @@ -191,7 +197,9 @@ class BiMultiHeadAttention(nn.Module): ) # Do not increase 50000, data type half has quite limited range attn_weights_T = attn_weights.transpose(1, 2) - attn_weights_l = attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[0] + attn_weights_l = ( + attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[0] + ) if self.clamp_min_for_underflow: attn_weights_l = torch.clamp( attn_weights_l, min=-50000 @@ -204,7 +212,9 @@ class BiMultiHeadAttention(nn.Module): # mask vison for language if attention_mask_v is not None: attention_mask_v = ( - attention_mask_v[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + attention_mask_v[:, None, None, :] + .repeat(1, self.num_heads, 1, 1) + .flatten(0, 1) ) attn_weights_l.masked_fill_(attention_mask_v, float("-inf")) @@ -213,7 +223,9 @@ class BiMultiHeadAttention(nn.Module): # mask language for vision if attention_mask_l is 
not None: attention_mask_l = ( - attention_mask_l[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + attention_mask_l[:, None, None, :] + .repeat(1, self.num_heads, 1, 1) + .flatten(0, 1) ) attn_weights.masked_fill_(attention_mask_l, float("-inf")) attn_weights_v = attn_weights.softmax(dim=-1) @@ -275,13 +287,21 @@ class BiAttentionBlock(nn.Module): self.layer_norm_v = nn.LayerNorm(v_dim) self.layer_norm_l = nn.LayerNorm(l_dim) self.attn = BiMultiHeadAttention( - v_dim=v_dim, l_dim=l_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout + v_dim=v_dim, + l_dim=l_dim, + embed_dim=embed_dim, + num_heads=num_heads, + dropout=dropout, ) # add layer scale for training stability self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.gamma_v = nn.Parameter(init_values * torch.ones((v_dim)), requires_grad=True) - self.gamma_l = nn.Parameter(init_values * torch.ones((l_dim)), requires_grad=True) + self.gamma_v = nn.Parameter( + init_values * torch.ones((v_dim)), requires_grad=True + ) + self.gamma_l = nn.Parameter( + init_values * torch.ones((l_dim)), requires_grad=True + ) def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): v = self.layer_norm_v(v) diff --git a/swarms/agents/models/groundingdino/models/GroundingDINO/groundingdino.py b/swarms/agents/models/groundingdino/models/GroundingDINO/groundingdino.py index 2fe41f37..5a27bf74 100644 --- a/swarms/agents/models/groundingdino/models/GroundingDINO/groundingdino.py +++ b/swarms/agents/models/groundingdino/models/GroundingDINO/groundingdino.py @@ -100,13 +100,17 @@ class GroundingDINO(nn.Module): self.bert.pooler.dense.bias.requires_grad_(False) self.bert = BertModelWarper(bert_model=self.bert) - self.feat_map = nn.Linear(self.bert.config.hidden_size, self.hidden_dim, bias=True) + self.feat_map = nn.Linear( + self.bert.config.hidden_size, self.hidden_dim, bias=True + ) nn.init.constant_(self.feat_map.bias.data, 0) nn.init.xavier_uniform_(self.feat_map.weight.data) # freeze # special tokens - self.specical_tokens = self.tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) + self.specical_tokens = self.tokenizer.convert_tokens_to_ids( + ["[CLS]", "[SEP]", ".", "?"] + ) # prepare input projection layers if num_feature_levels > 1: @@ -123,14 +127,18 @@ class GroundingDINO(nn.Module): for _ in range(num_feature_levels - num_backbone_outs): input_proj_list.append( nn.Sequential( - nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), + nn.Conv2d( + in_channels, hidden_dim, kernel_size=3, stride=2, padding=1 + ), nn.GroupNorm(32, hidden_dim), ) ) in_channels = hidden_dim self.input_proj = nn.ModuleList(input_proj_list) else: - assert two_stage_type == "no", "two_stage_type should be no if num_feature_levels=1 !!!" + assert ( + two_stage_type == "no" + ), "two_stage_type should be no if num_feature_levels=1 !!!" 
self.input_proj = nn.ModuleList( [ nn.Sequential( @@ -157,12 +165,17 @@ class GroundingDINO(nn.Module): nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) if dec_pred_bbox_embed_share: - box_embed_layerlist = [_bbox_embed for i in range(transformer.num_decoder_layers)] + box_embed_layerlist = [ + _bbox_embed for i in range(transformer.num_decoder_layers) + ] else: box_embed_layerlist = [ - copy.deepcopy(_bbox_embed) for i in range(transformer.num_decoder_layers) + copy.deepcopy(_bbox_embed) + for i in range(transformer.num_decoder_layers) ] - class_embed_layerlist = [_class_embed for i in range(transformer.num_decoder_layers)] + class_embed_layerlist = [ + _class_embed for i in range(transformer.num_decoder_layers) + ] self.bbox_embed = nn.ModuleList(box_embed_layerlist) self.class_embed = nn.ModuleList(class_embed_layerlist) self.transformer.decoder.bbox_embed = self.bbox_embed @@ -170,9 +183,10 @@ class GroundingDINO(nn.Module): # two stage self.two_stage_type = two_stage_type - assert two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format( - two_stage_type - ) + assert two_stage_type in [ + "no", + "standard", + ], "unknown param {} of two_stage_type".format(two_stage_type) if two_stage_type != "no": if two_stage_bbox_embed_share: assert dec_pred_bbox_embed_share @@ -237,12 +251,18 @@ class GroundingDINO(nn.Module): ] position_ids = position_ids[:, : self.max_text_len] tokenized["input_ids"] = tokenized["input_ids"][:, : self.max_text_len] - tokenized["attention_mask"] = tokenized["attention_mask"][:, : self.max_text_len] - tokenized["token_type_ids"] = tokenized["token_type_ids"][:, : self.max_text_len] + tokenized["attention_mask"] = tokenized["attention_mask"][ + :, : self.max_text_len + ] + tokenized["token_type_ids"] = tokenized["token_type_ids"][ + :, : self.max_text_len + ] # extract text embeddings if self.sub_sentence_present: - tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} + tokenized_for_encoder = { + k: v for k, v in tokenized.items() if k != "attention_mask" + } tokenized_for_encoder["attention_mask"] = text_self_attention_masks tokenized_for_encoder["position_ids"] = position_ids else: @@ -251,7 +271,9 @@ class GroundingDINO(nn.Module): bert_output = self.bert(**tokenized_for_encoder) # bs, 195, 768 - encoded_text = self.feat_map(bert_output["last_hidden_state"]) # bs, 195, d_model + encoded_text = self.feat_map( + bert_output["last_hidden_state"] + ) # bs, 195, d_model text_token_mask = tokenized.attention_mask.bool() # bs, 195 # text_token_mask: True for nomask, False for mask # text_self_attention_masks: True for nomask, False for mask @@ -292,7 +314,9 @@ class GroundingDINO(nn.Module): else: src = self.input_proj[l](srcs[-1]) m = samples.mask - mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0] + mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to( + torch.bool + )[0] pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype) srcs.append(src) masks.append(mask) @@ -350,7 +374,6 @@ class GroundingDINO(nn.Module): @MODULE_BUILD_FUNCS.registe_with_name(module_name="groundingdino") def build_groundingdino(args): - backbone = build_backbone(args) transformer = build_transformer(args) diff --git a/swarms/agents/models/groundingdino/models/GroundingDINO/ms_deform_attn.py b/swarms/agents/models/groundingdino/models/GroundingDINO/ms_deform_attn.py index 76e2d983..4af25107 100644 --- a/swarms/agents/models/groundingdino/models/GroundingDINO/ms_deform_attn.py +++ 
b/swarms/agents/models/groundingdino/models/GroundingDINO/ms_deform_attn.py @@ -34,7 +34,9 @@ except BaseException: # helpers def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): - raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) + raise ValueError( + "invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)) + ) return (n & (n - 1) == 0) and n != 0 @@ -96,7 +98,6 @@ def multi_scale_deformable_attn_pytorch( sampling_locations: torch.Tensor, attention_weights: torch.Tensor, ) -> torch.Tensor: - bs, _, num_heads, embed_dims = value.shape _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) @@ -108,7 +109,10 @@ def multi_scale_deformable_attn_pytorch( # bs, num_heads*embed_dims, H_*W_ -> # bs*num_heads, embed_dims, H_, W_ value_l_ = ( - value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_) + value_list[level] + .flatten(2) + .transpose(1, 2) + .reshape(bs * num_heads, embed_dims, H_, W_) ) # bs, num_queries, num_heads, num_points, 2 -> # bs, num_heads, num_queries, num_points, 2 -> @@ -116,7 +120,11 @@ def multi_scale_deformable_attn_pytorch( sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1) # bs*num_heads, embed_dims, num_queries, num_points sampling_value_l_ = F.grid_sample( - value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False + value_l_, + sampling_grid_l_, + mode="bilinear", + padding_mode="zeros", + align_corners=False, ) sampling_value_list.append(sampling_value_l_) # (bs, num_queries, num_heads, num_levels, num_points) -> @@ -184,8 +192,12 @@ class MultiScaleDeformableAttention(nn.Module): self.num_heads = num_heads self.num_levels = num_levels self.num_points = num_points - self.sampling_offsets = nn.Linear(embed_dim, num_heads * num_levels * num_points * 2) - self.attention_weights = nn.Linear(embed_dim, num_heads * num_levels * num_points) + self.sampling_offsets = nn.Linear( + embed_dim, num_heads * num_levels * num_points * 2 + ) + self.attention_weights = nn.Linear( + embed_dim, num_heads * num_levels * num_points + ) self.value_proj = nn.Linear(embed_dim, embed_dim) self.output_proj = nn.Linear(embed_dim, embed_dim) @@ -306,7 +318,9 @@ class MultiScaleDeformableAttention(nn.Module): # bs, num_query, num_heads, num_levels, num_points, 2 if reference_points.shape[-1] == 2: - offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1 + ) sampling_locations = ( reference_points[:, :, None, :, None, :] + sampling_offsets / offset_normalizer[None, None, None, :, None, :] @@ -370,7 +384,9 @@ def create_dummy_class(klass, dependency, message=""): Returns: class: a class object """ - err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, klass) + err = "Cannot import '{}', therefore '{}' is not available.".format( + dependency, klass + ) if message: err = err + " " + message @@ -399,7 +415,9 @@ def create_dummy_func(func, dependency, message=""): Returns: function: a function object """ - err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, func) + err = "Cannot import '{}', therefore '{}' is not available.".format( + dependency, func + ) if message: err = err + " " + message diff --git a/swarms/agents/models/groundingdino/models/GroundingDINO/transformer.py 
b/swarms/agents/models/groundingdino/models/GroundingDINO/transformer.py index ffc6a09c..3ac66166 100644 --- a/swarms/agents/models/groundingdino/models/GroundingDINO/transformer.py +++ b/swarms/agents/models/groundingdino/models/GroundingDINO/transformer.py @@ -82,7 +82,13 @@ class Transformer(nn.Module): # choose encoder layer type encoder_layer = DeformableTransformerEncoderLayer( - d_model, dim_feedforward, dropout, activation, num_feature_levels, nhead, enc_n_points + d_model, + dim_feedforward, + dropout, + activation, + num_feature_levels, + nhead, + enc_n_points, ) if use_text_enhancer: @@ -154,7 +160,9 @@ class Transformer(nn.Module): if num_feature_levels > 1: if self.num_encoder_layers > 0: - self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) + self.level_embed = nn.Parameter( + torch.Tensor(num_feature_levels, d_model) + ) else: self.level_embed = None @@ -169,9 +177,10 @@ class Transformer(nn.Module): # for two stage self.two_stage_type = two_stage_type - assert two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format( - two_stage_type - ) + assert two_stage_type in [ + "no", + "standard", + ], "unknown param {} of two_stage_type".format(two_stage_type) if two_stage_type == "standard": # anchor selection at the output of encoder self.enc_output = nn.Linear(d_model, d_model) @@ -208,7 +217,16 @@ class Transformer(nn.Module): def init_ref_points(self, use_num_queries): self.refpoint_embed = nn.Embedding(use_num_queries, 4) - def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, text_dict=None): + def forward( + self, + srcs, + masks, + refpoint_embed, + pos_embeds, + tgt, + attn_mask=None, + text_dict=None, + ): """ Input: - srcs: List of multi features [bs, ci, hi, wi] @@ -287,7 +305,9 @@ class Transformer(nn.Module): output_memory = self.enc_output_norm(self.enc_output(output_memory)) if text_dict is not None: - enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict) + enc_outputs_class_unselected = self.enc_out_class_embed( + output_memory, text_dict + ) else: enc_outputs_class_unselected = self.enc_out_class_embed(output_memory) @@ -301,7 +321,9 @@ class Transformer(nn.Module): # gather boxes refpoint_embed_undetach = torch.gather( - enc_outputs_coord_unselected, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + enc_outputs_coord_unselected, + 1, + topk_proposals.unsqueeze(-1).repeat(1, 1, 4), ) # unsigmoid refpoint_embed_ = refpoint_embed_undetach.detach() init_box_proposal = torch.gather( @@ -310,7 +332,9 @@ class Transformer(nn.Module): # gather tgt tgt_undetach = torch.gather( - output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) + output_memory, + 1, + topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model), ) if self.embed_init_tgt: tgt_ = ( @@ -350,7 +374,9 @@ class Transformer(nn.Module): init_box_proposal = refpoint_embed_.sigmoid() else: - raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type)) + raise NotImplementedError( + "unknown two_stage_type {}".format(self.two_stage_type) + ) ######################################################### # End preparing tgt # - tgt: bs, NQ, d_model @@ -432,7 +458,9 @@ class TransformerEncoder(nn.Module): self.text_layers = [] self.fusion_layers = [] if num_layers > 0: - self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share) + self.layers = _get_clones( + encoder_layer, num_layers, layer_share=enc_layer_share + ) if text_enhance_layer is not None: 
self.text_layers = _get_clones( @@ -465,7 +493,6 @@ class TransformerEncoder(nn.Module): def get_reference_points(spatial_shapes, valid_ratios, device): reference_points_list = [] for lvl, (H_, W_) in enumerate(spatial_shapes): - ref_y, ref_x = torch.meshgrid( torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device), @@ -534,7 +561,9 @@ class TransformerEncoder(nn.Module): .unsqueeze(-1) .repeat(bs, 1, 1) ) - pos_text = get_sine_pos_embed(pos_text, num_pos_feats=256, exchange_xy=False) + pos_text = get_sine_pos_embed( + pos_text, num_pos_feats=256, exchange_xy=False + ) if position_ids is not None: pos_text = get_sine_pos_embed( position_ids[..., None], num_pos_feats=256, exchange_xy=False @@ -662,7 +691,6 @@ class TransformerDecoder(nn.Module): ref_points = [reference_points] for layer_id, layer in enumerate(self.layers): - if reference_points.shape[-1] == 4: reference_points_input = ( reference_points[:, :, None] @@ -670,7 +698,9 @@ class TransformerDecoder(nn.Module): ) # nq, bs, nlevel, 4 else: assert reference_points.shape[-1] == 2 - reference_points_input = reference_points[:, :, None] * valid_ratios[None, :] + reference_points_input = ( + reference_points[:, :, None] * valid_ratios[None, :] + ) query_sine_embed = gen_sineembed_for_position( reference_points_input[:, :, 0, :] ) # nq, bs, 256*2 @@ -777,7 +807,13 @@ class DeformableTransformerEncoderLayer(nn.Module): return src def forward( - self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None + self, + src, + pos, + reference_points, + spatial_shapes, + level_start_index, + key_padding_mask=None, ): # self attention # import ipdb; ipdb.set_trace() diff --git a/swarms/agents/models/groundingdino/models/GroundingDINO/transformer_vanilla.py b/swarms/agents/models/groundingdino/models/GroundingDINO/transformer_vanilla.py index 85f6822a..ae0ad82a 100644 --- a/swarms/agents/models/groundingdino/models/GroundingDINO/transformer_vanilla.py +++ b/swarms/agents/models/groundingdino/models/GroundingDINO/transformer_vanilla.py @@ -26,7 +26,9 @@ from .utils import ( class TextTransformer(nn.Module): - def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1): + def __init__( + self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1 + ): super().__init__() self.num_layers = num_layers self.d_model = d_model @@ -35,7 +37,10 @@ class TextTransformer(nn.Module): self.norm = None single_encoder_layer = TransformerEncoderLayer( - d_model=d_model, nhead=nheads, dim_feedforward=dim_feedforward, dropout=dropout + d_model=d_model, + nhead=nheads, + dim_feedforward=dim_feedforward, + dropout=dropout, ) self.layers = _get_clones(single_encoder_layer, num_layers) diff --git a/swarms/agents/models/groundingdino/models/GroundingDINO/utils.py b/swarms/agents/models/groundingdino/models/GroundingDINO/utils.py index 8140b35e..2bb3e9b8 100644 --- a/swarms/agents/models/groundingdino/models/GroundingDINO/utils.py +++ b/swarms/agents/models/groundingdino/models/GroundingDINO/utils.py @@ -39,14 +39,20 @@ def get_sine_pos_embed( """ scale = 2 * math.pi dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device) - dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + dim_t = temperature ** ( + 2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats + ) def sine_func(x: torch.Tensor): sin_x = x * scale / dim_t - sin_x = 
torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3).flatten(2) + sin_x = torch.stack( + (sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3 + ).flatten(2) return sin_x - pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)] + pos_res = [ + sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1) + ] if exchange_xy: pos_res[0], pos_res[1] = pos_res[1], pos_res[0] pos_res = torch.cat(pos_res, dim=-1) @@ -70,7 +76,9 @@ def gen_encoder_output_proposals( proposals = [] _cur = 0 for lvl, (H_, W_) in enumerate(spatial_shapes): - mask_flatten_ = memory_padding_mask[:, _cur: (_cur + H_ * W_)].view(N_, H_, W_, 1) + mask_flatten_ = memory_padding_mask[:, _cur : (_cur + H_ * W_)].view( + N_, H_, W_, 1 + ) valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) @@ -82,7 +90,9 @@ def gen_encoder_output_proposals( ) grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2 - scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) + scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view( + N_, 1, 1, 2 + ) grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale if learnedwh is not None: @@ -99,15 +109,21 @@ def gen_encoder_output_proposals( _cur += H_ * W_ # import ipdb; ipdb.set_trace() output_proposals = torch.cat(proposals, 1) - output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all( - -1, keepdim=True - ) + output_proposals_valid = ( + (output_proposals > 0.01) & (output_proposals < 0.99) + ).all(-1, keepdim=True) output_proposals = torch.log(output_proposals / (1 - output_proposals)) # unsigmoid - output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float("inf")) - output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + output_proposals = output_proposals.masked_fill( + memory_padding_mask.unsqueeze(-1), float("inf") + ) + output_proposals = output_proposals.masked_fill( + ~output_proposals_valid, float("inf") + ) output_memory = memory - output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill( + memory_padding_mask.unsqueeze(-1), float(0) + ) output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) # output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) @@ -136,7 +152,12 @@ class RandomBoxPerturber: def sigmoid_focal_loss( - inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, no_reduction=False + inputs, + targets, + num_boxes, + alpha: float = 0.25, + gamma: float = 2, + no_reduction=False, ): """ Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 
@@ -206,23 +227,31 @@ def gen_sineembed_for_position(pos_tensor): # sineembed_tensor = torch.zeros(n_query, bs, 256) scale = 2 * math.pi dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) - dim_t = 10000 ** (2 * (torch.div(dim_t, 2, rounding_mode='floor')) / 128) + dim_t = 10000 ** (2 * (torch.div(dim_t, 2, rounding_mode="floor")) / 128) x_embed = pos_tensor[:, :, 0] * scale y_embed = pos_tensor[:, :, 1] * scale pos_x = x_embed[:, :, None] / dim_t pos_y = y_embed[:, :, None] / dim_t - pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) - pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + pos_x = torch.stack( + (pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3 + ).flatten(2) + pos_y = torch.stack( + (pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3 + ).flatten(2) if pos_tensor.size(-1) == 2: pos = torch.cat((pos_y, pos_x), dim=2) elif pos_tensor.size(-1) == 4: w_embed = pos_tensor[:, :, 2] * scale pos_w = w_embed[:, :, None] / dim_t - pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) + pos_w = torch.stack( + (pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3 + ).flatten(2) h_embed = pos_tensor[:, :, 3] * scale pos_h = h_embed[:, :, None] / dim_t - pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) + pos_h = torch.stack( + (pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3 + ).flatten(2) pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) else: @@ -262,7 +291,9 @@ class ContrastiveEmbed(nn.Module): res.masked_fill_(~text_token_mask[:, None, :], float("-inf")) # padding to max_text_len - new_res = torch.full((*res.shape[:-1], self.max_text_len), float("-inf"), device=res.device) + new_res = torch.full( + (*res.shape[:-1], self.max_text_len), float("-inf"), device=res.device + ) new_res[..., : res.shape[-1]] = res return new_res diff --git a/swarms/agents/models/groundingdino/models/registry.py b/swarms/agents/models/groundingdino/models/registry.py index 2d22a59e..c31ff3ac 100644 --- a/swarms/agents/models/groundingdino/models/registry.py +++ b/swarms/agents/models/groundingdino/models/registry.py @@ -57,7 +57,9 @@ class Registry(object): if module_name is None: module_name = module_build_function.__name__ if not force and module_name in self._module_dict: - raise KeyError("{} is already registered in {}".format(module_name, self.name)) + raise KeyError( + "{} is already registered in {}".format(module_name, self.name) + ) self._module_dict[module_name] = module_build_function return module_build_function diff --git a/swarms/agents/models/groundingdino/util/get_tokenlizer.py b/swarms/agents/models/groundingdino/util/get_tokenlizer.py index 2c8b6842..6c4c8dd5 100644 --- a/swarms/agents/models/groundingdino/util/get_tokenlizer.py +++ b/swarms/agents/models/groundingdino/util/get_tokenlizer.py @@ -22,7 +22,9 @@ def get_tokenlizer(text_encoder_type): def get_pretrained_language_model(text_encoder_type): - if text_encoder_type == "bert-base-uncased" or (os.path.isdir(text_encoder_type) and os.path.exists(text_encoder_type)): + if text_encoder_type == "bert-base-uncased" or ( + os.path.isdir(text_encoder_type) and os.path.exists(text_encoder_type) + ): return BertModel.from_pretrained(text_encoder_type) if text_encoder_type == "roberta-base": return RobertaModel.from_pretrained(text_encoder_type) diff --git a/swarms/agents/models/groundingdino/util/inference.py 
b/swarms/agents/models/groundingdino/util/inference.py index 55087a5e..a868a55a 100644 --- a/swarms/agents/models/groundingdino/util/inference.py +++ b/swarms/agents/models/groundingdino/util/inference.py @@ -26,7 +26,9 @@ def preprocess_caption(caption: str) -> str: return result + "." -def load_model(model_config_path: str, model_checkpoint_path: str, device: str = "cuda"): +def load_model( + model_config_path: str, model_checkpoint_path: str, device: str = "cuda" +): args = SLConfig.fromfile(model_config_path) args.device = device model = build_model(args) @@ -51,13 +53,13 @@ def load_image(image_path: str) -> Tuple[np.array, torch.Tensor]: def predict( - model, - image: torch.Tensor, - caption: str, - box_threshold: float, - text_threshold: float, - device: str = "cuda", - remove_combined: bool = False + model, + image: torch.Tensor, + caption: str, + box_threshold: float, + text_threshold: float, + device: str = "cuda", + remove_combined: bool = False, ) -> Tuple[torch.Tensor, torch.Tensor, List[str]]: caption = preprocess_caption(caption=caption) @@ -67,8 +69,12 @@ def predict( with torch.no_grad(): outputs = model(image[None], captions=[caption]) - prediction_logits = outputs["pred_logits"].cpu().sigmoid()[0] # prediction_logits.shape = (nq, 256) - prediction_boxes = outputs["pred_boxes"].cpu()[0] # prediction_boxes.shape = (nq, 4) + prediction_logits = ( + outputs["pred_logits"].cpu().sigmoid()[0] + ) # prediction_logits.shape = (nq, 256) + prediction_boxes = outputs["pred_boxes"].cpu()[ + 0 + ] # prediction_boxes.shape = (nq, 4) mask = prediction_logits.max(dim=1)[0] > box_threshold logits = prediction_logits[mask] # logits.shape = (n, 256) @@ -78,7 +84,11 @@ def predict( tokenized = tokenizer(caption) if remove_combined: - sep_idx = [i for i in range(len(tokenized['input_ids'])) if tokenized['input_ids'][i] in [101, 102, 1012]] + sep_idx = [ + i + for i in range(len(tokenized["input_ids"])) + if tokenized["input_ids"][i] in [101, 102, 1012] + ] phrases = [] for logit in logits: @@ -86,32 +96,40 @@ def predict( insert_idx = bisect.bisect_left(sep_idx, max_idx) right_idx = sep_idx[insert_idx] left_idx = sep_idx[insert_idx - 1] - phrases.append(get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer, left_idx, right_idx).replace('.', '')) + phrases.append( + get_phrases_from_posmap( + logit > text_threshold, tokenized, tokenizer, left_idx, right_idx + ).replace(".", "") + ) else: phrases = [ - get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer).replace('.', '') - for logit - in logits + get_phrases_from_posmap( + logit > text_threshold, tokenized, tokenizer + ).replace(".", "") + for logit in logits ] return boxes, logits.max(dim=1)[0], phrases -def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor, phrases: List[str]) -> np.ndarray: +def annotate( + image_source: np.ndarray, + boxes: torch.Tensor, + logits: torch.Tensor, + phrases: List[str], +) -> np.ndarray: h, w, _ = image_source.shape boxes = boxes * torch.Tensor([w, h, w, h]) xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy() detections = sv.Detections(xyxy=xyxy) - labels = [ - f"{phrase} {logit:.2f}" - for phrase, logit - in zip(phrases, logits) - ] + labels = [f"{phrase} {logit:.2f}" for phrase, logit in zip(phrases, logits)] box_annotator = sv.BoxAnnotator() annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR) - annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels) + annotated_frame = 
box_annotator.annotate( + scene=annotated_frame, detections=detections, labels=labels + ) return annotated_frame @@ -121,17 +139,13 @@ def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor class Model: - def __init__( - self, - model_config_path: str, - model_checkpoint_path: str, - device: str = "cuda" + self, model_config_path: str, model_checkpoint_path: str, device: str = "cuda" ): self.model = load_model( model_config_path=model_config_path, model_checkpoint_path=model_checkpoint_path, - device=device + device=device, ).to(device) self.device = device @@ -140,7 +154,7 @@ class Model: image: np.ndarray, caption: str, box_threshold: float = 0.35, - text_threshold: float = 0.25 + text_threshold: float = 0.25, ) -> Tuple[sv.Detections, List[str]]: """ import cv2 @@ -167,13 +181,12 @@ class Model: caption=caption, box_threshold=box_threshold, text_threshold=text_threshold, - device=self.device) + device=self.device, + ) source_h, source_w, _ = image.shape detections = Model.post_process_result( - source_h=source_h, - source_w=source_w, - boxes=boxes, - logits=logits) + source_h=source_h, source_w=source_w, boxes=boxes, logits=logits + ) return detections, phrases def predict_with_classes( @@ -181,7 +194,7 @@ class Model: image: np.ndarray, classes: List[str], box_threshold: float, - text_threshold: float + text_threshold: float, ) -> sv.Detections: """ import cv2 @@ -210,13 +223,12 @@ class Model: caption=caption, box_threshold=box_threshold, text_threshold=text_threshold, - device=self.device) + device=self.device, + ) source_h, source_w, _ = image.shape detections = Model.post_process_result( - source_h=source_h, - source_w=source_w, - boxes=boxes, - logits=logits) + source_h=source_h, source_w=source_w, boxes=boxes, logits=logits + ) class_id = Model.phrases2classes(phrases=phrases, classes=classes) detections.class_id = class_id return detections @@ -236,10 +248,7 @@ class Model: @staticmethod def post_process_result( - source_h: int, - source_w: int, - boxes: torch.Tensor, - logits: torch.Tensor + source_h: int, source_w: int, boxes: torch.Tensor, logits: torch.Tensor ) -> sv.Detections: boxes = boxes * torch.Tensor([source_w, source_h, source_w, source_h]) xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy() diff --git a/swarms/agents/models/groundingdino/util/logger.py b/swarms/agents/models/groundingdino/util/logger.py index 18145f54..3566fe1b 100644 --- a/swarms/agents/models/groundingdino/util/logger.py +++ b/swarms/agents/models/groundingdino/util/logger.py @@ -29,7 +29,9 @@ class _ColorfulFormatter(logging.Formatter): # so that calling setup_logger multiple times won't add many handlers @functools.lru_cache() -def setup_logger(output=None, distributed_rank=0, *, color=True, name="imagenet", abbrev_name=None): +def setup_logger( + output=None, distributed_rank=0, *, color=True, name="imagenet", abbrev_name=None +): """ Initialize the detectron2 logger and set its verbosity level to "INFO". 
diff --git a/swarms/agents/models/groundingdino/util/misc.py b/swarms/agents/models/groundingdino/util/misc.py index d64b84ef..158f6a82 100644 --- a/swarms/agents/models/groundingdino/util/misc.py +++ b/swarms/agents/models/groundingdino/util/misc.py @@ -135,7 +135,9 @@ def all_gather_cpu(data): # obtain Tensor size of each rank local_size = torch.tensor([tensor.numel()], device=device, dtype=torch.long) - size_list = [torch.tensor([0], device=device, dtype=torch.long) for _ in range(world_size)] + size_list = [ + torch.tensor([0], device=device, dtype=torch.long) for _ in range(world_size) + ] if cpu_group is None: dist.all_gather(size_list, local_size) else: @@ -153,7 +155,9 @@ def all_gather_cpu(data): for _ in size_list: tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device=device)) if local_size != max_size: - padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device=device) + padding = torch.empty( + size=(max_size - local_size,), dtype=torch.uint8, device=device + ) tensor = torch.cat((tensor, padding), dim=0) if cpu_group is None: dist.all_gather(tensor_list, tensor) @@ -205,7 +209,9 @@ def all_gather(data): for _ in size_list: tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) if local_size != max_size: - padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") + padding = torch.empty( + size=(max_size - local_size,), dtype=torch.uint8, device="cuda" + ) tensor = torch.cat((tensor, padding), dim=0) dist.all_gather(tensor_list, tensor) @@ -261,7 +267,9 @@ class MetricLogger(object): return self.meters[attr] if attr in self.__dict__: return self.__dict__[attr] - raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr)) + raise AttributeError( + "'{}' object has no attribute '{}'".format(type(self).__name__, attr) + ) def __str__(self): loss_str = [] @@ -434,7 +442,9 @@ class NestedTensor(object): return NestedTensor(cast_tensor, cast_mask) def to_img_list_single(self, tensor, mask): - assert tensor.dim() == 3, "dim of tensor should be 3 but {}".format(tensor.dim()) + assert tensor.dim() == 3, "dim of tensor should be 3 but {}".format( + tensor.dim() + ) maxH = (~mask).sum(0).max() maxW = (~mask).sum(1).max() img = tensor[:, :maxH, :maxW] @@ -516,11 +526,15 @@ def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTen padded_masks = [] for img in tensor_list: padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] - padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) + padded_img = torch.nn.functional.pad( + img, (0, padding[2], 0, padding[1], 0, padding[0]) + ) padded_imgs.append(padded_img) m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) - padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) + padded_mask = torch.nn.functional.pad( + m, (0, padding[2], 0, padding[1]), "constant", 1 + ) padded_masks.append(padded_mask.to(torch.bool)) tensor = torch.stack(padded_imgs) @@ -575,7 +589,9 @@ def save_on_master(*args, **kwargs): def init_distributed_mode(args): - if "WORLD_SIZE" in os.environ and os.environ["WORLD_SIZE"] != "": # 'RANK' in os.environ and + if ( + "WORLD_SIZE" in os.environ and os.environ["WORLD_SIZE"] != "" + ): # 'RANK' in os.environ and args.rank = int(os.environ["RANK"]) args.world_size = int(os.environ["WORLD_SIZE"]) args.gpu = args.local_rank = int(os.environ["LOCAL_RANK"]) @@ -615,11 +631,17 @@ def 
init_distributed_mode(args): args.local_rank = 0 return - print("world_size:{} rank:{} local_rank:{}".format(args.world_size, args.rank, args.local_rank)) + print( + "world_size:{} rank:{} local_rank:{}".format( + args.world_size, args.rank, args.local_rank + ) + ) args.distributed = True torch.cuda.set_device(args.local_rank) args.dist_backend = "nccl" - print("| distributed init (rank {}): {}".format(args.rank, args.dist_url), flush=True) + print( + "| distributed init (rank {}): {}".format(args.rank, args.dist_url), flush=True + ) torch.distributed.init_process_group( backend=args.dist_backend, @@ -666,7 +688,9 @@ def accuracy_onehot(pred, gt): return acc -def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None): +def interpolate( + input, size=None, scale_factor=None, mode="nearest", align_corners=None +): # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor """ Equivalent to nn.functional.interpolate, but with support for empty batch sizes. @@ -675,13 +699,17 @@ def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corne """ if __torchvision_need_compat_flag < 0.7: if input.numel() > 0: - return torch.nn.functional.interpolate(input, size, scale_factor, mode, align_corners) + return torch.nn.functional.interpolate( + input, size, scale_factor, mode, align_corners + ) output_shape = _output_size(2, input, size, scale_factor) output_shape = list(input.shape[:-2]) + list(output_shape) return _new_empty_tensor(input, output_shape) else: - return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) + return torchvision.ops.misc.interpolate( + input, size, scale_factor, mode, align_corners + ) class color_sys: @@ -693,7 +721,12 @@ class color_sys: lightness = (50 + np.random.rand() * 10) / 100.0 saturation = (90 + np.random.rand() * 10) / 100.0 colors.append( - tuple([int(j * 255) for j in colorsys.hls_to_rgb(hue, lightness, saturation)]) + tuple( + [ + int(j * 255) + for j in colorsys.hls_to_rgb(hue, lightness, saturation) + ] + ) ) self.colors = colors diff --git a/swarms/agents/models/groundingdino/util/slconfig.py b/swarms/agents/models/groundingdino/util/slconfig.py index 7adf837d..f6598271 100644 --- a/swarms/agents/models/groundingdino/util/slconfig.py +++ b/swarms/agents/models/groundingdino/util/slconfig.py @@ -31,7 +31,9 @@ class ConfigDict(Dict): try: value = super(ConfigDict, self).__getattr__(name) except KeyError: - ex = AttributeError(f"'{self.__class__.__name__}' object has no " f"attribute '{name}'") + ex = AttributeError( + f"'{self.__class__.__name__}' object has no " f"attribute '{name}'" + ) except Exception as e: ex = e else: @@ -79,9 +81,11 @@ class SLConfig(object): check_file_exist(filename) if filename.lower().endswith(".py"): with tempfile.TemporaryDirectory() as temp_config_dir: - temp_config_file = tempfile.NamedTemporaryFile(dir=temp_config_dir, suffix=".py") + temp_config_file = tempfile.NamedTemporaryFile( + dir=temp_config_dir, suffix=".py" + ) temp_config_name = osp.basename(temp_config_file.name) - if os.name == 'nt': + if os.name == "nt": temp_config_file.close() shutil.copyfile(filename, osp.join(temp_config_dir, temp_config_name)) temp_module_name = osp.splitext(temp_config_name)[0] @@ -90,7 +94,9 @@ class SLConfig(object): mod = import_module(temp_module_name) sys.path.pop(0) cfg_dict = { - name: value for name, value in mod.__dict__.items() if not name.startswith("__") + name: value + for name, value in mod.__dict__.items() + if not 
name.startswith("__") } # delete imported module del sys.modules[temp_module_name] @@ -111,7 +117,9 @@ class SLConfig(object): if BASE_KEY in cfg_dict: cfg_dir = osp.dirname(filename) base_filename = cfg_dict.pop(BASE_KEY) - base_filename = base_filename if isinstance(base_filename, list) else [base_filename] + base_filename = ( + base_filename if isinstance(base_filename, list) else [base_filename] + ) cfg_dict_list = list() cfg_text_list = list() @@ -156,7 +164,6 @@ class SLConfig(object): b = b.copy() for k, v in a.items(): if isinstance(v, dict) and k in b and not v.pop(DELETE_KEY, False): - if not isinstance(b[k], dict) and not isinstance(b[k], list): # if : # import ipdb; ipdb.set_trace() @@ -172,7 +179,8 @@ class SLConfig(object): _ = int(k) except BaseException: raise TypeError( - f"b is a list, " f"index {k} should be an int when input but {type(k)}" + f"b is a list, " + f"index {k} should be an int when input but {type(k)}" ) b[int(k)] = SLConfig._merge_a_into_b(v, b[int(k)]) else: @@ -215,7 +223,6 @@ class SLConfig(object): @property def pretty_text(self): - indent = 4 def _indent(s_, num_spaces): diff --git a/swarms/agents/models/groundingdino/util/utils.py b/swarms/agents/models/groundingdino/util/utils.py index 68e81842..7a0815ef 100644 --- a/swarms/agents/models/groundingdino/util/utils.py +++ b/swarms/agents/models/groundingdino/util/utils.py @@ -40,7 +40,9 @@ def renorm( ) -> torch.FloatTensor: # img: tensor(3,H,W) or tensor(B,3,H,W) # return: same as img - assert img.dim() == 3 or img.dim() == 4, "img.dim() should be 3 or 4 but %d" % img.dim() + assert img.dim() == 3 or img.dim() == 4, ( + "img.dim() should be 3 or 4 but %d" % img.dim() + ) if img.dim() == 3: assert img.size(0) == 3, 'img.size(0) shoule be 3 but "%d". (%s)' % ( img.size(0), @@ -147,8 +149,12 @@ class CocoClassMapper: "89": 79, "90": 80, } - self.origin2compact_mapper = {int(k): v - 1 for k, v in self.category_map_str.items()} - self.compact2origin_mapper = {int(v - 1): int(k) for k, v in self.category_map_str.items()} + self.origin2compact_mapper = { + int(k): v - 1 for k, v in self.category_map_str.items() + } + self.compact2origin_mapper = { + int(v - 1): int(k) for k, v in self.category_map_str.items() + } def origin2compact(self, idx): return self.origin2compact_mapper[int(idx)] @@ -271,6 +277,7 @@ def get_embedder(multires, i=0): def embed(x, eo=embedder_obj): return eo.embed(x) + return embed, embedder_obj.out_dim @@ -381,7 +388,9 @@ class NiceRepr: return str(len(self)) else: # In all other cases force the subclass to overload __nice__ - raise NotImplementedError(f"Define the __nice__ method for {self.__class__!r}") + raise NotImplementedError( + f"Define the __nice__ method for {self.__class__!r}" + ) def __repr__(self): """str: the string of the module""" @@ -496,7 +505,9 @@ class ModelEma(torch.nn.Module): ema_v.copy_(update_fn(ema_v, model_v)) def update(self, model): - self._update(model, update_fn=lambda e, m: self.decay * e + (1.0 - self.decay) * m) + self._update( + model, update_fn=lambda e, m: self.decay * e + (1.0 - self.decay) * m + ) def set(self, model): self._update(model, update_fn=lambda e, m: m) @@ -594,16 +605,21 @@ def targets_to(targets: List[Dict[str, Any]], device): "dataset_type", ] return [ - {k: v.to(device) if k not in excluded_keys else v for k, v in t.items()} for t in targets + {k: v.to(device) if k not in excluded_keys else v for k, v in t.items()} + for t in targets ] def get_phrases_from_posmap( - posmap: torch.BoolTensor, tokenized: Dict, tokenizer: AutoTokenizer, 
left_idx: int = 0, right_idx: int = 255 + posmap: torch.BoolTensor, + tokenized: Dict, + tokenizer: AutoTokenizer, + left_idx: int = 0, + right_idx: int = 255, ): assert isinstance(posmap, torch.Tensor), "posmap must be torch.Tensor" if posmap.dim() == 1: - posmap[0: left_idx + 1] = False + posmap[0 : left_idx + 1] = False posmap[right_idx:] = False non_zero_idx = posmap.nonzero(as_tuple=True)[0].tolist() token_ids = [tokenized["input_ids"][i] for i in non_zero_idx] diff --git a/swarms/agents/models/groundingdino/util/visualizer.py b/swarms/agents/models/groundingdino/util/visualizer.py index 17b8861d..ddfcc692 100644 --- a/swarms/agents/models/groundingdino/util/visualizer.py +++ b/swarms/agents/models/groundingdino/util/visualizer.py @@ -23,7 +23,9 @@ def renorm( ) -> torch.FloatTensor: # img: tensor(3,H,W) or tensor(B,3,H,W) # return: same as img - assert img.dim() == 3 or img.dim() == 4, "img.dim() should be 3 or 4 but %d" % img.dim() + assert img.dim() == 3 or img.dim() == 4, ( + "img.dim() should be 3 or 4 but %d" % img.dim() + ) if img.dim() == 3: assert img.size(0) == 3, 'img.size(0) shoule be 3 but "%d". (%s)' % ( img.size(0), @@ -124,7 +126,10 @@ class COCOVisualizer: ) else: savename = "{}/{}-{}-{}.png".format( - savedir, caption, int(image_id), str(datetime.datetime.now()).replace(" ", "-") + savedir, + caption, + int(image_id), + str(datetime.datetime.now()).replace(" ", "-"), ) print("savename: {}".format(savename)) os.makedirs(os.path.dirname(savename), exist_ok=True) @@ -188,7 +193,9 @@ class COCOVisualizer: ) if "box_label" in tgt: - assert len(tgt["box_label"]) == numbox, f"{len(tgt['box_label'])} = {numbox}, " + assert ( + len(tgt["box_label"]) == numbox + ), f"{len(tgt['box_label'])} = {numbox}, " for idx, bl in enumerate(tgt["box_label"]): _string = str(bl) bbox_x, bbox_y, bbox_w, bbox_h = boxes[idx] @@ -214,7 +221,9 @@ class COCOVisualizer: tgt["attn"] = [tgt["attn"]] for item in tgt["attn"]: attn_map, basergb = item - attn_map = (attn_map - attn_map.min()) / (attn_map.max() - attn_map.min() + 1e-3) + attn_map = (attn_map - attn_map.min()) / ( + attn_map.max() - attn_map.min() + 1e-3 + ) attn_map = (attn_map * 255).astype(np.uint8) cm = ColorMap(basergb) heatmap = cm(attn_map) @@ -310,7 +319,9 @@ class COCOVisualizer: # p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) # ax.add_collection(p) - p = PatchCollection(polygons, facecolor="none", edgecolors=color, linewidths=2) + p = PatchCollection( + polygons, facecolor="none", edgecolors=color, linewidths=2 + ) ax.add_collection(p) elif datasetType == "captions": for ann in anns: diff --git a/swarms/agents/models/groundingdino/util/vl_utils.py b/swarms/agents/models/groundingdino/util/vl_utils.py index 1264e6f7..4fd8592c 100644 --- a/swarms/agents/models/groundingdino/util/vl_utils.py +++ b/swarms/agents/models/groundingdino/util/vl_utils.py @@ -16,7 +16,7 @@ def create_positive_map_from_span(tokenized, token_span, max_text_len=256): """ positive_map = torch.zeros((len(token_span), max_text_len), dtype=torch.float) for j, tok_list in enumerate(token_span): - for (beg, end) in tok_list: + for beg, end in tok_list: beg_pos = tokenized.char_to_token(beg) end_pos = tokenized.char_to_token(end - 1) if beg_pos is None: @@ -41,7 +41,7 @@ def create_positive_map_from_span(tokenized, token_span, max_text_len=256): positive_map[j, beg_pos] = 1 break else: - positive_map[j, beg_pos: end_pos + 1].fill_(1) + positive_map[j, beg_pos : end_pos + 1].fill_(1) return positive_map / (positive_map.sum(-1)[:, None] + 
1e-6) diff --git a/swarms/agents/models/segment_anything/scripts/amg.py b/swarms/agents/models/segment_anything/scripts/amg.py index f2dbf676..f278e180 100644 --- a/swarms/agents/models/segment_anything/scripts/amg.py +++ b/swarms/agents/models/segment_anything/scripts/amg.py @@ -52,7 +52,9 @@ parser.add_argument( help="The path to the SAM checkpoint to use for mask generation.", ) -parser.add_argument("--device", type=str, default="cuda", help="The device to run generation on.") +parser.add_argument( + "--device", type=str, default="cuda", help="The device to run generation on." +) parser.add_argument( "--convert-to-rle", @@ -204,7 +206,9 @@ def main(args: argparse.Namespace) -> None: targets = [args.input] else: targets = [ - f for f in os.listdir(args.input) if not os.path.isdir(os.path.join(args.input, f)) + f + for f in os.listdir(args.input) + if not os.path.isdir(os.path.join(args.input, f)) ] targets = [os.path.join(args.input, f) for f in targets] diff --git a/swarms/agents/models/segment_anything/scripts/export_onnx_model.py b/swarms/agents/models/segment_anything/scripts/export_onnx_model.py index 5c6f8389..750cad27 100644 --- a/swarms/agents/models/segment_anything/scripts/export_onnx_model.py +++ b/swarms/agents/models/segment_anything/scripts/export_onnx_model.py @@ -24,7 +24,10 @@ parser = argparse.ArgumentParser( ) parser.add_argument( - "--checkpoint", type=str, required=True, help="The path to the SAM model checkpoint." + "--checkpoint", + type=str, + required=True, + help="The path to the SAM model checkpoint.", ) parser.add_argument( @@ -129,7 +132,9 @@ def run_export( mask_input_size = [4 * x for x in embed_size] dummy_inputs = { "image_embeddings": torch.randn(1, embed_dim, *embed_size, dtype=torch.float), - "point_coords": torch.randint(low=0, high=1024, size=(1, 5, 2), dtype=torch.float), + "point_coords": torch.randint( + low=0, high=1024, size=(1, 5, 2), dtype=torch.float + ), "point_labels": torch.randint(low=0, high=4, size=(1, 5), dtype=torch.float), "mask_input": torch.randn(1, 1, *mask_input_size, dtype=torch.float), "has_mask_input": torch.tensor([1], dtype=torch.float), diff --git a/swarms/agents/models/segment_anything/segment_anything/automatic_mask_generator.py b/swarms/agents/models/segment_anything/segment_anything/automatic_mask_generator.py index d5a8c969..04c7b31f 100644 --- a/swarms/agents/models/segment_anything/segment_anything/automatic_mask_generator.py +++ b/swarms/agents/models/segment_anything/segment_anything/automatic_mask_generator.py @@ -172,7 +172,9 @@ class SamAutomaticMaskGenerator: # Encode masks if self.output_mode == "coco_rle": - mask_data["segmentations"] = [coco_encode_rle(rle) for rle in mask_data["rles"]] + mask_data["segmentations"] = [ + coco_encode_rle(rle) for rle in mask_data["rles"] + ] elif self.output_mode == "binary_mask": mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]] else: @@ -242,7 +244,9 @@ class SamAutomaticMaskGenerator: # Generate masks for this crop in batches data = MaskData() for (points,) in batch_iterator(self.points_per_batch, points_for_image): - batch_data = self._process_batch(points, cropped_im_size, crop_box, orig_size) + batch_data = self._process_batch( + points, cropped_im_size, crop_box, orig_size + ) data.cat(batch_data) del batch_data self.predictor.reset_image() @@ -275,7 +279,9 @@ class SamAutomaticMaskGenerator: # Run model on this batch transformed_points = self.predictor.transform.apply_coords(points, im_size) in_points = torch.as_tensor(transformed_points, 
device=self.predictor.device) - in_labels = torch.ones(in_points.shape[0], dtype=torch.int, device=in_points.device) + in_labels = torch.ones( + in_points.shape[0], dtype=torch.int, device=in_points.device + ) masks, iou_preds, _ = self.predictor.predict_torch( in_points[:, None, :], in_labels[:, None], @@ -298,7 +304,9 @@ class SamAutomaticMaskGenerator: # Calculate stability score data["stability_score"] = calculate_stability_score( - data["masks"], self.predictor.model.mask_threshold, self.stability_score_offset + data["masks"], + self.predictor.model.mask_threshold, + self.stability_score_offset, ) if self.stability_score_thresh > 0.0: keep_mask = data["stability_score"] >= self.stability_score_thresh @@ -309,7 +317,9 @@ class SamAutomaticMaskGenerator: data["boxes"] = batched_mask_to_box(data["masks"]) # Filter boxes that touch crop boundaries - keep_mask = ~is_box_near_crop_edge(data["boxes"], crop_box, [0, 0, orig_w, orig_h]) + keep_mask = ~is_box_near_crop_edge( + data["boxes"], crop_box, [0, 0, orig_w, orig_h] + ) if not torch.all(keep_mask): data.filter(keep_mask) diff --git a/swarms/agents/models/segment_anything/segment_anything/build_sam.py b/swarms/agents/models/segment_anything/segment_anything/build_sam.py index 37cd2451..a28ca1e8 100644 --- a/swarms/agents/models/segment_anything/segment_anything/build_sam.py +++ b/swarms/agents/models/segment_anything/segment_anything/build_sam.py @@ -8,7 +8,13 @@ import torch from functools import partial -from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer +from .modeling import ( + ImageEncoderViT, + MaskDecoder, + PromptEncoder, + Sam, + TwoWayTransformer, +) def build_sam_vit_h(checkpoint=None): diff --git a/swarms/agents/models/segment_anything/segment_anything/modeling/image_encoder.py b/swarms/agents/models/segment_anything/segment_anything/modeling/image_encoder.py index 66351d9d..d7eefe70 100644 --- a/swarms/agents/models/segment_anything/segment_anything/modeling/image_encoder.py +++ b/swarms/agents/models/segment_anything/segment_anything/modeling/image_encoder.py @@ -66,7 +66,9 @@ class ImageEncoderViT(nn.Module): if use_abs_pos: # Initialize absolute positional embedding with pretrain image size. 
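For reference on the pos_embed hunk that follows: with use_abs_pos enabled the encoder keeps one learned embedding per patch-grid cell, so the parameter has shape (1, img_size // patch_size, img_size // patch_size, embed_dim) and is simply added to the patch embeddings before the transformer blocks. A minimal shape sketch, assuming ViT-H-style defaults (the concrete numbers are illustrative, not taken from this patch):

import torch
import torch.nn as nn

img_size, patch_size, embed_dim = 1024, 16, 1280   # assumed ViT-H-style values
grid = img_size // patch_size                      # 64 patches per side

# Learned absolute positional embedding over the patch grid.
pos_embed = nn.Parameter(torch.zeros(1, grid, grid, embed_dim))

# Patch embeddings keep the same (B, H', W', C) layout, so the embedding
# broadcasts over the batch dimension when added.
patches = torch.randn(2, grid, grid, embed_dim)
x = patches + pos_embed
print(x.shape)   # torch.Size([2, 64, 64, 1280])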
self.pos_embed = nn.Parameter( - torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim) + torch.zeros( + 1, img_size // patch_size, img_size // patch_size, embed_dim + ) ) self.blocks = nn.ModuleList() @@ -159,7 +161,9 @@ class Block(nn.Module): ) self.norm2 = norm_layer(dim) - self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer) + self.mlp = MLPBlock( + embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer + ) self.window_size = window_size @@ -224,23 +228,34 @@ class Attention(nn.Module): def forward(self, x: torch.Tensor) -> torch.Tensor: B, H, W, _ = x.shape # qkv with shape (3, B, nHead, H * W, C) - qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + qkv = ( + self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + ) # q, k, v with shape (B * nHead, H * W, C) q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0) attn = (q * self.scale) @ k.transpose(-2, -1) if self.use_rel_pos: - attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W)) + attn = add_decomposed_rel_pos( + attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W) + ) attn = attn.softmax(dim=-1) - x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1) + x = ( + (attn @ v) + .view(B, self.num_heads, H, W, -1) + .permute(0, 2, 3, 1, 4) + .reshape(B, H, W, -1) + ) x = self.proj(x) return x -def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]: +def window_partition( + x: torch.Tensor, window_size: int +) -> Tuple[torch.Tensor, Tuple[int, int]]: """ Partition into non-overlapping windows with padding if needed. Args: @@ -260,12 +275,17 @@ def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, T Hp, Wp = H + pad_h, W + pad_w x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) - windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + windows = ( + x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + ) return windows, (Hp, Wp) def window_unpartition( - windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int] + windows: torch.Tensor, + window_size: int, + pad_hw: Tuple[int, int], + hw: Tuple[int, int], ) -> torch.Tensor: """ Window unpartition into original sequences and removing padding. 
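window_partition and window_unpartition (whose remaining hunk follows) are exact inverses: the feature map is padded on the right and bottom to a multiple of window_size, tiled into (window_size, window_size) windows for local attention, and later stitched back together and cropped. A stripped-down round-trip sketch of the same reshape/permute pattern (simplified helper names, not the vendored functions):

import torch
import torch.nn.functional as F

def partition(x, ws):
    B, H, W, C = x.shape
    pad_h, pad_w = (-H) % ws, (-W) % ws
    x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))       # pad W, then H; channels untouched
    Hp, Wp = H + pad_h, W + pad_w
    x = x.view(B, Hp // ws, ws, Wp // ws, ws, C)
    return x.permute(0, 1, 3, 2, 4, 5).reshape(-1, ws, ws, C), (Hp, Wp)

def unpartition(win, ws, pad_hw, hw):
    (Hp, Wp), (H, W) = pad_hw, hw
    B = win.shape[0] // ((Hp // ws) * (Wp // ws))
    x = win.view(B, Hp // ws, Wp // ws, ws, ws, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).reshape(B, Hp, Wp, -1)
    return x[:, :H, :W, :]                         # drop the padding again

x = torch.randn(2, 10, 13, 8)
win, pad_hw = partition(x, ws=7)
assert torch.equal(unpartition(win, 7, pad_hw, (10, 13)), x)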
@@ -281,7 +301,9 @@ def window_unpartition( Hp, Wp = pad_hw H, W = hw B = windows.shape[0] // (Hp * Wp // window_size // window_size) - x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1) + x = windows.view( + B, Hp // window_size, Wp // window_size, window_size, window_size, -1 + ) x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) if Hp > H or Wp > W: @@ -355,7 +377,9 @@ def add_decomposed_rel_pos( rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) attn = ( - attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :] + attn.view(B, q_h, q_w, k_h, k_w) + + rel_h[:, :, :, :, None] + + rel_w[:, :, :, None, :] ).view(B, q_h * q_w, k_h * k_w) return attn diff --git a/swarms/agents/models/segment_anything/segment_anything/modeling/mask_decoder.py b/swarms/agents/models/segment_anything/segment_anything/modeling/mask_decoder.py index c847c602..f94bee1f 100644 --- a/swarms/agents/models/segment_anything/segment_anything/modeling/mask_decoder.py +++ b/swarms/agents/models/segment_anything/segment_anything/modeling/mask_decoder.py @@ -51,10 +51,14 @@ class MaskDecoder(nn.Module): self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim) self.output_upscaling = nn.Sequential( - nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2), + nn.ConvTranspose2d( + transformer_dim, transformer_dim // 4, kernel_size=2, stride=2 + ), LayerNorm2d(transformer_dim // 4), activation(), - nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2), + nn.ConvTranspose2d( + transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2 + ), activation(), ) self.output_hypernetworks_mlps = nn.ModuleList( @@ -118,8 +122,12 @@ class MaskDecoder(nn.Module): ) -> Tuple[torch.Tensor, torch.Tensor]: """Predicts masks. 
See 'forward' for more details.""" # Concatenate output tokens - output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0) - output_tokens = output_tokens.unsqueeze(0).expand(sparse_prompt_embeddings.size(0), -1, -1) + output_tokens = torch.cat( + [self.iou_token.weight, self.mask_tokens.weight], dim=0 + ) + output_tokens = output_tokens.unsqueeze(0).expand( + sparse_prompt_embeddings.size(0), -1, -1 + ) tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1) # Expand per-image data in batch direction to be per-mask @@ -131,14 +139,16 @@ class MaskDecoder(nn.Module): # Run the transformer hs, src = self.transformer(src, pos_src, tokens) iou_token_out = hs[:, 0, :] - mask_tokens_out = hs[:, 1: (1 + self.num_mask_tokens), :] + mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :] # Upscale mask embeddings and predict masks using the mask tokens src = src.transpose(1, 2).view(b, c, h, w) upscaled_embedding = self.output_upscaling(src) hyper_in_list: List[torch.Tensor] = [] for i in range(self.num_mask_tokens): - hyper_in_list.append(self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :])) + hyper_in_list.append( + self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) + ) hyper_in = torch.stack(hyper_in_list, dim=1) b, c, h, w = upscaled_embedding.shape masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w) diff --git a/swarms/agents/models/segment_anything/segment_anything/modeling/prompt_encoder.py b/swarms/agents/models/segment_anything/segment_anything/modeling/prompt_encoder.py index c3143f4f..ba3a5dc3 100644 --- a/swarms/agents/models/segment_anything/segment_anything/modeling/prompt_encoder.py +++ b/swarms/agents/models/segment_anything/segment_anything/modeling/prompt_encoder.py @@ -43,11 +43,16 @@ class PromptEncoder(nn.Module): self.pe_layer = PositionEmbeddingRandom(embed_dim // 2) self.num_point_embeddings: int = 4 # pos/neg point + 2 box corners - point_embeddings = [nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings)] + point_embeddings = [ + nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings) + ] self.point_embeddings = nn.ModuleList(point_embeddings) self.not_a_point_embed = nn.Embedding(1, embed_dim) - self.mask_input_size = (4 * image_embedding_size[0], 4 * image_embedding_size[1]) + self.mask_input_size = ( + 4 * image_embedding_size[0], + 4 * image_embedding_size[1], + ) self.mask_downscaling = nn.Sequential( nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2), LayerNorm2d(mask_in_chans // 4), @@ -83,7 +88,9 @@ class PromptEncoder(nn.Module): padding_label = -torch.ones((labels.shape[0], 1), device=labels.device) points = torch.cat([points, padding_point], dim=1) labels = torch.cat([labels, padding_label], dim=1) - point_embedding = self.pe_layer.forward_with_coords(points, self.input_image_size) + point_embedding = self.pe_layer.forward_with_coords( + points, self.input_image_size + ) point_embedding[labels == -1] = 0.0 point_embedding[labels == -1] += self.not_a_point_embed.weight point_embedding[labels == 0] += self.point_embeddings[0].weight @@ -94,7 +101,9 @@ class PromptEncoder(nn.Module): """Embeds box prompts.""" boxes = boxes + 0.5 # Shift to center of pixel coords = boxes.reshape(-1, 2, 2) - corner_embedding = self.pe_layer.forward_with_coords(coords, self.input_image_size) + corner_embedding = self.pe_layer.forward_with_coords( + coords, self.input_image_size + ) corner_embedding[:, 0, :] += self.point_embeddings[2].weight corner_embedding[:, 1, :] += 
self.point_embeddings[3].weight return corner_embedding @@ -149,7 +158,9 @@ class PromptEncoder(nn.Module): Bx(embed_dim)x(embed_H)x(embed_W) """ bs = self._get_batch_size(points, boxes, masks) - sparse_embeddings = torch.empty((bs, 0, self.embed_dim), device=self._get_device()) + sparse_embeddings = torch.empty( + (bs, 0, self.embed_dim), device=self._get_device() + ) if points is not None: coords, labels = points point_embeddings = self._embed_points(coords, labels, pad=(boxes is None)) diff --git a/swarms/agents/models/segment_anything/segment_anything/modeling/sam.py b/swarms/agents/models/segment_anything/segment_anything/modeling/sam.py index 8074cff6..7e28671d 100644 --- a/swarms/agents/models/segment_anything/segment_anything/modeling/sam.py +++ b/swarms/agents/models/segment_anything/segment_anything/modeling/sam.py @@ -43,7 +43,9 @@ class Sam(nn.Module): self.image_encoder = image_encoder self.prompt_encoder = prompt_encoder self.mask_decoder = mask_decoder - self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False) + self.register_buffer( + "pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False + ) self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False) @property @@ -94,7 +96,9 @@ class Sam(nn.Module): shape BxCxHxW, where H=W=256. Can be passed as mask input to subsequent iterations of prediction. """ - input_images = torch.stack([self.preprocess(x["image"]) for x in batched_input], dim=0) + input_images = torch.stack( + [self.preprocess(x["image"]) for x in batched_input], dim=0 + ) image_embeddings = self.image_encoder(input_images) outputs = [] @@ -158,7 +162,9 @@ class Sam(nn.Module): align_corners=False, ) masks = masks[..., : input_size[0], : input_size[1]] - masks = F.interpolate(masks, original_size, mode="bilinear", align_corners=False) + masks = F.interpolate( + masks, original_size, mode="bilinear", align_corners=False + ) return masks def preprocess(self, x: torch.Tensor) -> torch.Tensor: diff --git a/swarms/agents/models/segment_anything/segment_anything/modeling/transformer.py b/swarms/agents/models/segment_anything/segment_anything/modeling/transformer.py index 28fafea5..499bbab9 100644 --- a/swarms/agents/models/segment_anything/segment_anything/modeling/transformer.py +++ b/swarms/agents/models/segment_anything/segment_anything/modeling/transformer.py @@ -198,7 +198,9 @@ class Attention(nn.Module): self.embedding_dim = embedding_dim self.internal_dim = embedding_dim // downsample_rate self.num_heads = num_heads - assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim." + assert ( + self.internal_dim % num_heads == 0 + ), "num_heads must divide embedding_dim." 
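The assert reformatted above protects the head split in this attention layer: the q/k/v projections first map embedding_dim down to internal_dim = embedding_dim // downsample_rate, and that reduced width is then divided across num_heads, so it has to be an exact multiple. A small sketch of the shape bookkeeping (the dimensions are illustrative):

import torch

embedding_dim, downsample_rate, num_heads = 256, 2, 8
internal_dim = embedding_dim // downsample_rate           # 128
assert internal_dim % num_heads == 0, "num_heads must divide embedding_dim."

tokens = torch.randn(4, 10, internal_dim)                 # (batch, tokens, channels) after a q/k/v projection
b, n, c = tokens.shape
heads = tokens.reshape(b, n, num_heads, c // num_heads).transpose(1, 2)
print(heads.shape)                                        # torch.Size([4, 8, 10, 16]): 16 channels per head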
self.q_proj = nn.Linear(embedding_dim, self.internal_dim) self.k_proj = nn.Linear(embedding_dim, self.internal_dim) diff --git a/swarms/agents/models/segment_anything/segment_anything/predictor.py b/swarms/agents/models/segment_anything/segment_anything/predictor.py index 8a6e6d81..cce6f2ea 100644 --- a/swarms/agents/models/segment_anything/segment_anything/predictor.py +++ b/swarms/agents/models/segment_anything/segment_anything/predictor.py @@ -55,7 +55,9 @@ class SamPredictor: # Transform the image to the form expected by the model input_image = self.transform.apply_image(image) input_image_torch = torch.as_tensor(input_image, device=self.device) - input_image_torch = input_image_torch.permute(2, 0, 1).contiguous()[None, :, :, :] + input_image_torch = input_image_torch.permute(2, 0, 1).contiguous()[ + None, :, :, : + ] self.set_torch_image(input_image_torch, image.shape[:2]) @@ -131,7 +133,9 @@ class SamPredictor: a subsequent iteration as mask input. """ if not self.is_image_set: - raise RuntimeError("An image must be set with .set_image(...) before mask prediction.") + raise RuntimeError( + "An image must be set with .set_image(...) before mask prediction." + ) # Transform input prompts coords_torch, labels_torch, box_torch, mask_input_torch = None, None, None, None @@ -140,15 +144,21 @@ class SamPredictor: point_labels is not None ), "point_labels must be supplied if point_coords is supplied." point_coords = self.transform.apply_coords(point_coords, self.original_size) - coords_torch = torch.as_tensor(point_coords, dtype=torch.float, device=self.device) - labels_torch = torch.as_tensor(point_labels, dtype=torch.int, device=self.device) + coords_torch = torch.as_tensor( + point_coords, dtype=torch.float, device=self.device + ) + labels_torch = torch.as_tensor( + point_labels, dtype=torch.int, device=self.device + ) coords_torch, labels_torch = coords_torch[None, :, :], labels_torch[None, :] if box is not None: box = self.transform.apply_boxes(box, self.original_size) box_torch = torch.as_tensor(box, dtype=torch.float, device=self.device) box_torch = box_torch[None, :] if mask_input is not None: - mask_input_torch = torch.as_tensor(mask_input, dtype=torch.float, device=self.device) + mask_input_torch = torch.as_tensor( + mask_input, dtype=torch.float, device=self.device + ) mask_input_torch = mask_input_torch[None, :, :, :] masks, iou_predictions, low_res_masks = self.predict_torch( @@ -211,7 +221,9 @@ class SamPredictor: a subsequent iteration as mask input. """ if not self.is_image_set: - raise RuntimeError("An image must be set with .set_image(...) before mask prediction.") + raise RuntimeError( + "An image must be set with .set_image(...) before mask prediction." + ) if point_coords is not None: points = (point_coords, point_labels) @@ -235,7 +247,9 @@ class SamPredictor: ) # Upscale the masks to the original image resolution - masks = self.model.postprocess_masks(low_res_masks, self.input_size, self.original_size) + masks = self.model.postprocess_masks( + low_res_masks, self.input_size, self.original_size + ) if not return_logits: masks = masks > self.model.mask_threshold @@ -252,7 +266,9 @@ class SamPredictor: raise RuntimeError( "An image must be set with .set_image(...) to generate an embedding." ) - assert self.features is not None, "Features must exist if an image has been set." + assert ( + self.features is not None + ), "Features must exist if an image has been set." 
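Taken together, the SamPredictor hunks above cover the whole prompting flow: set_image runs the image encoder once, predict converts numpy point/box/mask prompts to tensors and delegates to predict_torch, and postprocess_masks upsamples the low-resolution logits back to the original image size. A short usage sketch, assuming the vendored package exposes the same entry points as upstream segment_anything (sam_model_registry, SamPredictor); the checkpoint path is a placeholder:

import numpy as np
from segment_anything import SamPredictor, sam_model_registry  # assumed upstream-style imports

sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b.pth")   # placeholder checkpoint path
predictor = SamPredictor(sam)

image = np.zeros((480, 640, 3), dtype=np.uint8)                 # HxWxC uint8, as set_image expects
predictor.set_image(image)

# One positive click; predict() transforms the prompt and calls predict_torch() internally.
masks, scores, low_res_logits = predictor.predict(
    point_coords=np.array([[320.0, 240.0]]),
    point_labels=np.array([1]),
    multimask_output=True,
)
print(masks.shape, scores.shape)                                # (3, 480, 640) (3,)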
return self.features @property diff --git a/swarms/agents/models/segment_anything/segment_anything/utils/amg.py b/swarms/agents/models/segment_anything/segment_anything/utils/amg.py index cb67232a..be064071 100644 --- a/swarms/agents/models/segment_anything/segment_anything/utils/amg.py +++ b/swarms/agents/models/segment_anything/segment_anything/utils/amg.py @@ -101,7 +101,7 @@ def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]: ), "Batched iteration must have inputs of all the same size." n_batches = len(args[0]) // batch_size + int(len(args[0]) % batch_size != 0) for b in range(n_batches): - yield [arg[b * batch_size: (b + 1) * batch_size] for arg in args] + yield [arg[b * batch_size : (b + 1) * batch_size] for arg in args] def mask_to_rle_pytorch(tensor: torch.Tensor) -> List[Dict[str, Any]]: @@ -142,7 +142,7 @@ def rle_to_mask(rle: Dict[str, Any]) -> np.ndarray: idx = 0 parity = False for count in rle["counts"]: - mask[idx: idx + count] = parity + mask[idx : idx + count] = parity idx += count parity ^= True mask = mask.reshape(w, h) diff --git a/swarms/agents/models/segment_anything/segment_anything/utils/onnx.py b/swarms/agents/models/segment_anything/segment_anything/utils/onnx.py index 3196bdf4..8bc1d8b3 100644 --- a/swarms/agents/models/segment_anything/segment_anything/utils/onnx.py +++ b/swarms/agents/models/segment_anything/segment_anything/utils/onnx.py @@ -48,32 +48,43 @@ class SamOnnxModel(nn.Module): transformed_size = torch.floor(transformed_size + 0.5).to(torch.int64) return transformed_size - def _embed_points(self, point_coords: torch.Tensor, point_labels: torch.Tensor) -> torch.Tensor: + def _embed_points( + self, point_coords: torch.Tensor, point_labels: torch.Tensor + ) -> torch.Tensor: point_coords = point_coords + 0.5 point_coords = point_coords / self.img_size point_embedding = self.model.prompt_encoder.pe_layer._pe_encoding(point_coords) point_labels = point_labels.unsqueeze(-1).expand_as(point_embedding) point_embedding = point_embedding * (point_labels != -1) - point_embedding = point_embedding + self.model.prompt_encoder.not_a_point_embed.weight * ( - point_labels == -1 + point_embedding = ( + point_embedding + + self.model.prompt_encoder.not_a_point_embed.weight * (point_labels == -1) ) for i in range(self.model.prompt_encoder.num_point_embeddings): - point_embedding = point_embedding + self.model.prompt_encoder.point_embeddings[ - i - ].weight * (point_labels == i) + point_embedding = ( + point_embedding + + self.model.prompt_encoder.point_embeddings[i].weight + * (point_labels == i) + ) return point_embedding - def _embed_masks(self, input_mask: torch.Tensor, has_mask_input: torch.Tensor) -> torch.Tensor: - mask_embedding = has_mask_input * self.model.prompt_encoder.mask_downscaling(input_mask) + def _embed_masks( + self, input_mask: torch.Tensor, has_mask_input: torch.Tensor + ) -> torch.Tensor: + mask_embedding = has_mask_input * self.model.prompt_encoder.mask_downscaling( + input_mask + ) mask_embedding = mask_embedding + ( 1 - has_mask_input ) * self.model.prompt_encoder.no_mask_embed.weight.reshape(1, -1, 1, 1) return mask_embedding - def mask_postprocessing(self, masks: torch.Tensor, orig_im_size: torch.Tensor) -> torch.Tensor: + def mask_postprocessing( + self, masks: torch.Tensor, orig_im_size: torch.Tensor + ) -> torch.Tensor: masks = F.interpolate( masks, size=(self.img_size, self.img_size), @@ -81,7 +92,9 @@ class SamOnnxModel(nn.Module): align_corners=False, ) - prepadded_size = 
self.resize_longest_image_size(orig_im_size, self.img_size).to(torch.int64) + prepadded_size = self.resize_longest_image_size(orig_im_size, self.img_size).to( + torch.int64 + ) masks = masks[..., : prepadded_size[0], : prepadded_size[1]] # type: ignore orig_im_size = orig_im_size.to(torch.int64) diff --git a/swarms/agents/models/segment_anything/segment_anything/utils/transforms.py b/swarms/agents/models/segment_anything/segment_anything/utils/transforms.py index c08ba1e3..ed816615 100644 --- a/swarms/agents/models/segment_anything/segment_anything/utils/transforms.py +++ b/swarms/agents/models/segment_anything/segment_anything/utils/transforms.py @@ -27,10 +27,14 @@ class ResizeLongestSide: """ Expects a numpy array with shape HxWxC in uint8 format. """ - target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) + target_size = self.get_preprocess_shape( + image.shape[0], image.shape[1], self.target_length + ) return np.array(resize(to_pil_image(image), target_size)) - def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: + def apply_coords( + self, coords: np.ndarray, original_size: Tuple[int, ...] + ) -> np.ndarray: """ Expects a numpy array of length 2 in the final dimension. Requires the original image size in (H, W) format. @@ -44,7 +48,9 @@ class ResizeLongestSide: coords[..., 1] = coords[..., 1] * (new_h / old_h) return coords - def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: + def apply_boxes( + self, boxes: np.ndarray, original_size: Tuple[int, ...] + ) -> np.ndarray: """ Expects a numpy array shape Bx4. Requires the original image size in (H, W) format. @@ -59,7 +65,9 @@ class ResizeLongestSide: the transformation expected by the model. """ # Expects an image in BCHW format. May not exactly match apply_image. - target_size = self.get_preprocess_shape(image.shape[2], image.shape[3], self.target_length) + target_size = self.get_preprocess_shape( + image.shape[2], image.shape[3], self.target_length + ) return F.interpolate( image, target_size, mode="bilinear", align_corners=False, antialias=True ) @@ -91,7 +99,9 @@ class ResizeLongestSide: return boxes.reshape(-1, 4) @staticmethod - def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: + def get_preprocess_shape( + oldh: int, oldw: int, long_side_length: int + ) -> Tuple[int, int]: """ Compute the output size given input size and target long side length. 
""" diff --git a/swarms/agents/multi_modal_visual_agent.py b/swarms/agents/multi_modal_visual_agent.py index d37cd1cb..7ec8b03d 100644 --- a/swarms/agents/multi_modal_visual_agent.py +++ b/swarms/agents/multi_modal_visual_agent.py @@ -9,11 +9,24 @@ from PIL import Image, ImageDraw, ImageOps, ImageFont import math import numpy as np import inspect -from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering - -from diffusers import StableDiffusionPipeline, StableDiffusionInpaintPipeline, StableDiffusionInstructPix2PixPipeline +from transformers import ( + pipeline, + BlipProcessor, + BlipForConditionalGeneration, + BlipForQuestionAnswering, +) + +from diffusers import ( + StableDiffusionPipeline, + StableDiffusionInpaintPipeline, + StableDiffusionInstructPix2PixPipeline, +) from diffusers import EulerAncestralDiscreteScheduler -from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler +from diffusers import ( + StableDiffusionControlNetPipeline, + ControlNetModel, + UniPCMultistepScheduler, +) from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector @@ -131,7 +144,7 @@ VISUAL_AGENT_SUFFIX_CN = """你对文件名的正确性非常严格,而且永 Thought: Do I need to use a tool? {agent_scratchpad} """ -os.makedirs('image', exist_ok=True) +os.makedirs("image", exist_ok=True) def seed_everything(seed): @@ -200,10 +213,12 @@ def blend_gt2pt(old_image, new_image, sigma=0.15, steps=100): kernel[steps:-steps, :steps] = left kernel[steps:-steps, -steps:] = right - pt_gt_img = easy_img[pos_h:pos_h + old_size[1], pos_w:pos_w + old_size[0]] - gaussian_gt_img = kernel * gt_img_array + (1 - kernel) * pt_gt_img # gt img with blur img + pt_gt_img = easy_img[pos_h : pos_h + old_size[1], pos_w : pos_w + old_size[0]] + gaussian_gt_img = ( + kernel * gt_img_array + (1 - kernel) * pt_gt_img + ) # gt img with blur img gaussian_gt_img = gaussian_gt_img.astype(np.int64) - easy_img[pos_h:pos_h + old_size[1], pos_w:pos_w + old_size[0]] = gaussian_gt_img + easy_img[pos_h : pos_h + old_size[1], pos_w : pos_w + old_size[0]] = gaussian_gt_img gaussian_img = Image.fromarray(easy_img) return gaussian_img @@ -216,19 +231,19 @@ def cut_dialogue_history(history_memory, keep_last_n_words=500): print(f"history_memory:{history_memory}, n_tokens: {n_tokens}") if n_tokens < keep_last_n_words: return history_memory - paragraphs = history_memory.split('\n') + paragraphs = history_memory.split("\n") last_n_tokens = n_tokens while last_n_tokens >= keep_last_n_words: - last_n_tokens -= len(paragraphs[0].split(' ')) + last_n_tokens -= len(paragraphs[0].split(" ")) paragraphs = paragraphs[1:] - return '\n' + '\n'.join(paragraphs) + return "\n" + "\n".join(paragraphs) def get_new_image_name(org_img_name, func_name="update"): head_tail = os.path.split(org_img_name) head = head_tail[0] tail = head_tail[1] - name_split = tail.split('.')[0].split('_') + name_split = tail.split(".")[0].split("_") this_new_uuid = str(uuid.uuid4())[:4] if len(name_split) == 1: most_org_file_name = name_split[0] @@ -236,7 +251,9 @@ def get_new_image_name(org_img_name, func_name="update"): assert len(name_split) == 4 most_org_file_name = name_split[3] recent_prev_file_name = name_split[0] - new_file_name = f'{this_new_uuid}_{func_name}_{recent_prev_file_name}_{most_org_file_name}.png' + new_file_name = ( + f"{this_new_uuid}_{func_name}_{recent_prev_file_name}_{most_org_file_name}.png" + ) return os.path.join(head, 
new_file_name) @@ -244,28 +261,40 @@ class InstructPix2Pix: def __init__(self, device): print(f"Initializing InstructPix2Pix to {device}") self.device = device - self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 - - self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix", - safety_checker=StableDiffusionSafetyChecker.from_pretrained('CompVis/stable-diffusion-safety-checker'), - torch_dtype=self.torch_dtype).to(device) - self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config) - - @prompts(name="Instruct Image Using Text", - description="useful when you want to the style of the image to be like the text. " - "like: make it look like a painting. or make it like a robot. " - "The input to this tool should be a comma separated string of two, " - "representing the image_path and the text. ") + self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 + + self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( + "timbrooks/instruct-pix2pix", + safety_checker=StableDiffusionSafetyChecker.from_pretrained( + "CompVis/stable-diffusion-safety-checker" + ), + torch_dtype=self.torch_dtype, + ).to(device) + self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config( + self.pipe.scheduler.config + ) + + @prompts( + name="Instruct Image Using Text", + description="useful when you want to the style of the image to be like the text. " + "like: make it look like a painting. or make it like a robot. " + "The input to this tool should be a comma separated string of two, " + "representing the image_path and the text. ", + ) def inference(self, inputs): """Change style of image.""" print("===>Starting InstructPix2Pix Inference") - image_path, text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image_path, text = inputs.split(",")[0], ",".join(inputs.split(",")[1:]) original_image = Image.open(image_path) - image = self.pipe(text, image=original_image, num_inference_steps=40, image_guidance_scale=1.2).images[0] + image = self.pipe( + text, image=original_image, num_inference_steps=40, image_guidance_scale=1.2 + ).images[0] updated_image_path = get_new_image_name(image_path, func_name="pix2pix") image.save(updated_image_path) - print(f"\nProcessed InstructPix2Pix, Input Image: {image_path}, Instruct Text: {text}, " - f"Output Image: {updated_image_path}") + print( + f"\nProcessed InstructPix2Pix, Input Image: {image_path}, Instruct Text: {text}, " + f"Output Image: {updated_image_path}" + ) return updated_image_path @@ -273,25 +302,31 @@ class Text2Image: def __init__(self, device): print(f"Initializing Text2Image to {device}") self.device = device - self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 - self.pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", - torch_dtype=self.torch_dtype) + self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 + self.pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", torch_dtype=self.torch_dtype + ) self.pipe.to(device) - self.a_prompt = 'best quality, extremely detailed' - self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ - 'fewer digits, cropped, worst quality, low quality' - - @prompts(name="Generate Image From User Input Text", - description="useful when you want to generate an image from a user input text and save it to a file. 
" - "like: generate an image of an object or something, or generate an image that includes some objects. " - "The input to this tool should be a string, representing the text used to generate image. ") + self.a_prompt = "best quality, extremely detailed" + self.n_prompt = ( + "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, " + "fewer digits, cropped, worst quality, low quality" + ) + + @prompts( + name="Generate Image From User Input Text", + description="useful when you want to generate an image from a user input text and save it to a file. " + "like: generate an image of an object or something, or generate an image that includes some objects. " + "The input to this tool should be a string, representing the text used to generate image. ", + ) def inference(self, text): - image_filename = os.path.join('image', f"{str(uuid.uuid4())[:8]}.png") - prompt = text + ', ' + self.a_prompt + image_filename = os.path.join("image", f"{str(uuid.uuid4())[:8]}.png") + prompt = text + ", " + self.a_prompt image = self.pipe(prompt, negative_prompt=self.n_prompt).images[0] image.save(image_filename) print( - f"\nProcessed Text2Image, Input Text: {text}, Output Image: {image_filename}") + f"\nProcessed Text2Image, Input Text: {text}, Output Image: {image_filename}" + ) return image_filename @@ -299,19 +334,28 @@ class ImageCaptioning: def __init__(self, device): print(f"Initializing ImageCaptioning to {device}") self.device = device - self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 - self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 + self.processor = BlipProcessor.from_pretrained( + "Salesforce/blip-image-captioning-base" + ) self.model = BlipForConditionalGeneration.from_pretrained( - "Salesforce/blip-image-captioning-base", torch_dtype=self.torch_dtype).to(self.device) - - @prompts(name="Get Photo Description", - description="useful when you want to know what is inside the photo. receives image_path as input. " - "The input to this tool should be a string, representing the image_path. ") + "Salesforce/blip-image-captioning-base", torch_dtype=self.torch_dtype + ).to(self.device) + + @prompts( + name="Get Photo Description", + description="useful when you want to know what is inside the photo. receives image_path as input. " + "The input to this tool should be a string, representing the image_path. ", + ) def inference(self, image_path): - inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device, self.torch_dtype) + inputs = self.processor(Image.open(image_path), return_tensors="pt").to( + self.device, self.torch_dtype + ) out = self.model.generate(**inputs) captions = self.processor.decode(out[0], skip_special_tokens=True) - print(f"\nProcessed ImageCaptioning, Input Image: {image_path}, Output Text: {captions}") + print( + f"\nProcessed ImageCaptioning, Input Image: {image_path}, Output Text: {captions}" + ) return captions @@ -321,11 +365,13 @@ class Image2Canny: self.low_threshold = 100 self.high_threshold = 200 - @prompts(name="Edge Detection On Image", - description="useful when you want to detect the edge of the image. " - "like: detect the edges of this image, or canny detection on image, " - "or perform edge detection on this image, or detect the canny image of this image. 
" - "The input to this tool should be a string, representing the image_path") + @prompts( + name="Edge Detection On Image", + description="useful when you want to detect the edge of the image. " + "like: detect the edges of this image, or canny detection on image, " + "or perform edge detection on this image, or detect the canny image of this image. " + "The input to this tool should be a string, representing the image_path", + ) def inference(self, inputs): image = Image.open(inputs) image = np.array(image) @@ -335,387 +381,568 @@ class Image2Canny: canny = Image.fromarray(canny) updated_image_path = get_new_image_name(inputs, func_name="edge") canny.save(updated_image_path) - print(f"\nProcessed Image2Canny, Input Image: {inputs}, Output Text: {updated_image_path}") + print( + f"\nProcessed Image2Canny, Input Image: {inputs}, Output Text: {updated_image_path}" + ) return updated_image_path class CannyText2Image: def __init__(self, device): print(f"Initializing CannyText2Image to {device}") - self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 - self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-canny", - torch_dtype=self.torch_dtype) + self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained( + "fusing/stable-diffusion-v1-5-controlnet-canny", + torch_dtype=self.torch_dtype, + ) self.pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=StableDiffusionSafetyChecker.from_pretrained('CompVis/stable-diffusion-safety-checker'), - torch_dtype=self.torch_dtype) - self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + "runwayml/stable-diffusion-v1-5", + controlnet=self.controlnet, + safety_checker=StableDiffusionSafetyChecker.from_pretrained( + "CompVis/stable-diffusion-safety-checker" + ), + torch_dtype=self.torch_dtype, + ) + self.pipe.scheduler = UniPCMultistepScheduler.from_config( + self.pipe.scheduler.config + ) self.pipe.to(device) self.seed = -1 - self.a_prompt = 'best quality, extremely detailed' - self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ - 'fewer digits, cropped, worst quality, low quality' - - @prompts(name="Generate Image Condition On Canny Image", - description="useful when you want to generate a new real image from both the user description and a canny image." - " like: generate a real image of a object or something from this canny image," - " or generate a new real image of a object or something from this edge image. " - "The input to this tool should be a comma separated string of two, " - "representing the image_path and the user description. ") + self.a_prompt = "best quality, extremely detailed" + self.n_prompt = ( + "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, " + "fewer digits, cropped, worst quality, low quality" + ) + + @prompts( + name="Generate Image Condition On Canny Image", + description="useful when you want to generate a new real image from both the user description and a canny image." + " like: generate a real image of a object or something from this canny image," + " or generate a new real image of a object or something from this edge image. " + "The input to this tool should be a comma separated string of two, " + "representing the image_path and the user description. 
", + ) def inference(self, inputs): - image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image_path, instruct_text = inputs.split(",")[0], ",".join( + inputs.split(",")[1:] + ) image = Image.open(image_path) self.seed = random.randint(0, 65535) seed_everything(self.seed) - prompt = f'{instruct_text}, {self.a_prompt}' - image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, - guidance_scale=9.0).images[0] + prompt = f"{instruct_text}, {self.a_prompt}" + image = self.pipe( + prompt, + image, + num_inference_steps=20, + eta=0.0, + negative_prompt=self.n_prompt, + guidance_scale=9.0, + ).images[0] updated_image_path = get_new_image_name(image_path, func_name="canny2image") image.save(updated_image_path) - print(f"\nProcessed CannyText2Image, Input Canny: {image_path}, Input Text: {instruct_text}, " - f"Output Text: {updated_image_path}") + print( + f"\nProcessed CannyText2Image, Input Canny: {image_path}, Input Text: {instruct_text}, " + f"Output Text: {updated_image_path}" + ) return updated_image_path class Image2Line: def __init__(self, device): print("Initializing Image2Line") - self.detector = MLSDdetector.from_pretrained('lllyasviel/ControlNet') - - @prompts(name="Line Detection On Image", - description="useful when you want to detect the straight line of the image. " - "like: detect the straight lines of this image, or straight line detection on image, " - "or perform straight line detection on this image, or detect the straight line image of this image. " - "The input to this tool should be a string, representing the image_path") + self.detector = MLSDdetector.from_pretrained("lllyasviel/ControlNet") + + @prompts( + name="Line Detection On Image", + description="useful when you want to detect the straight line of the image. " + "like: detect the straight lines of this image, or straight line detection on image, " + "or perform straight line detection on this image, or detect the straight line image of this image. 
" + "The input to this tool should be a string, representing the image_path", + ) def inference(self, inputs): image = Image.open(inputs) mlsd = self.detector(image) updated_image_path = get_new_image_name(inputs, func_name="line-of") mlsd.save(updated_image_path) - print(f"\nProcessed Image2Line, Input Image: {inputs}, Output Line: {updated_image_path}") + print( + f"\nProcessed Image2Line, Input Image: {inputs}, Output Line: {updated_image_path}" + ) return updated_image_path class LineText2Image: def __init__(self, device): print(f"Initializing LineText2Image to {device}") - self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 - self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-mlsd", - torch_dtype=self.torch_dtype) + self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained( + "fusing/stable-diffusion-v1-5-controlnet-mlsd", torch_dtype=self.torch_dtype + ) self.pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=StableDiffusionSafetyChecker.from_pretrained('CompVis/stable-diffusion-safety-checker'), - torch_dtype=self.torch_dtype + "runwayml/stable-diffusion-v1-5", + controlnet=self.controlnet, + safety_checker=StableDiffusionSafetyChecker.from_pretrained( + "CompVis/stable-diffusion-safety-checker" + ), + torch_dtype=self.torch_dtype, + ) + self.pipe.scheduler = UniPCMultistepScheduler.from_config( + self.pipe.scheduler.config ) - self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) self.pipe.to(device) self.seed = -1 - self.a_prompt = 'best quality, extremely detailed' - self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ - 'fewer digits, cropped, worst quality, low quality' - - @prompts(name="Generate Image Condition On Line Image", - description="useful when you want to generate a new real image from both the user description " - "and a straight line image. " - "like: generate a real image of a object or something from this straight line image, " - "or generate a new real image of a object or something from this straight lines. " - "The input to this tool should be a comma separated string of two, " - "representing the image_path and the user description. ") + self.a_prompt = "best quality, extremely detailed" + self.n_prompt = ( + "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, " + "fewer digits, cropped, worst quality, low quality" + ) + + @prompts( + name="Generate Image Condition On Line Image", + description="useful when you want to generate a new real image from both the user description " + "and a straight line image. " + "like: generate a real image of a object or something from this straight line image, " + "or generate a new real image of a object or something from this straight lines. " + "The input to this tool should be a comma separated string of two, " + "representing the image_path and the user description. 
", + ) def inference(self, inputs): - image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image_path, instruct_text = inputs.split(",")[0], ",".join( + inputs.split(",")[1:] + ) image = Image.open(image_path) self.seed = random.randint(0, 65535) seed_everything(self.seed) - prompt = f'{instruct_text}, {self.a_prompt}' - image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, - guidance_scale=9.0).images[0] + prompt = f"{instruct_text}, {self.a_prompt}" + image = self.pipe( + prompt, + image, + num_inference_steps=20, + eta=0.0, + negative_prompt=self.n_prompt, + guidance_scale=9.0, + ).images[0] updated_image_path = get_new_image_name(image_path, func_name="line2image") image.save(updated_image_path) - print(f"\nProcessed LineText2Image, Input Line: {image_path}, Input Text: {instruct_text}, " - f"Output Text: {updated_image_path}") + print( + f"\nProcessed LineText2Image, Input Line: {image_path}, Input Text: {instruct_text}, " + f"Output Text: {updated_image_path}" + ) return updated_image_path class Image2Hed: def __init__(self, device): print("Initializing Image2Hed") - self.detector = HEDdetector.from_pretrained('lllyasviel/ControlNet') - - @prompts(name="Hed Detection On Image", - description="useful when you want to detect the soft hed boundary of the image. " - "like: detect the soft hed boundary of this image, or hed boundary detection on image, " - "or perform hed boundary detection on this image, or detect soft hed boundary image of this image. " - "The input to this tool should be a string, representing the image_path") + self.detector = HEDdetector.from_pretrained("lllyasviel/ControlNet") + + @prompts( + name="Hed Detection On Image", + description="useful when you want to detect the soft hed boundary of the image. " + "like: detect the soft hed boundary of this image, or hed boundary detection on image, " + "or perform hed boundary detection on this image, or detect soft hed boundary image of this image. 
" + "The input to this tool should be a string, representing the image_path", + ) def inference(self, inputs): image = Image.open(inputs) hed = self.detector(image) updated_image_path = get_new_image_name(inputs, func_name="hed-boundary") hed.save(updated_image_path) - print(f"\nProcessed Image2Hed, Input Image: {inputs}, Output Hed: {updated_image_path}") + print( + f"\nProcessed Image2Hed, Input Image: {inputs}, Output Hed: {updated_image_path}" + ) return updated_image_path class HedText2Image: def __init__(self, device): print(f"Initializing HedText2Image to {device}") - self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 - self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-hed", - torch_dtype=self.torch_dtype) + self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained( + "fusing/stable-diffusion-v1-5-controlnet-hed", torch_dtype=self.torch_dtype + ) self.pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=StableDiffusionSafetyChecker.from_pretrained('CompVis/stable-diffusion-safety-checker'), - torch_dtype=self.torch_dtype + "runwayml/stable-diffusion-v1-5", + controlnet=self.controlnet, + safety_checker=StableDiffusionSafetyChecker.from_pretrained( + "CompVis/stable-diffusion-safety-checker" + ), + torch_dtype=self.torch_dtype, + ) + self.pipe.scheduler = UniPCMultistepScheduler.from_config( + self.pipe.scheduler.config ) - self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) self.pipe.to(device) self.seed = -1 - self.a_prompt = 'best quality, extremely detailed' - self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ - 'fewer digits, cropped, worst quality, low quality' - - @prompts(name="Generate Image Condition On Soft Hed Boundary Image", - description="useful when you want to generate a new real image from both the user description " - "and a soft hed boundary image. " - "like: generate a real image of a object or something from this soft hed boundary image, " - "or generate a new real image of a object or something from this hed boundary. " - "The input to this tool should be a comma separated string of two, " - "representing the image_path and the user description") + self.a_prompt = "best quality, extremely detailed" + self.n_prompt = ( + "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, " + "fewer digits, cropped, worst quality, low quality" + ) + + @prompts( + name="Generate Image Condition On Soft Hed Boundary Image", + description="useful when you want to generate a new real image from both the user description " + "and a soft hed boundary image. " + "like: generate a real image of a object or something from this soft hed boundary image, " + "or generate a new real image of a object or something from this hed boundary. 
" + "The input to this tool should be a comma separated string of two, " + "representing the image_path and the user description", + ) def inference(self, inputs): - image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image_path, instruct_text = inputs.split(",")[0], ",".join( + inputs.split(",")[1:] + ) image = Image.open(image_path) self.seed = random.randint(0, 65535) seed_everything(self.seed) - prompt = f'{instruct_text}, {self.a_prompt}' - image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, - guidance_scale=9.0).images[0] + prompt = f"{instruct_text}, {self.a_prompt}" + image = self.pipe( + prompt, + image, + num_inference_steps=20, + eta=0.0, + negative_prompt=self.n_prompt, + guidance_scale=9.0, + ).images[0] updated_image_path = get_new_image_name(image_path, func_name="hed2image") image.save(updated_image_path) - print(f"\nProcessed HedText2Image, Input Hed: {image_path}, Input Text: {instruct_text}, " - f"Output Image: {updated_image_path}") + print( + f"\nProcessed HedText2Image, Input Hed: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}" + ) return updated_image_path class Image2Scribble: def __init__(self, device): print("Initializing Image2Scribble") - self.detector = HEDdetector.from_pretrained('lllyasviel/ControlNet') - - @prompts(name="Sketch Detection On Image", - description="useful when you want to generate a scribble of the image. " - "like: generate a scribble of this image, or generate a sketch from this image, " - "detect the sketch from this image. " - "The input to this tool should be a string, representing the image_path") + self.detector = HEDdetector.from_pretrained("lllyasviel/ControlNet") + + @prompts( + name="Sketch Detection On Image", + description="useful when you want to generate a scribble of the image. " + "like: generate a scribble of this image, or generate a sketch from this image, " + "detect the sketch from this image. 
" + "The input to this tool should be a string, representing the image_path", + ) def inference(self, inputs): image = Image.open(inputs) scribble = self.detector(image, scribble=True) updated_image_path = get_new_image_name(inputs, func_name="scribble") scribble.save(updated_image_path) - print(f"\nProcessed Image2Scribble, Input Image: {inputs}, Output Scribble: {updated_image_path}") + print( + f"\nProcessed Image2Scribble, Input Image: {inputs}, Output Scribble: {updated_image_path}" + ) return updated_image_path class ScribbleText2Image: def __init__(self, device): print(f"Initializing ScribbleText2Image to {device}") - self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 - self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-scribble", - torch_dtype=self.torch_dtype) + self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained( + "fusing/stable-diffusion-v1-5-controlnet-scribble", + torch_dtype=self.torch_dtype, + ) self.pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=StableDiffusionSafetyChecker.from_pretrained('CompVis/stable-diffusion-safety-checker'), - torch_dtype=self.torch_dtype + "runwayml/stable-diffusion-v1-5", + controlnet=self.controlnet, + safety_checker=StableDiffusionSafetyChecker.from_pretrained( + "CompVis/stable-diffusion-safety-checker" + ), + torch_dtype=self.torch_dtype, + ) + self.pipe.scheduler = UniPCMultistepScheduler.from_config( + self.pipe.scheduler.config ) - self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) self.pipe.to(device) self.seed = -1 - self.a_prompt = 'best quality, extremely detailed' - self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ - 'fewer digits, cropped, worst quality, low quality' - - @prompts(name="Generate Image Condition On Sketch Image", - description="useful when you want to generate a new real image from both the user description and " - "a scribble image or a sketch image. " - "The input to this tool should be a comma separated string of two, " - "representing the image_path and the user description") + self.a_prompt = "best quality, extremely detailed" + self.n_prompt = ( + "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, " + "fewer digits, cropped, worst quality, low quality" + ) + + @prompts( + name="Generate Image Condition On Sketch Image", + description="useful when you want to generate a new real image from both the user description and " + "a scribble image or a sketch image. 
" + "The input to this tool should be a comma separated string of two, " + "representing the image_path and the user description", + ) def inference(self, inputs): - image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image_path, instruct_text = inputs.split(",")[0], ",".join( + inputs.split(",")[1:] + ) image = Image.open(image_path) self.seed = random.randint(0, 65535) seed_everything(self.seed) - prompt = f'{instruct_text}, {self.a_prompt}' - image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, - guidance_scale=9.0).images[0] + prompt = f"{instruct_text}, {self.a_prompt}" + image = self.pipe( + prompt, + image, + num_inference_steps=20, + eta=0.0, + negative_prompt=self.n_prompt, + guidance_scale=9.0, + ).images[0] updated_image_path = get_new_image_name(image_path, func_name="scribble2image") image.save(updated_image_path) - print(f"\nProcessed ScribbleText2Image, Input Scribble: {image_path}, Input Text: {instruct_text}, " - f"Output Image: {updated_image_path}") + print( + f"\nProcessed ScribbleText2Image, Input Scribble: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}" + ) return updated_image_path class Image2Pose: def __init__(self, device): print("Initializing Image2Pose") - self.detector = OpenposeDetector.from_pretrained('lllyasviel/ControlNet') - - @prompts(name="Pose Detection On Image", - description="useful when you want to detect the human pose of the image. " - "like: generate human poses of this image, or generate a pose image from this image. " - "The input to this tool should be a string, representing the image_path") + self.detector = OpenposeDetector.from_pretrained("lllyasviel/ControlNet") + + @prompts( + name="Pose Detection On Image", + description="useful when you want to detect the human pose of the image. " + "like: generate human poses of this image, or generate a pose image from this image. 
" + "The input to this tool should be a string, representing the image_path", + ) def inference(self, inputs): image = Image.open(inputs) pose = self.detector(image) updated_image_path = get_new_image_name(inputs, func_name="human-pose") pose.save(updated_image_path) - print(f"\nProcessed Image2Pose, Input Image: {inputs}, Output Pose: {updated_image_path}") + print( + f"\nProcessed Image2Pose, Input Image: {inputs}, Output Pose: {updated_image_path}" + ) return updated_image_path class PoseText2Image: def __init__(self, device): print(f"Initializing PoseText2Image to {device}") - self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 - self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-openpose", - torch_dtype=self.torch_dtype) + self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained( + "fusing/stable-diffusion-v1-5-controlnet-openpose", + torch_dtype=self.torch_dtype, + ) self.pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=StableDiffusionSafetyChecker.from_pretrained('CompVis/stable-diffusion-safety-checker'), - torch_dtype=self.torch_dtype) - self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + "runwayml/stable-diffusion-v1-5", + controlnet=self.controlnet, + safety_checker=StableDiffusionSafetyChecker.from_pretrained( + "CompVis/stable-diffusion-safety-checker" + ), + torch_dtype=self.torch_dtype, + ) + self.pipe.scheduler = UniPCMultistepScheduler.from_config( + self.pipe.scheduler.config + ) self.pipe.to(device) self.num_inference_steps = 20 self.seed = -1 self.unconditional_guidance_scale = 9.0 - self.a_prompt = 'best quality, extremely detailed' - self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \ - ' fewer digits, cropped, worst quality, low quality' - - @prompts(name="Generate Image Condition On Pose Image", - description="useful when you want to generate a new real image from both the user description " - "and a human pose image. " - "like: generate a real image of a human from this human pose image, " - "or generate a new real image of a human from this pose. " - "The input to this tool should be a comma separated string of two, " - "representing the image_path and the user description") + self.a_prompt = "best quality, extremely detailed" + self.n_prompt = ( + "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit," + " fewer digits, cropped, worst quality, low quality" + ) + + @prompts( + name="Generate Image Condition On Pose Image", + description="useful when you want to generate a new real image from both the user description " + "and a human pose image. " + "like: generate a real image of a human from this human pose image, " + "or generate a new real image of a human from this pose. 
" + "The input to this tool should be a comma separated string of two, " + "representing the image_path and the user description", + ) def inference(self, inputs): - image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image_path, instruct_text = inputs.split(",")[0], ",".join( + inputs.split(",")[1:] + ) image = Image.open(image_path) self.seed = random.randint(0, 65535) seed_everything(self.seed) - prompt = f'{instruct_text}, {self.a_prompt}' - image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, - guidance_scale=9.0).images[0] + prompt = f"{instruct_text}, {self.a_prompt}" + image = self.pipe( + prompt, + image, + num_inference_steps=20, + eta=0.0, + negative_prompt=self.n_prompt, + guidance_scale=9.0, + ).images[0] updated_image_path = get_new_image_name(image_path, func_name="pose2image") image.save(updated_image_path) - print(f"\nProcessed PoseText2Image, Input Pose: {image_path}, Input Text: {instruct_text}, " - f"Output Image: {updated_image_path}") + print( + f"\nProcessed PoseText2Image, Input Pose: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}" + ) return updated_image_path class SegText2Image: def __init__(self, device): print(f"Initializing SegText2Image to {device}") - self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 - self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-seg", - torch_dtype=self.torch_dtype) + self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained( + "fusing/stable-diffusion-v1-5-controlnet-seg", torch_dtype=self.torch_dtype + ) self.pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=StableDiffusionSafetyChecker.from_pretrained('CompVis/stable-diffusion-safety-checker'), - torch_dtype=self.torch_dtype) - self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + "runwayml/stable-diffusion-v1-5", + controlnet=self.controlnet, + safety_checker=StableDiffusionSafetyChecker.from_pretrained( + "CompVis/stable-diffusion-safety-checker" + ), + torch_dtype=self.torch_dtype, + ) + self.pipe.scheduler = UniPCMultistepScheduler.from_config( + self.pipe.scheduler.config + ) self.pipe.to(device) self.seed = -1 - self.a_prompt = 'best quality, extremely detailed' - self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \ - ' fewer digits, cropped, worst quality, low quality' - - @prompts(name="Generate Image Condition On Segmentations", - description="useful when you want to generate a new real image from both the user description and segmentations. " - "like: generate a real image of a object or something from this segmentation image, " - "or generate a new real image of a object or something from these segmentations. " - "The input to this tool should be a comma separated string of two, " - "representing the image_path and the user description") + self.a_prompt = "best quality, extremely detailed" + self.n_prompt = ( + "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit," + " fewer digits, cropped, worst quality, low quality" + ) + + @prompts( + name="Generate Image Condition On Segmentations", + description="useful when you want to generate a new real image from both the user description and segmentations. 
" + "like: generate a real image of a object or something from this segmentation image, " + "or generate a new real image of a object or something from these segmentations. " + "The input to this tool should be a comma separated string of two, " + "representing the image_path and the user description", + ) def inference(self, inputs): - image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image_path, instruct_text = inputs.split(",")[0], ",".join( + inputs.split(",")[1:] + ) image = Image.open(image_path) self.seed = random.randint(0, 65535) seed_everything(self.seed) - prompt = f'{instruct_text}, {self.a_prompt}' - image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, - guidance_scale=9.0).images[0] + prompt = f"{instruct_text}, {self.a_prompt}" + image = self.pipe( + prompt, + image, + num_inference_steps=20, + eta=0.0, + negative_prompt=self.n_prompt, + guidance_scale=9.0, + ).images[0] updated_image_path = get_new_image_name(image_path, func_name="segment2image") image.save(updated_image_path) - print(f"\nProcessed SegText2Image, Input Seg: {image_path}, Input Text: {instruct_text}, " - f"Output Image: {updated_image_path}") + print( + f"\nProcessed SegText2Image, Input Seg: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}" + ) return updated_image_path class Image2Depth: def __init__(self, device): print("Initializing Image2Depth") - self.depth_estimator = pipeline('depth-estimation') - - @prompts(name="Predict Depth On Image", - description="useful when you want to detect depth of the image. like: generate the depth from this image, " - "or detect the depth map on this image, or predict the depth for this image. " - "The input to this tool should be a string, representing the image_path") + self.depth_estimator = pipeline("depth-estimation") + + @prompts( + name="Predict Depth On Image", + description="useful when you want to detect depth of the image. like: generate the depth from this image, " + "or detect the depth map on this image, or predict the depth for this image. 
" + "The input to this tool should be a string, representing the image_path", + ) def inference(self, inputs): image = Image.open(inputs) - depth = self.depth_estimator(image)['depth'] + depth = self.depth_estimator(image)["depth"] depth = np.array(depth) depth = depth[:, :, None] depth = np.concatenate([depth, depth, depth], axis=2) depth = Image.fromarray(depth) updated_image_path = get_new_image_name(inputs, func_name="depth") depth.save(updated_image_path) - print(f"\nProcessed Image2Depth, Input Image: {inputs}, Output Depth: {updated_image_path}") + print( + f"\nProcessed Image2Depth, Input Image: {inputs}, Output Depth: {updated_image_path}" + ) return updated_image_path class DepthText2Image: def __init__(self, device): print(f"Initializing DepthText2Image to {device}") - self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 self.controlnet = ControlNetModel.from_pretrained( - "fusing/stable-diffusion-v1-5-controlnet-depth", torch_dtype=self.torch_dtype) + "fusing/stable-diffusion-v1-5-controlnet-depth", + torch_dtype=self.torch_dtype, + ) self.pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=StableDiffusionSafetyChecker.from_pretrained('CompVis/stable-diffusion-safety-checker'), - torch_dtype=self.torch_dtype) - self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + "runwayml/stable-diffusion-v1-5", + controlnet=self.controlnet, + safety_checker=StableDiffusionSafetyChecker.from_pretrained( + "CompVis/stable-diffusion-safety-checker" + ), + torch_dtype=self.torch_dtype, + ) + self.pipe.scheduler = UniPCMultistepScheduler.from_config( + self.pipe.scheduler.config + ) self.pipe.to(device) self.seed = -1 - self.a_prompt = 'best quality, extremely detailed' - self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \ - ' fewer digits, cropped, worst quality, low quality' - - @prompts(name="Generate Image Condition On Depth", - description="useful when you want to generate a new real image from both the user description and depth image. " - "like: generate a real image of a object or something from this depth image, " - "or generate a new real image of a object or something from the depth map. " - "The input to this tool should be a comma separated string of two, " - "representing the image_path and the user description") + self.a_prompt = "best quality, extremely detailed" + self.n_prompt = ( + "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit," + " fewer digits, cropped, worst quality, low quality" + ) + + @prompts( + name="Generate Image Condition On Depth", + description="useful when you want to generate a new real image from both the user description and depth image. " + "like: generate a real image of a object or something from this depth image, " + "or generate a new real image of a object or something from the depth map. 
" + "The input to this tool should be a comma separated string of two, " + "representing the image_path and the user description", + ) def inference(self, inputs): - image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image_path, instruct_text = inputs.split(",")[0], ",".join( + inputs.split(",")[1:] + ) image = Image.open(image_path) self.seed = random.randint(0, 65535) seed_everything(self.seed) - prompt = f'{instruct_text}, {self.a_prompt}' - image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, - guidance_scale=9.0).images[0] + prompt = f"{instruct_text}, {self.a_prompt}" + image = self.pipe( + prompt, + image, + num_inference_steps=20, + eta=0.0, + negative_prompt=self.n_prompt, + guidance_scale=9.0, + ).images[0] updated_image_path = get_new_image_name(image_path, func_name="depth2image") image.save(updated_image_path) - print(f"\nProcessed DepthText2Image, Input Depth: {image_path}, Input Text: {instruct_text}, " - f"Output Image: {updated_image_path}") + print( + f"\nProcessed DepthText2Image, Input Depth: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}" + ) return updated_image_path class Image2Normal: def __init__(self, device): print("Initializing Image2Normal") - self.depth_estimator = pipeline("depth-estimation", model="Intel/dpt-hybrid-midas") + self.depth_estimator = pipeline( + "depth-estimation", model="Intel/dpt-hybrid-midas" + ) self.bg_threhold = 0.4 - @prompts(name="Predict Normal Map On Image", - description="useful when you want to detect norm map of the image. " - "like: generate normal map from this image, or predict normal map of this image. " - "The input to this tool should be a string, representing the image_path") + @prompts( + name="Predict Normal Map On Image", + description="useful when you want to detect norm map of the image. " + "like: generate normal map from this image, or predict normal map of this image. 
" + "The input to this tool should be a string, representing the image_path", + ) def inference(self, inputs): image = Image.open(inputs) original_size = image.size - image = self.depth_estimator(image)['predicted_depth'][0] + image = self.depth_estimator(image)["predicted_depth"][0] image = image.numpy() image_depth = image.copy() image_depth -= np.min(image_depth) @@ -726,74 +953,106 @@ class Image2Normal: y[image_depth < self.bg_threhold] = 0 z = np.ones_like(x) * np.pi * 2.0 image = np.stack([x, y, z], axis=2) - image /= np.sum(image ** 2.0, axis=2, keepdims=True) ** 0.5 + image /= np.sum(image**2.0, axis=2, keepdims=True) ** 0.5 image = (image * 127.5 + 127.5).clip(0, 255).astype(np.uint8) image = Image.fromarray(image) image = image.resize(original_size) updated_image_path = get_new_image_name(inputs, func_name="normal-map") image.save(updated_image_path) - print(f"\nProcessed Image2Normal, Input Image: {inputs}, Output Depth: {updated_image_path}") + print( + f"\nProcessed Image2Normal, Input Image: {inputs}, Output Depth: {updated_image_path}" + ) return updated_image_path class NormalText2Image: def __init__(self, device): print(f"Initializing NormalText2Image to {device}") - self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 self.controlnet = ControlNetModel.from_pretrained( - "fusing/stable-diffusion-v1-5-controlnet-normal", torch_dtype=self.torch_dtype) + "fusing/stable-diffusion-v1-5-controlnet-normal", + torch_dtype=self.torch_dtype, + ) self.pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=StableDiffusionSafetyChecker.from_pretrained('CompVis/stable-diffusion-safety-checker'), - torch_dtype=self.torch_dtype) - self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + "runwayml/stable-diffusion-v1-5", + controlnet=self.controlnet, + safety_checker=StableDiffusionSafetyChecker.from_pretrained( + "CompVis/stable-diffusion-safety-checker" + ), + torch_dtype=self.torch_dtype, + ) + self.pipe.scheduler = UniPCMultistepScheduler.from_config( + self.pipe.scheduler.config + ) self.pipe.to(device) self.seed = -1 - self.a_prompt = 'best quality, extremely detailed' - self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \ - ' fewer digits, cropped, worst quality, low quality' - - @prompts(name="Generate Image Condition On Normal Map", - description="useful when you want to generate a new real image from both the user description and normal map. " - "like: generate a real image of a object or something from this normal map, " - "or generate a new real image of a object or something from the normal map. " - "The input to this tool should be a comma separated string of two, " - "representing the image_path and the user description") + self.a_prompt = "best quality, extremely detailed" + self.n_prompt = ( + "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit," + " fewer digits, cropped, worst quality, low quality" + ) + + @prompts( + name="Generate Image Condition On Normal Map", + description="useful when you want to generate a new real image from both the user description and normal map. " + "like: generate a real image of a object or something from this normal map, " + "or generate a new real image of a object or something from the normal map. 
" + "The input to this tool should be a comma separated string of two, " + "representing the image_path and the user description", + ) def inference(self, inputs): - image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image_path, instruct_text = inputs.split(",")[0], ",".join( + inputs.split(",")[1:] + ) image = Image.open(image_path) self.seed = random.randint(0, 65535) seed_everything(self.seed) - prompt = f'{instruct_text}, {self.a_prompt}' - image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, - guidance_scale=9.0).images[0] + prompt = f"{instruct_text}, {self.a_prompt}" + image = self.pipe( + prompt, + image, + num_inference_steps=20, + eta=0.0, + negative_prompt=self.n_prompt, + guidance_scale=9.0, + ).images[0] updated_image_path = get_new_image_name(image_path, func_name="normal2image") image.save(updated_image_path) - print(f"\nProcessed NormalText2Image, Input Normal: {image_path}, Input Text: {instruct_text}, " - f"Output Image: {updated_image_path}") + print( + f"\nProcessed NormalText2Image, Input Normal: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}" + ) return updated_image_path class VisualQuestionAnswering: def __init__(self, device): print(f"Initializing VisualQuestionAnswering to {device}") - self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 self.device = device self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base") self.model = BlipForQuestionAnswering.from_pretrained( - "Salesforce/blip-vqa-base", torch_dtype=self.torch_dtype).to(self.device) - - @prompts(name="Answer Question About The Image", - description="useful when you need an answer for a question based on an image. " - "like: what is the background color of the last image, how many cats in this figure, what is in this figure. " - "The input to this tool should be a comma separated string of two, representing the image_path and the question") + "Salesforce/blip-vqa-base", torch_dtype=self.torch_dtype + ).to(self.device) + + @prompts( + name="Answer Question About The Image", + description="useful when you need an answer for a question based on an image. " + "like: what is the background color of the last image, how many cats in this figure, what is in this figure. 
" + "The input to this tool should be a comma separated string of two, representing the image_path and the question", + ) def inference(self, inputs): - image_path, question = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) - raw_image = Image.open(image_path).convert('RGB') - inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device, self.torch_dtype) + image_path, question = inputs.split(",")[0], ",".join(inputs.split(",")[1:]) + raw_image = Image.open(image_path).convert("RGB") + inputs = self.processor(raw_image, question, return_tensors="pt").to( + self.device, self.torch_dtype + ) out = self.model.generate(**inputs) answer = self.processor.decode(out[0], skip_special_tokens=True) - print(f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, " - f"Output Answer: {answer}") + print( + f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, " + f"Output Answer: {answer}" + ) return answer @@ -801,7 +1060,7 @@ class Segmenting: def __init__(self, device): print(f"Inintializing Segmentation to {device}") self.device = device - self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 self.model_checkpoint_path = os.path.join("checkpoints", "sam") self.download_parameters() @@ -817,8 +1076,13 @@ class Segmenting: if not os.path.exists(self.model_checkpoint_path): wget.download(url, out=self.model_checkpoint_path) - def show_mask(self, mask: np.ndarray, image: np.ndarray, - random_color: bool = False, transparency=1) -> np.ndarray: + def show_mask( + self, + mask: np.ndarray, + image: np.ndarray, + random_color: bool = False, + transparency=1, + ) -> np.ndarray: """Visualize a mask on top of an image. Args: mask (np.ndarray): A 2D array of shape (H, W). 
@@ -837,18 +1101,21 @@ class Segmenting: h, w = mask.shape[-2:] mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1) * 255 - image = cv2.addWeighted(image, 0.7, mask_image.astype('uint8'), transparency, 0) + image = cv2.addWeighted(image, 0.7, mask_image.astype("uint8"), transparency, 0) return image def show_box(self, box, ax, label): x0, y0 = box[0], box[1] w, h = box[2] - box[0], box[3] - box[1] - ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0, 0, 0, 0), lw=2)) + ax.add_patch( + plt.Rectangle( + (x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2 + ) + ) ax.text(x0, y0, label) def get_mask_with_boxes(self, image_pil, image, boxes_filt): - size = image_pil.size H, W = size[1], size[0] for i in range(boxes_filt.size(0)): @@ -857,7 +1124,9 @@ class Segmenting: boxes_filt[i][2:] += boxes_filt[i][:2] boxes_filt = boxes_filt.cpu() - transformed_boxes = self.sam_predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2]).to(self.device) + transformed_boxes = self.sam_predictor.transform.apply_boxes_torch( + boxes_filt, image.shape[:2] + ).to(self.device) masks, _, _ = self.sam_predictor.predict_torch( point_coords=None, @@ -868,7 +1137,6 @@ class Segmenting: return masks def segment_image_with_boxes(self, image_pil, image_path, boxes_filt, pred_phrases): - image = cv2.imread(image_path) image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) self.sam_predictor.set_image(image) @@ -878,7 +1146,9 @@ class Segmenting: # draw output image for mask in masks: - image = self.show_mask(mask[0].cpu().numpy(), image, random_color=True, transparency=0.3) + image = self.show_mask( + mask[0].cpu().numpy(), image, random_color=True, transparency=0.3 + ) updated_image_path = get_new_image_name(image_path, func_name="segmentation") @@ -892,8 +1162,9 @@ class Segmenting: with torch.cuda.amp.autocast(): self.sam_predictor.set_image(img) - def show_points(self, coords: np.ndarray, labels: np.ndarray, - image: np.ndarray) -> np.ndarray: + def show_points( + self, coords: np.ndarray, labels: np.ndarray, image: np.ndarray + ) -> np.ndarray: """Visualize points on top of an image. Args: @@ -908,14 +1179,15 @@ class Segmenting: neg_points = coords[labels == 0] for p in pos_points: image = cv2.circle( - image, p.astype(int), radius=3, color=(0, 255, 0), thickness=-1) + image, p.astype(int), radius=3, color=(0, 255, 0), thickness=-1 + ) for p in neg_points: image = cv2.circle( - image, p.astype(int), radius=3, color=(255, 0, 0), thickness=-1) + image, p.astype(int), radius=3, color=(255, 0, 0), thickness=-1 + ) return image def segment_image_with_click(self, img, is_positive: bool): - self.sam_predictor.set_image(img) # self.saved_points.append([evt.index[0], evt.index[1]]) self.saved_labels.append(1 if is_positive else 0) @@ -936,24 +1208,23 @@ class Segmenting: return img - def segment_image_with_coordinate(self, img, is_positive: bool, - coordinate: tuple): - ''' - Args: - img (numpy.ndarray): the given image, shape: H x W x 3. - is_positive: whether the click is positive, if want to add mask use True else False. - coordinate: the position of the click - If the position is (x,y), means click at the x-th column and y-th row of the pixel matrix. - So x correspond to W, and y correspond to H. - Output: - img (PLI.Image.Image): the result image - result_mask (numpy.ndarray): the result mask, shape: H x W - - Other parameters: - transparency (float): the transparenccy of the mask - to control he degree of transparency after the mask is superimposed. 
- if transparency=1, then the masked part will be completely replaced with other colors. - ''' + def segment_image_with_coordinate(self, img, is_positive: bool, coordinate: tuple): + """ + Args: + img (numpy.ndarray): the given image, shape: H x W x 3. + is_positive: whether the click is positive, if want to add mask use True else False. + coordinate: the position of the click + If the position is (x,y), means click at the x-th column and y-th row of the pixel matrix. + So x correspond to W, and y correspond to H. + Output: + img (PLI.Image.Image): the result image + result_mask (numpy.ndarray): the result mask, shape: H x W + + Other parameters: + transparency (float): the transparenccy of the mask + to control he degree of transparency after the mask is superimposed. + if transparency=1, then the masked part will be completely replaced with other colors. + """ self.sam_predictor.set_image(img) self.saved_points.append([coordinate[0], coordinate[1]]) self.saved_labels.append(1 if is_positive else 0) @@ -978,13 +1249,15 @@ class Segmenting: return img, result_mask - @prompts(name="Segment the Image", - description="useful when you want to segment all the part of the image, but not segment a certain object." - "like: segment all the object in this image, or generate segmentations on this image, " - "or segment the image," - "or perform segmentation on this image, " - "or segment all the object in this image." - "The input to this tool should be a string, representing the image_path") + @prompts( + name="Segment the Image", + description="useful when you want to segment all the part of the image, but not segment a certain object." + "like: segment all the object in this image, or generate segmentations on this image, " + "or segment the image," + "or perform segmentation on this image, " + "or segment all the object in this image." 
+ "The input to this tool should be a string, representing the image_path", + ) def inference_all(self, image_path): image = cv2.imread(image_path) image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) @@ -993,11 +1266,11 @@ class Segmenting: plt.imshow(image) if len(masks) == 0: return - sorted_anns = sorted(masks, key=(lambda x: x['area']), reverse=True) + sorted_anns = sorted(masks, key=(lambda x: x["area"]), reverse=True) ax = plt.gca() ax.set_autoscale_on(False) for ann in sorted_anns: - m = ann['segmentation'] + m = ann["segmentation"] img = np.ones((m.shape[0], m.shape[1], 3)) color_mask = np.random.random((1, 3)).tolist()[0] for i in range(3): @@ -1005,11 +1278,8 @@ class Segmenting: ax.imshow(np.dstack((img, m))) updated_image_path = get_new_image_name(image_path, func_name="segment-image") - plt.axis('off') - plt.savefig( - updated_image_path, - bbox_inches="tight", dpi=300, pad_inches=0.0 - ) + plt.axis("off") + plt.savefig(updated_image_path, bbox_inches="tight", dpi=300, pad_inches=0.0) return updated_image_path @@ -1017,7 +1287,7 @@ class Text2Box: def __init__(self, device): print(f"Initializing ObjectDetection to {device}") self.device = device - self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 self.model_checkpoint_path = os.path.join("checkpoints", "groundingdino") self.model_config_path = os.path.join("checkpoints", "grounding_config.py") self.download_parameters() @@ -1052,7 +1322,9 @@ class Text2Box: args.device = self.device model = build_model(args) checkpoint = torch.load(self.model_checkpoint_path, map_location="cpu") - load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False) + load_res = model.load_state_dict( + clean_state_dict(checkpoint["model"]), strict=False + ) print(load_res) _ = model.eval() return model @@ -1083,7 +1355,9 @@ class Text2Box: # build pred pred_phrases = [] for logit, box in zip(logits_filt, boxes_filt): - pred_phrase = get_phrases_from_posmap(logit > self.text_threshold, tokenized, tokenlizer) + pred_phrase = get_phrases_from_posmap( + logit > self.text_threshold, tokenized, tokenlizer + ) if with_logits: pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") else: @@ -1131,10 +1405,12 @@ class Text2Box: return image_pil, mask - @prompts(name="Detect the Give Object", - description="useful when you only want to detect or find out given objects in the picture" - "The input to this tool should be a comma separated string of two, " - "representing the image_path, the text description of the object to be found") + @prompts( + name="Detect the Give Object", + description="useful when you only want to detect or find out given objects in the picture" + "The input to this tool should be a comma separated string of two, " + "representing the image_path, the text description of the object to be found", + ) def inference(self, inputs): image_path, det_prompt = inputs.split(",") print(f"image_path={image_path}, text_prompt={det_prompt}") @@ -1146,31 +1422,49 @@ class Text2Box: pred_dict = { "boxes": boxes_filt, "size": [size[1], size[0]], # H,W - "labels": pred_phrases, } + "labels": pred_phrases, + } image_with_box = self.plot_boxes_to_image(image_pil, pred_dict)[0] - updated_image_path = get_new_image_name(image_path, func_name="detect-something") + updated_image_path = get_new_image_name( + image_path, func_name="detect-something" + ) updated_image = image_with_box.resize(size) updated_image.save(updated_image_path) 
print( f"\nProcessed ObejectDetecting, Input Image: {image_path}, Object to be Detect {det_prompt}, " - f"Output Image: {updated_image_path}") + f"Output Image: {updated_image_path}" + ) return updated_image_path class Inpainting: def __init__(self, device): self.device = device - self.revision = 'fp16' if 'cuda' in self.device else None - self.torch_dtype = torch.float16 if 'cuda' in self.device else torch.float32 + self.revision = "fp16" if "cuda" in self.device else None + self.torch_dtype = torch.float16 if "cuda" in self.device else torch.float32 self.inpaint = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", revision=self.revision, torch_dtype=self.torch_dtype, safety_checker=StableDiffusionSafetyChecker.from_pretrained('CompVis/stable-diffusion-safety-checker')).to(device) - - def __call__(self, prompt, image, mask_image, height=512, width=512, num_inference_steps=50): - update_image = self.inpaint(prompt=prompt, image=image.resize((width, height)), - mask_image=mask_image.resize((width, height)), height=height, width=width, num_inference_steps=num_inference_steps).images[0] + "runwayml/stable-diffusion-inpainting", + revision=self.revision, + torch_dtype=self.torch_dtype, + safety_checker=StableDiffusionSafetyChecker.from_pretrained( + "CompVis/stable-diffusion-safety-checker" + ), + ).to(device) + + def __call__( + self, prompt, image, mask_image, height=512, width=512, num_inference_steps=50 + ): + update_image = self.inpaint( + prompt=prompt, + image=image.resize((width, height)), + mask_image=mask_image.resize((width, height)), + height=height, + width=width, + num_inference_steps=num_inference_steps, + ).images[0] return update_image @@ -1182,94 +1476,133 @@ class InfinityOutPainting: self.ImageCaption = ImageCaptioning self.inpaint = Inpainting self.ImageVQA = VisualQuestionAnswering - self.a_prompt = 'best quality, extremely detailed' - self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ - 'fewer digits, cropped, worst quality, low quality' + self.a_prompt = "best quality, extremely detailed" + self.n_prompt = ( + "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, " + "fewer digits, cropped, worst quality, low quality" + ) def get_BLIP_vqa(self, image, question): - inputs = self.ImageVQA.processor(image, question, return_tensors="pt").to(self.ImageVQA.device, - self.ImageVQA.torch_dtype) + inputs = self.ImageVQA.processor(image, question, return_tensors="pt").to( + self.ImageVQA.device, self.ImageVQA.torch_dtype + ) out = self.ImageVQA.model.generate(**inputs) answer = self.ImageVQA.processor.decode(out[0], skip_special_tokens=True) - print(f"\nProcessed VisualQuestionAnswering, Input Question: {question}, Output Answer: {answer}") + print( + f"\nProcessed VisualQuestionAnswering, Input Question: {question}, Output Answer: {answer}" + ) return answer def get_BLIP_caption(self, image): - inputs = self.ImageCaption.processor(image, return_tensors="pt").to(self.ImageCaption.device, - self.ImageCaption.torch_dtype) + inputs = self.ImageCaption.processor(image, return_tensors="pt").to( + self.ImageCaption.device, self.ImageCaption.torch_dtype + ) out = self.ImageCaption.model.generate(**inputs) - BLIP_caption = self.ImageCaption.processor.decode(out[0], skip_special_tokens=True) + BLIP_caption = self.ImageCaption.processor.decode( + out[0], skip_special_tokens=True + ) return BLIP_caption def check_prompt(self, prompt): - check = f"Here is a paragraph with adjectives. 
" \ - f"{prompt} " \ - f"Please change all plural forms in the adjectives to singular forms. " + check = ( + f"Here is a paragraph with adjectives. " + f"{prompt} " + f"Please change all plural forms in the adjectives to singular forms. " + ) return self.llm(check) def get_imagine_caption(self, image, imagine): BLIP_caption = self.get_BLIP_caption(image) - background_color = self.get_BLIP_vqa(image, 'what is the background color of this image') - style = self.get_BLIP_vqa(image, 'what is the style of this image') - imagine_prompt = f"let's pretend you are an excellent painter and now " \ - f"there is an incomplete painting with {BLIP_caption} in the center, " \ - f"please imagine the complete painting and describe it" \ - f"you should consider the background color is {background_color}, the style is {style}" \ - f"You should make the painting as vivid and realistic as possible" \ - f"You can not use words like painting or picture" \ - f"and you should use no more than 50 words to describe it" + background_color = self.get_BLIP_vqa( + image, "what is the background color of this image" + ) + style = self.get_BLIP_vqa(image, "what is the style of this image") + imagine_prompt = ( + f"let's pretend you are an excellent painter and now " + f"there is an incomplete painting with {BLIP_caption} in the center, " + f"please imagine the complete painting and describe it" + f"you should consider the background color is {background_color}, the style is {style}" + f"You should make the painting as vivid and realistic as possible" + f"You can not use words like painting or picture" + f"and you should use no more than 50 words to describe it" + ) caption = self.llm(imagine_prompt) if imagine else BLIP_caption caption = self.check_prompt(caption) - print(f'BLIP observation: {BLIP_caption}, ChatGPT imagine to {caption}') if imagine else print( - f'Prompt: {caption}') + print( + f"BLIP observation: {BLIP_caption}, ChatGPT imagine to {caption}" + ) if imagine else print(f"Prompt: {caption}") return caption def resize_image(self, image, max_size=1000000, multiple=8): aspect_ratio = image.size[0] / image.size[1] new_width = int(math.sqrt(max_size * aspect_ratio)) new_height = int(new_width / aspect_ratio) - new_width, new_height = new_width - (new_width % multiple), new_height - (new_height % multiple) + new_width, new_height = new_width - (new_width % multiple), new_height - ( + new_height % multiple + ) return image.resize((new_width, new_height)) def dowhile(self, original_img, tosize, expand_ratio, imagine, usr_prompt): old_img = original_img - while (old_img.size != tosize): - prompt = self.check_prompt(usr_prompt) if usr_prompt else self.get_imagine_caption(old_img, imagine) + while old_img.size != tosize: + prompt = ( + self.check_prompt(usr_prompt) + if usr_prompt + else self.get_imagine_caption(old_img, imagine) + ) crop_w = 15 if old_img.size[0] != tosize[0] else 0 crop_h = 15 if old_img.size[1] != tosize[1] else 0 old_img = ImageOps.crop(old_img, (crop_w, crop_h, crop_w, crop_h)) - temp_canvas_size = (expand_ratio * old_img.width if expand_ratio * old_img.width < tosize[0] else tosize[0], - expand_ratio * old_img.height if expand_ratio * old_img.height < tosize[1] else tosize[ - 1]) - temp_canvas, temp_mask = Image.new("RGB", temp_canvas_size, color="white"), Image.new("L", temp_canvas_size, - color="white") - x, y = (temp_canvas.width - old_img.width) // 2, (temp_canvas.height - old_img.height) // 2 + temp_canvas_size = ( + expand_ratio * old_img.width + if expand_ratio * old_img.width < tosize[0] 
+ else tosize[0], + expand_ratio * old_img.height + if expand_ratio * old_img.height < tosize[1] + else tosize[1], + ) + temp_canvas, temp_mask = Image.new( + "RGB", temp_canvas_size, color="white" + ), Image.new("L", temp_canvas_size, color="white") + x, y = (temp_canvas.width - old_img.width) // 2, ( + temp_canvas.height - old_img.height + ) // 2 temp_canvas.paste(old_img, (x, y)) temp_mask.paste(0, (x, y, x + old_img.width, y + old_img.height)) - resized_temp_canvas, resized_temp_mask = self.resize_image(temp_canvas), self.resize_image(temp_mask) - image = self.inpaint(prompt=prompt, image=resized_temp_canvas, mask_image=resized_temp_mask, - height=resized_temp_canvas.height, width=resized_temp_canvas.width, - num_inference_steps=50).resize( - (temp_canvas.width, temp_canvas.height), Image.ANTIALIAS) + resized_temp_canvas, resized_temp_mask = self.resize_image( + temp_canvas + ), self.resize_image(temp_mask) + image = self.inpaint( + prompt=prompt, + image=resized_temp_canvas, + mask_image=resized_temp_mask, + height=resized_temp_canvas.height, + width=resized_temp_canvas.width, + num_inference_steps=50, + ).resize((temp_canvas.width, temp_canvas.height), Image.ANTIALIAS) image = blend_gt2pt(old_img, image) old_img = image return old_img - @prompts(name="Extend An Image", - description="useful when you need to extend an image into a larger image." - "like: extend the image into a resolution of 2048x1024, extend the image into 2048x1024. " - "The input to this tool should be a comma separated string of two, representing the image_path and the resolution of widthxheight") + @prompts( + name="Extend An Image", + description="useful when you need to extend an image into a larger image." + "like: extend the image into a resolution of 2048x1024, extend the image into 2048x1024. 
" + "The input to this tool should be a comma separated string of two, representing the image_path and the resolution of widthxheight", + ) def inference(self, inputs): - image_path, resolution = inputs.split(',') - width, height = resolution.split('x') + image_path, resolution = inputs.split(",") + width, height = resolution.split("x") tosize = (int(width), int(height)) image = Image.open(image_path) image = ImageOps.crop(image, (10, 10, 10, 10)) out_painted_image = self.dowhile(image, tosize, 4, True, False) updated_image_path = get_new_image_name(image_path, func_name="outpainting") out_painted_image.save(updated_image_path) - print(f"\nProcessed InfinityOutPainting, Input Image: {image_path}, Input Resolution: {resolution}, " - f"Output Image: {updated_image_path}") + print( + f"\nProcessed InfinityOutPainting, Input Image: {image_path}, Input Resolution: {resolution}, " + f"Output Image: {updated_image_path}" + ) return updated_image_path @@ -1281,38 +1614,45 @@ class ObjectSegmenting: self.grounding = Text2Box self.sam = Segmenting - @prompts(name="Segment the given object", - description="useful when you only want to segment the certain objects in the picture" - "according to the given text" - "like: segment the cat," - "or can you segment an obeject for me" - "The input to this tool should be a comma separated string of two, " - "representing the image_path, the text description of the object to be found") + @prompts( + name="Segment the given object", + description="useful when you only want to segment the certain objects in the picture" + "according to the given text" + "like: segment the cat," + "or can you segment an obeject for me" + "The input to this tool should be a comma separated string of two, " + "representing the image_path, the text description of the object to be found", + ) def inference(self, inputs): image_path, det_prompt = inputs.split(",") print(f"image_path={image_path}, text_prompt={det_prompt}") image_pil, image = self.grounding.load_image(image_path) boxes_filt, pred_phrases = self.grounding.get_grounding_boxes(image, det_prompt) - updated_image_path = self.sam.segment_image_with_boxes(image_pil, image_path, boxes_filt, pred_phrases) + updated_image_path = self.sam.segment_image_with_boxes( + image_pil, image_path, boxes_filt, pred_phrases + ) print( f"\nProcessed ObejectSegmenting, Input Image: {image_path}, Object to be Segment {det_prompt}, " - f"Output Image: {updated_image_path}") + f"Output Image: {updated_image_path}" + ) return updated_image_path def merge_masks(self, masks): - ''' - Args: - mask (numpy.ndarray): shape N x 1 x H x W - Outputs: - new_mask (numpy.ndarray): shape H x W - ''' + """ + Args: + mask (numpy.ndarray): shape N x 1 x H x W + Outputs: + new_mask (numpy.ndarray): shape H x W + """ if type(masks) == torch.Tensor: x = masks elif type(masks) == np.ndarray: x = torch.tensor(masks, dtype=int) else: - raise TypeError("the type of the input masks must be numpy.ndarray or torch.tensor") + raise TypeError( + "the type of the input masks must be numpy.ndarray or torch.tensor" + ) x = x.squeeze(dim=1) value, _ = x.max(dim=0) new_mask = value.cpu().numpy() @@ -1320,13 +1660,14 @@ class ObjectSegmenting: return new_mask def get_mask(self, image_path, text_prompt): - print(f"image_path={image_path}, text_prompt={text_prompt}") # image_pil (PIL.Image.Image) -> size: W x H # image (numpy.ndarray) -> H x W x 3 image_pil, image = self.grounding.load_image(image_path) - boxes_filt, pred_phrases = self.grounding.get_grounding_boxes(image, 
text_prompt) + boxes_filt, pred_phrases = self.grounding.get_grounding_boxes( + image, text_prompt + ) image = cv2.imread(image_path) image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) self.sam.sam_predictor.set_image(image) @@ -1339,7 +1680,9 @@ class ObjectSegmenting: # draw output image for mask in masks: - image = self.sam.show_mask(mask[0].cpu().numpy(), image, random_color=True, transparency=0.3) + image = self.sam.show_mask( + mask[0].cpu().numpy(), image, random_color=True, transparency=0.3 + ) Image.fromarray(merged_mask) @@ -1349,7 +1692,9 @@ class ObjectSegmenting: class ImageEditing: template_model = True - def __init__(self, Text2Box: Text2Box, Segmenting: Segmenting, Inpainting: Inpainting): + def __init__( + self, Text2Box: Text2Box, Segmenting: Segmenting, Inpainting: Inpainting + ): print("Initializing ImageEditing") self.sam = Segmenting self.grounding = Text2Box @@ -1361,32 +1706,44 @@ class ImageEditing: true_indices = np.argwhere(mask) mask_array = np.zeros_like(mask, dtype=bool) for idx in true_indices: - padded_slice = tuple(slice(max(0, i - padding), i + padding + 1) for i in idx) + padded_slice = tuple( + slice(max(0, i - padding), i + padding + 1) for i in idx + ) mask_array[padded_slice] = True new_mask = (mask_array * 255).astype(np.uint8) # new_mask return new_mask - @prompts(name="Remove Something From The Photo", - description="useful when you want to remove and object or something from the photo " - "from its description or location. " - "The input to this tool should be a comma separated string of two, " - "representing the image_path and the object need to be removed. ") + @prompts( + name="Remove Something From The Photo", + description="useful when you want to remove and object or something from the photo " + "from its description or location. " + "The input to this tool should be a comma separated string of two, " + "representing the image_path and the object need to be removed. ", + ) def inference_remove(self, inputs): - image_path, to_be_removed_txt = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) - return self.inference_replace_sam(f"{image_path},{to_be_removed_txt},background") - - @prompts(name="Replace Something From The Photo", - description="useful when you want to replace an object from the object description or " - "location with another object from its description. " - "The input to this tool should be a comma separated string of three, " - "representing the image_path, the object to be replaced, the object to be replaced with ") + image_path, to_be_removed_txt = inputs.split(",")[0], ",".join( + inputs.split(",")[1:] + ) + return self.inference_replace_sam( + f"{image_path},{to_be_removed_txt},background" + ) + + @prompts( + name="Replace Something From The Photo", + description="useful when you want to replace an object from the object description or " + "location with another object from its description. 
" + "The input to this tool should be a comma separated string of three, " + "representing the image_path, the object to be replaced, the object to be replaced with ", + ) def inference_replace_sam(self, inputs): image_path, to_be_replaced_txt, replace_with_txt = inputs.split(",") print(f"image_path={image_path}, to_be_replaced_txt={to_be_replaced_txt}") image_pil, image = self.grounding.load_image(image_path) - boxes_filt, pred_phrases = self.grounding.get_grounding_boxes(image, to_be_replaced_txt) + boxes_filt, pred_phrases = self.grounding.get_grounding_boxes( + image, to_be_replaced_txt + ) image = cv2.imread(image_path) image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) self.sam.sam_predictor.set_image(image) @@ -1398,35 +1755,46 @@ class ImageEditing: mask = self.pad_edge(mask, padding=20) # numpy mask_image = Image.fromarray(mask) - updated_image = self.inpaint(prompt=replace_with_txt, image=image_pil, - mask_image=mask_image) - updated_image_path = get_new_image_name(image_path, func_name="replace-something") + updated_image = self.inpaint( + prompt=replace_with_txt, image=image_pil, mask_image=mask_image + ) + updated_image_path = get_new_image_name( + image_path, func_name="replace-something" + ) updated_image = updated_image.resize(image_pil.size) updated_image.save(updated_image_path) print( f"\nProcessed ImageEditing, Input Image: {image_path}, Replace {to_be_replaced_txt} to {replace_with_txt}, " - f"Output Image: {updated_image_path}") + f"Output Image: {updated_image_path}" + ) return updated_image_path class BackgroundRemoving: - ''' - using to remove the background of the given picture - ''' + """ + using to remove the background of the given picture + """ + template_model = True - def __init__(self, VisualQuestionAnswering: VisualQuestionAnswering, Text2Box: Text2Box, Segmenting: Segmenting): + def __init__( + self, + VisualQuestionAnswering: VisualQuestionAnswering, + Text2Box: Text2Box, + Segmenting: Segmenting, + ): self.vqa = VisualQuestionAnswering self.obj_segmenting = ObjectSegmenting(Text2Box, Segmenting) - @prompts(name="Remove the background", - description="useful when you want to extract the object or remove the background," - "the input should be a string image_path" - ) + @prompts( + name="Remove the background", + description="useful when you want to extract the object or remove the background," + "the input should be a string image_path", + ) def inference(self, image_path): - ''' - given a image, return the picture only contains the extracted main object - ''' + """ + given a image, return the picture only contains the extracted main object + """ updated_image_path = None mask = self.get_mask(image_path) @@ -1435,20 +1803,22 @@ class BackgroundRemoving: mask = Image.fromarray(mask) image.putalpha(mask) - updated_image_path = get_new_image_name(image_path, func_name="detect-something") + updated_image_path = get_new_image_name( + image_path, func_name="detect-something" + ) image.save(updated_image_path) return updated_image_path def get_mask(self, image_path): - ''' - Description: - given an image path, return the mask of the main object. - Args: - image_path (string): the file path of the image - Outputs: - mask (numpy.ndarray): H x W - ''' + """ + Description: + given an image path, return the mask of the main object. + Args: + image_path (string): the file path of the image + Outputs: + mask (numpy.ndarray): H x W + """ vqa_input = f"{image_path}, what is the main object in the image?" 
text_prompt = self.vqa.inference(vqa_input) @@ -1463,12 +1833,14 @@ class MultiModalVisualAgent: load_dict, prefix: str = VISUAL_AGENT_PREFIX, format_instructions: str = VISUAL_AGENT_FORMAT_INSTRUCTIONS, - suffix: str = VISUAL_AGENT_SUFFIX + suffix: str = VISUAL_AGENT_SUFFIX, ): print(f"Initializing MultiModalVisualAgent, load_dict={load_dict}") - if 'ImageCaptioning' not in load_dict: - raise ValueError("You have to load ImageCaptioning as a basic function for MultiModalVisualAgent") + if "ImageCaptioning" not in load_dict: + raise ValueError( + "You have to load ImageCaptioning as a basic function for MultiModalVisualAgent" + ) self.models = {} @@ -1476,23 +1848,26 @@ class MultiModalVisualAgent: self.models[class_name] = globals()[class_name](device=device) for class_name, module in globals().items(): - if getattr(module, 'template_model', False): + if getattr(module, "template_model", False): template_required_names = { - k for k in inspect.signature(module.__init__).parameters.keys() if k != 'self' + k + for k in inspect.signature(module.__init__).parameters.keys() + if k != "self" } loaded_names = set([type(e).__name__ for e in self.models.values()]) if template_required_names.issubset(loaded_names): self.models[class_name] = globals()[class_name]( - **{name: self.models[name] for name in template_required_names}) + **{name: self.models[name] for name in template_required_names} + ) print(f"All the Available Functions: {self.models}") self.tools = [] for instance in self.models.values(): for e in dir(instance): - if e.startswith('inference'): + if e.startswith("inference"): func = getattr(instance, e) self.tools.append( Tool(name=func.name, description=func.description, func=func) @@ -1500,8 +1875,7 @@ class MultiModalVisualAgent: self.llm = OpenAI(temperature=0) self.memory = ConversationBufferMemory( - memory_key="chat_history", - output_key='output' + memory_key="chat_history", output_key="output" ) def init_agent(self, lang): @@ -1511,10 +1885,18 @@ class MultiModalVisualAgent: agent_suffix = self.suffix agent_format_instructions = self.format_instructions - if lang == 'English': - PREFIX, FORMAT_INSTRUCTIONS, SUFFIX = agent_prefix, agent_format_instructions, agent_suffix + if lang == "English": + PREFIX, FORMAT_INSTRUCTIONS, SUFFIX = ( + agent_prefix, + agent_format_instructions, + agent_suffix, + ) else: - PREFIX, FORMAT_INSTRUCTIONS, SUFFIX = VISUAL_AGENT_PREFIX_CN, VISUAL_AGENT_FORMAT_INSTRUCTIONS_CN, VISUAL_AGENT_SUFFIX_CN + PREFIX, FORMAT_INSTRUCTIONS, SUFFIX = ( + VISUAL_AGENT_PREFIX_CN, + VISUAL_AGENT_FORMAT_INSTRUCTIONS_CN, + VISUAL_AGENT_SUFFIX_CN, + ) self.agent = initialize_agent( self.tools, @@ -1524,29 +1906,34 @@ class MultiModalVisualAgent: memory=self.memory, return_intermediate_steps=True, agent_kwargs={ - 'prefix': PREFIX, - 'format_instructions': FORMAT_INSTRUCTIONS, - 'suffix': SUFFIX + "prefix": PREFIX, + "format_instructions": FORMAT_INSTRUCTIONS, + "suffix": SUFFIX, }, ) def run_text(self, text): self.agent.memory.buffer = cut_dialogue_history( - self.agent.memory.buffer, - keep_last_n_words=500 + self.agent.memory.buffer, keep_last_n_words=500 ) res = self.agent({"input": text.strip()}) - res['output'] = res['output'].replace("\\", "/") - response = re.sub('(image/[-\w]*.png)', lambda m: f'![](file={m.group(0)})*{m.group(0)}*', res['output']) + res["output"] = res["output"].replace("\\", "/") + response = re.sub( + "(image/[-\w]*.png)", + lambda m: f"![](file={m.group(0)})*{m.group(0)}*", + res["output"], + ) - print(f"\nProcessed run_text, Input text: 
{text}\n" - f"Current Memory: {self.agent.memory.buffer}") + print( + f"\nProcessed run_text, Input text: {text}\n" + f"Current Memory: {self.agent.memory.buffer}" + ) return response def run_image(self, image, lang): - image_filename = os.path.join('image', f"{str(uuid.uuid4())[:8]}.png") + image_filename = os.path.join("image", f"{str(uuid.uuid4())[:8]}.png") img = Image.open(image) width, height = img.size @@ -1557,22 +1944,26 @@ class MultiModalVisualAgent: height_new = int(np.round(height_new / 64.0)) * 64 img = img.resize((width_new, height_new)) - img = img.convert('RGB') + img = img.convert("RGB") img.save(image_filename, "PNG") - description = self.models['ImageCaptioning'].inference(image_filename) + description = self.models["ImageCaptioning"].inference(image_filename) - if lang == 'Chinese': - Human_prompt = f'\nHuman: 提供一张名为 {image_filename}的图片。它的描述是: {description}。 这些信息帮助你理解这个图像,但是你应该使用工具来完成下面的任务,而不是直接从我的描述中想象。 如果你明白了, 说 \"收到\". \n' + if lang == "Chinese": + Human_prompt = f'\nHuman: 提供一张名为 {image_filename}的图片。它的描述是: {description}。 这些信息帮助你理解这个图像,但是你应该使用工具来完成下面的任务,而不是直接从我的描述中想象。 如果你明白了, 说 "收到". \n' AI_prompt = "收到。 " else: - Human_prompt = f'\nHuman: provide a figure named {image_filename}. The description is: {description}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n' + Human_prompt = f'\nHuman: provide a figure named {image_filename}. The description is: {description}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say "Received". \n' AI_prompt = "Received. " - self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt + self.agent.memory.buffer = ( + self.agent.memory.buffer + Human_prompt + "AI: " + AI_prompt + ) - print(f"\nProcessed run_image, Input image: {image_filename}\n" - f"Current Memory: {self.agent.memory.buffer}") + print( + f"\nProcessed run_image, Input image: {image_filename}\n" + f"Current Memory: {self.agent.memory.buffer}" + ) return AI_prompt @@ -1619,12 +2010,7 @@ class MultiModalAgent: """ - def __init__( - self, - load_dict, - temperature: int = 0.1, - language: str = "english" - ): + def __init__(self, load_dict, temperature: int = 0.1, language: str = "english"): self.load_dict = load_dict self.temperature = temperature self.langigage = language @@ -1634,18 +2020,11 @@ class MultiModalAgent: # "ImageCaptioning": "default_device" # } - self.agent = MultiModalVisualAgent( - load_dict, - temperature - ) + self.agent = MultiModalVisualAgent(load_dict, temperature) self.language = language self.history = [] - def run_text( - self, - text: str = None, - language="english" - ): + def run_text(self, text: str = None, language="english"): """Run text through the model""" if language is None: @@ -1657,29 +2036,17 @@ class MultiModalAgent: except Exception as e: return f"Error processing text: {str(e)}" - def run_img( - self, - image_path: str, - language="english" - ): + def run_img(self, image_path: str, language="english"): """If language is None""" if language is None: language = self.default_language try: - return self.agent.run_image( - image_path, - language - ) + return self.agent.run_image(image_path, language) except Exception as error: return f"Error processing image: {str(error)}" - def chat( - self, - msg: str = None, - language: str = "english", 
- streaming: bool = False - ): + def chat(self, msg: str = None, language: str = "english", streaming: bool = False): """ Run chat with the multi-modal agent @@ -1701,12 +2068,7 @@ class MultiModalAgent: language = self.default_language # add users message to the history - self.history.append( - Message( - "User", - msg - ) - ) + self.history.append(Message("User", msg)) # process msg try: @@ -1714,12 +2076,7 @@ class MultiModalAgent: response = self.agent.run_text(msg) # add agent's response to the history - self.history.append( - Message( - "Agent", - response - ) - ) + self.history.append(Message("Agent", response)) # if streaming is = True if streaming: @@ -1731,18 +2088,10 @@ class MultiModalAgent: error_message = f"Error processing message: {str(error)}" # add error to history - self.history.append( - Message( - "Agent", - error_message - ) - ) + self.history.append(Message("Agent", error_message)) return error_message - def _stream_response( - self, - response: str = None - ): + def _stream_response(self, response: str = None): """ Yield the response token by token (word by word) diff --git a/swarms/agents/multi_modal_workers/omni_agent/get_token_ids.py b/swarms/agents/multi_modal_workers/omni_agent/get_token_ids.py index 1de21442..cf5ab64e 100644 --- a/swarms/agents/multi_modal_workers/omni_agent/get_token_ids.py +++ b/swarms/agents/multi_modal_workers/omni_agent/get_token_ids.py @@ -31,7 +31,7 @@ max_length = { "davinci": 2049, "curie": 2049, "babbage": 2049, - "ada": 2049 + "ada": 2049, } @@ -44,14 +44,14 @@ def get_max_context_length(model_name): def get_token_ids_for_task_parsing(model_name): - text = '''{"task": "text-classification", "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image", "args", "text", "path", "dep", "id", "-"}''' + text = """{"task": "text-classification", "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image", "args", "text", "path", "dep", "id", "-"}""" res = encodings[model_name].encode(text) res = list(set(res)) return res def get_token_ids_for_choose_model(model_name): - text = '''{"id": "reason"}''' + text = """{"id": "reason"}""" res = encodings[model_name].encode(text) res = list(set(res)) return res diff --git 
a/swarms/agents/multi_modal_workers/omni_agent/model_server.py b/swarms/agents/multi_modal_workers/omni_agent/model_server.py index bcacc49e..bd9a1cae 100644 --- a/swarms/agents/multi_modal_workers/omni_agent/model_server.py +++ b/swarms/agents/multi_modal_workers/omni_agent/model_server.py @@ -65,7 +65,7 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) @@ -100,10 +100,16 @@ def load_pipes(local_deployment): if local_deployment in ["full"]: other_pipes = { "nlpconnect/vit-gpt2-image-captioning": { - "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"), - "feature_extractor": ViTImageProcessor.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"), - "tokenizer": AutoTokenizer.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"), - "device": device + "model": VisionEncoderDecoderModel.from_pretrained( + f"{local_fold}/nlpconnect/vit-gpt2-image-captioning" + ), + "feature_extractor": ViTImageProcessor.from_pretrained( + f"{local_fold}/nlpconnect/vit-gpt2-image-captioning" + ), + "tokenizer": AutoTokenizer.from_pretrained( + f"{local_fold}/nlpconnect/vit-gpt2-image-captioning" + ), + "device": device, }, # "Salesforce/blip-image-captioning-large": { # "model": BlipForConditionalGeneration.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"), @@ -111,8 +117,12 @@ def load_pipes(local_deployment): # "device": device # }, "damo-vilab/text-to-video-ms-1.7b": { - "model": DiffusionPipeline.from_pretrained(f"{local_fold}/damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"), - "device": device + "model": DiffusionPipeline.from_pretrained( + f"{local_fold}/damo-vilab/text-to-video-ms-1.7b", + torch_dtype=torch.float16, + variant="fp16", + ), + "device": device, }, # "facebook/maskformer-swin-large-ade": { # "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-large-ade"), @@ -130,16 +140,22 @@ def load_pipes(local_deployment): # "device": device # }, "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k": { - "model": BaseModel.from_pretrained("JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"), - "device": device + "model": BaseModel.from_pretrained( + "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k" + ), + "device": device, }, "espnet/kan-bayashi_ljspeech_vits": { - "model": Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits"), - "device": device + "model": Text2Speech.from_pretrained( + "espnet/kan-bayashi_ljspeech_vits" + ), + "device": device, }, "lambdalabs/sd-image-variations-diffusers": { - "model": DiffusionPipeline.from_pretrained(f"{local_fold}/lambdalabs/sd-image-variations-diffusers"), # torch_dtype=torch.float16 - "device": device + "model": DiffusionPipeline.from_pretrained( + f"{local_fold}/lambdalabs/sd-image-variations-diffusers" + ), # torch_dtype=torch.float16 + "device": device, }, # "CompVis/stable-diffusion-v1-4": { # "model": DiffusionPipeline.from_pretrained(f"{local_fold}/CompVis/stable-diffusion-v1-4"), @@ -150,8 +166,10 @@ def load_pipes(local_deployment): # "device": device # }, "runwayml/stable-diffusion-v1-5": { - "model": 
DiffusionPipeline.from_pretrained(f"{local_fold}/runwayml/stable-diffusion-v1-5"), - "device": device + "model": DiffusionPipeline.from_pretrained( + f"{local_fold}/runwayml/stable-diffusion-v1-5" + ), + "device": device, }, # "microsoft/speecht5_tts":{ # "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"), @@ -165,11 +183,19 @@ def load_pipes(local_deployment): # "device": device # }, "microsoft/speecht5_vc": { - "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"), - "model": SpeechT5ForSpeechToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_vc"), - "vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"), - "embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"), - "device": device + "processor": SpeechT5Processor.from_pretrained( + f"{local_fold}/microsoft/speecht5_vc" + ), + "model": SpeechT5ForSpeechToSpeech.from_pretrained( + f"{local_fold}/microsoft/speecht5_vc" + ), + "vocoder": SpeechT5HifiGan.from_pretrained( + f"{local_fold}/microsoft/speecht5_hifigan" + ), + "embeddings_dataset": load_dataset( + f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation" + ), + "device": device, }, # "julien-c/wine-quality": { # "model": joblib.load(cached_download(hf_hub_url("julien-c/wine-quality", "sklearn_model.joblib"))) @@ -180,15 +206,23 @@ def load_pipes(local_deployment): # "device": device # }, "facebook/maskformer-swin-base-coco": { - "feature_extractor": MaskFormerFeatureExtractor.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"), - "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-base-coco"), - "device": device + "feature_extractor": MaskFormerFeatureExtractor.from_pretrained( + f"{local_fold}/facebook/maskformer-swin-base-coco" + ), + "model": MaskFormerForInstanceSegmentation.from_pretrained( + f"{local_fold}/facebook/maskformer-swin-base-coco" + ), + "device": device, }, "Intel/dpt-hybrid-midas": { - "model": DPTForDepthEstimation.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas", low_cpu_mem_usage=True), - "feature_extractor": DPTFeatureExtractor.from_pretrained(f"{local_fold}/Intel/dpt-hybrid-midas"), - "device": device - } + "model": DPTForDepthEstimation.from_pretrained( + f"{local_fold}/Intel/dpt-hybrid-midas", low_cpu_mem_usage=True + ), + "feature_extractor": DPTFeatureExtractor.from_pretrained( + f"{local_fold}/Intel/dpt-hybrid-midas" + ), + "device": device, + }, } if local_deployment in ["full", "standard"]: @@ -198,36 +232,53 @@ def load_pipes(local_deployment): # "device": device # }, "openai/whisper-base": { - "model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/openai/whisper-base"), - "device": device + "model": pipeline( + task="automatic-speech-recognition", + model=f"{local_fold}/openai/whisper-base", + ), + "device": device, }, "microsoft/speecht5_asr": { - "model": pipeline(task="automatic-speech-recognition", model=f"{local_fold}/microsoft/speecht5_asr"), - "device": device + "model": pipeline( + task="automatic-speech-recognition", + model=f"{local_fold}/microsoft/speecht5_asr", + ), + "device": device, }, "Intel/dpt-large": { - "model": pipeline(task="depth-estimation", model=f"{local_fold}/Intel/dpt-large"), - "device": device + "model": pipeline( + task="depth-estimation", model=f"{local_fold}/Intel/dpt-large" + ), + "device": device, }, # "microsoft/beit-base-patch16-224-pt22k-ft22k": { # "model": 
pipeline(task="image-classification", model=f"{local_fold}/microsoft/beit-base-patch16-224-pt22k-ft22k"), # "device": device # }, "facebook/detr-resnet-50-panoptic": { - "model": pipeline(task="image-segmentation", model=f"{local_fold}/facebook/detr-resnet-50-panoptic"), - "device": device + "model": pipeline( + task="image-segmentation", + model=f"{local_fold}/facebook/detr-resnet-50-panoptic", + ), + "device": device, }, "facebook/detr-resnet-101": { - "model": pipeline(task="object-detection", model=f"{local_fold}/facebook/detr-resnet-101"), - "device": device + "model": pipeline( + task="object-detection", + model=f"{local_fold}/facebook/detr-resnet-101", + ), + "device": device, }, # "openai/clip-vit-large-patch14": { # "model": pipeline(task="zero-shot-image-classification", model=f"{local_fold}/openai/clip-vit-large-patch14"), # "device": device # }, "google/owlvit-base-patch32": { - "model": pipeline(task="zero-shot-object-detection", model=f"{local_fold}/google/owlvit-base-patch32"), - "device": device + "model": pipeline( + task="zero-shot-object-detection", + model=f"{local_fold}/google/owlvit-base-patch32", + ), + "device": device, }, # "microsoft/DialoGPT-medium": { # "model": pipeline(task="conversational", model=f"{local_fold}/microsoft/DialoGPT-medium"), @@ -270,86 +321,121 @@ def load_pipes(local_deployment): # "device": device # }, "impira/layoutlm-document-qa": { - "model": pipeline(task="document-question-answering", model=f"{local_fold}/impira/layoutlm-document-qa"), - "device": device + "model": pipeline( + task="document-question-answering", + model=f"{local_fold}/impira/layoutlm-document-qa", + ), + "device": device, }, "ydshieh/vit-gpt2-coco-en": { - "model": pipeline(task="image-to-text", model=f"{local_fold}/ydshieh/vit-gpt2-coco-en"), - "device": device + "model": pipeline( + task="image-to-text", model=f"{local_fold}/ydshieh/vit-gpt2-coco-en" + ), + "device": device, }, "dandelin/vilt-b32-finetuned-vqa": { - "model": pipeline(task="visual-question-answering", model=f"{local_fold}/dandelin/vilt-b32-finetuned-vqa"), - "device": device - } + "model": pipeline( + task="visual-question-answering", + model=f"{local_fold}/dandelin/vilt-b32-finetuned-vqa", + ), + "device": device, + }, } if local_deployment in ["full", "standard", "minimal"]: - controlnet = ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) + controlnet = ControlNetModel.from_pretrained( + f"{local_fold}/lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16 + ) controlnetpipe = StableDiffusionControlNetPipeline.from_pretrained( - f"{local_fold}/runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16 + f"{local_fold}/runwayml/stable-diffusion-v1-5", + controlnet=controlnet, + torch_dtype=torch.float16, ) def mlsd_control_network(): model = MobileV2_MLSD_Large() - model.load_state_dict(torch.load(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/mlsd_large_512_fp32.pth"), strict=True) + model.load_state_dict( + torch.load( + f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/mlsd_large_512_fp32.pth" + ), + strict=True, + ) return MLSDdetector(model) - hed_network = Network(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/network-bsds500.pth") + hed_network = Network( + f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/network-bsds500.pth" + ) controlnet_sd_pipes = { "openpose-control": { - "model": OpenposeDetector(Body(f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/body_pose_model.pth")) - }, - 
"mlsd-control": { - "model": mlsd_control_network() - }, - "hed-control": { - "model": HEDdetector(hed_network) - }, - "scribble-control": { - "model": HEDdetector(hed_network) + "model": OpenposeDetector( + Body( + f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/body_pose_model.pth" + ) + ) }, + "mlsd-control": {"model": mlsd_control_network()}, + "hed-control": {"model": HEDdetector(hed_network)}, + "scribble-control": {"model": HEDdetector(hed_network)}, "midas-control": { - "model": MidasDetector(model_path=f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt") - }, - "canny-control": { - "model": CannyDetector() + "model": MidasDetector( + model_path=f"{local_fold}/lllyasviel/ControlNet/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt" + ) }, + "canny-control": {"model": CannyDetector()}, "lllyasviel/sd-controlnet-canny": { "control": controlnet, "model": controlnetpipe, - "device": device + "device": device, }, "lllyasviel/sd-controlnet-depth": { - "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16), + "control": ControlNetModel.from_pretrained( + f"{local_fold}/lllyasviel/sd-controlnet-depth", + torch_dtype=torch.float16, + ), "model": controlnetpipe, - "device": device + "device": device, }, "lllyasviel/sd-controlnet-hed": { - "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-hed", torch_dtype=torch.float16), + "control": ControlNetModel.from_pretrained( + f"{local_fold}/lllyasviel/sd-controlnet-hed", + torch_dtype=torch.float16, + ), "model": controlnetpipe, - "device": device + "device": device, }, "lllyasviel/sd-controlnet-mlsd": { - "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-mlsd", torch_dtype=torch.float16), + "control": ControlNetModel.from_pretrained( + f"{local_fold}/lllyasviel/sd-controlnet-mlsd", + torch_dtype=torch.float16, + ), "model": controlnetpipe, - "device": device + "device": device, }, "lllyasviel/sd-controlnet-openpose": { - "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16), + "control": ControlNetModel.from_pretrained( + f"{local_fold}/lllyasviel/sd-controlnet-openpose", + torch_dtype=torch.float16, + ), "model": controlnetpipe, - "device": device + "device": device, }, "lllyasviel/sd-controlnet-scribble": { - "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-scribble", torch_dtype=torch.float16), + "control": ControlNetModel.from_pretrained( + f"{local_fold}/lllyasviel/sd-controlnet-scribble", + torch_dtype=torch.float16, + ), "model": controlnetpipe, - "device": device + "device": device, }, "lllyasviel/sd-controlnet-seg": { - "control": ControlNetModel.from_pretrained(f"{local_fold}/lllyasviel/sd-controlnet-seg", torch_dtype=torch.float16), + "control": ControlNetModel.from_pretrained( + f"{local_fold}/lllyasviel/sd-controlnet-seg", + torch_dtype=torch.float16, + ), "model": controlnetpipe, - "device": device - } + "device": device, + }, } pipes = {**standard_pipes, **other_pipes, **controlnet_sd_pipes} return pipes @@ -363,14 +449,17 @@ during = end - start print(f"[ ready ] {during}s") -@app.route('/running', methods=['GET']) +@app.route("/running", methods=["GET"]) def running(): return jsonify({"running": True}) -@app.route('/status/', methods=['GET']) +@app.route("/status/", methods=["GET"]) def status(model_id): - disabled_models = ["microsoft/trocr-base-printed", 
"microsoft/trocr-base-handwritten"] + disabled_models = [ + "microsoft/trocr-base-printed", + "microsoft/trocr-base-handwritten", + ] if model_id in pipes.keys() and model_id not in disabled_models: print(f"[ check {model_id} ] success") return jsonify({"loaded": True}) @@ -379,7 +468,7 @@ def status(model_id): return jsonify({"loaded": False}) -@app.route('/models/', methods=['POST']) +@app.route("/models/", methods=["POST"]) def models(model_id): while "using" in pipes[model_id] and pipes[model_id]["using"]: print(f"[ inference {model_id} ] waiting") @@ -402,23 +491,29 @@ def models(model_id): try: # text to video if model_id == "damo-vilab/text-to-video-ms-1.7b": - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) + pipe.scheduler = DPMSolverMultistepScheduler.from_config( + pipe.scheduler.config + ) # pipe.enable_model_cpu_offload() prompt = request.get_json()["text"] video_frames = pipe(prompt, num_inference_steps=50, num_frames=40).frames video_path = export_to_video(video_frames) file_name = str(uuid.uuid4())[:4] - os.system(f"LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i {video_path} -vcodec libx264 public/videos/{file_name}.mp4") + os.system( + f"LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i {video_path} -vcodec libx264 public/videos/{file_name}.mp4" + ) result = {"path": f"/videos/{file_name}.mp4"} # controlnet if model_id.startswith("lllyasviel/sd-controlnet-"): - pipe.controlnet.to('cpu') + pipe.controlnet.to("cpu") pipe.controlnet = pipes[model_id]["control"].to(pipes[model_id]["device"]) pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) control_image = load_image(request.get_json()["img_url"]) # generator = torch.manual_seed(66) - out_image: Image = pipe(request.get_json()["text"], num_inference_steps=20, image=control_image).images[0] + out_image: Image = pipe( + request.get_json()["text"], num_inference_steps=20, image=control_image + ).images[0] file_name = str(uuid.uuid4())[:4] out_image.save(f"public/images/{file_name}.png") result = {"path": f"/images/{file_name}.png"} @@ -441,17 +536,20 @@ def models(model_id): file_name = str(uuid.uuid4())[:4] with open(f"public/images/{file_name}.png", "wb") as f: f.write(request.data) - tform = transforms.Compose([ - transforms.ToTensor(), - transforms.Resize( - (224, 224), - interpolation=transforms.InterpolationMode.BICUBIC, - antialias=False, - ), - transforms.Normalize( - [0.48145466, 0.4578275, 0.40821073], - [0.26862954, 0.26130258, 0.27577711]), - ]) + tform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Resize( + (224, 224), + interpolation=transforms.InterpolationMode.BICUBIC, + antialias=False, + ), + transforms.Normalize( + [0.48145466, 0.4578275, 0.40821073], + [0.26862954, 0.26130258, 0.27577711], + ), + ] + ) inp = tform(im).to(pipes[model_id]["device"]).unsqueeze(0) out = pipe(inp, guidance_scale=3) out["images"][0].save(f"public/images/{file_name}.jpg") @@ -459,30 +557,47 @@ def models(model_id): # image to text if model_id == "Salesforce/blip-image-captioning-large": - raw_image = load_image(request.get_json()["img_url"]).convert('RGB') + raw_image = load_image(request.get_json()["img_url"]).convert("RGB") text = request.get_json()["text"] - inputs = pipes[model_id]["processor"](raw_image, return_tensors="pt").to(pipes[model_id]["device"]) + inputs = pipes[model_id]["processor"](raw_image, return_tensors="pt").to( + pipes[model_id]["device"] + ) out = pipe.generate(**inputs) - caption = 
pipes[model_id]["processor"].decode(out[0], skip_special_tokens=True) + caption = pipes[model_id]["processor"].decode( + out[0], skip_special_tokens=True + ) result = {"generated text": caption} if model_id == "ydshieh/vit-gpt2-coco-en": img_url = request.get_json()["img_url"] - generated_text = pipe(img_url)[0]['generated_text'] + generated_text = pipe(img_url)[0]["generated_text"] result = {"generated text": generated_text} if model_id == "nlpconnect/vit-gpt2-image-captioning": image = load_image(request.get_json()["img_url"]).convert("RGB") - pixel_values = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").pixel_values + pixel_values = pipes[model_id]["feature_extractor"]( + images=image, return_tensors="pt" + ).pixel_values pixel_values = pixel_values.to(pipes[model_id]["device"]) - generated_ids = pipe.generate(pixel_values, **{"max_length": 200, "num_beams": 1}) - generated_text = pipes[model_id]["tokenizer"].batch_decode(generated_ids, skip_special_tokens=True)[0] + generated_ids = pipe.generate( + pixel_values, **{"max_length": 200, "num_beams": 1} + ) + generated_text = pipes[model_id]["tokenizer"].batch_decode( + generated_ids, skip_special_tokens=True + )[0] result = {"generated text": generated_text} # image to text: OCR - if model_id == "microsoft/trocr-base-printed" or model_id == "microsoft/trocr-base-handwritten": + if ( + model_id == "microsoft/trocr-base-printed" + or model_id == "microsoft/trocr-base-handwritten" + ): image = load_image(request.get_json()["img_url"]).convert("RGB") - pixel_values = pipes[model_id]["processor"](image, return_tensors="pt").pixel_values + pixel_values = pipes[model_id]["processor"]( + image, return_tensors="pt" + ).pixel_values pixel_values = pixel_values.to(pipes[model_id]["device"]) generated_ids = pipe.generate(pixel_values) - generated_text = pipes[model_id]["processor"].batch_decode(generated_ids, skip_special_tokens=True)[0] + generated_text = pipes[model_id]["processor"].batch_decode( + generated_ids, skip_special_tokens=True + )[0] result = {"generated text": generated_text} # text to image @@ -494,9 +609,87 @@ def models(model_id): result = {"path": f"/images/{file_name}.jpg"} # object detection - if model_id == "google/owlvit-base-patch32" or model_id == "facebook/detr-resnet-101": + if ( + model_id == "google/owlvit-base-patch32" + or model_id == "facebook/detr-resnet-101" + ): img_url = request.get_json()["img_url"] - open_types = ["cat", "couch", "person", "car", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird"] + open_types = [ + "cat", + "couch", + "person", + "car", + "dog", + "horse", + "sheep", + "cow", + "elephant", + "bear", + "zebra", + "giraffe", + "backpack", + "umbrella", + "handbag", + "tie", + "suitcase", + "frisbee", + "skis", + "snowboard", + "sports ball", + "kite", + "baseball bat", + 
"baseball glove", + "skateboard", + "surfboard", + "tennis racket", + "bottle", + "wine glass", + "cup", + "fork", + "knife", + "spoon", + "bowl", + "banana", + "apple", + "sandwich", + "orange", + "broccoli", + "carrot", + "hot dog", + "pizza", + "donut", + "cake", + "chair", + "couch", + "potted plant", + "bed", + "dining table", + "toilet", + "tv", + "laptop", + "mouse", + "remote", + "keyboard", + "cell phone", + "microwave", + "oven", + "toaster", + "sink", + "refrigerator", + "book", + "clock", + "vase", + "scissors", + "teddy bear", + "hair drier", + "toothbrush", + "traffic light", + "fire hydrant", + "stop sign", + "parking meter", + "bench", + "bird", + ] result = pipe(img_url, candidate_labels=open_types) # VQA @@ -514,14 +707,16 @@ def models(model_id): # depth-estimation if model_id == "Intel/dpt-large": output = pipe(request.get_json()["img_url"]) - image = output['depth'] + image = output["depth"] name = str(uuid.uuid4())[:4] image.save(f"public/images/{name}.jpg") result = {"path": f"/images/{name}.jpg"} if model_id == "Intel/dpt-hybrid-midas" and model_id == "Intel/dpt-large": image = load_image(request.get_json()["img_url"]) - inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt") + inputs = pipes[model_id]["feature_extractor"]( + images=image, return_tensors="pt" + ) with torch.no_grad(): outputs = pipe(**inputs) predicted_depth = outputs.predicted_depth @@ -550,11 +745,21 @@ def models(model_id): text = request.get_json()["text"] inputs = pipes[model_id]["processor"](text=text, return_tensors="pt") embeddings_dataset = pipes[model_id]["embeddings_dataset"] - speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(pipes[model_id]["device"]) + speaker_embeddings = ( + torch.tensor(embeddings_dataset[7306]["xvector"]) + .unsqueeze(0) + .to(pipes[model_id]["device"]) + ) pipes[model_id]["vocoder"].to(pipes[model_id]["device"]) - speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"]) + speech = pipe.generate_speech( + inputs["input_ids"].to(pipes[model_id]["device"]), + speaker_embeddings, + vocoder=pipes[model_id]["vocoder"], + ) name = str(uuid.uuid4())[:4] - sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000) + sf.write( + f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000 + ) result = {"path": f"/audios/{name}.wav"} # ASR @@ -569,19 +774,31 @@ def models(model_id): with torch.no_grad(): result_wav = pipe(wav.to(pipes[model_id]["device"])) name = str(uuid.uuid4())[:4] - sf.write(f"public/audios/{name}.wav", result_wav.cpu().squeeze().numpy(), sr) + sf.write( + f"public/audios/{name}.wav", result_wav.cpu().squeeze().numpy(), sr + ) result = {"path": f"/audios/{name}.wav"} if model_id == "microsoft/speecht5_vc": audio_url = request.get_json()["audio_url"] wav, sr = torchaudio.load(audio_url) - inputs = pipes[model_id]["processor"](audio=wav, sampling_rate=sr, return_tensors="pt") + inputs = pipes[model_id]["processor"]( + audio=wav, sampling_rate=sr, return_tensors="pt" + ) embeddings_dataset = pipes[model_id]["embeddings_dataset"] - speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) + speaker_embeddings = torch.tensor( + embeddings_dataset[7306]["xvector"] + ).unsqueeze(0) pipes[model_id]["vocoder"].to(pipes[model_id]["device"]) - speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, 
vocoder=pipes[model_id]["vocoder"]) + speech = pipe.generate_speech( + inputs["input_ids"].to(pipes[model_id]["device"]), + speaker_embeddings, + vocoder=pipes[model_id]["vocoder"], + ) name = str(uuid.uuid4())[:4] - sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000) + sf.write( + f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000 + ) result = {"path": f"/audios/{name}.wav"} # segmentation @@ -592,24 +809,44 @@ def models(model_id): colors = [] for i in range(len(segments)): - colors.append((random.randint(100, 255), random.randint(100, 255), random.randint(100, 255), 50)) + colors.append( + ( + random.randint(100, 255), + random.randint(100, 255), + random.randint(100, 255), + 50, + ) + ) for segment in segments: mask = segment["mask"] - mask = mask.convert('L') - layer = Image.new('RGBA', mask.size, colors[i]) + mask = mask.convert("L") + layer = Image.new("RGBA", mask.size, colors[i]) image.paste(layer, (0, 0), mask) name = str(uuid.uuid4())[:4] image.save(f"public/images/{name}.jpg") result = {"path": f"/images/{name}.jpg"} - if model_id == "facebook/maskformer-swin-base-coco" or model_id == "facebook/maskformer-swin-large-ade": + if ( + model_id == "facebook/maskformer-swin-base-coco" + or model_id == "facebook/maskformer-swin-large-ade" + ): image = load_image(request.get_json()["img_url"]) - inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").to(pipes[model_id]["device"]) + inputs = pipes[model_id]["feature_extractor"]( + images=image, return_tensors="pt" + ).to(pipes[model_id]["device"]) outputs = pipe(**inputs) - result = pipes[model_id]["feature_extractor"].post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0] + result = pipes[model_id][ + "feature_extractor" + ].post_process_panoptic_segmentation( + outputs, target_sizes=[image.size[::-1]] + )[ + 0 + ] predicted_panoptic_map = result["segmentation"].cpu().numpy() - predicted_panoptic_map = Image.fromarray(predicted_panoptic_map.astype(np.uint8)) + predicted_panoptic_map = Image.fromarray( + predicted_panoptic_map.astype(np.uint8) + ) name = str(uuid.uuid4())[:4] predicted_panoptic_map.save(f"public/images/{name}.jpg") result = {"path": f"/images/{name}.jpg"} @@ -641,7 +878,7 @@ def models(model_id): return jsonify(result) -if __name__ == '__main__': +if __name__ == "__main__": # temp folders if not os.path.exists("public/audios"): os.makedirs("public/audios") diff --git a/swarms/agents/multi_modal_workers/omni_agent/omni_chat.py b/swarms/agents/multi_modal_workers/omni_agent/omni_chat.py index aaebd9cb..2198af25 100644 --- a/swarms/agents/multi_modal_workers/omni_agent/omni_chat.py +++ b/swarms/agents/multi_modal_workers/omni_agent/omni_chat.py @@ -54,7 +54,7 @@ max_length = { "davinci": 2049, "curie": 2049, "babbage": 2049, - "ada": 2049 + "ada": 2049, } @@ -67,14 +67,14 @@ def get_max_context_length(model_name): def get_token_ids_for_task_parsing(model_name): - text = '''{"task": "text-classification", "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", 
"normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image", "args", "text", "path", "dep", "id", "-"}''' + text = """{"task": "text-classification", "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image", "args", "text", "path", "dep", "id", "-"}""" res = encodings[model_name].encode(text) res = list(set(res)) return res def get_token_ids_for_choose_model(model_name): - text = '''{"id": "reason"}''' + text = """{"id": "reason"}""" res = encodings[model_name].encode(text) res = list(set(res)) return res @@ -82,7 +82,11 @@ def get_token_ids_for_choose_model(model_name): ######### parser = argparse.ArgumentParser() -parser.add_argument("--config", type=str, default="swarms/agents/workers/multi_modal_workers/omni_agent/config.yml") +parser.add_argument( + "--config", + type=str, + default="swarms/agents/workers/multi_modal_workers/omni_agent/config.yml", +) parser.add_argument("--mode", type=str, default="cli") args = parser.parse_args() @@ -102,7 +106,7 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) handler = logging.StreamHandler() -formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") handler.setFormatter(formatter) if not config["debug"]: handler.setLevel(logging.CRITICAL) @@ -143,7 +147,9 @@ elif "azure" in config: elif "openai" in config: API_TYPE = "openai" else: - logger.warning(f"No endpoint specified in {args.config}. The endpoint will be set dynamically according to the client.") + logger.warning( + f"No endpoint specified in {args.config}. The endpoint will be set dynamically according to the client." + ) if args.mode in ["test", "cli"]: assert API_TYPE, "Only server mode supports dynamic endpoint." @@ -157,9 +163,13 @@ elif API_TYPE == "azure": API_KEY = config["azure"]["api_key"] elif API_TYPE == "openai": API_ENDPOINT = f"https://api.openai.com/v1/{api_name}" - if config["openai"]["api_key"].startswith("sk-"): # Check for valid OpenAI key in config file + if config["openai"]["api_key"].startswith( + "sk-" + ): # Check for valid OpenAI key in config file API_KEY = config["openai"]["api_key"] - elif "OPENAI_API_KEY" in os.environ and os.getenv("OPENAI_API_KEY").startswith("sk-"): # Check for environment variable OPENAI_API_KEY + elif "OPENAI_API_KEY" in os.environ and os.getenv("OPENAI_API_KEY").startswith( + "sk-" + ): # Check for environment variable OPENAI_API_KEY API_KEY = os.getenv("OPENAI_API_KEY") else: raise ValueError(f"Incorrect OpenAI key. 
Please check your {args.config} file.") @@ -175,7 +185,12 @@ inference_mode = config["inference_mode"] # check the local_inference_endpoint Model_Server = None if inference_mode != "huggingface": - Model_Server = "http://" + config["local_inference_endpoint"]["host"] + ":" + str(config["local_inference_endpoint"]["port"]) + Model_Server = ( + "http://" + + config["local_inference_endpoint"]["host"] + + ":" + + str(config["local_inference_endpoint"]["port"]) + ) message = f"The server of local inference endpoints is not running, please start it first. (or using `inference_mode: huggingface` in {args.config} for a feature-limited experience)" try: r = requests.get(Model_Server + "/running") @@ -185,9 +200,15 @@ if inference_mode != "huggingface": raise ValueError(message) -parse_task_demos_or_presteps = open(config["demos_or_presteps"]["parse_task"], "r").read() -choose_model_demos_or_presteps = open(config["demos_or_presteps"]["choose_model"], "r").read() -response_results_demos_or_presteps = open(config["demos_or_presteps"]["response_results"], "r").read() +parse_task_demos_or_presteps = open( + config["demos_or_presteps"]["parse_task"], "r" +).read() +choose_model_demos_or_presteps = open( + config["demos_or_presteps"]["choose_model"], "r" +).read() +response_results_demos_or_presteps = open( + config["demos_or_presteps"]["response_results"], "r" +).read() parse_task_prompt = config["prompt"]["parse_task"] choose_model_prompt = config["prompt"]["choose_model"] @@ -209,37 +230,54 @@ for model in MODELS: METADATAS[model["id"]] = model HUGGINGFACE_HEADERS = {} -if config["huggingface"]["token"] and config["huggingface"]["token"].startswith("hf_"): # Check for valid huggingface token in config file +if config["huggingface"]["token"] and config["huggingface"]["token"].startswith( + "hf_" +): # Check for valid huggingface token in config file HUGGINGFACE_HEADERS = { "Authorization": f"Bearer {config['huggingface']['token']}", } -elif "HUGGINGFACE_ACCESS_TOKEN" in os.environ and os.getenv("HUGGINGFACE_ACCESS_TOKEN").startswith("hf_"): # Check for environment variable HUGGINGFACE_ACCESS_TOKEN +elif "HUGGINGFACE_ACCESS_TOKEN" in os.environ and os.getenv( + "HUGGINGFACE_ACCESS_TOKEN" +).startswith( + "hf_" +): # Check for environment variable HUGGINGFACE_ACCESS_TOKEN HUGGINGFACE_HEADERS = { "Authorization": f"Bearer {os.getenv('HUGGINGFACE_ACCESS_TOKEN')}", } else: - raise ValueError(f"Incorrect HuggingFace token. Please check your {args.config} file.") + raise ValueError( + f"Incorrect HuggingFace token. Please check your {args.config} file." 
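+        # Neither config["huggingface"]["token"] nor the HUGGINGFACE_ACCESS_TOKEN
+        # environment variable supplied a token beginning with "hf_", so fail here.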
+ ) def convert_chat_to_completion(data): - messages = data.pop('messages', []) + messages = data.pop("messages", []) tprompt = "" - if messages[0]['role'] == "system": - tprompt = messages[0]['content'] + if messages[0]["role"] == "system": + tprompt = messages[0]["content"] messages = messages[1:] final_prompt = "" for message in messages: - if message['role'] == "user": - final_prompt += ("" + "user" + "\n" + message['content'] + "\n") - elif message['role'] == "assistant": - final_prompt += ("" + "assistant" + "\n" + message['content'] + "\n") + if message["role"] == "user": + final_prompt += ( + "" + "user" + "\n" + message["content"] + "\n" + ) + elif message["role"] == "assistant": + final_prompt += ( + "" + "assistant" + "\n" + message["content"] + "\n" + ) else: - final_prompt += ("" + "system" + "\n" + message['content'] + "\n") + final_prompt += ( + "" + "system" + "\n" + message["content"] + "\n" + ) final_prompt = tprompt + final_prompt final_prompt = final_prompt + "assistant" data["prompt"] = final_prompt - data['stop'] = data.get('stop', [""]) - data['max_tokens'] = data.get('max_tokens', max(get_max_context_length(LLM) - count_tokens(LLM_encoding, final_prompt), 1)) + data["stop"] = data.get("stop", [""]) + data["max_tokens"] = data.get( + "max_tokens", + max(get_max_context_length(LLM) - count_tokens(LLM_encoding, final_prompt), 1), + ) return data @@ -250,14 +288,9 @@ def send_request(data): if use_completion: data = convert_chat_to_completion(data) if api_type == "openai": - HEADER = { - "Authorization": f"Bearer {api_key}" - } + HEADER = {"Authorization": f"Bearer {api_key}"} elif api_type == "azure": - HEADER = { - "api-key": api_key, - "Content-Type": "application/json" - } + HEADER = {"api-key": api_key, "Content-Type": "application/json"} else: HEADER = None response = requests.post(api_endpoint, json=data, headers=HEADER, proxies=PROXY) @@ -274,15 +307,17 @@ def replace_slot(text, entries): for key, value in entries.items(): if not isinstance(value, str): value = str(value) - text = text.replace("{{" + key + "}}", value.replace('"', "'").replace('\n', "")) + text = text.replace( + "{{" + key + "}}", value.replace('"', "'").replace("\n", "") + ) return text def find_json(s): - s = s.replace("\'", "\"") + s = s.replace("'", '"') start = s.find("{") end = s.rfind("}") - res = s[start:end + 1] + res = s[start : end + 1] res = res.replace("\n", "") return res @@ -290,10 +325,10 @@ def find_json(s): def field_extract(s, field): try: field_rep = re.compile(f'{field}.*?:.*?"(.*?)"', re.IGNORECASE) - extracted = field_rep.search(s).group(1).replace("\"", "\'") + extracted = field_rep.search(s).group(1).replace('"', "'") except BaseException: field_rep = re.compile(f'{field}:\ *"(.*?)"', re.IGNORECASE) - extracted = field_rep.search(s).group(1).replace("\"", "\'") + extracted = field_rep.search(s).group(1).replace('"', "'") return extracted @@ -377,7 +412,7 @@ def chitchat(messages, api_key, api_type, api_endpoint): "messages": messages, "api_key": api_key, "api_type": api_type, - "api_endpoint": api_endpoint + "api_endpoint": api_endpoint, } return send_request(data) @@ -391,10 +426,7 @@ def parse_task(context, input, api_key, api_type, api_endpoint): start = 0 while start <= len(context): history = context[start:] - prompt = replace_slot(parse_task_prompt, { - "input": input, - "context": history - }) + prompt = replace_slot(parse_task_prompt, {"input": input, "context": history}) messages.append({"role": "user", "content": prompt}) history_text = 
"\nuser".join([m["content"] for m in messages]) num = count_tokens(LLM_encoding, history_text) @@ -408,25 +440,29 @@ def parse_task(context, input, api_key, api_type, api_endpoint): "model": LLM, "messages": messages, "temperature": 0, - "logit_bias": {item: config["logit_bias"]["parse_task"] for item in task_parsing_highlight_ids}, + "logit_bias": { + item: config["logit_bias"]["parse_task"] + for item in task_parsing_highlight_ids + }, "api_key": api_key, "api_type": api_type, - "api_endpoint": api_endpoint + "api_endpoint": api_endpoint, } return send_request(data) def choose_model(input, task, metas, api_key, api_type, api_endpoint): - prompt = replace_slot(choose_model_prompt, { - "input": input, - "task": task, - "metas": metas, - }) - demos_or_presteps = replace_slot(choose_model_demos_or_presteps, { - "input": input, - "task": task, - "metas": metas - }) + prompt = replace_slot( + choose_model_prompt, + { + "input": input, + "task": task, + "metas": metas, + }, + ) + demos_or_presteps = replace_slot( + choose_model_demos_or_presteps, {"input": input, "task": task, "metas": metas} + ) messages = json.loads(demos_or_presteps) messages.insert(0, {"role": "system", "content": choose_model_tprompt}) messages.append({"role": "user", "content": prompt}) @@ -435,23 +471,28 @@ def choose_model(input, task, metas, api_key, api_type, api_endpoint): "model": LLM, "messages": messages, "temperature": 0, - "logit_bias": {item: config["logit_bias"]["choose_model"] for item in choose_model_highlight_ids}, # 5 + "logit_bias": { + item: config["logit_bias"]["choose_model"] + for item in choose_model_highlight_ids + }, # 5 "api_key": api_key, "api_type": api_type, - "api_endpoint": api_endpoint + "api_endpoint": api_endpoint, } return send_request(data) def response_results(input, results, api_key, api_type, api_endpoint): results = [v for k, v in sorted(results.items(), key=lambda item: item[0])] - prompt = replace_slot(response_results_prompt, { - "input": input, - }) - demos_or_presteps = replace_slot(response_results_demos_or_presteps, { - "input": input, - "processes": results - }) + prompt = replace_slot( + response_results_prompt, + { + "input": input, + }, + ) + demos_or_presteps = replace_slot( + response_results_demos_or_presteps, {"input": input, "processes": results} + ) messages = json.loads(demos_or_presteps) messages.insert(0, {"role": "system", "content": response_results_tprompt}) messages.append({"role": "user", "content": prompt}) @@ -462,7 +503,7 @@ def response_results(input, results, api_key, api_type, api_endpoint): "temperature": 0, "api_key": api_key, "api_type": api_type, - "api_endpoint": api_endpoint + "api_endpoint": api_endpoint, } return send_request(data) @@ -473,12 +514,23 @@ def huggingface_model_inference(model_id, data, task): # NLP tasks if task == "question-answering": - inputs = {"question": data["text"], "context": (data["context"] if "context" in data else "")} + inputs = { + "question": data["text"], + "context": (data["context"] if "context" in data else ""), + } result = inference(inputs) if task == "sentence-similarity": inputs = {"source_sentence": data["text1"], "target_sentence": data["text2"]} result = inference(inputs) - if task in ["text-classification", "token-classification", "text2text-generation", "summarization", "translation", "conversational", "text-generation"]: + if task in [ + "text-classification", + "token-classification", + "text2text-generation", + "summarization", + "translation", + "conversational", + "text-generation", + ]: inputs = 
data["text"] result = inference(inputs) @@ -492,7 +544,9 @@ def huggingface_model_inference(model_id, data, task): json_data["inputs"] = {} json_data["inputs"]["question"] = text json_data["inputs"]["image"] = img_base64 - result = requests.post(task_url, headers=HUGGINGFACE_HEADERS, json=json_data).json() + result = requests.post( + task_url, headers=HUGGINGFACE_HEADERS, json=json_data + ).json() # result = inference(inputs) # not support if task == "image-to-image": @@ -520,15 +574,22 @@ def huggingface_model_inference(model_id, data, task): predicted = inference(data=img_data) colors = [] for i in range(len(predicted)): - colors.append((random.randint(100, 255), random.randint(100, 255), random.randint(100, 255), 155)) + colors.append( + ( + random.randint(100, 255), + random.randint(100, 255), + random.randint(100, 255), + 155, + ) + ) for i, pred in enumerate(predicted): label = pred["label"] mask = pred.pop("mask").encode("utf-8") mask = base64.b64decode(mask) - mask = Image.open(BytesIO(mask), mode='r') - mask = mask.convert('L') + mask = Image.open(BytesIO(mask), mode="r") + mask = mask.convert("L") - layer = Image.new('RGBA', mask.size, colors[i]) + layer = Image.new("RGBA", mask.size, colors[i]) image.paste(layer, (0, 0), mask) name = str(uuid.uuid4())[:4] image.save(f"public/images/{name}.jpg") @@ -542,15 +603,27 @@ def huggingface_model_inference(model_id, data, task): predicted = inference(data=img_data) image = Image.open(BytesIO(img_data)) draw = ImageDraw.Draw(image) - labels = list(item['label'] for item in predicted) + labels = list(item["label"] for item in predicted) color_map = {} for label in labels: if label not in color_map: - color_map[label] = (random.randint(0, 255), random.randint(0, 100), random.randint(0, 255)) + color_map[label] = ( + random.randint(0, 255), + random.randint(0, 100), + random.randint(0, 255), + ) for label in predicted: box = label["box"] - draw.rectangle(((box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])), outline=color_map[label["label"]], width=2) - draw.text((box["xmin"] + 5, box["ymin"] - 15), label["label"], fill=color_map[label["label"]]) + draw.rectangle( + ((box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])), + outline=color_map[label["label"]], + width=2, + ) + draw.text( + (box["xmin"] + 5, box["ymin"] - 15), + label["label"], + fill=color_map[label["label"]], + ) name = str(uuid.uuid4())[:4] image.save(f"public/images/{name}.jpg") result = {} @@ -566,7 +639,9 @@ def huggingface_model_inference(model_id, data, task): img_url = data["image"] img_data = image_to_bytes(img_url) HUGGINGFACE_HEADERS["Content-Length"] = str(len(img_data)) - r = requests.post(task_url, headers=HUGGINGFACE_HEADERS, data=img_data, proxies=PROXY) + r = requests.post( + task_url, headers=HUGGINGFACE_HEADERS, data=img_data, proxies=PROXY + ) result = {} if "generated_text" in r.json()[0]: result["generated text"] = r.json()[0].pop("generated_text") @@ -580,7 +655,11 @@ def huggingface_model_inference(model_id, data, task): with open(f"public/audios/{name}.flac", "wb") as f: f.write(response.content) result = {"generated audio": f"/audios/{name}.flac"} - if task in ["automatic-speech-recognition", "audio-to-audio", "audio-classification"]: + if task in [ + "automatic-speech-recognition", + "audio-to-audio", + "audio-classification", + ]: audio_url = data["audio"] audio_data = requests.get(audio_url, timeout=10).content response = inference(data=audio_data, raw_response=True) @@ -631,7 +710,15 @@ def local_model_inference(model_id, data, task): if task 
== "question-answering" or task == "sentence-similarity": response = requests.post(task_url, json=data) return response.json() - if task in ["text-classification", "token-classification", "text2text-generation", "summarization", "translation", "conversational", "text-generation"]: + if task in [ + "text-classification", + "token-classification", + "text2text-generation", + "summarization", + "translation", + "conversational", + "text-generation", + ]: response = requests.post(task_url, json=data) return response.json() @@ -670,22 +757,39 @@ def local_model_inference(model_id, data, task): return predicted image = load_image(img_url) draw = ImageDraw.Draw(image) - labels = list(item['label'] for item in predicted) + labels = list(item["label"] for item in predicted) color_map = {} for label in labels: if label not in color_map: - color_map[label] = (random.randint(0, 255), random.randint(0, 100), random.randint(0, 255)) + color_map[label] = ( + random.randint(0, 255), + random.randint(0, 100), + random.randint(0, 255), + ) for label in predicted: box = label["box"] - draw.rectangle(((box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])), outline=color_map[label["label"]], width=2) - draw.text((box["xmin"] + 5, box["ymin"] - 15), label["label"], fill=color_map[label["label"]]) + draw.rectangle( + ((box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])), + outline=color_map[label["label"]], + width=2, + ) + draw.text( + (box["xmin"] + 5, box["ymin"] - 15), + label["label"], + fill=color_map[label["label"]], + ) name = str(uuid.uuid4())[:4] image.save(f"public/images/{name}.jpg") results = {} results["generated image"] = f"/images/{name}.jpg" results["predicted"] = predicted return results - if task in ["image-classification", "image-to-text", "document-question-answering", "visual-question-answering"]: + if task in [ + "image-classification", + "image-to-text", + "document-question-answering", + "visual-question-answering", + ]: img_url = data["image"] text = None if "text" in data: @@ -700,7 +804,11 @@ def local_model_inference(model_id, data, task): if "path" in results: results["generated audio"] = results.pop("path") return results - if task in ["automatic-speech-recognition", "audio-to-audio", "audio-classification"]: + if task in [ + "automatic-speech-recognition", + "audio-to-audio", + "audio-classification", + ]: audio_url = data["audio"] response = requests.post(task_url, json={"audio_url": audio_url}) return response.json() @@ -714,8 +822,12 @@ def model_inference(model_id, data, hosted_on, task): if r.status_code == 200 and "loaded" in r.json() and r.json()["loaded"]: hosted_on = "local" else: - huggingfaceStatusUrl = f"https://api-inference.huggingface.co/status/{model_id}" - r = requests.get(huggingfaceStatusUrl, headers=HUGGINGFACE_HEADERS, proxies=PROXY) + huggingfaceStatusUrl = ( + f"https://api-inference.huggingface.co/status/{model_id}" + ) + r = requests.get( + huggingfaceStatusUrl, headers=HUGGINGFACE_HEADERS, proxies=PROXY + ) logger.debug("Huggingface Status: " + str(r.json())) if r.status_code == 200 and "loaded" in r.json() and r.json()["loaded"]: hosted_on = "huggingface" @@ -756,14 +868,27 @@ def get_avaliable_models(candidates, topk=5): model_id = candidate["id"] if inference_mode != "local": - huggingfaceStatusUrl = f"https://api-inference.huggingface.co/status/{model_id}" - thread = threading.Thread(target=get_model_status, args=(model_id, huggingfaceStatusUrl, HUGGINGFACE_HEADERS, result_queue)) + huggingfaceStatusUrl = ( + 
f"https://api-inference.huggingface.co/status/{model_id}" + ) + thread = threading.Thread( + target=get_model_status, + args=( + model_id, + huggingfaceStatusUrl, + HUGGINGFACE_HEADERS, + result_queue, + ), + ) threads.append(thread) thread.start() if inference_mode != "huggingface" and config["local_deployment"] != "minimal": localStatusUrl = f"{Model_Server}/status/{model_id}" - thread = threading.Thread(target=get_model_status, args=(model_id, localStatusUrl, {}, result_queue)) + thread = threading.Thread( + target=get_model_status, + args=(model_id, localStatusUrl, {}, result_queue), + ) threads.append(thread) thread.start() @@ -772,7 +897,10 @@ def get_avaliable_models(candidates, topk=5): model_id, status, endpoint_type = result_queue.get() if status and model_id not in all_available_models: all_available_models[endpoint_type].append(model_id) - if len(all_available_models["local"] + all_available_models["huggingface"]) >= topk: + if ( + len(all_available_models["local"] + all_available_models["huggingface"]) + >= topk + ): break result_count -= 1 @@ -807,33 +935,45 @@ def run_task(input, command, results, api_key, api_type, api_endpoint): if "image" in args and "-" in args["image"]: resource_id = int(args["image"].split("-")[1]) if "generated image" in results[resource_id]["inference result"]: - args["image"] = results[resource_id]["inference result"]["generated image"] + args["image"] = results[resource_id]["inference result"][ + "generated image" + ] if "audio" in args and "-" in args["audio"]: resource_id = int(args["audio"].split("-")[1]) if "generated audio" in results[resource_id]["inference result"]: - args["audio"] = results[resource_id]["inference result"]["generated audio"] + args["audio"] = results[resource_id]["inference result"][ + "generated audio" + ] if "text" in args and "-" in args["text"]: resource_id = int(args["text"].split("-")[1]) if "generated text" in results[resource_id]["inference result"]: - args["text"] = results[resource_id]["inference result"]["generated text"] + args["text"] = results[resource_id]["inference result"][ + "generated text" + ] text = image = audio = None for dep_task in dep_tasks: if "generated text" in dep_task["inference result"]: text = dep_task["inference result"]["generated text"] - logger.debug("Detect the generated text of dependency task (from results):" + text) + logger.debug( + "Detect the generated text of dependency task (from results):" + text + ) elif "text" in dep_task["task"]["args"]: text = dep_task["task"]["args"]["text"] logger.debug("Detect the text of dependency task (from args): " + text) if "generated image" in dep_task["inference result"]: image = dep_task["inference result"]["generated image"] - logger.debug("Detect the generated image of dependency task (from results): " + image) + logger.debug( + "Detect the generated image of dependency task (from results): " + image + ) elif "image" in dep_task["task"]["args"]: image = dep_task["task"]["args"]["image"] logger.debug("Detect the image of dependency task (from args): " + image) if "generated audio" in dep_task["inference result"]: audio = dep_task["inference result"]["generated audio"] - logger.debug("Detect the generated audio of dependency task (from results): " + audio) + logger.debug( + "Detect the generated audio of dependency task (from results): " + audio + ) elif "audio" in dep_task["task"]["args"]: audio = dep_task["task"]["args"]["audio"] logger.debug("Detect the audio of dependency task (from args): " + audio) @@ -849,19 +989,26 @@ def run_task(input, 
command, results, api_key, api_type, api_endpoint): args["text"] = text for resource in ["image", "audio"]: - if resource in args and not args[resource].startswith("public/") and len(args[resource]) > 0 and not args[resource].startswith("http"): + if ( + resource in args + and not args[resource].startswith("public/") + and len(args[resource]) > 0 + and not args[resource].startswith("http") + ): args[resource] = f"public/{args[resource]}" - if "-text-to-image" in command['task'] and "text" not in args: - logger.debug("control-text-to-image task, but text is empty, so we use control-generation instead.") + if "-text-to-image" in command["task"] and "text" not in args: + logger.debug( + "control-text-to-image task, but text is empty, so we use control-generation instead." + ) control = task.split("-")[0] if control == "seg": task = "image-segmentation" - command['task'] = task + command["task"] = task elif control == "depth": task = "depth-estimation" - command['task'] = task + command["task"] = task else: task = f"{control}-control" @@ -880,45 +1027,93 @@ def run_task(input, command, results, api_key, api_type, api_endpoint): choose = {"id": best_model_id, "reason": reason} logger.debug(f"chosen model: {choose}") else: - logger.warning(f"Task {command['task']} is not available. ControlNet need to be deployed locally.") - record_case(success=False, **{"input": input, "task": command, "reason": f"Task {command['task']} is not available. ControlNet need to be deployed locally.", "op": "message"}) - inference_result = {"error": "service related to ControlNet is not available."} + logger.warning( + f"Task {command['task']} is not available. ControlNet need to be deployed locally." + ) + record_case( + success=False, + **{ + "input": input, + "task": command, + "reason": f"Task {command['task']} is not available. ControlNet need to be deployed locally.", + "op": "message", + }, + ) + inference_result = { + "error": "service related to ControlNet is not available." + } results[id] = collect_result(command, "", inference_result) return False - elif task in ["summarization", "translation", "conversational", "text-generation", "text2text-generation"]: # ChatGPT Can do + elif task in [ + "summarization", + "translation", + "conversational", + "text-generation", + "text2text-generation", + ]: # ChatGPT Can do best_model_id = "ChatGPT" reason = "ChatGPT performs well on some NLP tasks as well." choose = {"id": best_model_id, "reason": reason} - messages = [{ - "role": "user", - "content": f"[ {input} ] contains a task in JSON format {command}. Now you are a {command['task']} system, the arguments are {command['args']}. Just help me do {command['task']} and give me the result. The result must be in text form without any urls." - }] + messages = [ + { + "role": "user", + "content": f"[ {input} ] contains a task in JSON format {command}. Now you are a {command['task']} system, the arguments are {command['args']}. Just help me do {command['task']} and give me the result. 
The result must be in text form without any urls.", + } + ] response = chitchat(messages, api_key, api_type, api_endpoint) results[id] = collect_result(command, choose, {"response": response}) return True else: if task not in MODELS_MAP: logger.warning(f"no available models on {task} task.") - record_case(success=False, **{"input": input, "task": command, "reason": f"task not support: {command['task']}", "op": "message"}) - inference_result = {"error": f"{command['task']} not found in available tasks."} + record_case( + success=False, + **{ + "input": input, + "task": command, + "reason": f"task not support: {command['task']}", + "op": "message", + }, + ) + inference_result = { + "error": f"{command['task']} not found in available tasks." + } results[id] = collect_result(command, "", inference_result) return False candidates = MODELS_MAP[task][:10] - all_avaliable_models = get_avaliable_models(candidates, config["num_candidate_models"]) - all_avaliable_model_ids = all_avaliable_models["local"] + all_avaliable_models["huggingface"] + all_avaliable_models = get_avaliable_models( + candidates, config["num_candidate_models"] + ) + all_avaliable_model_ids = ( + all_avaliable_models["local"] + all_avaliable_models["huggingface"] + ) logger.debug(f"avaliable models on {command['task']}: {all_avaliable_models}") if len(all_avaliable_model_ids) == 0: logger.warning(f"no available models on {command['task']}") - record_case(success=False, **{"input": input, "task": command, "reason": f"no available models: {command['task']}", "op": "message"}) - inference_result = {"error": f"no available models on {command['task']} task."} + record_case( + success=False, + **{ + "input": input, + "task": command, + "reason": f"no available models: {command['task']}", + "op": "message", + }, + ) + inference_result = { + "error": f"no available models on {command['task']} task." + } results[id] = collect_result(command, "", inference_result) return False if len(all_avaliable_model_ids) == 1: best_model_id = all_avaliable_model_ids[0] - hosted_on = "local" if best_model_id in all_avaliable_models["local"] else "huggingface" + hosted_on = ( + "local" + if best_model_id in all_avaliable_models["local"] + else "huggingface" + ) reason = "Only one model available." 
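# A minimal sketch of the hosted_on / best_model_id selection pattern above,
# assuming the same "local" vs. "huggingface" availability lists; the helper
# name and the example model id below are illustrative, not part of the codebase.
def pick_endpoint(available_models: dict, model_id: str) -> str:
    """Prefer a locally deployed model; otherwise fall back to the Hugging Face hub."""
    return "local" if model_id in available_models.get("local", []) else "huggingface"


# usage: with only one candidate available, it is chosen and its endpoint recorded
available = {"local": [], "huggingface": ["facebook/detr-resnet-50"]}
assert pick_endpoint(available, "facebook/detr-resnet-50") == "huggingface"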
choose = {"id": best_model_id, "reason": reason} logger.debug(f"chosen model: {choose}") @@ -927,34 +1122,60 @@ def run_task(input, command, results, api_key, api_type, api_endpoint): { "id": model["id"], "inference endpoint": all_avaliable_models.get( - "local" if model["id"] in all_avaliable_models["local"] else "huggingface" + "local" + if model["id"] in all_avaliable_models["local"] + else "huggingface" ), "likes": model.get("likes"), - "description": model.get("description", "")[:config["max_description_length"]], + "description": model.get("description", "")[ + : config["max_description_length"] + ], # "language": model.get("meta").get("language") if model.get("meta") else None, - "tags": model.get("meta").get("tags") if model.get("meta") else None, + "tags": model.get("meta").get("tags") + if model.get("meta") + else None, } for model in candidates if model["id"] in all_avaliable_model_ids ] - choose_str = choose_model(input, command, cand_models_info, api_key, api_type, api_endpoint) + choose_str = choose_model( + input, command, cand_models_info, api_key, api_type, api_endpoint + ) logger.debug(f"chosen model: {choose_str}") try: choose = json.loads(choose_str) reason = choose["reason"] best_model_id = choose["id"] - hosted_on = "local" if best_model_id in all_avaliable_models["local"] else "huggingface" + hosted_on = ( + "local" + if best_model_id in all_avaliable_models["local"] + else "huggingface" + ) except Exception: - logger.warning(f"the response [ {choose_str} ] is not a valid JSON, try to find the model id and reason in the response.") + logger.warning( + f"the response [ {choose_str} ] is not a valid JSON, try to find the model id and reason in the response." + ) choose_str = find_json(choose_str) best_model_id, reason, choose = get_id_reason(choose_str) - hosted_on = "local" if best_model_id in all_avaliable_models["local"] else "huggingface" - inference_result = model_inference(best_model_id, args, hosted_on, command['task']) + hosted_on = ( + "local" + if best_model_id in all_avaliable_models["local"] + else "huggingface" + ) + inference_result = model_inference(best_model_id, args, hosted_on, command["task"]) if "error" in inference_result: logger.warning(f"Inference error: {inference_result['error']}") - record_case(success=False, **{"input": input, "task": command, "reason": f"inference error: {inference_result['error']}", "op": "message"}) + record_case( + success=False, + **{ + "input": input, + "task": command, + "reason": f"inference error: {inference_result['error']}", + "op": "message", + }, + ) results[id] = collect_result(command, choose, inference_result) return False @@ -962,7 +1183,14 @@ def run_task(input, command, results, api_key, api_type, api_endpoint): return True -def chat_huggingface(messages, api_key, api_type, api_endpoint, return_planning=False, return_results=False): +def chat_huggingface( + messages, + api_key, + api_type, + api_endpoint, + return_planning=False, + return_results=False, +): start = time.time() context = messages[:-1] input = messages[-1]["content"] @@ -972,7 +1200,15 @@ def chat_huggingface(messages, api_key, api_type, api_endpoint, return_planning= task_str = parse_task(context, input, api_key, api_type, api_endpoint) if "error" in task_str: - record_case(success=False, **{"input": input, "task": task_str, "reason": f"task parsing error: {task_str['error']['message']}", "op": "report message"}) + record_case( + success=False, + **{ + "input": input, + "task": task_str, + "reason": f"task parsing error: 
{task_str['error']['message']}", + "op": "report message", + }, + ) return {"message": task_str["error"]["message"]} task_str = task_str.strip() @@ -983,16 +1219,46 @@ def chat_huggingface(messages, api_key, api_type, api_endpoint, return_planning= except Exception as e: logger.debug(e) response = chitchat(messages, api_key, api_type, api_endpoint) - record_case(success=False, **{"input": input, "task": task_str, "reason": "task parsing fail", "op": "chitchat"}) + record_case( + success=False, + **{ + "input": input, + "task": task_str, + "reason": "task parsing fail", + "op": "chitchat", + }, + ) return {"message": response} if task_str == "[]": # using LLM response for empty task - record_case(success=False, **{"input": input, "task": [], "reason": "task parsing fail: empty", "op": "chitchat"}) + record_case( + success=False, + **{ + "input": input, + "task": [], + "reason": "task parsing fail: empty", + "op": "chitchat", + }, + ) response = chitchat(messages, api_key, api_type, api_endpoint) return {"message": response} - if len(tasks) == 1 and tasks[0]["task"] in ["summarization", "translation", "conversational", "text-generation", "text2text-generation"]: - record_case(success=True, **{"input": input, "task": tasks, "reason": "chitchat tasks", "op": "chitchat"}) + if len(tasks) == 1 and tasks[0]["task"] in [ + "summarization", + "translation", + "conversational", + "text-generation", + "text2text-generation", + ]: + record_case( + success=True, + **{ + "input": input, + "task": tasks, + "reason": "chitchat tasks", + "op": "chitchat", + }, + ) response = chitchat(messages, api_key, api_type, api_endpoint) return {"message": response} @@ -1019,7 +1285,10 @@ def chat_huggingface(messages, api_key, api_type, api_endpoint, return_planning= dep = task["dep"] if dep[0] == -1 or len(list(set(dep).intersection(d.keys()))) == len(dep): tasks.remove(task) - thread = threading.Thread(target=run_task, args=(input, task, d, api_key, api_type, api_endpoint)) + thread = threading.Thread( + target=run_task, + args=(input, task, d, api_key, api_type, api_endpoint), + ) thread.start() threads.append(thread) if num_thread == len(threads): @@ -1045,7 +1314,17 @@ def chat_huggingface(messages, api_key, api_type, api_endpoint, return_planning= during = end - start answer = {"message": response} - record_case(success=True, **{"input": input, "task": task_str, "results": results, "response": response, "during": during, "op": "response"}) + record_case( + success=True, + **{ + "input": input, + "task": task_str, + "results": results, + "response": response, + "during": during, + "op": "response", + }, + ) logger.info(f"response: {response}") return answer @@ -1058,31 +1337,63 @@ def test(): "Please answer all the named entities in the sentence: Iron Man is a superhero appearing in American comic books published by Marvel Comics. The character was co-created by writer and editor Stan Lee, developed by scripter Larry Lieber, and designed by artists Don Heck and Jack Kirby.", "please dub for me: 'Iron Man is a superhero appearing in American comic books published by Marvel Comics. 
The character was co-created by writer and editor Stan Lee, developed by scripter Larry Lieber, and designed by artists Don Heck and Jack Kirby.'" "Given an image: https://huggingface.co/datasets/mishig/sample_images/resolve/main/palace.jpg, please answer the question: What is on top of the building?", - "Please generate a canny image based on /examples/f.jpg" + "Please generate a canny image based on /examples/f.jpg", ] for input in inputs: messages = [{"role": "user", "content": input}] - chat_huggingface(messages, API_KEY, API_TYPE, API_ENDPOINT, return_planning=False, return_results=False) + chat_huggingface( + messages, + API_KEY, + API_TYPE, + API_ENDPOINT, + return_planning=False, + return_results=False, + ) # multi rounds example messages = [ - {"role": "user", "content": "Please generate a canny image based on /examples/f.jpg"}, - {"role": "assistant", "content": """Sure. I understand your request. Based on the inference results of the models, I have generated a canny image for you. The workflow I used is as follows: First, I used the image-to-text model (nlpconnect/vit-gpt2-image-captioning) to convert the image /examples/f.jpg to text. The generated text is "a herd of giraffes and zebras grazing in a field". Second, I used the canny-control model (canny-control) to generate a canny image from the text. Unfortunately, the model failed to generate the canny image. Finally, I used the canny-text-to-image model (lllyasviel/sd-controlnet-canny) to generate a canny image from the text. The generated image is located at /images/f16d.png. I hope this answers your request. Is there anything else I can help you with?"""}, - {"role": "user", "content": """then based on the above canny image and a prompt "a photo of a zoo", generate a new image."""}, + { + "role": "user", + "content": "Please generate a canny image based on /examples/f.jpg", + }, + { + "role": "assistant", + "content": """Sure. I understand your request. Based on the inference results of the models, I have generated a canny image for you. The workflow I used is as follows: First, I used the image-to-text model (nlpconnect/vit-gpt2-image-captioning) to convert the image /examples/f.jpg to text. The generated text is "a herd of giraffes and zebras grazing in a field". Second, I used the canny-control model (canny-control) to generate a canny image from the text. Unfortunately, the model failed to generate the canny image. Finally, I used the canny-text-to-image model (lllyasviel/sd-controlnet-canny) to generate a canny image from the text. The generated image is located at /images/f16d.png. I hope this answers your request. Is there anything else I can help you with?""", + }, + { + "role": "user", + "content": """then based on the above canny image and a prompt "a photo of a zoo", generate a new image.""", + }, ] - chat_huggingface(messages, API_KEY, API_TYPE, API_ENDPOINT, return_planning=False, return_results=False) + chat_huggingface( + messages, + API_KEY, + API_TYPE, + API_ENDPOINT, + return_planning=False, + return_results=False, + ) def cli(): messages = [] - print("Welcome to Jarvis! A collaborative system that consists of an LLM as the controller and numerous expert models as collaborative executors. Jarvis can plan tasks, schedule Hugging Face models, generate friendly responses based on your requests, and help you with many things. Please enter your request (`exit` to exit).") + print( + "Welcome to Jarvis! 
A collaborative system that consists of an LLM as the controller and numerous expert models as collaborative executors. Jarvis can plan tasks, schedule Hugging Face models, generate friendly responses based on your requests, and help you with many things. Please enter your request (`exit` to exit)." + ) while True: message = input("[ User ]: ") if message == "exit": break messages.append({"role": "user", "content": message}) - answer = chat_huggingface(messages, API_KEY, API_TYPE, API_ENDPOINT, return_planning=False, return_results=False) + answer = chat_huggingface( + messages, + API_KEY, + API_TYPE, + API_ENDPOINT, + return_planning=False, + return_results=False, + ) print("[ Jarvis ]: ", answer["message"]) messages.append({"role": "assistant", "content": answer["message"]}) diff --git a/swarms/agents/omni_modal_agent.py b/swarms/agents/omni_modal_agent.py index a62013c6..bdcb20ec 100644 --- a/swarms/agents/omni_modal_agent.py +++ b/swarms/agents/omni_modal_agent.py @@ -17,12 +17,7 @@ from swarms.agents.message import Message class Step: def __init__( - self, - task: str, - id: int, - dep: List[int], - args: Dict[str, str], - tool: BaseTool + self, task: str, id: int, dep: List[int], args: Dict[str, str], tool: BaseTool ): self.task = task self.id = id @@ -32,10 +27,7 @@ class Step: class Plan: - def __init__( - self, - steps: List[Step] - ): + def __init__(self, steps: List[Step]): self.steps = steps def __str__(self) -> str: @@ -104,10 +96,7 @@ class OmniModalAgent: # self.task_executor = TaskExecutor self.history = [] - def run( - self, - input: str - ) -> str: + def run(self, input: str) -> str: """Run the OmniAgent""" plan = self.chat_planner.plan( inputs={ @@ -124,11 +113,7 @@ class OmniModalAgent: return response - def chat( - self, - msg: str = None, - streaming: bool = False - ): + def chat(self, msg: str = None, streaming: bool = False): """ Run chat @@ -148,24 +133,14 @@ class OmniModalAgent: """ # add users message to the history - self.history.append( - Message( - "User", - msg - ) - ) + self.history.append(Message("User", msg)) # process msg try: response = self.agent.run(msg) # add agent's response to the history - self.history.append( - Message( - "Agent", - response - ) - ) + self.history.append(Message("Agent", response)) # if streaming is = True if streaming: @@ -177,19 +152,11 @@ class OmniModalAgent: error_message = f"Error processing message: {str(error)}" # add error to history - self.history.append( - Message( - "Agent", - error_message - ) - ) + self.history.append(Message("Agent", error_message)) return error_message - def _stream_response( - self, - response: str = None - ): + def _stream_response(self, response: str = None): """ Yield the response token by token (word by word) diff --git a/swarms/agents/profitpilot.py b/swarms/agents/profitpilot.py index 0614b452..79a5a755 100644 --- a/swarms/agents/profitpilot.py +++ b/swarms/agents/profitpilot.py @@ -56,36 +56,36 @@ class StageAnalyzerChain(LLMChain): class SalesConversationChain(LLMChain): """ - Chain to generate the next utterance for the conversation. + Chain to generate the next utterance for the conversation. 
- # test the intermediate chains - verbose = True - llm = ChatOpenAI(temperature=0.9) + # test the intermediate chains + verbose = True + llm = ChatOpenAI(temperature=0.9) - stage_analyzer_chain = StageAnalyzerChain.from_llm(llm, verbose=verbose) - - sales_conversation_utterance_chain = SalesConversationChain.from_llm( - llm, verbose=verbose - ) + stage_analyzer_chain = StageAnalyzerChain.from_llm(llm, verbose=verbose) + sales_conversation_utterance_chain = SalesConversationChain.from_llm( + llm, verbose=verbose + ) - stage_analyzer_chain.run(conversation_history="") - sales_conversation_utterance_chain.run( - salesperson_name="Ted Lasso", - salesperson_role="Business Development Representative", - company_name="Sleep Haven", - company_business="Sleep Haven is a premium mattress company that provides customers with the most comfortable and supportive sleeping experience possible. We offer a range of high-quality mattresses, pillows, and bedding accessories that are designed to meet the unique needs of our customers.", - company_values="Our mission at Sleep Haven is to help people achieve a better night's sleep by providing them with the best possible sleep solutions. We believe that quality sleep is essential to overall health and well-being, and we are committed to helping our customers achieve optimal sleep by offering exceptional products and customer service.", - conversation_purpose="find out whether they are looking to achieve better sleep via buying a premier mattress.", - conversation_history="Hello, this is Ted Lasso from Sleep Haven. How are you doing today? \nUser: I am well, howe are you?", - conversation_type="call", - conversation_stage=conversation_stages.get( - "1", - "Introduction: Start the conversation by introducing yourself and your company. Be polite and respectful while keeping the tone of the conversation professional.", - ), -) + stage_analyzer_chain.run(conversation_history="") + + sales_conversation_utterance_chain.run( + salesperson_name="Ted Lasso", + salesperson_role="Business Development Representative", + company_name="Sleep Haven", + company_business="Sleep Haven is a premium mattress company that provides customers with the most comfortable and supportive sleeping experience possible. We offer a range of high-quality mattresses, pillows, and bedding accessories that are designed to meet the unique needs of our customers.", + company_values="Our mission at Sleep Haven is to help people achieve a better night's sleep by providing them with the best possible sleep solutions. We believe that quality sleep is essential to overall health and well-being, and we are committed to helping our customers achieve optimal sleep by offering exceptional products and customer service.", + conversation_purpose="find out whether they are looking to achieve better sleep via buying a premier mattress.", + conversation_history="Hello, this is Ted Lasso from Sleep Haven. How are you doing today? \nUser: I am well, howe are you?", + conversation_type="call", + conversation_stage=conversation_stages.get( + "1", + "Introduction: Start the conversation by introducing yourself and your company. 
Be polite and respectful while keeping the tone of the conversation professional.", + ), + ) """ @@ -166,14 +166,12 @@ def get_tools(product_catalog): func=knowledge_base.run, description="useful for when you need to answer questions about product information", ), - # Interpreter Tool( name="Code Interepeter", func=compile, - description="Useful when you need to run code locally, such as Python, Javascript, Shell, and more." + description="Useful when you need to run code locally, such as Python, Javascript, Shell, and more.", ) - # omnimodal agent ] @@ -354,12 +352,7 @@ class ProfitPilot(Chain, BaseModel): return {} @classmethod - def from_llm( - cls, - llm: BaseLLM, - verbose: bool = False, - **kwargs - ): # noqa: F821 + def from_llm(cls, llm: BaseLLM, verbose: bool = False, **kwargs): # noqa: F821 """Initialize the SalesGPT Controller.""" stage_analyzer_chain = StageAnalyzerChain.from_llm(llm, verbose=verbose) diff --git a/swarms/agents/stream_response.py b/swarms/agents/stream_response.py index a8c2bc08..ecd29ff0 100644 --- a/swarms/agents/stream_response.py +++ b/swarms/agents/stream_response.py @@ -1,5 +1,3 @@ - - def stream(response): """ Yield the response token by token (word by word) from llm diff --git a/swarms/artifacts/base.py b/swarms/artifacts/base.py index 5a0b7178..dac7a523 100644 --- a/swarms/artifacts/base.py +++ b/swarms/artifacts/base.py @@ -10,9 +10,14 @@ from marshmallow.exceptions import RegistryError @define class BaseArtifact(ABC): id: str = field(default=Factory(lambda: uuid.uuid4().hex), kw_only=True) - name: str = field(default=Factory(lambda self: self.id, takes_self=True), kw_only=True) + name: str = field( + default=Factory(lambda self: self.id, takes_self=True), kw_only=True + ) value: any = field() - type: str = field(default=Factory(lambda self: self.__class__.__name__, takes_self=True), kw_only=True) + type: str = field( + default=Factory(lambda self: self.__class__.__name__, takes_self=True), + kw_only=True, + ) @classmethod def value_to_bytes(cls, value: any) -> bytes: @@ -38,7 +43,7 @@ class BaseArtifact(ABC): ErrorArtifactSchema, BlobArtifactSchema, CsvRowArtifactSchema, - ListArtifactSchema + ListArtifactSchema, ) class_registry.register("TextArtifact", TextArtifactSchema) diff --git a/swarms/artifacts/main.py b/swarms/artifacts/main.py index 879d5234..4b240b22 100644 --- a/swarms/artifacts/main.py +++ b/swarms/artifacts/main.py @@ -12,14 +12,8 @@ class Artifact(BaseModel): Artifact that has the task has been produced """ - artifact_id: StrictStr = Field( - ..., - description="ID of the artifact" - ) - file_name: StrictStr = Field( - ..., - description="Filename of the artifact" - ) + artifact_id: StrictStr = Field(..., description="ID of the artifact") + file_name: StrictStr = Field(..., description="Filename of the artifact") relative_path: Optional[StrictStr] = Field( None, description="Relative path of the artifact" ) diff --git a/swarms/boss/boss_node.py b/swarms/boss/boss_node.py index 436d1920..9c9ed83c 100644 --- a/swarms/boss/boss_node.py +++ b/swarms/boss/boss_node.py @@ -10,7 +10,9 @@ from langchain.vectorstores import FAISS from langchain_experimental.autonomous_agents import BabyAGI from pydantic import ValidationError -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) # ---------- Boss Node ---------- @@ -48,7 +50,7 @@ class Boss: boss_system_prompt="You are a boss planner in a swarm...", 
llm_class=OpenAI, worker_node=None, - verbose=False + verbose=False, ): # Store parameters self.api_key = api_key or os.getenv("OPENAI_API_KEY") @@ -85,11 +87,7 @@ class Boss: embedding_size = 8192 index = faiss.IndexFlatL2(embedding_size) - return FAISS( - embeddings_model.embed_query, - index, - InMemoryDocstore({}), {} - ) + return FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {}) except Exception as e: logging.error(f"Failed to initialize vector store: {e}") @@ -102,9 +100,13 @@ class Boss: Tool( name="Goal Decomposition Tool", func=todo_chain.run, - description="Use Case: Decompose ambitious goals into as many explicit and well defined tasks for an AI agent to follow. Rules and Regulations, don't use this tool too often only in the beginning when the user grants you a mission." + description="Use Case: Decompose ambitious goals into as many explicit and well defined tasks for an AI agent to follow. Rules and Regulations, don't use this tool too often only in the beginning when the user grants you a mission.", + ), + Tool( + name="Swarm Worker Agent", + func=worker_node, + description="Use Case: When you want to delegate and assign the decomposed goal sub tasks to a worker agent in your swarm, Rules and Regulations, Provide a task specification sheet to the worker agent. It can use the browser, process csvs and generate content", ), - Tool(name="Swarm Worker Agent", func=worker_node, description="Use Case: When you want to delegate and assign the decomposed goal sub tasks to a worker agent in your swarm, Rules and Regulations, Provide a task specification sheet to the worker agent. It can use the browser, process csvs and generate content") ] suffix = """Question: {task}\n{agent_scratchpad}""" @@ -118,7 +120,9 @@ class Boss: llm_chain = LLMChain(llm=self.llm, prompt=prompt) agent = ZeroShotAgent(llm_chain=llm_chain, allowed_tools=tools) - return AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, verbose=self.verbose) + return AgentExecutor.from_agent_and_tools( + agent=agent, tools=tools, verbose=self.verbose + ) def _initialize_baby_agi(self, human_in_the_loop): try: @@ -127,7 +131,7 @@ class Boss: vectorstore=self.vectorstore, task_execution_chain=self.agent_executor, max_iterations=self.max_iterations, - human_in_the_loop=human_in_the_loop + human_in_the_loop=human_in_the_loop, ) except ValidationError as e: logging.error(f"Validation Error while initializing BabyAGI: {e}") diff --git a/swarms/embeddings/openai.py b/swarms/embeddings/openai.py index 12fe41a1..230dade9 100644 --- a/swarms/embeddings/openai.py +++ b/swarms/embeddings/openai.py @@ -28,7 +28,9 @@ from tenacity import ( from swarms.embeddings.base import Embeddings -def get_from_dict_or_env(values: dict, key: str, env_key: str, default: Any = None) -> Any: +def get_from_dict_or_env( + values: dict, key: str, env_key: str, default: Any = None +) -> Any: import os return values.get(key) or os.getenv(env_key) or default @@ -345,7 +347,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings): disallowed_special=self.disallowed_special, ) for j in range(0, len(token), self.embedding_ctx_length): - tokens.append(token[j: j + self.embedding_ctx_length]) + tokens.append(token[j : j + self.embedding_ctx_length]) indices.append(i) batched_embeddings: List[List[float]] = [] @@ -364,7 +366,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings): for i in _iter: response = embed_with_retry( self, - input=tokens[i: i + _chunk_size], + input=tokens[i : i + _chunk_size], **self._invocation_params, ) 
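# A minimal sketch of the token-chunking pattern used in the embedding loop above:
# each tokenized text is split into windows of at most `ctx_length` tokens before
# being sent to the API in batches. Names and sizes here are illustrative assumptions.
def chunk_tokens(token_ids: list, ctx_length: int) -> list:
    """Split one tokenized text into context-window-sized slices."""
    return [
        token_ids[j : j + ctx_length]
        for j in range(0, len(token_ids), ctx_length)
    ]


# e.g. a 10-token text with a 4-token window yields slices of length 4, 4 and 2
assert [len(c) for c in chunk_tokens(list(range(10)), 4)] == [4, 4, 2]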
batched_embeddings.extend(r["embedding"] for r in response["data"]) @@ -426,7 +428,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings): disallowed_special=self.disallowed_special, ) for j in range(0, len(token), self.embedding_ctx_length): - tokens.append(token[j: j + self.embedding_ctx_length]) + tokens.append(token[j : j + self.embedding_ctx_length]) indices.append(i) batched_embeddings: List[List[float]] = [] @@ -434,7 +436,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings): for i in range(0, len(tokens), _chunk_size): response = await async_embed_with_retry( self, - input=tokens[i: i + _chunk_size], + input=tokens[i : i + _chunk_size], **self._invocation_params, ) batched_embeddings.extend(r["embedding"] for r in response["data"]) diff --git a/swarms/embeddings/pegasus.py b/swarms/embeddings/pegasus.py index f86e62d9..a517135e 100644 --- a/swarms/embeddings/pegasus.py +++ b/swarms/embeddings/pegasus.py @@ -8,10 +8,7 @@ from pegasus import Pegasus class PegasusEmbedding: def __init__( - self, - modality: str, - multi_process: bool = False, - n_processes: int = 4 + self, modality: str, multi_process: bool = False, n_processes: int = 4 ): self.modality = modality self.multi_process = multi_process @@ -19,7 +16,9 @@ class PegasusEmbedding: try: self.pegasus = Pegasus(modality, multi_process, n_processes) except Exception as e: - logging.error(f"Failed to initialize Pegasus with modality: {modality}: {e}") + logging.error( + f"Failed to initialize Pegasus with modality: {modality}: {e}" + ) raise def embed(self, data: Union[str, list[str]]): diff --git a/swarms/hivemind/hivemind.py b/swarms/hivemind/hivemind.py index 1dce564d..e3e44927 100644 --- a/swarms/hivemind/hivemind.py +++ b/swarms/hivemind/hivemind.py @@ -10,16 +10,13 @@ import logging from swarms.swarms.swarms import HierarchicalSwarm -logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s" +) class HiveMind: - def __init__( - self, - openai_api_key="", - num_swarms=1, - max_workers=None - ): + def __init__(self, openai_api_key="", num_swarms=1, max_workers=None): self.openai_api_key = openai_api_key self.num_swarms = num_swarms self.swarms = [HierarchicalSwarm(openai_api_key) for _ in range(num_swarms)] @@ -43,8 +40,13 @@ class HiveMind: logging.error(f"An error occurred in run: {e}") def run(self, objective, timeout=None): - with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor: - futures = {executor.submit(self.run_swarm, swarm, objective) for swarm in self.swarms} + with concurrent.futures.ThreadPoolExecutor( + max_workers=self.max_workers + ) as executor: + futures = { + executor.submit(self.run_swarm, swarm, objective) + for swarm in self.swarms + } results = [] for future in concurrent.futures.as_completed(futures, timeout=timeout): try: diff --git a/swarms/memory/embed.py b/swarms/memory/embed.py index f8f1a57c..ce50e0cf 100644 --- a/swarms/memory/embed.py +++ b/swarms/memory/embed.py @@ -4,8 +4,7 @@ from chromadb import EmbeddingFunction def openai_embed(self, input, api_key, model_name): openai = EmbeddingFunction.OpenAIEmbeddingFunction( - api_key=api_key, - model_name=model_name + api_key=api_key, model_name=model_name ) embedding = openai(input) return embedding diff --git a/swarms/memory/schemas.py b/swarms/memory/schemas.py index 07f7a017..0405323d 100644 --- a/swarms/memory/schemas.py +++ b/swarms/memory/schemas.py @@ -26,19 +26,16 @@ class 
Artifact(BaseModel): relative_path: Optional[str] = Field( None, description="Relative path of the artifact in the agent's workspace", - example="python/code/" + example="python/code/", ) class ArtifactUpload(BaseModel): - file: bytes = Field( - ..., - description="File to upload" - ) + file: bytes = Field(..., description="File to upload") relative_path: Optional[str] = Field( None, description="Relative path of the artifact in the agent's workspace", - example="python/code/" + example="python/code/", ) diff --git a/swarms/models/__init__.py b/swarms/models/__init__.py index 051d789c..4a9d10d2 100644 --- a/swarms/models/__init__.py +++ b/swarms/models/__init__.py @@ -1,7 +1,9 @@ # prompts from swarms.models.anthropic import Anthropic + # from swarms.models.palm import GooglePalm from swarms.models.petals import Petals + # from swarms.models.chat_openai import OpenAIChat from swarms.models.prompts.debate import * from swarms.models.mistral import Mistral diff --git a/swarms/models/anthropic.py b/swarms/models/anthropic.py index 56814ab8..cada8a05 100644 --- a/swarms/models/anthropic.py +++ b/swarms/models/anthropic.py @@ -13,7 +13,7 @@ class Anthropic: top_k=None, top_p=None, streaming=False, - default_request_timeout=None + default_request_timeout=None, ): self.model = model self.max_tokens_to_sample = max_tokens_to_sample @@ -22,7 +22,9 @@ class Anthropic: self.top_p = top_p self.streaming = streaming self.default_request_timeout = default_request_timeout or 600 - self.anthropic_api_url = os.getenv("ANTHROPIC_API_URL", "https://api.anthropic.com") + self.anthropic_api_url = os.getenv( + "ANTHROPIC_API_URL", "https://api.anthropic.com" + ) self.anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") def _default_params(self): @@ -44,12 +46,13 @@ class Anthropic: stop = stop or [] params = self._default_params() headers = {"Authorization": f"Bearer {self.anthropic_api_key}"} - data = { - "prompt": prompt, - "stop_sequences": stop, - **params - } - response = requests.post(f"{self.anthropic_api_url}/completions", headers=headers, json=data, timeout=self.default_request_timeout) + data = {"prompt": prompt, "stop_sequences": stop, **params} + response = requests.post( + f"{self.anthropic_api_url}/completions", + headers=headers, + json=data, + timeout=self.default_request_timeout, + ) return response.json().get("completion") def __call__(self, prompt, stop=None): @@ -57,10 +60,11 @@ class Anthropic: stop = stop or [] params = self._default_params() headers = {"Authorization": f"Bearer {self.anthropic_api_key}"} - data = { - "prompt": prompt, - "stop_sequences": stop, - **params - } - response = requests.post(f"{self.anthropic_api_url}/completions", headers=headers, json=data, timeout=self.default_request_timeout) + data = {"prompt": prompt, "stop_sequences": stop, **params} + response = requests.post( + f"{self.anthropic_api_url}/completions", + headers=headers, + json=data, + timeout=self.default_request_timeout, + ) return response.json().get("completion") diff --git a/swarms/models/chat_openai.py b/swarms/models/chat_openai.py index 7ffc9136..380623c3 100644 --- a/swarms/models/chat_openai.py +++ b/swarms/models/chat_openai.py @@ -458,7 +458,7 @@ class BaseOpenAI(BaseLLM): ) params["max_tokens"] = self.max_tokens_for_prompt(prompts[0]) sub_prompts = [ - prompts[i: i + self.batch_size] + prompts[i : i + self.batch_size] for i in range(0, len(prompts), self.batch_size) ] return sub_prompts @@ -469,7 +469,7 @@ class BaseOpenAI(BaseLLM): """Create the LLMResult from the choices and prompts.""" 
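# Sketch of the grouping performed just below: the API returns n completions per
# prompt in one flat `choices` list, so it is sliced back into per-prompt groups
# of n. This stand-in uses plain strings instead of API response objects.
def group_choices(choices: list, n: int) -> list:
    return [choices[i * n : (i + 1) * n] for i in range(len(choices) // n)]


# two prompts, n=2 completions each -> two groups of two
assert group_choices(["a1", "a2", "b1", "b2"], 2) == [["a1", "a2"], ["b1", "b2"]]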
generations = [] for i, _ in enumerate(prompts): - sub_choices = choices[i * self.n: (i + 1) * self.n] + sub_choices = choices[i * self.n : (i + 1) * self.n] generations.append( [ Generation( diff --git a/swarms/models/mistral.py b/swarms/models/mistral.py index b2701dd8..61e4305d 100644 --- a/swarms/models/mistral.py +++ b/swarms/models/mistral.py @@ -23,7 +23,7 @@ class Mistral: use_flash_attention: bool = False, temperature: float = 1.0, max_length: int = 100, - do_sample: bool = True + do_sample: bool = True, ): self.ai_name = ai_name self.system_prompt = system_prompt @@ -52,34 +52,24 @@ class Mistral: except Exception as e: raise ValueError(f"Error loading the Mistral model: {str(e)}") - def run( - self, - task: str - ): + def run(self, task: str): """Run the model on a given task.""" try: - model_inputs = self.tokenizer( - [task], - return_tensors="pt" - ).to(self.device) + model_inputs = self.tokenizer([task], return_tensors="pt").to(self.device) generated_ids = self.model.generate( **model_inputs, max_length=self.max_length, do_sample=self.do_sample, temperature=self.temperature, - max_new_tokens=self.max_length + max_new_tokens=self.max_length, ) output_text = self.tokenizer.batch_decode(generated_ids)[0] return output_text except Exception as e: raise ValueError(f"Error running the model: {str(e)}") - def chat( - self, - msg: str = None, - streaming: bool = False - ): + def chat(self, msg: str = None, streaming: bool = False): """ Run chat @@ -99,24 +89,14 @@ class Mistral: """ # add users message to the history - self.history.append( - Message( - "User", - msg - ) - ) + self.history.append(Message("User", msg)) # process msg try: response = self.agent.run(msg) # add agent's response to the history - self.history.append( - Message( - "Agent", - response - ) - ) + self.history.append(Message("Agent", response)) # if streaming is = True if streaming: @@ -128,19 +108,11 @@ class Mistral: error_message = f"Error processing message: {str(error)}" # add error to history - self.history.append( - Message( - "Agent", - error_message - ) - ) + self.history.append(Message("Agent", error_message)) return error_message - def _stream_response( - self, - response: str = None - ): + def _stream_response(self, response: str = None): """ Yield the response token by token (word by word) diff --git a/swarms/models/petals.py b/swarms/models/petals.py index ba5e7a4a..cc90cb62 100644 --- a/swarms/models/petals.py +++ b/swarms/models/petals.py @@ -12,7 +12,7 @@ class Petals: top_p=0.9, top_k=None, do_sample=True, - max_length=None + max_length=None, ): self.model_name = model_name self.temperature = temperature diff --git a/swarms/models/prompts/agent_output_parser.py b/swarms/models/prompts/agent_output_parser.py index 978f217e..27f8ac24 100644 --- a/swarms/models/prompts/agent_output_parser.py +++ b/swarms/models/prompts/agent_output_parser.py @@ -6,6 +6,7 @@ from typing import Dict, NamedTuple class AgentAction(NamedTuple): """Action returned by AgentOutputParser.""" + name: str args: Dict diff --git a/swarms/models/prompts/agent_prompt_auto.py b/swarms/models/prompts/agent_prompt_auto.py index f682eac1..03cf3e62 100644 --- a/swarms/models/prompts/agent_prompt_auto.py +++ b/swarms/models/prompts/agent_prompt_auto.py @@ -16,14 +16,12 @@ class PromptConstructor: self.tools = tools def construct_full_prompt(self, goals: List[str]) -> str: - prompt_start = ( - """Your decisions must always be made independently + prompt_start = """Your decisions must always be made independently without seeking user 
assistance.\n Play to your strengths as an LLM and pursue simple strategies with no legal complications.\n If you have completed all your tasks, make sure to use the "finish" command.""" - ) # Construct full prompt full_prompt = ( f"You are {self.ai_name}, {self.ai_role}\n{prompt_start}\n\nGOALS:\n\n" @@ -56,10 +54,12 @@ class MessageFormatter: send_token_limit: int = 4196 def format_messages(self, **kwargs: Any) -> List[Message]: - prompt_constructor = PromptConstructor(ai_name=kwargs["ai_name"], - ai_role=kwargs["ai_role"], - tools=kwargs["tools"]) - base_prompt = SystemMessage(content=prompt_constructor.construct_full_prompt(kwargs["goals"])) + prompt_constructor = PromptConstructor( + ai_name=kwargs["ai_name"], ai_role=kwargs["ai_role"], tools=kwargs["tools"] + ) + base_prompt = SystemMessage( + content=prompt_constructor.construct_full_prompt(kwargs["goals"]) + ) time_prompt = SystemMessage( content=f"The current time and date is {time.strftime('%c')}" ) diff --git a/swarms/models/prompts/agent_prompts.py b/swarms/models/prompts/agent_prompts.py index 47b2d0de..350545ff 100644 --- a/swarms/models/prompts/agent_prompts.py +++ b/swarms/models/prompts/agent_prompts.py @@ -1,5 +1,5 @@ def generate_agent_role_prompt(agent): - """ Generates the agent role prompt. + """Generates the agent role prompt. Args: agent (str): The type of the agent. Returns: str: The agent role prompt. """ @@ -7,35 +7,38 @@ def generate_agent_role_prompt(agent): "Finance Agent": "You are a seasoned finance analyst AI assistant. Your primary goal is to compose comprehensive, astute, impartial, and methodically arranged financial reports based on provided data and trends.", "Travel Agent": "You are a world-travelled AI tour guide assistant. Your main purpose is to draft engaging, insightful, unbiased, and well-structured travel reports on given locations, including history, attractions, and cultural insights.", "Academic Research Agent": "You are an AI academic research assistant. Your primary responsibility is to create thorough, academically rigorous, unbiased, and systematically organized reports on a given research topic, following the standards of scholarly work.", - "Default Agent": "You are an AI critical thinker research assistant. Your sole purpose is to write well written, critically acclaimed, objective and structured reports on given text." - + "Default Agent": "You are an AI critical thinker research assistant. Your sole purpose is to write well written, critically acclaimed, objective and structured reports on given text.", } return prompts.get(agent, "No such agent") def generate_report_prompt(question, research_summary): - """ Generates the report prompt for the given question and research summary. + """Generates the report prompt for the given question and research summary. Args: question (str): The question to generate the report prompt for research_summary (str): The research summary to generate the report prompt for Returns: str: The report prompt for the given question and research summary """ - return f'"""{research_summary}""" Using the above information, answer the following'\ - f' question or topic: "{question}" in a detailed report --'\ - " The report should focus on the answer to the question, should be well structured, informative," \ - " in depth, with facts and numbers if available, a minimum of 1,200 words and with markdown syntax and apa format. 
"\ + return ( + f'"""{research_summary}""" Using the above information, answer the following' + f' question or topic: "{question}" in a detailed report --' + " The report should focus on the answer to the question, should be well structured, informative," + " in depth, with facts and numbers if available, a minimum of 1,200 words and with markdown syntax and apa format. " "Write all source urls at the end of the report in apa format" + ) def generate_search_queries_prompt(question): - """ Generates the search queries prompt for the given question. + """Generates the search queries prompt for the given question. Args: question (str): The question to generate the search queries prompt for Returns: str: The search queries prompt for the given question """ - return f'Write 4 google search queries to search online that form an objective opinion from the following: "{question}"'\ - f'You must respond with a list of strings in the following format: ["query 1", "query 2", "query 3", "query 4"]' + return ( + f'Write 4 google search queries to search online that form an objective opinion from the following: "{question}"' + f'You must respond with a list of strings in the following format: ["query 1", "query 2", "query 3", "query 4"]' + ) def generate_resource_report_prompt(question, research_summary): @@ -48,39 +51,45 @@ def generate_resource_report_prompt(question, research_summary): Returns: str: The resource report prompt for the given question and research summary. """ - return f'"""{research_summary}""" Based on the above information, generate a bibliography recommendation report for the following' \ - f' question or topic: "{question}". The report should provide a detailed analysis of each recommended resource,' \ - ' explaining how each source can contribute to finding answers to the research question.' \ - ' Focus on the relevance, reliability, and significance of each source.' \ - ' Ensure that the report is well-structured, informative, in-depth, and follows Markdown syntax.' \ - ' Include relevant facts, figures, and numbers whenever available.' \ - ' The report should have a minimum length of 1,200 words.' + return ( + f'"""{research_summary}""" Based on the above information, generate a bibliography recommendation report for the following' + f' question or topic: "{question}". The report should provide a detailed analysis of each recommended resource,' + " explaining how each source can contribute to finding answers to the research question." + " Focus on the relevance, reliability, and significance of each source." + " Ensure that the report is well-structured, informative, in-depth, and follows Markdown syntax." + " Include relevant facts, figures, and numbers whenever available." + " The report should have a minimum length of 1,200 words." + ) def generate_outline_report_prompt(question, research_summary): - """ Generates the outline report prompt for the given question and research summary. + """Generates the outline report prompt for the given question and research summary. Args: question (str): The question to generate the outline report prompt for research_summary (str): The research summary to generate the outline report prompt for Returns: str: The outline report prompt for the given question and research summary """ - return f'"""{research_summary}""" Using the above information, generate an outline for a research report in Markdown syntax'\ - f' for the following question or topic: "{question}". 
The outline should provide a well-structured framework'\ - ' for the research report, including the main sections, subsections, and key points to be covered.' \ - ' The research report should be detailed, informative, in-depth, and a minimum of 1,200 words.' \ - ' Use appropriate Markdown syntax to format the outline and ensure readability.' + return ( + f'"""{research_summary}""" Using the above information, generate an outline for a research report in Markdown syntax' + f' for the following question or topic: "{question}". The outline should provide a well-structured framework' + " for the research report, including the main sections, subsections, and key points to be covered." + " The research report should be detailed, informative, in-depth, and a minimum of 1,200 words." + " Use appropriate Markdown syntax to format the outline and ensure readability." + ) def generate_concepts_prompt(question, research_summary): - """ Generates the concepts prompt for the given question. + """Generates the concepts prompt for the given question. Args: question (str): The question to generate the concepts prompt for research_summary (str): The research summary to generate the concepts prompt for Returns: str: The concepts prompt for the given question """ - return f'"""{research_summary}""" Using the above information, generate a list of 5 main concepts to learn for a research report'\ - f' on the following question or topic: "{question}". The outline should provide a well-structured framework'\ - 'You must respond with a list of strings in the following format: ["concepts 1", "concepts 2", "concepts 3", "concepts 4, concepts 5"]' + return ( + f'"""{research_summary}""" Using the above information, generate a list of 5 main concepts to learn for a research report' + f' on the following question or topic: "{question}". The outline should provide a well-structured framework' + 'You must respond with a list of strings in the following format: ["concepts 1", "concepts 2", "concepts 3", "concepts 4, concepts 5"]' + ) def generate_lesson_prompt(concept): @@ -92,17 +101,19 @@ def generate_lesson_prompt(concept): str: The lesson prompt for the given concept. """ - prompt = f'generate a comprehensive lesson about {concept} in Markdown syntax. This should include the definition'\ - f'of {concept}, its historical background and development, its applications or uses in different'\ - f'fields, and notable events or facts related to {concept}.' + prompt = ( + f"generate a comprehensive lesson about {concept} in Markdown syntax. This should include the definition" + f"of {concept}, its historical background and development, its applications or uses in different" + f"fields, and notable events or facts related to {concept}." 
+ ) return prompt def get_report_by_type(report_type): report_type_mapping = { - 'research_report': generate_report_prompt, - 'resource_report': generate_resource_report_prompt, - 'outline_report': generate_outline_report_prompt + "research_report": generate_report_prompt, + "resource_report": generate_resource_report_prompt, + "outline_report": generate_outline_report_prompt, } return report_type_mapping[report_type] diff --git a/swarms/models/prompts/debate.py b/swarms/models/prompts/debate.py index f523f9ef..a11c7af4 100644 --- a/swarms/models/prompts/debate.py +++ b/swarms/models/prompts/debate.py @@ -38,5 +38,7 @@ def debate_monitor(game_description, word_limit, character_names): return prompt -def generate_character_header(game_description, topic, character_name, character_description): +def generate_character_header( + game_description, topic, character_name, character_description +): pass diff --git a/swarms/models/prompts/prebuild/project_manager.py b/swarms/models/prompts/prebuild/project_manager.py index 295c2c5d..a1912190 100644 --- a/swarms/models/prompts/prebuild/project_manager.py +++ b/swarms/models/prompts/prebuild/project_manager.py @@ -1,4 +1,4 @@ -PROJECT_MANAGR_PROMPT_TEMPLATE = ''' +PROJECT_MANAGR_PROMPT_TEMPLATE = """ # Context {context} @@ -23,7 +23,7 @@ Attention: Use '##' to split sections, not '#', and '## ' SHOULD W ## Anything UNCLEAR: Provide as Plain text. Make clear here. For example, don't forget a main entry. don't forget to init 3rd party libs. -''' +""" FORMAT_EXAMPLE = ''' --- diff --git a/swarms/models/prompts/prebuild/sales_prompts.py b/swarms/models/prompts/prebuild/sales_prompts.py index d4c57b51..806f0ad2 100644 --- a/swarms/models/prompts/prebuild/sales_prompts.py +++ b/swarms/models/prompts/prebuild/sales_prompts.py @@ -1,5 +1,3 @@ - - SALES_ASSISTANT_PROMPT = """You are a sales assistant helping your sales agent to determine which stage of a sales conversation should the agent move to, or stay at. Following '===' is the conversation history. Use this conversation history to make your decision. @@ -47,10 +45,12 @@ Conversation history: {salesperson_name}: """ -conversation_stages = {'1': "Introduction: Start the conversation by introducing yourself and your company. Be polite and respectful while keeping the tone of the conversation professional. Your greeting should be welcoming. Always clarify in your greeting the reason why you are contacting the prospect.", - '2': "Qualification: Qualify the prospect by confirming if they are the right person to talk to regarding your product/service. Ensure that they have the authority to make purchasing decisions.", - '3': "Value proposition: Briefly explain how your product/service can benefit the prospect. Focus on the unique selling points and value proposition of your product/service that sets it apart from competitors.", - '4': "Needs analysis: Ask open-ended questions to uncover the prospect's needs and pain points. Listen carefully to their responses and take notes.", - '5': "Solution presentation: Based on the prospect's needs, present your product/service as the solution that can address their pain points.", - '6': "Objection handling: Address any objections that the prospect may have regarding your product/service. Be prepared to provide evidence or testimonials to support your claims.", - '7': "Close: Ask for the sale by proposing a next step. This could be a demo, a trial or a meeting with decision-makers. 
Ensure to summarize what has been discussed and reiterate the benefits."} +conversation_stages = { + "1": "Introduction: Start the conversation by introducing yourself and your company. Be polite and respectful while keeping the tone of the conversation professional. Your greeting should be welcoming. Always clarify in your greeting the reason why you are contacting the prospect.", + "2": "Qualification: Qualify the prospect by confirming if they are the right person to talk to regarding your product/service. Ensure that they have the authority to make purchasing decisions.", + "3": "Value proposition: Briefly explain how your product/service can benefit the prospect. Focus on the unique selling points and value proposition of your product/service that sets it apart from competitors.", + "4": "Needs analysis: Ask open-ended questions to uncover the prospect's needs and pain points. Listen carefully to their responses and take notes.", + "5": "Solution presentation: Based on the prospect's needs, present your product/service as the solution that can address their pain points.", + "6": "Objection handling: Address any objections that the prospect may have regarding your product/service. Be prepared to provide evidence or testimonials to support your claims.", + "7": "Close: Ask for the sale by proposing a next step. This could be a demo, a trial or a meeting with decision-makers. Ensure to summarize what has been discussed and reiterate the benefits.", +} diff --git a/swarms/models/prompts/prebuild/summaries_prompts.py b/swarms/models/prompts/prebuild/summaries_prompts.py index 63cff714..01c4c502 100644 --- a/swarms/models/prompts/prebuild/summaries_prompts.py +++ b/swarms/models/prompts/prebuild/summaries_prompts.py @@ -1,4 +1,3 @@ - SUMMARIZE_PROMPT = """ Your output should use the following template: ### Summary diff --git a/swarms/structs/nonlinear_workflow.py b/swarms/structs/nonlinear_workflow.py index f3513a75..831d106e 100644 --- a/swarms/structs/nonlinear_workflow.py +++ b/swarms/structs/nonlinear_workflow.py @@ -5,10 +5,7 @@ from graphlib import TopologicalSorter class Task: def __init__( - self, - id: str, - parents: List["Task"] = None, - children: List["Task"] = None + self, id: str, parents: List["Task"] = None, children: List["Task"] = None ): self.id = id self.parents = parents @@ -48,11 +45,7 @@ class NonLinearWorkflow: """ - def __init__( - self, - agents, - iters_per_task - ): + def __init__(self, agents, iters_per_task): """A workflow is a collection of tasks that can be executed in parallel or sequentially.""" super().__init__() self.executor = ThreadPoolExecutor() @@ -61,10 +54,7 @@ class NonLinearWorkflow: def add(self, task: Task): """Add a task to the workflow""" - assert isinstance( - task, - Task - ), "Input must be an nstance of Task" + assert isinstance(task, Task), "Input must be an nstance of Task" self.tasks.append(task) return task @@ -100,9 +90,5 @@ class NonLinearWorkflow: def order_tasks(self) -> List[Task]: """Order the tasks USING TOPOLOGICAL SORTING""" - task_order = TopologicalSorter( - self.to_graph() - ).static_order() - return [ - self.find_task(task_id) for task_id in task_order - ] + task_order = TopologicalSorter(self.to_graph()).static_order() + return [self.find_task(task_id) for task_id in task_order] diff --git a/swarms/structs/task.py b/swarms/structs/task.py index 8ea5c5c7..66dced87 100644 --- a/swarms/structs/task.py +++ b/swarms/structs/task.py @@ -24,7 +24,7 @@ class BaseTask(ABC): self.parent_ids: List[str] = [] self.child_ids: 
List[str] = [] self.output: Optional[Union[Artifact, ErrorArtifact]] = None - self.structure: Optional['Structure'] = None + self.structure: Optional["Structure"] = None @property @abstractmethod @@ -45,7 +45,7 @@ class BaseTask(ABC): def __lshift__(self, child: BaseTask) -> BaseTask: return self.add_parent(child) - def preprocess(self, structure: 'Structure') -> BaseTask: + def preprocess(self, structure: "Structure") -> BaseTask: self.structure = structure return self @@ -117,7 +117,9 @@ class BaseTask(ABC): return self.output def can_execute(self) -> bool: - return self.state == self.State.PENDING and all(parent.is_finished() for parent in self.parents) + return self.state == self.State.PENDING and all( + parent.is_finished() for parent in self.parents + ) def reset(self) -> BaseTask: self.state = self.State.PENDING @@ -130,21 +132,13 @@ class BaseTask(ABC): class Task(BaseModel): - input: Optional[StrictStr] = Field( - None, - description="Input prompt for the task" - ) + input: Optional[StrictStr] = Field(None, description="Input prompt for the task") additional_input: Optional[Any] = Field( - None, - description="Input parameters for the task. Any value is allowed" - ) - task_id: StrictStr = Field( - ..., - description="ID of the task" + None, description="Input parameters for the task. Any value is allowed" ) + task_id: StrictStr = Field(..., description="ID of the task") artifacts: conlist(Artifact, min_items=1) = Field( - ..., - description="A list of artifacts that the task has been produced" + ..., description="A list of artifacts that the task has been produced" ) class Config: @@ -158,21 +152,26 @@ class Task(BaseModel): return json.dumps(self.dict(by_alias=True, exclude_none=True)) @classmethod - def from_json(cls, json_str: str) -> 'Task': + def from_json(cls, json_str: str) -> "Task": return cls.parse_raw(json_str) def to_dict(self) -> dict: _dict = self.dict(by_alias=True, exclude_none=True) if self.artifacts: - _dict["artifacts"] = [artifact.dict(by_alias=True, exclude_none=True) for artifact in self.artifacts] + _dict["artifacts"] = [ + artifact.dict(by_alias=True, exclude_none=True) + for artifact in self.artifacts + ] return _dict @classmethod - def from_dict(cls, obj: dict) -> 'Task': + def from_dict(cls, obj: dict) -> "Task": if obj is None: return None if not isinstance(obj, dict): raise ValueError("Input must be a dictionary.") - if 'artifacts' in obj: - obj['artifacts'] = [Artifact.parse_obj(artifact) for artifact in obj['artifacts']] + if "artifacts" in obj: + obj["artifacts"] = [ + Artifact.parse_obj(artifact) for artifact in obj["artifacts"] + ] return cls.parse_obj(obj) diff --git a/swarms/structs/workflow.py b/swarms/structs/workflow.py index 7f9e7d25..2bbfb9be 100644 --- a/swarms/structs/workflow.py +++ b/swarms/structs/workflow.py @@ -25,6 +25,7 @@ class Workflow: """ + class Task: def __init__(self, task: str): self.task = task @@ -33,7 +34,7 @@ class Workflow: self.output = None self.structure = None - def add_child(self, child: 'Workflow.Task'): + def add_child(self, child: "Workflow.Task"): self.children.append(child) child.parents.append(self) child.structure = self.structure @@ -80,9 +81,11 @@ class Workflow: def context(self, task: Task) -> Dict[str, Any]: return { - "parent_output": task.parents[0].output if task.parents and task.parents[0].output else None, + "parent_output": task.parents[0].output + if task.parents and task.parents[0].output + else None, "parent": task.parents[0] if task.parents else None, - "child": task.children[0] if 
task.children else None + "child": task.children[0] if task.children else None, } def __run_from_task(self, task: Optional[Task]) -> None: diff --git a/swarms/swarms/autoscaler.py b/swarms/swarms/autoscaler.py index b19e8d7d..c85cd13f 100644 --- a/swarms/swarms/autoscaler.py +++ b/swarms/swarms/autoscaler.py @@ -24,6 +24,7 @@ class AutoScaler: auto_scaler.add_task9f"task {I}}) ``` """ + @log_decorator @error_decorator @timing_decorator diff --git a/swarms/swarms/dialogue_simulator.py b/swarms/swarms/dialogue_simulator.py index ef97d701..fecdfa14 100644 --- a/swarms/swarms/dialogue_simulator.py +++ b/swarms/swarms/dialogue_simulator.py @@ -6,12 +6,7 @@ class DialogueSimulator: def __init__(self, agents: List[Worker]): self.agents = agents - def run( - self, - max_iters: int, - name: str = None, - message: str = None - ): + def run(self, max_iters: int, name: str = None, message: str = None): step = 0 if name and message: prompt = f"Name {name} and message: {message}" @@ -25,7 +20,9 @@ class DialogueSimulator: speaker_message = speaker.run(prompt) for receiver in self.agents: - message_history = f"Speaker Name: {speaker.name} and message: {speaker_message}" + message_history = ( + f"Speaker Name: {speaker.name} and message: {speaker_message}" + ) receiver.run(message_history) print(f"({speaker.name}): {speaker_message}") diff --git a/swarms/swarms/god_mode.py b/swarms/swarms/god_mode.py index b5cb053c..1be58ae9 100644 --- a/swarms/swarms/god_mode.py +++ b/swarms/swarms/god_mode.py @@ -30,10 +30,7 @@ class GodMode: """ - def __init__( - self, - llms - ): + def __init__(self, llms): self.llms = llms def run(self, task): @@ -49,10 +46,6 @@ class GodMode: table.append([f"LLM {i+1}", response]) print( colored( - tabulate( - table, - headers=["LLM", "Response"], - tablefmt="pretty" - ), "cyan" + tabulate(table, headers=["LLM", "Response"], tablefmt="pretty"), "cyan" ) ) diff --git a/swarms/swarms/groupchat.py b/swarms/swarms/groupchat.py index ffe1c5bb..4ce97721 100644 --- a/swarms/swarms/groupchat.py +++ b/swarms/swarms/groupchat.py @@ -52,7 +52,8 @@ class GroupChat: selector.update_system_message(self.select_speaker_msg()) final, name = selector.run( - self.messages + [ + self.messages + + [ { "role": "system", "context": f"Read the above conversation. Then select the next role from {self.worker_names} to play. 
Only return the role.", @@ -80,20 +81,17 @@ class GroupChatManager(Worker): max_consecutive_auto_reply: Optional[int] = sys.maxsize, human_input_mode: Optional[str] = "NEVER", system_message: Optional[str] = "Group chat manager", - **kwargs + **kwargs, ): super().__init__( ai_name=ai_name, # max_consecutive_auto_reply=max_consecutive_auto_reply, # human_input_mode=human_input_mode, # system_message=system_message, - **kwargs + **kwargs, ) self.register_reply( - Worker, - GroupChatManager.run, - config=groupchat, - reset_config=GroupChat.reset + Worker, GroupChatManager.run, config=groupchat, reset_config=GroupChat.reset ) def run( @@ -147,11 +145,7 @@ class GroupChatManager(Worker): break # speaker sends message without requesting a reply - speaker.send( - reply, - self, - request_reply=False - ) + speaker.send(reply, self, request_reply=False) message = self.last_message(speaker) message = self.last_messge(speaker) return True, None diff --git a/swarms/swarms/multi_agent_collab.py b/swarms/swarms/multi_agent_collab.py index 0c499afb..2ee917a3 100644 --- a/swarms/swarms/multi_agent_collab.py +++ b/swarms/swarms/multi_agent_collab.py @@ -15,11 +15,7 @@ bid_parser = BidOutputParser( ) -def select_next_speaker( - step: int, - agents, - director -) -> int: +def select_next_speaker(step: int, agents, director) -> int: # if the step if even => director # => director selects next speaker if step % 2 == 1: @@ -50,10 +46,7 @@ class MultiAgentCollaboration: self._step += 1 def step(self) -> tuple[str, str]: - speaker_idx = self.select_next_speaker( - self._step, - self.agents - ) + speaker_idx = self.select_next_speaker(self._step, self.agents) speaker = self.agents[speaker_idx] message = speaker.send() message = speaker.send() diff --git a/swarms/swarms/multi_agent_debate.py b/swarms/swarms/multi_agent_debate.py index b914906d..a39dba2b 100644 --- a/swarms/swarms/multi_agent_debate.py +++ b/swarms/swarms/multi_agent_debate.py @@ -18,9 +18,7 @@ class MultiAgentDebate: """ def __init__( - self, - agents: List[Worker], - selection_func: Callable[[int, List[Worker]], int] + self, agents: List[Worker], selection_func: Callable[[int, List[Worker]], int] ): self.agents = agents self.selection_func = selection_func @@ -39,19 +37,18 @@ class MultiAgentDebate: speaker_idx = self.selection_func(i, self.agents) speaker = self.agents[speaker_idx] response = speaker.run(task) - results.append({ - 'agent': speaker.ai_name, - 'response': response - }) + results.append({"agent": speaker.ai_name, "response": response}) return results def update_task(self, task: str): self.task = task def format_results(self, results): - formatted_results = "\n".join( - [f"Agent {result['agent']} responded: {result['response']}" for result in results] + [ + f"Agent {result['agent']} responded: {result['response']}" + for result in results + ] ) return formatted_results diff --git a/swarms/swarms/orchestrate.py b/swarms/swarms/orchestrate.py index 96b47579..09914485 100644 --- a/swarms/swarms/orchestrate.py +++ b/swarms/swarms/orchestrate.py @@ -99,7 +99,7 @@ class Orchestrator: api_key: str = None, model_name: str = None, embed_func=None, - worker=None + worker=None, ): self.agent = agent self.agents = queue.Queue() @@ -111,9 +111,7 @@ class Orchestrator: self.chroma_client = chromadb.Client() - self.collection = self.chroma_client.create_collection( - name=collection_name - ) + self.collection = self.chroma_client.create_collection(name=collection_name) self.current_tasks = {} @@ -125,11 +123,7 @@ class Orchestrator: # 
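For readers who have not used Chroma, here is a minimal sketch of the add/query pattern the `Orchestrator` constructor above sets up (and that `ScalableGroupChat` reuses later in this patch). It assumes only that `chromadb` is installed; the collection name, document text, id, and the three-number placeholder vector are invented for illustration, and the real class derives embeddings via `embedding_functions.OpenAIEmbeddingFunction` rather than passing them in by hand.

```python
# Minimal sketch of the Chroma add/query pattern the class above is built on.
# Assumes chromadb is installed; names, ids and the placeholder vector are made up.
import chromadb

client = chromadb.Client()  # in-memory client, as in Orchestrator.__init__
collection = client.create_collection(name="swarm")

# Store one task result keyed by an id, the way update_vector_db/append_to_db do.
collection.add(
    embeddings=[[0.1, 0.2, 0.3]],  # placeholder; the real class calls embed()
    documents=["task-42 output"],
    ids=["task-42"],
)

# Pull back the closest stored documents, the way retrieve_results does.
results = collection.query(query_embeddings=[[0.1, 0.2, 0.3]], n_results=1)
print(results["documents"])
```

Storing `str(id(task))` as both the document and the id, as `assign_task` does a little further on, is what lets a result be queried back by that same identifier later.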
@abstractmethod - def assign_task( - self, - agent_id: int, - task: Dict[str, Any] - ) -> None: + def assign_task(self, agent_id: int, task: Dict[str, Any]) -> None: """Assign a task to a specific agent""" while True: @@ -144,21 +138,23 @@ class Orchestrator: # using the embed method to get the vector representation of the result vector_representation = self.embed( - result, - self.api_key, - self.model_name + result, self.api_key, self.model_name ) self.collection.add( embeddings=[vector_representation], documents=[str(id(task))], - ids=[str(id(task))] + ids=[str(id(task))], ) - logging.info(f"Task {id(str)} has been processed by agent {id(agent)} with") + logging.info( + f"Task {id(str)} has been processed by agent {id(agent)} with" + ) except Exception as error: - logging.error(f"Failed to process task {id(task)} by agent {id(agent)}. Error: {error}") + logging.error( + f"Failed to process task {id(task)} by agent {id(agent)}. Error: {error}" + ) finally: with self.condition: self.agents.put(agent) @@ -166,8 +162,7 @@ class Orchestrator: def embed(self, input, api_key, model_name): openai = embedding_functions.OpenAIEmbeddingFunction( - api_key=api_key, - model_name=model_name + api_key=api_key, model_name=model_name ) embedding = openai(input) return embedding @@ -179,14 +174,13 @@ class Orchestrator: try: # Query the vector database for documents created by the agents - results = self.collection.query( - query_texts=[str(agent_id)], - n_results=10 - ) + results = self.collection.query(query_texts=[str(agent_id)], n_results=10) return results except Exception as e: - logging.error(f"Failed to retrieve results from agent {agent_id}. Error {e}") + logging.error( + f"Failed to retrieve results from agent {agent_id}. Error {e}" + ) raise # @abstractmethod @@ -197,7 +191,7 @@ class Orchestrator: self.collection.add( embeddings=[data["vector"]], documents=[str(data["task_id"])], - ids=[str(data["task_id"])] + ids=[str(data["task_id"])], ) except Exception as e: @@ -210,17 +204,11 @@ class Orchestrator: """Retrieve the vector database""" return self.collection - def append_to_db( - self, - result: str - ): + def append_to_db(self, result: str): """append the result of the swarm to a specifici collection in the database""" try: - self.collection.add( - documents=[result], - ids=[str(id(result))] - ) + self.collection.add(documents=[result], ids=[str(id(result))]) except Exception as e: logging.error(f"Failed to append the agent output to database. 
Error: {e}") @@ -236,13 +224,8 @@ class Orchestrator: self.task_queue.append(objective) results = [ - self.assign_task( - agent_id, task - ) for agent_id, task in zip( - range( - len(self.agents) - ), self.task_queue - ) + self.assign_task(agent_id, task) + for agent_id, task in zip(range(len(self.agents)), self.task_queue) ] for result in results: @@ -254,12 +237,7 @@ class Orchestrator: logging.error(f"An error occured in swarm: {e}") return None - def chat( - self, - sender_id: int, - receiver_id: int, - message: str - ): + def chat(self, sender_id: int, receiver_id: int, message: str): """ Allows the agents to chat with eachother thrught the vectordatabase @@ -276,37 +254,24 @@ class Orchestrator: """ - message_vector = self.embed( - message, - self.api_key, - self.model_name - ) + message_vector = self.embed(message, self.api_key, self.model_name) # store the mesage in the vector database self.collection.add( embeddings=[message_vector], documents=[message], - ids=[f"{sender_id}_to_{receiver_id}"] + ids=[f"{sender_id}_to_{receiver_id}"], ) - self.run( - objective=f"chat with agent {receiver_id} about {message}" - ) + self.run(objective=f"chat with agent {receiver_id} about {message}") - def add_agents( - self, - num_agents: int - ): + def add_agents(self, num_agents: int): for _ in range(num_agents): self.agents.put(self.agent()) - self.executor = ThreadPoolExecutor( - max_workers=self.agents.qsize() - ) + self.executor = ThreadPoolExecutor(max_workers=self.agents.qsize()) def remove_agents(self, num_agents): for _ in range(num_agents): if not self.agents.empty(): self.agents.get() - self.executor = ThreadPoolExecutor( - max_workers=self.agents.qsize() - ) + self.executor = ThreadPoolExecutor(max_workers=self.agents.qsize()) diff --git a/swarms/swarms/scable_groupchat.py b/swarms/swarms/scable_groupchat.py index 382e74dc..c826ef8f 100644 --- a/swarms/swarms/scable_groupchat.py +++ b/swarms/swarms/scable_groupchat.py @@ -41,44 +41,30 @@ class ScalableGroupChat: # Create a list of Worker instances with unique names for i in range(worker_count): - self.workers.append( - Worker( - openai_api_key=api_key, - ai_name=f"Worker-{i}" - ) - ) + self.workers.append(Worker(openai_api_key=api_key, ai_name=f"Worker-{i}")) - def embed( - self, - input, - model_name - ): + def embed(self, input, model_name): """Embeds an input of size N into a vector of size M""" openai = embedding_functions.OpenAIEmbeddingFunction( - api_key=self.api_key, - model_name=model_name + api_key=self.api_key, model_name=model_name ) embedding = openai(input) return embedding - def retrieve_results( - self, - agent_id: int - ) -> Any: + def retrieve_results(self, agent_id: int) -> Any: """Retrieve results from a specific agent""" try: # Query the vector database for documents created by the agents - results = self.collection.query( - query_texts=[str(agent_id)], - n_results=10 - ) + results = self.collection.query(query_texts=[str(agent_id)], n_results=10) return results except Exception as e: - logging.error(f"Failed to retrieve results from agent {agent_id}. Error {e}") + logging.error( + f"Failed to retrieve results from agent {agent_id}. 
Error {e}" + ) raise # @abstractmethod @@ -89,7 +75,7 @@ class ScalableGroupChat: self.collection.add( embeddings=[data["vector"]], documents=[str(data["task_id"])], - ids=[str(data["task_id"])] + ids=[str(data["task_id"])], ) except Exception as e: @@ -102,28 +88,17 @@ class ScalableGroupChat: """Retrieve the vector database""" return self.collection - def append_to_db( - self, - result: str - ): + def append_to_db(self, result: str): """append the result of the swarm to a specifici collection in the database""" try: - self.collection.add( - documents=[result], - ids=[str(id(result))] - ) + self.collection.add(documents=[result], ids=[str(id(result))]) except Exception as e: logging.error(f"Failed to append the agent output to database. Error: {e}") raise - def chat( - self, - sender_id: int, - receiver_id: int, - message: str - ): + def chat(self, sender_id: int, receiver_id: int, message: str): """ Allows the agents to chat with eachother thrught the vectordatabase @@ -139,7 +114,12 @@ class ScalableGroupChat: orchestrator.chat(sender_id=1, receiver_id=2, message="Hello, Agent 2!") """ - if sender_id < 0 or sender_id >= self.worker_count or receiver_id < 0 or receiver_id >= self.worker_count: + if ( + sender_id < 0 + or sender_id >= self.worker_count + or receiver_id < 0 + or receiver_id >= self.worker_count + ): raise ValueError("Invalid sender or receiver ID") message_vector = self.embed( @@ -150,9 +130,7 @@ class ScalableGroupChat: self.collection.add( embeddings=[message_vector], documents=[message], - ids=[f"{sender_id}_to_{receiver_id}"] + ids=[f"{sender_id}_to_{receiver_id}"], ) - self.run( - objective=f"chat with agent {receiver_id} about {message}" - ) + self.run(objective=f"chat with agent {receiver_id} about {message}") diff --git a/swarms/swarms/simple_swarm.py b/swarms/swarms/simple_swarm.py index 8da14b31..2ae8bf69 100644 --- a/swarms/swarms/simple_swarm.py +++ b/swarms/swarms/simple_swarm.py @@ -44,11 +44,7 @@ class SimpleSwarm: self.task_queue = Queue() self.priority_queue = PriorityQueue() - def distribute( - self, - task: str = None, - priority=None - ): + def distribute(self, task: str = None, priority=None): """Distribute a task to the workers""" if priority: self.priority_queue.put((priority, task)) diff --git a/swarms/tools/autogpt.py b/swarms/tools/autogpt.py index 88c0afa7..0603d6f3 100644 --- a/swarms/tools/autogpt.py +++ b/swarms/tools/autogpt.py @@ -112,8 +112,12 @@ def _get_text_splitter(): class WebpageQATool(BaseTool): name = "query_webpage" - description = "Browse a webpage and retrieve the information relevant to the question." - text_splitter: RecursiveCharacterTextSplitter = Field(default_factory=_get_text_splitter) + description = ( + "Browse a webpage and retrieve the information relevant to the question." 
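`SimpleSwarm.distribute` above is a thin router: tasks carrying a priority go onto a `PriorityQueue`, everything else onto a FIFO `Queue`. A self-contained standard-library sketch of that routing, with a toy drain loop standing in for the real worker pool:

```python
from queue import PriorityQueue, Queue
from typing import Optional

task_queue: Queue = Queue()
priority_queue: PriorityQueue = PriorityQueue()


def distribute(task: str, priority: Optional[int] = None) -> None:
    # Same branching as SimpleSwarm.distribute: prioritized tasks go to the heap.
    if priority:
        priority_queue.put((priority, task))
    else:
        task_queue.put(task)


distribute("summarize the report", priority=1)
distribute("send the follow-up email")

# Drain urgent work first (lowest number wins), then the FIFO backlog.
while not priority_queue.empty():
    _, task = priority_queue.get()
    print("urgent:", task)
while not task_queue.empty():
    print("normal:", task_queue.get())
```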
+ ) + text_splitter: RecursiveCharacterTextSplitter = Field( + default_factory=_get_text_splitter + ) qa_chain: BaseCombineDocumentsChain def _run(self, url: str, question: str) -> str: @@ -124,11 +128,19 @@ class WebpageQATool(BaseTool): results = [] # TODO: Handle this with a MapReduceChain for i in range(0, len(web_docs), 4): - input_docs = web_docs[i:i + 4] - window_result = self.qa_chain({"input_documents": input_docs, "question": question}, return_only_outputs=True) + input_docs = web_docs[i : i + 4] + window_result = self.qa_chain( + {"input_documents": input_docs, "question": question}, + return_only_outputs=True, + ) results.append(f"Response from window {i} - {window_result}") - results_docs = [Document(page_content="\n".join(results), metadata={"source": url})] - return self.qa_chain({"input_documents": results_docs, "question": question}, return_only_outputs=True) + results_docs = [ + Document(page_content="\n".join(results), metadata={"source": url}) + ] + return self.qa_chain( + {"input_documents": results_docs, "question": question}, + return_only_outputs=True, + ) async def _arun(self, url: str, question: str) -> str: raise NotImplementedError @@ -179,9 +191,7 @@ def VQAinference(self, inputs): image_path, question = inputs.split(",") raw_image = Image.open(image_path).convert("RGB") - inputs = processor(raw_image, question, return_tensors="pt").to( - device, torch_dtype - ) + inputs = processor(raw_image, question, return_tensors="pt").to(device, torch_dtype) out = model.generate(**inputs) answer = processor.decode(out[0], skip_special_tokens=True) diff --git a/swarms/tools/base.py b/swarms/tools/base.py index c51f1e60..5b7e3f2a 100644 --- a/swarms/tools/base.py +++ b/swarms/tools/base.py @@ -61,7 +61,7 @@ class StructuredTool(BaseTool): name: str, description: str, args_schema: Type[BaseModel], - func: Callable[..., Any] + func: Callable[..., Any], ): self.name = name self.description = description @@ -99,14 +99,18 @@ class ToolWrapper: def to_tool(self, get_session: SessionGetter = lambda: []) -> BaseTool: if self.is_per_session(): - self.func = lambda *args, **kwargs: self.func(*args, **kwargs, get_session=get_session) + self.func = lambda *args, **kwargs: self.func( + *args, **kwargs, get_session=get_session + ) return Tool(name=self.name, description=self.description, func=self.func) class BaseToolSet: def tool_wrappers(cls) -> list[ToolWrapper]: - methods = [getattr(cls, m) for m in dir(cls) if hasattr(getattr(cls, m), "is_tool")] + methods = [ + getattr(cls, m) for m in dir(cls) if hasattr(getattr(cls, m), "is_tool") + ] return [ToolWrapper(m.name, m.description, m.scope, m) for m in methods] @@ -130,7 +134,9 @@ class GlobalToolsCreator(ToolCreator): class SessionToolsCreator(ToolCreator): - def create_tools(self, toolsets: list[BaseToolSet], get_session: SessionGetter = lambda: []) -> list[BaseTool]: + def create_tools( + self, toolsets: list[BaseToolSet], get_session: SessionGetter = lambda: [] + ) -> list[BaseTool]: tools = [] for toolset in toolsets: tools.extend( @@ -145,7 +151,12 @@ class SessionToolsCreator(ToolCreator): class ToolsFactory: @staticmethod - def from_toolset(toolset: BaseToolSet, only_global: Optional[bool] = False, only_per_session: Optional[bool] = False, get_session: SessionGetter = lambda: []) -> list[BaseTool]: + def from_toolset( + toolset: BaseToolSet, + only_global: Optional[bool] = False, + only_per_session: Optional[bool] = False, + get_session: SessionGetter = lambda: [], + ) -> list[BaseTool]: tools = [] for wrapper in 
toolset.tool_wrappers(): if only_global and not wrapper.is_global(): @@ -156,9 +167,15 @@ class ToolsFactory: return tools @staticmethod - def create_tools(tool_creator: ToolCreator, toolsets: list[BaseToolSet], get_session: SessionGetter = lambda: []): + def create_tools( + tool_creator: ToolCreator, + toolsets: list[BaseToolSet], + get_session: SessionGetter = lambda: [], + ): return tool_creator.create_tools(toolsets, get_session) @staticmethod - def create_global_tools_from_names(toolnames: list[str], llm: Optional[BaseLLM]) -> list[BaseTool]: + def create_global_tools_from_names( + toolnames: list[str], llm: Optional[BaseLLM] + ) -> list[BaseTool]: return load_tools(toolnames, llm=llm) diff --git a/swarms/tools/developer.py b/swarms/tools/developer.py index 2cd47d2f..04e4b30a 100644 --- a/swarms/tools/developer.py +++ b/swarms/tools/developer.py @@ -1,4 +1,3 @@ - import os import re import signal @@ -270,8 +269,7 @@ def terminal_execute(self, commands: str, get_session: SessionGetter) -> str: output = str(e) logger.debug( - f"\nProcessed Terminal, Input Commands: {commands} " - f"Output Answer: {output}" + f"\nProcessed Terminal, Input Commands: {commands} " f"Output Answer: {output}" ) return output @@ -308,7 +306,7 @@ class WriteCommand: @staticmethod def from_str(command: str) -> "WriteCommand": filepath = command.split(WriteCommand.separator)[0] - return WriteCommand(filepath, command[len(filepath) + 1:]) + return WriteCommand(filepath, command[len(filepath) + 1 :]) class CodeWriter: @@ -435,7 +433,7 @@ class ReadCommand: if self.start == self.end: code = code[self.start - 1] else: - code = "".join(code[self.start - 1: self.end]) + code = "".join(code[self.start - 1 : self.end]) return code @staticmethod @@ -592,9 +590,9 @@ class PatchCommand: lines[self.start.line] = ( lines[self.start.line][: self.start.col] + self.content - + lines[self.end.line][self.end.col:] + + lines[self.end.line][self.end.col :] ) - lines = lines[: self.start.line + 1] + lines[self.end.line + 1:] + lines = lines[: self.start.line + 1] + lines[self.end.line + 1 :] after = self.write_lines(lines) @@ -784,7 +782,8 @@ class CodeEditor(BaseToolSet): ) return output -#---------------- end + +# ---------------- end @tool( @@ -844,8 +843,7 @@ def code_editor_append(self, inputs: str) -> str: output = str(e) logger.debug( - f"\nProcessed CodeEditor.APPEND, Input: {inputs} " - f"Output Answer: {output}" + f"\nProcessed CodeEditor.APPEND, Input: {inputs} " f"Output Answer: {output}" ) return output diff --git a/swarms/tools/requests.py b/swarms/tools/requests.py index 67540caa..fa60e8e4 100644 --- a/swarms/tools/requests.py +++ b/swarms/tools/requests.py @@ -1,4 +1,3 @@ - import requests from bs4 import BeautifulSoup diff --git a/swarms/tools/stt.py b/swarms/tools/stt.py index d4845f21..2bc46c0d 100644 --- a/swarms/tools/stt.py +++ b/swarms/tools/stt.py @@ -12,11 +12,11 @@ class SpeechToText: def __init__( self, video_url, - audio_format='mp3', - device='cuda', + audio_format="mp3", + device="cuda", batch_size=16, compute_type="float16", - hf_api_key=None + hf_api_key=None, ): """ # Example usage @@ -39,12 +39,12 @@ class SpeechToText: subprocess.run(["pip", "install", "pydub"]) def download_youtube_video(self): - audio_file = f'video.{self.audio_format}' + audio_file = f"video.{self.audio_format}" # Download video 📥 yt = YouTube(self.video_url) yt_stream = yt.streams.filter(only_audio=True).first() - yt_stream.download(filename='video.mp4') + yt_stream.download(filename="video.mp4") # Convert video to audio 🎧 video 
= AudioSegment.from_file("video.mp4", format="mp4")
@@ -66,57 +66,59 @@ class SpeechToText:
         result = model.transcribe(audio, batch_size=batch_size)
         # 2. Align Whisper output 🔍
-        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-        result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+        model_a, metadata = whisperx.load_align_model(
+            language_code=result["language"], device=device
+        )
+        result = whisperx.align(
+            result["segments"],
+            model_a,
+            metadata,
+            audio,
+            device,
+            return_char_alignments=False,
+        )
         # 3. Assign speaker labels 🏷️
         diarize_model = whisperx.DiarizationPipeline(
-            use_auth_token=self.hf_api_key,
-            device=device
+            use_auth_token=self.hf_api_key, device=device
         )
         diarize_model(audio_file)
         try:
             segments = result["segments"]
-            transcription = " ".join(segment['text'] for segment in segments)
+            transcription = " ".join(segment["text"] for segment in segments)
             return transcription
         except KeyError:
             print("The key 'segments' is not found in the result.")
     def transcribe(self, audio_file):
-        model = whisperx.load_model(
-            "large-v2",
-            self.device,
-            self.compute_type
-        )
+        model = whisperx.load_model("large-v2", self.device, self.compute_type)
         audio = whisperx.load_audio(audio_file)
-        result = model.transcribe(
-            audio,
-            batch_size=self.batch_size
-        )
+        result = model.transcribe(audio, batch_size=self.batch_size)
         # 2. Align Whisper output 🔍
-        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+        model_a, metadata = whisperx.load_align_model(
+            language_code=result["language"], device=device
+        )
         result = whisperx.align(
             result["segments"],
             model_a,
             metadata,
             audio,
             self.device,
             return_char_alignments=False,
         )
         # 3.
Assign speaker labels 🏷️ diarize_model = whisperx.DiarizationPipeline( - use_auth_token=self.hf_api_key, - device=self.device + use_auth_token=self.hf_api_key, device=self.device ) diarize_model(audio_file) try: segments = result["segments"] - transcription = " ".join(segment['text'] for segment in segments) + transcription = " ".join(segment["text"] for segment in segments) return transcription except KeyError: print("The key 'segments' is not found in the result.") diff --git a/swarms/utils/decorators.py b/swarms/utils/decorators.py index 5c58d6ea..8a5a5d56 100644 --- a/swarms/utils/decorators.py +++ b/swarms/utils/decorators.py @@ -7,10 +7,11 @@ import warnings def log_decorator(func): def wrapper(*args, **kwargs): - logging.info(f'Entering {func.__name__}') + logging.info(f"Entering {func.__name__}") result = func(*args, **kwargs) - logging.info(f'Exiting {func.__name__}') + logging.info(f"Exiting {func.__name__}") return result + return wrapper @@ -19,8 +20,9 @@ def error_decorator(func): try: return func(*args, **kwargs) except Exception as e: - logging.error(f'Error in {func.__name__}: {str(e)}') + logging.error(f"Error in {func.__name__}: {str(e)}") raise + return wrapper @@ -29,8 +31,9 @@ def timing_decorator(func): start_time = time.time() result = func(*args, **kwargs) end_time = time.time() - logging.info(f'{func.__name__} executed in {end_time - start_time} seconds') + logging.info(f"{func.__name__} executed in {end_time - start_time} seconds") return result + return wrapper @@ -42,9 +45,13 @@ def retry_decorator(max_retries=5): try: return func(*args, **kwargs) except Exception as error: - logging.error(f" Error in {func.__name__}: {str(error)} Retrying ....") + logging.error( + f" Error in {func.__name__}: {str(error)} Retrying ...." + ) return func(*args, **kwargs) + return wrapper + return decorator @@ -55,6 +62,7 @@ def singleton_decorator(cls): if cls not in instances: instances[cls] = cls(*args, **kwargs) return instances[cls] + return wrapper @@ -64,6 +72,7 @@ def synchronized_decorator(func): def wrapper(*args, **kwargs): with func.__lock__: return func(*args, **kwargs) + return wrapper @@ -72,6 +81,7 @@ def deprecated_decorator(func): def wrapper(*args, **kwargs): warnings.warn(f"{func.__name__} is deprecated", category=DeprecationWarning) return func(*args, **kwargs) + return wrapper @@ -82,5 +92,7 @@ def validate_inputs_decorator(validator): if not validator(*args, **kwargs): raise ValueError("Invalid Inputs") return func(*args, **kwargs) + return wrapper + return decorator diff --git a/swarms/utils/main.py b/swarms/utils/main.py index b7f49dd3..6e5907b2 100644 --- a/swarms/utils/main.py +++ b/swarms/utils/main.py @@ -84,6 +84,8 @@ def get_new_dataframe_name(org_img_name, func_name="update"): this_new_uuid, func_name, recent_prev_file_name, most_org_file_name ) return os.path.join(head, new_file_name) + + # =======================> utils end @@ -203,6 +205,7 @@ def dim_multiline(message: str) -> str: return lines[0] return lines[0] + ANSI("\n... 
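The decorators reformatted above are the same `@log_decorator`, `@error_decorator` and `@timing_decorator` stack applied to `Worker.run` and `Worker.chat` later in this patch. A minimal usage sketch, assuming the `swarms` package is importable; the toy function and the sleep exist only so the log output has something to report:

```python
import logging
import time

from swarms.utils.decorators import error_decorator, log_decorator, timing_decorator

logging.basicConfig(level=logging.INFO)


@log_decorator      # logs entry into and exit from the call
@error_decorator    # logs and re-raises any exception
@timing_decorator   # logs the wall-clock duration
def slow_add(a: int, b: int) -> int:
    time.sleep(0.2)  # simulated work so the timing log has something to show
    return a + b


print(slow_add(2, 3))  # three INFO log lines, then 5
```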
".join([""] + lines[1:])).to(Color.black().bright()) + # +=============================> ANSI Ending @@ -221,6 +224,7 @@ class AbstractUploader(ABC): def from_settings() -> "AbstractUploader": pass + # ================================> upload end @@ -256,6 +260,7 @@ class S3Uploader(AbstractUploader): self.client.upload_file(filepath, self.bucket, object_name) return self.get_url(object_name) + # ========================= upload s3 @@ -359,10 +364,16 @@ class FileHandler: def handle(self, url: str) -> str: try: if url.startswith(os.environ.get("SERVER", "http://localhost:8000")): - local_filepath = url[len(os.environ.get("SERVER", "http://localhost:8000")) + 1:] + local_filepath = url[ + len(os.environ.get("SERVER", "http://localhost:8000")) + 1 : + ] local_filename = Path("file") / local_filepath.split("/")[-1] src = self.path / local_filepath - dst = self.path / os.environ.get("PLAYGROUND_DIR", "./playground") / local_filename + dst = ( + self.path + / os.environ.get("PLAYGROUND_DIR", "./playground") + / local_filename + ) os.makedirs(os.path.dirname(dst), exist_ok=True) shutil.copy(src, dst) else: @@ -379,6 +390,8 @@ class FileHandler: return handler.handle(local_filename) except Exception as e: raise e + + # => base end diff --git a/swarms/workers/base.py b/swarms/workers/base.py index c920bcd6..358810bd 100644 --- a/swarms/workers/base.py +++ b/swarms/workers/base.py @@ -24,17 +24,14 @@ class AbstractWorker: """Get the name of the worker.""" return self._name - def run( - self, - task: str - ): + def run(self, task: str): """Run the worker agent once""" def send( self, message: Union[Dict, str], recipient, # add AbstractWorker - request_reply: Optional[bool] = None + request_reply: Optional[bool] = None, ): """(Abstract method) Send a message to another worker.""" @@ -42,7 +39,7 @@ class AbstractWorker: self, message: Union[Dict, str], recipient, # add AbstractWorker - request_reply: Optional[bool] = None + request_reply: Optional[bool] = None, ): """(Aabstract async method) Send a message to another worker.""" @@ -50,7 +47,7 @@ class AbstractWorker: self, message: Union[Dict, str], sender, # add AbstractWorker - request_reply: Optional[bool] = None + request_reply: Optional[bool] = None, ): """(Abstract method) Receive a message from another worker.""" @@ -58,7 +55,7 @@ class AbstractWorker: self, message: Union[Dict, str], sender, # add AbstractWorker - request_reply: Optional[bool] = None + request_reply: Optional[bool] = None, ): """(Abstract async method) Receive a message from another worker.""" diff --git a/swarms/workers/worker.py b/swarms/workers/worker.py index f0e7ef88..8d8d28eb 100644 --- a/swarms/workers/worker.py +++ b/swarms/workers/worker.py @@ -1,4 +1,3 @@ - import faiss from langchain.docstore import InMemoryDocstore from langchain.embeddings import OpenAIEmbeddings @@ -13,7 +12,7 @@ from swarms.tools.autogpt import ( compile, process_csv, load_qa_with_sources_chain, - WebpageQATool + WebpageQATool, ) from swarms.utils.decorators import error_decorator, log_decorator, timing_decorator @@ -87,11 +86,7 @@ class Worker: def name(self): return self.ai_name - def receieve( - self, - name: str, - message: str - ) -> None: + def receieve(self, name: str, message: str) -> None: """ Receive a message and update the message history. 
@@ -152,13 +147,13 @@ class Worker: index = faiss.IndexFlatL2(embedding_size) self.vectorstore = FAISS( - embeddings_model.embed_query, - index, - InMemoryDocstore({}), {} + embeddings_model.embed_query, index, InMemoryDocstore({}), {} ) except Exception as error: - raise RuntimeError(f"Error setting up memory perhaps try try tuning the embedding size: {error}") + raise RuntimeError( + f"Error setting up memory perhaps try try tuning the embedding size: {error}" + ) def setup_agent(self): """ @@ -171,7 +166,7 @@ class Worker: tools=self.tools, llm=self.llm, memory=self.vectorstore.as_retriever(search_kwargs={"k": 8}), - human_in_the_loop=self.human_in_the_loop + human_in_the_loop=self.human_in_the_loop, ) except Exception as error: @@ -180,10 +175,7 @@ class Worker: @log_decorator @error_decorator @timing_decorator - def run( - self, - task: str = None - ): + def run(self, task: str = None): """ Run the autonomous agent on a given task. @@ -202,10 +194,7 @@ class Worker: @log_decorator @error_decorator @timing_decorator - def __call__( - self, - task: str = None - ): + def __call__(self, task: str = None): """ Make the worker callable to run the agent on a given task. @@ -227,11 +216,7 @@ class Worker: @log_decorator @error_decorator @timing_decorator - def chat( - self, - msg: str = None, - streaming: bool = False - ): + def chat(self, msg: str = None, streaming: bool = False): """ Run chat @@ -251,24 +236,14 @@ class Worker: """ # add users message to the history - self.history.append( - Message( - "User", - msg - ) - ) + self.history.append(Message("User", msg)) # process msg try: response = self.agent.run(msg) # add agent's response to the history - self.history.append( - Message( - "Agent", - response - ) - ) + self.history.append(Message("Agent", response)) # if streaming is = True if streaming: @@ -280,19 +255,11 @@ class Worker: error_message = f"Error processing message: {str(error)}" # add error to history - self.history.append( - Message( - "Agent", - error_message - ) - ) + self.history.append(Message("Agent", error_message)) return error_message - def _stream_response( - self, - response: str = None - ): + def _stream_response(self, response: str = None): """ Yield the response token by token (word by word) diff --git a/tests/agents/agents.py b/tests/agents/agents.py index 1f453974..33bedd99 100644 --- a/tests/agents/agents.py +++ b/tests/agents/agents.py @@ -1,14 +1,24 @@ import pytest from unittest.mock import Mock, patch -from swarms.agents.agents import AgentNodeInitializer, AgentNode, agent # replace with actual import +from swarms.agents.agents import ( + AgentNodeInitializer, + AgentNode, + agent, +) # replace with actual import + # For initializing AgentNodeInitializer in multiple tests @pytest.fixture def mock_agent_node_initializer(): - with patch('swarms.agents.agents.ChatOpenAI') as mock_llm, \ - patch('swarms.agents.agents.AutoGPT') as mock_agent: - - initializer = AgentNodeInitializer(model_type='openai', model_id='test', openai_api_key='test_key', temperature=0.5) + with patch("swarms.agents.agents.ChatOpenAI") as mock_llm, patch( + "swarms.agents.agents.AutoGPT" + ) as mock_agent: + initializer = AgentNodeInitializer( + model_type="openai", + model_id="test", + openai_api_key="test_key", + temperature=0.5, + ) initializer.llm = mock_llm initializer.tools = [Mock(spec=BaseTool)] initializer.vectorstore = Mock() @@ -18,72 +28,82 @@ def mock_agent_node_initializer(): # Test initialize_llm method of AgentNodeInitializer class -@pytest.mark.parametrize("model_type", 
['openai', 'huggingface', 'invalid']) +@pytest.mark.parametrize("model_type", ["openai", "huggingface", "invalid"]) def test_agent_node_initializer_initialize_llm(model_type, mock_agent_node_initializer): - with patch('swarms.agents.agents.ChatOpenAI') as mock_openai, \ - patch('swarms.agents.agents.HuggingFaceLLM') as mock_huggingface: - - if model_type == 'invalid': + with patch("swarms.agents.agents.ChatOpenAI") as mock_openai, patch( + "swarms.agents.agents.HuggingFaceLLM" + ) as mock_huggingface: + if model_type == "invalid": with pytest.raises(ValueError): - mock_agent_node_initializer.initialize_llm(model_type, 'model_id', 'openai_api_key', 0.5) + mock_agent_node_initializer.initialize_llm( + model_type, "model_id", "openai_api_key", 0.5 + ) else: - mock_agent_node_initializer.initialize_llm(model_type, 'model_id', 'openai_api_key', 0.5) - if model_type == 'openai': + mock_agent_node_initializer.initialize_llm( + model_type, "model_id", "openai_api_key", 0.5 + ) + if model_type == "openai": mock_openai.assert_called_once() - elif model_type == 'huggingface': + elif model_type == "huggingface": mock_huggingface.assert_called_once() # Test add_tool method of AgentNodeInitializer class def test_agent_node_initializer_add_tool(mock_agent_node_initializer): - with patch('swarms.agents.agents.BaseTool') as mock_base_tool: + with patch("swarms.agents.agents.BaseTool") as mock_base_tool: mock_agent_node_initializer.add_tool(mock_base_tool) assert mock_base_tool in mock_agent_node_initializer.tools # Test run method of AgentNodeInitializer class -@pytest.mark.parametrize("prompt", ['valid prompt', '']) +@pytest.mark.parametrize("prompt", ["valid prompt", ""]) def test_agent_node_initializer_run(prompt, mock_agent_node_initializer): - if prompt == '': + if prompt == "": with pytest.raises(ValueError): mock_agent_node_initializer.run(prompt) else: assert mock_agent_node_initializer.run(prompt) == "Task completed by AgentNode" + # For initializing AgentNode in multiple tests @pytest.fixture def mock_agent_node(): - with patch('swarms.agents.agents.ChatOpenAI') as mock_llm, \ - patch('swarms.agents.agents.AgentNodeInitializer') as mock_agent_node_initializer: - - mock_agent_node = AgentNode('test_key') + with patch("swarms.agents.agents.ChatOpenAI") as mock_llm, patch( + "swarms.agents.agents.AgentNodeInitializer" + ) as mock_agent_node_initializer: + mock_agent_node = AgentNode("test_key") mock_agent_node.llm_class = mock_llm mock_agent_node.vectorstore = Mock() mock_agent_node_initializer.llm = mock_llm return mock_agent_node + # Test initialize_llm method of AgentNode class -@pytest.mark.parametrize("llm_class", ['openai', 'huggingface']) +@pytest.mark.parametrize("llm_class", ["openai", "huggingface"]) def test_agent_node_initialize_llm(llm_class, mock_agent_node): - with patch('swarms.agents.agents.ChatOpenAI') as mock_openai, \ - patch('swarms.agents.agents.HuggingFaceLLM') as mock_huggingface: - + with patch("swarms.agents.agents.ChatOpenAI") as mock_openai, patch( + "swarms.agents.agents.HuggingFaceLLM" + ) as mock_huggingface: mock_agent_node.initialize_llm(llm_class) - if llm_class == 'openai': + if llm_class == "openai": mock_openai.assert_called_once() - elif llm_class == 'huggingface': + elif llm_class == "huggingface": mock_huggingface.assert_called_once() + # Test initialize_tools method of AgentNode class def test_agent_node_initialize_tools(mock_agent_node): - with patch('swarms.agents.agents.DuckDuckGoSearchRun') as mock_ddg, \ - patch('swarms.agents.agents.WriteFileTool') as 
mock_write_file, \ - patch('swarms.agents.agents.ReadFileTool') as mock_read_file, \ - patch('swarms.agents.agents.process_csv') as mock_process_csv, \ - patch('swarms.agents.agents.WebpageQATool') as mock_webpage_qa: - - mock_agent_node.initialize_tools('openai') + with patch("swarms.agents.agents.DuckDuckGoSearchRun") as mock_ddg, patch( + "swarms.agents.agents.WriteFileTool" + ) as mock_write_file, patch( + "swarms.agents.agents.ReadFileTool" + ) as mock_read_file, patch( + "swarms.agents.agents.process_csv" + ) as mock_process_csv, patch( + "swarms.agents.agents.WebpageQATool" + ) as mock_webpage_qa: + mock_agent_node.initialize_tools("openai") assert mock_ddg.called assert mock_write_file.called assert mock_read_file.called @@ -93,26 +113,32 @@ def test_agent_node_initialize_tools(mock_agent_node): # Test create_agent method of AgentNode class def test_agent_node_create_agent(mock_agent_node): - with patch.object(mock_agent_node, 'initialize_llm'), \ - patch.object(mock_agent_node, 'initialize_tools'), \ - patch.object(mock_agent_node, 'initialize_vectorstore'), \ - patch('swarms.agents.agents.AgentNodeInitializer') as mock_agent_node_initializer: - + with patch.object(mock_agent_node, "initialize_llm"), patch.object( + mock_agent_node, "initialize_tools" + ), patch.object(mock_agent_node, "initialize_vectorstore"), patch( + "swarms.agents.agents.AgentNodeInitializer" + ) as mock_agent_node_initializer: mock_agent_node.create_agent() mock_agent_node_initializer.assert_called_once() mock_agent_node_initializer.return_value.create_agent.assert_called_once() # Test agent function -@pytest.mark.parametrize("openai_api_key,objective", [('valid_key', 'valid_objective'), ('', 'valid_objective'), ('valid_key', '')]) +@pytest.mark.parametrize( + "openai_api_key,objective", + [("valid_key", "valid_objective"), ("", "valid_objective"), ("valid_key", "")], +) def test_agent(openai_api_key, objective): - if openai_api_key == '' or objective == '': + if openai_api_key == "" or objective == "": with pytest.raises(ValueError): agent(openai_api_key, objective) else: - with patch('swarms.agents.agents.AgentNodeInitializer') as mock_agent_node_initializer: - mock_agent_node = mock_agent_node_initializer.return_value.create_agent.return_value - mock_agent_node.run.return_value = 'Agent output' + with patch( + "swarms.agents.agents.AgentNodeInitializer" + ) as mock_agent_node_initializer: + mock_agent_node = ( + mock_agent_node_initializer.return_value.create_agent.return_value + ) + mock_agent_node.run.return_value = "Agent output" result = agent(openai_api_key, objective) - assert result == 'Agent output' - + assert result == "Agent output" diff --git a/tests/agents/omni_modal.py b/tests/agents/omni_modal.py index 1407f261..0c9bf281 100644 --- a/tests/agents/omni_modal.py +++ b/tests/agents/omni_modal.py @@ -13,22 +13,26 @@ def mock_llm(): class MockLLM(BaseLanguageModel): def process(self, input): return "mock response" - + return MockLLM() + @pytest.fixture def omni_agent(mock_llm): return OmniModalAgent(mock_llm) + def test_omnimodalagent_initialization(omni_agent): assert omni_agent.llm is not None, "LLM initialization failed" assert len(omni_agent.tools) > 0, "Tools initialization failed" + def test_omnimodalagent_run(omni_agent): input_string = "Hello, how are you?" 
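Most of the churn in the test modules above comes from Black collapsing backslash-continued `patch(...)` chains into a single `with` statement that stacks several context managers. The idiom itself, shown here on standard-library targets rather than the swarms modules:

```python
import os
import time
from unittest.mock import patch


def fetch_config() -> str:
    # Toy function for the sketch; it just combines two patchable calls.
    return f"{os.getenv('MODE')}@{time.time()}"


# One `with` statement can stack several patch() context managers; each
# `as` name is the corresponding MagicMock.
with patch("os.getenv", return_value="test") as mock_getenv, patch(
    "time.time", return_value=0.0
) as mock_time:
    assert fetch_config() == "test@0.0"
    mock_getenv.assert_called_once_with("MODE")
    mock_time.assert_called_once()
```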
response = omni_agent.run(input_string) assert response is not None, "Response generation failed" assert isinstance(response, str), "Response should be a string" + def test_task_executor_initialization(omni_agent): assert omni_agent.task_executor is not None, "TaskExecutor initialization failed" diff --git a/tests/boss/boss_node.py b/tests/boss/boss_node.py index 9414a947..d4547a5a 100644 --- a/tests/boss/boss_node.py +++ b/tests/boss/boss_node.py @@ -2,6 +2,8 @@ import pytest from unittest.mock import Mock, patch from swarms.tools.agent_tools import * from swarms.boss.boss_node import BossNodeInitializer, BossNode + + # For initializing BossNodeInitializer in multiple tests @pytest.fixture def mock_boss_node_initializer(): @@ -10,23 +12,25 @@ def mock_boss_node_initializer(): agent_executor = Mock() max_iterations = 5 - boss_node_initializer = BossNodeInitializer(llm, vectorstore, agent_executor, max_iterations) + boss_node_initializer = BossNodeInitializer( + llm, vectorstore, agent_executor, max_iterations + ) return boss_node_initializer # Test BossNodeInitializer class __init__ method def test_boss_node_initializer_init(mock_boss_node_initializer): - with patch('swarms.tools.agent_tools.BabyAGI.from_llm') as mock_from_llm: + with patch("swarms.tools.agent_tools.BabyAGI.from_llm") as mock_from_llm: assert isinstance(mock_boss_node_initializer, BossNodeInitializer) mock_from_llm.assert_called_once() # Test initialize_vectorstore method of BossNodeInitializer class def test_boss_node_initializer_initialize_vectorstore(mock_boss_node_initializer): - with patch('swarms.tools.agent_tools.OpenAIEmbeddings') as mock_embeddings, \ - patch('swarms.tools.agent_tools.FAISS') as mock_faiss: - + with patch("swarms.tools.agent_tools.OpenAIEmbeddings") as mock_embeddings, patch( + "swarms.tools.agent_tools.FAISS" + ) as mock_faiss: result = mock_boss_node_initializer.initialize_vectorstore() mock_embeddings.assert_called_once() mock_faiss.assert_called_once() @@ -35,27 +39,29 @@ def test_boss_node_initializer_initialize_vectorstore(mock_boss_node_initializer # Test initialize_llm method of BossNodeInitializer class def test_boss_node_initializer_initialize_llm(mock_boss_node_initializer): - with patch('swarms.tools.agent_tools.OpenAI') as mock_llm: + with patch("swarms.tools.agent_tools.OpenAI") as mock_llm: result = mock_boss_node_initializer.initialize_llm(mock_llm) mock_llm.assert_called_once() assert result is not None # Test create_task method of BossNodeInitializer class -@pytest.mark.parametrize("objective", ['valid objective', '']) +@pytest.mark.parametrize("objective", ["valid objective", ""]) def test_boss_node_initializer_create_task(objective, mock_boss_node_initializer): - if objective == '': + if objective == "": with pytest.raises(ValueError): mock_boss_node_initializer.create_task(objective) else: - assert mock_boss_node_initializer.create_task(objective) == {"objective": objective} + assert mock_boss_node_initializer.create_task(objective) == { + "objective": objective + } # Test run method of BossNodeInitializer class -@pytest.mark.parametrize("task", ['valid task', '']) +@pytest.mark.parametrize("task", ["valid task", ""]) def test_boss_node_initializer_run(task, mock_boss_node_initializer): - with patch.object(mock_boss_node_initializer, 'baby_agi'): - if task == '': + with patch.object(mock_boss_node_initializer, "baby_agi"): + if task == "": with pytest.raises(ValueError): mock_boss_node_initializer.run(task) else: @@ -67,29 +73,56 @@ def test_boss_node_initializer_run(task, 
mock_boss_node_initializer): # Test BossNode function -@pytest.mark.parametrize("api_key, objective, llm_class, max_iterations", - [('valid_key', 'valid_objective', OpenAI, 5), - ('', 'valid_objective', OpenAI, 5), - ('valid_key', '', OpenAI, 5), - ('valid_key', 'valid_objective', '', 5), - ('valid_key', 'valid_objective', OpenAI, 0)]) +@pytest.mark.parametrize( + "api_key, objective, llm_class, max_iterations", + [ + ("valid_key", "valid_objective", OpenAI, 5), + ("", "valid_objective", OpenAI, 5), + ("valid_key", "", OpenAI, 5), + ("valid_key", "valid_objective", "", 5), + ("valid_key", "valid_objective", OpenAI, 0), + ], +) def test_boss_node(api_key, objective, llm_class, max_iterations): - with patch('os.getenv') as mock_getenv, \ - patch('swarms.tools.agent_tools.PromptTemplate.from_template') as mock_from_template, \ - patch('swarms.tools.agent_tools.LLMChain') as mock_llm_chain, \ - patch('swarms.tools.agent_tools.ZeroShotAgent.create_prompt') as mock_create_prompt, \ - patch('swarms.tools.agent_tools.ZeroShotAgent') as mock_zero_shot_agent, \ - patch('swarms.tools.agent_tools.AgentExecutor.from_agent_and_tools') as mock_from_agent_and_tools, \ - patch('swarms.tools.agent_tools.BossNodeInitializer') as mock_boss_node_initializer, \ - patch.object(mock_boss_node_initializer, 'create_task') as mock_create_task, \ - patch.object(mock_boss_node_initializer, 'run') as mock_run: - - if api_key == '' or objective == '' or llm_class == '' or max_iterations <= 0: + with patch("os.getenv") as mock_getenv, patch( + "swarms.tools.agent_tools.PromptTemplate.from_template" + ) as mock_from_template, patch( + "swarms.tools.agent_tools.LLMChain" + ) as mock_llm_chain, patch( + "swarms.tools.agent_tools.ZeroShotAgent.create_prompt" + ) as mock_create_prompt, patch( + "swarms.tools.agent_tools.ZeroShotAgent" + ) as mock_zero_shot_agent, patch( + "swarms.tools.agent_tools.AgentExecutor.from_agent_and_tools" + ) as mock_from_agent_and_tools, patch( + "swarms.tools.agent_tools.BossNodeInitializer" + ) as mock_boss_node_initializer, patch.object( + mock_boss_node_initializer, "create_task" + ) as mock_create_task, patch.object( + mock_boss_node_initializer, "run" + ) as mock_run: + if api_key == "" or objective == "" or llm_class == "" or max_iterations <= 0: with pytest.raises(ValueError): - BossNode(objective, api_key, vectorstore=None, worker_node=None, llm_class=llm_class, max_iterations=max_iterations, verbose=False) + BossNode( + objective, + api_key, + vectorstore=None, + worker_node=None, + llm_class=llm_class, + max_iterations=max_iterations, + verbose=False, + ) else: - mock_getenv.return_value = 'valid_key' - BossNode(objective, api_key, vectorstore=None, worker_node=None, llm_class=llm_class, max_iterations=max_iterations, verbose=False) + mock_getenv.return_value = "valid_key" + BossNode( + objective, + api_key, + vectorstore=None, + worker_node=None, + llm_class=llm_class, + max_iterations=max_iterations, + verbose=False, + ) mock_from_template.assert_called_once() mock_llm_chain.assert_called_once() mock_create_prompt.assert_called_once() diff --git a/tests/models/LLM.py b/tests/models/LLM.py index d1476ea3..20493519 100644 --- a/tests/models/LLM.py +++ b/tests/models/LLM.py @@ -5,21 +5,22 @@ from langchain import HuggingFaceHub, ChatOpenAI from swarms.models.llm import LLM + class TestLLM(unittest.TestCase): - @patch.object(HuggingFaceHub, '__init__', return_value=None) - @patch.object(ChatOpenAI, '__init__', return_value=None) + @patch.object(HuggingFaceHub, "__init__", 
return_value=None) + @patch.object(ChatOpenAI, "__init__", return_value=None) def setUp(self, mock_hf_init, mock_openai_init): - self.llm_openai = LLM(openai_api_key='mock_openai_key') - self.llm_hf = LLM(hf_repo_id='mock_repo_id', hf_api_token='mock_hf_token') + self.llm_openai = LLM(openai_api_key="mock_openai_key") + self.llm_hf = LLM(hf_repo_id="mock_repo_id", hf_api_token="mock_hf_token") self.prompt = "Who won the FIFA World Cup in 1998?" def test_init(self): - self.assertEqual(self.llm_openai.openai_api_key, 'mock_openai_key') - self.assertEqual(self.llm_hf.hf_repo_id, 'mock_repo_id') - self.assertEqual(self.llm_hf.hf_api_token, 'mock_hf_token') + self.assertEqual(self.llm_openai.openai_api_key, "mock_openai_key") + self.assertEqual(self.llm_hf.hf_repo_id, "mock_repo_id") + self.assertEqual(self.llm_hf.hf_api_token, "mock_hf_token") - @patch.object(HuggingFaceHub, 'run', return_value="France") - @patch.object(ChatOpenAI, 'run', return_value="France") + @patch.object(HuggingFaceHub, "run", return_value="France") + @patch.object(ChatOpenAI, "run", return_value="France") def test_run(self, mock_hf_run, mock_openai_run): result_openai = self.llm_openai.run(self.prompt) mock_openai_run.assert_called_once() @@ -33,16 +34,16 @@ class TestLLM(unittest.TestCase): with self.assertRaises(ValueError): LLM() - @patch.object(os, 'environ', {}) + @patch.object(os, "environ", {}) def test_error_on_missing_hf_token(self): with self.assertRaises(ValueError): - LLM(hf_repo_id='mock_repo_id') + LLM(hf_repo_id="mock_repo_id") @patch.dict(os.environ, {"HUGGINGFACEHUB_API_TOKEN": "mock_hf_token"}) def test_hf_token_from_env(self): - llm = LLM(hf_repo_id='mock_repo_id') + llm = LLM(hf_repo_id="mock_repo_id") self.assertEqual(llm.hf_api_token, "mock_hf_token") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/models/hf.py b/tests/models/hf.py index 0696482a..ab3b648d 100644 --- a/tests/models/hf.py +++ b/tests/models/hf.py @@ -1,7 +1,7 @@ import pytest import torch from unittest.mock import Mock -from swarms.models.huggingface import HuggingFaceLLM +from swarms.models.huggingface import HuggingFaceLLM @pytest.fixture @@ -25,40 +25,51 @@ def mock_bitsandbytesconfig(): @pytest.fixture -def hugging_face_llm(mock_torch, mock_autotokenizer, mock_automodelforcausallm, mock_bitsandbytesconfig): +def hugging_face_llm( + mock_torch, mock_autotokenizer, mock_automodelforcausallm, mock_bitsandbytesconfig +): HuggingFaceLLM.torch = mock_torch HuggingFaceLLM.AutoTokenizer = mock_autotokenizer HuggingFaceLLM.AutoModelForCausalLM = mock_automodelforcausallm HuggingFaceLLM.BitsAndBytesConfig = mock_bitsandbytesconfig - return HuggingFaceLLM(model_id='test') + return HuggingFaceLLM(model_id="test") def test_init(hugging_face_llm, mock_autotokenizer, mock_automodelforcausallm): - assert hugging_face_llm.model_id == 'test' - mock_autotokenizer.from_pretrained.assert_called_once_with('test') - mock_automodelforcausallm.from_pretrained.assert_called_once_with('test', quantization_config=None) - - -def test_init_with_quantize(hugging_face_llm, mock_autotokenizer, mock_automodelforcausallm, mock_bitsandbytesconfig): + assert hugging_face_llm.model_id == "test" + mock_autotokenizer.from_pretrained.assert_called_once_with("test") + mock_automodelforcausallm.from_pretrained.assert_called_once_with( + "test", quantization_config=None + ) + + +def test_init_with_quantize( + hugging_face_llm, + mock_autotokenizer, + mock_automodelforcausallm, + mock_bitsandbytesconfig, +): quantization_config = 
{ - 'load_in_4bit': True, - 'bnb_4bit_use_double_quant': True, - 'bnb_4bit_quant_type': "nf4", - 'bnb_4bit_compute_dtype': torch.bfloat16 + "load_in_4bit": True, + "bnb_4bit_use_double_quant": True, + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_compute_dtype": torch.bfloat16, } mock_bitsandbytesconfig.return_value = quantization_config - HuggingFaceLLM(model_id='test', quantize=True) + HuggingFaceLLM(model_id="test", quantize=True) mock_bitsandbytesconfig.assert_called_once_with(**quantization_config) - mock_autotokenizer.from_pretrained.assert_called_once_with('test') - mock_automodelforcausallm.from_pretrained.assert_called_once_with('test', quantization_config=quantization_config) + mock_autotokenizer.from_pretrained.assert_called_once_with("test") + mock_automodelforcausallm.from_pretrained.assert_called_once_with( + "test", quantization_config=quantization_config + ) def test_generate_text(hugging_face_llm): - prompt_text = 'test prompt' - expected_output = 'test output' + prompt_text = "test prompt" + expected_output = "test output" hugging_face_llm.tokenizer.encode.return_value = torch.tensor([0]) # Mock tensor hugging_face_llm.model.generate.return_value = torch.tensor([0]) # Mock tensor hugging_face_llm.tokenizer.decode.return_value = expected_output diff --git a/tests/orchestrate.py b/tests/orchestrate.py index 18461269..7a73d92d 100644 --- a/tests/orchestrate.py +++ b/tests/orchestrate.py @@ -7,14 +7,17 @@ from swarms.swarms.orchestrate import Orchestrator def mock_agent(): return Mock() + @pytest.fixture def mock_task(): return {"task_id": 1, "task_data": "data"} + @pytest.fixture def mock_vector_db(): return Mock() + @pytest.fixture def orchestrator(mock_agent, mock_vector_db): agent_list = [mock_agent for _ in range(5)] @@ -38,7 +41,9 @@ def test_retrieve_results(orchestrator, mock_vector_db): def test_update_vector_db(orchestrator, mock_vector_db): data = {"vector": [0.1, 0.2, 0.3], "task_id": 1} orchestrator.update_vector_db(data) - mock_vector_db.add_documents.assert_called_once_with([data['vector']], [str(data['task_id'])]) + mock_vector_db.add_documents.assert_called_once_with( + [data["vector"]], [str(data["task_id"])] + ) def test_get_vector_db(orchestrator, mock_vector_db): @@ -49,7 +54,9 @@ def test_append_to_db(orchestrator, mock_vector_db): collection = "test_collection" result = "test_result" orchestrator.append_to_db(collection, result) - mock_vector_db.append_document.assert_called_once_with(collection, result, id=str(id(result))) + mock_vector_db.append_document.assert_called_once_with( + collection, result, id=str(id(result)) + ) def test_run(orchestrator, mock_agent, mock_vector_db): diff --git a/tests/swarms.py b/tests/swarms.py index e523d6ea..dc6f9c36 100644 --- a/tests/swarms.py +++ b/tests/swarms.py @@ -1,73 +1,85 @@ import pytest import logging from unittest.mock import patch -from swarms.swarms.swarms import HierarchicalSwarm # replace with your actual module name +from swarms.swarms.swarms import ( + HierarchicalSwarm, +) # replace with your actual module name + @pytest.fixture def swarm(): return HierarchicalSwarm( - model_id='gpt-4', - openai_api_key='some_api_key', - use_vectorstore=True, - embedding_size=1024, - use_async=False, - human_in_the_loop=True, - model_type='openai', - boss_prompt='boss', - worker_prompt='worker', + model_id="gpt-4", + openai_api_key="some_api_key", + use_vectorstore=True, + embedding_size=1024, + use_async=False, + human_in_the_loop=True, + model_type="openai", + boss_prompt="boss", + worker_prompt="worker", 
temperature=0.5, max_iterations=100, - logging_enabled=True + logging_enabled=True, ) + @pytest.fixture def swarm_no_logging(): return HierarchicalSwarm(logging_enabled=False) + def test_swarm_init(swarm): - assert swarm.model_id == 'gpt-4' - assert swarm.openai_api_key == 'some_api_key' + assert swarm.model_id == "gpt-4" + assert swarm.openai_api_key == "some_api_key" assert swarm.use_vectorstore assert swarm.embedding_size == 1024 assert not swarm.use_async assert swarm.human_in_the_loop - assert swarm.model_type == 'openai' - assert swarm.boss_prompt == 'boss' - assert swarm.worker_prompt == 'worker' + assert swarm.model_type == "openai" + assert swarm.boss_prompt == "boss" + assert swarm.worker_prompt == "worker" assert swarm.temperature == 0.5 assert swarm.max_iterations == 100 assert swarm.logging_enabled assert isinstance(swarm.logger, logging.Logger) + def test_swarm_no_logging_init(swarm_no_logging): assert not swarm_no_logging.logging_enabled assert swarm_no_logging.logger.disabled -@patch('your_module.OpenAI') -@patch('your_module.HuggingFaceLLM') + +@patch("your_module.OpenAI") +@patch("your_module.HuggingFaceLLM") def test_initialize_llm(mock_huggingface, mock_openai, swarm): - swarm.initialize_llm('openai') - mock_openai.assert_called_once_with(openai_api_key='some_api_key', temperature=0.5) - - swarm.initialize_llm('huggingface') - mock_huggingface.assert_called_once_with(model_id='gpt-4', temperature=0.5) + swarm.initialize_llm("openai") + mock_openai.assert_called_once_with(openai_api_key="some_api_key", temperature=0.5) -@patch('your_module.HierarchicalSwarm.initialize_llm') + swarm.initialize_llm("huggingface") + mock_huggingface.assert_called_once_with(model_id="gpt-4", temperature=0.5) + + +@patch("your_module.HierarchicalSwarm.initialize_llm") def test_initialize_tools(mock_llm, swarm): - mock_llm.return_value = 'mock_llm_class' - tools = swarm.initialize_tools('openai') - assert 'mock_llm_class' in tools + mock_llm.return_value = "mock_llm_class" + tools = swarm.initialize_tools("openai") + assert "mock_llm_class" in tools + -@patch('your_module.HierarchicalSwarm.initialize_llm') +@patch("your_module.HierarchicalSwarm.initialize_llm") def test_initialize_tools_with_extra_tools(mock_llm, swarm): - mock_llm.return_value = 'mock_llm_class' - tools = swarm.initialize_tools('openai', extra_tools=['tool1', 'tool2']) - assert 'tool1' in tools - assert 'tool2' in tools + mock_llm.return_value = "mock_llm_class" + tools = swarm.initialize_tools("openai", extra_tools=["tool1", "tool2"]) + assert "tool1" in tools + assert "tool2" in tools -@patch('your_module.OpenAIEmbeddings') -@patch('your_module.FAISS') + +@patch("your_module.OpenAIEmbeddings") +@patch("your_module.FAISS") def test_initialize_vectorstore(mock_faiss, mock_openai_embeddings, swarm): - mock_openai_embeddings.return_value.embed_query = 'embed_query' + mock_openai_embeddings.return_value.embed_query = "embed_query" swarm.initialize_vectorstore() - mock_faiss.assert_called_once_with('embed_query', instance_of(faiss.IndexFlatL2), instance_of(InMemoryDocstore), {}) + mock_faiss.assert_called_once_with( + "embed_query", instance_of(faiss.IndexFlatL2), instance_of(InMemoryDocstore), {} + ) diff --git a/tests/workers/multi_model_worker.py b/tests/workers/multi_model_worker.py index 0aa5173d..f011d642 100644 --- a/tests/workers/multi_model_worker.py +++ b/tests/workers/multi_model_worker.py @@ -1,6 +1,10 @@ import pytest from unittest.mock import Mock -from swarms.agents.multi_modal_agent import MultiModalVisualAgent, 
MultiModalVisualAgentTool +from swarms.agents.multi_modal_agent import ( + MultiModalVisualAgent, + MultiModalVisualAgentTool, +) + @pytest.fixture def multimodal_agent(): @@ -9,15 +13,20 @@ def multimodal_agent(): mock_agent.run_text.return_value = "Expected output from agent" return mock_agent + @pytest.fixture def multimodal_agent_tool(multimodal_agent): # Use the mocked MultiModalVisualAgent in the MultiModalVisualAgentTool return MultiModalVisualAgentTool(multimodal_agent) -@pytest.mark.parametrize("text_input, expected_output", [ - ("Hello, world!", "Expected output from agent"), - ("Another task", "Expected output from agent"), -]) + +@pytest.mark.parametrize( + "text_input, expected_output", + [ + ("Hello, world!", "Expected output from agent"), + ("Another task", "Expected output from agent"), + ], +) def test_run(multimodal_agent_tool, text_input, expected_output): assert multimodal_agent_tool._run(text_input) == expected_output diff --git a/tests/workers/omni_worker.py b/tests/workers/omni_worker.py index ef6c579d..0557285d 100644 --- a/tests/workers/omni_worker.py +++ b/tests/workers/omni_worker.py @@ -5,30 +5,54 @@ from swarms.worker.omni_worker import OmniWorkerAgent @pytest.fixture def omni_worker(): - api_key = 'test-key' - api_endpoint = 'test-endpoint' - api_type = 'test-type' + api_key = "test-key" + api_endpoint = "test-endpoint" + api_type = "test-type" return OmniWorkerAgent(api_key, api_endpoint, api_type) -@pytest.mark.parametrize("data, expected_response", [ - ( - {"messages": ["Hello"], "api_key": "key1", "api_type": "type1", "api_endpoint": "endpoint1"}, - {"response": "Hello back from Huggingface!"} - ), - ( - {"messages": ["Goodbye"], "api_key": "key2", "api_type": "type2", "api_endpoint": "endpoint2"}, - {"response": "Goodbye from Huggingface!"} - ), -]) + +@pytest.mark.parametrize( + "data, expected_response", + [ + ( + { + "messages": ["Hello"], + "api_key": "key1", + "api_type": "type1", + "api_endpoint": "endpoint1", + }, + {"response": "Hello back from Huggingface!"}, + ), + ( + { + "messages": ["Goodbye"], + "api_key": "key2", + "api_type": "type2", + "api_endpoint": "endpoint2", + }, + {"response": "Goodbye from Huggingface!"}, + ), + ], +) def test_chat_valid_data(mocker, omni_worker, data, expected_response): - mocker.patch('yourmodule.chat_huggingface', return_value=expected_response) # replace 'yourmodule' with actual module name + mocker.patch( + "yourmodule.chat_huggingface", return_value=expected_response + ) # replace 'yourmodule' with actual module name assert omni_worker.chat(data) == expected_response -@pytest.mark.parametrize("invalid_data", [ - {"messages": ["Hello"]}, # missing api_key, api_type and api_endpoint - {"messages": ["Hello"], "api_key": "key1"}, # missing api_type and api_endpoint - {"messages": ["Hello"], "api_key": "key1", "api_type": "type1"}, # missing api_endpoint -]) + +@pytest.mark.parametrize( + "invalid_data", + [ + {"messages": ["Hello"]}, # missing api_key, api_type and api_endpoint + {"messages": ["Hello"], "api_key": "key1"}, # missing api_type and api_endpoint + { + "messages": ["Hello"], + "api_key": "key1", + "api_type": "type1", + }, # missing api_endpoint + ], +) def test_chat_invalid_data(omni_worker, invalid_data): with pytest.raises(ValueError): omni_worker.chat(invalid_data) diff --git a/tests/workers/worker_agent_ultra.py b/tests/workers/worker_agent_ultra.py index 9426604d..3cf112a2 100644 --- a/tests/workers/worker_agent_ultra.py +++ b/tests/workers/worker_agent_ultra.py @@ -2,44 +2,50 @@ import pytest 
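The omni-worker tests above pair one `@pytest.mark.parametrize` over complete payloads with a second parametrized test that expects `ValueError` for incomplete ones. The same structure on a toy validator; the function below is a stand-in written for this sketch, not the real `OmniWorkerAgent.chat`:

```python
import pytest


def validate_payload(data: dict) -> dict:
    # Stand-in for the worker's validation: every required field must be present.
    required = ("messages", "api_key", "api_type", "api_endpoint")
    if not all(key in data for key in required):
        raise ValueError("missing required fields")
    return {"response": "ok"}


@pytest.mark.parametrize(
    "payload",
    [
        {"messages": ["Hello"], "api_key": "k1", "api_type": "t1", "api_endpoint": "e1"},
        {"messages": ["Goodbye"], "api_key": "k2", "api_type": "t2", "api_endpoint": "e2"},
    ],
)
def test_valid_payload(payload):
    assert validate_payload(payload) == {"response": "ok"}


@pytest.mark.parametrize(
    "invalid_payload",
    [
        {"messages": ["Hello"]},                   # missing api_key, api_type, api_endpoint
        {"messages": ["Hello"], "api_key": "k1"},  # missing api_type and api_endpoint
    ],
)
def test_invalid_payload(invalid_payload):
    with pytest.raises(ValueError):
        validate_payload(invalid_payload)
```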
from unittest.mock import Mock from swarms.workers.worker_agent_ultra import WorkerUltraNode # import your module here + def test_create_agent(): mock_llm = Mock() - mock_toolset = { 'test_toolset': Mock() } + mock_toolset = {"test_toolset": Mock()} mock_vectorstore = Mock() worker = WorkerUltraNode(mock_llm, mock_toolset, mock_vectorstore) worker.create_agent() assert worker.agent is not None -@pytest.mark.parametrize("invalid_toolset", [123, 'string', 0.45]) + +@pytest.mark.parametrize("invalid_toolset", [123, "string", 0.45]) def test_add_toolset_invalid(invalid_toolset): mock_llm = Mock() - mock_toolset = { 'test_toolset': Mock() } + mock_toolset = {"test_toolset": Mock()} mock_vectorstore = Mock() worker = WorkerUltraNode(mock_llm, mock_toolset, mock_vectorstore) with pytest.raises(TypeError): worker.add_toolset(invalid_toolset) + @pytest.mark.parametrize("invalid_prompt", [123, None, "", []]) def test_run_invalid_prompt(invalid_prompt): mock_llm = Mock() - mock_toolset = { 'test_toolset': Mock() } + mock_toolset = {"test_toolset": Mock()} mock_vectorstore = Mock() worker = WorkerUltraNode(mock_llm, mock_toolset, mock_vectorstore) with pytest.raises((TypeError, ValueError)): worker.run(invalid_prompt) + def test_run_valid_prompt(mocker): mock_llm = Mock() - mock_toolset = { 'test_toolset': Mock() } + mock_toolset = {"test_toolset": Mock()} mock_vectorstore = Mock() worker = WorkerUltraNode(mock_llm, mock_toolset, mock_vectorstore) - mocker.patch.object(worker, 'create_agent') - assert worker.run('Test prompt') == 'Task completed by WorkerNode' - + mocker.patch.object(worker, "create_agent") + assert worker.run("Test prompt") == "Task completed by WorkerNode" + + def test_worker_node(): - worker = worker_ultra_node('test-key') + worker = worker_ultra_node("test-key") assert isinstance(worker, WorkerUltraNode) + def test_worker_node_no_key(): with pytest.raises(ValueError): worker_ultra_node(None) diff --git a/tests/workers/worker_node.py b/tests/workers/worker_node.py index 6da706bd..e97b5023 100644 --- a/tests/workers/worker_node.py +++ b/tests/workers/worker_node.py @@ -1,76 +1,94 @@ import pytest from unittest.mock import MagicMock, patch -from swarms.worker.worker_node import WorkerNodeInitializer, WorkerNode # replace your_module with actual module name +from swarms.worker.worker_node import ( + WorkerNodeInitializer, + WorkerNode, +) # replace your_module with actual module name # Mock Tool for testing class MockTool(Tool): pass + # Fixture for llm @pytest.fixture def mock_llm(): return MagicMock() + # Fixture for vectorstore @pytest.fixture def mock_vectorstore(): return MagicMock() + # Fixture for Tools @pytest.fixture def mock_tools(): return [MockTool(), MockTool(), MockTool()] + # Fixture for WorkerNodeInitializer @pytest.fixture def worker_node(mock_llm, mock_tools, mock_vectorstore): - return WorkerNodeInitializer(llm=mock_llm, tools=mock_tools, vectorstore=mock_vectorstore) + return WorkerNodeInitializer( + llm=mock_llm, tools=mock_tools, vectorstore=mock_vectorstore + ) + # Fixture for WorkerNode @pytest.fixture def mock_worker_node(): return WorkerNode(openai_api_key="test_api_key") + # WorkerNodeInitializer Tests def test_worker_node_init(worker_node): assert worker_node.llm is not None assert worker_node.tools is not None assert worker_node.vectorstore is not None + def test_worker_node_create_agent(worker_node): - with patch.object(AutoGPT, 'from_llm_and_tools') as mock_method: + with patch.object(AutoGPT, "from_llm_and_tools") as mock_method: 
worker_node.create_agent() mock_method.assert_called_once() + def test_worker_node_add_tool(worker_node): initial_tools_count = len(worker_node.tools) new_tool = MockTool() worker_node.add_tool(new_tool) assert len(worker_node.tools) == initial_tools_count + 1 + def test_worker_node_run(worker_node): - with patch.object(worker_node.agent, 'run') as mock_run: + with patch.object(worker_node.agent, "run") as mock_run: worker_node.run(prompt="test prompt") mock_run.assert_called_once() + # WorkerNode Tests def test_worker_node_llm(mock_worker_node): - with patch.object(mock_worker_node, 'initialize_llm') as mock_method: + with patch.object(mock_worker_node, "initialize_llm") as mock_method: mock_worker_node.initialize_llm(llm_class=MagicMock(), temperature=0.5) mock_method.assert_called_once() + def test_worker_node_tools(mock_worker_node): - with patch.object(mock_worker_node, 'initialize_tools') as mock_method: + with patch.object(mock_worker_node, "initialize_tools") as mock_method: mock_worker_node.initialize_tools(llm_class=MagicMock()) mock_method.assert_called_once() + def test_worker_node_vectorstore(mock_worker_node): - with patch.object(mock_worker_node, 'initialize_vectorstore') as mock_method: + with patch.object(mock_worker_node, "initialize_vectorstore") as mock_method: mock_worker_node.initialize_vectorstore() mock_method.assert_called_once() + def test_worker_node_create_worker_node(mock_worker_node): - with patch.object(mock_worker_node, 'create_worker_node') as mock_method: + with patch.object(mock_worker_node, "create_worker_node") as mock_method: mock_worker_node.create_worker_node() mock_method.assert_called_once() diff --git a/tests/workers/worker_ultra.py b/tests/workers/worker_ultra.py index 17b699e3..b1485a28 100644 --- a/tests/workers/worker_ultra.py +++ b/tests/workers/worker_ultra.py @@ -1,63 +1,91 @@ import pytest from unittest.mock import Mock, patch -from swarms.workers.worker_agent_ultra import WorkerUltraNode, WorkerUltraNodeInitializer +from swarms.workers.worker_agent_ultra import ( + WorkerUltraNode, + WorkerUltraNodeInitializer, +) + @pytest.fixture def llm_mock(): return Mock() + @pytest.fixture def toolsets_mock(): return Mock() + @pytest.fixture def vectorstore_mock(): return Mock() + @pytest.fixture def worker_ultra_node(llm_mock, toolsets_mock, vectorstore_mock): return WorkerUltraNode(llm_mock, toolsets_mock, vectorstore_mock) + def test_worker_ultra_node_create_agent(worker_ultra_node): - with patch('yourmodule.AutoGPT.from_llm_and_tools') as mock_method: + with patch("yourmodule.AutoGPT.from_llm_and_tools") as mock_method: worker_ultra_node.create_agent() mock_method.assert_called_once() + def test_worker_ultra_node_add_toolset(worker_ultra_node): with pytest.raises(TypeError): - worker_ultra_node.add_toolset('wrong_toolset') + worker_ultra_node.add_toolset("wrong_toolset") + def test_worker_ultra_node_run(worker_ultra_node): - with patch.object(worker_ultra_node, 'agent') as mock_agent: + with patch.object(worker_ultra_node, "agent") as mock_agent: mock_agent.run.return_value = None - result = worker_ultra_node.run('some prompt') + result = worker_ultra_node.run("some prompt") assert result == "Task completed by WorkerNode" mock_agent.run.assert_called_once() + def test_worker_ultra_node_run_no_prompt(worker_ultra_node): with pytest.raises(ValueError): - worker_ultra_node.run('') + worker_ultra_node.run("") + @pytest.fixture def worker_ultra_node_initializer(): - return WorkerUltraNodeInitializer('openai_api_key') + return 
WorkerUltraNodeInitializer("openai_api_key") + def test_worker_ultra_node_initializer_initialize_llm(worker_ultra_node_initializer): - with patch('yourmodule.ChatOpenAI') as mock_llm: + with patch("yourmodule.ChatOpenAI") as mock_llm: worker_ultra_node_initializer.initialize_llm(mock_llm) mock_llm.assert_called_once() -def test_worker_ultra_node_initializer_initialize_toolsets(worker_ultra_node_initializer): - with patch('yourmodule.Terminal'), patch('yourmodule.CodeEditor'), patch('yourmodule.RequestsGet'), patch('yourmodule.ExitConversation'): + +def test_worker_ultra_node_initializer_initialize_toolsets( + worker_ultra_node_initializer, +): + with patch("yourmodule.Terminal"), patch("yourmodule.CodeEditor"), patch( + "yourmodule.RequestsGet" + ), patch("yourmodule.ExitConversation"): toolsets = worker_ultra_node_initializer.initialize_toolsets() assert len(toolsets) == 4 -def test_worker_ultra_node_initializer_initialize_vectorstore(worker_ultra_node_initializer): - with patch('yourmodule.OpenAIEmbeddings'), patch('yourmodule.fauss.IndexFlatL2'), patch('yourmodule.FAISS'), patch('yourmodule.InMemoryDocstore'): + +def test_worker_ultra_node_initializer_initialize_vectorstore( + worker_ultra_node_initializer, +): + with patch("yourmodule.OpenAIEmbeddings"), patch( + "yourmodule.fauss.IndexFlatL2" + ), patch("yourmodule.FAISS"), patch("yourmodule.InMemoryDocstore"): vectorstore = worker_ultra_node_initializer.initialize_vectorstore() assert vectorstore is not None -def test_worker_ultra_node_initializer_create_worker_node(worker_ultra_node_initializer): - with patch.object(worker_ultra_node_initializer, 'initialize_llm'), patch.object(worker_ultra_node_initializer, 'initialize_toolsets'), patch.object(worker_ultra_node_initializer, 'initialize_vectorstore'): + +def test_worker_ultra_node_initializer_create_worker_node( + worker_ultra_node_initializer, +): + with patch.object(worker_ultra_node_initializer, "initialize_llm"), patch.object( + worker_ultra_node_initializer, "initialize_toolsets" + ), patch.object(worker_ultra_node_initializer, "initialize_vectorstore"): worker_node = worker_ultra_node_initializer.create_worker_node() assert worker_node is not None