From 4eb60fea4da25bd34e5267f3485776c674e932d5 Mon Sep 17 00:00:00 2001 From: Kye Date: Mon, 4 Dec 2023 12:10:38 -0800 Subject: [PATCH] [DEMO][MultiModal ToT] --- .gitignore | 1 + playground/demos/idea_2_img/idea2img.py | 96 ---------- playground/demos/idea_2_img/main.py | 7 - playground/demos/multimodal_tot/idea2img.py | 185 ++++++++++++++++++++ playground/demos/multimodal_tot/main.py | 114 ++++++++++++ playground/structs/tool_utils.py | 6 +- playground/tools/agent_with_tools.py | 9 +- swarms/models/stable_diffusion.py | 4 +- swarms/prompts/agent_system_prompts.py | 1 + swarms/structs/agent.py | 18 +- swarms/tools/tool_utils.py | 3 +- 11 files changed, 318 insertions(+), 126 deletions(-) delete mode 100644 playground/demos/idea_2_img/idea2img.py delete mode 100644 playground/demos/idea_2_img/main.py create mode 100644 playground/demos/multimodal_tot/idea2img.py create mode 100644 playground/demos/multimodal_tot/main.py diff --git a/.gitignore b/.gitignore index 716dc148..6fc93744 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ video/ dataframe/ static/generated +runs swarms/__pycache__ venv .DS_Store diff --git a/playground/demos/idea_2_img/idea2img.py b/playground/demos/idea_2_img/idea2img.py deleted file mode 100644 index c6be45f0..00000000 --- a/playground/demos/idea_2_img/idea2img.py +++ /dev/null @@ -1,96 +0,0 @@ -import os -import datetime -from dotenv import load_dotenv -from swarms.models.stable_diffusion import StableDiffusion -from swarms.models.gpt4_vision_api import GPT4VisionAPI -from swarms.models import OpenAIChat -from swarms.structs import Agent - -# Load environment variables -load_dotenv() -openai_api_key = os.getenv("OPENAI_API_KEY") -stability_api_key = os.getenv("STABILITY_API_KEY") - -# Initialize the models -vision_api = GPT4VisionAPI(api_key=openai_api_key) -sd_api = StableDiffusion(api_key=stability_api_key) -gpt_api = OpenAIChat(openai_api_key=openai_api_key) - - -class Idea2Image(Agent): - def __init__(self, llm, vision_api): - super().__init__(llm=llm) - self.vision_api = vision_api - - def run(self, initial_prompt, num_iterations, run_folder): - current_prompt = initial_prompt - - for i in range(num_iterations): - print(f"Iteration {i}: Image generation and analysis") - - if i == 0: - current_prompt = self.enrich_prompt(current_prompt) - print(f"Enriched Prompt: {current_prompt}") - - img = sd_api.generate_and_move_image( - current_prompt, i, run_folder - ) - if not img: - print("Failed to generate image") - break - print(f"Generated image at: {img}") - - analysis = ( - self.vision_api.run(img, current_prompt) - if img - else None - ) - if analysis: - current_prompt += ( - ". " + analysis[:500] - ) # Ensure the analysis is concise - print(f"Image Analysis: {analysis}") - else: - print(f"Failed to analyze image at: {img}") - - def enrich_prompt(self, prompt): - enrichment_task = ( - "Create a concise and effective image generation prompt" - " within 400 characters or less, based on Stable" - " Diffusion and Dalle best practices. Starting prompt:" - f" \n\n'{prompt}'\n\nImprove the prompt with any" - " applicable details or keywords by considering the" - " following aspects: \n1. Subject details (like actions," - " emotions, environment) \n2. Artistic style (such as" - " surrealism, hyperrealism) \n3. Medium (digital" - " painting, oil on canvas) \n4. Color themes and" - " lighting (like warm colors, cinematic lighting) \n5." - " Composition and framing (close-up, wide-angle) \n6." - " Additional elements (like a specific type of" - " background, weather conditions) \n7. Any other" - " artistic or thematic details that can make the image" - " more vivid and compelling." - ) - llm_result = self.llm.generate([enrichment_task]) - return ( - llm_result.generations[0][0].text[:500] - if llm_result.generations - else None - ) - - -# User input and setup -user_prompt = input("Prompt for image generation: ") -num_iterations = int( - input("Enter the number of iterations for image improvement: ") -) -run_folder = os.path.join( - "runs", datetime.datetime.now().strftime("%Y%m%d_%H%M%S") -) -os.makedirs(run_folder, exist_ok=True) - -# Initialize and run the agent -idea2image_agent = Idea2Image(gpt_api, vision_api) -idea2image_agent.run(user_prompt, num_iterations, run_folder) - -print("Image improvement process completed.") diff --git a/playground/demos/idea_2_img/main.py b/playground/demos/idea_2_img/main.py deleted file mode 100644 index 84ce67ab..00000000 --- a/playground/demos/idea_2_img/main.py +++ /dev/null @@ -1,7 +0,0 @@ -""" -Idea 2 img - -task -> gpt4 text -> dalle3 img -> gpt4vision img + text analyze img -> dalle3 img -> loop - -""" -from swarms.models.gpt4_vision_api import GPT4VisionAPI diff --git a/playground/demos/multimodal_tot/idea2img.py b/playground/demos/multimodal_tot/idea2img.py new file mode 100644 index 00000000..4a6c1da3 --- /dev/null +++ b/playground/demos/multimodal_tot/idea2img.py @@ -0,0 +1,185 @@ +import datetime +import os + +import streamlit as st +from dotenv import load_dotenv + +from swarms.models import OpenAIChat +from swarms.models.gpt4_vision_api import GPT4VisionAPI +from swarms.models.stable_diffusion import StableDiffusion +from swarms.structs import Agent + +# Load environment variables +load_dotenv() +openai_api_key = os.getenv("OPENAI_API_KEY") +stability_api_key = os.getenv("STABLE_API_KEY") + +# Initialize the models +vision_api = GPT4VisionAPI(api_key=openai_api_key) +sd_api = StableDiffusion(api_key=stability_api_key) +gpt_api = OpenAIChat(openai_api_key=openai_api_key) + + +class Idea2Image(Agent): + def __init__(self, llm, vision_api): + super().__init__(llm=llm) + self.vision_api = vision_api + + def run(self, initial_prompt, num_iterations, run_folder): + current_prompt = initial_prompt + + for i in range(num_iterations): + print(f"Iteration {i}: Image generation and analysis") + + if i == 0: + current_prompt = self.enrich_prompt(current_prompt) + print(f"Enriched Prompt: {current_prompt}") + + img = sd_api.generate_and_move_image( + current_prompt, i, run_folder + ) + if not img: + print("Failed to generate image") + break + print(f"Generated image at: {img}") + + analysis = ( + self.vision_api.run(img, current_prompt) + if img + else None + ) + if analysis: + current_prompt += ( + ". " + analysis[:500] + ) # Ensure the analysis is concise + print(f"Image Analysis: {analysis}") + else: + print(f"Failed to analyze image at: {img}") + + def enrich_prompt(self, prompt): + enrichment_task = ( + "Create a concise and effective image generation prompt" + " within 400 characters or less, based on Stable" + " Diffusion and Dalle best practices to help it create" + " much better images. Starting prompt:" + f" \n\n'{prompt}'\n\nImprove the prompt with any" + " applicable details or keywords by considering the" + " following aspects: \n1. Subject details (like actions," + " emotions, environment) \n2. Artistic style (such as" + " surrealism, hyperrealism) \n3. Medium (digital" + " painting, oil on canvas) \n4. Color themes and" + " lighting (like warm colors, cinematic lighting) \n5." + " Composition and framing (close-up, wide-angle) \n6." + " Additional elements (like a specific type of" + " background, weather conditions) \n7. Any other" + " artistic or thematic details that can make the image" + " more vivid and compelling. Help the image generator" + " create better images by enriching the prompt." + ) + llm_result = self.llm.generate([enrichment_task]) + return ( + llm_result.generations[0][0].text[:500] + if llm_result.generations + else None + ) + + def run_gradio(self, initial_prompt, num_iterations, run_folder): + results = [] + current_prompt = initial_prompt + + for i in range(num_iterations): + enriched_prompt = ( + self.enrich_prompt(current_prompt) + if i == 0 + else current_prompt + ) + img_path = sd_api.generate_and_move_image( + enriched_prompt, i, run_folder + ) + analysis = ( + self.vision_api.run(img_path, enriched_prompt) + if img_path + else None + ) + + if analysis: + current_prompt += ( + ". " + analysis[:500] + ) # Ensuring the analysis is concise + results.append((enriched_prompt, img_path, analysis)) + + return results + + +# print( +# colored("---------------------------------------- MultiModal Tree of Thought agents for Image Generation", "cyan", attrs=["bold"]) +# ) +# # User input and setup +# user_prompt = input("Prompt for image generation: ") +# num_iterations = int( +# input("Enter the number of iterations for image improvement: ") +# ) +# run_folder = os.path.join( +# "runs", datetime.datetime.now().strftime("%Y%m%d_%H%M%S") +# ) +# os.makedirs(run_folder, exist_ok=True) + +# print( +# colored( +# f"---------------------------------- Running Multi-Modal Tree of thoughts agent with {num_iterations} iterations", "green" +# ) +# ) +# # Initialize and run the agent +# idea2image_agent = Idea2Image(gpt_api, vision_api) +# idea2image_agent.run(user_prompt, num_iterations, run_folder) + +# print("Idea space has been traversed.") + + +# Load environment variables and initialize the models +load_dotenv() +openai_api_key = os.getenv("OPENAI_API_KEY") +stability_api_key = os.getenv("STABLE_API_KEY") +vision_api = GPT4VisionAPI(api_key=openai_api_key) +sd_api = StableDiffusion(api_key=stability_api_key) +gpt_api = OpenAIChat(openai_api_key=openai_api_key) + +# Define the modified Idea2Image class here + +# Streamlit UI layout +st.title( + "Explore the infinite Multi-Modal Idea Space with Idea2Image" +) +user_prompt = st.text_input("Prompt for image generation:") +num_iterations = st.number_input( + "Enter the number of iterations for image improvement:", + min_value=1, + step=1, +) + +if st.button("Generate Image"): + run_folder = os.path.join( + "runs", datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + ) + os.makedirs(run_folder, exist_ok=True) + idea2image_agent = Idea2Image(gpt_api, vision_api) + + results = idea2image_agent.run_gradio( + user_prompt, num_iterations, run_folder + ) + + for i, (enriched_prompt, img_path, analysis) in enumerate( + results + ): + st.write(f"Iteration {i+1}:") + st.write("Enriched Prompt:", enriched_prompt) + if img_path: + st.image(img_path, caption="Generated Image") + else: + st.error("Failed to generate image") + if analysis: + st.write("Image Analysis:", analysis) + + st.success("Idea space has been traversed.") + +# [Add any additional necessary code adjustments] diff --git a/playground/demos/multimodal_tot/main.py b/playground/demos/multimodal_tot/main.py new file mode 100644 index 00000000..2d5ed653 --- /dev/null +++ b/playground/demos/multimodal_tot/main.py @@ -0,0 +1,114 @@ +""" +Multi Modal tree of thoughts that leverages the GPT-4 language model and the +Stable Diffusion model to generate a multimodal output and evaluate the +output based a metric from 0.0 to 1.0 and then run a search algorithm using DFS and BFS and return the best output. + + +task: Generate an image of a swarm of bees -> Image generator -> GPT4V evaluates the img from 0.0 to 1.0 -> DFS/BFS -> return the best output + + +- GPT4Vision will evaluate the image from 0.0 to 1.0 based on how likely it accomplishes the task +- DFS/BFS will search for the best output based on the evaluation from GPT4Vision +- The output will be a multimodal output that is a combination of the image and the text +- The output will be evaluated by GPT4Vision +- The prompt to the image generator will be optimized from the output of GPT4Vision and the search + +""" + +import os +from dotenv import load_dotenv +from swarms.models.gpt4_vision_api import GPT4VisionAPI +from swarms.models.stable_diffusion import StableDiffusion +from termcolor import colored + +# Load the environment variables +load_dotenv() + +# Get the API key from the environment +api_key = os.environ.get("OPENAI_API_KEY") +stable_api_key = os.environ.get("STABLE_API_KEY") + + +# Initialize the language model +llm = GPT4VisionAPI( + openai_api_key=api_key, + max_tokens=500, +) + +# IMG Generator +img_generator = StableDiffusion(api_key=stable_api_key) + + +# Initialize the language model +task = "Garden of Eden futuristic city graphic art" + + +def evaluate_img(llm, task: str, img: str): + EVAL_IMG = f""" + Evaluate the image: {img} on a scale from 0.0 to 1.0 based on how likely it accomplishes the task: {task}. Output nothing than the float representing the evaluated img. + """ + out = llm.run(task=EVAL_IMG, img=img) + out = float(out) + return out + + +def enrichment_prompt(starting_prompt: str, evaluated_img: str): + enrichment_task = ( + "Create a concise and effective image generation prompt" + " within 400 characters or less, based on Stable Diffusion" + " and Dalle best practices. Starting prompt:" + f" \n\n'{starting_prompt}'\n\nImprove the prompt with any" + " applicable details or keywords by considering the" + " following aspects: \n1. Subject details (like actions," + " emotions, environment) \n2. Artistic style (such as" + " surrealism, hyperrealism) \n3. Medium (digital painting," + " oil on canvas) \n4. Color themes and lighting (like warm" + " colors, cinematic lighting) \n5. Composition and framing" + " (close-up, wide-angle) \n6. Additional elements (like a" + " specific type of background, weather conditions) \n7. Any" + " other artistic or thematic details that can make the image" + " more vivid and compelling. 8. Based on the evaluation of" + " the first generated prompt used by the first prompt:" + f" {evaluated_img} Enrich the prompt to generate a more" + " compelling image. Output only a new prompt to create a" + " better image" + ) + return enrichment_task + + +# Main loop +max_iterations = 10 # Define the maximum number of iterations +best_score = 0 +best_image = None + +for _ in range(max_iterations): + # Generate an image and get its path + print(colored(f"Generating img for Task: {task}", "purple")) + + img_path = img_generator.run( + task=task + ) # This should return the file path of the generated image + img_path = img_path[0] + print(colored(f"Generated Image Path: {img_path}", "green")) + + # Evaluate the image by passing the file path + score = evaluate_img(llm, task, img_path) + print( + colored( + f"Evaluated Image Score: {score} for {img_path}", "cyan" + ) + ) + + # Update the best score and image path if necessary + if score > best_score: + best_score = score + best_image_path = img_path + + # Enrich the prompt based on the evaluation + prompt = enrichment_prompt(task, score) + print(colored(f"Enrichment Prompt: {prompt}", "yellow")) + + +# Output the best result +print("Best Image Path:", best_image_path) +print("Best Score:", best_score) diff --git a/playground/structs/tool_utils.py b/playground/structs/tool_utils.py index 8ded363a..ff7e17c2 100644 --- a/playground/structs/tool_utils.py +++ b/playground/structs/tool_utils.py @@ -13,7 +13,7 @@ def search_api(query: str) -> str: str: _description_ """ print(f"Searching API for {query}") - - + + tool_docs = scrape_tool_func_docs(search_api) -print(tool_docs) \ No newline at end of file +print(tool_docs) diff --git a/playground/tools/agent_with_tools.py b/playground/tools/agent_with_tools.py index 9436d5a6..ee4a8ef7 100644 --- a/playground/tools/agent_with_tools.py +++ b/playground/tools/agent_with_tools.py @@ -9,9 +9,7 @@ load_dotenv() api_key = os.environ.get("OPENAI_API_KEY") -llm = OpenAIChat( - api_key=api_key -) +llm = OpenAIChat(api_key=api_key) # @tool # def search_api(query: str) -> str: @@ -35,6 +33,7 @@ agent = Agent( ) out = agent.run( - "Use the search api to find the best restaurants in New York City." + "Use the search api to find the best restaurants in New York" + " City." ) -print(out) \ No newline at end of file +print(out) diff --git a/swarms/models/stable_diffusion.py b/swarms/models/stable_diffusion.py index 6256987e..7b363d02 100644 --- a/swarms/models/stable_diffusion.py +++ b/swarms/models/stable_diffusion.py @@ -8,6 +8,8 @@ from typing import List load_dotenv() +stable_api_key = os.environ.get("STABLE_API_KEY") + class StableDiffusion: """ @@ -45,7 +47,7 @@ class StableDiffusion: def __init__( self, - api_key: str, + api_key: str = stable_api_key, api_host: str = "https://api.stability.ai", cfg_scale: int = 7, height: int = 1024, diff --git a/swarms/prompts/agent_system_prompts.py b/swarms/prompts/agent_system_prompts.py index 7ff9faeb..3cf8447b 100644 --- a/swarms/prompts/agent_system_prompts.py +++ b/swarms/prompts/agent_system_prompts.py @@ -66,6 +66,7 @@ def agent_system_prompt_2(name: str): """ return AGENT_SYSTEM_PROMPT_2 + AGENT_SYSTEM_PROMPT_3 = f""" You are a fully autonomous agent serving the user in automating tasks, workflows, and activities. Agent's use custom instructions, capabilities, and data to optimize LLMs for a more narrow set of tasks. diff --git a/swarms/structs/agent.py b/swarms/structs/agent.py index d51d6975..9d490877 100644 --- a/swarms/structs/agent.py +++ b/swarms/structs/agent.py @@ -234,7 +234,7 @@ class Agent: self.preset_stopping_token = preset_stopping_token # self.system_prompt = AGENT_SYSTEM_PROMPT_3 - + # The max_loops will be set dynamically if the dynamic_loop if self.dynamic_loops: self.max_loops = "auto" @@ -268,17 +268,13 @@ class Agent: # If tools exist then add the tool docs usage to the sop if self.tools: - self.sop_list.append(self.tools_prompt_prep( - self.tool_docs, SCENARIOS + self.sop_list.append( + self.tools_prompt_prep(self.tool_docs, SCENARIOS) ) - ) - + def set_system_prompt(self, system_prompt: str): """Set the system prompt""" self.system_prompt = system_prompt - - - def provide_feedback(self, feedback: str) -> None: """Allow users to provide feedback on the responses.""" @@ -395,7 +391,6 @@ class Agent: except Exception as error: print(f"Error parsing JSON command: {error}") - def execute_tools(self, tool_name, params): """Execute the tool with the provided params""" tool = self.tool_find_by_name(tool_name) @@ -403,8 +398,7 @@ class Agent: # Execute the tool with the provided parameters tool_result = tool.run(**params) print(tool_result) - - + def parse_and_execute_tools(self, response: str): """Parse and execute the tools""" json_commands = self.extract_tool_commands(response) @@ -413,7 +407,6 @@ class Agent: params = command.get("parmas", {}) self.execute_tools(tool_name, params) - def truncate_history(self): """ Take the history and truncate it to fit into the model context length @@ -502,7 +495,6 @@ class Agent: ) ) - def activate_autonomous_agent(self): """Print the autonomous agent activation message""" try: diff --git a/swarms/tools/tool_utils.py b/swarms/tools/tool_utils.py index d66ad6f7..c189c9f5 100644 --- a/swarms/tools/tool_utils.py +++ b/swarms/tools/tool_utils.py @@ -29,6 +29,7 @@ def extract_tool_commands(self, text: str): except Exception as error: print(f"Error parsing JSON command: {error}") + def parse_and_execute_tools(response: str): """Parse and execute the tools""" json_commands = extract_tool_commands(response) @@ -37,6 +38,7 @@ def parse_and_execute_tools(response: str): params = command.get("parmas", {}) execute_tools(tool_name, params) + def execute_tools(self, tool_name, params): """Execute the tool with the provided params""" tool = self.tool_find_by_name(tool_name) @@ -44,4 +46,3 @@ def execute_tools(self, tool_name, params): # Execute the tool with the provided parameters tool_result = tool.run(**params) print(tool_result) -