[DEMO][MultiModal ToT]

2 years ago · 4eb60fea4d
parent 1f734ab206
commit 4eb60fea4d
11 changed files with 318 additions and 126 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,6 +9,7 @@ video/
 dataframe/

 static/generated
+runs
 swarms/__pycache__
 venv
 .DS_Store
--- a/playground/demos/idea_2_img/idea2img.py
+++ b/playground/demos/idea_2_img/idea2img.py
@ -1,96 +0,0 @@
-import os
-import datetime
-from dotenv import load_dotenv
-from swarms.models.stable_diffusion import StableDiffusion
-from swarms.models.gpt4_vision_api import GPT4VisionAPI
-from swarms.models import OpenAIChat
-from swarms.structs import Agent
-
-# Load environment variables
-load_dotenv()
-openai_api_key = os.getenv("OPENAI_API_KEY")
-stability_api_key = os.getenv("STABILITY_API_KEY")
-
-# Initialize the models
-vision_api = GPT4VisionAPI(api_key=openai_api_key)
-sd_api = StableDiffusion(api_key=stability_api_key)
-gpt_api = OpenAIChat(openai_api_key=openai_api_key)
-
-
-class Idea2Image(Agent):
-    def __init__(self, llm, vision_api):
-        super().__init__(llm=llm)
-        self.vision_api = vision_api
-
-    def run(self, initial_prompt, num_iterations, run_folder):
-        current_prompt = initial_prompt
-
-        for i in range(num_iterations):
-            print(f"Iteration {i}: Image generation and analysis")
-
-            if i == 0:
-                current_prompt = self.enrich_prompt(current_prompt)
-                print(f"Enriched Prompt: {current_prompt}")
-
-            img = sd_api.generate_and_move_image(
-                current_prompt, i, run_folder
-            )
-            if not img:
-                print("Failed to generate image")
-                break
-            print(f"Generated image at: {img}")
-
-            analysis = (
-                self.vision_api.run(img, current_prompt)
-                if img
-                else None
-            )
-            if analysis:
-                current_prompt += (
-                    ". " + analysis[:500]
-                )  # Ensure the analysis is concise
-                print(f"Image Analysis: {analysis}")
-            else:
-                print(f"Failed to analyze image at: {img}")
-
-    def enrich_prompt(self, prompt):
-        enrichment_task = (
-            "Create a concise and effective image generation prompt"
-            " within 400 characters or less, based on Stable"
-            " Diffusion and Dalle best practices. Starting prompt:"
-            f" \n\n'{prompt}'\n\nImprove the prompt with any"
-            " applicable details or keywords by considering the"
-            " following aspects: \n1. Subject details (like actions,"
-            " emotions, environment) \n2. Artistic style (such as"
-            " surrealism, hyperrealism) \n3. Medium (digital"
-            " painting, oil on canvas) \n4. Color themes and"
-            " lighting (like warm colors, cinematic lighting) \n5."
-            " Composition and framing (close-up, wide-angle) \n6."
-            " Additional elements (like a specific type of"
-            " background, weather conditions) \n7. Any other"
-            " artistic or thematic details that can make the image"
-            " more vivid and compelling."
-        )
-        llm_result = self.llm.generate([enrichment_task])
-        return (
-            llm_result.generations[0][0].text[:500]
-            if llm_result.generations
-            else None
-        )
-
-
-# User input and setup
-user_prompt = input("Prompt for image generation: ")
-num_iterations = int(
-    input("Enter the number of iterations for image improvement: ")
-)
-run_folder = os.path.join(
-    "runs", datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-)
-os.makedirs(run_folder, exist_ok=True)
-
-# Initialize and run the agent
-idea2image_agent = Idea2Image(gpt_api, vision_api)
-idea2image_agent.run(user_prompt, num_iterations, run_folder)
-
-print("Image improvement process completed.")
--- a/playground/demos/idea_2_img/main.py
+++ b/playground/demos/idea_2_img/main.py
@ -1,7 +0,0 @@
-"""
-Idea 2 img
-
-task -> gpt4 text -> dalle3 img -> gpt4vision img + text analyze img -> dalle3 img -> loop    
-    
-"""
-from swarms.models.gpt4_vision_api import GPT4VisionAPI
--- a/playground/demos/multimodal_tot/idea2img.py
+++ b/playground/demos/multimodal_tot/idea2img.py
@ -0,0 +1,185 @@
+import datetime
+import os
+
+import streamlit as st
+from dotenv import load_dotenv
+
+from swarms.models import OpenAIChat
+from swarms.models.gpt4_vision_api import GPT4VisionAPI
+from swarms.models.stable_diffusion import StableDiffusion
+from swarms.structs import Agent
+
+# Load environment variables
+load_dotenv()
+openai_api_key = os.getenv("OPENAI_API_KEY")
+stability_api_key = os.getenv("STABLE_API_KEY")
+
+# Initialize the models
+vision_api = GPT4VisionAPI(api_key=openai_api_key)
+sd_api = StableDiffusion(api_key=stability_api_key)
+gpt_api = OpenAIChat(openai_api_key=openai_api_key)
+
+
+class Idea2Image(Agent):
+    """Agent that iteratively refines an image-generation prompt.
+
+    Every iteration generates an image from the current prompt with the
+    module-level ``sd_api`` generator, asks ``vision_api`` to analyze the
+    result, and appends the first 500 characters of that analysis to the
+    prompt used by the next iteration.
+
+    Args:
+        llm: Text model; its ``generate`` method is used by
+            ``enrich_prompt``.
+        vision_api: Vision model; ``run(img, prompt)`` is called on each
+            generated image.
+    """
+
+    def __init__(self, llm, vision_api):
+        super().__init__(llm=llm)
+        self.vision_api = vision_api
+
+    def run(self, initial_prompt, num_iterations, run_folder):
+        """Run the refinement loop, printing progress to stdout.
+
+        Args:
+            initial_prompt: Starting prompt; it is enriched once, on the
+                first iteration.
+            num_iterations: Maximum number of generate/analyze rounds.
+            run_folder: Passed to ``sd_api.generate_and_move_image``
+                together with the iteration index.
+
+        The loop stops early when an image cannot be generated.
+        """
+        current_prompt = initial_prompt
+
+        for i in range(num_iterations):
+            print(f"Iteration {i}: Image generation and analysis")
+
+            if i == 0:
+                current_prompt = self.enrich_prompt(current_prompt)
+                print(f"Enriched Prompt: {current_prompt}")
+
+            img = sd_api.generate_and_move_image(
+                current_prompt, i, run_folder
+            )
+            if not img:
+                print("Failed to generate image")
+                break
+            print(f"Generated image at: {img}")
+
+            # img is known to be truthy here, so analyze unconditionally.
+            analysis = self.vision_api.run(img, current_prompt)
+            if analysis:
+                # Only the first 500 characters are kept so the prompt
+                # does not grow by a full analysis every iteration.
+                current_prompt += ". " + analysis[:500]
+                print(f"Image Analysis: {analysis}")
+            else:
+                print(f"Failed to analyze image at: {img}")
+
+    def enrich_prompt(self, prompt):
+        """Ask the LLM to rewrite ``prompt`` as a richer image prompt.
+
+        Args:
+            prompt: The prompt to enrich.
+
+        Returns:
+            The first 500 characters of the first generation's text, or
+            ``prompt`` unchanged when the LLM returns no generations (so
+            callers always receive a usable string).
+        """
+        enrichment_task = (
+            "Create a concise and effective image generation prompt"
+            " within 400 characters or less, based on Stable"
+            " Diffusion and Dalle best practices to help it create"
+            " much better images. Starting prompt:"
+            f" \n\n'{prompt}'\n\nImprove the prompt with any"
+            " applicable details or keywords by considering the"
+            " following aspects: \n1. Subject details (like actions,"
+            " emotions, environment) \n2. Artistic style (such as"
+            " surrealism, hyperrealism) \n3. Medium (digital"
+            " painting, oil on canvas) \n4. Color themes and"
+            " lighting (like warm colors, cinematic lighting) \n5."
+            " Composition and framing (close-up, wide-angle) \n6."
+            " Additional elements (like a specific type of"
+            " background, weather conditions) \n7. Any other"
+            " artistic or thematic details that can make the image"
+            " more vivid and compelling. Help the image generator"
+            " create better images by enriching the prompt."
+        )
+        llm_result = self.llm.generate([enrichment_task])
+        if not llm_result.generations:
+            # Fall back to the caller's prompt instead of returning None,
+            # which would break the string concatenation in the loops.
+            return prompt
+        return llm_result.generations[0][0].text[:500]
+
+    def run_gradio(self, initial_prompt, num_iterations, run_folder):
+        """Run the refinement loop and collect the results for a UI.
+
+        Unlike ``run``, nothing is printed and the loop does not stop
+        when image generation fails; the failed round is recorded with
+        an empty image path so the caller can report it.
+
+        Args:
+            initial_prompt: Starting prompt; it is enriched once, on the
+                first iteration.
+            num_iterations: Number of generate/analyze rounds.
+            run_folder: Passed to ``sd_api.generate_and_move_image``
+                together with the iteration index.
+
+        Returns:
+            A list with one ``(prompt_used, img_path, analysis)`` tuple
+            per iteration; ``analysis`` is ``None`` when no image was
+            generated.
+        """
+        results = []
+        current_prompt = initial_prompt
+
+        for i in range(num_iterations):
+            if i == 0:
+                # Keep the enriched prompt as the base for later rounds,
+                # matching what ``run`` does.
+                current_prompt = self.enrich_prompt(current_prompt)
+
+            img_path = sd_api.generate_and_move_image(
+                current_prompt, i, run_folder
+            )
+            analysis = (
+                self.vision_api.run(img_path, current_prompt)
+                if img_path
+                else None
+            )
+
+            # Record the prompt that produced this image before it is
+            # extended with the analysis for the next round.
+            results.append((current_prompt, img_path, analysis))
+            if analysis:
+                current_prompt += ". " + analysis[:500]
+
+        return results
+
+
+# print(
+#     colored("---------------------------------------- MultiModal Tree of Thought agents for Image Generation", "cyan", attrs=["bold"])
+# )
+# # User input and setup
+# user_prompt = input("Prompt for image generation: ")
+# num_iterations = int(
+#     input("Enter the number of iterations for image improvement: ")
+# )
+# run_folder = os.path.join(
+#     "runs", datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+# )
+# os.makedirs(run_folder, exist_ok=True)
+
+# print(
+#     colored(
+#         f"---------------------------------- Running Multi-Modal Tree of thoughts agent with {num_iterations} iterations", "green"
+#     )
+# )
+# # Initialize and run the agent
+# idea2image_agent = Idea2Image(gpt_api, vision_api)
+# idea2image_agent.run(user_prompt, num_iterations, run_folder)
+
+# print("Idea space has been traversed.")
+
+
+# Streamlit UI layout.
+# The API clients (vision_api, sd_api, gpt_api) are created once near the
+# top of this module, so they are not re-created here.
+st.title(
+    "Explore the infinite Multi-Modal Idea Space with Idea2Image"
+)
+user_prompt = st.text_input("Prompt for image generation:")
+num_iterations = st.number_input(
+    "Enter the number of iterations for image improvement:",
+    min_value=1,
+    step=1,
+)
+
+if st.button("Generate Image"):
+    if not user_prompt.strip():
+        # Avoid spending API calls on an empty prompt.
+        st.warning("Please enter a prompt for image generation.")
+    else:
+        # Each run writes into its own timestamped folder under "runs".
+        run_folder = os.path.join(
+            "runs", datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        )
+        os.makedirs(run_folder, exist_ok=True)
+        idea2image_agent = Idea2Image(gpt_api, vision_api)
+
+        results = idea2image_agent.run_gradio(
+            user_prompt, num_iterations, run_folder
+        )
+
+        for i, (enriched_prompt, img_path, analysis) in enumerate(
+            results
+        ):
+            st.write(f"Iteration {i+1}:")
+            st.write("Enriched Prompt:", enriched_prompt)
+            if img_path:
+                st.image(img_path, caption="Generated Image")
+            else:
+                st.error("Failed to generate image")
+            if analysis:
+                st.write("Image Analysis:", analysis)
+
+        st.success("Idea space has been traversed.")
--- a/playground/demos/multimodal_tot/main.py
+++ b/playground/demos/multimodal_tot/main.py
@ -0,0 +1,114 @@
+"""
+Multi-modal tree of thoughts that leverages the GPT-4 language model and the
+Stable Diffusion model to generate a multimodal output, evaluate the
+output based on a metric from 0.0 to 1.0, and then run a search algorithm using DFS and BFS to return the best output.
+    
+    
+task: Generate an image of a swarm of bees -> Image generator -> GPT4V evaluates the img from 0.0 to 1.0 -> DFS/BFS -> return the best output
+
+
+- GPT4Vision will evaluate the image from 0.0 to 1.0 based on how likely it accomplishes the task
+- DFS/BFS will search for the best output based on the evaluation from GPT4Vision
+- The output will be a multimodal output that is a combination of the image and the text
+- The output will be evaluated by GPT4Vision
+- The prompt to the image generator will be optimized from the output of GPT4Vision and the search
+
+"""
+
+import os
+from dotenv import load_dotenv
+from swarms.models.gpt4_vision_api import GPT4VisionAPI
+from swarms.models.stable_diffusion import StableDiffusion
+from termcolor import colored
+
+# Load the environment variables
+load_dotenv()
+
+# Get the API key from the environment
+api_key = os.environ.get("OPENAI_API_KEY")
+stable_api_key = os.environ.get("STABLE_API_KEY")
+
+
+# Initialize the language model
+llm = GPT4VisionAPI(
+    openai_api_key=api_key,
+    max_tokens=500,
+)
+
+# IMG Generator
+img_generator = StableDiffusion(api_key=stable_api_key)
+
+
+# Define the task (the starting prompt for the image generator)
+task = "Garden of Eden futuristic city graphic art"
+
+
+def evaluate_img(llm, task: str, img: str):
+    """Ask the vision model to score an image against a task.
+
+    Args:
+        llm: Model object exposing ``run(task=..., img=...)``.
+        task: Description of what the image should accomplish.
+        img: Path of the image to evaluate.
+
+    Returns:
+        float: The score parsed from the model's reply, clamped to the
+        requested 0.0-1.0 range.
+
+    Raises:
+        ValueError: If the reply contains no number at all.
+    """
+    import re
+
+    EVAL_IMG = f"""
+    Evaluate the image: {img} on a scale from 0.0 to 1.0 based on how likely it accomplishes the task: {task}. Output nothing than the float representing the evaluated img.
+    """
+    out = llm.run(task=EVAL_IMG, img=img)
+    reply = str(out).strip()
+    try:
+        score = float(reply)
+    except ValueError:
+        # The model may wrap the score in extra text ("Score: 0.7.");
+        # fall back to the first number found in the reply.
+        match = re.search(r"[-+]?(?:\d+\.?\d*|\.\d+)", reply)
+        if match is None:
+            raise ValueError(
+                f"No numeric score found in model output: {out!r}"
+            ) from None
+        score = float(match.group())
+    # Keep the score inside the scale the prompt asked for.
+    return min(max(score, 0.0), 1.0)
+
+
+def enrichment_prompt(starting_prompt: str, evaluated_img: str):
+    """Build the instruction text asking a model for an improved prompt.
+
+    Args:
+        starting_prompt: Prompt to improve; embedded in single quotes.
+        evaluated_img: Evaluation of the generated image; interpolated
+            into the final instruction.
+
+    Returns:
+        str: The assembled instruction text.
+    """
+    preamble = (
+        "Create a concise and effective image generation prompt within"
+        " 400 characters or less, based on Stable Diffusion and Dalle"
+        " best practices. Starting prompt:"
+        f" \n\n'{starting_prompt}'\n\n"
+        "Improve the prompt with any applicable details or keywords by"
+        " considering the following aspects: "
+    )
+    # One entry per numbered aspect; entries are joined with " \n".
+    aspects = [
+        "1. Subject details (like actions, emotions, environment)",
+        "2. Artistic style (such as surrealism, hyperrealism)",
+        "3. Medium (digital painting, oil on canvas)",
+        "4. Color themes and lighting (like warm colors, cinematic"
+        " lighting)",
+        "5. Composition and framing (close-up, wide-angle)",
+        "6. Additional elements (like a specific type of background,"
+        " weather conditions)",
+        "7. Any other artistic or thematic details that can make the"
+        " image more vivid and compelling. 8. Based on the evaluation"
+        " of the first generated prompt used by the first prompt:"
+        f" {evaluated_img} Enrich the prompt to generate a more"
+        " compelling image. Output only a new prompt to create a"
+        " better image",
+    ]
+    return preamble + "\n" + " \n".join(aspects)
+
+
+# Main loop: generate an image, score it, and remember the best one.
+max_iterations = 10  # Define the maximum number of iterations
+best_score = 0
+# Initialised up front so the final report works even when no image
+# scores above 0.
+best_image_path = None
+
+for _ in range(max_iterations):
+    # Generate an image and get its path.
+    # "magenta" is used because termcolor has no "purple" color.
+    print(colored(f"Generating img for Task: {task}", "magenta"))
+
+    # The result is indexed below, so it is expected to be a sequence of
+    # generated image paths.
+    generated = img_generator.run(task=task)
+    if not generated:
+        print(colored("Image generation returned no image", "red"))
+        continue
+    img_path = generated[0]
+    print(colored(f"Generated Image Path: {img_path}", "green"))
+
+    # Evaluate the image by passing the file path
+    score = evaluate_img(llm, task, img_path)
+    print(
+        colored(
+            f"Evaluated Image Score: {score} for {img_path}", "cyan"
+        )
+    )
+
+    # Update the best score and image path if necessary
+    if score > best_score:
+        best_score = score
+        best_image_path = img_path
+
+    # Enrich the prompt based on the evaluation.
+    # NOTE(review): the enrichment text is only printed; `task` is never
+    # updated from it, so every iteration generates from the same prompt.
+    prompt = enrichment_prompt(task, score)
+    print(colored(f"Enrichment Prompt: {prompt}", "yellow"))
+
+
+# Output the best result
+print("Best Image Path:", best_image_path)
+print("Best Score:", best_score)
--- a/playground/structs/tool_utils.py
+++ b/playground/structs/tool_utils.py
@ -13,7 +13,7 @@ def search_api(query: str) -> str:
        str: _description_
    """
    print(f"Searching API for {query}")
-    
-    
+
+
 tool_docs = scrape_tool_func_docs(search_api)
-print(tool_docs)
+print(tool_docs)
--- a/playground/tools/agent_with_tools.py
+++ b/playground/tools/agent_with_tools.py
@ -9,9 +9,7 @@ load_dotenv()
 api_key = os.environ.get("OPENAI_API_KEY")


-llm = OpenAIChat(
-    api_key=api_key
-)
+llm = OpenAIChat(api_key=api_key)

 # @tool
 # def search_api(query: str) -> str:
@ -35,6 +33,7 @@ agent = Agent(
 )

 out = agent.run(
-    "Use the search api to find the best restaurants in New York City."
+    "Use the search api to find the best restaurants in New York"
+    " City."
 )
-print(out)
+print(out)
--- a/swarms/models/stable_diffusion.py
+++ b/swarms/models/stable_diffusion.py
@ -8,6 +8,8 @@ from typing import List

 load_dotenv()

+stable_api_key = os.environ.get("STABLE_API_KEY")
+

 class StableDiffusion:
    """
@ -45,7 +47,7 @@ class StableDiffusion:

    def __init__(
        self,
-        api_key: str,
+        api_key: str = stable_api_key,
        api_host: str = "https://api.stability.ai",
        cfg_scale: int = 7,
        height: int = 1024,
--- a/swarms/prompts/agent_system_prompts.py
+++ b/swarms/prompts/agent_system_prompts.py
@ -66,6 +66,7 @@ def agent_system_prompt_2(name: str):
    """
    return AGENT_SYSTEM_PROMPT_2

+
 AGENT_SYSTEM_PROMPT_3 = f"""
    You are a fully autonomous agent serving the user in automating tasks, workflows, and activities. 
    Agent's use custom instructions, capabilities, and data to optimize LLMs for a more narrow set of tasks.
--- a/swarms/structs/agent.py
+++ b/swarms/structs/agent.py
@ -234,7 +234,7 @@ class Agent:
        self.preset_stopping_token = preset_stopping_token

        # self.system_prompt = AGENT_SYSTEM_PROMPT_3
-        
+
        # The max_loops will be set dynamically if the dynamic_loop
        if self.dynamic_loops:
            self.max_loops = "auto"
@ -268,17 +268,13 @@ class Agent:

        # If tools exist then add the tool docs usage to the sop
        if self.tools:
-            self.sop_list.append(self.tools_prompt_prep(
-                self.tool_docs, SCENARIOS
+            self.sop_list.append(
+                self.tools_prompt_prep(self.tool_docs, SCENARIOS)
            )
-        )
-            
+
    def set_system_prompt(self, system_prompt: str):
        """Set the system prompt"""
        self.system_prompt = system_prompt
-        
-    
-            

    def provide_feedback(self, feedback: str) -> None:
        """Allow users to provide feedback on the responses."""
@ -395,7 +391,6 @@ class Agent:
            except Exception as error:
                print(f"Error parsing JSON command: {error}")

-    
    def execute_tools(self, tool_name, params):
        """Execute the tool with the provided params"""
        tool = self.tool_find_by_name(tool_name)
@ -403,8 +398,7 @@ class Agent:
            # Execute the tool with the provided parameters
            tool_result = tool.run(**params)
            print(tool_result)
-            
-    
+
    def parse_and_execute_tools(self, response: str):
        """Parse and execute the tools"""
        json_commands = self.extract_tool_commands(response)
@ -413,7 +407,6 @@ class Agent:
            params = command.get("parmas", {})
            self.execute_tools(tool_name, params)

-
    def truncate_history(self):
        """
        Take the history and truncate it to fit into the model context length
@ -502,7 +495,6 @@ class Agent:
            )
        )

-
    def activate_autonomous_agent(self):
        """Print the autonomous agent activation message"""
        try:
--- a/swarms/tools/tool_utils.py
+++ b/swarms/tools/tool_utils.py
@ -29,6 +29,7 @@ def extract_tool_commands(self, text: str):
        except Exception as error:
            print(f"Error parsing JSON command: {error}")

+
 def parse_and_execute_tools(response: str):
    """Parse and execute the tools"""
    json_commands = extract_tool_commands(response)
@ -37,6 +38,7 @@ def parse_and_execute_tools(response: str):
        params = command.get("parmas", {})
        execute_tools(tool_name, params)

+
 def execute_tools(self, tool_name, params):
    """Execute the tool with the provided params"""
    tool = self.tool_find_by_name(tool_name)
@ -44,4 +46,3 @@ def execute_tools(self, tool_name, params):
        # Execute the tool with the provided parameters
        tool_result = tool.run(**params)
        print(tool_result)
-