[DEMO][MultiModal ToT]

2 years ago · 4eb60fea4d
parent 1f734ab206
commit 4eb60fea4d
11 changed files with 318 additions and 126 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,6 +9,7 @@ video/
 dataframe/
 static/generated
 runs
 swarms/__pycache__
 venv
 .DS_Store
--- a/playground/demos/idea_2_img/idea2img.py
+++ b/playground/demos/idea_2_img/idea2img.py
@ -1,96 +0,0 @@
 import os
 import datetime
 from dotenv import load_dotenv
 from swarms.models.stable_diffusion import StableDiffusion
 from swarms.models.gpt4_vision_api import GPT4VisionAPI
 from swarms.models import OpenAIChat
 from swarms.structs import Agent
 # Load environment variables (python-dotenv) before any key is read below.
 load_dotenv()
 # os.getenv yields None when a variable is unset; nothing here checks for that.
 openai_api_key = os.getenv("OPENAI_API_KEY")
 # NOTE(review): swarms/models/stable_diffusion.py reads "STABLE_API_KEY", not
 # "STABILITY_API_KEY" -- confirm which name deployments actually export.
 stability_api_key = os.getenv("STABILITY_API_KEY")
 # Initialize the models: one client per service, shared by the whole script.
 vision_api = GPT4VisionAPI(api_key=openai_api_key)
 sd_api = StableDiffusion(api_key=stability_api_key)
 gpt_api = OpenAIChat(openai_api_key=openai_api_key)
 class Idea2Image(Agent):
    """Agent that generates an image and refines its prompt over several rounds.

    Every round renders the current prompt with the module-level ``sd_api``
    generator, asks ``vision_api`` to analyse the result, and appends the
    first 500 characters of that analysis to the prompt for the next round.
    """

    def __init__(self, llm, vision_api):
        """Keep the language model (via ``Agent``) and the vision model.

        Args:
            llm: Model forwarded to ``Agent``; ``enrich_prompt`` calls its
                ``generate`` method.
            vision_api: Object whose ``run(img, prompt)`` analyses an image.
        """
        super().__init__(llm=llm)
        self.vision_api = vision_api

    def run(self, initial_prompt, num_iterations, run_folder):
        """Run the generate/analyse loop, printing progress to stdout.

        Args:
            initial_prompt: Raw user prompt; enriched once before round 0.
            num_iterations: Maximum number of rounds.
            run_folder: Directory handed to ``sd_api.generate_and_move_image``.

        The loop stops early as soon as image generation reports failure.
        """
        current_prompt = initial_prompt
        for i in range(num_iterations):
            print(f"Iteration {i}: Image generation and analysis")
            if i == 0:
                # enrich_prompt returns None when the LLM gives nothing back;
                # keep the user's prompt in that case so the generator and the
                # "+=" below never receive None.
                enriched = self.enrich_prompt(current_prompt)
                if enriched:
                    current_prompt = enriched
                print(f"Enriched Prompt: {current_prompt}")
            img = sd_api.generate_and_move_image(
                current_prompt, i, run_folder
            )
            if not img:
                print("Failed to generate image")
                break
            print(f"Generated image at: {img}")
            # img is known to be truthy here, so no further guard is needed.
            analysis = self.vision_api.run(img, current_prompt)
            if analysis:
                current_prompt += (
                    ". " + analysis[:500]
                )  # Ensure the analysis is concise
                print(f"Image Analysis: {analysis}")
            else:
                print(f"Failed to analyze image at: {img}")

    def enrich_prompt(self, prompt):
        """Ask the LLM for a richer image prompt built from ``prompt``.

        Returns:
            The first 500 characters of the first generation's text, or
            ``None`` when the LLM result contains no generation.
        """
        enrichment_task = (
            "Create a concise and effective image generation prompt"
            " within 400 characters or less, based on Stable"
            " Diffusion and Dalle best practices. Starting prompt:"
            f" \n\n'{prompt}'\n\nImprove the prompt with any"
            " applicable details or keywords by considering the"
            " following aspects: \n1. Subject details (like actions,"
            " emotions, environment) \n2. Artistic style (such as"
            " surrealism, hyperrealism) \n3. Medium (digital"
            " painting, oil on canvas) \n4. Color themes and"
            " lighting (like warm colors, cinematic lighting) \n5."
            " Composition and framing (close-up, wide-angle) \n6."
            " Additional elements (like a specific type of"
            " background, weather conditions) \n7. Any other"
            " artistic or thematic details that can make the image"
            " more vivid and compelling."
        )
        llm_result = self.llm.generate([enrichment_task])
        generations = llm_result.generations
        # Guard the inner list as well: an empty first batch used to raise
        # IndexError instead of yielding the documented None.
        if not generations or not generations[0]:
            return None
        return generations[0][0].text[:500]
 # Collect the prompt and the iteration count from the terminal
 user_prompt = input("Prompt for image generation: ")
 iterations_answer = input(
    "Enter the number of iterations for image improvement: "
 )
 num_iterations = int(iterations_answer)
 # Each run writes into its own timestamped sub-folder of "runs"
 run_stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
 run_folder = os.path.join("runs", run_stamp)
 os.makedirs(run_folder, exist_ok=True)
 # Build the agent from the shared model clients and start the loop
 idea2image_agent = Idea2Image(gpt_api, vision_api)
 idea2image_agent.run(user_prompt, num_iterations, run_folder)
 print("Image improvement process completed.")
--- a/playground/demos/idea_2_img/main.py
+++ b/playground/demos/idea_2_img/main.py
@ -1,7 +0,0 @@
 """
 Idea 2 img
 task -> gpt4 text -> dalle3 img -> gpt4vision img + text analyze img -> dalle3 img -> loop    
 """
 from swarms.models.gpt4_vision_api import GPT4VisionAPI
--- a/playground/demos/multimodal_tot/idea2img.py
+++ b/playground/demos/multimodal_tot/idea2img.py
@ -0,0 +1,185 @@
 import datetime
 import os
 import streamlit as st
 from dotenv import load_dotenv
 from swarms.models import OpenAIChat
 from swarms.models.gpt4_vision_api import GPT4VisionAPI
 from swarms.models.stable_diffusion import StableDiffusion
 from swarms.structs import Agent
 # Load environment variables (python-dotenv) before any key is read below.
 load_dotenv()
 # os.getenv yields None when a variable is unset; nothing here checks for that.
 openai_api_key = os.getenv("OPENAI_API_KEY")
 stability_api_key = os.getenv("STABLE_API_KEY")
 # Initialize the models: vision analysis, image generation and text LLM.
 # Idea2Image reads sd_api as a module-level global rather than a parameter.
 vision_api = GPT4VisionAPI(api_key=openai_api_key)
 sd_api = StableDiffusion(api_key=stability_api_key)
 gpt_api = OpenAIChat(openai_api_key=openai_api_key)
 class Idea2Image(Agent):
    """Agent that generates an image and refines its prompt over several rounds.

    Every round renders the current prompt with the module-level ``sd_api``
    generator, asks ``vision_api`` to analyse the result, and appends the
    first 500 characters of that analysis to the prompt for the next round.
    ``run`` reports progress on stdout; ``run_gradio`` returns the per-round
    data so a UI can display it.
    """

    def __init__(self, llm, vision_api):
        """Keep the language model (via ``Agent``) and the vision model.

        Args:
            llm: Model forwarded to ``Agent``; ``enrich_prompt`` calls its
                ``generate`` method.
            vision_api: Object whose ``run(img, prompt)`` analyses an image.
        """
        super().__init__(llm=llm)
        self.vision_api = vision_api

    def _enrich_or_keep(self, prompt):
        """Return the enriched prompt, or ``prompt`` itself if enrichment fails."""
        enriched = self.enrich_prompt(prompt)
        return enriched if enriched else prompt

    def run(self, initial_prompt, num_iterations, run_folder):
        """Run the generate/analyse loop, printing progress to stdout.

        Args:
            initial_prompt: Raw user prompt; enriched once before round 0.
            num_iterations: Maximum number of rounds.
            run_folder: Directory handed to ``sd_api.generate_and_move_image``.

        The loop stops early as soon as image generation reports failure.
        """
        current_prompt = initial_prompt
        for i in range(num_iterations):
            print(f"Iteration {i}: Image generation and analysis")
            if i == 0:
                # Falls back to the user's prompt when the LLM returns
                # nothing, so None never reaches the generator or "+=".
                current_prompt = self._enrich_or_keep(current_prompt)
                print(f"Enriched Prompt: {current_prompt}")
            img = sd_api.generate_and_move_image(
                current_prompt, i, run_folder
            )
            if not img:
                print("Failed to generate image")
                break
            print(f"Generated image at: {img}")
            # img is known to be truthy here, so no further guard is needed.
            analysis = self.vision_api.run(img, current_prompt)
            if analysis:
                current_prompt += (
                    ". " + analysis[:500]
                )  # Ensure the analysis is concise
                print(f"Image Analysis: {analysis}")
            else:
                print(f"Failed to analyze image at: {img}")

    def enrich_prompt(self, prompt):
        """Ask the LLM for a richer image prompt built from ``prompt``.

        Returns:
            The first 500 characters of the first generation's text, or
            ``None`` when the LLM result contains no generation.
        """
        enrichment_task = (
            "Create a concise and effective image generation prompt"
            " within 400 characters or less, based on Stable"
            " Diffusion and Dalle best practices to help it create"
            " much better images. Starting prompt:"
            f" \n\n'{prompt}'\n\nImprove the prompt with any"
            " applicable details or keywords by considering the"
            " following aspects: \n1. Subject details (like actions,"
            " emotions, environment) \n2. Artistic style (such as"
            " surrealism, hyperrealism) \n3. Medium (digital"
            " painting, oil on canvas) \n4. Color themes and"
            " lighting (like warm colors, cinematic lighting) \n5."
            " Composition and framing (close-up, wide-angle) \n6."
            " Additional elements (like a specific type of"
            " background, weather conditions) \n7. Any other"
            " artistic or thematic details that can make the image"
            " more vivid and compelling. Help the image generator"
            " create better images by enriching the prompt."
        )
        llm_result = self.llm.generate([enrichment_task])
        generations = llm_result.generations
        # Guard the inner list as well: an empty first batch used to raise
        # IndexError instead of yielding the documented None.
        if not generations or not generations[0]:
            return None
        return generations[0][0].text[:500]

    def run_gradio(self, initial_prompt, num_iterations, run_folder):
        """Run the generate/analyse loop and collect the outcome of each round.

        Args:
            initial_prompt: Raw user prompt; enriched once before round 0.
            num_iterations: Number of rounds to run.
            run_folder: Directory handed to ``sd_api.generate_and_move_image``.

        Returns:
            A list with one ``(prompt_used, img_path, analysis)`` tuple per
            round. ``analysis`` is ``None`` when no image was produced.
        """
        results = []
        current_prompt = initial_prompt
        for i in range(num_iterations):
            if i == 0:
                # Store the enrichment in current_prompt so later rounds
                # build on it; previously it was used for round 0 only and
                # then lost, unlike in run().
                current_prompt = self._enrich_or_keep(current_prompt)
            prompt_used = current_prompt
            img_path = sd_api.generate_and_move_image(
                prompt_used, i, run_folder
            )
            analysis = (
                self.vision_api.run(img_path, prompt_used)
                if img_path
                else None
            )
            if analysis:
                current_prompt += (
                    ". " + analysis[:500]
                )  # Ensuring the analysis is concise
            results.append((prompt_used, img_path, analysis))
        return results
 # print(
 #     colored("---------------------------------------- MultiModal Tree of Thought agents for Image Generation", "cyan", attrs=["bold"])
 # )
 # # User input and setup
 # user_prompt = input("Prompt for image generation: ")
 # num_iterations = int(
 #     input("Enter the number of iterations for image improvement: ")
 # )
 # run_folder = os.path.join(
 #     "runs", datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
 # )
 # os.makedirs(run_folder, exist_ok=True)
 # print(
 #     colored(
 #         f"---------------------------------- Running Multi-Modal Tree of thoughts agent with {num_iterations} iterations", "green"
 #     )
 # )
 # # Initialize and run the agent
 # idea2image_agent = Idea2Image(gpt_api, vision_api)
 # idea2image_agent.run(user_prompt, num_iterations, run_folder)
 # print("Idea space has been traversed.")
 # The environment variables and the vision_api / sd_api / gpt_api clients are
 # already created once near the top of this module. They are reused from here
 # on instead of being constructed a second time on every script run.
 # Streamlit UI layout: a prompt box, an iteration counter and a button that
 # runs the agent and renders one section per iteration.
 st.title(
    "Explore the infinite Multi-Modal Idea Space with Idea2Image"
 )
 user_prompt = st.text_input("Prompt for image generation:")
 num_iterations = st.number_input(
    "Enter the number of iterations for image improvement:",
    min_value=1,
    step=1,
 )
 if st.button("Generate Image"):
    if not user_prompt.strip():
        # Do not spend LLM / image-generation calls on a blank prompt.
        st.warning("Please enter a prompt before generating an image.")
    else:
        # Each click writes into its own timestamped sub-folder of "runs".
        run_folder = os.path.join(
            "runs", datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        )
        os.makedirs(run_folder, exist_ok=True)
        idea2image_agent = Idea2Image(gpt_api, vision_api)
        # range() inside run_gradio needs a real int.
        results = idea2image_agent.run_gradio(
            user_prompt, int(num_iterations), run_folder
        )
        for i, (enriched_prompt, img_path, analysis) in enumerate(
            results
        ):
            st.write(f"Iteration {i+1}:")
            st.write("Enriched Prompt:", enriched_prompt)
            if img_path:
                st.image(img_path, caption="Generated Image")
            else:
                st.error("Failed to generate image")
            if analysis:
                st.write("Image Analysis:", analysis)
        st.success("Idea space has been traversed.")
 # [Add any additional necessary code adjustments]
--- a/playground/demos/multimodal_tot/main.py
+++ b/playground/demos/multimodal_tot/main.py
@ -0,0 +1,114 @@
 """
 Multi-modal tree of thoughts that leverages the GPT-4 language model and the
 Stable Diffusion model to generate a multimodal output, evaluates the
 output on a metric from 0.0 to 1.0, then runs a search algorithm using DFS and BFS and returns the best output.
 task: Generate an image of a swarm of bees -> Image generator -> GPT4V evaluates the img from 0.0 to 1.0 -> DFS/BFS -> return the best output
 - GPT4Vision will evaluate the image from 0.0 to 1.0 based on how likely it accomplishes the task
 - DFS/BFS will search for the best output based on the evaluation from GPT4Vision
 - The output will be a multimodal output that is a combination of the image and the text
 - The output will be evaluated by GPT4Vision
 - The prompt to the image generator will be optimized from the output of GPT4Vision and the search
 """
 import os
 from dotenv import load_dotenv
 from swarms.models.gpt4_vision_api import GPT4VisionAPI
 from swarms.models.stable_diffusion import StableDiffusion
 from termcolor import colored
 # Load the environment variables (python-dotenv) before the keys are read
 load_dotenv()
 # Get the API keys from the environment; .get yields None when one is unset
 api_key = os.environ.get("OPENAI_API_KEY")
 stable_api_key = os.environ.get("STABLE_API_KEY")
 # Initialize the vision-language model used to score generated images
 llm = GPT4VisionAPI(
    openai_api_key=api_key,
    max_tokens=500,
 )
 # IMG Generator
 img_generator = StableDiffusion(api_key=stable_api_key)
 # Task description: used as the image prompt and in the evaluation prompt
 task = "Garden of Eden futuristic city graphic art"
 def evaluate_img(llm, task: str, img: str):
    """Ask ``llm`` to score how well the image at ``img`` accomplishes ``task``.

    Args:
        llm: Object exposing ``run(task=..., img=...)`` and returning the
            model's reply.
        task: Description of what the image is supposed to show.
        img: Image reference forwarded to ``llm.run`` and named in the prompt.

    Returns:
        The score parsed from the reply, as a float.

    Raises:
        ValueError: If the reply contains no number at all.
    """
    import re

    EVAL_IMG = f"""
    Evaluate the image: {img} on a scale from 0.0 to 1.0 based on how likely it accomplishes the task: {task}. Output nothing than the float representing the evaluated img.
    """
    reply = str(llm.run(task=EVAL_IMG, img=img)).strip()
    try:
        return float(reply)
    except ValueError:
        # Models often wrap the number in text ("Score: 0.8", "0.8.");
        # fall through and pull the first numeric token out instead.
        pass
    found = re.search(r"[-+]?(?:\d+\.\d+|\d+|\.\d+)", reply)
    if found is None:
        raise ValueError(
            f"Could not find a numeric score in model output: {reply!r}"
        )
    return float(found.group())
 def enrichment_prompt(starting_prompt: str, evaluated_img: str):
    """Build the instruction text that asks an LLM for an improved image prompt.

    Args:
        starting_prompt: Prompt to improve; inserted between single quotes.
        evaluated_img: Evaluation of the previous image; interpolated as-is
            (the main loop in this file passes the numeric score).

    Returns:
        The assembled instruction string. No model is called here.
    """
    opening = (
        "Create a concise and effective image generation prompt within"
        " 400 characters or less, based on Stable Diffusion and Dalle"
        " best practices. Starting prompt:"
        f" \n\n'{starting_prompt}'\n\n"
        "Improve the prompt with any applicable details or keywords by"
        " considering the following aspects: "
    )
    # One entry per numbered aspect; each keeps its trailing space so the
    # joined text matches the single-string form exactly.
    aspects = (
        "\n1. Subject details (like actions, emotions, environment) ",
        "\n2. Artistic style (such as surrealism, hyperrealism) ",
        "\n3. Medium (digital painting, oil on canvas) ",
        "\n4. Color themes and lighting (like warm colors, cinematic"
        " lighting) ",
        "\n5. Composition and framing (close-up, wide-angle) ",
        "\n6. Additional elements (like a specific type of background,"
        " weather conditions) ",
        "\n7. Any other artistic or thematic details that can make the"
        " image more vivid and compelling. ",
    )
    closing = (
        "8. Based on the evaluation of the first generated prompt used by"
        " the first prompt:"
        f" {evaluated_img} Enrich the prompt to generate a more compelling"
        " image. Output only a new prompt to create a better image"
    )
    return opening + "".join(aspects) + closing
 # Main loop: generate an image, score it, remember the best one so far.
 max_iterations = 10  # Define the maximum number of iterations
 best_score = 0
 best_image = None
 # Initialised up front so the summary at the end cannot raise NameError when
 # no iteration records a result.
 best_image_path = None
 for _ in range(max_iterations):
    # Generate an image and get its path.
    # "magenta" replaces "purple", which is not a termcolor colour name and
    # made colored() raise KeyError on the very first iteration.
    print(colored(f"Generating img for Task: {task}", "magenta"))
    img_path = img_generator.run(
        task=task
    )  # Indexed below, so this is expected to be a sequence of file paths
    if not img_path:
        print(colored("Image generation returned no file paths", "red"))
        continue
    img_path = img_path[0]
    print(colored(f"Generated Image Path: {img_path}", "green"))
    # Evaluate the image by passing the file path
    score = evaluate_img(llm, task, img_path)
    print(
        colored(
            f"Evaluated Image Score: {score} for {img_path}", "cyan"
        )
    )
    # Update the best score and image path if necessary. The first scored
    # image is always recorded, even when its score is 0.
    if best_image_path is None or score > best_score:
        best_score = score
        best_image_path = img_path
        best_image = img_path
    # Enrich the prompt based on the evaluation.
    # NOTE(review): this text is only printed; it is never sent to a model or
    # fed back into `task`, so every iteration renders the same prompt.
    prompt = enrichment_prompt(task, score)
    print(colored(f"Enrichment Prompt: {prompt}", "yellow"))
 # Output the best result
 print("Best Image Path:", best_image_path)
 print("Best Score:", best_score)
--- a/playground/structs/tool_utils.py
+++ b/playground/structs/tool_utils.py
@ -13,7 +13,7 @@ def search_api(query: str) -> str:
        str: _description_
    """
    print(f"Searching API for {query}")
-    
+
-    
+
 tool_docs = scrape_tool_func_docs(search_api)
-print(tool_docs)
+print(tool_docs)
--- a/playground/tools/agent_with_tools.py
+++ b/playground/tools/agent_with_tools.py
@ -9,9 +9,7 @@ load_dotenv()
 api_key = os.environ.get("OPENAI_API_KEY")
-llm = OpenAIChat(
+llm = OpenAIChat(api_key=api_key)
    api_key=api_key
 )
 # @tool
 # def search_api(query: str) -> str:
@ -35,6 +33,7 @@ agent = Agent(
 )
 out = agent.run(
-    "Use the search api to find the best restaurants in New York City."
+    "Use the search api to find the best restaurants in New York"
    " City."
 )
-print(out)
+print(out)
--- a/swarms/models/stable_diffusion.py
+++ b/swarms/models/stable_diffusion.py
@ -8,6 +8,8 @@ from typing import List
 load_dotenv()
 stable_api_key = os.environ.get("STABLE_API_KEY")
 class StableDiffusion:
    """
@ -45,7 +47,7 @@ class StableDiffusion:
    def __init__(
        self,
-        api_key: str,
+        api_key: str = stable_api_key,
        api_host: str = "https://api.stability.ai",
        cfg_scale: int = 7,
        height: int = 1024,
--- a/swarms/prompts/agent_system_prompts.py
+++ b/swarms/prompts/agent_system_prompts.py
@ -66,6 +66,7 @@ def agent_system_prompt_2(name: str):
    """
    return AGENT_SYSTEM_PROMPT_2
 AGENT_SYSTEM_PROMPT_3 = f"""
    You are a fully autonomous agent serving the user in automating tasks, workflows, and activities. 
    Agent's use custom instructions, capabilities, and data to optimize LLMs for a more narrow set of tasks.
--- a/swarms/structs/agent.py
+++ b/swarms/structs/agent.py
@ -234,7 +234,7 @@ class Agent:
        self.preset_stopping_token = preset_stopping_token
        # self.system_prompt = AGENT_SYSTEM_PROMPT_3
-        
+
        # The max_loops will be set dynamically if the dynamic_loop
        if self.dynamic_loops:
            self.max_loops = "auto"
@ -268,17 +268,13 @@ class Agent:
        # If tools exist then add the tool docs usage to the sop
        if self.tools:
-            self.sop_list.append(self.tools_prompt_prep(
+            self.sop_list.append(
-                self.tool_docs, SCENARIOS
+                self.tools_prompt_prep(self.tool_docs, SCENARIOS)
            )
-        )
+
    def set_system_prompt(self, system_prompt: str) -> None:
        """Set the system prompt.

        Args:
            system_prompt: Text stored on ``self.system_prompt``, replacing
                whatever value was there before.
        """
        self.system_prompt = system_prompt
    def provide_feedback(self, feedback: str) -> None:
        """Allow users to provide feedback on the responses."""
@ -395,7 +391,6 @@ class Agent:
            except Exception as error:
                print(f"Error parsing JSON command: {error}")
    def execute_tools(self, tool_name, params):
        """Execute the tool with the provided params"""
        tool = self.tool_find_by_name(tool_name)
@ -403,8 +398,7 @@ class Agent:
            # Execute the tool with the provided parameters
            tool_result = tool.run(**params)
            print(tool_result)
-            
+
    def parse_and_execute_tools(self, response: str):
        """Parse and execute the tools"""
        json_commands = self.extract_tool_commands(response)
@ -413,7 +407,6 @@ class Agent:
            params = command.get("parmas", {})
            self.execute_tools(tool_name, params)
    def truncate_history(self):
        """
        Take the history and truncate it to fit into the model context length
@ -502,7 +495,6 @@ class Agent:
            )
        )
    def activate_autonomous_agent(self):
        """Print the autonomous agent activation message"""
        try:
--- a/swarms/tools/tool_utils.py
+++ b/swarms/tools/tool_utils.py
@ -29,6 +29,7 @@ def extract_tool_commands(self, text: str):
        except Exception as error:
            print(f"Error parsing JSON command: {error}")
 def parse_and_execute_tools(response: str):
    """Parse and execute the tools"""
    json_commands = extract_tool_commands(response)
@ -37,6 +38,7 @@ def parse_and_execute_tools(response: str):
        params = command.get("parmas", {})
        execute_tools(tool_name, params)
 def execute_tools(self, tool_name, params):
    """Execute the tool with the provided params"""
    tool = self.tool_find_by_name(tool_name)
@ -44,4 +46,3 @@ def execute_tools(self, tool_name, params):
        # Execute the tool with the provided parameters
        tool_result = tool.run(**params)
        print(tool_result)