[DEMO][MultiModal ToT]

pull/260/head
Kye 1 year ago
parent 1f734ab206
commit 4eb60fea4d

1
.gitignore vendored

@ -9,6 +9,7 @@ video/
dataframe/ dataframe/
static/generated static/generated
runs
swarms/__pycache__ swarms/__pycache__
venv venv
.DS_Store .DS_Store

@ -1,96 +0,0 @@
import os
import datetime
from dotenv import load_dotenv
from swarms.models.stable_diffusion import StableDiffusion
from swarms.models.gpt4_vision_api import GPT4VisionAPI
from swarms.models import OpenAIChat
from swarms.structs import Agent
# Load variables from the .env file, then read the two API keys.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
stability_api_key = os.environ.get("STABILITY_API_KEY")

# Module-level model clients shared by the agent defined below.
vision_api = GPT4VisionAPI(api_key=openai_api_key)
sd_api = StableDiffusion(api_key=stability_api_key)
gpt_api = OpenAIChat(openai_api_key=openai_api_key)
class Idea2Image(Agent):
    """Agent that refines an image prompt through generate/analyse rounds.

    Each round generates an image from the current prompt with the
    module-level ``sd_api`` client, asks ``vision_api`` to analyse the
    result, and appends a truncated copy of that analysis to the prompt.
    """

    def __init__(self, llm, vision_api):
        """Initialise the base ``Agent`` and keep the vision client.

        Args:
            llm: Language model forwarded to ``Agent``; ``enrich_prompt``
                calls ``self.llm.generate`` on it.
            vision_api: Object whose ``run(img, prompt)`` returns a text
                analysis of the image (or a falsy value).
        """
        super().__init__(llm=llm)
        self.vision_api = vision_api

    def run(self, initial_prompt, num_iterations, run_folder):
        """Run the generate -> analyse -> extend-prompt loop.

        Args:
            initial_prompt: Prompt text supplied by the user.
            num_iterations: Number of rounds to attempt.
            run_folder: Directory handed to
                ``sd_api.generate_and_move_image`` for the outputs.

        The loop stops early when image generation returns a falsy value.
        Nothing is returned; progress is reported with ``print``.
        """
        current_prompt = initial_prompt
        for i in range(num_iterations):
            print(f"Iteration {i}: Image generation and analysis")

            if i == 0:
                enriched = self.enrich_prompt(current_prompt)
                if enriched:
                    current_prompt = enriched
                else:
                    # enrich_prompt returns None when the LLM yields no
                    # generations; keep the user's prompt instead of
                    # passing None to the image generator.
                    print(
                        "Prompt enrichment returned nothing; using the"
                        " original prompt"
                    )
                print(f"Enriched Prompt: {current_prompt}")

            img = sd_api.generate_and_move_image(
                current_prompt, i, run_folder
            )
            if not img:
                print("Failed to generate image")
                break
            print(f"Generated image at: {img}")

            # img is known to be truthy here, so analyse unconditionally.
            analysis = self.vision_api.run(img, current_prompt)
            if analysis:
                # Only the first 500 characters are appended, to keep the
                # growing prompt concise.
                current_prompt += ". " + analysis[:500]
                print(f"Image Analysis: {analysis}")
            else:
                print(f"Failed to analyze image at: {img}")

    def enrich_prompt(self, prompt):
        """Ask the LLM to rewrite ``prompt`` as a richer image prompt.

        Args:
            prompt: The starting prompt to improve.

        Returns:
            The text of the first generation truncated to 500 characters,
            or ``None`` when the LLM returned no generations.
        """
        enrichment_task = (
            "Create a concise and effective image generation prompt"
            " within 400 characters or less, based on Stable"
            " Diffusion and Dalle best practices. Starting prompt:"
            f" \n\n'{prompt}'\n\nImprove the prompt with any"
            " applicable details or keywords by considering the"
            " following aspects: \n1. Subject details (like actions,"
            " emotions, environment) \n2. Artistic style (such as"
            " surrealism, hyperrealism) \n3. Medium (digital"
            " painting, oil on canvas) \n4. Color themes and"
            " lighting (like warm colors, cinematic lighting) \n5."
            " Composition and framing (close-up, wide-angle) \n6."
            " Additional elements (like a specific type of"
            " background, weather conditions) \n7. Any other"
            " artistic or thematic details that can make the image"
            " more vivid and compelling."
        )
        llm_result = self.llm.generate([enrichment_task])
        generations = llm_result.generations
        # Guard both levels: an empty outer list and an empty first entry.
        if not generations or not generations[0]:
            return None
        return generations[0][0].text[:500]
# Ask the user for the prompt and the number of refinement rounds.
user_prompt = input("Prompt for image generation: ")
num_iterations = int(
    input("Enter the number of iterations for image improvement: ")
)

# Each run writes into its own timestamped folder under "runs".
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
run_folder = os.path.join("runs", timestamp)
os.makedirs(run_folder, exist_ok=True)

# Build the agent and run the refinement loop.
idea2image_agent = Idea2Image(gpt_api, vision_api)
idea2image_agent.run(user_prompt, num_iterations, run_folder)
print("Image improvement process completed.")

@ -1,7 +0,0 @@
"""
Idea 2 img
task -> gpt4 text -> dalle3 img -> gpt4vision img + text analyze img -> dalle3 img -> loop
"""
from swarms.models.gpt4_vision_api import GPT4VisionAPI

@ -0,0 +1,185 @@
import datetime
import os
import streamlit as st
from dotenv import load_dotenv
from swarms.models import OpenAIChat
from swarms.models.gpt4_vision_api import GPT4VisionAPI
from swarms.models.stable_diffusion import StableDiffusion
from swarms.structs import Agent
# Load variables from the .env file, then read the two API keys.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
stability_api_key = os.environ.get("STABLE_API_KEY")

# Module-level model clients shared by the agent and the UI below.
vision_api = GPT4VisionAPI(api_key=openai_api_key)
sd_api = StableDiffusion(api_key=stability_api_key)
gpt_api = OpenAIChat(openai_api_key=openai_api_key)
class Idea2Image(Agent):
    """Agent that refines an image prompt through generate/analyse rounds.

    Each round generates an image from the current prompt with the
    module-level ``sd_api`` client, asks ``vision_api`` to analyse the
    result, and appends a truncated copy of that analysis to the prompt.
    ``run`` reports progress on stdout; ``run_gradio`` collects the
    per-round results for a UI to display.
    """

    def __init__(self, llm, vision_api):
        """Initialise the base ``Agent`` and keep the vision client.

        Args:
            llm: Language model forwarded to ``Agent``; ``enrich_prompt``
                calls ``self.llm.generate`` on it.
            vision_api: Object whose ``run(img, prompt)`` returns a text
                analysis of the image (or a falsy value).
        """
        super().__init__(llm=llm)
        self.vision_api = vision_api

    def run(self, initial_prompt, num_iterations, run_folder):
        """Run the generate -> analyse -> extend-prompt loop, printing progress.

        Args:
            initial_prompt: Prompt text supplied by the user.
            num_iterations: Number of rounds to attempt.
            run_folder: Directory handed to
                ``sd_api.generate_and_move_image`` for the outputs.

        The loop stops early when image generation returns a falsy value.
        Nothing is returned.
        """
        current_prompt = initial_prompt
        for i in range(num_iterations):
            print(f"Iteration {i}: Image generation and analysis")

            if i == 0:
                enriched = self.enrich_prompt(current_prompt)
                if enriched:
                    current_prompt = enriched
                else:
                    # enrich_prompt returns None when the LLM yields no
                    # generations; keep the user's prompt instead of
                    # passing None to the image generator.
                    print(
                        "Prompt enrichment returned nothing; using the"
                        " original prompt"
                    )
                print(f"Enriched Prompt: {current_prompt}")

            img = sd_api.generate_and_move_image(
                current_prompt, i, run_folder
            )
            if not img:
                print("Failed to generate image")
                break
            print(f"Generated image at: {img}")

            # img is known to be truthy here, so analyse unconditionally.
            analysis = self.vision_api.run(img, current_prompt)
            if analysis:
                # Only the first 500 characters are appended, to keep the
                # growing prompt concise.
                current_prompt += ". " + analysis[:500]
                print(f"Image Analysis: {analysis}")
            else:
                print(f"Failed to analyze image at: {img}")

    def enrich_prompt(self, prompt):
        """Ask the LLM to rewrite ``prompt`` as a richer image prompt.

        Args:
            prompt: The starting prompt to improve.

        Returns:
            The text of the first generation truncated to 500 characters,
            or ``None`` when the LLM returned no generations.
        """
        enrichment_task = (
            "Create a concise and effective image generation prompt"
            " within 400 characters or less, based on Stable"
            " Diffusion and Dalle best practices to help it create"
            " much better images. Starting prompt:"
            f" \n\n'{prompt}'\n\nImprove the prompt with any"
            " applicable details or keywords by considering the"
            " following aspects: \n1. Subject details (like actions,"
            " emotions, environment) \n2. Artistic style (such as"
            " surrealism, hyperrealism) \n3. Medium (digital"
            " painting, oil on canvas) \n4. Color themes and"
            " lighting (like warm colors, cinematic lighting) \n5."
            " Composition and framing (close-up, wide-angle) \n6."
            " Additional elements (like a specific type of"
            " background, weather conditions) \n7. Any other"
            " artistic or thematic details that can make the image"
            " more vivid and compelling. Help the image generator"
            " create better images by enriching the prompt."
        )
        llm_result = self.llm.generate([enrichment_task])
        generations = llm_result.generations
        # Guard both levels: an empty outer list and an empty first entry.
        if not generations or not generations[0]:
            return None
        return generations[0][0].text[:500]

    def run_gradio(self, initial_prompt, num_iterations, run_folder):
        """Run the refinement loop and collect per-round results for a UI.

        Unlike ``run``, this method does not stop when an image fails to
        generate; the failed round is recorded with a falsy image path.

        Args:
            initial_prompt: Prompt text supplied by the user.
            num_iterations: Number of rounds to run.
            run_folder: Directory handed to
                ``sd_api.generate_and_move_image`` for the outputs.

        Returns:
            A list with one ``(prompt_used, img_path, analysis)`` tuple per
            round; ``analysis`` is ``None`` when no image was produced.
        """
        results = []
        current_prompt = initial_prompt
        for i in range(num_iterations):
            if i == 0:
                # Store the enriched prompt so later rounds build on it.
                # Previously it was used for the first image only and
                # then discarded. Fall back to the user's prompt when
                # enrichment yields nothing.
                current_prompt = (
                    self.enrich_prompt(current_prompt) or current_prompt
                )

            # Snapshot the prompt actually used for this round before the
            # analysis is appended to it.
            prompt_used = current_prompt
            img_path = sd_api.generate_and_move_image(
                prompt_used, i, run_folder
            )
            analysis = (
                self.vision_api.run(img_path, prompt_used)
                if img_path
                else None
            )
            if analysis:
                # Only the first 500 characters are appended, to keep the
                # growing prompt concise.
                current_prompt += ". " + analysis[:500]
            results.append((prompt_used, img_path, analysis))
        return results
# print(
# colored("---------------------------------------- MultiModal Tree of Thought agents for Image Generation", "cyan", attrs=["bold"])
# )
# # User input and setup
# user_prompt = input("Prompt for image generation: ")
# num_iterations = int(
# input("Enter the number of iterations for image improvement: ")
# )
# run_folder = os.path.join(
# "runs", datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
# )
# os.makedirs(run_folder, exist_ok=True)
# print(
# colored(
# f"---------------------------------- Running Multi-Modal Tree of thoughts agent with {num_iterations} iterations", "green"
# )
# )
# # Initialize and run the agent
# idea2image_agent = Idea2Image(gpt_api, vision_api)
# idea2image_agent.run(user_prompt, num_iterations, run_folder)
# print("Idea space has been traversed.")
# Streamlit UI layout.
# The API keys and the vision_api / sd_api / gpt_api clients are already
# created once near the top of this file, so they are reused here rather
# than being loaded and constructed a second time.
st.title(
    "Explore the infinite Multi-Modal Idea Space with Idea2Image"
)
user_prompt = st.text_input("Prompt for image generation:")
num_iterations = st.number_input(
    "Enter the number of iterations for image improvement:",
    min_value=1,
    step=1,
)

if st.button("Generate Image"):
    if not user_prompt.strip():
        # Avoid spending API calls on an empty prompt.
        st.warning("Please enter a prompt before generating an image.")
    else:
        # Each run writes into its own timestamped folder under "runs".
        run_folder = os.path.join(
            "runs", datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        )
        os.makedirs(run_folder, exist_ok=True)

        idea2image_agent = Idea2Image(gpt_api, vision_api)
        results = idea2image_agent.run_gradio(
            user_prompt, num_iterations, run_folder
        )

        # Render one section per iteration: prompt, image (or error), analysis.
        for i, (enriched_prompt, img_path, analysis) in enumerate(
            results
        ):
            st.write(f"Iteration {i+1}:")
            st.write("Enriched Prompt:", enriched_prompt)
            if img_path:
                st.image(img_path, caption="Generated Image")
            else:
                st.error("Failed to generate image")
            if analysis:
                st.write("Image Analysis:", analysis)

        st.success("Idea space has been traversed.")

@ -0,0 +1,114 @@
"""
Multi Modal tree of thoughts that leverages the GPT-4 language model and the
Stable Diffusion model to generate a multimodal output and evaluate the
output based on a metric from 0.0 to 1.0 and then run a search algorithm using DFS and BFS and return the best output.
task: Generate an image of a swarm of bees -> Image generator -> GPT4V evaluates the img from 0.0 to 1.0 -> DFS/BFS -> return the best output
- GPT4Vision will evaluate the image from 0.0 to 1.0 based on how likely it accomplishes the task
- DFS/BFS will search for the best output based on the evaluation from GPT4Vision
- The output will be a multimodal output that is a combination of the image and the text
- The output will be evaluated by GPT4Vision
- The prompt to the image generator will be optimized from the output of GPT4Vision and the search
"""
import os
from dotenv import load_dotenv
from swarms.models.gpt4_vision_api import GPT4VisionAPI
from swarms.models.stable_diffusion import StableDiffusion
from termcolor import colored
# Load variables from the .env file, then read the two API keys.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
stable_api_key = os.getenv("STABLE_API_KEY")

# Vision-capable language model; the loop below uses it to score images.
llm = GPT4VisionAPI(openai_api_key=api_key, max_tokens=500)

# Image generator
img_generator = StableDiffusion(api_key=stable_api_key)

# Task description used for both image generation and evaluation.
task = "Garden of Eden futuristic city graphic art"
def evaluate_img(llm, task: str, img: str) -> float:
    """Ask the vision model to score an image against a task.

    Args:
        llm: Object exposing ``run(task=..., img=...)`` that returns the
            model's reply.
        task: Description of what the image is supposed to accomplish.
        img: Image reference passed to ``llm.run`` and embedded in the
            evaluation prompt.

    Returns:
        The first number found in the model's reply, clamped to the
        0.0-1.0 scale requested in the prompt.

    Raises:
        ValueError: If the reply contains no number at all.
    """
    import re  # Local import so the block does not depend on file-level imports.

    EVAL_IMG = (
        "\n"
        f"Evaluate the image: {img} on a scale from 0.0 to 1.0 based on how likely it accomplishes the task: {task}. Output nothing than the float representing the evaluated img."
        "\n"
    )
    out = llm.run(task=EVAL_IMG, img=img)

    # Models often wrap the number in extra text ("Score: 0.8"), which a
    # bare float() call rejects, so pull out the first numeric token.
    match = re.search(r"\d*\.\d+|\d+", str(out))
    if match is None:
        raise ValueError(
            f"Could not find a numeric score in model output: {out!r}"
        )

    # Clamp to the scale the prompt asks for.
    return max(0.0, min(1.0, float(match.group())))
def enrichment_prompt(starting_prompt: str, evaluated_img: str):
    """Build the instruction text that asks a model for a better image prompt.

    Args:
        starting_prompt: The prompt to improve; quoted inside the text.
        evaluated_img: The evaluation of the previous image; interpolated
            into the final, eighth instruction.

    Returns:
        The assembled instruction string. Nothing is sent to a model here.
    """
    # The seven numbered aspects the model is asked to consider.
    aspects = (
        "Subject details (like actions, emotions, environment)",
        "Artistic style (such as surrealism, hyperrealism)",
        "Medium (digital painting, oil on canvas)",
        "Color themes and lighting (like warm colors, cinematic lighting)",
        "Composition and framing (close-up, wide-angle)",
        (
            "Additional elements (like a specific type of background,"
            " weather conditions)"
        ),
        (
            "Any other artistic or thematic details that can make the"
            " image more vivid and compelling."
        ),
    )
    # Each item is rendered as "\n<n>. <text>" and the items are separated
    # by a single space.
    checklist = " ".join(
        f"\n{number}. {aspect}"
        for number, aspect in enumerate(aspects, start=1)
    )

    opening = (
        "Create a concise and effective image generation prompt within"
        " 400 characters or less, based on Stable Diffusion and Dalle"
        " best practices. Starting prompt:"
        f" \n\n'{starting_prompt}'\n\n"
        "Improve the prompt with any applicable details or keywords by"
        " considering the following aspects: "
    )
    closing = (
        " 8. Based on the evaluation of the first generated prompt used"
        " by the first prompt:"
        f" {evaluated_img} Enrich the prompt to generate a more"
        " compelling image. Output only a new prompt to create a better"
        " image"
    )
    return opening + checklist + closing
# Main loop: generate an image, score it, and remember the best one.
max_iterations = 10  # Define the maximum number of iterations
best_score = 0
# Initialised under the same name that is updated in the loop and printed
# at the end, so the final print cannot hit an undefined variable.
best_image_path = None

for _ in range(max_iterations):
    # Generate an image and get its path.
    # "magenta" replaces "purple", which termcolor does not define.
    print(colored(f"Generating img for Task: {task}", "magenta"))
    img_path = img_generator.run(
        task=task
    )  # This should return the file path of the generated image
    img_path = img_path[0]
    print(colored(f"Generated Image Path: {img_path}", "green"))

    # Evaluate the image by passing the file path
    score = evaluate_img(llm, task, img_path)
    print(
        colored(
            f"Evaluated Image Score: {score} for {img_path}", "cyan"
        )
    )

    # Update the best score and image path if necessary. The first image
    # is always kept so a run where every score is 0 still reports one.
    if best_image_path is None or score > best_score:
        best_score = score
        best_image_path = img_path

    # Enrich the prompt based on the evaluation.
    # NOTE(review): the enrichment text is only printed; `task` is never
    # updated from it, so every iteration generates from the same prompt.
    prompt = enrichment_prompt(task, score)
    print(colored(f"Enrichment Prompt: {prompt}", "yellow"))

# Output the best result
print("Best Image Path:", best_image_path)
print("Best Score:", best_score)

@ -9,9 +9,7 @@ load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY") api_key = os.environ.get("OPENAI_API_KEY")
llm = OpenAIChat( llm = OpenAIChat(api_key=api_key)
api_key=api_key
)
# @tool # @tool
# def search_api(query: str) -> str: # def search_api(query: str) -> str:
@ -35,6 +33,7 @@ agent = Agent(
) )
out = agent.run( out = agent.run(
"Use the search api to find the best restaurants in New York City." "Use the search api to find the best restaurants in New York"
" City."
) )
print(out) print(out)

@ -8,6 +8,8 @@ from typing import List
load_dotenv() load_dotenv()
stable_api_key = os.environ.get("STABLE_API_KEY")
class StableDiffusion: class StableDiffusion:
""" """
@ -45,7 +47,7 @@ class StableDiffusion:
def __init__( def __init__(
self, self,
api_key: str, api_key: str = stable_api_key,
api_host: str = "https://api.stability.ai", api_host: str = "https://api.stability.ai",
cfg_scale: int = 7, cfg_scale: int = 7,
height: int = 1024, height: int = 1024,

@ -66,6 +66,7 @@ def agent_system_prompt_2(name: str):
""" """
return AGENT_SYSTEM_PROMPT_2 return AGENT_SYSTEM_PROMPT_2
AGENT_SYSTEM_PROMPT_3 = f""" AGENT_SYSTEM_PROMPT_3 = f"""
You are a fully autonomous agent serving the user in automating tasks, workflows, and activities. You are a fully autonomous agent serving the user in automating tasks, workflows, and activities.
Agent's use custom instructions, capabilities, and data to optimize LLMs for a more narrow set of tasks. Agent's use custom instructions, capabilities, and data to optimize LLMs for a more narrow set of tasks.

@ -268,18 +268,14 @@ class Agent:
# If tools exist then add the tool docs usage to the sop # If tools exist then add the tool docs usage to the sop
if self.tools: if self.tools:
self.sop_list.append(self.tools_prompt_prep( self.sop_list.append(
self.tool_docs, SCENARIOS self.tools_prompt_prep(self.tool_docs, SCENARIOS)
) )
)
def set_system_prompt(self, system_prompt: str): def set_system_prompt(self, system_prompt: str):
"""Set the system prompt""" """Set the system prompt"""
self.system_prompt = system_prompt self.system_prompt = system_prompt
def provide_feedback(self, feedback: str) -> None: def provide_feedback(self, feedback: str) -> None:
"""Allow users to provide feedback on the responses.""" """Allow users to provide feedback on the responses."""
self.feedback.append(feedback) self.feedback.append(feedback)
@ -395,7 +391,6 @@ class Agent:
except Exception as error: except Exception as error:
print(f"Error parsing JSON command: {error}") print(f"Error parsing JSON command: {error}")
def execute_tools(self, tool_name, params): def execute_tools(self, tool_name, params):
"""Execute the tool with the provided params""" """Execute the tool with the provided params"""
tool = self.tool_find_by_name(tool_name) tool = self.tool_find_by_name(tool_name)
@ -404,7 +399,6 @@ class Agent:
tool_result = tool.run(**params) tool_result = tool.run(**params)
print(tool_result) print(tool_result)
def parse_and_execute_tools(self, response: str): def parse_and_execute_tools(self, response: str):
"""Parse and execute the tools""" """Parse and execute the tools"""
json_commands = self.extract_tool_commands(response) json_commands = self.extract_tool_commands(response)
@ -413,7 +407,6 @@ class Agent:
params = command.get("parmas", {}) params = command.get("parmas", {})
self.execute_tools(tool_name, params) self.execute_tools(tool_name, params)
def truncate_history(self): def truncate_history(self):
""" """
Take the history and truncate it to fit into the model context length Take the history and truncate it to fit into the model context length
@ -502,7 +495,6 @@ class Agent:
) )
) )
def activate_autonomous_agent(self): def activate_autonomous_agent(self):
"""Print the autonomous agent activation message""" """Print the autonomous agent activation message"""
try: try:

@ -29,6 +29,7 @@ def extract_tool_commands(self, text: str):
except Exception as error: except Exception as error:
print(f"Error parsing JSON command: {error}") print(f"Error parsing JSON command: {error}")
def parse_and_execute_tools(response: str): def parse_and_execute_tools(response: str):
"""Parse and execute the tools""" """Parse and execute the tools"""
json_commands = extract_tool_commands(response) json_commands = extract_tool_commands(response)
@ -37,6 +38,7 @@ def parse_and_execute_tools(response: str):
params = command.get("parmas", {}) params = command.get("parmas", {})
execute_tools(tool_name, params) execute_tools(tool_name, params)
def execute_tools(self, tool_name, params): def execute_tools(self, tool_name, params):
"""Execute the tool with the provided params""" """Execute the tool with the provided params"""
tool = self.tool_find_by_name(tool_name) tool = self.tool_find_by_name(tool_name)
@ -44,4 +46,3 @@ def execute_tools(self, tool_name, params):
# Execute the tool with the provided parameters # Execute the tool with the provided parameters
tool_result = tool.run(**params) tool_result = tool.run(**params)
print(tool_result) print(tool_result)

Loading…
Cancel
Save