parent
1f734ab206
commit
4eb60fea4d
@ -1,96 +0,0 @@
|
||||
import os
|
||||
import datetime
|
||||
from dotenv import load_dotenv
|
||||
from swarms.models.stable_diffusion import StableDiffusion
|
||||
from swarms.models.gpt4_vision_api import GPT4VisionAPI
|
||||
from swarms.models import OpenAIChat
|
||||
from swarms.structs import Agent
|
||||
|
||||
# Pull credentials from the .env file / process environment.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
stability_api_key = os.environ.get("STABILITY_API_KEY")

# Build the model clients shared by the rest of this script.
vision_api = GPT4VisionAPI(api_key=openai_api_key)
sd_api = StableDiffusion(api_key=stability_api_key)
gpt_api = OpenAIChat(openai_api_key=openai_api_key)
|
||||
|
||||
|
||||
class Idea2Image(Agent):
    """Agent that iteratively refines an image-generation prompt.

    The starting prompt is enriched once by the language model, an image is
    generated from it through the module-level ``sd_api`` client, and the
    vision model's analysis of that image is appended to the prompt that is
    used for the next iteration.
    """

    def __init__(self, llm, vision_api):
        """Store the models used by the agent.

        Args:
            llm: Language model forwarded to the ``Agent`` base class;
                ``enrich_prompt`` calls its ``generate`` method.
            vision_api: Vision model; ``run(img, prompt)`` is called on it
                to analyze every generated image.
        """
        super().__init__(llm=llm)
        self.vision_api = vision_api

    def run(self, initial_prompt, num_iterations, run_folder):
        """Generate and analyze images for up to ``num_iterations`` rounds.

        Progress is reported with ``print``; nothing is returned. The loop
        stops early when image generation yields a falsy result.

        Args:
            initial_prompt: Prompt text supplied by the user.
            num_iterations: Maximum number of generate/analyze rounds.
            run_folder: Forwarded to ``sd_api.generate_and_move_image``
                (presumably the output directory -- confirm against the
                StableDiffusion wrapper).
        """
        current_prompt = initial_prompt

        for i in range(num_iterations):
            print(f"Iteration {i}: Image generation and analysis")

            if i == 0:
                # enrich_prompt() returns None when the LLM produced nothing;
                # keep the user's prompt in that case so the generator never
                # receives None and the `+=` below cannot fail.
                enriched = self.enrich_prompt(current_prompt)
                current_prompt = enriched or current_prompt
                print(f"Enriched Prompt: {current_prompt}")

            img = sd_api.generate_and_move_image(
                current_prompt, i, run_folder
            )
            if not img:
                print("Failed to generate image")
                break
            print(f"Generated image at: {img}")

            # `img` is guaranteed truthy here because of the guard above.
            analysis = self.vision_api.run(img, current_prompt)
            if analysis:
                # Cap the appended analysis so the prompt stays concise.
                current_prompt += ". " + analysis[:500]
                print(f"Image Analysis: {analysis}")
            else:
                print(f"Failed to analyze image at: {img}")

    def enrich_prompt(self, prompt):
        """Ask the language model for a richer version of ``prompt``.

        Args:
            prompt: The prompt text to improve.

        Returns:
            The first generation's text truncated to 500 characters, or
            ``None`` when the model returned no generations.
        """
        enrichment_task = (
            "Create a concise and effective image generation prompt"
            " within 400 characters or less, based on Stable"
            " Diffusion and Dalle best practices. Starting prompt:"
            f" \n\n'{prompt}'\n\nImprove the prompt with any"
            " applicable details or keywords by considering the"
            " following aspects: \n1. Subject details (like actions,"
            " emotions, environment) \n2. Artistic style (such as"
            " surrealism, hyperrealism) \n3. Medium (digital"
            " painting, oil on canvas) \n4. Color themes and"
            " lighting (like warm colors, cinematic lighting) \n5."
            " Composition and framing (close-up, wide-angle) \n6."
            " Additional elements (like a specific type of"
            " background, weather conditions) \n7. Any other"
            " artistic or thematic details that can make the image"
            " more vivid and compelling."
        )
        llm_result = self.llm.generate([enrichment_task])
        generations = llm_result.generations
        # Guard both levels: an empty outer list and an empty first batch.
        if not generations or not generations[0]:
            return None
        return generations[0][0].text[:500]
|
||||
|
||||
|
||||
# Collect the run parameters interactively.
user_prompt = input("Prompt for image generation: ")
num_iterations = int(
    input("Enter the number of iterations for image improvement: ")
)

# Every run writes into its own timestamped directory under "runs".
_stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
run_folder = os.path.join("runs", _stamp)
os.makedirs(run_folder, exist_ok=True)

# Build the agent and start the refinement loop.
idea2image_agent = Idea2Image(gpt_api, vision_api)
idea2image_agent.run(user_prompt, num_iterations, run_folder)

print("Image improvement process completed.")
|
@ -1,7 +0,0 @@
|
||||
"""
|
||||
Idea 2 img
|
||||
|
||||
task -> gpt4 text -> dalle3 img -> gpt4vision img + text analyze img -> dalle3 img -> loop
|
||||
|
||||
"""
|
||||
from swarms.models.gpt4_vision_api import GPT4VisionAPI
|
@ -0,0 +1,185 @@
|
||||
import datetime
|
||||
import os
|
||||
|
||||
import streamlit as st
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from swarms.models import OpenAIChat
|
||||
from swarms.models.gpt4_vision_api import GPT4VisionAPI
|
||||
from swarms.models.stable_diffusion import StableDiffusion
|
||||
from swarms.structs import Agent
|
||||
|
||||
# Pull credentials from the .env file / process environment.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
stability_api_key = os.environ.get("STABLE_API_KEY")

# Build the model clients shared by the agent and the UI below.
vision_api = GPT4VisionAPI(api_key=openai_api_key)
sd_api = StableDiffusion(api_key=stability_api_key)
gpt_api = OpenAIChat(openai_api_key=openai_api_key)
|
||||
|
||||
|
||||
class Idea2Image(Agent):
    """Agent that iteratively refines an image-generation prompt.

    The starting prompt is enriched once by the language model, an image is
    generated from it through the module-level ``sd_api`` client, and the
    vision model's analysis of that image is appended to the prompt that is
    used for the next iteration.
    """

    def __init__(self, llm, vision_api):
        """Store the models used by the agent.

        Args:
            llm: Language model forwarded to the ``Agent`` base class;
                ``enrich_prompt`` calls its ``generate`` method.
            vision_api: Vision model; ``run(img, prompt)`` is called on it
                to analyze every generated image.
        """
        super().__init__(llm=llm)
        self.vision_api = vision_api

    def run(self, initial_prompt, num_iterations, run_folder):
        """Generate and analyze images, reporting progress with ``print``.

        The loop stops early when image generation yields a falsy result.
        Nothing is returned.

        Args:
            initial_prompt: Prompt text supplied by the user.
            num_iterations: Maximum number of generate/analyze rounds.
            run_folder: Forwarded to ``sd_api.generate_and_move_image``
                (presumably the output directory -- confirm against the
                StableDiffusion wrapper).
        """
        current_prompt = initial_prompt

        for i in range(num_iterations):
            print(f"Iteration {i}: Image generation and analysis")

            if i == 0:
                # enrich_prompt() returns None when the LLM produced nothing;
                # keep the user's prompt in that case so the generator never
                # receives None and the `+=` below cannot fail.
                enriched = self.enrich_prompt(current_prompt)
                current_prompt = enriched or current_prompt
                print(f"Enriched Prompt: {current_prompt}")

            img = sd_api.generate_and_move_image(
                current_prompt, i, run_folder
            )
            if not img:
                print("Failed to generate image")
                break
            print(f"Generated image at: {img}")

            # `img` is guaranteed truthy here because of the guard above.
            analysis = self.vision_api.run(img, current_prompt)
            if analysis:
                # Cap the appended analysis so the prompt stays concise.
                current_prompt += ". " + analysis[:500]
                print(f"Image Analysis: {analysis}")
            else:
                print(f"Failed to analyze image at: {img}")

    def enrich_prompt(self, prompt):
        """Ask the language model for a richer version of ``prompt``.

        Args:
            prompt: The prompt text to improve.

        Returns:
            The first generation's text truncated to 500 characters, or
            ``None`` when the model returned no generations.
        """
        enrichment_task = (
            "Create a concise and effective image generation prompt"
            " within 400 characters or less, based on Stable"
            " Diffusion and Dalle best practices to help it create"
            " much better images. Starting prompt:"
            f" \n\n'{prompt}'\n\nImprove the prompt with any"
            " applicable details or keywords by considering the"
            " following aspects: \n1. Subject details (like actions,"
            " emotions, environment) \n2. Artistic style (such as"
            " surrealism, hyperrealism) \n3. Medium (digital"
            " painting, oil on canvas) \n4. Color themes and"
            " lighting (like warm colors, cinematic lighting) \n5."
            " Composition and framing (close-up, wide-angle) \n6."
            " Additional elements (like a specific type of"
            " background, weather conditions) \n7. Any other"
            " artistic or thematic details that can make the image"
            " more vivid and compelling. Help the image generator"
            " create better images by enriching the prompt."
        )
        llm_result = self.llm.generate([enrichment_task])
        generations = llm_result.generations
        # Guard both levels: an empty outer list and an empty first batch.
        if not generations or not generations[0]:
            return None
        return generations[0][0].text[:500]

    def run_gradio(self, initial_prompt, num_iterations, run_folder):
        """Run the refinement loop and collect results for a UI.

        Unlike ``run``, this does not print and does not stop when image
        generation fails; every iteration contributes one result tuple.

        Args:
            initial_prompt: Prompt text supplied by the user.
            num_iterations: Number of generate/analyze rounds.
            run_folder: Forwarded to ``sd_api.generate_and_move_image``.

        Returns:
            A list of ``(prompt_used, img_path, analysis)`` tuples, one per
            iteration. ``analysis`` is ``None`` when no image was produced.
        """
        results = []
        current_prompt = initial_prompt

        for i in range(num_iterations):
            if i == 0:
                # Store the enriched prompt in current_prompt so later
                # iterations build on it (as `run` does) instead of falling
                # back to the raw user prompt. Keep the user's prompt if
                # enrichment produced nothing.
                enriched = self.enrich_prompt(current_prompt)
                current_prompt = enriched or current_prompt

            img_path = sd_api.generate_and_move_image(
                current_prompt, i, run_folder
            )
            analysis = (
                self.vision_api.run(img_path, current_prompt)
                if img_path
                else None
            )

            # Record the prompt that produced this image before extending it.
            results.append((current_prompt, img_path, analysis))

            if analysis:
                # Cap the appended analysis so the prompt stays concise.
                current_prompt += ". " + analysis[:500]

        return results
|
||||
|
||||
|
||||
# print(
|
||||
# colored("---------------------------------------- MultiModal Tree of Thought agents for Image Generation", "cyan", attrs=["bold"])
|
||||
# )
|
||||
# # User input and setup
|
||||
# user_prompt = input("Prompt for image generation: ")
|
||||
# num_iterations = int(
|
||||
# input("Enter the number of iterations for image improvement: ")
|
||||
# )
|
||||
# run_folder = os.path.join(
|
||||
# "runs", datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
# )
|
||||
# os.makedirs(run_folder, exist_ok=True)
|
||||
|
||||
# print(
|
||||
# colored(
|
||||
# f"---------------------------------- Running Multi-Modal Tree of thoughts agent with {num_iterations} iterations", "green"
|
||||
# )
|
||||
# )
|
||||
# # Initialize and run the agent
|
||||
# idea2image_agent = Idea2Image(gpt_api, vision_api)
|
||||
# idea2image_agent.run(user_prompt, num_iterations, run_folder)
|
||||
|
||||
# print("Idea space has been traversed.")
|
||||
|
||||
|
||||
# The environment is already loaded and the model clients (vision_api,
# sd_api, gpt_api) are already constructed near the top of this module, so
# they are deliberately not re-created here.

# Streamlit UI layout
st.title(
    "Explore the infinite Multi-Modal Idea Space with Idea2Image"
)
user_prompt = st.text_input("Prompt for image generation:")
num_iterations = st.number_input(
    "Enter the number of iterations for image improvement:",
    min_value=1,
    step=1,
)

if st.button("Generate Image"):
    if not user_prompt.strip():
        # Avoid spending API calls on an empty prompt.
        st.warning("Please enter a prompt before generating an image.")
    else:
        # Every run writes into its own timestamped directory under "runs".
        run_folder = os.path.join(
            "runs", datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        )
        os.makedirs(run_folder, exist_ok=True)
        idea2image_agent = Idea2Image(gpt_api, vision_api)

        # int() guards against the widget handing back a float, which
        # range() inside run_gradio would reject.
        results = idea2image_agent.run_gradio(
            user_prompt, int(num_iterations), run_folder
        )

        for i, (enriched_prompt, img_path, analysis) in enumerate(
            results
        ):
            st.write(f"Iteration {i+1}:")
            st.write("Enriched Prompt:", enriched_prompt)
            if img_path:
                st.image(img_path, caption="Generated Image")
            else:
                st.error("Failed to generate image")
            if analysis:
                st.write("Image Analysis:", analysis)

        st.success("Idea space has been traversed.")
|
||||
|
||||
# [Add any additional necessary code adjustments]
|
@ -0,0 +1,114 @@
|
||||
"""
|
||||
Multi Modal tree of thoughts that leverages the GPT-4 language model and the
|
||||
Stable Diffusion model to generate a multimodal output and evaluate the
|
||||
output based on a metric from 0.0 to 1.0 and then run a search algorithm using DFS and BFS and return the best output.
|
||||
|
||||
|
||||
task: Generate an image of a swarm of bees -> Image generator -> GPT4V evaluates the img from 0.0 to 1.0 -> DFS/BFS -> return the best output
|
||||
|
||||
|
||||
- GPT4Vision will evaluate the image from 0.0 to 1.0 based on how likely it accomplishes the task
|
||||
- DFS/BFS will search for the best output based on the evaluation from GPT4Vision
|
||||
- The output will be a multimodal output that is a combination of the image and the text
|
||||
- The output will be evaluated by GPT4Vision
|
||||
- The prompt to the image generator will be optimized from the output of GPT4Vision and the search
|
||||
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from swarms.models.gpt4_vision_api import GPT4VisionAPI
|
||||
from swarms.models.stable_diffusion import StableDiffusion
|
||||
from termcolor import colored
|
||||
|
||||
# Read the .env file, then fetch both API keys from the environment.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
stable_api_key = os.getenv("STABLE_API_KEY")

# Vision model client.
llm = GPT4VisionAPI(openai_api_key=api_key, max_tokens=500)

# Image generator client.
img_generator = StableDiffusion(api_key=stable_api_key)

# Task description handed to the image generator.
task = "Garden of Eden futuristic city graphic art"
|
||||
|
||||
|
||||
def evaluate_img(llm, task: str, img: str):
    """Ask the vision model to score an image against a task.

    Args:
        llm: Object exposing ``run(task=..., img=...)`` that returns the
            model's reply.
        task: Description of what the image is supposed to depict.
        img: Image reference, interpolated into the prompt and also passed
            to ``llm.run`` as ``img``.

    Returns:
        The first number found in the model's reply, as a ``float``.

    Raises:
        ValueError: If the reply contains no number at all.
    """
    import re  # local import keeps this function self-contained

    EVAL_IMG = f"""
    Evaluate the image: {img} on a scale from 0.0 to 1.0 based on how likely it accomplishes the task: {task}. Output nothing than the float representing the evaluated img.
    """
    out = llm.run(task=EVAL_IMG, img=img)

    # Models often wrap the score in extra words ("Score: 0.85"), which a
    # bare float(out) rejects; extract the first numeric token instead.
    match = re.search(
        r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?", str(out)
    )
    if match is None:
        raise ValueError(
            f"No numeric score found in model output: {out!r}"
        )
    return float(match.group())
|
||||
|
||||
|
||||
def enrichment_prompt(starting_prompt: str, evaluated_img: "float | str"):
    """Build the instruction text asking a model to improve a prompt.

    This only formats a string; it does not call any model.

    Args:
        starting_prompt: The image prompt to be improved; embedded in
            single quotes.
        evaluated_img: Evaluation of the previously generated image (the
            caller in this file passes the numeric score); interpolated
            into item 8 of the instructions.

    Returns:
        The complete enrichment instruction as a single string.
    """
    enrichment_task = (
        "Create a concise and effective image generation prompt"
        " within 400 characters or less, based on Stable Diffusion"
        " and Dalle best practices. Starting prompt:"
        f" \n\n'{starting_prompt}'\n\nImprove the prompt with any"
        " applicable details or keywords by considering the"
        " following aspects: \n1. Subject details (like actions,"
        " emotions, environment) \n2. Artistic style (such as"
        " surrealism, hyperrealism) \n3. Medium (digital painting,"
        " oil on canvas) \n4. Color themes and lighting (like warm"
        " colors, cinematic lighting) \n5. Composition and framing"
        " (close-up, wide-angle) \n6. Additional elements (like a"
        " specific type of background, weather conditions) \n7. Any"
        " other artistic or thematic details that can make the image"
        # Item 8 gets its own line, matching items 1-7.
        " more vivid and compelling. \n8. Based on the evaluation of"
        " the first generated prompt used by the first prompt:"
        f" {evaluated_img} Enrich the prompt to generate a more"
        " compelling image. Output only a new prompt to create a"
        " better image"
    )
    return enrichment_task
|
||||
|
||||
|
||||
# Main loop: generate an image, score it, and remember the best one.
max_iterations = 10  # Define the maximum number of iterations
best_score = 0
# Initialized under the same name that is read after the loop, so the final
# print cannot raise NameError when no image ever scores above 0.
best_image_path = None

for _ in range(max_iterations):
    # Generate an image and get its path.
    # "magenta" is used because termcolor has no "purple" color.
    print(colored(f"Generating img for Task: {task}", "magenta"))

    img_path = img_generator.run(
        task=task
    )  # This should return the file path of the generated image
    img_path = img_path[0]  # the result is indexed; first entry is used
    print(colored(f"Generated Image Path: {img_path}", "green"))

    # Evaluate the image by passing the file path
    score = evaluate_img(llm, task, img_path)
    print(
        colored(
            f"Evaluated Image Score: {score} for {img_path}", "cyan"
        )
    )

    # Update the best score and image path if necessary
    if score > best_score:
        best_score = score
        best_image_path = img_path

    # Enrich the prompt based on the evaluation
    # NOTE(review): `prompt` is only printed; it is never fed back into
    # `task`, so every iteration generates from the same task text.
    prompt = enrichment_prompt(task, score)
    print(colored(f"Enrichment Prompt: {prompt}", "yellow"))

# Output the best result
print("Best Image Path:", best_image_path)
print("Best Score:", best_score)
|
Loading…
Reference in new issue