diff --git a/.env.example b/.env.example index f13ce77f..e170252c 100644 --- a/.env.example +++ b/.env.example @@ -35,4 +35,5 @@ REDIS_PORT= #dbs -PINECONE_API_KEY="" \ No newline at end of file +PINECONE_API_KEY="" +BING_COOKIE="" \ No newline at end of file diff --git a/docs/swarms/agents/idea_to_image.md b/docs/swarms/agents/idea_to_image.md new file mode 100644 index 00000000..41d5b216 --- /dev/null +++ b/docs/swarms/agents/idea_to_image.md @@ -0,0 +1,124 @@ +# `Idea2Image` Documentation + +## Table of Contents + +1. [Introduction](#introduction) +2. [Idea2Image Class](#idea2image-class) + - [Initialization Parameters](#initialization-parameters) +3. [Methods and Usage](#methods-and-usage) + - [llm_prompt Method](#llm-prompt-method) + - [generate_image Method](#generate-image-method) +4. [Examples](#examples) + - [Example 1: Generating an Image](#example-1-generating-an-image) +5. [Additional Information](#additional-information) +6. [References and Resources](#references-and-resources) + +--- + +## 1. Introduction + +Welcome to the documentation for the Swarms library, with a focus on the `Idea2Image` class. This comprehensive guide provides in-depth information about the Swarms library and its core components. Before we dive into the details, it's crucial to understand the purpose and significance of this library. + +### 1.1 Purpose + +The Swarms library aims to simplify interactions with AI models for generating images from text prompts. The `Idea2Image` class is designed to generate images from textual descriptions using the DALLE-3 model and the OpenAI GPT-4 language model. + +### 1.2 Key Features + +- **Image Generation:** Swarms allows you to generate images based on natural language prompts, providing a bridge between textual descriptions and visual content. + +- **Integration with DALLE-3:** The `Idea2Image` class leverages the power of DALLE-3 to create images that match the given textual descriptions. + +- **Language Model Integration:** The class integrates with OpenAI's GPT-3 for prompt refinement, enhancing the specificity of image generation. + +--- + +## 2. Idea2Image Class + +The `Idea2Image` class is a fundamental module in the Swarms library, enabling the generation of images from text prompts. + +### 2.1 Initialization Parameters + +Here are the initialization parameters for the `Idea2Image` class: + +- `image` (str): Text prompt for the image to generate. + +- `openai_api_key` (str): OpenAI API key. This key is used for prompt refinement with GPT-3. If not provided, the class will attempt to use the `OPENAI_API_KEY` environment variable. + +- `cookie` (str): Cookie value for DALLE-3. This cookie is used to interact with the DALLE-3 API. If not provided, the class will attempt to use the `BING_COOKIE` environment variable. + +- `output_folder` (str): Folder to save the generated images. The default folder is "images/". + +### 2.2 Methods + +The `Idea2Image` class provides the following methods: + +- `llm_prompt()`: Returns a prompt for refining the image generation. This method helps improve the specificity of the image generation prompt. + +- `generate_image()`: Generates and downloads the image based on the prompt. It refines the prompt, opens the website with the query, retrieves image URLs, and downloads the images to the specified folder. + +--- + +## 3. Methods and Usage + +Let's explore the methods provided by the `Idea2Image` class and how to use them effectively. + +### 3.1 `llm_prompt` Method + +The `llm_prompt` method returns a refined prompt for generating the image. It's a critical step in improving the specificity and accuracy of the image generation process. The method provides a guide for refining the prompt, helping users describe the desired image more precisely. + +### 3.2 `generate_image` Method + +The `generate_image` method combines the previous methods to execute the whole process of generating and downloading images based on the provided prompt. It's a convenient way to automate the image generation process. + +--- + +## 4. Examples + +Let's dive into practical examples to demonstrate the usage of the `Idea2Image` class. + +### 4.1 Example 1: Generating an Image + +In this example, we create an instance of the `Idea2Image` class and use it to generate an image based on a text prompt: + +```python +from swarms.agents import Idea2Image + +# Create an instance of the Idea2Image class with your prompt and API keys +idea2image = Idea2Image( + image="Fish hivemind swarm in light blue avatar anime in zen garden pond concept art anime art, happy fish, anime scenery", + openai_api_key="your_openai_api_key_here", + cookie="your_cookie_value_here", +) + +# Generate and download the image +idea2image.generate_image() +``` + +--- + +## 5. Additional Information + +Here are some additional tips and information for using the Swarms library and the `Idea2Image` class effectively: + +- Refining the prompt is a crucial step to influence the style, composition, and mood of the generated image. Follow the provided guide in the `llm_prompt` method to create precise prompts. + +- Experiment with different prompts, variations, and editing techniques to create unique and interesting images. + +- You can combine separate DALLE-3 outputs into panoramas and murals by careful positioning and editing. + +- Consider sharing your creations and exploring resources in communities like Reddit r/dalle2 for inspiration and tools. + +- The `output_folder` parameter allows you to specify the folder where generated images will be saved. Ensure that you have the necessary permissions to write to that folder. + +--- + +## 6. References and Resources + +For further information and resources related to the Swarms library and DALLE-3: + +- [DALLE-3 Unofficial API Documentation](https://www.bing.com/images/create): The official documentation for the DALLE-3 Unofficial API, where you can explore additional features and capabilities. + +- [OpenAI GPT-3 Documentation](https://beta.openai.com/docs/): The documentation for OpenAI's GPT-3, which is used for prompt refinement. + +This concludes the documentation for the Swarms library and the `Idea2Image` class. You now have a comprehensive guide on how to generate images from text prompts using DALLE-3 and GPT-3 with Swarms. \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index a5f21e06..8222283f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -87,6 +87,7 @@ nav: - swarms.agents: - AbstractAgent: "swarms/agents/abstract_agent.md" - OmniModalAgent: "swarms/agents/omni_agent.md" + - Idea2Image: "swarms/agents/idea_to_image.md" - swarms.models: - Overview: "swarms/models/index.md" - HuggingFaceLLM: "swarms/models/hf.md" diff --git a/pyproject.toml b/pyproject.toml index 0480a343..11d24694 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ open-interpreter = "*" tabulate = "*" termcolor = "*" black = "*" +dalle = "*" [tool.poetry.dev-dependencies] first_dependency = {git = "https://github.com/IDEA-Research/GroundingDINO.git"} diff --git a/requirements.txt b/requirements.txt index 6f6bcdf0..b390cc4c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -51,6 +51,7 @@ transformers webdataset yapf autopep8 +dalle3 mkdocs diff --git a/swarms/agents/__init__.py b/swarms/agents/__init__.py index 2701a476..62273928 100644 --- a/swarms/agents/__init__.py +++ b/swarms/agents/__init__.py @@ -1,9 +1,4 @@ """Agent Infrastructure, models, memory, utils, tools""" - -# agents -# from swarms.agents.profitpilot import ProfitPilot -# from swarms.agents.aot import AoTAgent -# from swarms.agents.multi_modal_visual_agent import MultiModalAgent from swarms.agents.omni_modal_agent import OmniModalAgent from swarms.agents.hf_agents import HFAgent @@ -13,3 +8,5 @@ from swarms.agents.message import Message from swarms.agents.stream_response import stream from swarms.agents.base import AbstractAgent from swarms.agents.registry import Registry +from swarms.agents.idea_to_image_agent import Idea2Image + diff --git a/swarms/agents/idea_to_image_agent.py b/swarms/agents/idea_to_image_agent.py new file mode 100644 index 00000000..cede83e9 --- /dev/null +++ b/swarms/agents/idea_to_image_agent.py @@ -0,0 +1,111 @@ +import os +import logging +from dataclasses import dataclass +from dalle3 import Dalle +from swarms.models import OpenAIChat + + +@dataclass +class Idea2Image: + """ + A class used to generate images from text prompts using DALLE-3. + + ... + + Attributes + ---------- + image : str + Text prompt for the image to generate + openai_api_key : str + OpenAI API key + cookie : str + Cookie value for DALLE-3 + output_folder : str + Folder to save the generated images + + Methods + ------- + llm_prompt(): + Returns a prompt for refining the image generation + generate_image(): + Generates and downloads the image based on the prompt + + + Usage: + ------ + from dalle3 import Idea2Image + + idea2image = Idea2Image( + image="Fish hivemind swarm in light blue avatar anime in zen garden pond concept art anime art, happy fish, anime scenery" + ) + idea2image.run() + """ + + image: str + openai_api_key: str = os.getenv("OPENAI_API_KEY") or None + cookie: str = os.getenv("BING_COOKIE") or None + output_folder: str = "images/" + + def __post_init__(self): + self.llm = OpenAIChat(openai_api_key=self.openai_api_key) + self.dalle = Dalle(self.cookie) + + def llm_prompt(self): + LLM_PROMPT = f""" + Refine the USER prompt to create a more precise image tailored to the user's needs using + an image generator like DALLE-3. + + ###### FOLLOW THE GUIDE BELOW TO REFINE THE PROMPT ###### + + - Use natural language prompts up to 400 characters to describe the image you want to generate. Be as specific or vague as needed. + + - Frame your photographic prompts like camera position, lighting, film type, year, usage context. This implicitly suggests image qualities. + + - For illustrations, you can borrow photographic terms like "close up" and prompt for media, style, artist, animation style, etc. + + - Prompt hack: name a film/TV show genre + year to "steal the look" for costumes, lighting, etc without knowing technical details. + + - Try variations of a prompt, make edits, and do recursive uncropping to create interesting journeys and zoom-out effects. + + - Use an image editor like Photopea to uncrop DALL-E outputs and prompt again to extend the image. + + - Combine separate DALL-E outputs into panoramas and murals with careful positioning/editing. + + - Browse communities like Reddit r/dalle2 to get inspired and share your creations. See tools, free image resources, articles. + + - Focus prompts on size, structure, shape, mood, aesthetics to influence the overall vibe and composition. + + - Be more vague or detailed as needed - DALL-E has studied over 400M images and can riff creatively or replicate specific styles. + + - Be descriptive, describe the art style at the end like fusing concept art with anime art or game art or product design art. + + ###### END OF GUIDE ###### + + Prompt to refine: {self.image} + """ + return LLM_PROMPT + + def run(self): + """ + Generates and downloads the image based on the prompt. + + This method refines the prompt using the llm, opens the website with the query, + gets the image URLs, and downloads the images to the specified folder. + """ + # Set up logging + logging.basicConfig(level=logging.INFO) + + # Refine the prompt using the llm + image = self.llm_prompt() + refined_prompt = self.llm(image) + print(f"Refined prompt: {refined_prompt}") + + # Open the website with your query + self.dalle.create(refined_prompt) + + # Get the image URLs + urls = self.dalle.get_urls() + + # Download the images to your specified folder + self.dalle.download(urls, self.output_folder) + diff --git a/tests/agents/idea_to_image.py b/tests/agents/idea_to_image.py new file mode 100644 index 00000000..85efd025 --- /dev/null +++ b/tests/agents/idea_to_image.py @@ -0,0 +1,59 @@ +import pytest +import os +import shutil +from swarms.idea2image import Idea2Image + +openai_key = os.getenv("OPENAI_API_KEY") +dalle_cookie = os.getenv("BING_COOKIE") + +# Constants for testing +TEST_PROMPT = "Happy fish." +TEST_OUTPUT_FOLDER = "test_images/" +OPENAI_API_KEY = openai_key +DALLE_COOKIE = dalle_cookie + +@pytest.fixture(scope="module") +def idea2image_instance(): + # Create an instance of the Idea2Image class + idea2image = Idea2Image( + image=TEST_PROMPT, + openai_api_key=OPENAI_API_KEY, + cookie=DALLE_COOKIE, + output_folder=TEST_OUTPUT_FOLDER, + ) + yield idea2image + # Clean up the test output folder after testing + if os.path.exists(TEST_OUTPUT_FOLDER): + shutil.rmtree(TEST_OUTPUT_FOLDER) + +def test_idea2image_instance(idea2image_instance): + # Check if the instance is created successfully + assert isinstance(idea2image_instance, Idea2Image) + +def test_llm_prompt(idea2image_instance): + # Test the llm_prompt method + prompt = idea2image_instance.llm_prompt() + assert isinstance(prompt, str) + +def test_generate_image(idea2image_instance): + # Test the generate_image method + idea2image_instance.generate_image() + # Check if the output folder is created + assert os.path.exists(TEST_OUTPUT_FOLDER) + # Check if files are downloaded (assuming DALLE-3 responds with URLs) + files = os.listdir(TEST_OUTPUT_FOLDER) + assert len(files) > 0 + +def test_invalid_openai_api_key(): + # Test with an invalid OpenAI API key + with pytest.raises(Exception) as exc_info: + Idea2Image( + image=TEST_PROMPT, + openai_api_key="invalid_api_key", + cookie=DALLE_COOKIE, + output_folder=TEST_OUTPUT_FOLDER, + ) + assert "Failed to initialize OpenAIChat" in str(exc_info.value) + +if __name__ == "__main__": + pytest.main()