From c529244913f3b10d63d5c60d4f17ef4518bba75b Mon Sep 17 00:00:00 2001 From: Kye Date: Wed, 7 Feb 2024 09:44:08 -0800 Subject: [PATCH] [EXAMPLE][Fully Multi Modal Rag agent] --- .gitignore | 1 + example.py | 24 +--------- multi_modal_rag_agent.py | 79 +++++++++++++++++++++++++++++++++ pyproject.toml | 4 +- swarms/memory/chroma_db.py | 52 +++++++++++++++++++++- swarms/prompts/worker_prompt.py | 9 +++- swarms/structs/agent.py | 28 +++++++++--- 7 files changed, 164 insertions(+), 33 deletions(-) create mode 100644 multi_modal_rag_agent.py diff --git a/.gitignore b/.gitignore index aa7e0f98..62014a4f 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ dataframe/ static/generated runs +chroma swarms/__pycache__ venv .DS_Store diff --git a/example.py b/example.py index d3032ac4..d5cc891e 100644 --- a/example.py +++ b/example.py @@ -1,31 +1,11 @@ -import os - -from dotenv import load_dotenv - -# Import the OpenAIChat model and the Agent struct from swarms import Agent, OpenAIChat -# Load the environment variables -load_dotenv() - -# Get the API key from the environment -api_key = os.environ.get("OPENAI_API_KEY") - -# Initialize the language model -llm = OpenAIChat( - temperature=0.5, - model_name="gpt-4", - openai_api_key=api_key, - max_tokens=1000, -) - ## Initialize the workflow agent = Agent( - llm=llm, - max_loops=4, + llm=OpenAIChat(), + max_loops="auto", autosave=True, dashboard=False, - # docs_folder="docs", streaming_on=True, verbose=True, ) diff --git a/multi_modal_rag_agent.py b/multi_modal_rag_agent.py new file mode 100644 index 00000000..33b2acb5 --- /dev/null +++ b/multi_modal_rag_agent.py @@ -0,0 +1,79 @@ +# Importing necessary modules +import os +from dotenv import load_dotenv +from swarms import Agent, OpenAIChat +from swarms.tools.tool import tool +from swarms.prompts.visual_cot import VISUAL_CHAIN_OF_THOUGHT +from swarms.memory.chroma_db import ChromaDB + +# Loading environment variables from .env file +load_dotenv() + +# Getting the Gemini API 
key from environment variables +gemini_api_key = os.getenv("GEMINI_API_KEY") +openai_api_key = os.getenv("OPENAI_API_KEY") + +llm = OpenAIChat( + openai_api_key=openai_api_key, + max_tokens=1000, + temperature=0.2, +) + +# Making an instance of the ChromaDB class +memory = ChromaDB( + metric="cosine", + n_results=3, + multimodal=True, + docs_folder="images", + output_dir="results", +) + + +# Defining tool by creating a function and wrapping it with the @tool decorator and +# providing the necessary parameters and docstrings to show the usage of the tool. +@tool +def make_new_file(file: str, content: str): + """ + Make a new file. + + This function creates a new file with the given name. + + Parameters: + file (str): The name of the file to be created. + + Returns: + dict: A dictionary containing the status of the operation. + """ + with open(file, "w") as f: + f.write(f"{content}") + + +# Initializing the agent with the Gemini instance and other parameters +agent = Agent( + llm=llm, + agent_name="Multi-Modal RAG Agent", + agent_description=( + "This agent fuses together the capabilities of Gemini and" + " Visual Chain of Thought to answer questions based on the" + " input image." + ), + max_loops="auto", + autosave=True, + sop=VISUAL_CHAIN_OF_THOUGHT, + verbose=True, + tools=[make_new_file], + long_term_memory=memory, +) + + +# Defining the task and image path +task = ( + "What is the content of this image, return exactly what you see" + " in the image." 
+) +img = "images/Screenshot_48.png" + + +# Running the agent with the specified task and image +out = agent.run(task=task, img=img) +print(out) diff --git a/pyproject.toml b/pyproject.toml index bb12e7a4..781dbbe0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "swarms" -version = "4.1.0" +version = "4.1.1" description = "Swarms - Pytorch" license = "MIT" authors = ["Kye Gomez "] @@ -12,7 +12,7 @@ homepage = "https://github.com/kyegomez/swarms" documentation = "https://swarms.apac.ai" readme = "README.md" # Assuming you have a README.md repository = "https://github.com/kyegomez/swarms" -keywords = ["artificial intelligence", "deep learning", "optimizers", "Prompt Engineering"] +keywords = ["artificial intelligence", "deep learning", "optimizers", "Prompt Engineering", "swarms", "agents"] classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", diff --git a/swarms/memory/chroma_db.py b/swarms/memory/chroma_db.py index e951ecd9..f1436d9c 100644 --- a/swarms/memory/chroma_db.py +++ b/swarms/memory/chroma_db.py @@ -1,3 +1,4 @@ +import os import numpy as np import logging import uuid @@ -10,6 +11,8 @@ from dotenv import load_dotenv from chromadb.utils.embedding_functions import ( OpenCLIPEmbeddingFunction, ) +from swarms.utils.data_to_text import data_to_text +from swarms.utils.markdown_message import display_markdown_message # Load environment variables @@ -51,6 +54,8 @@ class ChromaDB: embedding_function: Callable = None, data_loader: Callable = None, multimodal: bool = False, + docs_folder: str = None, + verbose: bool = False, *args, **kwargs, ): @@ -58,9 +63,12 @@ class ChromaDB: self.output_dir = output_dir self.limit_tokens = limit_tokens self.n_results = n_results + self.docs_folder = docs_folder + self.verbose = verbose # Disable ChromaDB logging - logging.getLogger("chromadb").setLevel(logging.INFO) + if verbose: + 
logging.getLogger("chromadb").setLevel(logging.INFO)
 
         # Create Chroma collection
         chroma_persist_dir = "chroma"
@@ -100,6 +108,19 @@ class ChromaDB:
             *args,
             **kwargs,
         )
+        display_markdown_message(
+            "ChromaDB collection created:"
+            f" {self.collection.name} with metric: {self.metric} and"
+            f" output directory: {self.output_dir}"
+        )
+
+        # If docs
+        if docs_folder:
+            display_markdown_message(
+                f"Traversing directory: {docs_folder}"
+            )
+            self.docs = docs_folder
+            self.traverse_directory()
 
     def add(
         self,
@@ -161,3 +182,32 @@ class ChromaDB:
             return docs[0]
         except Exception as e:
             raise Exception(f"Failed to query documents: {str(e)}")
+
+    def traverse_directory(self):
+        """
+        Traverse through every file in the given directory and its subdirectories,
+        and return the paths of all files.
+        Parameters:
+        - directory_name (str): The name of the directory to traverse.
+        Returns:
+        - list: A list of paths to each file in the directory and its subdirectories.
+        """
+        image_extensions = [
+            ".jpg",
+            ".jpeg",
+            ".png",
+        ]
+        images = []
+        for root, dirs, files in os.walk(self.docs):
+            for file in files:
+                _, ext = os.path.splitext(file)
+                if ext.lower() in image_extensions:
+                    images.append(os.path.join(root, file))
+                else:
+                    data = data_to_text(os.path.join(root, file))
+                    added_to_db = self.add([data])
+                    print(f"{file} added to Database")
+        if images:
+            added_to_db = self.add(img_urls=[images])
+            print(f"{len(images)} images added to Database ")
+        return added_to_db
diff --git a/swarms/prompts/worker_prompt.py b/swarms/prompts/worker_prompt.py
index 5ba0b5e5..c9c78733 100644
--- a/swarms/prompts/worker_prompt.py
+++ b/swarms/prompts/worker_prompt.py
@@ -1,4 +1,9 @@
-def worker_agent_system(name: str, memory: str = None):
+import datetime
+
+time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+
+def worker_tools_sop_promp(name: str, memory: str = None):
     return """
     You are {name},
     Your decisions must always be made independently without seeking user assistance.
@@ -52,7 +57,7 @@ def worker_agent_system(name: str, memory: str = None): } } Ensure the response can be parsed by Python json.loads - System: The current time and date is Sat Jan 20 10:39:07 2024 + System: The current time and date is {time} System: This reminds you of these events from your past: [{memory}] diff --git a/swarms/structs/agent.py b/swarms/structs/agent.py index a0f85814..8e865420 100644 --- a/swarms/structs/agent.py +++ b/swarms/structs/agent.py @@ -27,6 +27,8 @@ from swarms.utils.parse_code import ( ) from swarms.utils.pdf_to_text import pdf_to_text from swarms.utils.token_count_tiktoken import limit_tokens_from_string +from swarms.tools.exec_tool import execute_tool_by_name +from swarms.prompts.worker_prompt import worker_tools_sop_promp # Utils @@ -179,7 +181,7 @@ class Agent: docs_folder: str = None, verbose: bool = False, *args, - **kwargs: Any, + **kwargs, ): self.id = id self.llm = llm @@ -264,6 +266,14 @@ class Agent: if verbose: logger.setLevel(logging.DEBUG) + # If tools are provided then set the tool prompt by adding to sop + if self.tools: + self.sop = self.sop + worker_tools_sop_promp( + self.agent_name, memory="" + ) + + # If the long term memory is provided then set the long term memory prompt + def set_system_prompt(self, system_prompt: str): """Set the system prompt""" self.system_prompt = system_prompt @@ -545,6 +555,14 @@ class Agent: if self.code_interpreter: self.run_code(response) + # If tools are enabled then execute the tools + if self.tools: + execute_tool_by_name( + response, + self.tools, + self.stopping_condition, + ) + # If interactive mode is enabled then print the response and get user input if self.interactive: print(f"AI: {response}") @@ -656,7 +674,7 @@ class Agent: """ return agent_history_prompt - def long_term_memory_prompt(self, query: str): + def long_term_memory_prompt(self, query: str, *args, **kwargs): """ Generate the agent long term memory prompt @@ -667,12 +685,10 @@ class Agent: Returns: str: The agent 
history prompt
         """
-        ltr = str(self.long_term_memory.query(query))
+        ltr = str(self.long_term_memory.query(query, *args, **kwargs))
 
         context = f"""
-            {query}
-            ####### Long Term Memory ################
-            {ltr}
+            System: This reminds you of these events from your past: [{ltr}]
         """
         return self.short_memory.add(
             role=self.agent_name, content=context