[EXAMPLE][Fully Multi Modal Rag agent]

pull/383/head
Kye 11 months ago
parent c62577014c
commit c529244913

1
.gitignore vendored

@ -10,6 +10,7 @@ dataframe/
static/generated static/generated
runs runs
chroma
swarms/__pycache__ swarms/__pycache__
venv venv
.DS_Store .DS_Store

@ -1,31 +1,11 @@
import os
from dotenv import load_dotenv
# Import the OpenAIChat model and the Agent struct
from swarms import Agent, OpenAIChat from swarms import Agent, OpenAIChat
# Load the environment variables
load_dotenv()
# Get the API key from the environment
api_key = os.environ.get("OPENAI_API_KEY")
# Initialize the language model
llm = OpenAIChat(
temperature=0.5,
model_name="gpt-4",
openai_api_key=api_key,
max_tokens=1000,
)
## Initialize the workflow ## Initialize the workflow
agent = Agent( agent = Agent(
llm=llm, llm=OpenAIChat(),
max_loops=4, max_loops="auto",
autosave=True, autosave=True,
dashboard=False, dashboard=False,
# docs_folder="docs",
streaming_on=True, streaming_on=True,
verbose=True, verbose=True,
) )

@ -0,0 +1,79 @@
# Importing necessary modules
import os
from dotenv import load_dotenv
from swarms import Agent, OpenAIChat
from swarms.tools.tool import tool
from swarms.prompts.visual_cot import VISUAL_CHAIN_OF_THOUGHT
from swarms.memory.chroma_db import ChromaDB
# Load environment variables (API keys) from a local .env file.
load_dotenv()

# NOTE(review): GEMINI_API_KEY is read but never used in this script —
# the agent below runs on OpenAIChat. Confirm whether it can be removed.
gemini_api_key = os.getenv("GEMINI_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Language model backing the agent: low temperature for deterministic,
# literal image descriptions.
llm = OpenAIChat(
    openai_api_key=openai_api_key,
    max_tokens=1000,
    temperature=0.2,
)

# Long-term memory store. multimodal=True enables image embeddings;
# docs_folder is traversed and ingested at construction time.
memory = ChromaDB(
    metric="cosine",
    n_results=3,
    multimodal=True,
    docs_folder="images",
    output_dir="results",
)
# Defining tool by creating a function and wrapping it with the @tool decorator and
# providing the necessary parameters and docstrings to show the usage of the tool.
@tool
def make_new_file(file: str, content: str):
    """
    Create a new file and write ``content`` into it.

    Parameters:
        file (str): The name (path) of the file to be created.
        content (str): The text written into the new file.

    Returns:
        dict: A dictionary containing the status of the operation.
    """
    with open(file, "w") as f:
        f.write(f"{content}")
    # Return the status dict the docstring promises; the original
    # implementation fell through and returned None.
    return {"status": "success", "file": file}
# Initialize the agent with the OpenAIChat llm, the visual chain-of-thought
# SOP, the file-creation tool, and the ChromaDB long-term memory.
# (NOTE(review): agent_description mentions Gemini, but the llm wired in
# above is OpenAIChat — the description text is kept verbatim.)
agent = Agent(
    llm=llm,
    agent_name="Multi-Modal RAG Agent",
    agent_description=(
        "This agent fuses together the capabilities of Gemini and"
        " Visual Chain of Thought to answer questions based on the"
        " input image."
    ),
    # "auto" lets the agent decide how many reasoning loops to run.
    max_loops="auto",
    autosave=True,
    sop=VISUAL_CHAIN_OF_THOUGHT,
    verbose=True,
    tools=[make_new_file],
    long_term_memory=memory,
)

# The task prompt and the image the agent should describe.
task = (
    "What is the content of this image, return exactly what you see"
    " in the image."
)
img = "images/Screenshot_48.png"

# Run the agent on the task/image pair and print its answer.
out = agent.run(task=task, img=img)
print(out)

@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry] [tool.poetry]
name = "swarms" name = "swarms"
version = "4.1.0" version = "4.1.1"
description = "Swarms - Pytorch" description = "Swarms - Pytorch"
license = "MIT" license = "MIT"
authors = ["Kye Gomez <kye@apac.ai>"] authors = ["Kye Gomez <kye@apac.ai>"]
@ -12,7 +12,7 @@ homepage = "https://github.com/kyegomez/swarms"
documentation = "https://swarms.apac.ai" documentation = "https://swarms.apac.ai"
readme = "README.md" # Assuming you have a README.md readme = "README.md" # Assuming you have a README.md
repository = "https://github.com/kyegomez/swarms" repository = "https://github.com/kyegomez/swarms"
keywords = ["artificial intelligence", "deep learning", "optimizers", "Prompt Engineering"] keywords = ["artificial intelligence", "deep learning", "optimizers", "Prompt Engineering", "swarms", "agents"]
classifiers = [ classifiers = [
"Development Status :: 4 - Beta", "Development Status :: 4 - Beta",
"Intended Audience :: Developers", "Intended Audience :: Developers",

@ -1,3 +1,4 @@
import os
import numpy as np import numpy as np
import logging import logging
import uuid import uuid
@ -10,6 +11,8 @@ from dotenv import load_dotenv
from chromadb.utils.embedding_functions import ( from chromadb.utils.embedding_functions import (
OpenCLIPEmbeddingFunction, OpenCLIPEmbeddingFunction,
) )
from swarms.utils.data_to_text import data_to_text
from swarms.utils.markdown_message import display_markdown_message
# Load environment variables # Load environment variables
@ -51,6 +54,8 @@ class ChromaDB:
embedding_function: Callable = None, embedding_function: Callable = None,
data_loader: Callable = None, data_loader: Callable = None,
multimodal: bool = False, multimodal: bool = False,
docs_folder: str = None,
verbose: bool = False,
*args, *args,
**kwargs, **kwargs,
): ):
@ -58,9 +63,12 @@ class ChromaDB:
self.output_dir = output_dir self.output_dir = output_dir
self.limit_tokens = limit_tokens self.limit_tokens = limit_tokens
self.n_results = n_results self.n_results = n_results
self.docs_folder = docs_folder
self.verbose = verbose
# Disable ChromaDB logging # Disable ChromaDB logging
logging.getLogger("chromadb").setLevel(logging.INFO) if verbose:
logging.getLogger("chromadb").setLevel(logging.INFO)
# Create Chroma collection # Create Chroma collection
chroma_persist_dir = "chroma" chroma_persist_dir = "chroma"
@ -100,6 +108,19 @@ class ChromaDB:
*args, *args,
**kwargs, **kwargs,
) )
display_markdown_message(
"ChromaDB collection created:"
f" {self.collection.name} with metric: {self.metric} and"
f" output directory: {self.output_dir}"
)
# If docs
if docs_folder:
display_markdown_message(
f"Traversing directory: {docs_folder}"
)
self.docs = docs_folder
self.traverse_directory()
def add( def add(
self, self,
@ -161,3 +182,32 @@ class ChromaDB:
return docs[0] return docs[0]
except Exception as e: except Exception as e:
raise Exception(f"Failed to query documents: {str(e)}") raise Exception(f"Failed to query documents: {str(e)}")
def traverse_directory(self):
    """
    Walk ``self.docs`` recursively and ingest every file into the database.

    Image files (``.jpg``/``.jpeg``/``.png``) are collected and added in a
    single batch via ``img_urls``; every other file is converted to text
    with ``data_to_text`` and added individually.

    Returns:
        The result of the last ``self.add`` call, or ``None`` when the
        directory contained no files.
    """
    image_extensions = (".jpg", ".jpeg", ".png")
    images = []
    # Initialize so an empty directory returns None instead of raising
    # UnboundLocalError (the original only bound this inside the loop).
    added_to_db = None
    for root, _dirs, files in os.walk(self.docs):
        for file in files:
            # Build the full path: the original passed the bare filename
            # to data_to_text, so files in subdirectories were never found.
            path = os.path.join(root, file)
            _, ext = os.path.splitext(file)
            if ext.lower() in image_extensions:
                images.append(path)
            else:
                data = data_to_text(path)
                added_to_db = self.add([data])
                print(f"{file} added to Database")
    if images:
        # NOTE(review): the original passes [images] (a list nested in a
        # list) as img_urls; preserved verbatim — confirm against the
        # signature of ChromaDB.add.
        added_to_db = self.add(img_urls=[images])
        print(f"{len(images)} images added to Database ")
    return added_to_db

@ -1,4 +1,9 @@
def worker_agent_system(name: str, memory: str = None): import datetime
time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def worker_tools_sop_promp(name: str, memory: str = None):
return """ return """
You are {name}, You are {name},
Your decisions must always be made independently without seeking user assistance. Your decisions must always be made independently without seeking user assistance.
@ -52,7 +57,7 @@ def worker_agent_system(name: str, memory: str = None):
} }
} }
Ensure the response can be parsed by Python json.loads Ensure the response can be parsed by Python json.loads
System: The current time and date is Sat Jan 20 10:39:07 2024 System: The current time and date is {time}
System: This reminds you of these events from your past: System: This reminds you of these events from your past:
[{memory}] [{memory}]

@ -27,6 +27,8 @@ from swarms.utils.parse_code import (
) )
from swarms.utils.pdf_to_text import pdf_to_text from swarms.utils.pdf_to_text import pdf_to_text
from swarms.utils.token_count_tiktoken import limit_tokens_from_string from swarms.utils.token_count_tiktoken import limit_tokens_from_string
from swarms.tools.exec_tool import execute_tool_by_name
from swarms.prompts.worker_prompt import worker_tools_sop_promp
# Utils # Utils
@ -179,7 +181,7 @@ class Agent:
docs_folder: str = None, docs_folder: str = None,
verbose: bool = False, verbose: bool = False,
*args, *args,
**kwargs: Any, **kwargs,
): ):
self.id = id self.id = id
self.llm = llm self.llm = llm
@ -264,6 +266,14 @@ class Agent:
if verbose: if verbose:
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
# If tools are provided then set the tool prompt by adding to sop
if self.tools:
self.sop = self.sop + worker_tools_sop_promp(
self.agent_name, memory=""
)
# If the long term memory is provided then set the long term memory prompt
def set_system_prompt(self, system_prompt: str): def set_system_prompt(self, system_prompt: str):
"""Set the system prompt""" """Set the system prompt"""
self.system_prompt = system_prompt self.system_prompt = system_prompt
@ -545,6 +555,14 @@ class Agent:
if self.code_interpreter: if self.code_interpreter:
self.run_code(response) self.run_code(response)
# If tools are enabled then execute the tools
if self.tools:
execute_tool_by_name(
response,
self.tools,
self.stopping_condition,
)
# If interactive mode is enabled then print the response and get user input # If interactive mode is enabled then print the response and get user input
if self.interactive: if self.interactive:
print(f"AI: {response}") print(f"AI: {response}")
@ -656,7 +674,7 @@ class Agent:
""" """
return agent_history_prompt return agent_history_prompt
def long_term_memory_prompt(self, query: str): def long_term_memory_prompt(self, query: str, *args, **kwargs):
""" """
Generate the agent long term memory prompt Generate the agent long term memory prompt
@ -667,12 +685,10 @@ class Agent:
Returns: Returns:
str: The agent history prompt str: The agent history prompt
""" """
ltr = str(self.long_term_memory.query(query)) ltr = str(self.long_term_memory.query(query), *args, **kwargs)
context = f""" context = f"""
{query} System: This reminds you of these events from your past: [{ltr}]
####### Long Term Memory ################
{ltr}
""" """
return self.short_memory.add( return self.short_memory.add(
role=self.agent_name, content=context role=self.agent_name, content=context

Loading…
Cancel
Save