[EXAMPLE][Fully Multi-Modal RAG agent]

pull/383/head
Kye 11 months ago
parent c62577014c
commit c529244913

.gitignore vendored

@ -10,6 +10,7 @@ dataframe/
static/generated
runs
chroma
swarms/__pycache__
venv
.DS_Store

@ -1,31 +1,11 @@
import os
from dotenv import load_dotenv
# Import the OpenAIChat model and the Agent struct
from swarms import Agent, OpenAIChat
# Load the environment variables
load_dotenv()
# Get the API key from the environment
api_key = os.environ.get("OPENAI_API_KEY")
# Initialize the language model
llm = OpenAIChat(
temperature=0.5,
model_name="gpt-4",
openai_api_key=api_key,
max_tokens=1000,
)
## Initialize the workflow
agent = Agent(
    llm=llm,
    max_loops="auto",
    autosave=True,
    dashboard=False,
    # docs_folder="docs",
    streaming_on=True,
    verbose=True,
)
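A minimal sketch of how this agent might be invoked (the task string is illustrative and not part of the commit):

# Illustrative only: run the agent on a sample task. With
# max_loops="auto" the agent is expected to loop until a stopping
# condition is reached rather than for a fixed number of iterations
# (an assumption based on the parameter name).
out = agent.run(task="Summarize the key ideas behind multi-agent swarms.")
print(out)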

@ -0,0 +1,79 @@
# Importing necessary modules
import os
from dotenv import load_dotenv
from swarms import Agent, OpenAIChat
from swarms.tools.tool import tool
from swarms.prompts.visual_cot import VISUAL_CHAIN_OF_THOUGHT
from swarms.memory.chroma_db import ChromaDB
# Loading environment variables from .env file
load_dotenv()
# Getting the Gemini API key from environment variables
gemini_api_key = os.getenv("GEMINI_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")
llm = OpenAIChat(
openai_api_key=openai_api_key,
max_tokens=1000,
temperature=0.2,
)
# Making an instance of the ChromaDB class
memory = ChromaDB(
metric="cosine",
n_results=3,
multimodal=True,
docs_folder="images",
output_dir="results",
)
# Define a tool by creating a function, wrapping it with the @tool decorator,
# and documenting its parameters and usage in the docstring.
@tool
def make_new_file(file: str, content: str):
    """
    Create a new file with the given name and write the given content to it.

    Parameters:
        file (str): The name of the file to be created.
        content (str): The content to write into the file.

    Returns:
        dict: A dictionary containing the status of the operation.
    """
    with open(file, "w") as f:
        f.write(content)
    return {"status": "success", "file": file}
# Initializing the agent with the language model, memory, and tools
agent = Agent(
llm=llm,
agent_name="Multi-Modal RAG Agent",
    agent_description=(
        "This agent fuses together a multi-modal LLM and Visual"
        " Chain of Thought to answer questions based on the input"
        " image."
    ),
max_loops="auto",
autosave=True,
sop=VISUAL_CHAIN_OF_THOUGHT,
verbose=True,
tools=[make_new_file],
long_term_memory=memory,
)
# Defining the task and image path
task = (
"What is the content of this image, return exactly what you see"
" in the image."
)
img = "images/Screenshot_48.png"
# Running the agent with the specified task and image
out = agent.run(task=task, img=img)
print(out)
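As a hedged follow-up (not part of the commit), the registered make_new_file tool could be exercised by asking the agent to persist its answer; the follow-up task and file name are illustrative:

# Illustrative follow-up: ask the agent to save its description of the
# image via the make_new_file tool registered above.
follow_up = (
    "Describe the image, then use the make_new_file tool to write that"
    " description to 'image_description.txt'."
)
print(agent.run(task=follow_up, img=img))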

@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "swarms"
version = "4.1.0"
version = "4.1.1"
description = "Swarms - Pytorch"
license = "MIT"
authors = ["Kye Gomez <kye@apac.ai>"]
@ -12,7 +12,7 @@ homepage = "https://github.com/kyegomez/swarms"
documentation = "https://swarms.apac.ai"
readme = "README.md" # Assuming you have a README.md
repository = "https://github.com/kyegomez/swarms"
keywords = ["artificial intelligence", "deep learning", "optimizers", "Prompt Engineering"]
keywords = ["artificial intelligence", "deep learning", "optimizers", "Prompt Engineering", "swarms", "agents"]
classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",

@ -1,3 +1,4 @@
import os
import numpy as np
import logging
import uuid
@ -10,6 +11,8 @@ from dotenv import load_dotenv
from chromadb.utils.embedding_functions import (
OpenCLIPEmbeddingFunction,
)
from swarms.utils.data_to_text import data_to_text
from swarms.utils.markdown_message import display_markdown_message
# Load environment variables
@ -51,6 +54,8 @@ class ChromaDB:
embedding_function: Callable = None,
data_loader: Callable = None,
multimodal: bool = False,
docs_folder: str = None,
verbose: bool = False,
*args,
**kwargs,
):
@ -58,8 +63,11 @@ class ChromaDB:
self.output_dir = output_dir
self.limit_tokens = limit_tokens
self.n_results = n_results
self.docs_folder = docs_folder
self.verbose = verbose
# Enable ChromaDB logging when verbose mode is on
if verbose:
logging.getLogger("chromadb").setLevel(logging.INFO)
# Create Chroma collection
@ -100,6 +108,19 @@ class ChromaDB:
*args,
**kwargs,
)
display_markdown_message(
"ChromaDB collection created:"
f" {self.collection.name} with metric: {self.metric} and"
f" output directory: {self.output_dir}"
)
# If docs
if docs_folder:
display_markdown_message(
f"Traversing directory: {docs_folder}"
)
self.docs = docs_folder
self.traverse_directory()
def add(
self,
@ -161,3 +182,32 @@ class ChromaDB:
return docs[0]
except Exception as e:
raise Exception(f"Failed to query documents: {str(e)}")
def traverse_directory(self):
    """
    Traverse every file in the docs folder and its subdirectories and
    add it to the database: images are added by path, all other files
    are converted to text first.

    Returns:
        The result of the last add() call, or None if the folder was empty.
    """
    image_extensions = [
        ".jpg",
        ".jpeg",
        ".png",
    ]
    images = []
    added_to_db = None
    for root, dirs, files in os.walk(self.docs):
        for file in files:
            _, ext = os.path.splitext(file)
            if ext.lower() in image_extensions:
                images.append(os.path.join(root, file))
            else:
                # Use the full path so files in subdirectories are found
                data = data_to_text(os.path.join(root, file))
                added_to_db = self.add([data])
                print(f"{file} added to Database")
    if images:
        added_to_db = self.add(img_urls=[images])
        print(f"{len(images)} images added to Database")
    return added_to_db
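Since the constructor now calls traverse_directory() whenever docs_folder is set, pointing the store at a folder indexes its contents immediately. A minimal sketch, assuming an images/ folder exists and mirroring the constructor call from the example above:

# Minimal sketch (assumes an "images" folder with a few screenshots).
memory = ChromaDB(
    metric="cosine",
    n_results=3,
    multimodal=True,
    docs_folder="images",  # triggers traverse_directory() on init
    output_dir="results",
)
# query() returns the best-matching stored documents for a text query.
print(memory.query("a screenshot of a settings page"))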

@ -1,4 +1,9 @@
import datetime

time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def worker_tools_sop_promp(name: str, memory: str = None):
return """
You are {name},
Your decisions must always be made independently without seeking user assistance.
@ -52,7 +57,7 @@ def worker_agent_system(name: str, memory: str = None):
}
}
Ensure the response can be parsed by Python json.loads
System: The current time and date is Sat Jan 20 10:39:07 2024
System: The current time and date is {time}
System: This reminds you of these events from your past:
[{memory}]
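A small sketch of how this prompt is consumed: Agent.__init__ (in the agent.py hunk below) appends it to the SOP whenever tools are registered, so it can also be rendered directly; the agent name here is illustrative:

# Illustrative: render the tool-use SOP for an agent named "Researcher"
# with an empty memory block, mirroring the Agent.__init__ call below.
from swarms.prompts.worker_prompt import worker_tools_sop_promp

print(worker_tools_sop_promp("Researcher", memory=""))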

@ -27,6 +27,8 @@ from swarms.utils.parse_code import (
)
from swarms.utils.pdf_to_text import pdf_to_text
from swarms.utils.token_count_tiktoken import limit_tokens_from_string
from swarms.tools.exec_tool import execute_tool_by_name
from swarms.prompts.worker_prompt import worker_tools_sop_promp
# Utils
@ -179,7 +181,7 @@ class Agent:
docs_folder: str = None,
verbose: bool = False,
*args,
**kwargs: Any,
**kwargs,
):
self.id = id
self.llm = llm
@ -264,6 +266,14 @@ class Agent:
if verbose:
logger.setLevel(logging.DEBUG)
# If tools are provided then set the tool prompt by adding to sop
if self.tools:
self.sop = self.sop + worker_tools_sop_promp(
self.agent_name, memory=""
)
# If the long term memory is provided then set the long term memory prompt
def set_system_prompt(self, system_prompt: str):
"""Set the system prompt"""
self.system_prompt = system_prompt
@ -545,6 +555,14 @@ class Agent:
if self.code_interpreter:
self.run_code(response)
# If tools are enabled then execute the tools
if self.tools:
execute_tool_by_name(
response,
self.tools,
self.stopping_condition,
)
# If interactive mode is enabled then print the response and get user input
if self.interactive:
print(f"AI: {response}")
@ -656,7 +674,7 @@ class Agent:
"""
return agent_history_prompt
def long_term_memory_prompt(self, query: str):
def long_term_memory_prompt(self, query: str, *args, **kwargs):
"""
Generate the agent long term memory prompt
@ -667,12 +685,10 @@ class Agent:
Returns:
str: The agent history prompt
"""
ltr = str(self.long_term_memory.query(query))
ltr = str(self.long_term_memory.query(query, *args, **kwargs))
context = f"""
{query}
####### Long Term Memory ################
{ltr}
System: This reminds you of these events from your past: [{ltr}]
"""
return self.short_memory.add(
role=self.agent_name, content=context
