[EXAMPLE][Fully Multi Modal Rag agent]

pull/383/head
Kye 11 months ago
parent c62577014c
commit c529244913

1
.gitignore vendored

@ -10,6 +10,7 @@ dataframe/
static/generated static/generated
runs runs
chroma
swarms/__pycache__ swarms/__pycache__
venv venv
.DS_Store .DS_Store

@ -1,31 +1,11 @@
import os
from dotenv import load_dotenv
# Import the OpenAIChat model and the Agent struct
from swarms import Agent, OpenAIChat from swarms import Agent, OpenAIChat
# Load the environment variables
load_dotenv()
# Get the API key from the environment
api_key = os.environ.get("OPENAI_API_KEY")
# Initialize the language model
llm = OpenAIChat(
temperature=0.5,
model_name="gpt-4",
openai_api_key=api_key,
max_tokens=1000,
)
## Initialize the workflow ## Initialize the workflow
agent = Agent( agent = Agent(
llm=llm, llm=OpenAIChat(),
max_loops=4, max_loops="auto",
autosave=True, autosave=True,
dashboard=False, dashboard=False,
# docs_folder="docs",
streaming_on=True, streaming_on=True,
verbose=True, verbose=True,
) )

@ -0,0 +1,79 @@
# Importing necessary modules
import os
from dotenv import load_dotenv
from swarms import Agent, OpenAIChat
from swarms.tools.tool import tool
from swarms.prompts.visual_cot import VISUAL_CHAIN_OF_THOUGHT
from swarms.memory.chroma_db import ChromaDB
# Load environment variables (API keys) from a local .env file.
load_dotenv()

# NOTE(review): GEMINI_API_KEY is read but never used in this script —
# the agent below runs on OpenAIChat. Confirm whether it can be removed.
gemini_api_key = os.getenv("GEMINI_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Language model backing the agent: low temperature for deterministic,
# literal image descriptions.
llm = OpenAIChat(
    openai_api_key=openai_api_key,
    max_tokens=1000,
    temperature=0.2,
)

# Long-term memory store. multimodal=True enables image embeddings;
# docs_folder is traversed and ingested at construction time.
memory = ChromaDB(
    metric="cosine",
    n_results=3,
    multimodal=True,
    docs_folder="images",
    output_dir="results",
)
# Defining tool by creating a function and wrapping it with the @tool decorator and
# providing the necessary parameters and docstrings to show the usage of the tool.
@tool
def make_new_file(file: str, content: str):
    """
    Create a new file and write ``content`` into it.

    Parameters:
        file (str): The name (path) of the file to be created.
        content (str): The text written into the new file.

    Returns:
        dict: A dictionary containing the status of the operation.
    """
    with open(file, "w") as f:
        f.write(f"{content}")
    # Return the status dict the docstring promises; the original
    # implementation fell through and returned None.
    return {"status": "success", "file": file}
# Initialize the agent with the OpenAIChat llm, the visual chain-of-thought
# SOP, the file-creation tool, and the ChromaDB long-term memory.
# (NOTE(review): agent_description mentions Gemini, but the llm wired in
# above is OpenAIChat — the description text is kept verbatim.)
agent = Agent(
    llm=llm,
    agent_name="Multi-Modal RAG Agent",
    agent_description=(
        "This agent fuses together the capabilities of Gemini and"
        " Visual Chain of Thought to answer questions based on the"
        " input image."
    ),
    # "auto" lets the agent decide how many reasoning loops to run.
    max_loops="auto",
    autosave=True,
    sop=VISUAL_CHAIN_OF_THOUGHT,
    verbose=True,
    tools=[make_new_file],
    long_term_memory=memory,
)

# The task prompt and the image the agent should describe.
task = (
    "What is the content of this image, return exactly what you see"
    " in the image."
)
img = "images/Screenshot_48.png"

# Run the agent on the task/image pair and print its answer.
out = agent.run(task=task, img=img)
print(out)

@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry] [tool.poetry]
name = "swarms" name = "swarms"
version = "4.1.0" version = "4.1.1"
description = "Swarms - Pytorch" description = "Swarms - Pytorch"
license = "MIT" license = "MIT"
authors = ["Kye Gomez <kye@apac.ai>"] authors = ["Kye Gomez <kye@apac.ai>"]
@ -12,7 +12,7 @@ homepage = "https://github.com/kyegomez/swarms"
documentation = "https://swarms.apac.ai" documentation = "https://swarms.apac.ai"
readme = "README.md" # Assuming you have a README.md readme = "README.md" # Assuming you have a README.md
repository = "https://github.com/kyegomez/swarms" repository = "https://github.com/kyegomez/swarms"
keywords = ["artificial intelligence", "deep learning", "optimizers", "Prompt Engineering"] keywords = ["artificial intelligence", "deep learning", "optimizers", "Prompt Engineering", "swarms", "agents"]
classifiers = [ classifiers = [
"Development Status :: 4 - Beta", "Development Status :: 4 - Beta",
"Intended Audience :: Developers", "Intended Audience :: Developers",

@ -1,3 +1,4 @@
import os
import numpy as np import numpy as np
import logging import logging
import uuid import uuid
@ -10,6 +11,8 @@ from dotenv import load_dotenv
from chromadb.utils.embedding_functions import ( from chromadb.utils.embedding_functions import (
OpenCLIPEmbeddingFunction, OpenCLIPEmbeddingFunction,
) )
from swarms.utils.data_to_text import data_to_text
from swarms.utils.markdown_message import display_markdown_message
# Load environment variables # Load environment variables
@ -51,6 +54,8 @@ class ChromaDB:
embedding_function: Callable = None, embedding_function: Callable = None,
data_loader: Callable = None, data_loader: Callable = None,
multimodal: bool = False, multimodal: bool = False,
docs_folder: str = None,
verbose: bool = False,
*args, *args,
**kwargs, **kwargs,
): ):
@ -58,9 +63,12 @@ class ChromaDB:
self.output_dir = output_dir self.output_dir = output_dir
self.limit_tokens = limit_tokens self.limit_tokens = limit_tokens
self.n_results = n_results self.n_results = n_results
self.docs_folder = docs_folder
self.verbose = verbose
# Disable ChromaDB logging # Disable ChromaDB logging
logging.getLogger("chromadb").setLevel(logging.INFO) if verbose:
logging.getLogger("chromadb").setLevel(logging.INFO)
# Create Chroma collection # Create Chroma collection
chroma_persist_dir = "chroma" chroma_persist_dir = "chroma"
@ -100,6 +108,19 @@ class ChromaDB:
*args, *args,
**kwargs, **kwargs,
) )
display_markdown_message(
"ChromaDB collection created:"
f" {self.collection.name} with metric: {self.metric} and"
f" output directory: {self.output_dir}"
)
# If docs
if docs_folder:
display_markdown_message(
f"Traversing directory: {docs_folder}"
)
self.docs = docs_folder
self.traverse_directory()
def add( def add(
self, self,
@ -161,3 +182,32 @@ class ChromaDB:
return docs[0] return docs[0]
except Exception as e: except Exception as e:
raise Exception(f"Failed to query documents: {str(e)}") raise Exception(f"Failed to query documents: {str(e)}")
def traverse_directory(self):
    """
    Walk ``self.docs`` recursively and ingest every file into the database.

    Image files (``.jpg``/``.jpeg``/``.png``) are collected and added in a
    single batch via ``img_urls``; every other file is converted to text
    with ``data_to_text`` and added individually.

    Returns:
        The result of the last ``self.add`` call, or ``None`` when the
        directory contained no files.
    """
    image_extensions = (".jpg", ".jpeg", ".png")
    images = []
    # Initialize so an empty directory returns None instead of raising
    # UnboundLocalError (the original only bound this inside the loop).
    added_to_db = None
    for root, _dirs, files in os.walk(self.docs):
        for file in files:
            # Build the full path: the original passed the bare filename
            # to data_to_text, so files in subdirectories were never found.
            path = os.path.join(root, file)
            _, ext = os.path.splitext(file)
            if ext.lower() in image_extensions:
                images.append(path)
            else:
                data = data_to_text(path)
                added_to_db = self.add([data])
                print(f"{file} added to Database")
    if images:
        # NOTE(review): the original passes [images] (a list nested in a
        # list) as img_urls; preserved verbatim — confirm against the
        # signature of ChromaDB.add.
        added_to_db = self.add(img_urls=[images])
        print(f"{len(images)} images added to Database ")
    return added_to_db

@ -1,4 +1,9 @@
def worker_agent_system(name: str, memory: str = None): import datetime
time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def worker_tools_sop_promp(name: str, memory: str = None):
return """ return """
You are {name}, You are {name},
Your decisions must always be made independently without seeking user assistance. Your decisions must always be made independently without seeking user assistance.
@ -52,7 +57,7 @@ def worker_agent_system(name: str, memory: str = None):
} }
} }
Ensure the response can be parsed by Python json.loads Ensure the response can be parsed by Python json.loads
System: The current time and date is Sat Jan 20 10:39:07 2024 System: The current time and date is {time}
System: This reminds you of these events from your past: System: This reminds you of these events from your past:
[{memory}] [{memory}]

@ -27,6 +27,8 @@ from swarms.utils.parse_code import (
) )
from swarms.utils.pdf_to_text import pdf_to_text from swarms.utils.pdf_to_text import pdf_to_text
from swarms.utils.token_count_tiktoken import limit_tokens_from_string from swarms.utils.token_count_tiktoken import limit_tokens_from_string
from swarms.tools.exec_tool import execute_tool_by_name
from swarms.prompts.worker_prompt import worker_tools_sop_promp
# Utils # Utils
@ -179,7 +181,7 @@ class Agent:
docs_folder: str = None, docs_folder: str = None,
verbose: bool = False, verbose: bool = False,
*args, *args,
**kwargs: Any, **kwargs,
): ):
self.id = id self.id = id
self.llm = llm self.llm = llm
@ -264,6 +266,14 @@ class Agent:
if verbose: if verbose:
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
# If tools are provided then set the tool prompt by adding to sop
if self.tools:
self.sop = self.sop + worker_tools_sop_promp(
self.agent_name, memory=""
)
# If the long term memory is provided then set the long term memory prompt
def set_system_prompt(self, system_prompt: str): def set_system_prompt(self, system_prompt: str):
"""Set the system prompt""" """Set the system prompt"""
self.system_prompt = system_prompt self.system_prompt = system_prompt
@ -545,6 +555,14 @@ class Agent:
if self.code_interpreter: if self.code_interpreter:
self.run_code(response) self.run_code(response)
# If tools are enabled then execute the tools
if self.tools:
execute_tool_by_name(
response,
self.tools,
self.stopping_condition,
)
# If interactive mode is enabled then print the response and get user input # If interactive mode is enabled then print the response and get user input
if self.interactive: if self.interactive:
print(f"AI: {response}") print(f"AI: {response}")
@ -656,7 +674,7 @@ class Agent:
""" """
return agent_history_prompt return agent_history_prompt
def long_term_memory_prompt(self, query: str): def long_term_memory_prompt(self, query: str, *args, **kwargs):
""" """
Generate the agent long term memory prompt Generate the agent long term memory prompt
@ -667,12 +685,10 @@ class Agent:
Returns: Returns:
str: The agent history prompt str: The agent history prompt
""" """
ltr = str(self.long_term_memory.query(query)) ltr = str(self.long_term_memory.query(query), *args, **kwargs)
context = f""" context = f"""
{query} System: This reminds you of these events from your past: [{ltr}]
####### Long Term Memory ################
{ltr}
""" """
return self.short_memory.add( return self.short_memory.add(
role=self.agent_name, content=context role=self.agent_name, content=context

Loading…
Cancel
Save