[EXAMPLE][Fully Multi-Modal RAG agent]

pull/383/head
Kye 11 months ago
parent c62577014c
commit c529244913

.gitignore vendored

@ -10,6 +10,7 @@ dataframe/
static/generated
runs
chroma
swarms/__pycache__
venv
.DS_Store

@ -1,31 +1,11 @@
import os
from dotenv import load_dotenv
# Import the OpenAIChat model and the Agent struct
from swarms import Agent, OpenAIChat
# Load the environment variables
load_dotenv()
# Get the API key from the environment
api_key = os.environ.get("OPENAI_API_KEY")
# Initialize the language model
llm = OpenAIChat(
temperature=0.5,
model_name="gpt-4",
openai_api_key=api_key,
max_tokens=1000,
)
## Initialize the workflow
agent = Agent(
    llm=llm,
    max_loops="auto",
    autosave=True,
    dashboard=False,
    # docs_folder="docs",
    streaming_on=True,
    verbose=True,
)
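A minimal sketch of how this agent might be invoked (the task string is illustrative and not part of the commit):

# Illustrative only: run the agent on a sample task. With
# max_loops="auto" the agent is expected to loop until a stopping
# condition is reached rather than for a fixed number of iterations
# (an assumption based on the parameter name).
out = agent.run(task="Summarize the key ideas behind multi-agent swarms.")
print(out)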

@ -0,0 +1,79 @@
# Importing necessary modules
import os
from dotenv import load_dotenv
from swarms import Agent, OpenAIChat
from swarms.tools.tool import tool
from swarms.prompts.visual_cot import VISUAL_CHAIN_OF_THOUGHT
from swarms.memory.chroma_db import ChromaDB
# Loading environment variables from .env file
load_dotenv()
# Getting the Gemini API key from environment variables
gemini_api_key = os.getenv("GEMINI_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")
llm = OpenAIChat(
openai_api_key=openai_api_key,
max_tokens=1000,
temperature=0.2,
)
# Making an instance of the ChromaDB class
memory = ChromaDB(
metric="cosine",
n_results=3,
multimodal=True,
docs_folder="images",
output_dir="results",
)
# Define a tool by creating a function, wrapping it with the @tool decorator,
# and documenting its parameters and usage in the docstring.
@tool
def make_new_file(file: str, content: str):
    """
    Create a new file with the given name and write the given content to it.

    Parameters:
        file (str): The name of the file to be created.
        content (str): The content to write into the file.

    Returns:
        dict: A dictionary containing the status of the operation.
    """
    with open(file, "w") as f:
        f.write(content)
    return {"status": "success", "file": file}
# Initializing the agent with the language model, memory, and tools
agent = Agent(
llm=llm,
agent_name="Multi-Modal RAG Agent",
    agent_description=(
        "This agent fuses together a multi-modal LLM and Visual"
        " Chain of Thought to answer questions based on the input"
        " image."
    ),
max_loops="auto",
autosave=True,
sop=VISUAL_CHAIN_OF_THOUGHT,
verbose=True,
tools=[make_new_file],
long_term_memory=memory,
)
# Defining the task and image path
task = (
"What is the content of this image, return exactly what you see"
" in the image."
)
img = "images/Screenshot_48.png"
# Running the agent with the specified task and image
out = agent.run(task=task, img=img)
print(out)
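As a hedged follow-up (not part of the commit), the registered make_new_file tool could be exercised by asking the agent to persist its answer; the follow-up task and file name are illustrative:

# Illustrative follow-up: ask the agent to save its description of the
# image via the make_new_file tool registered above.
follow_up = (
    "Describe the image, then use the make_new_file tool to write that"
    " description to 'image_description.txt'."
)
print(agent.run(task=follow_up, img=img))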

@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "swarms"
version = "4.1.0"
version = "4.1.1"
description = "Swarms - Pytorch"
license = "MIT"
authors = ["Kye Gomez <kye@apac.ai>"]
@ -12,7 +12,7 @@ homepage = "https://github.com/kyegomez/swarms"
documentation = "https://swarms.apac.ai"
readme = "README.md" # Assuming you have a README.md
repository = "https://github.com/kyegomez/swarms"
keywords = ["artificial intelligence", "deep learning", "optimizers", "Prompt Engineering"]
keywords = ["artificial intelligence", "deep learning", "optimizers", "Prompt Engineering", "swarms", "agents"]
classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",

@ -1,3 +1,4 @@
import os
import numpy as np
import logging
import uuid
@ -10,6 +11,8 @@ from dotenv import load_dotenv
from chromadb.utils.embedding_functions import (
OpenCLIPEmbeddingFunction,
)
from swarms.utils.data_to_text import data_to_text
from swarms.utils.markdown_message import display_markdown_message
# Load environment variables
@ -51,6 +54,8 @@ class ChromaDB:
embedding_function: Callable = None,
data_loader: Callable = None,
multimodal: bool = False,
docs_folder: str = None,
verbose: bool = False,
*args,
**kwargs,
):
@ -58,8 +63,11 @@ class ChromaDB:
self.output_dir = output_dir
self.limit_tokens = limit_tokens
self.n_results = n_results
self.docs_folder = docs_folder
self.verbose = verbose
# Enable ChromaDB logging when verbose mode is on
if verbose:
logging.getLogger("chromadb").setLevel(logging.INFO)
# Create Chroma collection
@ -100,6 +108,19 @@ class ChromaDB:
*args,
**kwargs,
)
display_markdown_message(
"ChromaDB collection created:"
f" {self.collection.name} with metric: {self.metric} and"
f" output directory: {self.output_dir}"
)
# If docs
if docs_folder:
display_markdown_message(
f"Traversing directory: {docs_folder}"
)
self.docs = docs_folder
self.traverse_directory()
def add(
self,
@ -161,3 +182,32 @@ class ChromaDB:
return docs[0]
except Exception as e:
raise Exception(f"Failed to query documents: {str(e)}")
def traverse_directory(self):
    """
    Traverse every file in the docs folder and its subdirectories and
    add it to the database: images are added by path, all other files
    are converted to text first.

    Returns:
        The result of the last add() call, or None if the folder was empty.
    """
    image_extensions = [
        ".jpg",
        ".jpeg",
        ".png",
    ]
    images = []
    added_to_db = None
    for root, dirs, files in os.walk(self.docs):
        for file in files:
            _, ext = os.path.splitext(file)
            if ext.lower() in image_extensions:
                images.append(os.path.join(root, file))
            else:
                # Use the full path so files in subdirectories are found
                data = data_to_text(os.path.join(root, file))
                added_to_db = self.add([data])
                print(f"{file} added to Database")
    if images:
        added_to_db = self.add(img_urls=[images])
        print(f"{len(images)} images added to Database")
    return added_to_db
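Since the constructor now calls traverse_directory() whenever docs_folder is set, pointing the store at a folder indexes its contents immediately. A minimal sketch, assuming an images/ folder exists and mirroring the constructor call from the example above:

# Minimal sketch (assumes an "images" folder with a few screenshots).
memory = ChromaDB(
    metric="cosine",
    n_results=3,
    multimodal=True,
    docs_folder="images",  # triggers traverse_directory() on init
    output_dir="results",
)
# query() returns the best-matching stored documents for a text query.
print(memory.query("a screenshot of a settings page"))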

@ -1,4 +1,9 @@
import datetime

time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def worker_tools_sop_promp(name: str, memory: str = None):
return """
You are {name},
Your decisions must always be made independently without seeking user assistance.
@ -52,7 +57,7 @@ def worker_agent_system(name: str, memory: str = None):
}
}
Ensure the response can be parsed by Python json.loads
System: The current time and date is Sat Jan 20 10:39:07 2024
System: The current time and date is {time}
System: This reminds you of these events from your past:
[{memory}]
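A small sketch of how this prompt is consumed: Agent.__init__ (in the agent.py hunk below) appends it to the SOP whenever tools are registered, so it can also be rendered directly; the agent name here is illustrative:

# Illustrative: render the tool-use SOP for an agent named "Researcher"
# with an empty memory block, mirroring the Agent.__init__ call below.
from swarms.prompts.worker_prompt import worker_tools_sop_promp

print(worker_tools_sop_promp("Researcher", memory=""))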

@ -27,6 +27,8 @@ from swarms.utils.parse_code import (
)
from swarms.utils.pdf_to_text import pdf_to_text
from swarms.utils.token_count_tiktoken import limit_tokens_from_string
from swarms.tools.exec_tool import execute_tool_by_name
from swarms.prompts.worker_prompt import worker_tools_sop_promp
# Utils
@ -179,7 +181,7 @@ class Agent:
docs_folder: str = None,
verbose: bool = False,
*args,
**kwargs: Any,
**kwargs,
):
self.id = id
self.llm = llm
@ -264,6 +266,14 @@ class Agent:
if verbose:
logger.setLevel(logging.DEBUG)
# If tools are provided then set the tool prompt by adding to sop
if self.tools:
self.sop = self.sop + worker_tools_sop_promp(
self.agent_name, memory=""
)
# If the long term memory is provided then set the long term memory prompt
def set_system_prompt(self, system_prompt: str):
"""Set the system prompt"""
self.system_prompt = system_prompt
@ -545,6 +555,14 @@ class Agent:
if self.code_interpreter:
self.run_code(response)
# If tools are enabled then execute the tools
if self.tools:
execute_tool_by_name(
response,
self.tools,
self.stopping_condition,
)
# If interactive mode is enabled then print the response and get user input
if self.interactive:
print(f"AI: {response}")
@ -656,7 +674,7 @@ class Agent:
"""
return agent_history_prompt
def long_term_memory_prompt(self, query: str):
def long_term_memory_prompt(self, query: str, *args, **kwargs):
"""
Generate the agent long term memory prompt
@ -667,12 +685,10 @@ class Agent:
Returns:
str: The agent history prompt
"""
ltr = str(self.long_term_memory.query(query))
ltr = str(self.long_term_memory.query(query, *args, **kwargs))
context = f"""
{query}
####### Long Term Memory ################
{ltr}
System: This reminds you of these events from your past: [{ltr}]
"""
return self.short_memory.add(
role=self.agent_name, content=context
