From f422450347269fba1147e8fbda10add6458a5011 Mon Sep 17 00:00:00 2001
From: Richard Anthony Hein
Date: Sun, 25 Aug 2024 09:45:17 +0000
Subject: [PATCH] updated vector_storage.py to parse URLs more efficiently for markdown links

---
 playground/demos/chatbot/server/README.md  |  19 +++
 .../demos/chatbot/server/vector_storage.py | 159 +++++++++++-------
 playground/demos/chatbot/start-chatbot.sh  |   6 +
 3 files changed, 125 insertions(+), 59 deletions(-)
 create mode 100644 playground/demos/chatbot/start-chatbot.sh

diff --git a/playground/demos/chatbot/server/README.md b/playground/demos/chatbot/server/README.md
index a8e35840..987f15ed 100644
--- a/playground/demos/chatbot/server/README.md
+++ b/playground/demos/chatbot/server/README.md
@@ -8,6 +8,25 @@
 
 * Switching from vLLM to another host like Ollama requires commenting/uncommenting some code at this time, but will be dynamic later.
 
+* Locally hosted or cloud [vector storage](./vector_storage.py)
+
+## Prerequisites
+
+* Install [Redis Stack](https://redis.io/docs/latest/operate/oss_and_stack/install/install-stack/)
+
+* [Firecrawl](https://docs.firecrawl.dev/contributing)
+
+## Start Redis Stack and Firecrawl
+
+```bash
+    sudo apt-get install redis-stack-server
+    sudo systemctl enable redis-stack-server
+    sudo systemctl start redis-stack-server
+    cd ~/firecrawl/apps/api
+    pnpm run workers
+    pnpm run start
+```
+
 ## Running vLLM
 
 Running vLLM in a docker container saves a lot of trouble. Use dockerRunVllm.sh to set up and start vLLM. This command will allow you to control vLLM using docker commands:
diff --git a/playground/demos/chatbot/server/vector_storage.py b/playground/demos/chatbot/server/vector_storage.py
index f440a82d..a3977b64 100644
--- a/playground/demos/chatbot/server/vector_storage.py
+++ b/playground/demos/chatbot/server/vector_storage.py
@@ -1,3 +1,4 @@
+""" Vector storage using Firecrawl and Redis """
 import re
 from urllib.parse import urlparse, urljoin
 import redis
@@ -5,11 +6,12 @@ from firecrawl import FirecrawlApp
 from redisvl.utils.vectorize import HFTextVectorizer
 from redisvl.index import SearchIndex
 from redisvl.schema import IndexSchema
-from redisvl.query import VectorQuery
+from redisvl.query.filter import Tag
+from redisvl.query import VectorQuery, FilterQuery
 
 class RedisVectorStorage:
     """ Provides vector storage database operations using Redis """
-    def __init__(self, context: str="swarms", use_gpu=False):
+    def __init__(self, context: str="swarms", use_gpu=False, overwrite=False):
         self.use_gpu = use_gpu
         self.context = context
         # Initialize the FirecrawlApp with your API key
@@ -28,7 +30,6 @@ class RedisVectorStorage:
         schema = IndexSchema.from_dict({
             "index": {
                 "name": index_name,
-                "prefix": "chunk"
             },
             "fields": [
                 {
@@ -54,28 +55,37 @@ class RedisVectorStorage:
                         "algorithm": "hnsw",
                         "datatype": "float32"
                     }
+                },
+                {
+                    "name": "source_url",
+                    "type": "text",
+                    "attrs": {
+                        "sortable": True
+                    }
                 }
             ]
         })
 
         self.schema = schema
         self.index = SearchIndex(self.schema, self.redis_client)
-        self.index.create()
+        self.index.create(overwrite=overwrite, drop=overwrite)
 
-    # Function to extract Markdown links
     def extract_markdown_links(self, markdown_text):
-        pattern = r'\[([^\]]+)\]\(([^)]+)\)'
+        """ Extract Markdown links from the given markdown text """
+        pattern = r'\[([^\]]+)\]\(([^)]+?)(?:\s+"[^"]*")?\)'
         links = re.findall(pattern, markdown_text)
         urls = [link[1] for link in links]
         return urls
 
-    # Function to check if a URL is internal to the initial domain
-    def is_internal_link(self, url, base_domain):
+    def is_internal_link(self, url: str, base_domain: str):
+        """ Check if a URL is internal to the initial domain """
+        if (url == '\\' or url.startswith("mailto")):
+            return False
         parsed_url = urlparse(url)
         return parsed_url.netloc == '' or parsed_url.netloc == base_domain
 
-    # Function to split markdown content into chunks of max 5000 characters at natural breakpoints
     def split_markdown_content(self, markdown_text, max_length=5000):
+        """ Split markdown content into chunks of max 5000 characters at natural breakpoints """
         paragraphs = markdown_text.split('\n\n')  # Split by paragraphs
         chunks = []
         current_chunk = ''
@@ -104,71 +114,102 @@ class RedisVectorStorage:
 
         return chunks
 
-    # Function to store chunks and their embeddings in Redis
     def store_chunks_in_redis(self, url, chunks):
+        """ Store chunks and their embeddings in Redis """
+        parsed_url = urlparse(url)
+        trimmed_url = parsed_url.netloc + parsed_url.path  # Remove scheme (http:// or https://)
+
         data = []
         for i, chunk in enumerate(chunks):
            embedding = self.vectorizer.embed(chunk, input_type="search_document", as_buffer=True)
            data.append({
-                "id": f"{url}::chunk::{i+1}",
+                "id": f"{trimmed_url}::chunk::{i+1}",
                 "content": chunk,
-                "content_embedding": embedding
+                "content_embedding": embedding,
+                "source_url": trimmed_url
             })
         self.index.load(data)
         print(f"Stored {len(chunks)} chunks for URL {url} in Redis.")
 
-    # Function to recursively crawl a URL and its Markdown links
-    def crawl_recursive(self, url, base_domain, visited=None):
-        if visited is None:
-            visited = set()
-
-        if url in visited:
-            return
-        visited.add(url)
-
-        # Check if the URL has already been processed
-        if self.index.exists(f"{url}::chunk::1"):
-            print(f"URL {url} has already been processed. Skipping.")
-            return
-
-        print(f"Crawling URL: {url}")
-
-        params = {
-            'pageOptions': {
-                'onlyMainContent': False,
-                'fetchPageContent': True,
-                'includeHTML': True,
+    def crawl_iterative(self, start_url, base_domain):
+        """ Iteratively crawl a URL and its Markdown links """
+        visited = set()
+        stack = [start_url]
+
+        while stack:
+            url = stack.pop()
+            if url in visited:
+                continue
+
+            parsed_url = urlparse(url)
+            trimmed_url = parsed_url.netloc + parsed_url.path  # Remove scheme (http:// or https://)
+
+            # Check if the URL has already been processed
+            t = Tag("id") == f"{trimmed_url}::chunk::1"  # Use the original URL format
+
+            # Use a simple filter query instead of a vector query
+            filter_query = FilterQuery(filter_expression=t)
+            results = self.index.query(filter_query)
+            if results:
+                print(f"URL {url} has already been processed. Skipping.")
+                visited.add(url)
+                continue
+
+            print(f"Crawling URL: {url}")
+
+            params = {
+                'pageOptions': {
+                    'onlyMainContent': False,
+                    'fetchPageContent': True,
+                    'includeHTML': False,
+                }
             }
-        }
 
-        crawl_result = self.app.crawl_url(url, params=params, wait_until_done=True)
-
-        for result in crawl_result:
-            print("Content:\n\n")
-            markdown_content = result["markdown"]
-
-            # Split markdown content into natural chunks
-            chunks = self.split_markdown_content(markdown_content)
-
-            # Store the chunks and their embeddings in Redis
-            self.store_chunks_in_redis(url, chunks)
-
-            links = self.extract_markdown_links(markdown_content)
-            print("Extracted Links:", links)
+            crawl_result = []
+            if self.is_internal_link(url, base_domain) and not url in visited:
+                crawl_result.append(self.app.scrape_url(url, params=params))
+                visited.add(url)
+
+            for result in crawl_result:
+                markdown_content = result["markdown"]
+                result_url = result["metadata"]["sourceURL"]
+                print("Markdown sourceURL: " + result_url)
+                # print("Content:\n\n")
+                # print(markdown_content)
+                # print("\n\n")
+                # Split markdown content into natural chunks
+                chunks = self.split_markdown_content(markdown_content)
+
+                # Store the chunks and their embeddings in Redis
+                self.store_chunks_in_redis(result_url, chunks)
+
+                links = self.extract_markdown_links(markdown_content)
+                print("Extracted Links:", links)
+                # print("Extracted Links:", links)
+
+                for link in links:
+                    absolute_link = urljoin(result_url, link)
+                    if self.is_internal_link(absolute_link, base_domain):
+                        if absolute_link not in visited:
+                            stack.append(absolute_link)
+                            print("Appended link: " + absolute_link)
+                    else:
+                        visited.add(absolute_link)
 
-        for link in links:
-            absolute_link = urljoin(url, link)
-            if self.is_internal_link(absolute_link, base_domain):
-                self.crawl_recursive(absolute_link, base_domain, visited)
+    def crawl(self, crawl_url: str):
+        """ Start the iterative crawling from the initial URL """
+        base_domain = urlparse(crawl_url).netloc
+        self.crawl_iterative(crawl_url, base_domain)
 
-    # Function to embed a string and perform a Redis vector database query
     def embed(self, query: str, num_results: int=3):
+        """ Embed a string and perform a Redis vector database query """
         query_embedding = self.vectorizer.embed(query)
 
         vector_query = VectorQuery(
             vector=query_embedding,
             vector_field_name="content_embedding",
             num_results=num_results,
-            return_fields=["id", "content"],
+            return_fields=["id", "content", "source_url"],
             return_score=True
         )
 
@@ -176,11 +217,11 @@ class RedisVectorStorage:
         results = self.index.query(vector_query)
         return results
 
-    def crawl(self, crawl_url: str):
-        # Start the recursive crawling from the initial URL
-        base_domain = urlparse(crawl_url).netloc
-        self.crawl_recursive(crawl_url, base_domain)
-
 if __name__ == "__main__":
-    storage = RedisVectorStorage()
+    storage = RedisVectorStorage(overwrite=False)
     storage.crawl("https://docs.swarms.world/en/latest/")
+    responses = storage.embed("What is Swarms, and how do I install swarms?", 5)
+    for response in responses:
+        encoded_id = response['id']  # Access the 'id' field directly
+        source_url = response['source_url']
+        print(f"Decoded ID: {encoded_id}, Source URL: {source_url}")
diff --git a/playground/demos/chatbot/start-chatbot.sh b/playground/demos/chatbot/start-chatbot.sh
new file mode 100644
index 00000000..4fbd2763
--- /dev/null
+++ b/playground/demos/chatbot/start-chatbot.sh
@@ -0,0 +1,6 @@
+sudo apt-get install redis-stack-server
+sudo systemctl enable redis-stack-server
+sudo systemctl start redis-stack-server
+cd ~/firecrawl/apps/api
+pnpm run workers
+pnpm run start
\ No newline at end of file
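
The core of this change is the tightened pattern in `extract_markdown_links`: the old regex folded optional Markdown link titles (`[text](url "title")`) into the captured URL, which then polluted `urljoin` and the crawl stack. A quick standalone check of the difference, separate from the patch itself:

```python
# Standalone comparison of the old vs. new link-extraction patterns (not part of the patch).
import re

OLD_PATTERN = r'\[([^\]]+)\]\(([^)]+)\)'
NEW_PATTERN = r'\[([^\]]+)\]\(([^)]+?)(?:\s+"[^"]*")?\)'

markdown = (
    '[Swarms docs](https://docs.swarms.world/en/latest/ "Swarms documentation") and '
    '[vector storage](./vector_storage.py)'
)

# Old pattern keeps the quoted title as part of the URL:
print([m[1] for m in re.findall(OLD_PATTERN, markdown)])
# ['https://docs.swarms.world/en/latest/ "Swarms documentation"', './vector_storage.py']

# New pattern returns just the URLs, so urljoin() and the crawl stack get clean links:
print([m[1] for m in re.findall(NEW_PATTERN, markdown)])
# ['https://docs.swarms.world/en/latest/', './vector_storage.py']
```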
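
The crawler also checks Redis before re-scraping a page, keying off the scheme-stripped URL stored in the `id` field. A minimal sketch of reusing that check outside the class, assuming Redis Stack is running and the index above has been created; `already_indexed` is an illustrative helper name, not part of the patch:

```python
# Minimal sketch: reuse the patch's FilterQuery existence check outside the class.
# Assumes Redis Stack is running and the index defined above already exists.
from urllib.parse import urlparse

from redisvl.query import FilterQuery
from redisvl.query.filter import Tag

from vector_storage import RedisVectorStorage


def already_indexed(storage: RedisVectorStorage, url: str) -> bool:
    """True if the first chunk of this page is already stored in the index."""
    parsed = urlparse(url)
    trimmed = parsed.netloc + parsed.path  # same scheme-stripped key format the patch stores
    tag = Tag("id") == f"{trimmed}::chunk::1"
    return bool(storage.index.query(FilterQuery(filter_expression=tag)))


if __name__ == "__main__":
    storage = RedisVectorStorage(overwrite=False)
    print(already_indexed(storage, "https://docs.swarms.world/en/latest/"))
```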
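
Because every chunk now carries `source_url`, the chatbot server can cite where retrieved context came from. A small usage sketch built only on the fields `embed()` returns; `build_context` is an illustrative helper, not part of the patch:

```python
# Illustrative sketch: turn query hits into a context block with source citations.
# Assumes vector_storage.py from this patch is importable and Redis Stack is running.
from vector_storage import RedisVectorStorage


def build_context(hits) -> str:
    """Join retrieved chunks and cite the page each one came from."""
    sections = [f"{hit['content']}\n(Source: {hit['source_url']})" for hit in hits]
    return "\n\n".join(sections)


if __name__ == "__main__":
    storage = RedisVectorStorage(overwrite=False)
    hits = storage.embed("How do I install swarms?", num_results=3)
    print(build_context(hits))
```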