updated vector_storage.py to parse URLs more efficiently for markdown links

pull/570/head
Richard Anthony Hein 8 months ago
parent 955f9439f9
commit f422450347

@@ -8,6 +8,25 @@
* Switching from vLLM to another host like Ollama requires commenting/uncommenting some code at this time, but will be dynamic later.
+* Locally hosted or cloud [vector storage](./vector_storage.py)
+
+## Prerequisites
+* Install [Redis Stack](https://redis.io/docs/latest/operate/oss_and_stack/install/install-stack/)
+* [Firecrawl](https://docs.firecrawl.dev/contributing)
+
+## Start Redis Stack and Firecrawl
+```bash
+sudo apt-get install redis-stack-server
+sudo systemctl enable redis-stack-server
+sudo systemctl start redis-stack-server
+cd ~/firecrawl/apps/api
+pnpm run workers
+pnpm run start
+```

## Running vLLM
Running vLLM in a docker container saves a lot of trouble. Use dockerRunVllm.sh to set up and start vLLM; the script lets you control vLLM with standard docker commands:
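dockerRunVllm.sh itself is not shown in this diff; as a rough sketch, a script like it typically wraps a single docker run call (the image tag, model, and port below are assumptions, not taken from the repository):

```bash
#!/usr/bin/env bash
# Hypothetical sketch of a dockerRunVllm.sh-style launcher; the real script may differ.
# Starting vLLM in a named container lets you manage it with plain docker commands,
# e.g. "docker stop vllm" and "docker start vllm".
docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    -p 8000:8000 \
    --name vllm \
    vllm/vllm-openai:latest \
    --model mistralai/Mistral-7B-Instruct-v0.2
```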

@@ -1,3 +1,4 @@
+""" Vector storage using Firecrawl and Redis """
import re
from urllib.parse import urlparse, urljoin
import redis
@@ -5,11 +6,12 @@ from firecrawl import FirecrawlApp
from redisvl.utils.vectorize import HFTextVectorizer
from redisvl.index import SearchIndex
from redisvl.schema import IndexSchema
-from redisvl.query import VectorQuery
+from redisvl.query.filter import Tag
+from redisvl.query import VectorQuery, FilterQuery

class RedisVectorStorage:
    """ Provides vector storage database operations using Redis """
-    def __init__(self, context: str="swarms", use_gpu=False):
+    def __init__(self, context: str="swarms", use_gpu=False, overwrite=False):
        self.use_gpu = use_gpu
        self.context = context
        # Initialize the FirecrawlApp with your API key
@@ -28,7 +30,6 @@ class RedisVectorStorage:
        schema = IndexSchema.from_dict({
            "index": {
                "name": index_name,
-                "prefix": "chunk"
            },
            "fields": [
                {
@@ -54,28 +55,37 @@ class RedisVectorStorage:
                        "algorithm": "hnsw",
                        "datatype": "float32"
                    }
+                },
+                {
+                    "name": "source_url",
+                    "type": "text",
+                    "attrs": {
+                        "sortable": True
+                    }
                }
            ]
        })
        self.schema = schema
        self.index = SearchIndex(self.schema, self.redis_client)
-        self.index.create()
+        self.index.create(overwrite=overwrite, drop=overwrite)

-    # Function to extract Markdown links
    def extract_markdown_links(self, markdown_text):
-        pattern = r'\[([^\]]+)\]\(([^)]+)\)'
+        """ Extract Markdown links from the given markdown text """
+        pattern = r'\[([^\]]+)\]\(([^)]+?)(?:\s+"[^"]*")?\)'
        links = re.findall(pattern, markdown_text)
        urls = [link[1] for link in links]
        return urls

-    # Function to check if a URL is internal to the initial domain
-    def is_internal_link(self, url, base_domain):
+    def is_internal_link(self, url: str, base_domain: str):
+        """ Check if a URL is internal to the initial domain """
+        if (url == '\\' or url.startswith("mailto")):
+            return False
        parsed_url = urlparse(url)
        return parsed_url.netloc == '' or parsed_url.netloc == base_domain

-    # Function to split markdown content into chunks of max 5000 characters at natural breakpoints
    def split_markdown_content(self, markdown_text, max_length=5000):
+        """ Split markdown content into chunks of max 5000 characters at natural breakpoints """
        paragraphs = markdown_text.split('\n\n')  # Split by paragraphs
        chunks = []
        current_chunk = ''
@@ -104,71 +114,102 @@ class RedisVectorStorage:
        return chunks

-    # Function to store chunks and their embeddings in Redis
    def store_chunks_in_redis(self, url, chunks):
+        """ Store chunks and their embeddings in Redis """
+        parsed_url = urlparse(url)
+        trimmed_url = parsed_url.netloc + parsed_url.path  # Remove scheme (http:// or https://)
        data = []
        for i, chunk in enumerate(chunks):
            embedding = self.vectorizer.embed(chunk, input_type="search_document", as_buffer=True)
            data.append({
-                "id": f"{url}::chunk::{i+1}",
+                "id": f"{trimmed_url}::chunk::{i+1}",
                "content": chunk,
-                "content_embedding": embedding
+                "content_embedding": embedding,
+                "source_url": trimmed_url
            })

        self.index.load(data)
        print(f"Stored {len(chunks)} chunks for URL {url} in Redis.")

-    # Function to recursively crawl a URL and its Markdown links
-    def crawl_recursive(self, url, base_domain, visited=None):
-        if visited is None:
-            visited = set()
-        if url in visited:
-            return
-        visited.add(url)
-
-        # Check if the URL has already been processed
-        if self.index.exists(f"{url}::chunk::1"):
-            print(f"URL {url} has already been processed. Skipping.")
-            return
-
-        print(f"Crawling URL: {url}")
-        params = {
-            'pageOptions': {
-                'onlyMainContent': False,
-                'fetchPageContent': True,
-                'includeHTML': True,
-            }
-        }
-
-        crawl_result = self.app.crawl_url(url, params=params, wait_until_done=True)
-
-        for result in crawl_result:
-            print("Content:\n\n")
-            markdown_content = result["markdown"]
-            # Split markdown content into natural chunks
-            chunks = self.split_markdown_content(markdown_content)
-            # Store the chunks and their embeddings in Redis
-            self.store_chunks_in_redis(url, chunks)
-
-            links = self.extract_markdown_links(markdown_content)
-            print("Extracted Links:", links)
-
-            for link in links:
-                absolute_link = urljoin(url, link)
-                if self.is_internal_link(absolute_link, base_domain):
-                    self.crawl_recursive(absolute_link, base_domain, visited)
+    def crawl_iterative(self, start_url, base_domain):
+        """ Iteratively crawl a URL and its Markdown links """
+        visited = set()
+        stack = [start_url]
+
+        while stack:
+            url = stack.pop()
+            if url in visited:
+                continue
+
+            parsed_url = urlparse(url)
+            trimmed_url = parsed_url.netloc + parsed_url.path  # Remove scheme (http:// or https://)
+
+            # Check if the URL has already been processed
+            t = Tag("id") == f"{trimmed_url}::chunk::1"  # Use the original URL format
+            # Use a simple filter query instead of a vector query
+            filter_query = FilterQuery(filter_expression=t)
+            results = self.index.query(filter_query)
+            if results:
+                print(f"URL {url} has already been processed. Skipping.")
+                visited.add(url)
+                continue
+
+            print(f"Crawling URL: {url}")
+            params = {
+                'pageOptions': {
+                    'onlyMainContent': False,
+                    'fetchPageContent': True,
+                    'includeHTML': False,
+                }
+            }
+
+            crawl_result = []
+            if self.is_internal_link(url, base_domain) and not url in visited:
+                crawl_result.append(self.app.scrape_url(url, params=params))
+                visited.add(url)
+
+            for result in crawl_result:
+                markdown_content = result["markdown"]
+                result_url = result["metadata"]["sourceURL"]
+                print("Markdown sourceURL: " + result_url)
+                # print("Content:\n\n")
+                # print(markdown_content)
+                # print("\n\n")
+
+                # Split markdown content into natural chunks
+                chunks = self.split_markdown_content(markdown_content)
+
+                # Store the chunks and their embeddings in Redis
+                self.store_chunks_in_redis(result_url, chunks)
+
+                links = self.extract_markdown_links(markdown_content)
+                print("Extracted Links:", links)
+
+                for link in links:
+                    absolute_link = urljoin(result_url, link)
+                    if self.is_internal_link(absolute_link, base_domain):
+                        if absolute_link not in visited:
+                            stack.append(absolute_link)
+                            print("Appended link: " + absolute_link)
+                    else:
+                        visited.add(absolute_link)
+
+    def crawl(self, crawl_url: str):
+        """ Start the iterative crawling from the initial URL """
+        base_domain = urlparse(crawl_url).netloc
+        self.crawl_iterative(crawl_url, base_domain)

-    # Function to embed a string and perform a Redis vector database query
    def embed(self, query: str, num_results: int=3):
+        """ Embed a string and perform a Redis vector database query """
        query_embedding = self.vectorizer.embed(query)
        vector_query = VectorQuery(
            vector=query_embedding,
            vector_field_name="content_embedding",
            num_results=num_results,
-            return_fields=["id", "content"],
+            return_fields=["id", "content", "source_url"],
            return_score=True
        )
@@ -176,11 +217,11 @@ class RedisVectorStorage:
        results = self.index.query(vector_query)
        return results

-    def crawl(self, crawl_url: str):
-        # Start the recursive crawling from the initial URL
-        base_domain = urlparse(crawl_url).netloc
-        self.crawl_recursive(crawl_url, base_domain)
-
if __name__ == "__main__":
-    storage = RedisVectorStorage()
+    storage = RedisVectorStorage(overwrite=False)
    storage.crawl("https://docs.swarms.world/en/latest/")
+    responses = storage.embed("What is Swarms, and how do I install swarms?", 5)
+    for response in responses:
+        encoded_id = response['id']  # Access the 'id' field directly
+        source_url = response['source_url']
+        print(f"Decoded ID: {encoded_id}, Source URL: {source_url}")

@ -0,0 +1,6 @@
sudo apt-get install redis-stack-server
sudo systemctl enable redis-stack-server
sudo systemctl start redis-stack-server
cd ~/firecrawl/apps/api
pnpm run workers
pnpm run start
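Assuming a default local install, these optional checks (not part of the script above) confirm Redis Stack is up before starting a crawl:

```bash
redis-cli ping          # should print PONG once redis-stack-server is running
redis-cli MODULE LIST   # should include the search module that redisvl relies on
```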