From f422450347269fba1147e8fbda10add6458a5011 Mon Sep 17 00:00:00 2001
From: Richard Anthony Hein
Date: Sun, 25 Aug 2024 09:45:17 +0000
Subject: [PATCH] updated vector_storage.py to parse URLs more efficiently for markdown links

---
 playground/demos/chatbot/server/README.md  |  19 +++
 .../demos/chatbot/server/vector_storage.py | 159 +++++++++++-------
 playground/demos/chatbot/start-chatbot.sh  |   6 +
 3 files changed, 125 insertions(+), 59 deletions(-)
 create mode 100644 playground/demos/chatbot/start-chatbot.sh

diff --git a/playground/demos/chatbot/server/README.md b/playground/demos/chatbot/server/README.md
index a8e35840..987f15ed 100644
--- a/playground/demos/chatbot/server/README.md
+++ b/playground/demos/chatbot/server/README.md
@@ -8,6 +8,25 @@
 
 * Switching from vLLM to another host like Ollama requires commenting/uncommenting some code at this time, but will be dynamic later.
 
+* Locally hosted or cloud [vector storage](./vector_storage.py)
+
+## Prerequisites
+
+* Install [Redis Stack](https://redis.io/docs/latest/operate/oss_and_stack/install/install-stack/)
+
+* [Firecrawl](https://docs.firecrawl.dev/contributing)
+
+## Start Redis Stack and Firecrawl
+
+```bash
+    sudo apt-get install redis-stack-server
+    sudo systemctl enable redis-stack-server
+    sudo systemctl start redis-stack-server
+    cd ~/firecrawl/apps/api
+    pnpm run workers
+    pnpm run start
+```
+
 ## Running vLLM
 
 Running vLLM in a docker container saves a lot of trouble. Use dockerRunVllm.sh to set up and start vLLM. This command will allow you to control vLLM using docker commands:
diff --git a/playground/demos/chatbot/server/vector_storage.py b/playground/demos/chatbot/server/vector_storage.py
index f440a82d..a3977b64 100644
--- a/playground/demos/chatbot/server/vector_storage.py
+++ b/playground/demos/chatbot/server/vector_storage.py
@@ -1,3 +1,4 @@
+""" Vector storage using Firecrawl and Redis """
 import re
 from urllib.parse import urlparse, urljoin
 import redis
@@ -5,11 +6,12 @@ from firecrawl import FirecrawlApp
 from redisvl.utils.vectorize import HFTextVectorizer
 from redisvl.index import SearchIndex
 from redisvl.schema import IndexSchema
-from redisvl.query import VectorQuery
+from redisvl.query.filter import Tag
+from redisvl.query import VectorQuery, FilterQuery
 
 class RedisVectorStorage:
     """ Provides vector storage database operations using Redis """
-    def __init__(self, context: str="swarms", use_gpu=False):
+    def __init__(self, context: str="swarms", use_gpu=False, overwrite=False):
         self.use_gpu = use_gpu
         self.context = context
         # Initialize the FirecrawlApp with your API key
@@ -28,7 +30,6 @@ class RedisVectorStorage:
         schema = IndexSchema.from_dict({
             "index": {
                 "name": index_name,
-                "prefix": "chunk"
             },
             "fields": [
                 {
@@ -54,28 +55,37 @@ class RedisVectorStorage:
                         "algorithm": "hnsw",
                         "datatype": "float32"
                     }
+                },
+                {
+                    "name": "source_url",
+                    "type": "text",
+                    "attrs": {
+                        "sortable": True
+                    }
                 }
             ]
         })
 
         self.schema = schema
         self.index = SearchIndex(self.schema, self.redis_client)
-        self.index.create()
+        self.index.create(overwrite=overwrite, drop=overwrite)
 
-    # Function to extract Markdown links
     def extract_markdown_links(self, markdown_text):
-        pattern = r'\[([^\]]+)\]\(([^)]+)\)'
+        """ Extract Markdown links from the given markdown text """
+        pattern = r'\[([^\]]+)\]\(([^)]+?)(?:\s+"[^"]*")?\)'
         links = re.findall(pattern, markdown_text)
         urls = [link[1] for link in links]
         return urls
 
-    # Function to check if a URL is internal to the initial domain
-    def is_internal_link(self, url, base_domain):
+    def is_internal_link(self, url: str, base_domain: str):
+        """ Check if a URL is internal to the initial domain """
+        if (url == '\\' or url.startswith("mailto")):
+            return False
         parsed_url = urlparse(url)
         return parsed_url.netloc == '' or parsed_url.netloc == base_domain
 
-    # Function to split markdown content into chunks of max 5000 characters at natural breakpoints
     def split_markdown_content(self, markdown_text, max_length=5000):
+        """ Split markdown content into chunks of max 5000 characters at natural breakpoints """
         paragraphs = markdown_text.split('\n\n')  # Split by paragraphs
         chunks = []
         current_chunk = ''
@@ -104,71 +114,102 @@ class RedisVectorStorage:
 
         return chunks
 
-    # Function to store chunks and their embeddings in Redis
     def store_chunks_in_redis(self, url, chunks):
+        """ Store chunks and their embeddings in Redis """
+        parsed_url = urlparse(url)
+        trimmed_url = parsed_url.netloc + parsed_url.path  # Remove scheme (http:// or https://)
+
         data = []
         for i, chunk in enumerate(chunks):
            embedding = self.vectorizer.embed(chunk, input_type="search_document", as_buffer=True)
            data.append({
-                "id": f"{url}::chunk::{i+1}",
+                "id": f"{trimmed_url}::chunk::{i+1}",
                 "content": chunk,
-                "content_embedding": embedding
+                "content_embedding": embedding,
+                "source_url": trimmed_url
             })
         self.index.load(data)
         print(f"Stored {len(chunks)} chunks for URL {url} in Redis.")
 
-    # Function to recursively crawl a URL and its Markdown links
-    def crawl_recursive(self, url, base_domain, visited=None):
-        if visited is None:
-            visited = set()
-
-        if url in visited:
-            return
-        visited.add(url)
-
-        # Check if the URL has already been processed
-        if self.index.exists(f"{url}::chunk::1"):
-            print(f"URL {url} has already been processed. Skipping.")
-            return
-
-        print(f"Crawling URL: {url}")
-
-        params = {
-            'pageOptions': {
-                'onlyMainContent': False,
-                'fetchPageContent': True,
-                'includeHTML': True,
+    def crawl_iterative(self, start_url, base_domain):
+        """ Iteratively crawl a URL and its Markdown links """
+        visited = set()
+        stack = [start_url]
+
+        while stack:
+            url = stack.pop()
+            if url in visited:
+                continue
+
+            parsed_url = urlparse(url)
+            trimmed_url = parsed_url.netloc + parsed_url.path  # Remove scheme (http:// or https://)
+
+            # Check if the URL has already been processed
+            t = Tag("id") == f"{trimmed_url}::chunk::1"  # Use the original URL format
+
+            # Use a simple filter query instead of a vector query
+            filter_query = FilterQuery(filter_expression=t)
+            results = self.index.query(filter_query)
+            if results:
+                print(f"URL {url} has already been processed. Skipping.")
+                visited.add(url)
+                continue
+
+            print(f"Crawling URL: {url}")
+
+            params = {
+                'pageOptions': {
+                    'onlyMainContent': False,
+                    'fetchPageContent': True,
+                    'includeHTML': False,
+                }
             }
-        }
 
-        crawl_result = self.app.crawl_url(url, params=params, wait_until_done=True)
-
-        for result in crawl_result:
-            print("Content:\n\n")
-            markdown_content = result["markdown"]
-
-            # Split markdown content into natural chunks
-            chunks = self.split_markdown_content(markdown_content)
-
-            # Store the chunks and their embeddings in Redis
-            self.store_chunks_in_redis(url, chunks)
-
-            links = self.extract_markdown_links(markdown_content)
-            print("Extracted Links:", links)
+            crawl_result = []
+            if self.is_internal_link(url, base_domain) and not url in visited:
+                crawl_result.append(self.app.scrape_url(url, params=params))
+                visited.add(url)
+
+            for result in crawl_result:
+                markdown_content = result["markdown"]
+                result_url = result["metadata"]["sourceURL"]
+                print("Markdown sourceURL: " + result_url)
+                # print("Content:\n\n")
+                # print(markdown_content)
+                # print("\n\n")
+                # Split markdown content into natural chunks
+                chunks = self.split_markdown_content(markdown_content)
+
+                # Store the chunks and their embeddings in Redis
+                self.store_chunks_in_redis(result_url, chunks)
+
+                links = self.extract_markdown_links(markdown_content)
+                print("Extracted Links:", links)
+                # print("Extracted Links:", links)
+
+                for link in links:
+                    absolute_link = urljoin(result_url, link)
+                    if self.is_internal_link(absolute_link, base_domain):
+                        if absolute_link not in visited:
+                            stack.append(absolute_link)
+                            print("Appended link: " + absolute_link)
+                    else:
+                        visited.add(absolute_link)
 
-        for link in links:
-            absolute_link = urljoin(url, link)
-            if self.is_internal_link(absolute_link, base_domain):
-                self.crawl_recursive(absolute_link, base_domain, visited)
+    def crawl(self, crawl_url: str):
+        """ Start the iterative crawling from the initial URL """
+        base_domain = urlparse(crawl_url).netloc
+        self.crawl_iterative(crawl_url, base_domain)
 
-    # Function to embed a string and perform a Redis vector database query
     def embed(self, query: str, num_results: int=3):
+        """ Embed a string and perform a Redis vector database query """
         query_embedding = self.vectorizer.embed(query)
 
         vector_query = VectorQuery(
             vector=query_embedding,
             vector_field_name="content_embedding",
             num_results=num_results,
-            return_fields=["id", "content"],
+            return_fields=["id", "content", "source_url"],
             return_score=True
         )
 
@@ -176,11 +217,11 @@ class RedisVectorStorage:
         results = self.index.query(vector_query)
         return results
 
-    def crawl(self, crawl_url: str):
-        # Start the recursive crawling from the initial URL
-        base_domain = urlparse(crawl_url).netloc
-        self.crawl_recursive(crawl_url, base_domain)
-
 if __name__ == "__main__":
-    storage = RedisVectorStorage()
+    storage = RedisVectorStorage(overwrite=False)
     storage.crawl("https://docs.swarms.world/en/latest/")
+    responses = storage.embed("What is Swarms, and how do I install swarms?", 5)
+    for response in responses:
+        encoded_id = response['id']  # Access the 'id' field directly
+        source_url = response['source_url']
+        print(f"Decoded ID: {encoded_id}, Source URL: {source_url}")
diff --git a/playground/demos/chatbot/start-chatbot.sh b/playground/demos/chatbot/start-chatbot.sh
new file mode 100644
index 00000000..4fbd2763
--- /dev/null
+++ b/playground/demos/chatbot/start-chatbot.sh
@@ -0,0 +1,6 @@
+sudo apt-get install redis-stack-server
+sudo systemctl enable redis-stack-server
+sudo systemctl start redis-stack-server
+cd ~/firecrawl/apps/api
+pnpm run workers
+pnpm run start
\ No newline at end of file
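
The core of this change is the tightened pattern in `extract_markdown_links`: the old regex folded optional Markdown link titles (`[text](url "title")`) into the captured URL, which then polluted `urljoin` and the crawl stack. A quick standalone check of the difference, separate from the patch itself:

```python
# Standalone comparison of the old vs. new link-extraction patterns (not part of the patch).
import re

OLD_PATTERN = r'\[([^\]]+)\]\(([^)]+)\)'
NEW_PATTERN = r'\[([^\]]+)\]\(([^)]+?)(?:\s+"[^"]*")?\)'

markdown = (
    '[Swarms docs](https://docs.swarms.world/en/latest/ "Swarms documentation") and '
    '[vector storage](./vector_storage.py)'
)

# Old pattern keeps the quoted title as part of the URL:
print([m[1] for m in re.findall(OLD_PATTERN, markdown)])
# ['https://docs.swarms.world/en/latest/ "Swarms documentation"', './vector_storage.py']

# New pattern returns just the URLs, so urljoin() and the crawl stack get clean links:
print([m[1] for m in re.findall(NEW_PATTERN, markdown)])
# ['https://docs.swarms.world/en/latest/', './vector_storage.py']
```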
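
The crawler also checks Redis before re-scraping a page, keying off the scheme-stripped URL stored in the `id` field. A minimal sketch of reusing that check outside the class, assuming Redis Stack is running and the index above has been created; `already_indexed` is an illustrative helper name, not part of the patch:

```python
# Minimal sketch: reuse the patch's FilterQuery existence check outside the class.
# Assumes Redis Stack is running and the index defined above already exists.
from urllib.parse import urlparse

from redisvl.query import FilterQuery
from redisvl.query.filter import Tag

from vector_storage import RedisVectorStorage


def already_indexed(storage: RedisVectorStorage, url: str) -> bool:
    """True if the first chunk of this page is already stored in the index."""
    parsed = urlparse(url)
    trimmed = parsed.netloc + parsed.path  # same scheme-stripped key format the patch stores
    tag = Tag("id") == f"{trimmed}::chunk::1"
    return bool(storage.index.query(FilterQuery(filter_expression=tag)))


if __name__ == "__main__":
    storage = RedisVectorStorage(overwrite=False)
    print(already_indexed(storage, "https://docs.swarms.world/en/latest/"))
```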
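
Because every chunk now carries `source_url`, the chatbot server can cite where retrieved context came from. A small usage sketch built only on the fields `embed()` returns; `build_context` is an illustrative helper, not part of the patch:

```python
# Illustrative sketch: turn query hits into a context block with source citations.
# Assumes vector_storage.py from this patch is importable and Redis Stack is running.
from vector_storage import RedisVectorStorage


def build_context(hits) -> str:
    """Join retrieved chunks and cite the page each one came from."""
    sections = [f"{hit['content']}\n(Source: {hit['source_url']})" for hit in hits]
    return "\n\n".join(sections)


if __name__ == "__main__":
    storage = RedisVectorStorage(overwrite=False)
    hits = storage.embed("How do I install swarms?", num_results=3)
    print(build_context(hits))
```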