updated vector_storage.py to parse URLs more efficiently for markdown links

pull/570/head
Richard Anthony Hein 8 months ago
parent 955f9439f9
commit f422450347

@@ -8,6 +8,25 @@
* Switching from vLLM to another host like Ollama requires commenting/uncommenting some code at this time, but will be dynamic later.
+* Locally hosted or cloud [vector storage](./vector_storage.py)
+
+## Prerequisites
+* Install [Redis Stack](https://redis.io/docs/latest/operate/oss_and_stack/install/install-stack/)
+* [Firecrawl](https://docs.firecrawl.dev/contributing)
+
+## Start Redis Stack and Firecrawl
+```bash
+sudo apt-get install redis-stack-server
+sudo systemctl enable redis-stack-server
+sudo systemctl start redis-stack-server
+cd ~/firecrawl/apps/api
+pnpm run workers
+pnpm run start
+```

## Running vLLM
Running vLLM in a docker container saves a lot of trouble. Use dockerRunVllm.sh to set up and start vLLM; the script lets you control vLLM with standard docker commands:
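dockerRunVllm.sh itself is not shown in this diff; as a rough sketch, a script like it typically wraps a single docker run call (the image tag, model, and port below are assumptions, not taken from the repository):

```bash
#!/usr/bin/env bash
# Hypothetical sketch of a dockerRunVllm.sh-style launcher; the real script may differ.
# Starting vLLM in a named container lets you manage it with plain docker commands,
# e.g. "docker stop vllm" and "docker start vllm".
docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    -p 8000:8000 \
    --name vllm \
    vllm/vllm-openai:latest \
    --model mistralai/Mistral-7B-Instruct-v0.2
```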

@@ -1,3 +1,4 @@
+""" Vector storage using Firecrawl and Redis """
import re
from urllib.parse import urlparse, urljoin
import redis
@@ -5,11 +6,12 @@ from firecrawl import FirecrawlApp
from redisvl.utils.vectorize import HFTextVectorizer
from redisvl.index import SearchIndex
from redisvl.schema import IndexSchema
-from redisvl.query import VectorQuery
+from redisvl.query.filter import Tag
+from redisvl.query import VectorQuery, FilterQuery

class RedisVectorStorage:
    """ Provides vector storage database operations using Redis """
-    def __init__(self, context: str="swarms", use_gpu=False):
+    def __init__(self, context: str="swarms", use_gpu=False, overwrite=False):
        self.use_gpu = use_gpu
        self.context = context
        # Initialize the FirecrawlApp with your API key
@@ -28,7 +30,6 @@ class RedisVectorStorage:
        schema = IndexSchema.from_dict({
            "index": {
                "name": index_name,
-                "prefix": "chunk"
            },
            "fields": [
                {
@@ -54,28 +55,37 @@ class RedisVectorStorage:
                        "algorithm": "hnsw",
                        "datatype": "float32"
                    }
+                },
+                {
+                    "name": "source_url",
+                    "type": "text",
+                    "attrs": {
+                        "sortable": True
+                    }
                }
            ]
        })
        self.schema = schema
        self.index = SearchIndex(self.schema, self.redis_client)
-        self.index.create()
+        self.index.create(overwrite=overwrite, drop=overwrite)

-    # Function to extract Markdown links
    def extract_markdown_links(self, markdown_text):
-        pattern = r'\[([^\]]+)\]\(([^)]+)\)'
+        """ Extract Markdown links from the given markdown text """
+        pattern = r'\[([^\]]+)\]\(([^)]+?)(?:\s+"[^"]*")?\)'
        links = re.findall(pattern, markdown_text)
        urls = [link[1] for link in links]
        return urls

-    # Function to check if a URL is internal to the initial domain
-    def is_internal_link(self, url, base_domain):
+    def is_internal_link(self, url: str, base_domain: str):
+        """ Check if a URL is internal to the initial domain """
+        if (url == '\\' or url.startswith("mailto")):
+            return False
        parsed_url = urlparse(url)
        return parsed_url.netloc == '' or parsed_url.netloc == base_domain

-    # Function to split markdown content into chunks of max 5000 characters at natural breakpoints
    def split_markdown_content(self, markdown_text, max_length=5000):
+        """ Split markdown content into chunks of max 5000 characters at natural breakpoints """
        paragraphs = markdown_text.split('\n\n')  # Split by paragraphs
        chunks = []
        current_chunk = ''
@@ -104,71 +114,102 @@ class RedisVectorStorage:
        return chunks

-    # Function to store chunks and their embeddings in Redis
    def store_chunks_in_redis(self, url, chunks):
+        """ Store chunks and their embeddings in Redis """
+        parsed_url = urlparse(url)
+        trimmed_url = parsed_url.netloc + parsed_url.path  # Remove scheme (http:// or https://)
        data = []
        for i, chunk in enumerate(chunks):
            embedding = self.vectorizer.embed(chunk, input_type="search_document", as_buffer=True)
            data.append({
-                "id": f"{url}::chunk::{i+1}",
+                "id": f"{trimmed_url}::chunk::{i+1}",
                "content": chunk,
-                "content_embedding": embedding
+                "content_embedding": embedding,
+                "source_url": trimmed_url
            })

        self.index.load(data)
        print(f"Stored {len(chunks)} chunks for URL {url} in Redis.")

-    # Function to recursively crawl a URL and its Markdown links
-    def crawl_recursive(self, url, base_domain, visited=None):
-        if visited is None:
-            visited = set()
-        if url in visited:
-            return
-        visited.add(url)
-
-        # Check if the URL has already been processed
-        if self.index.exists(f"{url}::chunk::1"):
-            print(f"URL {url} has already been processed. Skipping.")
-            return
-
-        print(f"Crawling URL: {url}")
-        params = {
-            'pageOptions': {
-                'onlyMainContent': False,
-                'fetchPageContent': True,
-                'includeHTML': True,
-            }
-        }
-
-        crawl_result = self.app.crawl_url(url, params=params, wait_until_done=True)
-
-        for result in crawl_result:
-            print("Content:\n\n")
-            markdown_content = result["markdown"]
-            # Split markdown content into natural chunks
-            chunks = self.split_markdown_content(markdown_content)
-            # Store the chunks and their embeddings in Redis
-            self.store_chunks_in_redis(url, chunks)
-
-            links = self.extract_markdown_links(markdown_content)
-            print("Extracted Links:", links)
-
-            for link in links:
-                absolute_link = urljoin(url, link)
-                if self.is_internal_link(absolute_link, base_domain):
-                    self.crawl_recursive(absolute_link, base_domain, visited)
+    def crawl_iterative(self, start_url, base_domain):
+        """ Iteratively crawl a URL and its Markdown links """
+        visited = set()
+        stack = [start_url]
+
+        while stack:
+            url = stack.pop()
+            if url in visited:
+                continue
+
+            parsed_url = urlparse(url)
+            trimmed_url = parsed_url.netloc + parsed_url.path  # Remove scheme (http:// or https://)
+
+            # Check if the URL has already been processed
+            t = Tag("id") == f"{trimmed_url}::chunk::1"  # Use the original URL format
+            # Use a simple filter query instead of a vector query
+            filter_query = FilterQuery(filter_expression=t)
+            results = self.index.query(filter_query)
+            if results:
+                print(f"URL {url} has already been processed. Skipping.")
+                visited.add(url)
+                continue
+
+            print(f"Crawling URL: {url}")
+            params = {
+                'pageOptions': {
+                    'onlyMainContent': False,
+                    'fetchPageContent': True,
+                    'includeHTML': False,
+                }
+            }
+
+            crawl_result = []
+            if self.is_internal_link(url, base_domain) and not url in visited:
+                crawl_result.append(self.app.scrape_url(url, params=params))
+                visited.add(url)
+
+            for result in crawl_result:
+                markdown_content = result["markdown"]
+                result_url = result["metadata"]["sourceURL"]
+                print("Markdown sourceURL: " + result_url)
+                # print("Content:\n\n")
+                # print(markdown_content)
+                # print("\n\n")
+
+                # Split markdown content into natural chunks
+                chunks = self.split_markdown_content(markdown_content)
+
+                # Store the chunks and their embeddings in Redis
+                self.store_chunks_in_redis(result_url, chunks)
+
+                links = self.extract_markdown_links(markdown_content)
+                print("Extracted Links:", links)
+
+                for link in links:
+                    absolute_link = urljoin(result_url, link)
+                    if self.is_internal_link(absolute_link, base_domain):
+                        if absolute_link not in visited:
+                            stack.append(absolute_link)
+                            print("Appended link: " + absolute_link)
+                    else:
+                        visited.add(absolute_link)
+
+    def crawl(self, crawl_url: str):
+        """ Start the iterative crawling from the initial URL """
+        base_domain = urlparse(crawl_url).netloc
+        self.crawl_iterative(crawl_url, base_domain)

-    # Function to embed a string and perform a Redis vector database query
    def embed(self, query: str, num_results: int=3):
+        """ Embed a string and perform a Redis vector database query """
        query_embedding = self.vectorizer.embed(query)
        vector_query = VectorQuery(
            vector=query_embedding,
            vector_field_name="content_embedding",
            num_results=num_results,
-            return_fields=["id", "content"],
+            return_fields=["id", "content", "source_url"],
            return_score=True
        )
@@ -176,11 +217,11 @@ class RedisVectorStorage:
        results = self.index.query(vector_query)
        return results

-    def crawl(self, crawl_url: str):
-        # Start the recursive crawling from the initial URL
-        base_domain = urlparse(crawl_url).netloc
-        self.crawl_recursive(crawl_url, base_domain)
-
if __name__ == "__main__":
-    storage = RedisVectorStorage()
+    storage = RedisVectorStorage(overwrite=False)
    storage.crawl("https://docs.swarms.world/en/latest/")
+    responses = storage.embed("What is Swarms, and how do I install swarms?", 5)
+    for response in responses:
+        encoded_id = response['id']  # Access the 'id' field directly
+        source_url = response['source_url']
+        print(f"Decoded ID: {encoded_id}, Source URL: {source_url}")

@ -0,0 +1,6 @@
sudo apt-get install redis-stack-server
sudo systemctl enable redis-stack-server
sudo systemctl start redis-stack-server
cd ~/firecrawl/apps/api
pnpm run workers
pnpm run start
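Assuming a default local install, these optional checks (not part of the script above) confirm Redis Stack is up before starting a crawl:

```bash
redis-cli ping          # should print PONG once redis-stack-server is running
redis-cli MODULE LIST   # should include the search module that redisvl relies on
```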