updated submodule and added RedisVectorStorage class

pull/570/head
Richard Anthony Hein 8 months ago
parent 5a60bea7d4
commit 2fdabef9fb

.gitignore vendored

@@ -226,3 +226,4 @@ swarms/server/.chroma_db
 .embeddings
 .parent_documents
 docs/metadata.json
+dump.rdb

@@ -1 +1 @@
-Subproject commit 92887afa8dcb2910c98cdd0b1129618c2eec8311
+Subproject commit 4fabcfe279505c7ec1d279ef3d66ce63256a9a93

@@ -0,0 +1,186 @@
import re
from urllib.parse import urlparse, urljoin
import redis
from firecrawl import FirecrawlApp
from redisvl.utils.vectorize import HFTextVectorizer
from redisvl.index import SearchIndex
from redisvl.schema import IndexSchema
from redisvl.query import VectorQuery
class RedisVectorStorage:
""" Provides vector storage database operations using Redis """
def __init__(self, context: str="swarms", use_gpu=False):
self.use_gpu = use_gpu
self.context = context
# Initialize the FirecrawlApp with your API key
self.app = FirecrawlApp(
api_key="EMPTY",
api_url="http://localhost:3002") # EMPTY for localhost
# Connect to the local Redis server
self.redis_client = redis.Redis(host='localhost', port=6379, db=0)
        # Initialize the Hugging Face text vectorizer
        self.vectorizer = HFTextVectorizer()
index_name = self.context
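        # Schema for the index: a sortable "id" tag, the chunk text in "content",
        # and an HNSW vector field "content_embedding" compared by cosine distance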
schema = IndexSchema.from_dict({
"index": {
"name": index_name,
"prefix": "chunk"
},
"fields": [
{
"name": "id",
"type": "tag",
"attrs": {
"sortable": True
}
},
{
"name": "content",
"type": "text",
"attrs": {
"sortable": True
}
},
{
"name": "content_embedding",
"type": "vector",
"attrs": {
"dims": self.vectorizer.dims,
"distance_metric": "cosine",
"algorithm": "hnsw",
"datatype": "float32"
}
}
]
})
self.schema = schema
self.index = SearchIndex(self.schema, self.redis_client)
self.index.create()
# Function to extract Markdown links
def extract_markdown_links(self, markdown_text):
pattern = r'\[([^\]]+)\]\(([^)]+)\)'
links = re.findall(pattern, markdown_text)
urls = [link[1] for link in links]
return urls
# Function to check if a URL is internal to the initial domain
def is_internal_link(self, url, base_domain):
parsed_url = urlparse(url)
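        # An empty netloc means a relative URL, which we treat as internal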
return parsed_url.netloc == '' or parsed_url.netloc == base_domain
    # Function to split markdown content into chunks of max 5000 characters at natural breakpoints
    def split_markdown_content(self, markdown_text, max_length=5000):
        paragraphs = markdown_text.split('\n\n')  # Split by paragraphs
        chunks = []
        current_chunk = ''
        for paragraph in paragraphs:
            if len(paragraph) > max_length:
                # Oversized paragraph: flush the current chunk, then fall back to sentence splits
                if current_chunk:
                    chunks.append(current_chunk)
                    current_chunk = ''
                sentences = re.split(r'(?<=[.!?])\s+', paragraph)
                for sentence in sentences:
                    while len(sentence) > max_length:
                        # Hard-split sentences that exceed the limit on their own
                        chunks.append(sentence[:max_length])
                        sentence = sentence[max_length:]
                    if current_chunk and len(current_chunk) + len(sentence) + 1 > max_length:
                        chunks.append(current_chunk)
                        current_chunk = sentence
                    else:
                        current_chunk = f'{current_chunk} {sentence}'.strip()
            elif current_chunk and len(current_chunk) + len(paragraph) + 2 > max_length:
                chunks.append(current_chunk)
                current_chunk = paragraph
            else:
                current_chunk = f'{current_chunk}\n\n{paragraph}'.strip()
        if current_chunk:
            chunks.append(current_chunk)
        return chunks
    # Function to store chunks and their embeddings in Redis
    def store_chunks_in_redis(self, url, chunks):
        data = []
        for i, chunk in enumerate(chunks):
            # as_buffer=True returns the embedding as raw bytes for Redis storage
            embedding = self.vectorizer.embed(chunk, as_buffer=True)
            data.append({
                "id": f"{url}::chunk::{i+1}",
                "content": chunk,
                "content_embedding": embedding
            })
        # Load with id_field so keys are deterministic ("chunk:<id>") and the
        # already-processed check in crawl_recursive can find them
        self.index.load(data, id_field="id")
        print(f"Stored {len(chunks)} chunks for URL {url} in Redis.")
# Function to recursively crawl a URL and its Markdown links
def crawl_recursive(self, url, base_domain, visited=None):
if visited is None:
visited = set()
if url in visited:
return
visited.add(url)
        # Check if the URL has already been processed (keys carry the "chunk" prefix)
        if self.redis_client.exists(f"chunk:{url}::chunk::1"):
            print(f"URL {url} has already been processed. Skipping.")
            return
print(f"Crawling URL: {url}")
params = {
'pageOptions': {
'onlyMainContent': False,
'fetchPageContent': True,
'includeHTML': True,
}
}
crawl_result = self.app.crawl_url(url, params=params, wait_until_done=True)
for result in crawl_result:
print("Content:\n\n")
markdown_content = result["markdown"]
# Split markdown content into natural chunks
chunks = self.split_markdown_content(markdown_content)
# Store the chunks and their embeddings in Redis
self.store_chunks_in_redis(url, chunks)
links = self.extract_markdown_links(markdown_content)
print("Extracted Links:", links)
for link in links:
absolute_link = urljoin(url, link)
if self.is_internal_link(absolute_link, base_domain):
self.crawl_recursive(absolute_link, base_domain, visited)
# Function to embed a string and perform a Redis vector database query
    def embed(self, query: str, num_results: int = 3):
        query_embedding = self.vectorizer.embed(query)
vector_query = VectorQuery(
vector=query_embedding,
vector_field_name="content_embedding",
num_results=num_results,
return_fields=["id", "content"],
return_score=True
)
        # Run the KNN vector query against the index
        results = self.index.query(vector_query)
return results
def crawl(self, crawl_url: str):
# Start the recursive crawling from the initial URL
base_domain = urlparse(crawl_url).netloc
self.crawl_recursive(crawl_url, base_domain)
if __name__ == "__main__":
storage = RedisVectorStorage()
storage.crawl("https://docs.swarms.world/en/latest/")