import re
from urllib.parse import urljoin, urlparse

import redis
from firecrawl import FirecrawlApp
from redisvl.index import SearchIndex
from redisvl.query import VectorQuery
from redisvl.schema import IndexSchema
from redisvl.utils.vectorize import HFTextVectorizer


class RedisVectorStorage:
    """Provides vector storage database operations using Redis."""

    def __init__(self, context: str = "swarms", use_gpu=False):
        self.use_gpu = use_gpu
        self.context = context

        # Initialize the FirecrawlApp with your API key ("EMPTY" for a local instance)
        self.app = FirecrawlApp(api_key="EMPTY", api_url="http://localhost:3002")

        # Connect to the local Redis server
        self.redis_client = redis.Redis(host="localhost", port=6379, db=0)

        # Initialize the Hugging Face text vectorizer
        self.vectorizer = HFTextVectorizer()

        index_name = self.context
        schema = IndexSchema.from_dict(
            {
                "index": {"name": index_name, "prefix": "chunk"},
                "fields": [
                    {"name": "id", "type": "tag", "attrs": {"sortable": True}},
                    {"name": "content", "type": "text", "attrs": {"sortable": True}},
                    {
                        "name": "content_embedding",
                        "type": "vector",
                        "attrs": {
                            "dims": self.vectorizer.dims,
                            "distance_metric": "cosine",
                            "algorithm": "hnsw",
                            "datatype": "float32",
                        },
                    },
                ],
            }
        )

        self.schema = schema
        self.index = SearchIndex(self.schema, self.redis_client)
        self.index.create()

    # Function to extract Markdown links
    def extract_markdown_links(self, markdown_text):
        pattern = r"\[([^\]]+)\]\(([^)]+)\)"
        links = re.findall(pattern, markdown_text)
        urls = [link[1] for link in links]
        return urls

    # Function to check if a URL is internal to the initial domain
    def is_internal_link(self, url, base_domain):
        parsed_url = urlparse(url)
        return parsed_url.netloc == "" or parsed_url.netloc == base_domain

    # Function to split markdown content into chunks of at most max_length characters
    # at natural breakpoints (paragraphs first, then sentences)
    def split_markdown_content(self, markdown_text, max_length=5000):
        paragraphs = markdown_text.split("\n\n")  # Split by paragraphs
        chunks = []
        current_chunk = ""

        for paragraph in paragraphs:
            if len(current_chunk) + len(paragraph) > max_length:
                chunks.append(current_chunk)
                current_chunk = paragraph
            else:
                current_chunk += "\n\n" + paragraph

            # Oversized paragraphs are further split on sentence boundaries
            if len(paragraph) > max_length:
                sentences = re.split(r"(?<=[.!?])\s+", paragraph)
                for sentence in sentences:
                    if len(sentence) > max_length:
                        chunks.append(sentence[:max_length])
                        current_chunk = sentence[max_length:]
                    elif len(current_chunk) + len(sentence) > max_length:
                        chunks.append(current_chunk)
                        current_chunk = sentence
                    else:
                        current_chunk += " " + sentence

        if current_chunk:
            chunks.append(current_chunk)

        return chunks

    # Function to store chunks and their embeddings in Redis
    def store_chunks_in_redis(self, url, chunks):
        data = []
        for i, chunk in enumerate(chunks):
            embedding = self.vectorizer.embed(chunk, as_buffer=True)
            data.append(
                {
                    "id": f"{url}::chunk::{i + 1}",
                    "content": chunk,
                    "content_embedding": embedding,
                }
            )

        # Key each record by its "id" field so keys are deterministic
        # ("chunk:<id>" with the index prefix) and re-crawls can be detected
        self.index.load(data, id_field="id")
        print(f"Stored {len(chunks)} chunks for URL {url} in Redis.")

    # Function to recursively crawl a URL and its Markdown links
    def crawl_recursive(self, url, base_domain, visited=None):
        if visited is None:
            visited = set()

        if url in visited:
            return
        visited.add(url)

        # Check if the URL has already been processed (its first chunk is stored
        # under the "chunk:<url>::chunk::1" key used in store_chunks_in_redis)
        if self.redis_client.exists(f"chunk:{url}::chunk::1"):
            print(f"URL {url} has already been processed. Skipping.")
            return

        print(f"Crawling URL: {url}")
        params = {
            "pageOptions": {
                "onlyMainContent": False,
                "fetchPageContent": True,
                "includeHTML": True,
            }
        }
        crawl_result = self.app.crawl_url(url, params=params, wait_until_done=True)

        for result in crawl_result:
            print("Content:\n\n")
            markdown_content = result["markdown"]

            # Split markdown content into natural chunks
            chunks = self.split_markdown_content(markdown_content)

            # Store the chunks and their embeddings in Redis
            self.store_chunks_in_redis(url, chunks)

            links = self.extract_markdown_links(markdown_content)
            print("Extracted Links:", links)

            for link in links:
                absolute_link = urljoin(url, link)
                if self.is_internal_link(absolute_link, base_domain):
                    self.crawl_recursive(absolute_link, base_domain, visited)

    # Function to embed a query string and perform a Redis vector database search
    def embed(self, query: str, num_results: int = 3):
        query_embedding = self.vectorizer.embed(query)

        vector_query = VectorQuery(
            vector=query_embedding,
            vector_field_name="content_embedding",
            num_results=num_results,
            return_fields=["id", "content"],
            return_score=True,
        )

        # Run the raw Redis vector query and return the matching documents
        results = self.index.query(vector_query)
        return results

    def crawl(self, crawl_url: str):
        # Start the recursive crawling from the initial URL
        base_domain = urlparse(crawl_url).netloc
        self.crawl_recursive(crawl_url, base_domain)


if __name__ == "__main__":
    storage = RedisVectorStorage()
    storage.crawl("https://docs.swarms.world/en/latest/")