| 
						
						
							
								
							
						
						
					 | 
					 | 
					@ -16,11 +16,11 @@ from swarms.server.async_parent_document_retriever import AsyncParentDocumentRet
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					store_type = "local"  # "redis" or "local"
 | 
					 | 
					 | 
					 | 
					store_type = "local"  # "redis" or "local"
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					class VectorStorage:
 | 
					 | 
					 | 
					 | 
					class VectorStorage:
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    def __init__(self, directory):
 | 
					 | 
					 | 
					 | 
					    def __init__(self, directory, useGPU=False):
 | 
				
			
			
				
				
			
		
	
		
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        self.embeddings = HuggingFaceBgeEmbeddings(
 | 
					 | 
					 | 
					 | 
					        self.embeddings = HuggingFaceBgeEmbeddings(
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					            cache_folder="./.embeddings",
 | 
					 | 
					 | 
					 | 
					            cache_folder="./.embeddings",
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					            model_name="BAAI/bge-large-en",
 | 
					 | 
					 | 
					 | 
					            model_name="BAAI/bge-large-en",
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					            model_kwargs={"device": "cuda"},  # Use GPU
 | 
					 | 
					 | 
					 | 
					            model_kwargs={"device": "cuda" if useGPU else "cpu"},
 | 
				
			
			
				
				
			
		
	
		
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					            encode_kwargs={"normalize_embeddings": True},
 | 
					 | 
					 | 
					 | 
					            encode_kwargs={"normalize_embeddings": True},
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					            query_instruction="Represent this sentence for searching relevant passages: ",
 | 
					 | 
					 | 
					 | 
					            query_instruction="Represent this sentence for searching relevant passages: ",
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        )
 | 
					 | 
					 | 
					 | 
					        )
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
							
								
							
						
						
							
								
							
						
						
					 | 
					 | 
					@ -119,18 +119,50 @@ class VectorStorage:
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                for file in chunk_files:
 | 
					 | 
					 | 
					 | 
					                for file in chunk_files:
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    loader = UnstructuredMarkdownLoader(
 | 
					 | 
					 | 
					 | 
					                    loader = UnstructuredMarkdownLoader(
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                        file,
 | 
					 | 
					 | 
					 | 
					                        file,
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                        mode="elements",
 | 
					 | 
					 | 
					 | 
					                        mode="single",
 | 
				
			
			
				
				
			
		
	
		
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                        strategy="fast"
 | 
					 | 
					 | 
					 | 
					                        strategy="fast"
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    )
 | 
					 | 
					 | 
					 | 
					                    )
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    print(f"Loaded {file} in {subdir} ...")
 | 
					 | 
					 | 
					 | 
					                    print(f"Loaded {file} in {subdir} ...")
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    documents.extend(loader.load())
 | 
					 | 
					 | 
					 | 
					                    documents.extend(loader.load())
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    
 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
				
				
			
		
	
		
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    # Record the file as processed in metadata
 | 
					 | 
					 | 
					 | 
					                    # Record the file as processed in metadata
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    metadata["processed_files"].append({
 | 
					 | 
					 | 
					 | 
					                    metadata["processed_files"].append({
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                        "file": file,
 | 
					 | 
					 | 
					 | 
					                        "file": file,
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                        "processed_at": str(datetime.now())
 | 
					 | 
					 | 
					 | 
					                        "processed_at": str(datetime.now())
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    })
 | 
					 | 
					 | 
					 | 
					                    })
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    print(f"Creating new collection for {self.directory}...")
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    # Create or get the collection
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    collection = self.client.create_collection(
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                        name=self.directory,
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                        get_or_create=True,
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                        metadata={"processDate": metadata["processDate"]},
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    )
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    # Reload vectorstore based on collection
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    vectorstore = self.getVectorStore(collection_name=collection.name)
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    # Create a new parent document retriever
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    retriever = AsyncParentDocumentRetriever(
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                        docstore=self.store,
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                        vectorstore=vectorstore,
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                        child_splitter=self.child_splitter,
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                        parent_splitter=self.parent_splitter,
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    )
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    # Add documents to the collection and docstore
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    print(f"Adding {len(documents)} documents to collection...")
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    add_docs_start_time = datetime.now()
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    await retriever.aadd_documents(
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                        documents=documents, add_to_docstore=True
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    )
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    add_docs_end_time = datetime.now()
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    print(
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                        f"Adding {len(documents)} documents to collection took: {add_docs_end_time - add_docs_start_time}"
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    )
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					                    documents = []  # clear documents list for next chunk
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    # Save metadata to the metadata.json file
 | 
					 | 
					 | 
					 | 
					                    # Save metadata to the metadata.json file
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    with open(metadata_file, "w") as metadataFile:
 | 
					 | 
					 | 
					 | 
					                    with open(metadata_file, "w") as metadataFile:
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                        json.dump(metadata, metadataFile, indent=4)
 | 
					 | 
					 | 
					 | 
					                        json.dump(metadata, metadataFile, indent=4)
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
						
						
							
								
							
						
					 | 
					 | 
					@ -141,38 +173,6 @@ class VectorStorage:
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    f"{max_files} markdown file chunks processing time: {chunksEndTime - chunksStartTime}"
 | 
					 | 
					 | 
					 | 
					                    f"{max_files} markdown file chunks processing time: {chunksEndTime - chunksStartTime}"
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                )
 | 
					 | 
					 | 
					 | 
					                )
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                print(f"Creating new collection for {self.directory}...")
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                # Create or get the collection
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                collection = self.client.create_collection(
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    name=self.directory,
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    get_or_create=True,
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    metadata=metadata,
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                )
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                # Reload vectorstore based on collection
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                vectorstore = self.getVectorStore(collection_name=collection.name)
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                # Create a new parent document retriever
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                retriever = AsyncParentDocumentRetriever(
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    docstore=self.store,
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    vectorstore=vectorstore,
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    child_splitter=self.child_splitter,
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    parent_splitter=self.parent_splitter,
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                )
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                # Add documents to the collection and docstore
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                print(f"Adding {len(documents)} documents to collection...")
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                add_docs_start_time = datetime.now()
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                await retriever.aadd_documents(
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    documents=documents, add_to_docstore=True
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                )
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                add_docs_end_time = datetime.now()
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                print(
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                    f"Adding {len(documents)} documents to collection took: {add_docs_end_time - add_docs_start_time}"
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                )
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                documents = []  # clear documents list for next chunk
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					            subdir_end_time = datetime.now()
 | 
					 | 
					 | 
					 | 
					            subdir_end_time = datetime.now()
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					            print(f"Subdir {subdir} processing end time: {subdir_end_time}")
 | 
					 | 
					 | 
					 | 
					            print(f"Subdir {subdir} processing end time: {subdir_end_time}")
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					            print(f"Time taken: {subdir_end_time - subdir_start_time}")
 | 
					 | 
					 | 
					 | 
					            print(f"Time taken: {subdir_end_time - subdir_start_time}")
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
							
								
							
						
						
						
					 | 
					 | 
					
 
 |