Add useGPU flag and update metadata for processed files

pull/570/head
Richard Anthony Hein 8 months ago
parent 5c46393ee1
commit 7e3ba13f0f

@@ -93,11 +93,13 @@ if missing_vars:
exit(1) exit(1)
useMetal = os.environ.get("USE_METAL", "False") == "True" useMetal = os.environ.get("USE_METAL", "False") == "True"
useGPU = os.environ.get("USE_GPU", "False") == "True"
print(f"Uploads={uploads}") print(f"Uploads={uploads}")
print(f"MODEL_DIR={model_dir}") print(f"MODEL_DIR={model_dir}")
print(f"MODEL_NAME={model_name}") print(f"MODEL_NAME={model_name}")
print(f"USE_METAL={useMetal}") print(f"USE_METAL={useMetal}")
print(f"USE_GPU={useGPU}")
print(f"OPENAI_API_KEY={openai_api_key}") print(f"OPENAI_API_KEY={openai_api_key}")
print(f"OPENAI_API_BASE={openai_api_base}") print(f"OPENAI_API_BASE={openai_api_base}")
@@ -145,7 +147,7 @@ if not os.path.exists(uploads):
os.makedirs(uploads) os.makedirs(uploads)
# Initialize the vector store # Initialize the vector store
vector_store = VectorStorage(directory=uploads) vector_store = VectorStorage(directory=uploads, useGPU=useGPU)
async def create_chain( async def create_chain(

@@ -16,11 +16,11 @@ from swarms.server.async_parent_document_retriever import AsyncParentDocumentRet
store_type = "local" # "redis" or "local" store_type = "local" # "redis" or "local"
class VectorStorage: class VectorStorage:
def __init__(self, directory): def __init__(self, directory, useGPU=False):
self.embeddings = HuggingFaceBgeEmbeddings( self.embeddings = HuggingFaceBgeEmbeddings(
cache_folder="./.embeddings", cache_folder="./.embeddings",
model_name="BAAI/bge-large-en", model_name="BAAI/bge-large-en",
model_kwargs={"device": "cuda"}, # Use GPU model_kwargs={"device": "cuda" if useGPU else "cpu"},
encode_kwargs={"normalize_embeddings": True}, encode_kwargs={"normalize_embeddings": True},
query_instruction="Represent this sentence for searching relevant passages: ", query_instruction="Represent this sentence for searching relevant passages: ",
) )
@@ -119,7 +119,7 @@ class VectorStorage:
for file in chunk_files: for file in chunk_files:
loader = UnstructuredMarkdownLoader( loader = UnstructuredMarkdownLoader(
file, file,
mode="elements", mode="single",
strategy="fast" strategy="fast"
) )
print(f"Loaded {file} in {subdir} ...") print(f"Loaded {file} in {subdir} ...")
@@ -131,22 +131,12 @@ class VectorStorage:
"processed_at": str(datetime.now()) "processed_at": str(datetime.now())
}) })
# Save metadata to the metadata.json file
with open(metadata_file, "w") as metadataFile:
json.dump(metadata, metadataFile, indent=4)
print(f"Loaded {len(documents)} documents for directory '{subdir}'.")
chunksEndTime = datetime.now()
print(
f"{max_files} markdown file chunks processing time: {chunksEndTime - chunksStartTime}"
)
print(f"Creating new collection for {self.directory}...") print(f"Creating new collection for {self.directory}...")
# Create or get the collection # Create or get the collection
collection = self.client.create_collection( collection = self.client.create_collection(
name=self.directory, name=self.directory,
get_or_create=True, get_or_create=True,
metadata=metadata, metadata={"processDate": metadata["processDate"]},
) )
# Reload vectorstore based on collection # Reload vectorstore based on collection
@@ -173,6 +163,16 @@ class VectorStorage:
documents = [] # clear documents list for next chunk documents = [] # clear documents list for next chunk
# Save metadata to the metadata.json file
with open(metadata_file, "w") as metadataFile:
json.dump(metadata, metadataFile, indent=4)
print(f"Loaded {len(documents)} documents for directory '{subdir}'.")
chunksEndTime = datetime.now()
print(
f"{max_files} markdown file chunks processing time: {chunksEndTime - chunksStartTime}"
)
subdir_end_time = datetime.now() subdir_end_time = datetime.now()
print(f"Subdir {subdir} processing end time: {subdir_end_time}") print(f"Subdir {subdir} processing end time: {subdir_end_time}")
print(f"Time taken: {subdir_end_time - subdir_start_time}") print(f"Time taken: {subdir_end_time - subdir_start_time}")

Loading…
Cancel
Save