You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

136 lines
5.5 KiB

import json
import math # Import math for ceiling division
import sys
import traceback # Import traceback
from pathlib import Path
import pandas as pd
# Add project root to Python path if needed (adjust relative path as necessary)
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root))
from src.embeddings import CustomHuggingFaceEmbeddings
# Import FAISS after potentially adding to sys.path
try:
    from langchain_community.vectorstores import FAISS
except ImportError:
    # Nothing in this script works without the FAISS vector store, so fail fast.
    # The `langchain_community` module is distributed as `langchain-community`,
    # so that is the package the install hint has to name.
    print("Error: langchain_community or FAISS not installed. Please install with 'pip install langchain-community faiss-cpu'")
    sys.exit(1)
def _load_texts_and_metadatas(csv_path: str) -> "tuple[list, list] | None":
    """Read the paragraph CSV and return ``(texts, metadatas)``.

    Prints a diagnostic and returns ``None`` when the file is missing or
    unreadable, lacks the required columns, has no rows, or contains metadata
    that cannot be parsed as JSON.
    """
    print(f"Loading paragraphs from {csv_path}")
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: CSV file not found at {csv_path}. Please run the extraction script first.")
        return None
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    if "content" not in df.columns or "metadata" not in df.columns:
        print("Error: CSV file must contain 'content' and 'metadata' columns.")
        return None
    if df.empty:
        print("Warning: Input CSV file is empty. No index will be built.")
        return None

    # NOTE: astype(str) renders a missing content cell as the literal text
    # "nan"; such rows are still embedded rather than dropped.
    texts = df["content"].astype(str).tolist()
    try:
        metadatas = [json.loads(m) for m in df["metadata"].tolist()]
    except json.JSONDecodeError as e:
        print(f"Error parsing metadata JSON: {e}. Check the format in {csv_path}")
        traceback.print_exc()  # Print traceback for JSON errors
        return None
    except Exception as e:
        # e.g. a non-string cell (NaN, number) handed to json.loads
        print(f"Error processing metadata: {e}")
        traceback.print_exc()  # Print traceback for other metadata errors
        return None
    else:
        print(f"Prepared {len(texts)} texts and {len(metadatas)} metadatas.")

    # Defensive: both lists come from the same DataFrame, so they should match.
    if not texts or not metadatas or len(texts) != len(metadatas):
        print(f"Error: Mismatch or empty texts/metadatas. Texts: {len(texts)}, Metadatas: {len(metadatas)}")
        return None
    return texts, metadatas


def _embed_in_batches(texts, metadatas, embeddings, batch_size):
    """Build a FAISS vector store by feeding it ``batch_size`` texts at a time.

    The first batch creates the store via ``FAISS.from_texts``; every later
    batch is appended with ``add_texts``. Returns the store, or ``None`` (after
    printing the error and traceback) as soon as any batch fails.
    The caller must pass ``batch_size >= 1``.
    """
    total = len(texts)
    num_batches = math.ceil(total / batch_size)
    print(f"Processing {total} texts in {num_batches} batches of size {batch_size}...")

    vectorstore = None
    for batch_number, start_idx in enumerate(range(0, total, batch_size), start=1):
        end_idx = min(start_idx + batch_size, total)
        batch_texts = texts[start_idx:end_idx]
        batch_metadatas = metadatas[start_idx:end_idx]
        print(f" Processing batch {batch_number}/{num_batches} (indices {start_idx}-{end_idx - 1})...")
        try:
            if vectorstore is None:
                # Initialize the vector store with the first batch
                print(" Initializing FAISS index with first batch...")
                vectorstore = FAISS.from_texts(texts=batch_texts, embedding=embeddings, metadatas=batch_metadatas)
                print(" FAISS index initialized.")
            else:
                # Add subsequent batches to the existing store
                print(f" Adding batch {batch_number} to FAISS index...")
                vectorstore.add_texts(texts=batch_texts, metadatas=batch_metadatas)
                print(f" Batch {batch_number} added.")
        except Exception as e:
            print(f"Error processing batch {batch_number} (indices {start_idx}-{end_idx - 1}): {e}")
            traceback.print_exc()
            print("Stopping index creation due to error in batch processing.")
            return None  # Exit if any batch fails

    if vectorstore is None:
        print("Error: Failed to create or add any data to the vectorstore.")
    return vectorstore


def build_faiss_index_from_csv(csv_path: str, index_save_path: str, batch_size: int = 128) -> None:
    """Builds a FAISS index from a CSV containing paragraph content and metadata.

    Reads a CSV file, generates embeddings for the 'content' column in batches,
    and saves the FAISS index files (index.faiss, index.pkl) locally.

    Every failure (bad arguments, unreadable CSV, invalid metadata, embedding
    or save errors) is reported on stdout and the function returns without
    raising; no index is written unless all batches succeed.

    Args:
        csv_path: Path to the input CSV file (e.g., data/processed/paragraphs.csv).
            Must provide a 'content' column and a 'metadata' column holding
            one JSON document per row.
        index_save_path: Path to the directory where the index files should be saved.
            Created (including parents) if it does not exist.
        batch_size: Number of texts to process in each embedding batch.
            Must be at least 1.
    """
    # Reject a non-positive batch size up front: 0 would otherwise surface as
    # a ZeroDivisionError only after the embedding model has been loaded, and
    # a negative value would silently produce zero batches.
    if batch_size < 1:
        print(f"Error: batch_size must be a positive integer, got {batch_size}.")
        return

    documents = _load_texts_and_metadatas(csv_path)
    if documents is None:
        return
    texts, metadatas = documents

    print("Initializing embeddings model...")
    try:
        embeddings = CustomHuggingFaceEmbeddings()
    except Exception as e:
        print(f"Error initializing embeddings model: {e}")
        traceback.print_exc()
        return
    print("Embeddings model initialized successfully.")

    vectorstore = _embed_in_batches(texts, metadatas, embeddings, batch_size)
    if vectorstore is None:
        return

    # Save the completed index
    try:
        print(f"Attempting to save final FAISS index files to directory: {index_save_path}")
        # Ensure the target directory exists before saving
        Path(index_save_path).mkdir(parents=True, exist_ok=True)
        vectorstore.save_local(index_save_path)
        print(f"Successfully saved final FAISS index files (index.faiss, index.pkl) to: {index_save_path}")
    except Exception as e:
        print(f"Error during final vectorstore.save_local to {index_save_path}: {e}")
        traceback.print_exc()
if __name__ == "__main__":
    # Script entry point. The path below is relative, so it is resolved
    # against the directory the script is launched from.
    processed_dir = Path("data/processed")
    # The index is written straight into the processed-data directory:
    # FAISS save_local places index.faiss and index.pkl next to the input CSV.
    build_faiss_index_from_csv(
        str(processed_dir / "paragraphs.csv"),
        str(processed_dir),
        batch_size=128,
    )