You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

118 lines
4.0 KiB

import json
import sys
from pathlib import Path
import pandas as pd
# Add project root to Python path for imports
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root))
# Assuming these are defined in your project structure
from config import DATA_DIR, logger # Adjust import as needed
from src.embeddings import CustomHuggingFaceEmbeddings
# Import FAISS after potentially adding to sys.path
try:
from langchain_community.vectorstores import FAISS
faiss_installed = True
except ImportError:
print("Warning: langchain_community or faiss not installed. Cannot check FAISS index.")
faiss_installed = False
def check_output_files(processed_dir: Path):
"""Prints head and tail of key processed files and FAISS index info.
Args:
processed_dir: The path to the 'data/processed' directory.
"""
print("--- Checking Processed Files ---")
# 1. Check paragraphs.csv
csv_path = processed_dir / "paragraphs.csv"
print(f"\n--- Checking {csv_path} ---")
try:
df = pd.read_csv(csv_path)
print("First 3 rows:")
print(df.head(3).to_string())
print("\nLast 3 rows:")
print(df.tail(3).to_string())
print(f"Total rows: {len(df)}")
except FileNotFoundError:
print(f"Error: {csv_path} not found.")
except Exception as e:
print(f"Error reading {csv_path}: {e}")
# 2. Check questions.jsonl
jsonl_path = processed_dir / "questions.jsonl"
print(f"\n--- Checking {jsonl_path} ---")
try:
with open(jsonl_path, "r", encoding="utf-8") as f:
lines = f.readlines()
num_lines = len(lines)
print(f"Total lines: {num_lines}")
if num_lines > 0:
print("\nFirst 3 lines (parsed JSON):")
for i in range(min(3, num_lines)):
try:
print(json.loads(lines[i].strip()))
except json.JSONDecodeError:
print(f" (Error parsing line {i + 1})")
if num_lines > 3:
print("\nLast 3 lines (parsed JSON):")
for i in range(max(0, num_lines - 3), num_lines):
try:
print(json.loads(lines[i].strip()))
except json.JSONDecodeError:
print(f" (Error parsing line {i + 1})")
elif num_lines > 0:
print("\n(Less than 6 lines total, showing all)")
except FileNotFoundError:
print(f"Error: {jsonl_path} not found.")
except Exception as e:
print(f"Error reading {jsonl_path}: {e}")
# 3. Check FAISS index
print(f"\n--- Checking FAISS Index in {processed_dir} ---")
if not faiss_installed:
print("Skipping FAISS check as required libraries are not installed.")
return
# FAISS loads from the directory containing index.faiss and index.pkl
index_dir = processed_dir
index_file = index_dir / "index.faiss"
pkl_file = index_dir / "index.pkl"
if not index_file.exists() or not pkl_file.exists():
print(f"Error: FAISS index files (index.faiss, index.pkl) not found in {index_dir}")
return
try:
print("Initializing embeddings model for loading index...")
embeddings = CustomHuggingFaceEmbeddings()
print("Loading FAISS index...")
# FAISS.load_local requires the folder_path and the embeddings object
vectorstore = FAISS.load_local(str(index_dir), embeddings, allow_dangerous_deserialization=True)
print("FAISS index loaded successfully.")
# Access the underlying FAISS index object to get the total number of vectors
print(f"Total vectors in index: {vectorstore.index.ntotal}")
except Exception as e:
print(f"Error loading or checking FAISS index from {index_dir}: {e}")
import traceback
traceback.print_exc()
print("\n--- Check Complete ---")
if __name__ == "__main__":
# Assuming the script is run from the project root or paths are relative
PROCESSED_PATH = Path("data/processed")
check_output_files(PROCESSED_PATH)