# NOTE: The following lines are residue from the hosting UI (topic-picker help
# text and file-size metadata), not part of the script:
#   "You cannot select more than 25 topics. Topics must start with a letter or
#    number, can include dashes ('-') and can be up to 35 characters long."
#   "102 lines / 3.8 KiB"
import json
import sys
from collections import defaultdict # Use defaultdict for cleaner accumulation
from pathlib import Path
import pandas as pd
# Add project root to Python path if needed (adjust relative path as necessary)
# project_root = Path(__file__).resolve().parent.parent
# sys.path.append(str(project_root))
# from config import logger # Assuming you have a logger setup
def extract_unique_paragraphs(input_paths: list[str], output_csv_path: str) -> None:
    """Extract unique paragraphs from Musique JSONL files and save them to CSV.

    Reads each JSONL file (train/dev/test splits), collects every paragraph
    regardless of its ``is_supporting`` flag, combines title and text into a
    single content string, tracks which question IDs each paragraph appeared
    under, and writes the deduplicated result to ``output_csv_path``.

    Args:
        input_paths: Paths to the input JSONL files.
        output_csv_path: Path to save the output CSV file.
    """
    output_dir = Path(output_csv_path).parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # Paragraph content -> set of source question IDs that referenced it.
    paragraphs_data: defaultdict[str, set] = defaultdict(set)
    print("Starting paragraph extraction (including non-supporting)...")

    for file_path in input_paths:
        print(f"Processing file: {file_path}")
        try:
            with open(file_path, "r", encoding="utf-8") as infile:
                for line_num, line in enumerate(infile, 1):
                    try:
                        data = json.loads(line)
                        main_question_id = data.get("id")
                        if not main_question_id:
                            print(f"Warning: Missing 'id' in line {line_num} of {file_path}")
                            continue
                        for p in data.get("paragraphs", []):
                            title = p.get("title", "No Title")
                            text = p.get("paragraph_text", "")
                            content = f"{title}\n{text}".strip()
                            if not content:
                                continue  # Skip empty paragraphs
                            paragraphs_data[content].add(main_question_id)
                    except json.JSONDecodeError:
                        print(f"Warning: Skipping invalid JSON in line {line_num} of {file_path}")
                    except Exception as e:
                        # Best-effort per-line processing: log and keep going.
                        print(f"Warning: Error processing line {line_num} in {file_path}: {e}")
        except FileNotFoundError:
            print(f"Error: Input file not found: {file_path}")
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")

    print(f"Found {len(paragraphs_data)} unique paragraphs (supporting and non-supporting).")

    # Sort content before numbering so chunk IDs are deterministic across runs.
    output_list = [
        {
            "chunk_id": chunk_id,
            "content": content,
            # Metadata is serialized to a JSON string for CSV compatibility.
            "metadata": json.dumps(
                {"source_question_ids": sorted(paragraphs_data[content])}
            ),
        }
        for chunk_id, content in enumerate(sorted(paragraphs_data), 1)
    ]

    if not output_list:
        print("No paragraphs found to save.")
        return

    df = pd.DataFrame(output_list)
    try:
        df.to_csv(output_csv_path, index=False)
        print(f"Successfully saved unique paragraphs to {output_csv_path}")
    except Exception as e:
        print(f"Error saving CSV file: {e}")
if __name__ == "__main__":
    RAW_DIR = Path("data/raw")
    PROCESSED_DIR = Path("data/processed")

    # Deduplicate paragraphs across all three Musique answerable splits.
    input_files = [
        str(RAW_DIR / f"musique_ans_v1.0_{split}.jsonl")
        for split in ("train", "dev", "test")
    ]
    output_csv = str(PROCESSED_DIR / "paragraphs.csv")
    extract_unique_paragraphs(input_files, output_csv)