# NOTE: The following lines are residue from the hosting UI (topic-picker help
# text and file-size metadata), not part of the script:
#   "You cannot select more than 25 topics. Topics must start with a letter or
#    number, can include dashes ('-') and can be up to 35 characters long."
#   "102 lines / 3.8 KiB"
import json
import sys
from collections import defaultdict # Use defaultdict for cleaner accumulation
from pathlib import Path
import pandas as pd
# Add project root to Python path if needed (adjust relative path as necessary)
# project_root = Path(__file__).resolve().parent.parent
# sys.path.append(str(project_root))
# from config import logger # Assuming you have a logger setup
def extract_unique_paragraphs(input_paths: list[str], output_csv_path: str) -> None:
    """Extract unique paragraphs from Musique JSONL files and save them to CSV.

    Reads each JSONL file (train/dev/test splits), collects every paragraph
    regardless of its ``is_supporting`` flag, combines title and text into a
    single content string, tracks which question IDs each paragraph appeared
    under, and writes the deduplicated result to ``output_csv_path``.

    Args:
        input_paths: Paths to the input JSONL files.
        output_csv_path: Path to save the output CSV file.
    """
    output_dir = Path(output_csv_path).parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # Paragraph content -> set of source question IDs that referenced it.
    paragraphs_data: defaultdict[str, set] = defaultdict(set)
    print("Starting paragraph extraction (including non-supporting)...")

    for file_path in input_paths:
        print(f"Processing file: {file_path}")
        try:
            with open(file_path, "r", encoding="utf-8") as infile:
                for line_num, line in enumerate(infile, 1):
                    try:
                        data = json.loads(line)
                        main_question_id = data.get("id")
                        if not main_question_id:
                            print(f"Warning: Missing 'id' in line {line_num} of {file_path}")
                            continue
                        for p in data.get("paragraphs", []):
                            title = p.get("title", "No Title")
                            text = p.get("paragraph_text", "")
                            content = f"{title}\n{text}".strip()
                            if not content:
                                continue  # Skip empty paragraphs
                            paragraphs_data[content].add(main_question_id)
                    except json.JSONDecodeError:
                        print(f"Warning: Skipping invalid JSON in line {line_num} of {file_path}")
                    except Exception as e:
                        # Best-effort per-line processing: log and keep going.
                        print(f"Warning: Error processing line {line_num} in {file_path}: {e}")
        except FileNotFoundError:
            print(f"Error: Input file not found: {file_path}")
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")

    print(f"Found {len(paragraphs_data)} unique paragraphs (supporting and non-supporting).")

    # Sort content before numbering so chunk IDs are deterministic across runs.
    output_list = [
        {
            "chunk_id": chunk_id,
            "content": content,
            # Metadata is serialized to a JSON string for CSV compatibility.
            "metadata": json.dumps(
                {"source_question_ids": sorted(paragraphs_data[content])}
            ),
        }
        for chunk_id, content in enumerate(sorted(paragraphs_data), 1)
    ]

    if not output_list:
        print("No paragraphs found to save.")
        return

    df = pd.DataFrame(output_list)
    try:
        df.to_csv(output_csv_path, index=False)
        print(f"Successfully saved unique paragraphs to {output_csv_path}")
    except Exception as e:
        print(f"Error saving CSV file: {e}")
if __name__ == "__main__":
    RAW_DIR = Path("data/raw")
    PROCESSED_DIR = Path("data/processed")

    # Deduplicate paragraphs across all three Musique answerable splits.
    input_files = [
        str(RAW_DIR / f"musique_ans_v1.0_{split}.jsonl")
        for split in ("train", "dev", "test")
    ]
    output_csv = str(PROCESSED_DIR / "paragraphs.csv")
    extract_unique_paragraphs(input_files, output_csv)