You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

44 lines
1.1 KiB

"""Download and extract FlashRAG index."""
import os
import zipfile
import requests
from tqdm import tqdm
from config import DATA_DIR
# Constants
# URL: remote archive containing the prebuilt index.
# ZIP_NAME: file name the archive is saved under inside DATA_DIR.
URL = "https://www.modelscope.cn/datasets/hhjinjiajie/FlashRAG_Dataset/resolve/master/retrieval_corpus/wiki18_100w_e5_index.zip"
ZIP_NAME = "wiki18_100w_e5_index.zip"
zip_path = DATA_DIR / ZIP_NAME

# Create the target directory up front; otherwise open() below fails with
# FileNotFoundError when DATA_DIR does not exist yet.
os.makedirs(DATA_DIR, exist_ok=True)

# Download with progress bar
print("📥 Downloading index...")
# The response is used as a context manager so the streamed connection is
# released even if writing fails. The (connect, read) timeout prevents the
# script from hanging forever on a stalled server.
with requests.get(URL, stream=True, timeout=(10, 60)) as response:
    # Fail here on 4xx/5xx instead of saving an error page as the "zip" and
    # crashing later with a misleading BadZipFile.
    response.raise_for_status()
    total_size = int(response.headers.get("content-length", 0))
    with (
        open(zip_path, "wb") as f,
        tqdm(
            desc=ZIP_NAME,
            # An unknown size (missing header -> 0) is passed as None so tqdm
            # shows a running byte count rather than a bar with total 0.
            total=total_size or None,
            unit="iB",
            unit_scale=True,
            unit_divisor=1024,
        ) as bar,
    ):
        # 1 MiB chunks: the archive is large, and 1 KiB chunks would mean
        # millions of loop iterations and progress-bar updates.
        for data in response.iter_content(chunk_size=1024 * 1024):
            size = f.write(data)
            bar.update(size)

# Extract
print("📦 Extracting index...")
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(DATA_DIR)

# Clean up zip
os.remove(zip_path)
print("✅ Download and extraction completed successfully!")
print(f"Index file is at: {DATA_DIR}/data00/jiajie_jin/flashrag_indexes/wiki_dpr_100w/e5_flat_inner.index")