You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
69 lines
1.7 KiB
69 lines
1.7 KiB
import argparse
|
|
import os
|
|
import zipfile
|
|
|
|
from dotenv import load_dotenv
|
|
from huggingface_hub import snapshot_download
|
|
|
|
from config import DATA_DIR
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse command line arguments for the dataset download script.

    Returns:
        argparse.Namespace: Parsed arguments with ``repo_id`` and
            ``local_dir``, both as strings.
    """
    parser = argparse.ArgumentParser(description="Download FlashRAG datasets from HuggingFace Hub")
    parser.add_argument(
        "--repo-id",
        type=str,
        default="RUC-NLPIR/FlashRAG_datasets",
        help="HuggingFace repository ID",
    )
    parser.add_argument(
        "--local-dir",
        type=str,
        # argparse applies ``type`` only to string defaults, so convert the
        # path object explicitly; otherwise ``local_dir`` would be a path
        # object by default but a ``str`` when given on the command line.
        default=str(DATA_DIR / "flashrag_datasets"),
        help="Local directory to save the datasets",
    )

    return parser.parse_args()
|
|
|
|
|
|
def main():
    """Download the selected dataset files and unpack the wiki corpus archive.

    Reads the repository ID and target directory from the command line,
    takes the access token from the ``HF_TOKEN`` environment variable
    (after loading a ``.env`` file), downloads only the files matching the
    allow-list patterns, and extracts ``wiki18_100w.zip`` into the target
    directory.
    """
    cli_args = parse_args()
    # Must run before the token lookup so values from .env are visible.
    load_dotenv(override=True)

    hub_token = os.getenv("HF_TOKEN")
    # Glob patterns selecting which repository files are downloaded.
    wanted_patterns = ["*retrieval-corpus*", "*bamboogle*", "*nq*"]

    snapshot_download(
        repo_id=cli_args.repo_id,
        repo_type="dataset",
        local_dir=cli_args.local_dir,
        allow_patterns=wanted_patterns,
        token=hub_token,
    )

    # The archive lives under retrieval-corpus/ but is extracted into the
    # root of the local directory.
    print("Unzipping wiki18_100w.zip. Might take a while...")
    corpus_archive = os.path.join(cli_args.local_dir, "retrieval-corpus", "wiki18_100w.zip")
    with zipfile.ZipFile(corpus_archive) as archive:
        archive.extractall(cli_args.local_dir)

    print(f"✅ Done: {cli_args.repo_id} -> {cli_args.local_dir}")
|
|
|
|
|
|
# Run the download only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|