feat: refactor download and upload scripts for improved argument handling (more notebook friendly :D)

main
thinhlpg 1 month ago
parent fa3c0562fe
commit da60b52bd1

@ -1,37 +1,63 @@
"""Download model from HuggingFace Hub. """Download model from HuggingFace Hub.
This script downloads a model repository from HuggingFace Hub to local directory. This script downloads a model repository from HuggingFace Hub to local directory.
Example:
python download_checkpoint.py --repo-id "org/model-name" --local-dir "models"
""" """
import argparse
import os import os
from dotenv import load_dotenv from dotenv import load_dotenv
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
load_dotenv(override=True)
def parse_args() -> argparse.Namespace:
# Configuration """Parse command line arguments.
REPO_ID = "janhq/250403-runpod-qwen7b-r1-distil"
LOCAL_DIR = "downloaded_model" # Where to save the model Returns:
HF_TOKEN = os.getenv("HF_TOKEN") argparse.Namespace: Parsed arguments
"""
# Files to ignore during download parser = argparse.ArgumentParser(description="Download model from HuggingFace Hub")
IGNORE_PATTERNS = [ parser.add_argument(
"*.log", # Log files "--repo-id", type=str, default="janhq/250403-llama-3.2-3b-instruct-grpo", help="HuggingFace repository ID"
"*.pyc", # Python cache )
".git*", # Git files parser.add_argument("--local-dir", type=str, default="downloaded_model", help="Local directory to save model")
"*.bin", # Binary files
"*.pt", # PyTorch checkpoints return parser.parse_args()
"*.ckpt", # Checkpoints
"events.*", # Tensorboard
"wandb/*", # Weights & Biases def main():
"runs/*", # Training runs """Main function to download model."""
] args = parse_args()
load_dotenv(override=True)
# Download the model
snapshot_download( # Configuration
token=HF_TOKEN, HF_TOKEN = os.getenv("HF_TOKEN")
repo_id=REPO_ID,
local_dir=LOCAL_DIR, # Files to ignore during download
# ignore_patterns=IGNORE_PATTERNS, IGNORE_PATTERNS = [
) "*.log", # Log files
print(f"✅ Done: {REPO_ID} -> {LOCAL_DIR}") "*.pyc", # Python cache
".git*", # Git files
"*.bin", # Binary files
"*.pt", # PyTorch checkpoints
"*.ckpt", # Checkpoints
"events.*", # Tensorboard
"wandb/*", # Weights & Biases
"runs/*", # Training runs
]
# Download the model
snapshot_download(
token=HF_TOKEN,
repo_id=args.repo_id,
local_dir=args.local_dir,
repo_type="model",
# ignore_patterns=IGNORE_PATTERNS
)
print(f"✅ Done: {args.repo_id} -> {args.local_dir}")
if __name__ == "__main__":
main()

@ -1,39 +1,61 @@
"""Upload local directory to HuggingFace Hub. """Upload local directory to HuggingFace Hub.
This script uploads a specified local directory to HuggingFace Hub as a private repository. This script uploads a specified local directory to HuggingFace Hub as a private repository.
It uses an API token from HuggingFace for authentication.
Example:
python upload_checkpoint.py --local-dir "models/my_model" --repo-id "org/model-name"
""" """
import argparse
import os import os
from dotenv import load_dotenv from dotenv import load_dotenv
from huggingface_hub import HfApi from huggingface_hub import HfApi
load_dotenv(override=True)
def parse_args() -> argparse.Namespace:
# Configuration """Parse command line arguments.
LOCAL_DIR = "trainer_output_deepseek-ai_DeepSeek-R1-Distill-Qwen-7B_gpu0_20250403_050520"
REPO_ID = "janhq/250403-runpod-qwen7b-r1-distil" Returns:
HF_TOKEN = os.getenv("HF_TOKEN") argparse.Namespace: Parsed arguments
"""
# Files to ignore during upload parser = argparse.ArgumentParser(description="Upload model to HuggingFace Hub")
IGNORE_PATTERNS = [ parser.add_argument("--local-dir", type=str, required=True, help="Local directory to upload")
"*.log", # Log files parser.add_argument("--repo-id", type=str, required=True, help="HuggingFace repository ID")
"*.pyc", # Python cache parser.add_argument("--public", action="store_true", help="Make repository public (default: private)")
".git*", # Git files return parser.parse_args()
"*.bin", # Binary files
"*.pt", # PyTorch checkpoints
"*.ckpt", # Checkpoints def main():
"events.*", # Tensorboard """Main function to upload model."""
"wandb/*", # Weights & Biases args = parse_args()
"runs/*", # Training runs load_dotenv(override=True)
]
# Configuration
api = HfApi(token=HF_TOKEN) HF_TOKEN = os.getenv("HF_TOKEN")
api.create_repo(repo_id=REPO_ID, private=True, exist_ok=True, repo_type="model")
api.upload_folder( # Files to ignore during upload
folder_path=LOCAL_DIR, IGNORE_PATTERNS = [
repo_id=REPO_ID, "*.log", # Log files
repo_type="model", "*.pyc", # Python cache
# ignore_patterns=IGNORE_PATTERNS, ".git*", # Git files
) "*.bin", # Binary files
print(f"✅ Done: {LOCAL_DIR} -> {REPO_ID}") "*.pt", # PyTorch checkpoints
"*.ckpt", # Checkpoints
"events.*", # Tensorboard
"wandb/*", # Weights & Biases
"runs/*", # Training runs
]
api = HfApi(token=HF_TOKEN)
api.create_repo(repo_id=args.repo_id, private=not args.public, exist_ok=True, repo_type="model")
api.upload_folder(
folder_path=args.local_dir,
repo_id=args.repo_id,
repo_type="model",
# ignore_patterns=IGNORE_PATTERNS
)
print(f"✅ Done: {args.local_dir} -> {args.repo_id}")
if __name__ == "__main__":
main()

Loading…
Cancel
Save