# Patch: turn scripts/download_checkpoint.py and scripts/upload_checkpoint.py
# into command-line tools instead of edit-the-constants scripts.
#
#   download_checkpoint.py: --repo-id, --local-dir, --ignore-patterns
#   upload_checkpoint.py:   --local-dir, --repo-id, --public, --ignore-patterns
#
# Review changes folded into this revision of the patch:
#   * the unused IGNORE_PATTERNS list and the commented-out `ignore_patterns`
#     argument are replaced by an opt-in --ignore-patterns option; its default
#     (None) applies no filtering, so the default transfer is unchanged;
#   * upload_checkpoint.py verifies that --local-dir is a directory before it
#     creates the remote repository;
#   * the function-local HF_TOKEN is renamed to hf_token;
#   * `index` header lines are omitted because the post-image blobs changed.
#
# NOTE(review): blank context lines at the top of each file were reconstructed
# from the original hunk sizes (37 and 39 pre-image lines) — confirm against
# the real pre-image before applying.
diff --git a/scripts/download_checkpoint.py b/scripts/download_checkpoint.py
--- a/scripts/download_checkpoint.py
+++ b/scripts/download_checkpoint.py
@@ -1,37 +1,55 @@
 """Download model from HuggingFace Hub.
 This script downloads a model repository from HuggingFace Hub to local directory.
+
+Example:
+    python download_checkpoint.py --repo-id "org/model-name" --local-dir "models"
 """
 
+import argparse
 import os
 
 from dotenv import load_dotenv
 from huggingface_hub import snapshot_download
 
-load_dotenv(override=True)
-
-# Configuration
-REPO_ID = "janhq/250403-runpod-qwen7b-r1-distil"
-LOCAL_DIR = "downloaded_model"  # Where to save the model
-HF_TOKEN = os.getenv("HF_TOKEN")
-
-# Files to ignore during download
-IGNORE_PATTERNS = [
-    "*.log",  # Log files
-    "*.pyc",  # Python cache
-    ".git*",  # Git files
-    "*.bin",  # Binary files
-    "*.pt",  # PyTorch checkpoints
-    "*.ckpt",  # Checkpoints
-    "events.*",  # Tensorboard
-    "wandb/*",  # Weights & Biases
-    "runs/*",  # Training runs
-]
-
-# Download the model
-snapshot_download(
-    token=HF_TOKEN,
-    repo_id=REPO_ID,
-    local_dir=LOCAL_DIR,
-    # ignore_patterns=IGNORE_PATTERNS,
-)
-print(f"✅ Done: {REPO_ID} -> {LOCAL_DIR}")
+
+def parse_args() -> argparse.Namespace:
+    """Parse command line arguments.
+
+    Returns:
+        argparse.Namespace: Parsed arguments (repo_id, local_dir, ignore_patterns)
+    """
+    parser = argparse.ArgumentParser(description="Download model from HuggingFace Hub")
+    parser.add_argument(
+        "--repo-id", type=str, default="janhq/250403-llama-3.2-3b-instruct-grpo", help="HuggingFace repository ID"
+    )
+    parser.add_argument("--local-dir", type=str, default="downloaded_model", help="Local directory to save model")
+    parser.add_argument(
+        "--ignore-patterns",
+        nargs="+",
+        default=None,
+        metavar="GLOB",
+        help='Glob patterns of files to skip, e.g. "*.log" "wandb/*" (default: download everything)',
+    )
+
+    return parser.parse_args()
+
+
+def main() -> None:
+    """Download the requested model repository into the local directory."""
+    args = parse_args()
+    # Load .env before reading the environment so an HF_TOKEN defined there is picked up
+    load_dotenv(override=True)
+    hf_token = os.getenv("HF_TOKEN")
+
+    snapshot_download(
+        token=hf_token,
+        repo_id=args.repo_id,
+        local_dir=args.local_dir,
+        repo_type="model",
+        ignore_patterns=args.ignore_patterns,
+    )
+    print(f"✅ Done: {args.repo_id} -> {args.local_dir}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/upload_checkpoint.py b/scripts/upload_checkpoint.py
--- a/scripts/upload_checkpoint.py
+++ b/scripts/upload_checkpoint.py
@@ -1,39 +1,58 @@
 """Upload local directory to HuggingFace Hub.
 This script uploads a specified local directory to HuggingFace Hub as a private repository.
-It uses API token from HuggingFace for authentication.
+
+Example:
+    python upload_checkpoint.py --local-dir "models/my_model" --repo-id "org/model-name"
 """
 
+import argparse
 import os
 
 from dotenv import load_dotenv
 from huggingface_hub import HfApi
 
-load_dotenv(override=True)
-
-# Configuration
-LOCAL_DIR = "trainer_output_deepseek-ai_DeepSeek-R1-Distill-Qwen-7B_gpu0_20250403_050520"
-REPO_ID = "janhq/250403-runpod-qwen7b-r1-distil"
-HF_TOKEN = os.getenv("HF_TOKEN")
-
-# Files to ignore during upload
-IGNORE_PATTERNS = [
-    "*.log",  # Log files
-    "*.pyc",  # Python cache
-    ".git*",  # Git files
-    "*.bin",  # Binary files
-    "*.pt",  # PyTorch checkpoints
-    "*.ckpt",  # Checkpoints
-    "events.*",  # Tensorboard
-    "wandb/*",  # Weights & Biases
-    "runs/*",  # Training runs
-]
-
-api = HfApi(token=HF_TOKEN)
-api.create_repo(repo_id=REPO_ID, private=True, exist_ok=True, repo_type="model")
-api.upload_folder(
-    folder_path=LOCAL_DIR,
-    repo_id=REPO_ID,
-    repo_type="model",
-    # ignore_patterns=IGNORE_PATTERNS,
-)
-print(f"✅ Done: {LOCAL_DIR} -> {REPO_ID}")
+
+def parse_args() -> argparse.Namespace:
+    """Parse command line arguments.
+
+    Returns:
+        argparse.Namespace: Parsed arguments (local_dir, repo_id, public, ignore_patterns)
+    """
+    parser = argparse.ArgumentParser(description="Upload model to HuggingFace Hub")
+    parser.add_argument("--local-dir", type=str, required=True, help="Local directory to upload")
+    parser.add_argument("--repo-id", type=str, required=True, help="HuggingFace repository ID")
+    parser.add_argument("--public", action="store_true", help="Make repository public (default: private)")
+    parser.add_argument(
+        "--ignore-patterns",
+        nargs="+",
+        default=None,
+        metavar="GLOB",
+        help='Glob patterns of files to skip, e.g. "*.log" "wandb/*" (default: upload everything)',
+    )
+    return parser.parse_args()
+
+
+def main() -> None:
+    """Upload the local directory to the requested HuggingFace Hub repository."""
+    args = parse_args()
+    # Fail before touching the Hub so a typo in --local-dir does not leave an empty repository behind
+    if not os.path.isdir(args.local_dir):
+        raise SystemExit(f"Error: --local-dir is not a directory: {args.local_dir}")
+
+    # Load .env before reading the environment so an HF_TOKEN defined there is picked up
+    load_dotenv(override=True)
+    hf_token = os.getenv("HF_TOKEN")
+
+    api = HfApi(token=hf_token)
+    api.create_repo(repo_id=args.repo_id, private=not args.public, exist_ok=True, repo_type="model")
+    api.upload_folder(
+        folder_path=args.local_dir,
+        repo_id=args.repo_id,
+        repo_type="model",
+        ignore_patterns=args.ignore_patterns,
+    )
+    print(f"✅ Done: {args.local_dir} -> {args.repo_id}")
+
+
+if __name__ == "__main__":
+    main()