From 6d994feeb21f4a72092c660077b82b9b8ae6fc62 Mon Sep 17 00:00:00 2001
From: thinhlpg
Date: Thu, 3 Apr 2025 22:45:06 +0700
Subject: [PATCH] feat: enhance evaluation scripts for base and LoRA models

---
 scripts/eval_base.py |  90 +++++++++++++++++++------------
 scripts/eval_lora.py | 122 +++++++++++++++++++++++++------------------
 src/evaluation.py    |   2 +-
 3 files changed, 129 insertions(+), 85 deletions(-)

diff --git a/scripts/eval_base.py b/scripts/eval_base.py
index d0de2bd..53167e9 100644
--- a/scripts/eval_base.py
+++ b/scripts/eval_base.py
@@ -1,7 +1,9 @@
 """Simple script to evaluate base model performance."""
 
 import argparse
+import os
 import sys
+from datetime import datetime
 from pathlib import Path
 
 # Add project root to Python path
@@ -15,18 +17,20 @@ from src import (
     apply_chat_template,
     build_reward_correctness_fn,
     build_user_prompt,
-    get_qa_dataset,
     get_system_prompt,
+    run_eval,
 )
+from src.config import logger
 
 
 def main():
     """Run base model evaluation."""
     parser = argparse.ArgumentParser(description="Evaluate base model")
     parser.add_argument("--model_name", type=str, required=True, help="Name/path of the model to evaluate")
+    parser.add_argument("--temperature", type=float, default=0, help="Sampling temperature")
     args = parser.parse_args()
 
-    print(f"🚀 Setting up model {args.model_name}...")
+    logger.info(f"🚀 Setting up model {args.model_name}...")
 
     # Setup model
     model, tokenizer = FastLanguageModel.from_pretrained(
@@ -38,7 +42,14 @@ def main():
 
     # Setup sampling params
     sampling_params = SamplingParams(
-        temperature=0.5,
+        temperature=args.temperature,
+        top_p=0.95,
+        max_tokens=4096,
+    )
+
+    # Setup verifier with lower temperature
+    verifier_params = SamplingParams(
+        temperature=0.1,  # Lower temperature for more consistent verification
         top_p=0.95,
         max_tokens=4096,
     )
@@ -59,40 +70,51 @@ def main():
             [apply_chat_template(msg, tokenizer=tokenizer)["text"] for msg in messages],
             sampling_params=sampling_params,
         )
+        return outputs
+
+    def verifier_generate_fn(inputs):
+        """Generate verification responses with lower temperature."""
+        messages = [
+            {
+                "messages": [
+                    {"role": "system", "content": get_system_prompt()},
+                    {"role": "user", "content": build_user_prompt(input_text)},
+                ]
+            }
+            for input_text in inputs
+        ]
 
-        # Format outputs as chat messages
-        formatted_outputs = []
-        for output in outputs:
-            formatted_outputs.append(
-                {
-                    "messages": [
-                        {"role": "system", "content": get_system_prompt()},
-                        {"role": "assistant", "content": output.outputs[0].text},
-                    ]
-                }
-            )
-        return formatted_outputs
-
-    # Get dataset
-    _, test_dataset = get_qa_dataset()
-    questions = test_dataset["prompt"]
-    answers = test_dataset["answer"]
-
-    print(f"📝 Evaluating {len(questions)} questions...")
+        return model.fast_generate(
+            [apply_chat_template(msg, tokenizer=tokenizer)["text"] for msg in messages],
+            sampling_params=verifier_params,
+        )
 
     # Build verifier
-    verify_fn = build_reward_correctness_fn(generate_fn, tokenizer)
-
-    # Run evaluation
-    completions = generate_fn(questions)
-    rewards = verify_fn(questions, completions, answer=answers)
-    accuracy = sum(rewards) / len(rewards)
-
-    print(f"\n{'=' * 50}")
-    print("🎯 BASE MODEL EVALUATION RESULTS:")
-    print(f"{'=' * 50}")
-    print(f"✨ Model: {args.model_name}")
-    print(f"📊 Accuracy: {accuracy:.4f} ({sum(rewards)}/{len(rewards)} correct)")
+    verify_fn = build_reward_correctness_fn(verifier_generate_fn, tokenizer)
+
+    # Setup output directories
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    eval_log_dir = "eval_logs"
+    os.makedirs(eval_log_dir, exist_ok=True)
+
+    output_file = os.path.join(eval_log_dir, f"base_model_results_{timestamp}.txt")
+    debug_file = os.path.join(eval_log_dir, f"base_model_debug_{timestamp}.json")
+
+    logger.info("📝 Starting evaluation...")
+    logger.info(f"Results will be saved to: {output_file}")
+    logger.info(f"Debug info will be saved to: {debug_file}")
+
+    # Run evaluation using the agentic approach
+    full_chat_states = run_eval(
+        generate_fn=generate_fn,
+        verify_fn=verify_fn,
+        tokenizer=tokenizer,
+        output_file=output_file,
+        debug_file=debug_file,
+    )
+
+    logger.info("✨ Evaluation completed!")
+    logger.info(f"Check {output_file} for detailed results")
 
 
 if __name__ == "__main__":
diff --git a/scripts/eval_lora.py b/scripts/eval_lora.py
index a8aafd3..0a70354 100644
--- a/scripts/eval_lora.py
+++ b/scripts/eval_lora.py
@@ -1,7 +1,10 @@
-"""Simple script to evaluate LoRA model performance."""
+"""Script to evaluate LoRA model performance with enhanced debugging."""
 
 import argparse
+import json
+import os
 import sys
+from datetime import datetime
 from pathlib import Path
 
 # Add project root to Python path
@@ -15,9 +18,10 @@ from src import (
     apply_chat_template,
     build_reward_correctness_fn,
     build_user_prompt,
-    get_qa_dataset,
     get_system_prompt,
+    run_eval,
 )
+from src.config import logger
 
 
 def main():
@@ -25,40 +29,46 @@
     parser = argparse.ArgumentParser(description="Evaluate LoRA model")
     parser.add_argument("--model_name", type=str, required=True, help="Name/path of the base model")
     parser.add_argument("--lora_path", type=str, required=True, help="Path to LoRA weights")
+    parser.add_argument("--temperature", type=float, default=0, help="Sampling temperature")
     args = parser.parse_args()
 
-    print(f"🚀 Setting up model {args.model_name} with LoRA from {args.lora_path}...")
+    logger.info(f"🚀 Setting up model {args.model_name} with LoRA from {args.lora_path}...")
 
-    # Setup model with LoRA support
+    # Load LoRA config first to get max rank
+    with open(f"{args.lora_path}/adapter_config.json") as f:
+        lora_config = json.load(f)
+
+    # Setup model with LoRA support using config values
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name=args.model_name,
         max_seq_length=4096 * 2,
         load_in_4bit=True,
         fast_inference=True,
-        max_lora_rank=64,
+        max_lora_rank=lora_config["r"],  # Use rank from config
     )
 
-    # Setup LoRA
+    # Setup LoRA using config
    model = FastLanguageModel.get_peft_model(
         model,
-        r=64,
-        target_modules=[
-            "q_proj",
-            "k_proj",
-            "v_proj",
-            "o_proj",
-            "gate_proj",
-            "up_proj",
-            "down_proj",
-        ],
-        lora_alpha=64,
+        r=lora_config["r"],
+        target_modules=lora_config["target_modules"],
+        lora_alpha=lora_config["lora_alpha"],
+        lora_dropout=lora_config["lora_dropout"],
+        bias=lora_config["bias"],
         use_gradient_checkpointing=True,
         random_state=3407,
     )
 
     # Setup sampling params
     sampling_params = SamplingParams(
-        temperature=0.5,
+        temperature=args.temperature,
+        top_p=0.95,
+        max_tokens=4096,
+    )
+
+    # Setup verifier with lower temperature
+    verifier_params = SamplingParams(
+        temperature=0.1,  # Lower temperature for more consistent verification
         top_p=0.95,
         max_tokens=4096,
     )
@@ -81,41 +91,53 @@
             sampling_params=sampling_params,
             lora_request=lora_request,
         )
+        return outputs
 
-        # Format outputs as chat messages
-        formatted_outputs = []
-        for output in outputs:
-            formatted_outputs.append(
-                {
-                    "messages": [
-                        {"role": "system", "content": get_system_prompt()},
-                        {"role": "assistant", "content": output.outputs[0].text},
-                    ]
-                }
-            )
-        return formatted_outputs
-
-    # Get dataset
-    _, test_dataset = get_qa_dataset()
-    questions = test_dataset["prompt"]
-    answers = test_dataset["answer"]
-
-    print(f"📝 Evaluating {len(questions)} questions...")
+    def verifier_generate_fn(inputs):
+        """Generate verification responses with lower temperature."""
+        messages = [
+            {
+                "messages": [
+                    {"role": "system", "content": get_system_prompt()},
+                    {"role": "user", "content": build_user_prompt(input_text)},
+                ]
+            }
+            for input_text in inputs
+        ]
+
+        lora_request = model.load_lora(args.lora_path)
+        return model.fast_generate(
+            [apply_chat_template(msg, tokenizer=tokenizer)["text"] for msg in messages],
+            sampling_params=verifier_params,
+            lora_request=lora_request,
+        )
 
     # Build verifier
-    verify_fn = build_reward_correctness_fn(generate_fn, tokenizer)
-
-    # Run evaluation
-    completions = generate_fn(questions)
-    rewards = verify_fn(questions, completions, answer=answers)
-    accuracy = sum(rewards) / len(rewards)
-
-    print(f"\n{'=' * 50}")
-    print("🎯 LORA MODEL EVALUATION RESULTS:")
-    print(f"{'=' * 50}")
-    print(f"✨ Base Model: {args.model_name}")
-    print(f"🔧 LoRA Path: {args.lora_path}")
-    print(f"📊 Accuracy: {accuracy:.4f} ({sum(rewards)}/{len(rewards)} correct)")
+    verify_fn = build_reward_correctness_fn(verifier_generate_fn, tokenizer)
+
+    # Setup output directories
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    eval_log_dir = "eval_logs"
+    os.makedirs(eval_log_dir, exist_ok=True)
+
+    output_file = os.path.join(eval_log_dir, f"lora_model_results_{timestamp}.txt")
+    debug_file = os.path.join(eval_log_dir, f"lora_model_debug_{timestamp}.json")
+
+    logger.info("📝 Starting evaluation...")
+    logger.info(f"Results will be saved to: {output_file}")
+    logger.info(f"Debug info will be saved to: {debug_file}")
+
+    # Run evaluation using the agentic approach
+    full_chat_states = run_eval(
+        generate_fn=generate_fn,
+        verify_fn=verify_fn,
+        tokenizer=tokenizer,
+        output_file=output_file,
+        debug_file=debug_file,
+    )
+
+    logger.info("✨ Evaluation completed!")
+    logger.info(f"Check {output_file} for detailed results")
 
 
 if __name__ == "__main__":
diff --git a/src/evaluation.py b/src/evaluation.py
index 286df9d..e63711d 100644
--- a/src/evaluation.py
+++ b/src/evaluation.py
@@ -167,7 +167,7 @@ def run_eval(generate_fn, verify_fn, tokenizer, output_file=None, debug_file=Non
 
     # Create agent with appropriate adapter based on tokenizer
     tokenizer_name = tokenizer.name_or_path.lower()
-    if "deepseek-r1-distill" in tokenizer_name:
+    if "deepseek-ai/deepseek-r1-distill" in tokenizer_name:
         adapter = R1DistilTokenizerAdapter()
     elif "llama" in tokenizer_name:
         adapter = LlamaTokenizerAdapter()
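
Usage note (illustrative, not part of the diff itself): with this patch both scripts drive the shared run_eval loop and write a results file plus a debug JSON into eval_logs/ under timestamped names. Assuming placeholder values for the model name and adapter path, an invocation might look like:

    python scripts/eval_base.py --model_name <base_model_name_or_path> --temperature 0
    python scripts/eval_lora.py --model_name <base_model_name_or_path> --lora_path <path_to_lora_adapter> --temperature 0

The --temperature flag defaults to 0, while the verification pass in both scripts uses its own SamplingParams pinned at temperature 0.1.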