feat: enhance evaluation scripts for base and LoRA models

main
thinhlpg 1 month ago
parent da60b52bd1
commit 6d994feeb2

@@ -1,7 +1,9 @@
"""Simple script to evaluate base model performance."""
import argparse
import os
import sys
from datetime import datetime
from pathlib import Path
# Add project root to Python path
@@ -15,18 +17,20 @@ from src import (
apply_chat_template,
build_reward_correctness_fn,
build_user_prompt,
get_qa_dataset,
get_system_prompt,
run_eval,
)
from src.config import logger
def main():
"""Run base model evaluation."""
parser = argparse.ArgumentParser(description="Evaluate base model")
parser.add_argument("--model_name", type=str, required=True, help="Name/path of the model to evaluate")
parser.add_argument("--temperature", type=float, default=0, help="Sampling temperature")
args = parser.parse_args()
print(f"🚀 Setting up model {args.model_name}...")
logger.info(f"🚀 Setting up model {args.model_name}...")
# Setup model
model, tokenizer = FastLanguageModel.from_pretrained(
@@ -38,7 +42,14 @@ def main():
# Setup sampling params
sampling_params = SamplingParams(
temperature=0.5,
temperature=args.temperature,
top_p=0.95,
max_tokens=4096,
)
# Setup verifier with lower temperature
verifier_params = SamplingParams(
temperature=0.1, # Lower temperature for more consistent verification
top_p=0.95,
max_tokens=4096,
)
@@ -59,40 +70,51 @@ def main():
[apply_chat_template(msg, tokenizer=tokenizer)["text"] for msg in messages],
sampling_params=sampling_params,
)
return outputs
def verifier_generate_fn(inputs):
"""Generate verification responses with lower temperature."""
messages = [
{
"messages": [
{"role": "system", "content": get_system_prompt()},
{"role": "user", "content": build_user_prompt(input_text)},
]
}
for input_text in inputs
]
# Format outputs as chat messages
formatted_outputs = []
for output in outputs:
formatted_outputs.append(
{
"messages": [
{"role": "system", "content": get_system_prompt()},
{"role": "assistant", "content": output.outputs[0].text},
]
}
)
return formatted_outputs
# Get dataset
_, test_dataset = get_qa_dataset()
questions = test_dataset["prompt"]
answers = test_dataset["answer"]
print(f"📝 Evaluating {len(questions)} questions...")
return model.fast_generate(
[apply_chat_template(msg, tokenizer=tokenizer)["text"] for msg in messages],
sampling_params=verifier_params,
)
# Build verifier
verify_fn = build_reward_correctness_fn(generate_fn, tokenizer)
# Run evaluation
completions = generate_fn(questions)
rewards = verify_fn(questions, completions, answer=answers)
accuracy = sum(rewards) / len(rewards)
print(f"\n{'=' * 50}")
print("🎯 BASE MODEL EVALUATION RESULTS:")
print(f"{'=' * 50}")
print(f"✨ Model: {args.model_name}")
print(f"📊 Accuracy: {accuracy:.4f} ({sum(rewards)}/{len(rewards)} correct)")
verify_fn = build_reward_correctness_fn(verifier_generate_fn, tokenizer)
# Setup output directories
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
eval_log_dir = "eval_logs"
os.makedirs(eval_log_dir, exist_ok=True)
output_file = os.path.join(eval_log_dir, f"base_model_results_{timestamp}.txt")
debug_file = os.path.join(eval_log_dir, f"base_model_debug_{timestamp}.json")
logger.info("📝 Starting evaluation...")
logger.info(f"Results will be saved to: {output_file}")
logger.info(f"Debug info will be saved to: {debug_file}")
# Run evaluation using the agentic approach
full_chat_states = run_eval(
generate_fn=generate_fn,
verify_fn=verify_fn,
tokenizer=tokenizer,
output_file=output_file,
debug_file=debug_file,
)
logger.info("✨ Evaluation completed!")
logger.info(f"Check {output_file} for detailed results")
if __name__ == "__main__":

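For context on the pattern above: answer generation and verification now use separate sampling configurations built from the same model, with verification pinned to a low temperature. Below is a minimal sketch of that two-temperature setup, assuming SamplingParams comes from vLLM (the usual pairing with Unsloth's fast_inference=True) and using a hypothetical make_generate_fn helper that does not exist in the repository; the prompt-building utilities are the ones imported from src in the diff.

from vllm import SamplingParams

from src import apply_chat_template, build_user_prompt, get_system_prompt

# Sketch only: mirrors the two sampling configurations added above.
# Answers use the CLI-provided temperature; verification stays near-deterministic.
answer_params = SamplingParams(temperature=0.0, top_p=0.95, max_tokens=4096)
verifier_params = SamplingParams(temperature=0.1, top_p=0.95, max_tokens=4096)


def make_generate_fn(model, tokenizer, params):
    """Hypothetical helper: bind model.fast_generate to one SamplingParams."""

    def generate_fn(inputs):
        # Wrap each raw question in the system/user chat structure used by the scripts.
        messages = [
            {
                "messages": [
                    {"role": "system", "content": get_system_prompt()},
                    {"role": "user", "content": build_user_prompt(text)},
                ]
            }
            for text in inputs
        ]
        texts = [apply_chat_template(m, tokenizer=tokenizer)["text"] for m in messages]
        return model.fast_generate(texts, sampling_params=params)

    return generate_fn

With such a helper, the answer generator and the verifier generator would differ only in the SamplingParams bound to them, which is the design the diff implements inline.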
@@ -1,7 +1,10 @@
"""Simple script to evaluate LoRA model performance."""
"""Script to evaluate LoRA model performance with enhanced debugging."""
import argparse
import json
import os
import sys
from datetime import datetime
from pathlib import Path
# Add project root to Python path
@@ -15,9 +18,10 @@ from src import (
apply_chat_template,
build_reward_correctness_fn,
build_user_prompt,
get_qa_dataset,
get_system_prompt,
run_eval,
)
from src.config import logger
def main():
@@ -25,40 +29,46 @@ def main():
parser = argparse.ArgumentParser(description="Evaluate LoRA model")
parser.add_argument("--model_name", type=str, required=True, help="Name/path of the base model")
parser.add_argument("--lora_path", type=str, required=True, help="Path to LoRA weights")
parser.add_argument("--temperature", type=float, default=0, help="Sampling temperature")
args = parser.parse_args()
print(f"🚀 Setting up model {args.model_name} with LoRA from {args.lora_path}...")
logger.info(f"🚀 Setting up model {args.model_name} with LoRA from {args.lora_path}...")
# Setup model with LoRA support
# Load LoRA config first to get max rank
with open(f"{args.lora_path}/adapter_config.json") as f:
lora_config = json.load(f)
# Setup model with LoRA support using config values
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=args.model_name,
max_seq_length=4096 * 2,
load_in_4bit=True,
fast_inference=True,
max_lora_rank=64,
max_lora_rank=lora_config["r"], # Use rank from config
)
# Setup LoRA
# Setup LoRA using config
model = FastLanguageModel.get_peft_model(
model,
r=64,
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
],
lora_alpha=64,
r=lora_config["r"],
target_modules=lora_config["target_modules"],
lora_alpha=lora_config["lora_alpha"],
lora_dropout=lora_config["lora_dropout"],
bias=lora_config["bias"],
use_gradient_checkpointing=True,
random_state=3407,
)
# Setup sampling params
sampling_params = SamplingParams(
temperature=0.5,
temperature=args.temperature,
top_p=0.95,
max_tokens=4096,
)
# Setup verifier with lower temperature
verifier_params = SamplingParams(
temperature=0.1, # Lower temperature for more consistent verification
top_p=0.95,
max_tokens=4096,
)
@@ -81,41 +91,53 @@ def main():
sampling_params=sampling_params,
lora_request=lora_request,
)
return outputs
# Format outputs as chat messages
formatted_outputs = []
for output in outputs:
formatted_outputs.append(
{
"messages": [
{"role": "system", "content": get_system_prompt()},
{"role": "assistant", "content": output.outputs[0].text},
]
}
)
return formatted_outputs
# Get dataset
_, test_dataset = get_qa_dataset()
questions = test_dataset["prompt"]
answers = test_dataset["answer"]
print(f"📝 Evaluating {len(questions)} questions...")
def verifier_generate_fn(inputs):
"""Generate verification responses with lower temperature."""
messages = [
{
"messages": [
{"role": "system", "content": get_system_prompt()},
{"role": "user", "content": build_user_prompt(input_text)},
]
}
for input_text in inputs
]
lora_request = model.load_lora(args.lora_path)
return model.fast_generate(
[apply_chat_template(msg, tokenizer=tokenizer)["text"] for msg in messages],
sampling_params=verifier_params,
lora_request=lora_request,
)
# Build verifier
verify_fn = build_reward_correctness_fn(generate_fn, tokenizer)
# Run evaluation
completions = generate_fn(questions)
rewards = verify_fn(questions, completions, answer=answers)
accuracy = sum(rewards) / len(rewards)
print(f"\n{'=' * 50}")
print("🎯 LORA MODEL EVALUATION RESULTS:")
print(f"{'=' * 50}")
print(f"✨ Base Model: {args.model_name}")
print(f"🔧 LoRA Path: {args.lora_path}")
print(f"📊 Accuracy: {accuracy:.4f} ({sum(rewards)}/{len(rewards)} correct)")
verify_fn = build_reward_correctness_fn(verifier_generate_fn, tokenizer)
# Setup output directories
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
eval_log_dir = "eval_logs"
os.makedirs(eval_log_dir, exist_ok=True)
output_file = os.path.join(eval_log_dir, f"lora_model_results_{timestamp}.txt")
debug_file = os.path.join(eval_log_dir, f"lora_model_debug_{timestamp}.json")
logger.info("📝 Starting evaluation...")
logger.info(f"Results will be saved to: {output_file}")
logger.info(f"Debug info will be saved to: {debug_file}")
# Run evaluation using the agentic approach
full_chat_states = run_eval(
generate_fn=generate_fn,
verify_fn=verify_fn,
tokenizer=tokenizer,
output_file=output_file,
debug_file=debug_file,
)
logger.info("✨ Evaluation completed!")
logger.info(f"Check {output_file} for detailed results")
if __name__ == "__main__":

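The LoRA script now reads its PEFT hyperparameters from the adapter's own adapter_config.json instead of hard-coding them. As a reference for what that lookup expects, here is a small sketch of reading the file; the path is a placeholder for whatever is passed via --lora_path, and the commented values are illustrative, chosen to match the previously hard-coded settings (r=64, lora_alpha=64, the standard attention and MLP projections).

import json

# Placeholder path -- substitute the directory given via --lora_path.
with open("path/to/lora/adapter_config.json") as f:
    lora_config = json.load(f)

# Keys accessed in the diff, with illustrative values. A PEFT-saved adapter
# config typically contains entries such as:
# {
#   "r": 64,
#   "lora_alpha": 64,
#   "lora_dropout": 0.0,
#   "bias": "none",
#   "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj",
#                      "gate_proj", "up_proj", "down_proj"]
# }
print(lora_config["r"], lora_config["lora_alpha"], lora_config["target_modules"])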
@@ -167,7 +167,7 @@ def run_eval(generate_fn, verify_fn, tokenizer, output_file=None, debug_file=None):
# Create agent with appropriate adapter based on tokenizer
tokenizer_name = tokenizer.name_or_path.lower()
if "deepseek-r1-distill" in tokenizer_name:
if "deepseek-ai/deepseek-r1-distill" in tokenizer_name:
adapter = R1DistilTokenizerAdapter()
elif "llama" in tokenizer_name:
adapter = LlamaTokenizerAdapter()
