feat: update evaluation scripts to enhance model configuration and dataset loading, including increased max tokens and added logging

main
thinhlpg 4 weeks ago
parent bf9f2c4102
commit 424459d840

@@ -13,6 +13,7 @@ sys.path.append(project_root)
 from unsloth import FastLanguageModel
 from vllm import SamplingParams
+from config import logger
 from src import (
     apply_chat_template,
     build_reward_correctness_fn,
@@ -20,7 +21,6 @@ from src import (
     get_system_prompt,
     run_eval,
 )
-from config import logger


 def main():
@@ -35,23 +35,24 @@ def main():
     # Setup model
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name=args.model_name,
-        max_seq_length=4096 * 2,
+        max_seq_length=4096 * 6,
         load_in_4bit=True,
         fast_inference=True,
+        gpu_memory_utilization=0.8,
     )

     # Setup sampling params
     sampling_params = SamplingParams(
         temperature=args.temperature,
         top_p=0.95,
-        max_tokens=4096,
+        max_tokens=4096 * 6,
     )

     # Setup verifier with lower temperature
     verifier_params = SamplingParams(
         temperature=0.1,  # Lower temperature for more consistent verification
         top_p=0.95,
-        max_tokens=4096,
+        max_tokens=4096 * 6,
     )

     def generate_fn(inputs):
@@ -111,6 +112,7 @@ def main():
         tokenizer=tokenizer,
         output_file=output_file,
         debug_file=debug_file,
+        max_generations=32,
     )

     logger.info("✨ Evaluation completed!")

@@ -14,6 +14,7 @@ sys.path.append(project_root)
 from unsloth import FastLanguageModel
 from vllm import SamplingParams
+from config import logger
 from src import (
     apply_chat_template,
     build_reward_correctness_fn,
@@ -21,7 +22,6 @@ from src import (
     get_system_prompt,
     run_eval,
 )
-from config import logger


 def main():
@@ -45,6 +45,7 @@ def main():
         load_in_4bit=True,
         fast_inference=True,
         max_lora_rank=lora_config["r"],  # Use rank from config
+        gpu_memory_utilization=0.8,
     )

     # Setup LoRA using config
@@ -63,7 +64,7 @@ def main():
     sampling_params = SamplingParams(
         temperature=args.temperature,
         top_p=0.95,
-        max_tokens=4096,
+        max_tokens=4096 * 2,
     )

     # Setup verifier with lower temperature
@@ -134,6 +135,7 @@ def main():
         tokenizer=tokenizer,
         output_file=output_file,
         debug_file=debug_file,
+        max_generations=32,
     )

     logger.info("✨ Evaluation completed!")

@@ -5,7 +5,7 @@ Evaluation utilities for RL training.
 import inspect
 from datetime import datetime

-from config import logger
+from config import DATA_DIR, logger
 from src.agent import Agent
 from src.search_module import get_qa_dataset
 from src.tokenizer_adapter import LlamaTokenizerAdapter, R1DistilTokenizerAdapter
@@ -162,7 +162,11 @@ def run_eval(generate_fn, verify_fn, tokenizer, max_generations=20, output_file=
     Returns:
         full_chat_states: The chat states from evaluation
     """
-    train_dataset, test_dataset = get_qa_dataset()
+    train_dataset, test_dataset = get_qa_dataset(
+        randomize=False,
+        test_size=1,
+        questions_path=DATA_DIR / "processed" / "questions_dev.jsonl",
+    )
     questions = test_dataset["prompt"]

     # Create agent with appropriate adapter based on tokenizer
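DATA_DIR is newly imported from config; its definition is not part of this diff, but the / operator implies a pathlib.Path. A hypothetical definition consistent with how run_eval uses it:

from pathlib import Path

# Hypothetical: config.py presumably defines something along these lines
DATA_DIR = Path(__file__).resolve().parent / "data"

# which makes the eval split above resolve to <repo>/data/processed/questions_dev.jsonl
eval_path = DATA_DIR / "processed" / "questions_dev.jsonl"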

@@ -74,10 +74,20 @@ def search(query: str, return_type=str, results: int = 5):
 # Load questions from saved data
-def load_qa_data():
-    """Load the pre-generated questions"""
+def load_qa_data(questions_path=None):
+    """
+    Load the pre-generated questions
+
+    Args:
+        questions_path: Path to questions file (default: PROCESSED_DATA_DIR / "questions.jsonl")
+
+    Returns:
+        List of question-answer pairs
+    """
     try:
-        questions_path = PROCESSED_DATA_DIR / "questions.jsonl"
+        if questions_path is None:
+            questions_path = PROCESSED_DATA_DIR / "questions.jsonl"
+
         logger.info(f"Loading questions from: {questions_path}")

         # Load the questions
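Usage sketch for the new optional argument (paths per the diff; DATA_DIR as imported in the evaluation module above, not in this file):

# Default behavior is unchanged: falls back to PROCESSED_DATA_DIR / "questions.jsonl"
questions = load_qa_data()

# Or point it at the dev split added for evaluation
dev_questions = load_qa_data(DATA_DIR / "processed" / "questions_dev.jsonl")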
@@ -144,11 +154,11 @@ def get_question_count() -> int:
     return len(questions)


-def get_qa_dataset(randomize: bool = False, test_size: float = 0.1, seed: int = 42) -> tuple:
+def get_qa_dataset(randomize: bool = False, test_size: float = 0.1, seed: int = 42, questions_path=None) -> tuple:
     """
     Return a HuggingFace Dataset containing question and answer pairs.

-    This dataset is constructed from the loaded questions data (questions.json).
+    This dataset is constructed from the loaded questions data.
     Each element in the dataset is a dictionary that includes at least:
       - "question": The question text.
       - "answer": The corresponding answer text.
@@ -159,15 +169,21 @@ def get_qa_dataset(randomize: bool = False, test_size: float = 0.1, seed: int =
         randomize: Whether to shuffle the dataset
         test_size: Proportion of the dataset to include in the test split (0 for train-only)
         seed: Random seed for reproducibility
+        questions_path: Path to questions.jsonl file (if None, uses globally loaded questions)

     Returns:
         A tuple of (train_dataset, test_dataset) HuggingFace Dataset objects.
         If test_size=0, test_dataset will be empty. If test_size=1, train_dataset will be empty.
     """
-    if questions is None:
-        raise ValueError("Questions not loaded. Please ensure questions.json exists.")
+    qa_data = questions
+    if questions_path is not None:
+        qa_data = load_qa_data(questions_path)
+
+    if qa_data is None:
+        raise ValueError("Questions not loaded. Please ensure questions.jsonl exists.")

-    qa_dataset = Dataset.from_list(questions)
+    qa_dataset = Dataset.from_list(qa_data)
     if randomize:
         qa_dataset = qa_dataset.shuffle(seed=seed)
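A short usage sketch of the extended signature, mirroring the run_eval call above; the assert follows from the docstring (test_size=1 leaves the train split empty):

from config import DATA_DIR
from src.search_module import get_qa_dataset

# Evaluation-only split: test_size=1 sends every question to the test set
train_ds, test_ds = get_qa_dataset(
    randomize=False,
    test_size=1,
    questions_path=DATA_DIR / "processed" / "questions_dev.jsonl",
)
assert len(train_ds) == 0
questions = test_ds["prompt"]  # same access pattern as run_eval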
