From 424459d84084ea09e804f793f38e2d81d633a796 Mon Sep 17 00:00:00 2001
From: thinhlpg
Date: Mon, 14 Apr 2025 05:58:27 +0000
Subject: [PATCH] feat: update evaluation scripts to enhance model
 configuration and dataset loading, including increased max tokens and added
 logging

---
 scripts/eval_base.py | 10 ++++++----
 scripts/eval_lora.py |  6 ++++--
 src/evaluation.py    |  8 ++++++--
 src/search_module.py | 32 ++++++++++++++++++++++++--------
 4 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/scripts/eval_base.py b/scripts/eval_base.py
index 2857f83..da1a1c0 100644
--- a/scripts/eval_base.py
+++ b/scripts/eval_base.py
@@ -13,6 +13,7 @@ sys.path.append(project_root)
 from unsloth import FastLanguageModel
 from vllm import SamplingParams
 
+from config import logger
 from src import (
     apply_chat_template,
     build_reward_correctness_fn,
@@ -20,7 +21,6 @@ from src import (
     get_system_prompt,
     run_eval,
 )
-from config import logger
 
 
 def main():
@@ -35,23 +35,24 @@ def main():
     # Setup model
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name=args.model_name,
-        max_seq_length=4096 * 2,
+        max_seq_length=4096 * 6,
         load_in_4bit=True,
         fast_inference=True,
+        gpu_memory_utilization=0.8,
     )
 
     # Setup sampling params
     sampling_params = SamplingParams(
         temperature=args.temperature,
         top_p=0.95,
-        max_tokens=4096,
+        max_tokens=4096 * 6,
     )
 
     # Setup verifier with lower temperature
     verifier_params = SamplingParams(
         temperature=0.1,  # Lower temperature for more consistent verification
         top_p=0.95,
-        max_tokens=4096,
+        max_tokens=4096 * 6,
     )
 
     def generate_fn(inputs):
@@ -111,6 +112,7 @@ def main():
         tokenizer=tokenizer,
         output_file=output_file,
         debug_file=debug_file,
+        max_generations=32,
     )
 
     logger.info("✨ Evaluation completed!")
diff --git a/scripts/eval_lora.py b/scripts/eval_lora.py
index bdec694..b96e0d5 100644
--- a/scripts/eval_lora.py
+++ b/scripts/eval_lora.py
@@ -14,6 +14,7 @@ sys.path.append(project_root)
 from unsloth import FastLanguageModel
 from vllm import SamplingParams
 
+from config import logger
 from src import (
     apply_chat_template,
     build_reward_correctness_fn,
@@ -21,7 +22,6 @@ from src import (
     get_system_prompt,
     run_eval,
 )
-from config import logger
 
 
 def main():
@@ -45,6 +45,7 @@ def main():
         load_in_4bit=True,
         fast_inference=True,
         max_lora_rank=lora_config["r"],  # Use rank from config
+        gpu_memory_utilization=0.8,
     )
 
     # Setup LoRA using config
@@ -63,7 +64,7 @@ def main():
     sampling_params = SamplingParams(
         temperature=args.temperature,
         top_p=0.95,
-        max_tokens=4096,
+        max_tokens=4096 * 2,
     )
 
     # Setup verifier with lower temperature
@@ -134,6 +135,7 @@ def main():
         tokenizer=tokenizer,
         output_file=output_file,
         debug_file=debug_file,
+        max_generations=32,
     )
 
     logger.info("✨ Evaluation completed!")
diff --git a/src/evaluation.py b/src/evaluation.py
index 4089c24..7559cd3 100644
--- a/src/evaluation.py
+++ b/src/evaluation.py
@@ -5,7 +5,7 @@ Evaluation utilities for RL training.
 import inspect
 from datetime import datetime
 
-from config import logger
+from config import DATA_DIR, logger
 from src.agent import Agent
 from src.search_module import get_qa_dataset
 from src.tokenizer_adapter import LlamaTokenizerAdapter, R1DistilTokenizerAdapter
@@ -162,7 +162,11 @@ def run_eval(generate_fn, verify_fn, tokenizer, max_generations=20, output_file=
     Returns:
         full_chat_states: The chat states from evaluation
     """
-    train_dataset, test_dataset = get_qa_dataset()
+    train_dataset, test_dataset = get_qa_dataset(
+        randomize=False,
+        test_size=1,
+        questions_path=DATA_DIR / "processed" / "questions_dev.jsonl",
+    )
     questions = test_dataset["prompt"]
 
     # Create agent with appropriate adapter based on tokenizer
diff --git a/src/search_module.py b/src/search_module.py
index 5182276..a8eba7b 100644
--- a/src/search_module.py
+++ b/src/search_module.py
@@ -74,10 +74,20 @@ def search(query: str, return_type=str, results: int = 5):
 
 
 # Load questions from saved data
-def load_qa_data():
-    """Load the pre-generated questions"""
+def load_qa_data(questions_path=None):
+    """
+    Load the pre-generated questions
+
+    Args:
+        questions_path: Path to questions file (default: PROCESSED_DATA_DIR / "questions.jsonl")
+
+    Returns:
+        List of question-answer pairs
+    """
     try:
-        questions_path = PROCESSED_DATA_DIR / "questions.jsonl"
+        if questions_path is None:
+            questions_path = PROCESSED_DATA_DIR / "questions.jsonl"
+
         logger.info(f"Loading questions from: {questions_path}")
 
         # Load the questions
@@ -144,11 +154,11 @@ def get_question_count() -> int:
     return len(questions)
 
 
-def get_qa_dataset(randomize: bool = False, test_size: float = 0.1, seed: int = 42) -> tuple:
+def get_qa_dataset(randomize: bool = False, test_size: float = 0.1, seed: int = 42, questions_path=None) -> tuple:
     """
     Return a HuggingFace Dataset containing question and answer pairs.
 
-    This dataset is constructed from the loaded questions data (questions.json).
+    This dataset is constructed from the loaded questions data.
     Each element in the dataset is a dictionary that includes at least:
       - "question": The question text.
       - "answer": The corresponding answer text.
@@ -159,15 +169,21 @@ def get_qa_dataset(randomize: bool = False, test_size: float = 0.1, seed: int =
         randomize: Whether to shuffle the dataset
         test_size: Proportion of the dataset to include in the test split (0 for train-only)
         seed: Random seed for reproducibility
+        questions_path: Path to questions.jsonl file (if None, uses globally loaded questions)
 
     Returns:
         A tuple of (train_dataset, test_dataset) HuggingFace Dataset objects.
         If test_size=0, test_dataset will be empty. If test_size=1, train_dataset will be empty.
     """
-    if questions is None:
-        raise ValueError("Questions not loaded. Please ensure questions.json exists.")
+    qa_data = questions
+
+    if questions_path is not None:
+        qa_data = load_qa_data(questions_path)
+
+    if qa_data is None:
+        raise ValueError("Questions not loaded. Please ensure questions.jsonl exists.")
 
-    qa_dataset = Dataset.from_list(questions)
+    qa_dataset = Dataset.from_list(qa_data)
 
     if randomize:
         qa_dataset = qa_dataset.shuffle(seed=seed)
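
A minimal usage sketch of the dataset-loading path introduced by this patch, assuming the HuggingFace `datasets` library and a local JSONL file of QA pairs; the standalone helpers and the literal file path below are illustrative stand-ins for the repository's `load_qa_data` / `get_qa_dataset` and `DATA_DIR`, not the patched code itself:

    import json
    from pathlib import Path

    from datasets import Dataset


    def load_qa_data(questions_path: Path) -> list:
        """Read question-answer pairs from a JSONL file (one JSON object per line)."""
        with open(questions_path, encoding="utf-8") as f:
            return [json.loads(line) for line in f if line.strip()]


    def get_qa_dataset(questions_path: Path, randomize: bool = False, test_size: float = 0.1, seed: int = 42) -> tuple:
        """Build (train, test) Datasets; test_size=1 puts every example in the test split."""
        qa_dataset = Dataset.from_list(load_qa_data(questions_path))
        if randomize:
            qa_dataset = qa_dataset.shuffle(seed=seed)
        if test_size >= 1:
            return qa_dataset.select([]), qa_dataset  # empty train split, full test split
        if test_size <= 0:
            return qa_dataset, qa_dataset.select([])  # full train split, empty test split
        split = qa_dataset.train_test_split(test_size=test_size, seed=seed, shuffle=False)
        return split["train"], split["test"]


    # Mirrors the call added to run_eval(): fixed order, everything in the dev/test split.
    train_ds, test_ds = get_qa_dataset(
        Path("data/processed/questions_dev.jsonl"),  # illustrative path
        randomize=False,
        test_size=1,
    )
    print(len(train_ds), len(test_ds))

Handling test_size values of 0 and 1 before splitting matches the docstring semantics in the patch (test_size=1 leaves the train split empty) rather than passing them to train_test_split, which would interpret an integer test_size as an absolute sample count.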