From 424459d84084ea09e804f793f38e2d81d633a796 Mon Sep 17 00:00:00 2001
From: thinhlpg
Date: Mon, 14 Apr 2025 05:58:27 +0000
Subject: [PATCH] feat: update evaluation scripts to enhance model
 configuration and dataset loading, including increased max tokens and added
 logging

---
 scripts/eval_base.py | 10 ++++++----
 scripts/eval_lora.py |  6 ++++--
 src/evaluation.py    |  8 ++++++--
 src/search_module.py | 32 ++++++++++++++++++++++++--------
 4 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/scripts/eval_base.py b/scripts/eval_base.py
index 2857f83..da1a1c0 100644
--- a/scripts/eval_base.py
+++ b/scripts/eval_base.py
@@ -13,6 +13,7 @@ sys.path.append(project_root)
 from unsloth import FastLanguageModel
 from vllm import SamplingParams
 
+from config import logger
 from src import (
     apply_chat_template,
     build_reward_correctness_fn,
@@ -20,7 +21,6 @@ from src import (
     get_system_prompt,
     run_eval,
 )
-from config import logger
 
 
 def main():
@@ -35,23 +35,24 @@ def main():
     # Setup model
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name=args.model_name,
-        max_seq_length=4096 * 2,
+        max_seq_length=4096 * 6,
         load_in_4bit=True,
         fast_inference=True,
+        gpu_memory_utilization=0.8,
     )
 
     # Setup sampling params
     sampling_params = SamplingParams(
         temperature=args.temperature,
         top_p=0.95,
-        max_tokens=4096,
+        max_tokens=4096 * 6,
     )
 
     # Setup verifier with lower temperature
     verifier_params = SamplingParams(
         temperature=0.1,  # Lower temperature for more consistent verification
         top_p=0.95,
-        max_tokens=4096,
+        max_tokens=4096 * 6,
     )
 
     def generate_fn(inputs):
@@ -111,6 +112,7 @@ def main():
         tokenizer=tokenizer,
         output_file=output_file,
         debug_file=debug_file,
+        max_generations=32,
     )
 
     logger.info("✨ Evaluation completed!")
diff --git a/scripts/eval_lora.py b/scripts/eval_lora.py
index bdec694..b96e0d5 100644
--- a/scripts/eval_lora.py
+++ b/scripts/eval_lora.py
@@ -14,6 +14,7 @@ sys.path.append(project_root)
 from unsloth import FastLanguageModel
 from vllm import SamplingParams
 
+from config import logger
 from src import (
     apply_chat_template,
     build_reward_correctness_fn,
@@ -21,7 +22,6 @@ from src import (
     get_system_prompt,
     run_eval,
 )
-from config import logger
 
 
 def main():
@@ -45,6 +45,7 @@ def main():
         load_in_4bit=True,
         fast_inference=True,
         max_lora_rank=lora_config["r"],  # Use rank from config
+        gpu_memory_utilization=0.8,
     )
 
     # Setup LoRA using config
@@ -63,7 +64,7 @@ def main():
     sampling_params = SamplingParams(
         temperature=args.temperature,
         top_p=0.95,
-        max_tokens=4096,
+        max_tokens=4096 * 2,
     )
 
     # Setup verifier with lower temperature
@@ -134,6 +135,7 @@ def main():
         tokenizer=tokenizer,
         output_file=output_file,
         debug_file=debug_file,
+        max_generations=32,
     )
 
     logger.info("✨ Evaluation completed!")
diff --git a/src/evaluation.py b/src/evaluation.py
index 4089c24..7559cd3 100644
--- a/src/evaluation.py
+++ b/src/evaluation.py
@@ -5,7 +5,7 @@ Evaluation utilities for RL training.
 import inspect
 from datetime import datetime
 
-from config import logger
+from config import DATA_DIR, logger
 from src.agent import Agent
 from src.search_module import get_qa_dataset
 from src.tokenizer_adapter import LlamaTokenizerAdapter, R1DistilTokenizerAdapter
@@ -162,7 +162,11 @@ def run_eval(generate_fn, verify_fn, tokenizer, max_generations=20, output_file=
     Returns:
         full_chat_states: The chat states from evaluation
     """
-    train_dataset, test_dataset = get_qa_dataset()
+    train_dataset, test_dataset = get_qa_dataset(
+        randomize=False,
+        test_size=1,
+        questions_path=DATA_DIR / "processed" / "questions_dev.jsonl",
+    )
     questions = test_dataset["prompt"]
 
     # Create agent with appropriate adapter based on tokenizer
diff --git a/src/search_module.py b/src/search_module.py
index 5182276..a8eba7b 100644
--- a/src/search_module.py
+++ b/src/search_module.py
@@ -74,10 +74,20 @@ def search(query: str, return_type=str, results: int = 5):
 
 
 # Load questions from saved data
-def load_qa_data():
-    """Load the pre-generated questions"""
+def load_qa_data(questions_path=None):
+    """
+    Load the pre-generated questions
+
+    Args:
+        questions_path: Path to questions file (default: PROCESSED_DATA_DIR / "questions.jsonl")
+
+    Returns:
+        List of question-answer pairs
+    """
     try:
-        questions_path = PROCESSED_DATA_DIR / "questions.jsonl"
+        if questions_path is None:
+            questions_path = PROCESSED_DATA_DIR / "questions.jsonl"
+
         logger.info(f"Loading questions from: {questions_path}")
 
         # Load the questions
@@ -144,11 +154,11 @@ def get_question_count() -> int:
     return len(questions)
 
 
-def get_qa_dataset(randomize: bool = False, test_size: float = 0.1, seed: int = 42) -> tuple:
+def get_qa_dataset(randomize: bool = False, test_size: float = 0.1, seed: int = 42, questions_path=None) -> tuple:
     """
     Return a HuggingFace Dataset containing question and answer pairs.
 
-    This dataset is constructed from the loaded questions data (questions.json).
+    This dataset is constructed from the loaded questions data.
     Each element in the dataset is a dictionary that includes at least:
       - "question": The question text.
       - "answer": The corresponding answer text.
@@ -159,15 +169,21 @@ def get_qa_dataset(randomize: bool = False, test_size: float = 0.1, seed: int =
         randomize: Whether to shuffle the dataset
         test_size: Proportion of the dataset to include in the test split (0 for train-only)
         seed: Random seed for reproducibility
+        questions_path: Path to questions.jsonl file (if None, uses globally loaded questions)
 
     Returns:
         A tuple of (train_dataset, test_dataset) HuggingFace Dataset objects.
         If test_size=0, test_dataset will be empty. If test_size=1, train_dataset will be empty.
     """
-    if questions is None:
-        raise ValueError("Questions not loaded. Please ensure questions.json exists.")
+    qa_data = questions
+
+    if questions_path is not None:
+        qa_data = load_qa_data(questions_path)
+
+    if qa_data is None:
+        raise ValueError("Questions not loaded. Please ensure questions.jsonl exists.")
 
-    qa_dataset = Dataset.from_list(questions)
+    qa_dataset = Dataset.from_list(qa_data)
 
     if randomize:
         qa_dataset = qa_dataset.shuffle(seed=seed)
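
A minimal usage sketch of the dataset-loading path introduced by this patch, assuming the HuggingFace `datasets` library and a local JSONL file of QA pairs; the standalone helpers and the literal file path below are illustrative stand-ins for the repository's `load_qa_data` / `get_qa_dataset` and `DATA_DIR`, not the patched code itself:

    import json
    from pathlib import Path

    from datasets import Dataset


    def load_qa_data(questions_path: Path) -> list:
        """Read question-answer pairs from a JSONL file (one JSON object per line)."""
        with open(questions_path, encoding="utf-8") as f:
            return [json.loads(line) for line in f if line.strip()]


    def get_qa_dataset(questions_path: Path, randomize: bool = False, test_size: float = 0.1, seed: int = 42) -> tuple:
        """Build (train, test) Datasets; test_size=1 puts every example in the test split."""
        qa_dataset = Dataset.from_list(load_qa_data(questions_path))
        if randomize:
            qa_dataset = qa_dataset.shuffle(seed=seed)
        if test_size >= 1:
            return qa_dataset.select([]), qa_dataset  # empty train split, full test split
        if test_size <= 0:
            return qa_dataset, qa_dataset.select([])  # full train split, empty test split
        split = qa_dataset.train_test_split(test_size=test_size, seed=seed, shuffle=False)
        return split["train"], split["test"]


    # Mirrors the call added to run_eval(): fixed order, everything in the dev/test split.
    train_ds, test_ds = get_qa_dataset(
        Path("data/processed/questions_dev.jsonl"),  # illustrative path
        randomize=False,
        test_size=1,
    )
    print(len(train_ds), len(test_ds))

Handling test_size values of 0 and 1 before splitting matches the docstring semantics in the patch (test_size=1 leaves the train split empty) rather than passing them to train_test_split, which would interpret an integer test_size as an absolute sample count.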