feat: update evaluation scripts to enhance model configuration and dataset loading, including increased max tokens and added logging

Branch: main
Author: thinhlpg, 4 weeks ago
Parent: bf9f2c4102
Commit: 424459d840

@@ -13,6 +13,7 @@ sys.path.append(project_root)
 from unsloth import FastLanguageModel
 from vllm import SamplingParams
 
+from config import logger
 from src import (
     apply_chat_template,
     build_reward_correctness_fn,
@@ -20,7 +21,6 @@ from src import (
     get_system_prompt,
     run_eval,
 )
-from config import logger
 
 
 def main():
@@ -35,23 +35,24 @@ def main():
     # Setup model
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name=args.model_name,
-        max_seq_length=4096 * 2,
+        max_seq_length=4096 * 6,
         load_in_4bit=True,
         fast_inference=True,
+        gpu_memory_utilization=0.8,
     )
 
     # Setup sampling params
     sampling_params = SamplingParams(
         temperature=args.temperature,
         top_p=0.95,
-        max_tokens=4096,
+        max_tokens=4096 * 6,
     )
 
     # Setup verifier with lower temperature
     verifier_params = SamplingParams(
         temperature=0.1,  # Lower temperature for more consistent verification
         top_p=0.95,
-        max_tokens=4096,
+        max_tokens=4096 * 6,
     )
 
     def generate_fn(inputs):
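Note: a minimal sketch of the generate_fn wrapper referenced above, assuming inputs is a list of chat-message lists and that Unsloth exposes vLLM-backed generation as model.fast_generate when fast_inference=True; this is not the repository's exact implementation.

# Hedged sketch, not the actual generate_fn from this commit.
def generate_fn(inputs):
    # Render each conversation with the tokenizer's chat template.
    prompts = [
        tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        for messages in inputs
    ]
    # Assumption: fast_generate returns vLLM RequestOutput objects.
    outputs = model.fast_generate(prompts, sampling_params=sampling_params)
    return [out.outputs[0].text for out in outputs]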
@@ -111,6 +112,7 @@ def main():
         tokenizer=tokenizer,
         output_file=output_file,
         debug_file=debug_file,
+        max_generations=32,
     )
 
     logger.info("✨ Evaluation completed!")

@@ -14,6 +14,7 @@ sys.path.append(project_root)
 from unsloth import FastLanguageModel
 from vllm import SamplingParams
 
+from config import logger
 from src import (
     apply_chat_template,
     build_reward_correctness_fn,
@@ -21,7 +22,6 @@ from src import (
     get_system_prompt,
     run_eval,
 )
-from config import logger
 
 
 def main():
@@ -45,6 +45,7 @@ def main():
         load_in_4bit=True,
         fast_inference=True,
         max_lora_rank=lora_config["r"],  # Use rank from config
+        gpu_memory_utilization=0.8,
     )
 
     # Setup LoRA using config
@@ -63,7 +64,7 @@ def main():
     sampling_params = SamplingParams(
         temperature=args.temperature,
         top_p=0.95,
-        max_tokens=4096,
+        max_tokens=4096 * 2,
     )
 
     # Setup verifier with lower temperature
@@ -134,6 +135,7 @@ def main():
         tokenizer=tokenizer,
         output_file=output_file,
         debug_file=debug_file,
+        max_generations=32,
     )
 
     logger.info("✨ Evaluation completed!")
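Note: the "# Setup LoRA using config" step itself is not shown in these hunks. A hedged sketch under the assumption that it calls Unsloth's FastLanguageModel.get_peft_model with values from lora_config; the lora_alpha fallback and target_modules below are illustrative, not taken from this commit.

# Hedged sketch of the LoRA setup; only r=lora_config["r"] is confirmed by the diff.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_config["r"],
    lora_alpha=lora_config.get("lora_alpha", 2 * lora_config["r"]),
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)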

@@ -5,7 +5,7 @@ Evaluation utilities for RL training.
 import inspect
 from datetime import datetime
 
-from config import logger
+from config import DATA_DIR, logger
 from src.agent import Agent
 from src.search_module import get_qa_dataset
 from src.tokenizer_adapter import LlamaTokenizerAdapter, R1DistilTokenizerAdapter
@@ -162,7 +162,11 @@ def run_eval(generate_fn, verify_fn, tokenizer, max_generations=20, output_file=
     Returns:
         full_chat_states: The chat states from evaluation
     """
-    train_dataset, test_dataset = get_qa_dataset()
+    train_dataset, test_dataset = get_qa_dataset(
+        randomize=False,
+        test_size=1,
+        questions_path=DATA_DIR / "processed" / "questions_dev.jsonl",
+    )
     questions = test_dataset["prompt"]
 
     # Create agent with appropriate adapter based on tokenizer
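The trailing comment hints at tokenizer-dependent adapter selection. A hedged sketch of what that choice might look like; the name-based check and the Agent(...) call are assumptions for illustration, not code from this diff.

# Hypothetical adapter selection; the real run_eval may use different logic.
name = getattr(tokenizer, "name_or_path", "").lower()
adapter = R1DistilTokenizerAdapter() if "r1-distill" in name else LlamaTokenizerAdapter()
agent = Agent(adapter)  # constructor signature assumed for illustration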

@@ -74,10 +74,20 @@ def search(query: str, return_type=str, results: int = 5):
 # Load questions from saved data
-def load_qa_data():
-    """Load the pre-generated questions"""
+def load_qa_data(questions_path=None):
+    """
+    Load the pre-generated questions
+
+    Args:
+        questions_path: Path to questions file (default: PROCESSED_DATA_DIR / "questions.jsonl")
+
+    Returns:
+        List of question-answer pairs
+    """
     try:
+        if questions_path is None:
+            questions_path = PROCESSED_DATA_DIR / "questions.jsonl"
+        logger.info(f"Loading questions from: {questions_path}")
+
         # Load the questions
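The body that follows the "# Load the questions" comment is outside this hunk. A minimal, hypothetical sketch of the JSONL read it presumably performs; the helper name and parsing details are assumptions.

import json

def _read_questions_jsonl(path):
    # Each line is expected to be one JSON object with question/answer fields.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]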
@@ -144,11 +154,11 @@ def get_question_count() -> int:
     return len(questions)
 
 
-def get_qa_dataset(randomize: bool = False, test_size: float = 0.1, seed: int = 42) -> tuple:
+def get_qa_dataset(randomize: bool = False, test_size: float = 0.1, seed: int = 42, questions_path=None) -> tuple:
     """
     Return a HuggingFace Dataset containing question and answer pairs.
 
-    This dataset is constructed from the loaded questions data (questions.json).
+    This dataset is constructed from the loaded questions data.
     Each element in the dataset is a dictionary that includes at least:
       - "question": The question text.
       - "answer": The corresponding answer text.
@@ -159,15 +169,21 @@ def get_qa_dataset(randomize: bool = False, test_size: float = 0.1, seed: int =
         randomize: Whether to shuffle the dataset
         test_size: Proportion of the dataset to include in the test split (0 for train-only)
         seed: Random seed for reproducibility
+        questions_path: Path to questions.jsonl file (if None, uses globally loaded questions)
 
     Returns:
         A tuple of (train_dataset, test_dataset) HuggingFace Dataset objects.
+        If test_size=0, test_dataset will be empty. If test_size=1, train_dataset will be empty.
     """
-    if questions is None:
-        raise ValueError("Questions not loaded. Please ensure questions.json exists.")
+    qa_data = questions
+    if questions_path is not None:
+        qa_data = load_qa_data(questions_path)
+
+    if qa_data is None:
+        raise ValueError("Questions not loaded. Please ensure questions.jsonl exists.")
 
-    qa_dataset = Dataset.from_list(questions)
+    qa_dataset = Dataset.from_list(qa_data)
 
     if randomize:
         qa_dataset = qa_dataset.shuffle(seed=seed)
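Taken together with the run_eval change above, a short usage sketch of the new signature; the dev-questions path comes from that hunk, everything else is plain use of the imports shown in this diff.

from config import DATA_DIR
from src.search_module import get_qa_dataset

# test_size=1 routes every question into the evaluation split,
# so train_ds is empty and eval_ds holds all of questions_dev.jsonl.
train_ds, eval_ds = get_qa_dataset(
    randomize=False,
    test_size=1,
    questions_path=DATA_DIR / "processed" / "questions_dev.jsonl",
)
print(f"evaluation questions: {len(eval_ds)}")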
