feat: update evaluation scripts to enhance model configuration and dataset loading, including increased max tokens and added logging

Branch: main
Author: thinhlpg, 4 weeks ago
Parent: bf9f2c4102
Commit: 424459d840

@@ -13,6 +13,7 @@ sys.path.append(project_root)
 from unsloth import FastLanguageModel
 from vllm import SamplingParams
 
+from config import logger
 from src import (
     apply_chat_template,
     build_reward_correctness_fn,
@@ -20,7 +21,6 @@ from src import (
     get_system_prompt,
     run_eval,
 )
-from config import logger
 
 
 def main():
@@ -35,23 +35,24 @@ def main():
     # Setup model
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name=args.model_name,
-        max_seq_length=4096 * 2,
+        max_seq_length=4096 * 6,
         load_in_4bit=True,
         fast_inference=True,
+        gpu_memory_utilization=0.8,
     )
 
     # Setup sampling params
     sampling_params = SamplingParams(
         temperature=args.temperature,
         top_p=0.95,
-        max_tokens=4096,
+        max_tokens=4096 * 6,
     )
 
     # Setup verifier with lower temperature
     verifier_params = SamplingParams(
         temperature=0.1,  # Lower temperature for more consistent verification
         top_p=0.95,
-        max_tokens=4096,
+        max_tokens=4096 * 6,
     )
 
     def generate_fn(inputs):
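Note: a minimal sketch of the generate_fn wrapper referenced above, assuming inputs is a list of chat-message lists and that Unsloth exposes vLLM-backed generation as model.fast_generate when fast_inference=True; this is not the repository's exact implementation.

# Hedged sketch, not the actual generate_fn from this commit.
def generate_fn(inputs):
    # Render each conversation with the tokenizer's chat template.
    prompts = [
        tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        for messages in inputs
    ]
    # Assumption: fast_generate returns vLLM RequestOutput objects.
    outputs = model.fast_generate(prompts, sampling_params=sampling_params)
    return [out.outputs[0].text for out in outputs]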
@@ -111,6 +112,7 @@ def main():
         tokenizer=tokenizer,
         output_file=output_file,
         debug_file=debug_file,
+        max_generations=32,
     )
 
     logger.info("✨ Evaluation completed!")

@@ -14,6 +14,7 @@ sys.path.append(project_root)
 from unsloth import FastLanguageModel
 from vllm import SamplingParams
 
+from config import logger
 from src import (
     apply_chat_template,
     build_reward_correctness_fn,
@@ -21,7 +22,6 @@ from src import (
     get_system_prompt,
     run_eval,
 )
-from config import logger
 
 
 def main():
@@ -45,6 +45,7 @@ def main():
         load_in_4bit=True,
         fast_inference=True,
         max_lora_rank=lora_config["r"],  # Use rank from config
+        gpu_memory_utilization=0.8,
     )
 
     # Setup LoRA using config
@@ -63,7 +64,7 @@ def main():
     sampling_params = SamplingParams(
         temperature=args.temperature,
         top_p=0.95,
-        max_tokens=4096,
+        max_tokens=4096 * 2,
     )
 
     # Setup verifier with lower temperature
@@ -134,6 +135,7 @@ def main():
         tokenizer=tokenizer,
         output_file=output_file,
         debug_file=debug_file,
+        max_generations=32,
     )
 
     logger.info("✨ Evaluation completed!")
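Note: the "# Setup LoRA using config" step itself is not shown in these hunks. A hedged sketch under the assumption that it calls Unsloth's FastLanguageModel.get_peft_model with values from lora_config; the lora_alpha fallback and target_modules below are illustrative, not taken from this commit.

# Hedged sketch of the LoRA setup; only r=lora_config["r"] is confirmed by the diff.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_config["r"],
    lora_alpha=lora_config.get("lora_alpha", 2 * lora_config["r"]),
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)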

@@ -5,7 +5,7 @@ Evaluation utilities for RL training.
 import inspect
 from datetime import datetime
 
-from config import logger
+from config import DATA_DIR, logger
 from src.agent import Agent
 from src.search_module import get_qa_dataset
 from src.tokenizer_adapter import LlamaTokenizerAdapter, R1DistilTokenizerAdapter
@@ -162,7 +162,11 @@ def run_eval(generate_fn, verify_fn, tokenizer, max_generations=20, output_file=
     Returns:
         full_chat_states: The chat states from evaluation
     """
-    train_dataset, test_dataset = get_qa_dataset()
+    train_dataset, test_dataset = get_qa_dataset(
+        randomize=False,
+        test_size=1,
+        questions_path=DATA_DIR / "processed" / "questions_dev.jsonl",
+    )
     questions = test_dataset["prompt"]
 
     # Create agent with appropriate adapter based on tokenizer
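The trailing comment hints at tokenizer-dependent adapter selection. A hedged sketch of what that choice might look like; the name-based check and the Agent(...) call are assumptions for illustration, not code from this diff.

# Hypothetical adapter selection; the real run_eval may use different logic.
name = getattr(tokenizer, "name_or_path", "").lower()
adapter = R1DistilTokenizerAdapter() if "r1-distill" in name else LlamaTokenizerAdapter()
agent = Agent(adapter)  # constructor signature assumed for illustration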

@@ -74,10 +74,20 @@ def search(query: str, return_type=str, results: int = 5):
 # Load questions from saved data
-def load_qa_data():
-    """Load the pre-generated questions"""
+def load_qa_data(questions_path=None):
+    """
+    Load the pre-generated questions
+
+    Args:
+        questions_path: Path to questions file (default: PROCESSED_DATA_DIR / "questions.jsonl")
+
+    Returns:
+        List of question-answer pairs
+    """
     try:
+        if questions_path is None:
+            questions_path = PROCESSED_DATA_DIR / "questions.jsonl"
+        logger.info(f"Loading questions from: {questions_path}")
+
         # Load the questions
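The body that follows the "# Load the questions" comment is outside this hunk. A minimal, hypothetical sketch of the JSONL read it presumably performs; the helper name and parsing details are assumptions.

import json

def _read_questions_jsonl(path):
    # Each line is expected to be one JSON object with question/answer fields.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]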
@@ -144,11 +154,11 @@ def get_question_count() -> int:
     return len(questions)
 
 
-def get_qa_dataset(randomize: bool = False, test_size: float = 0.1, seed: int = 42) -> tuple:
+def get_qa_dataset(randomize: bool = False, test_size: float = 0.1, seed: int = 42, questions_path=None) -> tuple:
     """
     Return a HuggingFace Dataset containing question and answer pairs.
 
-    This dataset is constructed from the loaded questions data (questions.json).
+    This dataset is constructed from the loaded questions data.
     Each element in the dataset is a dictionary that includes at least:
       - "question": The question text.
       - "answer": The corresponding answer text.
@@ -159,15 +169,21 @@ def get_qa_dataset(randomize: bool = False, test_size: float = 0.1, seed: int =
         randomize: Whether to shuffle the dataset
         test_size: Proportion of the dataset to include in the test split (0 for train-only)
         seed: Random seed for reproducibility
+        questions_path: Path to questions.jsonl file (if None, uses globally loaded questions)
 
     Returns:
         A tuple of (train_dataset, test_dataset) HuggingFace Dataset objects.
+        If test_size=0, test_dataset will be empty. If test_size=1, train_dataset will be empty.
     """
-    if questions is None:
-        raise ValueError("Questions not loaded. Please ensure questions.json exists.")
+    qa_data = questions
+    if questions_path is not None:
+        qa_data = load_qa_data(questions_path)
+
+    if qa_data is None:
+        raise ValueError("Questions not loaded. Please ensure questions.jsonl exists.")
 
-    qa_dataset = Dataset.from_list(questions)
+    qa_dataset = Dataset.from_list(qa_data)
 
     if randomize:
         qa_dataset = qa_dataset.shuffle(seed=seed)
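Taken together with the run_eval change above, a short usage sketch of the new signature; the dev-questions path comes from that hunk, everything else is plain use of the imports shown in this diff.

from config import DATA_DIR
from src.search_module import get_qa_dataset

# test_size=1 routes every question into the evaluation split,
# so train_ds is empty and eval_ds holds all of questions_dev.jsonl.
train_ds, eval_ds = get_qa_dataset(
    randomize=False,
    test_size=1,
    questions_path=DATA_DIR / "processed" / "questions_dev.jsonl",
)
print(f"evaluation questions: {len(eval_ds)}")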
