# Train AutoDidact
- Taken from [AutoDidact](https://github.com/menloresearch/DeepSearch/blob/main/notebooks/train_autodidact.ipynb)

In [1]:
from unsloth import FastLanguageModel

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


In [2]:
from unsloth import is_bfloat16_supported
import torch

max_seq_length = 4096 * 2  # Can increase for longer reasoning traces
lora_rank = 64  # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/meta-Llama-3.1-8B-Instruct",
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # False for LoRA 16bit
    fast_inference=True,  # Enable vLLM fast inference
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.6,  # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # Remove QKVO if out of memory
    lora_alpha=lora_rank,
    use_gradient_checkpointing="unsloth",  # Enable long context finetuning
    random_state=3407,
)

INFO 03-18 00:53:10 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.6: Fast Llama patching. Transformers: 4.49.0. vLLM: 0.7.3.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.542 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/meta-llama-3.1-8b-instruct-unsloth-bnb-4bit with actual GPU utilization = 58.92%
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 23.54 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 8192. Num Sequences = 192.
Unsloth: vLLM's KV Cache can use up to 7.54 GB. Also swap space = 6 GB.
INFO 03-18 00:53:19 config.py:549] This model supports multiple tasks: {'generate', 'reward', 'c



INFO 03-18 00:53:21 weight_utils.py:254] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-18 00:53:23 model_runner.py:1115] Loading model weights took 5.5976 GB
INFO 03-18 00:53:23 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-18 00:53:25 worker.py:267] Memory profiling takes 1.67 seconds
INFO 03-18 00:53:25 worker.py:267] the current vLLM instance can use total_gpu_memory (23.54GiB) x gpu_memory_utilization (0.59) = 13.87GiB
INFO 03-18 00:53:25 worker.py:267] model weights take 5.60GiB; non_torch_memory takes 0.08GiB; PyTorch activation peak memory takes 0.96GiB; the rest of the memory reserved for KV Cache is 7.24GiB.
INFO 03-18 00:53:25 executor_base.py:111] # cuda blocks: 3704, # CPU blocks: 3072
INFO 03-18 00:53:25 executor_base.py:116] Maximum concurrency for 8192 tokens per request: 7.23x
INFO 03-18 00:53:29 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occ

Capturing CUDA graph shapes: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 27/27 [00:18<00:00,  1.47it/s]

INFO 03-18 00:53:47 model_runner.py:1562] Graph capturing finished in 18 secs, took 0.66 GiB
INFO 03-18 00:53:47 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 24.22 seconds



Unsloth 2025.3.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
import re
from datasets import load_dataset, Dataset
from search_module import search, get_question_answer, get_question_count
from rl_helpers import get_qa_dataset

train_dataset, test_dataset = get_qa_dataset()

Loading FAISS index from: /root/AutoDidact/faiss_index
Successfully loaded FAISS index
Loading chunks from: /root/AutoDidact/saved_data/chunks.pkl
Loading questions from: /root/AutoDidact/saved_data/questions.json
Successfully loaded 341 chunks and 676 questions


<a name="Train"></a>
### Train the model

Now set up GRPO Trainer and all configurations!

In [4]:
import os

os.environ["WANDB_PROJECT"] = "bootstrap-search-rl"

In [5]:
# from UnslothGRPOTrainerTemp import UnslothGRPOConfig, _UnslothGRPOTrainer
import UnslothGRPOTrainerTemp

training_args = UnslothGRPOTrainerTemp.UnslothGRPOConfig(
    use_vllm=True,  # use vLLM for fast inference!
    use_agentic_generate=True,  # use agentic generation
    learning_rate=5e-6,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    logging_steps=1,
    bf16=is_bfloat16_supported(),
    fp16=not is_bfloat16_supported(),
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # Increase to 4 for smoother training
    num_generations=8,  # Decrease if out of memory
    max_prompt_length=1024,
    max_completion_length=1024,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps=101,
    save_steps=50,
    max_grad_norm=0.1,
    report_to="none",  # Can use Weights & Biases
    output_dir="full_local_training",
)

In [6]:
import rl_helpers
# importlib.reload(rl_helpers)


def agentic_generate(
    prompts: list[str],
    generate_fn,
    max_generations: int = 6,
):
    return run_agent(generate_fn, tokenizer, prompts, max_generations)


model.agentic_generate = agentic_generate


from vllm import SamplingParams

verifier_sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.95,
    max_tokens=4096,
)


def verifier_generate_fn(inputs):
    return model.fast_generate(
        inputs,
        sampling_params=verifier_sampling_params,
    )


run_agent = rl_helpers.run_agent
reward_correctness = rl_helpers.build_reward_correctness_fn(
    verifier_generate_fn,
    tokenizer,
)
reward_formatting = rl_helpers.reward_formatting

import UnslothGRPOTrainerTemp

trainer = UnslothGRPOTrainerTemp.UnslothGRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[
        reward_correctness,
        reward_formatting,
    ],
    args=training_args,
    train_dataset=train_dataset,
)

In [7]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 608 | Num Epochs = 1 | Total steps = 101
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 167,772,160/4,796,452,864 (3.50% trained)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjcaples4[0m ([33mllm-research-activated[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin



  completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 37.67it/s, est. speed input: 5686.72 toks/s, output: 75.38 toks/s]


rewards_per_func: tensor([0.3750, 0.3500], device='cuda:0')


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / reward_correctness,rewards / reward_formatting
1,-0.0,0.725,0.841342,158.375,0.0,0.375,0.35
2,0.0,0.2125,0.601041,196.875,0.0,0.125,0.0875
3,0.0,0.9,0.723089,212.125,0.000496,0.375,0.525
4,0.0,0.4375,0.362284,195.5,0.00065,0.0,0.4375
5,0.0,0.35,0.374166,252.25,0.000427,0.0,0.35
6,0.0,0.4375,0.362284,290.625,0.000602,0.0,0.4375
7,0.0,0.4375,0.362284,238.75,0.000577,0.0,0.4375
8,0.0,0.35,0.374166,159.5,0.000379,0.0,0.35
9,0.0,0.5625,0.575543,227.875,0.000637,0.125,0.4375
10,0.0,0.775,0.647523,349.75,0.00089,0.25,0.525


['What was the maximum differential pressure that caused the leak rate in the ascent stage oxygen tank 2?', 'What was the maximum differential pressure that caused the leak rate in the ascent stage oxygen tank 2?', 'What was the maximum differential pressure that caused the leak rate in the ascent stage oxygen tank 2?', 'What was the maximum differential pressure that caused the leak rate in the ascent stage oxygen tank 2?', 'What was the maximum differential pressure that caused the leak rate in the ascent stage oxygen tank 2?', 'What was the maximum differential pressure that caused the leak rate in the ascent stage oxygen tank 2?', 'What was the maximum differential pressure that caused the leak rate in the ascent stage oxygen tank 2?', 'What was the maximum differential pressure that caused the leak rate in the ascent stage oxygen tank 2?']


  completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 59.63it/s, est. speed input: 7487.99 toks/s, output: 119.45 toks/s]


rewards_per_func: tensor([0.1250, 0.0875], device='cuda:0')
Unsloth: Will smartly offload gradients to save VRAM!
['What was the purpose of attaching Velcro patches to the bread, mustard, and catsup packages?', 'What was the purpose of attaching Velcro patches to the bread, mustard, and catsup packages?', 'What was the purpose of attaching Velcro patches to the bread, mustard, and catsup packages?', 'What was the purpose of attaching Velcro patches to the bread, mustard, and catsup packages?', 'What was the purpose of attaching Velcro patches to the bread, mustard, and catsup packages?', 'What was the purpose of attaching Velcro patches to the bread, mustard, and catsup packages?', 'What was the purpose of attaching Velcro patches to the bread, mustard, and catsup packages?', 'What was the purpose of attaching Velcro patches to the bread, mustard, and catsup packages?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 41.85it/s, est. speed input: 8187.79 toks/s, output: 83.76 toks/s]


rewards_per_func: tensor([0.3750, 0.5250], device='cuda:0')
['What was the title of the first supplement for Apollo 7?', 'What was the title of the first supplement for Apollo 7?', 'What was the title of the first supplement for Apollo 7?', 'What was the title of the first supplement for Apollo 7?', 'What was the title of the first supplement for Apollo 7?', 'What was the title of the first supplement for Apollo 7?', 'What was the title of the first supplement for Apollo 7?', 'What was the title of the first supplement for Apollo 7?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 55.45it/s, est. speed input: 7618.59 toks/s, output: 111.01 toks/s]


rewards_per_func: tensor([0.0000, 0.4375], device='cuda:0')
['What is the probable cause of the problem with the pressure transducers in the Apollo 14 mission?', 'What is the probable cause of the problem with the pressure transducers in the Apollo 14 mission?', 'What is the probable cause of the problem with the pressure transducers in the Apollo 14 mission?', 'What is the probable cause of the problem with the pressure transducers in the Apollo 14 mission?', 'What is the probable cause of the problem with the pressure transducers in the Apollo 14 mission?', 'What is the probable cause of the problem with the pressure transducers in the Apollo 14 mission?', 'What is the probable cause of the problem with the pressure transducers in the Apollo 14 mission?', 'What is the probable cause of the problem with the pressure transducers in the Apollo 14 mission?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 49.94it/s, est. speed input: 7623.62 toks/s, output: 99.98 toks/s]


rewards_per_func: tensor([0.0000, 0.3500], device='cuda:0')
['What was the latitude of the landing site?', 'What was the latitude of the landing site?', 'What was the latitude of the landing site?', 'What was the latitude of the landing site?', 'What was the latitude of the landing site?', 'What was the latitude of the landing site?', 'What was the latitude of the landing site?', 'What was the latitude of the landing site?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 46.12it/s, est. speed input: 7885.86 toks/s, output: 92.36 toks/s]


rewards_per_func: tensor([0.0000, 0.4375], device='cuda:0')
['What was the reason for the substitution of the backup Command Module Pilot?', 'What was the reason for the substitution of the backup Command Module Pilot?', 'What was the reason for the substitution of the backup Command Module Pilot?', 'What was the reason for the substitution of the backup Command Module Pilot?', 'What was the reason for the substitution of the backup Command Module Pilot?', 'What was the reason for the substitution of the backup Command Module Pilot?', 'What was the reason for the substitution of the backup Command Module Pilot?', 'What was the reason for the substitution of the backup Command Module Pilot?']


  completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 49.96it/s, est. speed input: 7929.34 toks/s, output: 100.05 toks/s]


rewards_per_func: tensor([0.0000, 0.4375], device='cuda:0')
['What was the objective of the television in earth orbit for the Apollo 13 mission?', 'What was the objective of the television in earth orbit for the Apollo 13 mission?', 'What was the objective of the television in earth orbit for the Apollo 13 mission?', 'What was the objective of the television in earth orbit for the Apollo 13 mission?', 'What was the objective of the television in earth orbit for the Apollo 13 mission?', 'What was the objective of the television in earth orbit for the Apollo 13 mission?', 'What was the objective of the television in earth orbit for the Apollo 13 mission?', 'What was the objective of the television in earth orbit for the Apollo 13 mission?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 21.57it/s, est. speed input: 3128.09 toks/s, output: 108.14 toks/s]


rewards_per_func: tensor([0.0000, 0.3500], device='cuda:0')
['What was the targeted impact point of the discarded S-IVB stage on the lunar surface?', 'What was the targeted impact point of the discarded S-IVB stage on the lunar surface?', 'What was the targeted impact point of the discarded S-IVB stage on the lunar surface?', 'What was the targeted impact point of the discarded S-IVB stage on the lunar surface?', 'What was the targeted impact point of the discarded S-IVB stage on the lunar surface?', 'What was the targeted impact point of the discarded S-IVB stage on the lunar surface?', 'What was the targeted impact point of the discarded S-IVB stage on the lunar surface?', 'What was the targeted impact point of the discarded S-IVB stage on the lunar surface?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 45.96it/s, est. speed input: 7894.52 toks/s, output: 91.99 toks/s]


rewards_per_func: tensor([0.1250, 0.4375], device='cuda:0')
['Where did the spacecraft land?', 'Where did the spacecraft land?', 'Where did the spacecraft land?', 'Where did the spacecraft land?', 'Where did the spacecraft land?', 'Where did the spacecraft land?', 'Where did the spacecraft land?', 'Where did the spacecraft land?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 36.02it/s, est. speed input: 7887.76 toks/s, output: 72.07 toks/s]


rewards_per_func: tensor([0.2500, 0.5250], device='cuda:0')
['What was the reason for the yaw rate reversal in the lunar module during passive thermal control?', 'What was the reason for the yaw rate reversal in the lunar module during passive thermal control?', 'What was the reason for the yaw rate reversal in the lunar module during passive thermal control?', 'What was the reason for the yaw rate reversal in the lunar module during passive thermal control?', 'What was the reason for the yaw rate reversal in the lunar module during passive thermal control?', 'What was the reason for the yaw rate reversal in the lunar module during passive thermal control?', 'What was the reason for the yaw rate reversal in the lunar module during passive thermal control?', 'What was the reason for the yaw rate reversal in the lunar module during passive thermal control?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 42.19it/s, est. speed input: 8127.11 toks/s, output: 84.44 toks/s]


rewards_per_func: tensor([0.2500, 0.5250], device='cuda:0')
['What change will be made to the cryogenic oxygen tank design to prevent structural failure?', 'What change will be made to the cryogenic oxygen tank design to prevent structural failure?', 'What change will be made to the cryogenic oxygen tank design to prevent structural failure?', 'What change will be made to the cryogenic oxygen tank design to prevent structural failure?', 'What change will be made to the cryogenic oxygen tank design to prevent structural failure?', 'What change will be made to the cryogenic oxygen tank design to prevent structural failure?', 'What change will be made to the cryogenic oxygen tank design to prevent structural failure?', 'What change will be made to the cryogenic oxygen tank design to prevent structural failure?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 42.93it/s, est. speed input: 8187.86 toks/s, output: 85.96 toks/s]


rewards_per_func: tensor([0.2500, 0.3500], device='cuda:0')
['What is the chapter about the performance of the lunar module?', 'What is the chapter about the performance of the lunar module?', 'What is the chapter about the performance of the lunar module?', 'What is the chapter about the performance of the lunar module?', 'What is the chapter about the performance of the lunar module?', 'What is the chapter about the performance of the lunar module?', 'What is the chapter about the performance of the lunar module?', 'What is the chapter about the performance of the lunar module?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 47.12it/s, est. speed input: 7881.40 toks/s, output: 94.31 toks/s]


rewards_per_func: tensor([0.2500, 0.6125], device='cuda:0')
['What was the effect of the error-counter-enable status bit being set during the platform coarse alignment?', 'What was the effect of the error-counter-enable status bit being set during the platform coarse alignment?', 'What was the effect of the error-counter-enable status bit being set during the platform coarse alignment?', 'What was the effect of the error-counter-enable status bit being set during the platform coarse alignment?', 'What was the effect of the error-counter-enable status bit being set during the platform coarse alignment?', 'What was the effect of the error-counter-enable status bit being set during the platform coarse alignment?', 'What was the effect of the error-counter-enable status bit being set during the platform coarse alignment?', 'What was the effect of the error-counter-enable status bit being set during the platform coarse alignment?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 33.01it/s, est. speed input: 6896.09 toks/s, output: 74.37 toks/s]


rewards_per_func: tensor([1.0000, 0.7000], device='cuda:0')
['What is the result of the piston O-ring being 0.075 inch from entering the chamfer in the breech assembly?', 'What is the result of the piston O-ring being 0.075 inch from entering the chamfer in the breech assembly?', 'What is the result of the piston O-ring being 0.075 inch from entering the chamfer in the breech assembly?', 'What is the result of the piston O-ring being 0.075 inch from entering the chamfer in the breech assembly?', 'What is the result of the piston O-ring being 0.075 inch from entering the chamfer in the breech assembly?', 'What is the result of the piston O-ring being 0.075 inch from entering the chamfer in the breech assembly?', 'What is the result of the piston O-ring being 0.075 inch from entering the chamfer in the breech assembly?', 'What is the result of the piston O-ring being 0.075 inch from entering the chamfer in the breech assembly?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00,  8.54it/s, est. speed input: 2151.95 toks/s, output: 93.05 toks/s]


rewards_per_func: tensor([0.1250, 0.5250], device='cuda:0')
['What was the cause of the leakage in the valve in the Ascent stage tank shutoff valve?', 'What was the cause of the leakage in the valve in the Ascent stage tank shutoff valve?', 'What was the cause of the leakage in the valve in the Ascent stage tank shutoff valve?', 'What was the cause of the leakage in the valve in the Ascent stage tank shutoff valve?', 'What was the cause of the leakage in the valve in the Ascent stage tank shutoff valve?', 'What was the cause of the leakage in the valve in the Ascent stage tank shutoff valve?', 'What was the cause of the leakage in the valve in the Ascent stage tank shutoff valve?', 'What was the cause of the leakage in the valve in the Ascent stage tank shutoff valve?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 34.80it/s, est. speed input: 7971.59 toks/s, output: 74.01 toks/s]


rewards_per_func: tensor([0.1250, 0.7000], device='cuda:0')
['What was the predicted rupture range for the helium tank of the spacecraft?', 'What was the predicted rupture range for the helium tank of the spacecraft?', 'What was the predicted rupture range for the helium tank of the spacecraft?', 'What was the predicted rupture range for the helium tank of the spacecraft?', 'What was the predicted rupture range for the helium tank of the spacecraft?', 'What was the predicted rupture range for the helium tank of the spacecraft?', 'What was the predicted rupture range for the helium tank of the spacecraft?', 'What was the predicted rupture range for the helium tank of the spacecraft?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 48.62it/s, est. speed input: 7218.39 toks/s, output: 97.46 toks/s]


rewards_per_func: tensor([0.0000, 0.6125], device='cuda:0')
['What type of corrective action is being taken to prevent electrolyte shorts associated with the previously discussed battery anomaly?', 'What type of corrective action is being taken to prevent electrolyte shorts associated with the previously discussed battery anomaly?', 'What type of corrective action is being taken to prevent electrolyte shorts associated with the previously discussed battery anomaly?', 'What type of corrective action is being taken to prevent electrolyte shorts associated with the previously discussed battery anomaly?', 'What type of corrective action is being taken to prevent electrolyte shorts associated with the previously discussed battery anomaly?', 'What type of corrective action is being taken to prevent electrolyte shorts associated with the previously discussed battery anomaly?', 'What type of corrective action is being taken to prevent electrolyte shorts associated with the previously discussed

Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 31.44it/s, est. speed input: 8313.98 toks/s, output: 66.86 toks/s]


rewards_per_func: tensor([0.6250, 0.7000], device='cuda:0')
["What modification was made to the Aclar supplier's heating and quenching process?", "What modification was made to the Aclar supplier's heating and quenching process?", "What modification was made to the Aclar supplier's heating and quenching process?", "What modification was made to the Aclar supplier's heating and quenching process?", "What modification was made to the Aclar supplier's heating and quenching process?", "What modification was made to the Aclar supplier's heating and quenching process?", "What modification was made to the Aclar supplier's heating and quenching process?", "What modification was made to the Aclar supplier's heating and quenching process?"]


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 43.27it/s, est. speed input: 8021.88 toks/s, output: 86.60 toks/s]


rewards_per_func: tensor([0.6250, 0.7000], device='cuda:0')
["What was the range of the Command Module Pilot's heart rate during the entry phase?", "What was the range of the Command Module Pilot's heart rate during the entry phase?", "What was the range of the Command Module Pilot's heart rate during the entry phase?", "What was the range of the Command Module Pilot's heart rate during the entry phase?", "What was the range of the Command Module Pilot's heart rate during the entry phase?", "What was the range of the Command Module Pilot's heart rate during the entry phase?", "What was the range of the Command Module Pilot's heart rate during the entry phase?", "What was the range of the Command Module Pilot's heart rate during the entry phase?"]


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 50.44it/s, est. speed input: 7323.59 toks/s, output: 101.01 toks/s]


rewards_per_func: tensor([0.6250, 0.7000], device='cuda:0')
['What could be the cause of the current surge experienced by battery 2?', 'What could be the cause of the current surge experienced by battery 2?', 'What could be the cause of the current surge experienced by battery 2?', 'What could be the cause of the current surge experienced by battery 2?', 'What could be the cause of the current surge experienced by battery 2?', 'What could be the cause of the current surge experienced by battery 2?', 'What could be the cause of the current surge experienced by battery 2?', 'What could be the cause of the current surge experienced by battery 2?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:01<00:00,  4.40it/s, est. speed input: 1121.06 toks/s, output: 104.12 toks/s]


rewards_per_func: tensor([0.2500, 0.7000], device='cuda:0')
['How long did it take to fully charge battery A in the command module?', 'How long did it take to fully charge battery A in the command module?', 'How long did it take to fully charge battery A in the command module?', 'How long did it take to fully charge battery A in the command module?', 'How long did it take to fully charge battery A in the command module?', 'How long did it take to fully charge battery A in the command module?', 'How long did it take to fully charge battery A in the command module?', 'How long did it take to fully charge battery A in the command module?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 41.69it/s, est. speed input: 7673.56 toks/s, output: 83.46 toks/s]


rewards_per_func: tensor([0.2500, 0.6125], device='cuda:0')
['What was used exclusively as insulation in the S-Il stage to reduce weight?', 'What was used exclusively as insulation in the S-Il stage to reduce weight?', 'What was used exclusively as insulation in the S-Il stage to reduce weight?', 'What was used exclusively as insulation in the S-Il stage to reduce weight?', 'What was used exclusively as insulation in the S-Il stage to reduce weight?', 'What was used exclusively as insulation in the S-Il stage to reduce weight?', 'What was used exclusively as insulation in the S-Il stage to reduce weight?', 'What was used exclusively as insulation in the S-Il stage to reduce weight?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 53.02it/s, est. speed input: 7148.51 toks/s, output: 106.19 toks/s]


rewards_per_func: tensor([1.0000, 0.7000], device='cuda:0')
['What is the percentage at which the piston O-ring enters the chamfer in the breech assembly?', 'What is the percentage at which the piston O-ring enters the chamfer in the breech assembly?', 'What is the percentage at which the piston O-ring enters the chamfer in the breech assembly?', 'What is the percentage at which the piston O-ring enters the chamfer in the breech assembly?', 'What is the percentage at which the piston O-ring enters the chamfer in the breech assembly?', 'What is the percentage at which the piston O-ring enters the chamfer in the breech assembly?', 'What is the percentage at which the piston O-ring enters the chamfer in the breech assembly?', 'What is the percentage at which the piston O-ring enters the chamfer in the breech assembly?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 46.64it/s, est. speed input: 8017.50 toks/s, output: 93.36 toks/s]


rewards_per_func: tensor([0.1250, 0.7000], device='cuda:0')
['What was the daily caloric intake provided by the flight menus?', 'What was the daily caloric intake provided by the flight menus?', 'What was the daily caloric intake provided by the flight menus?', 'What was the daily caloric intake provided by the flight menus?', 'What was the daily caloric intake provided by the flight menus?', 'What was the daily caloric intake provided by the flight menus?', 'What was the daily caloric intake provided by the flight menus?', 'What was the daily caloric intake provided by the flight menus?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 50.47it/s, est. speed input: 7294.12 toks/s, output: 101.04 toks/s]


rewards_per_func: tensor([0.5000, 0.7000], device='cuda:0')
['What was the nominal rating of the secondary lithium hydroxide cartridge?', 'What was the nominal rating of the secondary lithium hydroxide cartridge?', 'What was the nominal rating of the secondary lithium hydroxide cartridge?', 'What was the nominal rating of the secondary lithium hydroxide cartridge?', 'What was the nominal rating of the secondary lithium hydroxide cartridge?', 'What was the nominal rating of the secondary lithium hydroxide cartridge?', 'What was the nominal rating of the secondary lithium hydroxide cartridge?', 'What was the nominal rating of the secondary lithium hydroxide cartridge?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 46.07it/s, est. speed input: 6869.69 toks/s, output: 92.44 toks/s]


rewards_per_func: tensor([1.0000, 0.7000], device='cuda:0')
['What was the condition found when the handle was extended from 5/16 to 3/8 inch from the valve locked position?', 'What was the condition found when the handle was extended from 5/16 to 3/8 inch from the valve locked position?', 'What was the condition found when the handle was extended from 5/16 to 3/8 inch from the valve locked position?', 'What was the condition found when the handle was extended from 5/16 to 3/8 inch from the valve locked position?', 'What was the condition found when the handle was extended from 5/16 to 3/8 inch from the valve locked position?', 'What was the condition found when the handle was extended from 5/16 to 3/8 inch from the valve locked position?', 'What was the condition found when the handle was extended from 5/16 to 3/8 inch from the valve locked position?', 'What was the condition found when the handle was extended from 5/16 to 3/8 inch from the valve locked position?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 44.15it/s, est. speed input: 8002.87 toks/s, output: 88.37 toks/s]


rewards_per_func: tensor([0.8750, 0.7000], device='cuda:0')
['What is the estimated depth that the S-IVB seismic energy penetrated into the moon?', 'What is the estimated depth that the S-IVB seismic energy penetrated into the moon?', 'What is the estimated depth that the S-IVB seismic energy penetrated into the moon?', 'What is the estimated depth that the S-IVB seismic energy penetrated into the moon?', 'What is the estimated depth that the S-IVB seismic energy penetrated into the moon?', 'What is the estimated depth that the S-IVB seismic energy penetrated into the moon?', 'What is the estimated depth that the S-IVB seismic energy penetrated into the moon?', 'What is the estimated depth that the S-IVB seismic energy penetrated into the moon?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 50.56it/s, est. speed input: 7569.36 toks/s, output: 101.26 toks/s]


rewards_per_func: tensor([0.8750, 0.7000], device='cuda:0')
['What caused the difficulty in establishing acceptable initial conditions for the passive thermal control mode?', 'What caused the difficulty in establishing acceptable initial conditions for the passive thermal control mode?', 'What caused the difficulty in establishing acceptable initial conditions for the passive thermal control mode?', 'What caused the difficulty in establishing acceptable initial conditions for the passive thermal control mode?', 'What caused the difficulty in establishing acceptable initial conditions for the passive thermal control mode?', 'What caused the difficulty in establishing acceptable initial conditions for the passive thermal control mode?', 'What caused the difficulty in establishing acceptable initial conditions for the passive thermal control mode?', 'What caused the difficulty in establishing acceptable initial conditions for the passive thermal control mode?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 36.93it/s, est. speed input: 7864.89 toks/s, output: 78.55 toks/s]


rewards_per_func: tensor([0.1250, 0.7000], device='cuda:0')
['How long did it take for the three crewmen to be onboard the recovery ship, USS Iwo Jima, after landing?', 'How long did it take for the three crewmen to be onboard the recovery ship, USS Iwo Jima, after landing?', 'How long did it take for the three crewmen to be onboard the recovery ship, USS Iwo Jima, after landing?', 'How long did it take for the three crewmen to be onboard the recovery ship, USS Iwo Jima, after landing?', 'How long did it take for the three crewmen to be onboard the recovery ship, USS Iwo Jima, after landing?', 'How long did it take for the three crewmen to be onboard the recovery ship, USS Iwo Jima, after landing?', 'How long did it take for the three crewmen to be onboard the recovery ship, USS Iwo Jima, after landing?', 'How long did it take for the three crewmen to be onboard the recovery ship, USS Iwo Jima, after landing?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 35.98it/s, est. speed input: 7903.27 toks/s, output: 72.01 toks/s]


rewards_per_func: tensor([0.5000, 0.7000], device='cuda:0')
['What was the reason for the lunar module reaction control system being used at about 137 hours 40 minutes?', 'What was the reason for the lunar module reaction control system being used at about 137 hours 40 minutes?', 'What was the reason for the lunar module reaction control system being used at about 137 hours 40 minutes?', 'What was the reason for the lunar module reaction control system being used at about 137 hours 40 minutes?', 'What was the reason for the lunar module reaction control system being used at about 137 hours 40 minutes?', 'What was the reason for the lunar module reaction control system being used at about 137 hours 40 minutes?', 'What was the reason for the lunar module reaction control system being used at about 137 hours 40 minutes?', 'What was the reason for the lunar module reaction control system being used at about 137 hours 40 minutes?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00,  8.69it/s, est. speed input: 1794.03 toks/s, output: 105.47 toks/s]


rewards_per_func: tensor([0.1250, 0.7000], device='cuda:0')
['What was the status of the Advanced Technology Satellite on the day of photography?', 'What was the status of the Advanced Technology Satellite on the day of photography?', 'What was the status of the Advanced Technology Satellite on the day of photography?', 'What was the status of the Advanced Technology Satellite on the day of photography?', 'What was the status of the Advanced Technology Satellite on the day of photography?', 'What was the status of the Advanced Technology Satellite on the day of photography?', 'What was the status of the Advanced Technology Satellite on the day of photography?', 'What was the status of the Advanced Technology Satellite on the day of photography?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 43.18it/s, est. speed input: 8160.97 toks/s, output: 86.41 toks/s]


rewards_per_func: tensor([0.3750, 0.6125], device='cuda:0')
['What was the amplitude of the vibrations at the spacecraft during S-II boost?', 'What was the amplitude of the vibrations at the spacecraft during S-II boost?', 'What was the amplitude of the vibrations at the spacecraft during S-II boost?', 'What was the amplitude of the vibrations at the spacecraft during S-II boost?', 'What was the amplitude of the vibrations at the spacecraft during S-II boost?', 'What was the amplitude of the vibrations at the spacecraft during S-II boost?', 'What was the amplitude of the vibrations at the spacecraft during S-II boost?', 'What was the amplitude of the vibrations at the spacecraft during S-II boost?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 49.51it/s, est. speed input: 7770.64 toks/s, output: 99.14 toks/s]


rewards_per_func: tensor([0.0000, 0.7000], device='cuda:0')
['What type of aircraft was staged from Patrick AFB, Florida?', 'What type of aircraft was staged from Patrick AFB, Florida?', 'What type of aircraft was staged from Patrick AFB, Florida?', 'What type of aircraft was staged from Patrick AFB, Florida?', 'What type of aircraft was staged from Patrick AFB, Florida?', 'What type of aircraft was staged from Patrick AFB, Florida?', 'What type of aircraft was staged from Patrick AFB, Florida?', 'What type of aircraft was staged from Patrick AFB, Florida?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 34.55it/s, est. speed input: 7092.85 toks/s, output: 77.89 toks/s]


rewards_per_func: tensor([0.3750, 0.7000], device='cuda:0')
['What was the maximum rate excursion in roll?', 'What was the maximum rate excursion in roll?', 'What was the maximum rate excursion in roll?', 'What was the maximum rate excursion in roll?', 'What was the maximum rate excursion in roll?', 'What was the maximum rate excursion in roll?', 'What was the maximum rate excursion in roll?', 'What was the maximum rate excursion in roll?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 55.19it/s, est. speed input: 7567.78 toks/s, output: 110.47 toks/s]


rewards_per_func: tensor([0.0000, 0.7000], device='cuda:0')
['How much propellant was used for the initial separation from the S-IVB, the turnaround maneuver, docking and ejection?', 'How much propellant was used for the initial separation from the S-IVB, the turnaround maneuver, docking and ejection?', 'How much propellant was used for the initial separation from the S-IVB, the turnaround maneuver, docking and ejection?', 'How much propellant was used for the initial separation from the S-IVB, the turnaround maneuver, docking and ejection?', 'How much propellant was used for the initial separation from the S-IVB, the turnaround maneuver, docking and ejection?', 'How much propellant was used for the initial separation from the S-IVB, the turnaround maneuver, docking and ejection?', 'How much propellant was used for the initial separation from the S-IVB, the turnaround maneuver, docking and ejection?', 'How much propellant was used for the initial separation from the S-IVB, the turnarou

Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 36.25it/s, est. speed input: 7937.82 toks/s, output: 72.57 toks/s]


rewards_per_func: tensor([0.2500, 0.7000], device='cuda:0')
['What caused the performance of fuel cells l and 3 to degrade within 3 minutes after the Oxygen tank 2 pressure dropped?', 'What caused the performance of fuel cells l and 3 to degrade within 3 minutes after the Oxygen tank 2 pressure dropped?', 'What caused the performance of fuel cells l and 3 to degrade within 3 minutes after the Oxygen tank 2 pressure dropped?', 'What caused the performance of fuel cells l and 3 to degrade within 3 minutes after the Oxygen tank 2 pressure dropped?', 'What caused the performance of fuel cells l and 3 to degrade within 3 minutes after the Oxygen tank 2 pressure dropped?', 'What caused the performance of fuel cells l and 3 to degrade within 3 minutes after the Oxygen tank 2 pressure dropped?', 'What caused the performance of fuel cells l and 3 to degrade within 3 minutes after the Oxygen tank 2 pressure dropped?', 'What caused the performance of fuel cells l and 3 to degrade within 3 minutes

Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 39.45it/s, est. speed input: 7969.34 toks/s, output: 83.94 toks/s]


rewards_per_func: tensor([1.0000, 0.7000], device='cuda:0')
['What was the initial consumption rate of electrical energy from the lunar module batteries?', 'What was the initial consumption rate of electrical energy from the lunar module batteries?', 'What was the initial consumption rate of electrical energy from the lunar module batteries?', 'What was the initial consumption rate of electrical energy from the lunar module batteries?', 'What was the initial consumption rate of electrical energy from the lunar module batteries?', 'What was the initial consumption rate of electrical energy from the lunar module batteries?', 'What was the initial consumption rate of electrical energy from the lunar module batteries?', 'What was the initial consumption rate of electrical energy from the lunar module batteries?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 44.27it/s, est. speed input: 7970.81 toks/s, output: 88.62 toks/s]


rewards_per_func: tensor([0.6250, 0.7000], device='cuda:0')
['What type of loads were experienced by the lunar module during docking and service propulsion and descent propulsion maneuvers?', 'What type of loads were experienced by the lunar module during docking and service propulsion and descent propulsion maneuvers?', 'What type of loads were experienced by the lunar module during docking and service propulsion and descent propulsion maneuvers?', 'What type of loads were experienced by the lunar module during docking and service propulsion and descent propulsion maneuvers?', 'What type of loads were experienced by the lunar module during docking and service propulsion and descent propulsion maneuvers?', 'What type of loads were experienced by the lunar module during docking and service propulsion and descent propulsion maneuvers?', 'What type of loads were experienced by the lunar module during docking and service propulsion and descent propulsion maneuvers?', 'What type of loads we

Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 32.70it/s, est. speed input: 8505.00 toks/s, output: 65.45 toks/s]


rewards_per_func: tensor([0.0000, 0.7000], device='cuda:0')
['Why was the suit pressure transducer failure not considered a critical issue?', 'Why was the suit pressure transducer failure not considered a critical issue?', 'Why was the suit pressure transducer failure not considered a critical issue?', 'Why was the suit pressure transducer failure not considered a critical issue?', 'Why was the suit pressure transducer failure not considered a critical issue?', 'Why was the suit pressure transducer failure not considered a critical issue?', 'Why was the suit pressure transducer failure not considered a critical issue?', 'Why was the suit pressure transducer failure not considered a critical issue?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 30.04it/s, est. speed input: 6866.91 toks/s, output: 75.17 toks/s]


rewards_per_func: tensor([0.5000, 0.7000], device='cuda:0')
['What was the reason for the reaction control isolation valve failure?', 'What was the reason for the reaction control isolation valve failure?', 'What was the reason for the reaction control isolation valve failure?', 'What was the reason for the reaction control isolation valve failure?', 'What was the reason for the reaction control isolation valve failure?', 'What was the reason for the reaction control isolation valve failure?', 'What was the reason for the reaction control isolation valve failure?', 'What was the reason for the reaction control isolation valve failure?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 38.17it/s, est. speed input: 8493.78 toks/s, output: 76.39 toks/s]


rewards_per_func: tensor([0.2500, 0.7000], device='cuda:0')
['What was the primary function of the potable water tank in the spacecraft?', 'What was the primary function of the potable water tank in the spacecraft?', 'What was the primary function of the potable water tank in the spacecraft?', 'What was the primary function of the potable water tank in the spacecraft?', 'What was the primary function of the potable water tank in the spacecraft?', 'What was the primary function of the potable water tank in the spacecraft?', 'What was the primary function of the potable water tank in the spacecraft?', 'What was the primary function of the potable water tank in the spacecraft?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 41.29it/s, est. speed input: 7782.11 toks/s, output: 87.84 toks/s]


rewards_per_func: tensor([1.0000, 0.7000], device='cuda:0')
['How long was the suit compressor limited to operate during entry?', 'How long was the suit compressor limited to operate during entry?', 'How long was the suit compressor limited to operate during entry?', 'How long was the suit compressor limited to operate during entry?', 'How long was the suit compressor limited to operate during entry?', 'How long was the suit compressor limited to operate during entry?', 'How long was the suit compressor limited to operate during entry?', 'How long was the suit compressor limited to operate during entry?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 51.52it/s, est. speed input: 6672.47 toks/s, output: 103.14 toks/s]


rewards_per_func: tensor([1.0000, 0.7000], device='cuda:0')
['What was the firing time of the first midcourse correction during the transearth phase?', 'What was the firing time of the first midcourse correction during the transearth phase?', 'What was the firing time of the first midcourse correction during the transearth phase?', 'What was the firing time of the first midcourse correction during the transearth phase?', 'What was the firing time of the first midcourse correction during the transearth phase?', 'What was the firing time of the first midcourse correction during the transearth phase?', 'What was the firing time of the first midcourse correction during the transearth phase?', 'What was the firing time of the first midcourse correction during the transearth phase?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 43.46it/s, est. speed input: 7986.60 toks/s, output: 86.99 toks/s]


rewards_per_func: tensor([0.7500, 0.7000], device='cuda:0')
['What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?', 'What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?', 'What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?', 'What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?', 'What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?', 'What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?', 'What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?', 'What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 13.68it/s, est. speed input: 2491.02 toks/s, output: 104.43 toks/s]


rewards_per_func: tensor([0.5000, 0.7000], device='cuda:0')
['What was the result of testing the operation of the valves with the handle extended from 3/8 inch to full travel from the valve-locked position?', 'What was the result of testing the operation of the valves with the handle extended from 3/8 inch to full travel from the valve-locked position?', 'What was the result of testing the operation of the valves with the handle extended from 3/8 inch to full travel from the valve-locked position?', 'What was the result of testing the operation of the valves with the handle extended from 3/8 inch to full travel from the valve-locked position?', 'What was the result of testing the operation of the valves with the handle extended from 3/8 inch to full travel from the valve-locked position?', 'What was the result of testing the operation of the valves with the handle extended from 3/8 inch to full travel from the valve-locked position?', 'What was the result of testing the operation of th

Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 34.70it/s, est. speed input: 8188.20 toks/s, output: 73.80 toks/s]


rewards_per_func: tensor([0.5000, 0.7000], device='cuda:0')
['How much water was used from the lunar module tanks between activation of the sublimator and undocking?', 'How much water was used from the lunar module tanks between activation of the sublimator and undocking?', 'How much water was used from the lunar module tanks between activation of the sublimator and undocking?', 'How much water was used from the lunar module tanks between activation of the sublimator and undocking?', 'How much water was used from the lunar module tanks between activation of the sublimator and undocking?', 'How much water was used from the lunar module tanks between activation of the sublimator and undocking?', 'How much water was used from the lunar module tanks between activation of the sublimator and undocking?', 'How much water was used from the lunar module tanks between activation of the sublimator and undocking?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 50.15it/s, est. speed input: 7638.55 toks/s, output: 100.42 toks/s]


rewards_per_func: tensor([0.8750, 0.6125], device='cuda:0')
['What was the amplitude of the S-II crossbeam oscillations at 330.6 seconds?', 'What was the amplitude of the S-II crossbeam oscillations at 330.6 seconds?', 'What was the amplitude of the S-II crossbeam oscillations at 330.6 seconds?', 'What was the amplitude of the S-II crossbeam oscillations at 330.6 seconds?', 'What was the amplitude of the S-II crossbeam oscillations at 330.6 seconds?', 'What was the amplitude of the S-II crossbeam oscillations at 330.6 seconds?', 'What was the amplitude of the S-II crossbeam oscillations at 330.6 seconds?', 'What was the amplitude of the S-II crossbeam oscillations at 330.6 seconds?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 47.16it/s, est. speed input: 7829.51 toks/s, output: 94.40 toks/s]


rewards_per_func: tensor([0.8750, 0.7000], device='cuda:0')
["What was the final reading from the Command Module Pilot's dosimeter?", "What was the final reading from the Command Module Pilot's dosimeter?", "What was the final reading from the Command Module Pilot's dosimeter?", "What was the final reading from the Command Module Pilot's dosimeter?", "What was the final reading from the Command Module Pilot's dosimeter?", "What was the final reading from the Command Module Pilot's dosimeter?", "What was the final reading from the Command Module Pilot's dosimeter?", "What was the final reading from the Command Module Pilot's dosimeter?"]


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 56.78it/s, est. speed input: 7339.88 toks/s, output: 113.68 toks/s]


rewards_per_func: tensor([1.0000, 0.7000], device='cuda:0')
['What was the reason for the erratic readings from the fuel cell flow indicators before lift-off?', 'What was the reason for the erratic readings from the fuel cell flow indicators before lift-off?', 'What was the reason for the erratic readings from the fuel cell flow indicators before lift-off?', 'What was the reason for the erratic readings from the fuel cell flow indicators before lift-off?', 'What was the reason for the erratic readings from the fuel cell flow indicators before lift-off?', 'What was the reason for the erratic readings from the fuel cell flow indicators before lift-off?', 'What was the reason for the erratic readings from the fuel cell flow indicators before lift-off?', 'What was the reason for the erratic readings from the fuel cell flow indicators before lift-off?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 10.77it/s, est. speed input: 2495.78 toks/s, output: 99.72 toks/s]


rewards_per_func: tensor([0.1250, 0.6125], device='cuda:0')
["What was the time of the lunar module's jettison?", "What was the time of the lunar module's jettison?", "What was the time of the lunar module's jettison?", "What was the time of the lunar module's jettison?", "What was the time of the lunar module's jettison?", "What was the time of the lunar module's jettison?", "What was the time of the lunar module's jettison?", "What was the time of the lunar module's jettison?"]


  completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 50.09it/s, est. speed input: 7723.20 toks/s, output: 100.30 toks/s]


rewards_per_func: tensor([0.3750, 0.7000], device='cuda:0')
['How will the suit pressure transducers be reassembled for Apollo 15 and subsequent missions?', 'How will the suit pressure transducers be reassembled for Apollo 15 and subsequent missions?', 'How will the suit pressure transducers be reassembled for Apollo 15 and subsequent missions?', 'How will the suit pressure transducers be reassembled for Apollo 15 and subsequent missions?', 'How will the suit pressure transducers be reassembled for Apollo 15 and subsequent missions?', 'How will the suit pressure transducers be reassembled for Apollo 15 and subsequent missions?', 'How will the suit pressure transducers be reassembled for Apollo 15 and subsequent missions?', 'How will the suit pressure transducers be reassembled for Apollo 15 and subsequent missions?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 41.91it/s, est. speed input: 8340.83 toks/s, output: 83.88 toks/s]


rewards_per_func: tensor([0.7500, 0.7000], device='cuda:0')
['What was the cause of the gas leak in the Apex Cover Jettison System?', 'What was the cause of the gas leak in the Apex Cover Jettison System?', 'What was the cause of the gas leak in the Apex Cover Jettison System?', 'What was the cause of the gas leak in the Apex Cover Jettison System?', 'What was the cause of the gas leak in the Apex Cover Jettison System?', 'What was the cause of the gas leak in the Apex Cover Jettison System?', 'What was the cause of the gas leak in the Apex Cover Jettison System?', 'What was the cause of the gas leak in the Apex Cover Jettison System?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 36.33it/s, est. speed input: 8316.43 toks/s, output: 72.71 toks/s]


rewards_per_func: tensor([0.1250, 0.6125], device='cuda:0')
['What was the reason for discontinuing the use of the auxiliary dump nozzle on the Apollo mission?', 'What was the reason for discontinuing the use of the auxiliary dump nozzle on the Apollo mission?', 'What was the reason for discontinuing the use of the auxiliary dump nozzle on the Apollo mission?', 'What was the reason for discontinuing the use of the auxiliary dump nozzle on the Apollo mission?', 'What was the reason for discontinuing the use of the auxiliary dump nozzle on the Apollo mission?', 'What was the reason for discontinuing the use of the auxiliary dump nozzle on the Apollo mission?', 'What was the reason for discontinuing the use of the auxiliary dump nozzle on the Apollo mission?', 'What was the reason for discontinuing the use of the auxiliary dump nozzle on the Apollo mission?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 43.73it/s, est. speed input: 7780.31 toks/s, output: 87.54 toks/s]


rewards_per_func: tensor([0.8750, 0.7000], device='cuda:0')
['What type of malfunction could cause the deep repetitive transients on the phase modulated downlink carrier?', 'What type of malfunction could cause the deep repetitive transients on the phase modulated downlink carrier?', 'What type of malfunction could cause the deep repetitive transients on the phase modulated downlink carrier?', 'What type of malfunction could cause the deep repetitive transients on the phase modulated downlink carrier?', 'What type of malfunction could cause the deep repetitive transients on the phase modulated downlink carrier?', 'What type of malfunction could cause the deep repetitive transients on the phase modulated downlink carrier?', 'What type of malfunction could cause the deep repetitive transients on the phase modulated downlink carrier?', 'What type of malfunction could cause the deep repetitive transients on the phase modulated downlink carrier?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 39.11it/s, est. speed input: 8057.98 toks/s, output: 83.22 toks/s]


rewards_per_func: tensor([0.8750, 0.7000], device='cuda:0')
['What was the primary function of the helicopter designated "Recovery"?', 'What was the primary function of the helicopter designated "Recovery"?', 'What was the primary function of the helicopter designated "Recovery"?', 'What was the primary function of the helicopter designated "Recovery"?', 'What was the primary function of the helicopter designated "Recovery"?', 'What was the primary function of the helicopter designated "Recovery"?', 'What was the primary function of the helicopter designated "Recovery"?', 'What was the primary function of the helicopter designated "Recovery"?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 49.36it/s, est. speed input: 7553.94 toks/s, output: 98.82 toks/s]


rewards_per_func: tensor([1.0000, 0.7000], device='cuda:0')
['What was the location of the Manned Spacecraft Center?', 'What was the location of the Manned Spacecraft Center?', 'What was the location of the Manned Spacecraft Center?', 'What was the location of the Manned Spacecraft Center?', 'What was the location of the Manned Spacecraft Center?', 'What was the location of the Manned Spacecraft Center?', 'What was the location of the Manned Spacecraft Center?', 'What was the location of the Manned Spacecraft Center?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 38.67it/s, est. speed input: 8188.92 toks/s, output: 77.39 toks/s]


rewards_per_func: tensor([0.6250, 0.7000], device='cuda:0')
['What system was used to perform the transearth injection maneuver?', 'What system was used to perform the transearth injection maneuver?', 'What system was used to perform the transearth injection maneuver?', 'What system was used to perform the transearth injection maneuver?', 'What system was used to perform the transearth injection maneuver?', 'What system was used to perform the transearth injection maneuver?', 'What system was used to perform the transearth injection maneuver?', 'What system was used to perform the transearth injection maneuver?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 41.29it/s, est. speed input: 7840.35 toks/s, output: 82.64 toks/s]


rewards_per_func: tensor([0.2500, 0.7000], device='cuda:0')
['Who controlled the spacecraft during the sun/moon alignment?', 'Who controlled the spacecraft during the sun/moon alignment?', 'Who controlled the spacecraft during the sun/moon alignment?', 'Who controlled the spacecraft during the sun/moon alignment?', 'Who controlled the spacecraft during the sun/moon alignment?', 'Who controlled the spacecraft during the sun/moon alignment?', 'Who controlled the spacecraft during the sun/moon alignment?', 'Who controlled the spacecraft during the sun/moon alignment?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 49.70it/s, est. speed input: 7080.43 toks/s, output: 99.63 toks/s]


rewards_per_func: tensor([0.8750, 0.7000], device='cuda:0')
['What is the purpose of Table D-I in the provided data?', 'What is the purpose of Table D-I in the provided data?', 'What is the purpose of Table D-I in the provided data?', 'What is the purpose of Table D-I in the provided data?', 'What is the purpose of Table D-I in the provided data?', 'What is the purpose of Table D-I in the provided data?', 'What is the purpose of Table D-I in the provided data?', 'What is the purpose of Table D-I in the provided data?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 42.97it/s, est. speed input: 8181.83 toks/s, output: 86.01 toks/s]


rewards_per_func: tensor([0.6250, 0.7000], device='cuda:0')
['What was the difference between the manually adjusted antenna settings and the most favorable settings for 55 hours?', 'What was the difference between the manually adjusted antenna settings and the most favorable settings for 55 hours?', 'What was the difference between the manually adjusted antenna settings and the most favorable settings for 55 hours?', 'What was the difference between the manually adjusted antenna settings and the most favorable settings for 55 hours?', 'What was the difference between the manually adjusted antenna settings and the most favorable settings for 55 hours?', 'What was the difference between the manually adjusted antenna settings and the most favorable settings for 55 hours?', 'What was the difference between the manually adjusted antenna settings and the most favorable settings for 55 hours?', 'What was the difference between the manually adjusted antenna settings and the most favorable sett

Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 42.62it/s, est. speed input: 8158.55 toks/s, output: 85.32 toks/s]


rewards_per_func: tensor([0.7500, 0.6125], device='cuda:0')
['What was the antenna mode at the time of acquisition and until the reacquisition mode was selected at 55:00:10?', 'What was the antenna mode at the time of acquisition and until the reacquisition mode was selected at 55:00:10?', 'What was the antenna mode at the time of acquisition and until the reacquisition mode was selected at 55:00:10?', 'What was the antenna mode at the time of acquisition and until the reacquisition mode was selected at 55:00:10?', 'What was the antenna mode at the time of acquisition and until the reacquisition mode was selected at 55:00:10?', 'What was the antenna mode at the time of acquisition and until the reacquisition mode was selected at 55:00:10?', 'What was the antenna mode at the time of acquisition and until the reacquisition mode was selected at 55:00:10?', 'What was the antenna mode at the time of acquisition and until the reacquisition mode was selected at 55:00:10?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 41.96it/s, est. speed input: 8276.17 toks/s, output: 83.97 toks/s]


rewards_per_func: tensor([0.3750, 0.7000], device='cuda:0')
['When did crew training for Apollo 13 commence?', 'When did crew training for Apollo 13 commence?', 'When did crew training for Apollo 13 commence?', 'When did crew training for Apollo 13 commence?', 'When did crew training for Apollo 13 commence?', 'When did crew training for Apollo 13 commence?', 'When did crew training for Apollo 13 commence?', 'When did crew training for Apollo 13 commence?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 54.56it/s, est. speed input: 6919.19 toks/s, output: 109.28 toks/s]


rewards_per_func: tensor([1.0000, 0.7000], device='cuda:0')
["What was the reason for the Lunar Module Pilot's headache on the second day of the mission?", "What was the reason for the Lunar Module Pilot's headache on the second day of the mission?", "What was the reason for the Lunar Module Pilot's headache on the second day of the mission?", "What was the reason for the Lunar Module Pilot's headache on the second day of the mission?", "What was the reason for the Lunar Module Pilot's headache on the second day of the mission?", "What was the reason for the Lunar Module Pilot's headache on the second day of the mission?", "What was the reason for the Lunar Module Pilot's headache on the second day of the mission?", "What was the reason for the Lunar Module Pilot's headache on the second day of the mission?"]


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 18.74it/s, est. speed input: 3556.28 toks/s, output: 100.87 toks/s]


rewards_per_func: tensor([0.1250, 0.7000], device='cuda:0')
['How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?', 'How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?', 'How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?', 'How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?', 'How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?', 'How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?', 'How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?', 'How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 41.44it/s, est. speed input: 7592.04 toks/s, output: 82.97 toks/s]


rewards_per_func: tensor([0.7500, 0.7000], device='cuda:0')
['At what time did the command module land?', 'At what time did the command module land?', 'At what time did the command module land?', 'At what time did the command module land?', 'At what time did the command module land?', 'At what time did the command module land?', 'At what time did the command module land?', 'At what time did the command module land?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 39.00it/s, est. speed input: 7636.65 toks/s, output: 78.07 toks/s]


rewards_per_func: tensor([0.1250, 0.7000], device='cuda:0')
['What was the frequency of the observed current variation from the balloon released at 1:14 p.m.?', 'What was the frequency of the observed current variation from the balloon released at 1:14 p.m.?', 'What was the frequency of the observed current variation from the balloon released at 1:14 p.m.?', 'What was the frequency of the observed current variation from the balloon released at 1:14 p.m.?', 'What was the frequency of the observed current variation from the balloon released at 1:14 p.m.?', 'What was the frequency of the observed current variation from the balloon released at 1:14 p.m.?', 'What was the frequency of the observed current variation from the balloon released at 1:14 p.m.?', 'What was the frequency of the observed current variation from the balloon released at 1:14 p.m.?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 47.49it/s, est. speed input: 7615.88 toks/s, output: 95.05 toks/s]


rewards_per_func: tensor([0.8750, 0.7000], device='cuda:0')
['What was the result of disassembling the Apollo 13 transducer and water tank?', 'What was the result of disassembling the Apollo 13 transducer and water tank?', 'What was the result of disassembling the Apollo 13 transducer and water tank?', 'What was the result of disassembling the Apollo 13 transducer and water tank?', 'What was the result of disassembling the Apollo 13 transducer and water tank?', 'What was the result of disassembling the Apollo 13 transducer and water tank?', 'What was the result of disassembling the Apollo 13 transducer and water tank?', 'What was the result of disassembling the Apollo 13 transducer and water tank?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 50.02it/s, est. speed input: 7765.10 toks/s, output: 100.19 toks/s]


rewards_per_func: tensor([0.3750, 0.6125], device='cuda:0')
['What was the location of the primary recovery support for the spacecraft?', 'What was the location of the primary recovery support for the spacecraft?', 'What was the location of the primary recovery support for the spacecraft?', 'What was the location of the primary recovery support for the spacecraft?', 'What was the location of the primary recovery support for the spacecraft?', 'What was the location of the primary recovery support for the spacecraft?', 'What was the location of the primary recovery support for the spacecraft?', 'What was the location of the primary recovery support for the spacecraft?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 47.41it/s, est. speed input: 7883.06 toks/s, output: 94.90 toks/s]


rewards_per_func: tensor([0.0000, 0.7000], device='cuda:0')
['What was the weight of the spacecraft at the time of lunar module separation?', 'What was the weight of the spacecraft at the time of lunar module separation?', 'What was the weight of the spacecraft at the time of lunar module separation?', 'What was the weight of the spacecraft at the time of lunar module separation?', 'What was the weight of the spacecraft at the time of lunar module separation?', 'What was the weight of the spacecraft at the time of lunar module separation?', 'What was the weight of the spacecraft at the time of lunar module separation?', 'What was the weight of the spacecraft at the time of lunar module separation?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 41.91it/s, est. speed input: 8064.70 toks/s, output: 83.90 toks/s]


rewards_per_func: tensor([0.0000, 0.7000], device='cuda:0')
['What was required to reconstruct the geometry involved in the experiment?', 'What was required to reconstruct the geometry involved in the experiment?', 'What was required to reconstruct the geometry involved in the experiment?', 'What was required to reconstruct the geometry involved in the experiment?', 'What was required to reconstruct the geometry involved in the experiment?', 'What was required to reconstruct the geometry involved in the experiment?', 'What was required to reconstruct the geometry involved in the experiment?', 'What was required to reconstruct the geometry involved in the experiment?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:01<00:00,  7.54it/s, est. speed input: 1591.12 toks/s, output: 104.75 toks/s]


rewards_per_func: tensor([0.1250, 0.7000], device='cuda:0')
['What was the predicted tank condition at the time of descent engine firing for lunar descent?', 'What was the predicted tank condition at the time of descent engine firing for lunar descent?', 'What was the predicted tank condition at the time of descent engine firing for lunar descent?', 'What was the predicted tank condition at the time of descent engine firing for lunar descent?', 'What was the predicted tank condition at the time of descent engine firing for lunar descent?', 'What was the predicted tank condition at the time of descent engine firing for lunar descent?', 'What was the predicted tank condition at the time of descent engine firing for lunar descent?', 'What was the predicted tank condition at the time of descent engine firing for lunar descent?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 42.13it/s, est. speed input: 7508.84 toks/s, output: 89.64 toks/s]


rewards_per_func: tensor([0.7500, 0.7000], device='cuda:0')
['What was the initial current consumption of the vehicle after the second descent propulsion system firing?', 'What was the initial current consumption of the vehicle after the second descent propulsion system firing?', 'What was the initial current consumption of the vehicle after the second descent propulsion system firing?', 'What was the initial current consumption of the vehicle after the second descent propulsion system firing?', 'What was the initial current consumption of the vehicle after the second descent propulsion system firing?', 'What was the initial current consumption of the vehicle after the second descent propulsion system firing?', 'What was the initial current consumption of the vehicle after the second descent propulsion system firing?', 'What was the initial current consumption of the vehicle after the second descent propulsion system firing?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 47.81it/s, est. speed input: 7830.56 toks/s, output: 95.71 toks/s]


rewards_per_func: tensor([0.1250, 0.7000], device='cuda:0')
['What was the respiratory rate of the Command Module Pilot just prior to the incident?', 'What was the respiratory rate of the Command Module Pilot just prior to the incident?', 'What was the respiratory rate of the Command Module Pilot just prior to the incident?', 'What was the respiratory rate of the Command Module Pilot just prior to the incident?', 'What was the respiratory rate of the Command Module Pilot just prior to the incident?', 'What was the respiratory rate of the Command Module Pilot just prior to the incident?', 'What was the respiratory rate of the Command Module Pilot just prior to the incident?', 'What was the respiratory rate of the Command Module Pilot just prior to the incident?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 51.70it/s, est. speed input: 7504.23 toks/s, output: 103.50 toks/s]


rewards_per_func: tensor([0.6250, 0.7000], device='cuda:0')
["What was the reason for the Lunar Module Pilot's heart rate increase during the entry phase?", "What was the reason for the Lunar Module Pilot's heart rate increase during the entry phase?", "What was the reason for the Lunar Module Pilot's heart rate increase during the entry phase?", "What was the reason for the Lunar Module Pilot's heart rate increase during the entry phase?", "What was the reason for the Lunar Module Pilot's heart rate increase during the entry phase?", "What was the reason for the Lunar Module Pilot's heart rate increase during the entry phase?", "What was the reason for the Lunar Module Pilot's heart rate increase during the entry phase?", "What was the reason for the Lunar Module Pilot's heart rate increase during the entry phase?"]


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 42.46it/s, est. speed input: 7852.19 toks/s, output: 85.00 toks/s]


rewards_per_func: tensor([0.6250, 0.7000], device='cuda:0')
['What was the effect observed on the fuel cell flow indicators?', 'What was the effect observed on the fuel cell flow indicators?', 'What was the effect observed on the fuel cell flow indicators?', 'What was the effect observed on the fuel cell flow indicators?', 'What was the effect observed on the fuel cell flow indicators?', 'What was the effect observed on the fuel cell flow indicators?', 'What was the effect observed on the fuel cell flow indicators?', 'What was the effect observed on the fuel cell flow indicators?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00,  9.62it/s, est. speed input: 2319.29 toks/s, output: 90.41 toks/s]


rewards_per_func: tensor([0.0000, 0.7000], device='cuda:0')
['What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 49.00it/s, est. speed input: 7487.47 toks/s, output: 98.11 toks/s]


rewards_per_func: tensor([0.1250, 0.7000], device='cuda:0')
['What was the reason for the poor star visibility during the command module alignment?', 'What was the reason for the poor star visibility during the command module alignment?', 'What was the reason for the poor star visibility during the command module alignment?', 'What was the reason for the poor star visibility during the command module alignment?', 'What was the reason for the poor star visibility during the command module alignment?', 'What was the reason for the poor star visibility during the command module alignment?', 'What was the reason for the poor star visibility during the command module alignment?', 'What was the reason for the poor star visibility during the command module alignment?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 30.88it/s, est. speed input: 7073.96 toks/s, output: 73.44 toks/s]


rewards_per_func: tensor([0.8750, 0.7000], device='cuda:0')
['What was the estimated minimum temperature reached by the inertial measurement unit before power-up?', 'What was the estimated minimum temperature reached by the inertial measurement unit before power-up?', 'What was the estimated minimum temperature reached by the inertial measurement unit before power-up?', 'What was the estimated minimum temperature reached by the inertial measurement unit before power-up?', 'What was the estimated minimum temperature reached by the inertial measurement unit before power-up?', 'What was the estimated minimum temperature reached by the inertial measurement unit before power-up?', 'What was the estimated minimum temperature reached by the inertial measurement unit before power-up?', 'What was the estimated minimum temperature reached by the inertial measurement unit before power-up?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 42.24it/s, est. speed input: 8086.33 toks/s, output: 84.56 toks/s]


rewards_per_func: tensor([0.7500, 0.7000], device='cuda:0')
['How long did the seismic signals from the S-IVB impact continue for?', 'How long did the seismic signals from the S-IVB impact continue for?', 'How long did the seismic signals from the S-IVB impact continue for?', 'How long did the seismic signals from the S-IVB impact continue for?', 'How long did the seismic signals from the S-IVB impact continue for?', 'How long did the seismic signals from the S-IVB impact continue for?', 'How long did the seismic signals from the S-IVB impact continue for?', 'How long did the seismic signals from the S-IVB impact continue for?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 38.02it/s, est. speed input: 7770.64 toks/s, output: 76.09 toks/s]


rewards_per_func: tensor([0.3750, 0.7000], device='cuda:0')
["What was the cause of the high-gain antenna's temporary loss of telemetry data at approximately 56 hours?", "What was the cause of the high-gain antenna's temporary loss of telemetry data at approximately 56 hours?", "What was the cause of the high-gain antenna's temporary loss of telemetry data at approximately 56 hours?", "What was the cause of the high-gain antenna's temporary loss of telemetry data at approximately 56 hours?", "What was the cause of the high-gain antenna's temporary loss of telemetry data at approximately 56 hours?", "What was the cause of the high-gain antenna's temporary loss of telemetry data at approximately 56 hours?", "What was the cause of the high-gain antenna's temporary loss of telemetry data at approximately 56 hours?", "What was the cause of the high-gain antenna's temporary loss of telemetry data at approximately 56 hours?"]


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 41.39it/s, est. speed input: 8522.00 toks/s, output: 82.84 toks/s]


rewards_per_func: tensor([0.8750, 0.7000], device='cuda:0')
['What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 63.22it/s, est. speed input: 9629.78 toks/s, output: 126.60 toks/s]


rewards_per_func: tensor([0.3750, 0.7000], device='cuda:0')
['What was the electrostatic potential energy provided by a stored charge of 0.04 coulomb at a potential of 4 million volts?', 'What was the electrostatic potential energy provided by a stored charge of 0.04 coulomb at a potential of 4 million volts?', 'What was the electrostatic potential energy provided by a stored charge of 0.04 coulomb at a potential of 4 million volts?', 'What was the electrostatic potential energy provided by a stored charge of 0.04 coulomb at a potential of 4 million volts?', 'What was the electrostatic potential energy provided by a stored charge of 0.04 coulomb at a potential of 4 million volts?', 'What was the electrostatic potential energy provided by a stored charge of 0.04 coulomb at a potential of 4 million volts?', 'What was the electrostatic potential energy provided by a stored charge of 0.04 coulomb at a potential of 4 million volts?', 'What was the electrostatic potential energy provided by 

Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:01<00:00,  6.90it/s, est. speed input: 1218.14 toks/s, output: 105.25 toks/s]


rewards_per_func: tensor([0.7500, 0.6125], device='cuda:0')
['What was the condition of the interior surfaces of the command module?', 'What was the condition of the interior surfaces of the command module?', 'What was the condition of the interior surfaces of the command module?', 'What was the condition of the interior surfaces of the command module?', 'What was the condition of the interior surfaces of the command module?', 'What was the condition of the interior surfaces of the command module?', 'What was the condition of the interior surfaces of the command module?', 'What was the condition of the interior surfaces of the command module?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 42.82it/s, est. speed input: 7004.27 toks/s, output: 85.74 toks/s]


rewards_per_func: tensor([0.6250, 0.7000], device='cuda:0')
['What was the status of the Reaction Control System Performance report for the Apollo 8 mission?', 'What was the status of the Reaction Control System Performance report for the Apollo 8 mission?', 'What was the status of the Reaction Control System Performance report for the Apollo 8 mission?', 'What was the status of the Reaction Control System Performance report for the Apollo 8 mission?', 'What was the status of the Reaction Control System Performance report for the Apollo 8 mission?', 'What was the status of the Reaction Control System Performance report for the Apollo 8 mission?', 'What was the status of the Reaction Control System Performance report for the Apollo 8 mission?', 'What was the status of the Reaction Control System Performance report for the Apollo 8 mission?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 39.40it/s, est. speed input: 7507.74 toks/s, output: 78.92 toks/s]


rewards_per_func: tensor([0.1250, 0.7000], device='cuda:0')
['What was the objective of the television in earth orbit that could not be achieved?', 'What was the objective of the television in earth orbit that could not be achieved?', 'What was the objective of the television in earth orbit that could not be achieved?', 'What was the objective of the television in earth orbit that could not be achieved?', 'What was the objective of the television in earth orbit that could not be achieved?', 'What was the objective of the television in earth orbit that could not be achieved?', 'What was the objective of the television in earth orbit that could not be achieved?', 'What was the objective of the television in earth orbit that could not be achieved?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 47.33it/s, est. speed input: 7733.04 toks/s, output: 94.74 toks/s]


rewards_per_func: tensor([0.0000, 0.7000], device='cuda:0')
['What was the title of the report about the Apollo 13 Cryogenic Oxygen Tank 2 Anomaly?', 'What was the title of the report about the Apollo 13 Cryogenic Oxygen Tank 2 Anomaly?', 'What was the title of the report about the Apollo 13 Cryogenic Oxygen Tank 2 Anomaly?', 'What was the title of the report about the Apollo 13 Cryogenic Oxygen Tank 2 Anomaly?', 'What was the title of the report about the Apollo 13 Cryogenic Oxygen Tank 2 Anomaly?', 'What was the title of the report about the Apollo 13 Cryogenic Oxygen Tank 2 Anomaly?', 'What was the title of the report about the Apollo 13 Cryogenic Oxygen Tank 2 Anomaly?', 'What was the title of the report about the Apollo 13 Cryogenic Oxygen Tank 2 Anomaly?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 44.54it/s, est. speed input: 8010.02 toks/s, output: 89.18 toks/s]


rewards_per_func: tensor([0.0000, 0.7000], device='cuda:0')
['What was the cause of the cracking in the window shade?', 'What was the cause of the cracking in the window shade?', 'What was the cause of the cracking in the window shade?', 'What was the cause of the cracking in the window shade?', 'What was the cause of the cracking in the window shade?', 'What was the cause of the cracking in the window shade?', 'What was the cause of the cracking in the window shade?', 'What was the cause of the cracking in the window shade?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 32.59it/s, est. speed input: 7207.65 toks/s, output: 73.38 toks/s]


rewards_per_func: tensor([1.0000, 0.7000], device='cuda:0')
['What condition can be ruled out because the light remained illuminated for a brief period and then began flickering intermittently after the battery was replaced on the bus?', 'What condition can be ruled out because the light remained illuminated for a brief period and then began flickering intermittently after the battery was replaced on the bus?', 'What condition can be ruled out because the light remained illuminated for a brief period and then began flickering intermittently after the battery was replaced on the bus?', 'What condition can be ruled out because the light remained illuminated for a brief period and then began flickering intermittently after the battery was replaced on the bus?', 'What condition can be ruled out because the light remained illuminated for a brief period and then began flickering intermittently after the battery was replaced on the bus?', 'What condition can be ruled out because the light rem

Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:02<00:00,  2.67it/s, est. speed input: 642.02 toks/s, output: 78.75 toks/s]


rewards_per_func: tensor([0.5000, 0.7000], device='cuda:0')
['What was the maximum oscillation measured during either of the two S-IVB thrust periods?', 'What was the maximum oscillation measured during either of the two S-IVB thrust periods?', 'What was the maximum oscillation measured during either of the two S-IVB thrust periods?', 'What was the maximum oscillation measured during either of the two S-IVB thrust periods?', 'What was the maximum oscillation measured during either of the two S-IVB thrust periods?', 'What was the maximum oscillation measured during either of the two S-IVB thrust periods?', 'What was the maximum oscillation measured during either of the two S-IVB thrust periods?', 'What was the maximum oscillation measured during either of the two S-IVB thrust periods?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 46.10it/s, est. speed input: 7821.46 toks/s, output: 92.29 toks/s]


rewards_per_func: tensor([0.0000, 0.6125], device='cuda:0')
['What could have caused the shorted condition in the temperature switch wires?', 'What could have caused the shorted condition in the temperature switch wires?', 'What could have caused the shorted condition in the temperature switch wires?', 'What could have caused the shorted condition in the temperature switch wires?', 'What could have caused the shorted condition in the temperature switch wires?', 'What could have caused the shorted condition in the temperature switch wires?', 'What could have caused the shorted condition in the temperature switch wires?', 'What could have caused the shorted condition in the temperature switch wires?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:01<00:00,  4.70it/s, est. speed input: 1022.65 toks/s, output: 71.20 toks/s]


rewards_per_func: tensor([0.3750, 0.7000], device='cuda:0')
['What was the effect of the long cold-soak period on the command module structure?', 'What was the effect of the long cold-soak period on the command module structure?', 'What was the effect of the long cold-soak period on the command module structure?', 'What was the effect of the long cold-soak period on the command module structure?', 'What was the effect of the long cold-soak period on the command module structure?', 'What was the effect of the long cold-soak period on the command module structure?', 'What was the effect of the long cold-soak period on the command module structure?', 'What was the effect of the long cold-soak period on the command module structure?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 12.89it/s, est. speed input: 2531.32 toks/s, output: 100.02 toks/s]


rewards_per_func: tensor([0.7500, 0.7000], device='cuda:0')
['What was the temperature of the engine package during the peak engine activity period after the oxygen tank incident?', 'What was the temperature of the engine package during the peak engine activity period after the oxygen tank incident?', 'What was the temperature of the engine package during the peak engine activity period after the oxygen tank incident?', 'What was the temperature of the engine package during the peak engine activity period after the oxygen tank incident?', 'What was the temperature of the engine package during the peak engine activity period after the oxygen tank incident?', 'What was the temperature of the engine package during the peak engine activity period after the oxygen tank incident?', 'What was the temperature of the engine package during the peak engine activity period after the oxygen tank incident?', 'What was the temperature of the engine package during the peak engine activity period after

Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 54.88it/s, est. speed input: 7772.19 toks/s, output: 109.85 toks/s]


rewards_per_func: tensor([0.7500, 0.6125], device='cuda:0')
['What was the cause of the thumping noise reported by the crew at 97 hours 14 minutes?', 'What was the cause of the thumping noise reported by the crew at 97 hours 14 minutes?', 'What was the cause of the thumping noise reported by the crew at 97 hours 14 minutes?', 'What was the cause of the thumping noise reported by the crew at 97 hours 14 minutes?', 'What was the cause of the thumping noise reported by the crew at 97 hours 14 minutes?', 'What was the cause of the thumping noise reported by the crew at 97 hours 14 minutes?', 'What was the cause of the thumping noise reported by the crew at 97 hours 14 minutes?', 'What was the cause of the thumping noise reported by the crew at 97 hours 14 minutes?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 38.72it/s, est. speed input: 8296.90 toks/s, output: 77.49 toks/s]


rewards_per_func: tensor([0.8750, 0.7000], device='cuda:0')
['What was the equilibrium potential for a conventional jet aircraft?', 'What was the equilibrium potential for a conventional jet aircraft?', 'What was the equilibrium potential for a conventional jet aircraft?', 'What was the equilibrium potential for a conventional jet aircraft?', 'What was the equilibrium potential for a conventional jet aircraft?', 'What was the equilibrium potential for a conventional jet aircraft?', 'What was the equilibrium potential for a conventional jet aircraft?', 'What was the equilibrium potential for a conventional jet aircraft?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 49.43it/s, est. speed input: 7898.84 toks/s, output: 98.96 toks/s]


rewards_per_func: tensor([0.0000, 0.7000], device='cuda:0')
['What was the expected outcome of the telluric current system during the launch?', 'What was the expected outcome of the telluric current system during the launch?', 'What was the expected outcome of the telluric current system during the launch?', 'What was the expected outcome of the telluric current system during the launch?', 'What was the expected outcome of the telluric current system during the launch?', 'What was the expected outcome of the telluric current system during the launch?', 'What was the expected outcome of the telluric current system during the launch?', 'What was the expected outcome of the telluric current system during the launch?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 41.34it/s, est. speed input: 8342.10 toks/s, output: 82.75 toks/s]


rewards_per_func: tensor([0.3750, 0.7000], device='cuda:0')
['What was proven for the first time 3 days prior to the flight?', 'What was proven for the first time 3 days prior to the flight?', 'What was proven for the first time 3 days prior to the flight?', 'What was proven for the first time 3 days prior to the flight?', 'What was proven for the first time 3 days prior to the flight?', 'What was proven for the first time 3 days prior to the flight?', 'What was proven for the first time 3 days prior to the flight?', 'What was proven for the first time 3 days prior to the flight?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 40.92it/s, est. speed input: 8182.46 toks/s, output: 81.92 toks/s]


rewards_per_func: tensor([0.0000, 0.6125], device='cuda:0')
['What was the reaction control system used for at about 137 hours 40 minutes?', 'What was the reaction control system used for at about 137 hours 40 minutes?', 'What was the reaction control system used for at about 137 hours 40 minutes?', 'What was the reaction control system used for at about 137 hours 40 minutes?', 'What was the reaction control system used for at about 137 hours 40 minutes?', 'What was the reaction control system used for at about 137 hours 40 minutes?', 'What was the reaction control system used for at about 137 hours 40 minutes?', 'What was the reaction control system used for at about 137 hours 40 minutes?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 43.51it/s, est. speed input: 8133.02 toks/s, output: 87.10 toks/s]


rewards_per_func: tensor([0.0000, 0.7000], device='cuda:0')
['How was the platform alignment affected by the large amount of debris in the vicinity of the spacecraft?', 'How was the platform alignment affected by the large amount of debris in the vicinity of the spacecraft?', 'How was the platform alignment affected by the large amount of debris in the vicinity of the spacecraft?', 'How was the platform alignment affected by the large amount of debris in the vicinity of the spacecraft?', 'How was the platform alignment affected by the large amount of debris in the vicinity of the spacecraft?', 'How was the platform alignment affected by the large amount of debris in the vicinity of the spacecraft?', 'How was the platform alignment affected by the large amount of debris in the vicinity of the spacecraft?', 'How was the platform alignment affected by the large amount of debris in the vicinity of the spacecraft?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 38.15it/s, est. speed input: 7877.54 toks/s, output: 76.34 toks/s]


rewards_per_func: tensor([0.3750, 0.6125], device='cuda:0')
['What was the nickel content found in the command module hot water port?', 'What was the nickel content found in the command module hot water port?', 'What was the nickel content found in the command module hot water port?', 'What was the nickel content found in the command module hot water port?', 'What was the nickel content found in the command module hot water port?', 'What was the nickel content found in the command module hot water port?', 'What was the nickel content found in the command module hot water port?', 'What was the nickel content found in the command module hot water port?']


Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 54.31it/s, est. speed input: 7801.08 toks/s, output: 108.72 toks/s]


rewards_per_func: tensor([1.0000, 0.7000], device='cuda:0')
['What was the reason for the first abort in the Apollo program?', 'What was the reason for the first abort in the Apollo program?', 'What was the reason for the first abort in the Apollo program?', 'What was the reason for the first abort in the Apollo program?', 'What was the reason for the first abort in the Apollo program?', 'What was the reason for the first abort in the Apollo program?', 'What was the reason for the first abort in the Apollo program?', 'What was the reason for the first abort in the Apollo program?']


  completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 40.77it/s, est. speed input: 8338.94 toks/s, output: 81.60 toks/s]


rewards_per_func: tensor([0.0000, 0.7000], device='cuda:0')


TrainOutput(global_step=101, training_loss=0.00014615571892236538, metrics={'train_runtime': 2771.7194, 'train_samples_per_second': 0.292, 'train_steps_per_second': 0.036, 'total_flos': 0.0, 'train_loss': 0.00014615571892236538})

<a name="Inference"></a>
### Inference
Now let's try benchmark the model we trained!

In [8]:
from vllm import SamplingParams
import rl_helpers

sampling_params = SamplingParams(
    temperature=0.5,
    top_p=0.95,
    max_tokens=4096,
)


def eval_generate_fn(inputs):
    return model.fast_generate(
        inputs,
        sampling_params=sampling_params,
        lora_request=model.load_lora(
            "full_local_training/checkpoint-101"
        ),  # load the trained LoRA
    )


rl_helpers.run_eval(
    generate_fn=eval_generate_fn,
    verify_fn=reward_correctness,
    tokenizer=tokenizer,
)

Processed prompts:   0%|          | 0/68 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 68/68 [00:10<00:00,  6.70it/s, est. speed input: 1944.51 toks/s, output: 831.15 toks/s]
Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 68/68 [00:12<00:00,  5.35it/s, est. speed input: 4089.36 toks/s, output: 466.37 toks/s]
Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16/16 [00:08<00:00,  1.87it/s, est. speed input: 2294.17 toks/s, output: 191.82 toks/s]
Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:08<00:00,  1.22it/s, est. speed input: 2019.97 toks/s, output: 140.61 toks/s]
Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:07<00:00,  1.06it/s, est. speed input: 2231.39 toks/s, output: 96.48 toks/s]
Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 68/68 [00:01<00:00, 38.50it/s, est. speed input: 7604.52 toks/s, output: 109.34 toks/s]

RESULTS:
percentage of correct answers: 0.5882352941176471





[{'messages': [{'role': 'system',
    'content': 'Cutting Knowledge Date: December 2023\nToday Date: 18 Mar 2025\n\nWhen you receive a tool call response, use the output to format an answer to the original user question.\n\nYou are a helpful assistant with tool calling capabilities.\n'},
   {'role': 'user',
    'content': 'You are a research assistant, and you use the search_corpus tool to find answers to questions.\nGiven a question, answer it using by doing searches using the search_corpus tool.\nTo use the search_corpus tool, respond with a JSON for a function call with its proper arguments.\n\nYou may also reason in any message, thinking step by step about how to answer the question. Wrap your reasoning in <reasoning> and </reasoning> tags.\n\n{\n  "type": "function",\n  "function": {\n    "name": "search_corpus",\n    "description": "Search over the knowledge corpus with a given query",\n    "parameters": {\n      "type": "object",\n      "properties": {\n        "query": {\n     

In [10]:
# eval w/o lora
def eval_generate_fn(inputs):
    return model.fast_generate(
        inputs,
        sampling_params=sampling_params,
    )


rl_helpers.run_eval(
    generate_fn=eval_generate_fn,
    verify_fn=reward_correctness,
    tokenizer=tokenizer,
)

Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 68/68 [00:10<00:00,  6.78it/s, est. speed input: 1968.67 toks/s, output: 878.99 toks/s]
Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 32/32 [00:08<00:00,  3.95it/s, est. speed input: 3104.79 toks/s, output: 369.96 toks/s]
Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:06<00:00,  1.29it/s, est. speed input: 1740.26 toks/s, output: 167.67 toks/s]
Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 6/6 [00:05<00:00,  1.11it/s, est. speed input: 2095.70 toks/s, output: 114.21 toks/s]
Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3/3 [00:03<00:00,  1.19s/it, est. speed input: 2076.54 toks/s, output: 74.54 toks/s]
Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 68/68 [00:01<00:00, 39.85it/s, est. speed input: 6465.53 toks/s, output: 158.31 toks/s]

RESULTS:
percentage of correct answers: 0.23529411764705882





[{'messages': [{'role': 'system',
    'content': 'Cutting Knowledge Date: December 2023\nToday Date: 18 Mar 2025\n\nWhen you receive a tool call response, use the output to format an answer to the original user question.\n\nYou are a helpful assistant with tool calling capabilities.\n'},
   {'role': 'user',
    'content': 'You are a research assistant, and you use the search_corpus tool to find answers to questions.\nGiven a question, answer it using by doing searches using the search_corpus tool.\nTo use the search_corpus tool, respond with a JSON for a function call with its proper arguments.\n\nYou may also reason in any message, thinking step by step about how to answer the question. Wrap your reasoning in <reasoning> and </reasoning> tags.\n\n{\n  "type": "function",\n  "function": {\n    "name": "search_corpus",\n    "description": "Search over the knowledge corpus with a given query",\n    "parameters": {\n      "type": "object",\n      "properties": {\n        "query": {\n     