diff --git a/notebooks/250329_saving_inference.ipynb b/notebooks/250329_saving_inference.ipynb
new file mode 100644
index 0000000..a6c5ca2
--- /dev/null
+++ b/notebooks/250329_saving_inference.ipynb
@@ -0,0 +1,243 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load finetuned model and say hello"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ✅ Utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from unsloth import FastLanguageModel\n",
+    "from vllm import SamplingParams\n",
+    "\n",
+    "\n",
+    "def load_model(\n",
+    "    # model_name=\"meta-llama/Llama-3.2-1B-Instruct\",\n",
+    "    model_name=\"meta-llama/meta-Llama-3.1-8B-Instruct\",\n",
+    "    lora_path=\"../trainer_output_meta-llama_Llama-3.1-8B-Instruct_gpu1_20250326_134236/checkpoint-101\",\n",
+    "    max_seq_length=8192,\n",
+    "):\n",
+    "    \"\"\"Load model and tokenizer with optional LoRA weights.\"\"\"\n",
+    "    # Load base model\n",
+    "    model, tokenizer = FastLanguageModel.from_pretrained(\n",
+    "        model_name=model_name,\n",
+    "        max_seq_length=max_seq_length,\n",
+    "        load_in_4bit=True,\n",
+    "        fast_inference=True,\n",
+    "        max_lora_rank=64,\n",
+    "        gpu_memory_utilization=0.6,\n",
+    "    )\n",
+    "\n",
+    "    # Setup LoRA if a path is provided\n",
+    "    if lora_path:\n",
+    "        model = FastLanguageModel.get_peft_model(\n",
+    "            model,\n",
+    "            r=64,\n",
+    "            target_modules=[\n",
+    "                \"q_proj\",\n",
+    "                \"k_proj\",\n",
+    "                \"v_proj\",\n",
+    "                \"o_proj\",\n",
+    "                \"gate_proj\",\n",
+    "                \"up_proj\",\n",
+    "                \"down_proj\",\n",
+    "            ],\n",
+    "            lora_alpha=64,\n",
+    "            use_gradient_checkpointing=True,\n",
+    "            random_state=3407,\n",
+    "        )\n",
+    "        model.load_lora(lora_path)\n",
+    "\n",
+    "    return model, tokenizer\n",
+    "\n",
+    "\n",
+    "def get_sampling_params(\n",
+    "    temperature=0.7,\n",
+    "    top_p=0.95,\n",
+    "    max_tokens=4096,\n",
+    "):\n",
+    "    \"\"\"Get sampling parameters for text generation.\"\"\"\n",
+    "    return SamplingParams(\n",
+    "        temperature=temperature,\n",
+    "        top_p=top_p,\n",
+    "        max_tokens=max_tokens,\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "def generate_response(prompt, model, tokenizer, sampling_params):\n",
+    "    \"\"\"Generate a response from the model.\"\"\"\n",
+    "    inputs = tokenizer.apply_chat_template(\n",
+    "        [{\"role\": \"user\", \"content\": prompt}],\n",
+    "        tokenize=False,\n",
+    "        add_generation_prompt=True,\n",
+    "    )\n",
+    "\n",
+    "    outputs = model.fast_generate([inputs], sampling_params=sampling_params)\n",
+    "\n",
+    "    # vLLM returns RequestOutput objects; fall back to plain strings otherwise\n",
+    "    if hasattr(outputs[0], \"outputs\"):\n",
+    "        response_text = outputs[0].outputs[0].text\n",
+    "    else:\n",
+    "        response_text = outputs[0]\n",
+    "\n",
+    "    return response_text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model, tokenizer = load_model()  # Using default hardcoded path\n",
+    "sampling_params = get_sampling_params()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
How are you?\", model, tokenizer, sampling_params)\n", + "print(\"\\nModel response:\")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Merge lora and save model to 16 bit, then test load inference\n", + "if False:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.save_pretrained_merged(\n", + " \"model\",\n", + " tokenizer,\n", + " save_method=\"merged_16bit\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ✅ Test load merged model 16bit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test load merged model\n", + "model, tokenizer = load_model(model_name=\"model\", lora_path=None)\n", + "sampling_params = get_sampling_params()\n", + "response = generate_response(\"Hi! How are you?\", model, tokenizer, sampling_params)\n", + "print(\"\\nModel response:\")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ❌ Save model to Ollama format\n", + "- bug no lllama-quantize on wsl, fix later\n", + "https://docs.unsloth.ai/basics/runninand-saving-models/saving-to-gguf\n", + "```bash\n", + "git clone https://github.com/ggml-org/llama.cpp\n", + "cd llama.cpp\n", + "cmake -B build\n", + "cmake --build build --config Release\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save to 8bit Q8_0\n", + "if True:\n", + " model.save_pretrained_gguf(\n", + " \"model-gguf\",\n", + " tokenizer,\n", + " )\n", + "# Remember to go to https://huggingface.co/settings/tokens for a token!\n", + "# And change hf to your username!\n", + "if False:\n", + " model.push_to_hub_gguf(\"hf/model\", tokenizer, token=\"\")\n", + "\n", + "# Save to 16bit GGUF\n", + "if False:\n", + " model.save_pretrained_gguf(\"model\", tokenizer, quantization_method=\"f16\")\n", + "if False:\n", + " model.push_to_hub_gguf(\"hf/model\", tokenizer, quantization_method=\"f16\", token=\"\")\n", + "\n", + "# Save to q4_k_m GGUF\n", + "if False:\n", + " model.save_pretrained_gguf(\"model\", tokenizer, quantization_method=\"q4_k_m\")\n", + "if False:\n", + " model.push_to_hub_gguf(\n", + " \"hf/model\", tokenizer, quantization_method=\"q4_k_m\", token=\"\"\n", + " )\n", + "\n", + "# Save to multiple GGUF options - much faster if you want multiple!\n", + "if False:\n", + " model.push_to_hub_gguf(\n", + " \"hf/model\", # Change hf to your username!\n", + " tokenizer,\n", + " quantization_method=[\n", + " \"q4_k_m\",\n", + " \"q8_0\",\n", + " \"q5_k_m\",\n", + " ],\n", + " token=\"\",\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "deepsearch-py311", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/250331_train_grpo_r1_distil.ipynb b/notebooks/250331_train_grpo_r1_distil.ipynb new file mode 100644 index 0000000..575d94c --- /dev/null +++ b/notebooks/250331_train_grpo_r1_distil.ipynb @@ -0,0 +1,213 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train R1 Distil\n", + "This notebook is for caching the model 
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "deepsearch-py311",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/250331_train_grpo_r1_distil.ipynb b/notebooks/250331_train_grpo_r1_distil.ipynb
new file mode 100644
index 0000000..575d94c
--- /dev/null
+++ b/notebooks/250331_train_grpo_r1_distil.ipynb
@@ -0,0 +1,213 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train R1 Distil\n",
+    "This notebook is for caching the model loading so that it doesn't take so long to reload every time I change the trainer source code."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "\n",
+    "sys.path.append(\"..\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "from unsloth import FastLanguageModel, is_bfloat16_supported\n",
+    "\n",
+    "import src.UnslothGRPOTrainerTemp as UnslothGRPOTrainerTemp\n",
+    "from src.config import (\n",
+    "    MODEL_CONFIG,\n",
+    "    MODEL_NAME,\n",
+    "    OUTPUT_DIR,\n",
+    "    TRAINING_CONFIG,\n",
+    "    get_sampling_params,\n",
+    "    init_training_dirs,\n",
+    "    logger,\n",
+    "    update_log_path,\n",
+    ")\n",
+    "\n",
+    "# Import reward functions\n",
+    "from src.rl_helpers_r1_distil import (\n",
+    "    build_reward_correctness_fn,\n",
+    "    get_qa_dataset,\n",
+    "    reward_exact_match_chunk_query,\n",
+    "    reward_formatting,\n",
+    "    reward_retry_behavior,\n",
+    "    run_agent,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize training directories\n",
+    "paths = init_training_dirs()\n",
+    "\n",
+    "# Update logger to use the training directory\n",
+    "update_log_path(paths[\"log_dir\"])\n",
+    "logger.info(f\"Training output directory: {paths['output_dir']}\")\n",
+    "logger.info(f\"Logs are being saved to both ./logs and {paths['log_dir']}\")\n",
+    "\n",
+    "\n",
+    "# Initialize model and tokenizer\n",
+    "logger.info(f\"Initializing model {MODEL_NAME}\")\n",
+    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+    "    model_name=MODEL_NAME,\n",
+    "    max_seq_length=MODEL_CONFIG[\"max_seq_length\"],\n",
+    "    load_in_4bit=True,  # False for LoRA 16bit\n",
+    "    fast_inference=True,  # Enable vLLM fast inference\n",
+    "    max_lora_rank=MODEL_CONFIG[\"lora_rank\"],\n",
+    "    gpu_memory_utilization=MODEL_CONFIG[\"gpu_memory_utilization\"],\n",
+    ")\n",
+    "\n",
+    "# Setup LoRA\n",
+    "logger.info(\"Setting up LoRA adapter\")\n",
+    "model = FastLanguageModel.get_peft_model(\n",
+    "    model,\n",
+    "    r=MODEL_CONFIG[\"lora_rank\"],\n",
+    "    target_modules=MODEL_CONFIG[\"target_modules\"],\n",
+    "    lora_alpha=MODEL_CONFIG[\"lora_rank\"],\n",
+    "    use_gradient_checkpointing=True,  # Enable long context finetuning\n",
+    "    random_state=3407,\n",
+    ")\n",
+    "\n",
+    "# Load datasets\n",
+    "logger.info(\"Loading datasets\")\n",
+    "train_dataset, test_dataset = get_qa_dataset()\n",
+    "logger.info(\n",
+    "    f\"Loaded {len(train_dataset)} training examples and {len(test_dataset)} test examples\"\n",
+    ")\n",
+    "\n",
+    "# Setup training arguments\n",
+    "logger.info(\"Setting up training arguments\")\n",
+    "training_args = UnslothGRPOTrainerTemp.UnslothGRPOConfig(\n",
+    "    use_vllm=True,  # use vLLM for fast inference!\n",
+    "    use_agentic_generate=True,  # use agentic generation\n",
+    "    **TRAINING_CONFIG,\n",
+    "    bf16=is_bfloat16_supported(),\n",
+    "    fp16=not is_bfloat16_supported(),\n",
+    "    output_dir=OUTPUT_DIR,\n",
+    "    # report_to=\"tensorboard\",  # ❓ check whether setting report_to here avoids creating billions of tensorboard files\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Setup model generation functions\n",
+    "def agentic_generate(\n",
+    "    prompts: list,\n",
+    "    generate_fn,\n",
+    "    max_generations: int = 10,\n",
+    "):\n",
+    "    return run_agent(generate_fn, tokenizer, prompts, max_generations)\n",
+    "\n",
+    "\n",
+    "model.agentic_generate = agentic_generate\n",
+    "\n",
+    "# Setup verifier\n",
+    "logger.info(\"Setting up verifier\")\n",
+    "verifier_sampling_params = get_sampling_params(temperature=0.1)\n",
+    "\n",
+    "\n",
+    "def verifier_generate_fn(inputs):\n",
+    "    return model.fast_generate(\n",
+    "        inputs,\n",
+    "        sampling_params=verifier_sampling_params,\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "# Setup trainer\n",
+    "logger.info(\"Initializing trainer\")\n",
+    "trainer = UnslothGRPOTrainerTemp.UnslothGRPOTrainer(\n",
+    "    model=model,\n",
+    "    processing_class=tokenizer,\n",
+    "    reward_funcs=[\n",
+    "        build_reward_correctness_fn(\n",
+    "            verifier_generate_fn,\n",
+    "            tokenizer,\n",
+    "            log_file=os.path.join(paths[\"log_dir\"], \"qa_log.txt\"),\n",
+    "        ),\n",
+    "        reward_formatting,\n",
+    "        reward_retry_behavior,\n",
+    "        reward_exact_match_chunk_query,\n",
+    "    ],\n",
+    "    args=training_args,\n",
+    "    train_dataset=train_dataset,\n",
+    ")\n",
+    "\n",
+    "print(\"Trainer initialized successfully! Starting training...\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Train the model\n",
+    "if __name__ == \"__main__\":\n",
+    "    logger.info(\"Starting training\")\n",
+    "    trainer.train()\n",
+    "    logger.info(\"Training completed\")\n",
+    "    logger.info(f\"Model saved to {OUTPUT_DIR}\")"
+   ]
+  },
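+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ❓ Optionally save the LoRA adapter (sketch)\n",
+    "Not part of the original flow, just an assumption: unsloth models patched for GRPO usually expose `save_lora`, which would let the adapter be reloaded with `load_lora` in the inference notebook. The path below is illustrative."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hedged sketch: persist the trained LoRA adapter for later load_lora() at inference.\n",
+    "# save_lora is assumed available on the unsloth-patched model; the path is illustrative.\n",
+    "model.save_lora(os.path.join(OUTPUT_DIR, \"final_lora\"))"
+   ]
+  }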
"outputs": [], + "source": [ + "# Initialize training directories\n", + "paths = init_training_dirs()\n", + "\n", + "# Update logger to use the training directory\n", + "update_log_path(paths[\"log_dir\"])\n", + "logger.info(f\"Training output directory: {paths['output_dir']}\")\n", + "logger.info(f\"Logs are being saved to both ./logs and {paths['log_dir']}\")\n", + "\n", + "\n", + "# Initialize model and tokenizer\n", + "logger.info(f\"Initializing model {MODEL_NAME}\")\n", + "model, tokenizer = FastLanguageModel.from_pretrained(\n", + " model_name=MODEL_NAME,\n", + " max_seq_length=MODEL_CONFIG[\"max_seq_length\"],\n", + " load_in_4bit=True, # False for LoRA 16bit\n", + " fast_inference=True, # Enable vLLM fast inference\n", + " max_lora_rank=MODEL_CONFIG[\"lora_rank\"],\n", + " gpu_memory_utilization=MODEL_CONFIG[\"gpu_memory_utilization\"],\n", + ")\n", + "\n", + "# Setup LoRA\n", + "logger.info(\"Setting up LoRA adapter\")\n", + "model = FastLanguageModel.get_peft_model(\n", + " model,\n", + " r=MODEL_CONFIG[\"lora_rank\"],\n", + " target_modules=MODEL_CONFIG[\"target_modules\"],\n", + " lora_alpha=MODEL_CONFIG[\"lora_rank\"],\n", + " use_gradient_checkpointing=True, # Enable long context finetuning\n", + " random_state=3407,\n", + ")\n", + "\n", + "# Load datasets\n", + "logger.info(\"Loading datasets\")\n", + "train_dataset, test_dataset = get_qa_dataset()\n", + "logger.info(\n", + " f\"Loaded {len(train_dataset)} training examples and {len(test_dataset)} test examples\"\n", + ")\n", + "\n", + "# Setup training arguments\n", + "logger.info(\"Setting up training arguments\")\n", + "training_args = UnslothGRPOTrainerTemp.UnslothGRPOConfig(\n", + " use_vllm=True, # use vLLM for fast inference!\n", + " use_agentic_generate=True, # use agentic generation\n", + " **TRAINING_CONFIG,\n", + " bf16=is_bfloat16_supported(),\n", + " fp16=not is_bfloat16_supported(),\n", + " output_dir=OUTPUT_DIR,\n", + " # report_to=\"tensorboard\", # ❓ Does't have billions of tensorboard files if set report to right here\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Setup model generation functions\n", + "def agentic_generate(\n", + " prompts: list,\n", + " generate_fn,\n", + " max_generations: int = 10,\n", + "):\n", + " return run_agent(generate_fn, tokenizer, prompts, max_generations)\n", + "\n", + "\n", + "model.agentic_generate = agentic_generate\n", + "\n", + "# Setup verifier\n", + "logger.info(\"Setting up verifier\")\n", + "verifier_sampling_params = get_sampling_params(temperature=0.1)\n", + "\n", + "\n", + "def verifier_generate_fn(inputs):\n", + " return model.fast_generate(\n", + " inputs,\n", + " sampling_params=verifier_sampling_params,\n", + " )\n", + "\n", + "\n", + "# Setup trainer\n", + "logger.info(\"Initializing trainer\")\n", + "trainer = UnslothGRPOTrainerTemp.UnslothGRPOTrainer(\n", + " model=model,\n", + " processing_class=tokenizer,\n", + " reward_funcs=[\n", + " build_reward_correctness_fn(\n", + " verifier_generate_fn,\n", + " tokenizer,\n", + " log_file=os.path.join(paths[\"log_dir\"], \"qa_log.txt\"),\n", + " ),\n", + " reward_formatting,\n", + " reward_retry_behavior,\n", + " reward_exact_match_chunk_query,\n", + " ],\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + ")\n", + "\n", + "print(\"Trainer initialized successfully! 
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Train the model\n",
+    "if __name__ == \"__main__\":\n",
+    "    logger.info(\"Starting training\")\n",
+    "    trainer.train()\n",
+    "    logger.info(\"Training completed\")\n",
+    "    logger.info(f\"Model saved to {OUTPUT_DIR}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "deepsearch-py311",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/Llama_3_1_8b_2x_faster_inference.ipynb b/notebooks/Llama_3_1_8b_2x_faster_inference.ipynb
new file mode 100644
index 0000000..4dc87ad
--- /dev/null
+++ b/notebooks/Llama_3_1_8b_2x_faster_inference.ipynb
@@ -0,0 +1,277 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "IqM-T1RTzY6C"
+   },
+   "source": [
\n", + "\n", + "To install Unsloth on your own computer, follow the installation instructions on our Github page [here](https://github.com/unslothai/unsloth#installation-instructions---conda)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2eSvM9zX_2d3" + }, + "outputs": [], + "source": [ + "%%capture\n", + "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n", + "!pip install unsloth\n", + "# Get latest Unsloth\n", + "!pip install --upgrade --no-deps \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "r2v_X2fA0Df5" + }, + "source": [ + "If you want to finetune Llama-3 2x faster and use 70% less VRAM, go to our [finetuning notebook](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing)!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 371, + "referenced_widgets": [ + "7ca2facea2414549ab2a0a3fd07a0723", + "78676eb3a75f45ed860b488a42fb9bc5", + "b69b75dd3b5745f4a7f869f7ba6593d4", + "ea10da0d677849c28bfdfc9206ac1b73", + "493f6a1af5e94b5eb88a4935361a88c3", + "1a4ef35565f54388afdf1839055eb5c8", + "07ae7e9f6861403ea8064a98f4063754", + "fc2ce36bb32f4c648ccb06d65a4e2aea", + "bf44ec9413b24dda932f575ebe32b222", + "2ad28d8e556b4e17b0859aa3683d31c6", + "e32220b1d3f14386834bcd3736ac9888", + "d9c5466214ed4f06876a89c64c29048f", + "1e0c170f075b43ad9e2d39fcdbaa0a06", + "00abd0bd091f4258b7f61d80ecb3118e", + "0f2c8641a38043d2a218ec0809e7a7d3", + "948972fa1a2e400097b590ad47ff6f3d", + "47a13a9b02ad4181b74cc1f5ddeb3f3f", + "7c01111484b14a5999d48393085b0264", + "fb1a288281fc446788fdd0f432b4732c", + "2f026e4c1ff34aafbf7c5deff6c1e66f", + "acc52213c033476885ee30fb92f55e23", + "9caec4d2cb6e4a72b7155912787d3139", + "2abdab3fdd4f4a70a8faaa7b0e39f811", + "4d432e4349964027bae76c34f5734fcf", + "630c170c04494811b21992d6886a3015", + "cb9a92b76a1d4b189e7ed783bde524c9", + "ccc49354213c4c1394855986a0e07483", + "7bf239e457f04e15b4b0e458f0dd1e19", + "f5c6558ef929438eb117159de0285a58", + "447aa145c09c4c6f8bd9ce3ea8b5cd44", + "9c82690bd07346d1ab875bdf5ed80154", + "5200dae3ba2346c1bca27530b82b38b4", + "c7a3b0cba7ee49ba929e0038d9c03e98", + "36250ac65fcb4756a4368f98902a1617", + "705a1673de934cce8418f7893fbf780a", + "1f7ed3fc3a824e119cf559af8bc7c370", + "cafc86be3bd94bc79cb07bc94461ec5d", + "0c190d903a38402ab452de0a0100a8eb", + "b3362bda7942432692049558d0cdf14e", + "4afb54f2995447e68ae3a8129325ffdc", + "31c7337bfd504409992b7e04b713c243", + "9b5a45647bd74e8c878cf6ba43b27190", + "b9c9010af82b4678af43a20546f3ad49", + "9b818c8e8bac40edb97abf8a266ab28e", + "ebaa8d22074d495ea675f51dc2d5a4d6", + "8fecfa968d55466ba8f42730d32bd9b4", + "dd0e8e4cb46945df8dbec4fefd720358", + "60a80b3316c14ce79945e06cb190be39", + "4f0e06a2e2204dad9f29a1697edd4218", + "66527058da914c73b48221011a2b605a", + "58413234f68b484693f0719166942214", + "84b3814386ee4d04b9817f0da2108725", + "01ede422dc5f4d4da7f913880db4984d", + "2e9f87f895344785855cca39bb64d5e2", + "5512e17f3864445a875e640937f296ef", + "932920c0ee834e3e8ffe53ebf7833fa7", + "cb65c93ecef84c2c947c67481326691c", + "73c3f04df8044f34ab45b92c174e121e", + "13b8be4a5af14bfdbbce24d185187906", + "21d8ebf7cbfd4e2b9db111d163aad8b8", + "3122a97298124d0ab0e6297960c132e2", + "cc829a40832849f586b077f6d8e5795b", + "3f5a8b6d049244959c1558c56483f643", + "11fea1d7e4a8426d98165fa5ca7fc650", + "fc3715b03c5249438965886892ba07bb", + 
"05d6f6c3efed4ecf9893c8aa86fe4176" + ] + }, + "id": "QmUBVEnvCDJv", + "outputId": "14ca09b9-e1ff-4f91-b98c-7a1ed22eef3a" + }, + "outputs": [], + "source": [ + "from unsloth import FastLanguageModel\n", + "\n", + "# 4bit pre quantized models we support for 4x faster downloading + no OOMs.\n", + "fourbit_models = [\n", + " \"unsloth/mistral-7b-instruct-v0.2-bnb-4bit\",\n", + " \"unsloth/gemma-7b-it-bnb-4bit\",\n", + "] # More models at https://huggingface.co/unsloth\n", + "\n", + "model, tokenizer = FastLanguageModel.from_pretrained(\n", + " model_name = \"unsloth/Meta-Llama-3.1-8B-Instruct\",\n", + " max_seq_length = 8192,\n", + " load_in_4bit = True,\n", + " # token = \"hf_...\", # use one if using gated models like meta-llama/Llama-2-7b-hf\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Yvwf0OEtchS_" + }, + "outputs": [], + "source": [ + "from transformers import TextStreamer\n", + "from unsloth.chat_templates import get_chat_template\n", + "tokenizer = get_chat_template(\n", + " tokenizer,\n", + " chat_template = \"llama-3.1\",\n", + " mapping = {\"role\" : \"from\", \"content\" : \"value\", \"user\" : \"human\", \"assistant\" : \"gpt\"}, # ShareGPT style\n", + ")\n", + "FastLanguageModel.for_inference(model) # Enable native 2x faster inference" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qTRSvafleB2Q" + }, + "source": [ + "Change the \"value\" part to call the model!\n", + "\n", + "Unsloth makes inference natively 2x faster!! No need to change or do anything!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e2pEuRb1r2Vg", + "outputId": "5cfe649f-cd72-469d-b73e-88748df49690" + }, + "outputs": [], + "source": [ + "messages = [\n", + " # EDIT HERE!\n", + " {\"from\": \"human\", \"value\": \"Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,\"},\n", + "]\n", + "inputs = tokenizer.apply_chat_template(messages, tokenize = True, add_generation_prompt = True, return_tensors = \"pt\").to(\"cuda\")\n", + "\n", + "text_streamer = TextStreamer(tokenizer)\n", + "_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 1024, use_cache = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yo7qGcu-cqCW", + "outputId": "9959df0e-e04b-4af0-c624-d640f896f441" + }, + "outputs": [], + "source": [ + "messages = [\n", + " {\"from\": \"human\", \"value\": \"Describe the tallest tower in the world.\"},\n", + "]\n", + "inputs = tokenizer.apply_chat_template(messages, tokenize = True, add_generation_prompt = True, return_tensors = \"pt\").to(\"cuda\")\n", + "\n", + "text_streamer = TextStreamer(tokenizer)\n", + "_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 1024, use_cache = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xmQcutr7cxmc", + "outputId": "2cb3f9af-7db6-40cf-9b21-e67a3f841adb" + }, + "outputs": [], + "source": [ + "messages = [\n", + " {\"from\": \"human\", \"value\": \"What is Unsloth?\"},\n", + "]\n", + "inputs = tokenizer.apply_chat_template(messages, tokenize = True, add_generation_prompt = True, return_tensors = \"pt\").to(\"cuda\")\n", + "\n", + "text_streamer = TextStreamer(tokenizer)\n", + "_ = model.generate(input_ids = inputs, streamer = text_streamer, 
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Zt9CHJqO6p30"
+   },
+   "source": [
+    "And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/u54VK8m8tk) channel! If you find any bugs, want to keep updated with the latest LLM stuff, need help, or want to join projects, feel free to join our Discord!\n",
+    "\n",
+    "If you want to finetune Llama-3 2x faster and use 70% less VRAM, go to our [finetuning notebook](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing)!\n",
+    "\n",
+    "Some other links:\n",
+    "1. Zephyr DPO 2x faster [free Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing)\n",
+    "2. Llama 7b 2x faster [free Colab](https://colab.research.google.com/drive/1lBzz5KeZJKXjvivbYvmGarix9Ao6Wxe5?usp=sharing)\n",
+    "3. TinyLlama 4x faster full Alpaca 52K in 1 hour [free Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing)\n",
+    "4. CodeLlama 34b 2x faster [A100 on Colab](https://colab.research.google.com/drive/1y7A0AxE3y8gdj4AVkl2aZX47Xu3P1wJT?usp=sharing)\n",
+    "5. Mistral 7b [free Kaggle version](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook)\n",
+    "6. We also did a [blog](https://huggingface.co/blog/unsloth-trl) with 🤗 HuggingFace, and we're in the TRL [docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth)!\n",
+    "7. Text completions like novel writing [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing)\n",
+    "8. Gemma 6 trillion tokens is 2.5x faster! [free Colab](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing)\n",
+    "\n",
+    ""
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "T4",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}