ReZero-Search-LLM-Agent-Fork/notebooks/250402_inspect_mask.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Anti Dumb Mask Inspection\n",
    "\n",
    "- Mask = 1 means these tokens ARE used to calculate the loss. These are the tokens we want the model to learn/predict (the assistant's response).\n",
    "- Mask = 0 means these tokens are NOT used to calculate the loss. These are the tokens we don't want the model to learn/predict (system message, user input, markers, etc.)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Inspect Original Llama Autodidact"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports\n",
    "import sys\n",
    "\n",
    "# Make the parent directory importable so `src` resolves. The guard keeps\n",
    "# repeated runs of this cell from appending duplicate sys.path entries.\n",
    "if \"..\" not in sys.path:\n",
    "    sys.path.append(\"..\")\n",
    "\n",
    "from transformers import AutoTokenizer\n",
    "from src.tokenizer_adapter import LlamaTokenizerAdapter\n",
    "import pandas as pd\n",
    "\n",
    "# Lift pandas display limits so no row or cell text is truncated.\n",
    "pd.set_option(\"display.max_rows\", None)\n",
    "pd.set_option(\"display.max_colwidth\", None)\n",
    "\n",
    "# Initialize\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/meta-Llama-3.1-8B-Instruct\")\n",
    "adapter = LlamaTokenizerAdapter()\n",
    "\n",
    "# Example conversation using chat template\n",
    "chat = [\n",
    "    {\n",
    "        \"role\": \"system\",\n",
    "        \"content\": \"You are a friendly chatbot who always responds in the style of a pirate\",\n",
    "    },\n",
    "    {\"role\": \"user\", \"content\": \"Hello, how are you?\"},\n",
    "    {\"role\": \"assistant\", \"content\": \"I'm doing great. How can I help you today?\"},\n",
    "    {\"role\": \"ipython\", \"content\": \"THIS IS THE DOCUMENT!!!\"},  # NOTE: author reports this role does not work in the chat template\n",
    "    {\"role\": \"user\", \"content\": \"Hello, have you eanten?\"},\n",
    "    {\"role\": \"assistant\", \"content\": \"No I'm hungry?\"},\n",
    "]\n",
    "\n",
    "# Render the conversation to a single string (tokenize=False returns text, not ids).\n",
    "convo = tokenizer.apply_chat_template(chat, tokenize=False)\n",
    "print(\"💬 Raw Chat Template Output:\")\n",
    "print(f\"{'-' * 50}\\n{convo}\\n{'-' * 50}\\n\")\n",
    "\n",
    "# 1. Show how the adapter splits the rendered text into prompt and response.\n",
    "prompt, response = adapter.split_prompt_assistant(convo)\n",
    "print(\"🔍 Text Split:\")\n",
    "print(f\"Prompt:\\n{'-' * 50}\\n{prompt}\\n{'-' * 50}\")\n",
    "print(f\"Response:\\n{'-' * 50}\\n{response}\\n{'-' * 50}\\n\")\n",
    "\n",
    "# 2. Get tokens and mask\n",
    "# add_special_tokens=False: convo is already template-rendered text, so presumably\n",
    "# its special markers are in the string and must not be added again (TODO confirm).\n",
    "encoding = tokenizer(convo, add_special_tokens=False)\n",
    "input_ids = encoding.input_ids\n",
    "tokens = tokenizer.convert_ids_to_tokens(input_ids)\n",
    "mask = adapter.get_mask(convo, tokenizer)\n",
    "\n",
    "# The table below pairs each token with a mask entry by position, so both must\n",
    "# have the same length; fail here with a readable message instead of inside pandas.\n",
    "assert len(mask) == len(tokens), (\n",
    "    f\"mask length {len(mask)} != token count {len(tokens)}\"\n",
    ")\n",
    "\n",
    "# 3. Create detailed view (one row per token)\n",
    "df = pd.DataFrame(\n",
    "    {\n",
    "        \"Position\": range(len(tokens)),\n",
    "        \"Token ID\": input_ids,\n",
    "        \"Token\": tokens,\n",
    "        # Decode ids one at a time to see the text each token contributes.\n",
    "        \"Text\": [tokenizer.decode([token_id]) for token_id in input_ids],\n",
    "        \"Mask\": mask.tolist(),\n",
    "    }\n",
    ")\n",
    "\n",
    "print(\"📊 Token Analysis:\")\n",
    "print(df.to_string(index=False))\n",
    "\n",
    "# 4. Quick Stats\n",
    "# Sum of the mask entries, computed once and reused for both counts.\n",
    "mask_total = mask.sum().item()\n",
    "print(\"\\n📈 Quick Stats:\")\n",
    "print(f\"Total tokens: {len(tokens)}\")\n",
    "print(f\"Masked tokens (1s): {mask_total}\")\n",
    "print(f\"Unmasked tokens (0s): {len(mask) - mask_total}\")\n",
    "\n",
    "# 5. Show only the rows whose mask value is 1\n",
    "print(\"\\n🎯 Masked Content (Response):\")\n",
    "masked_df = df[df[\"Mask\"] == 1]\n",
    "print(masked_df.to_string(index=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ❌ Inspect R1-Distill (role = ipython didn't work)\n",
    "The document content (the `ipython` turn) goes missing from the rendered conversation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports\n",
    "import sys\n",
    "\n",
    "# Make the parent directory importable so `src` resolves. The guard keeps\n",
    "# repeated runs of this cell from appending duplicate sys.path entries.\n",
    "if \"..\" not in sys.path:\n",
    "    sys.path.append(\"..\")\n",
    "\n",
    "from transformers import AutoTokenizer\n",
    "from src.tokenizer_adapter import R1DistilTokenizerAdapter\n",
    "import pandas as pd\n",
    "\n",
    "# Lift pandas display limits so no row or cell text is truncated.\n",
    "pd.set_option(\"display.max_rows\", None)\n",
    "pd.set_option(\"display.max_colwidth\", None)\n",
    "\n",
    "# Initialize with R1-Distil\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/deepseek-R1-Distill-Qwen-1.5B\")\n",
    "adapter = R1DistilTokenizerAdapter()\n",
    "\n",
    "# Example conversation using R1-Distil format\n",
    "chat = [\n",
    "    {\n",
    "        \"role\": \"system\",\n",
    "        \"content\": \"You are a friendly chatbot who always responds in the style of a pirate\",\n",
    "    },\n",
    "    {\"role\": \"user\", \"content\": \"Hello, how are you?\"},\n",
    "    {\"role\": \"assistant\", \"content\": \"I'm doing great. How can I help you today?\"},\n",
    "    {\"role\": \"ipython\", \"content\": \"THIS IS THE DOCUMENT!!!\"},  # NOTE: this role is not rendered by the chat template (content goes missing)\n",
    "    {\"role\": \"user\", \"content\": \"Hello, have you eanten?\"},\n",
    "    {\"role\": \"assistant\", \"content\": \"No I'm hungry?\"},\n",
    "]\n",
    "\n",
    "# Render the conversation to a single string (tokenize=False returns text, not ids).\n",
    "convo = tokenizer.apply_chat_template(chat, tokenize=False)\n",
    "print(\"💬 Raw Chat Template Output:\")\n",
    "print(f\"{'-' * 50}\\n{convo}\\n{'-' * 50}\\n\")\n",
    "\n",
    "# 1. Show how the adapter splits the rendered text into prompt and response.\n",
    "prompt, response = adapter.split_prompt_assistant(convo)\n",
    "print(\"🔍 Text Split:\")\n",
    "print(f\"Prompt:\\n{'-' * 50}\\n{prompt}\\n{'-' * 50}\")\n",
    "print(f\"Response:\\n{'-' * 50}\\n{response}\\n{'-' * 50}\\n\")\n",
    "\n",
    "# 2. Get tokens and mask\n",
    "# add_special_tokens=False: convo is already template-rendered text, so presumably\n",
    "# its special markers are in the string and must not be added again (TODO confirm).\n",
    "encoding = tokenizer(convo, add_special_tokens=False)\n",
    "input_ids = encoding.input_ids\n",
    "tokens = tokenizer.convert_ids_to_tokens(input_ids)\n",
    "mask = adapter.get_mask(convo, tokenizer)\n",
    "\n",
    "# The table below pairs each token with a mask entry by position, so both must\n",
    "# have the same length; fail here with a readable message instead of inside pandas.\n",
    "assert len(mask) == len(tokens), (\n",
    "    f\"mask length {len(mask)} != token count {len(tokens)}\"\n",
    ")\n",
    "\n",
    "# 3. Create detailed view (one row per token)\n",
    "df = pd.DataFrame(\n",
    "    {\n",
    "        \"Position\": range(len(tokens)),\n",
    "        \"Token\": tokens,\n",
    "        \"Token ID\": input_ids,\n",
    "        # Decode ids one at a time to see the text each token contributes.\n",
    "        \"Text\": [tokenizer.decode([token_id]) for token_id in input_ids],\n",
    "        \"Mask\": mask.tolist(),\n",
    "    }\n",
    ")\n",
    "\n",
    "print(\"📊 Token Analysis:\")\n",
    "print(df.to_string(index=False))\n",
    "\n",
    "# 4. Quick Stats\n",
    "# Sum of the mask entries, computed once and reused for both counts.\n",
    "mask_total = mask.sum().item()\n",
    "print(\"\\n📈 Quick Stats:\")\n",
    "print(f\"Total tokens: {len(tokens)}\")\n",
    "print(f\"Masked tokens (1s): {mask_total}\")\n",
    "print(f\"Unmasked tokens (0s): {len(mask) - mask_total}\")\n",
    "\n",
    "# 5. Show only the rows whose mask value is 1\n",
    "print(\"\\n🎯 Masked Content (Response):\")\n",
    "masked_df = df[df[\"Mask\"] == 1]\n",
    "print(masked_df.to_string(index=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Inspect R1-Distill: just append the search text as a plain string?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports\n",
    "import sys\n",
    "\n",
    "# Make the parent directory importable so `src` resolves. The guard keeps\n",
    "# repeated runs of this cell from appending duplicate sys.path entries.\n",
    "if \"..\" not in sys.path:\n",
    "    sys.path.append(\"..\")\n",
    "\n",
    "from transformers import AutoTokenizer\n",
    "from src.tokenizer_adapter import R1DistilTokenizerAdapter\n",
    "import pandas as pd\n",
    "\n",
    "# Lift pandas display limits so no row or cell text is truncated.\n",
    "pd.set_option(\"display.max_rows\", None)\n",
    "pd.set_option(\"display.max_colwidth\", None)\n",
    "\n",
    "# Initialize with R1-Distil\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/deepseek-R1-Distill-Qwen-1.5B\")\n",
    "adapter = R1DistilTokenizerAdapter()\n",
    "\n",
    "# Example conversation using R1-Distil format\n",
    "chat = [\n",
    "    {\n",
    "        \"role\": \"system\",\n",
    "        \"content\": \"You are a friendly chatbot who always responds in the style of a pirate\",\n",
    "    },\n",
    "    {\"role\": \"user\", \"content\": \"Hello, how are you?\"},\n",
    "    {\"role\": \"assistant\", \"content\": \"I'm doing great. How can I help you today?\"},\n",
    "    # {\"role\": \"ipython\", \"content\": \"THIS IS THE DOCUMENT!!!\"},  # disabled: this role does not work in the chat template\n",
    "    {\"role\": \"user\", \"content\": \"Hello, have you eanten?\"},\n",
    "    {\"role\": \"assistant\", \"content\": \"No I'm hungry?\"},\n",
    "]\n",
    "\n",
    "# Render the conversation to a single string. With tokenize=False the result is\n",
    "# a plain str, so extra text can be concatenated onto it directly.\n",
    "convo = tokenizer.apply_chat_template(chat, tokenize=False)\n",
    "\n",
    "# Build a sample think/search/information exchange as raw text.\n",
    "think_and_search = \"<think>I love cats</think>\\n<search>Cat images</search>\\n\"\n",
    "search_results = \"Here are some cat images: cat1 cat2 cat3\"\n",
    "search_template = \"\\n\\n{think_and_search}<information>{search_results}</information>\\n\\n\"\n",
    "search_text = search_template.format(think_and_search=think_and_search, search_results=search_results)\n",
    "\n",
    "# Append it after the rendered conversation instead of using a chat role.\n",
    "convo = convo + search_text\n",
    "\n",
    "print(\"💬 Raw Chat Template Output:\")\n",
    "print(f\"{'-' * 50}\\n{convo}\\n{'-' * 50}\\n\")\n",
    "\n",
    "# 1. Show how the adapter splits the combined text into prompt and response.\n",
    "prompt, response = adapter.split_prompt_assistant(convo)\n",
    "print(\"🔍 Text Split:\")\n",
    "print(f\"Prompt:\\n{'-' * 50}\\n{prompt}\\n{'-' * 50}\")\n",
    "print(f\"Response:\\n{'-' * 50}\\n{response}\\n{'-' * 50}\\n\")\n",
    "\n",
    "# 2. Get tokens and mask\n",
    "# add_special_tokens=False: convo is already template-rendered text, so presumably\n",
    "# its special markers are in the string and must not be added again (TODO confirm).\n",
    "encoding = tokenizer(convo, add_special_tokens=False)\n",
    "input_ids = encoding.input_ids\n",
    "tokens = tokenizer.convert_ids_to_tokens(input_ids)\n",
    "mask = adapter.get_mask(convo, tokenizer)\n",
    "\n",
    "# The table below pairs each token with a mask entry by position, so both must\n",
    "# have the same length; fail here with a readable message instead of inside pandas.\n",
    "assert len(mask) == len(tokens), (\n",
    "    f\"mask length {len(mask)} != token count {len(tokens)}\"\n",
    ")\n",
    "\n",
    "# 3. Create detailed view (one row per token)\n",
    "df = pd.DataFrame(\n",
    "    {\n",
    "        \"Position\": range(len(tokens)),\n",
    "        \"Token\": tokens,\n",
    "        \"Token ID\": input_ids,\n",
    "        # Decode ids one at a time to see the text each token contributes.\n",
    "        \"Text\": [tokenizer.decode([token_id]) for token_id in input_ids],\n",
    "        \"Mask\": mask.tolist(),\n",
    "    }\n",
    ")\n",
    "\n",
    "print(\"📊 Token Analysis:\")\n",
    "print(df.to_string(index=False))\n",
    "\n",
    "# 4. Quick Stats\n",
    "# Sum of the mask entries, computed once and reused for both counts.\n",
    "mask_total = mask.sum().item()\n",
    "print(\"\\n📈 Quick Stats:\")\n",
    "print(f\"Total tokens: {len(tokens)}\")\n",
    "print(f\"Masked tokens (1s): {mask_total}\")\n",
    "print(f\"Unmasked tokens (0s): {len(mask) - mask_total}\")\n",
    "\n",
    "# 5. Show only the rows whose mask value is 1\n",
    "print(\"\\n🎯 Masked Content (Response):\")\n",
    "masked_df = df[df[\"Mask\"] == 1]\n",
    "print(masked_df.to_string(index=False))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "deepsearch-py311",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}