{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Anti Dumb Mask Inspection\n",
"\n",
"- Mask = 1 means these tokens ARE used to calculate the loss. These are the tokens we want the model to learn/predict (the assistant's response).\n",
"- Mask = 0 means these tokens are NOT used to calculate the loss. These are the tokens we don't want the model to learn/predict (system message, user input, markers, etc.)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Inspect Original Llama Autodidact"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect the loss mask produced by LlamaTokenizerAdapter for a sample conversation.\n",
"# Mask == 1 marks tokens that contribute to the loss; Mask == 0 marks tokens that do not.\n",
"import sys\n",
"\n",
"# Make the parent directory importable; guarded so re-running the cell does not append duplicates.\n",
"if \"..\" not in sys.path:\n",
"    sys.path.append(\"..\")\n",
"\n",
"import pandas as pd\n",
"from transformers import AutoTokenizer\n",
"\n",
"from src.tokenizer_adapter import LlamaTokenizerAdapter\n",
"\n",
"# Show every row and the full cell text so no token is truncated in the printed table.\n",
"pd.set_option(\"display.max_rows\", None)\n",
"pd.set_option(\"display.max_colwidth\", None)\n",
"\n",
"# Initialize\n",
"tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/meta-Llama-3.1-8B-Instruct\")\n",
"adapter = LlamaTokenizerAdapter()\n",
"\n",
"# Example conversation using chat template\n",
"chat = [\n",
"    {\n",
"        \"role\": \"system\",\n",
"        \"content\": \"You are a friendly chatbot who always responds in the style of a pirate\",\n",
"    },\n",
"    {\"role\": \"user\", \"content\": \"Hello, how are you?\"},\n",
"    {\"role\": \"assistant\", \"content\": \"I'm doing great. How can I help you today?\"},\n",
"    # NOTE(review): the original author reports that this role does not work in the chat template;\n",
"    # check whether the document text appears in the rendered output below.\n",
"    {\"role\": \"ipython\", \"content\": \"THIS IS THE DOCUMENT!!!\"},\n",
"    {\"role\": \"user\", \"content\": \"Hello, have you eanten?\"},\n",
"    {\"role\": \"assistant\", \"content\": \"No I'm hungry?\"},\n",
"]\n",
"\n",
"# Render the conversation to one string (tokenize=False returns text, not ids)\n",
"convo = tokenizer.apply_chat_template(chat, tokenize=False)\n",
"print(\"💬 Raw Chat Template Output:\")\n",
"print(f\"{'-' * 50}\\n{convo}\\n{'-' * 50}\\n\")\n",
"\n",
"# 1. Show text splitting\n",
"prompt, response = adapter.split_prompt_assistant(convo)\n",
"print(\"📝 Text Split:\")\n",
"print(f\"Prompt:\\n{'-' * 50}\\n{prompt}\\n{'-' * 50}\")\n",
"print(f\"Response:\\n{'-' * 50}\\n{response}\\n{'-' * 50}\\n\")\n",
"\n",
"# 2. Get tokens and mask\n",
"encoding = tokenizer(convo, add_special_tokens=False)\n",
"input_ids = encoding.input_ids\n",
"tokens = tokenizer.convert_ids_to_tokens(input_ids)\n",
"mask = adapter.get_mask(convo, tokenizer)\n",
"\n",
"# The table below pairs mask entries with tokens by position, so the lengths must agree.\n",
"assert len(mask) == len(tokens), (\n",
"    f\"Mask length {len(mask)} does not match token count {len(tokens)}\"\n",
")\n",
"\n",
"# 3. Create detailed view (one row per token)\n",
"df = pd.DataFrame(\n",
"    {\n",
"        \"Position\": range(len(tokens)),\n",
"        \"Token ID\": input_ids,\n",
"        \"Token\": tokens,\n",
"        # Decode each id on its own to see the text a single token contributes.\n",
"        \"Text\": [tokenizer.decode([token_id]) for token_id in input_ids],\n",
"        \"Mask\": mask.tolist(),\n",
"    }\n",
")\n",
"\n",
"print(\"🔍 Token Analysis:\")\n",
"print(df.to_string(index=False))\n",
"\n",
"# 4. Quick Stats\n",
"num_ones = mask.sum().item()\n",
"print(\"\\n📊 Quick Stats:\")\n",
"print(f\"Total tokens: {len(tokens)}\")\n",
"print(f\"Masked tokens (1s): {num_ones}\")\n",
"print(f\"Unmasked tokens (0s): {len(mask) - num_ones}\")\n",
"\n",
"# 5. Show masked content only (rows whose Mask is 1)\n",
"print(\"\\n🎯 Masked Content (Response):\")\n",
"masked_df = df[df[\"Mask\"] == 1]\n",
"print(masked_df.to_string(index=False))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ❌ Inspect R1-Distill (role = ipython didn't work)\n",
"The document content went missing."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect the loss mask produced by R1DistilTokenizerAdapter for a sample conversation.\n",
"# Mask == 1 marks tokens that contribute to the loss; Mask == 0 marks tokens that do not.\n",
"import sys\n",
"\n",
"# Make the parent directory importable; guarded so re-running the cell does not append duplicates.\n",
"if \"..\" not in sys.path:\n",
"    sys.path.append(\"..\")\n",
"\n",
"import pandas as pd\n",
"from transformers import AutoTokenizer\n",
"\n",
"from src.tokenizer_adapter import R1DistilTokenizerAdapter\n",
"\n",
"# Show every row and the full cell text so no token is truncated in the printed table.\n",
"pd.set_option(\"display.max_rows\", None)\n",
"pd.set_option(\"display.max_colwidth\", None)\n",
"\n",
"# Initialize with R1-Distil\n",
"tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/deepseek-R1-Distill-Qwen-1.5B\")\n",
"adapter = R1DistilTokenizerAdapter()\n",
"\n",
"# Example conversation using R1-Distil format\n",
"chat = [\n",
"    {\n",
"        \"role\": \"system\",\n",
"        \"content\": \"You are a friendly chatbot who always responds in the style of a pirate\",\n",
"    },\n",
"    {\"role\": \"user\", \"content\": \"Hello, how are you?\"},\n",
"    {\"role\": \"assistant\", \"content\": \"I'm doing great. How can I help you today?\"},\n",
"    # NOTE: per this section's heading, the \"ipython\" role did not work with this template\n",
"    # (the document content went missing from the rendered conversation).\n",
"    {\"role\": \"ipython\", \"content\": \"THIS IS THE DOCUMENT!!!\"},\n",
"    {\"role\": \"user\", \"content\": \"Hello, have you eanten?\"},\n",
"    {\"role\": \"assistant\", \"content\": \"No I'm hungry?\"},\n",
"]\n",
"\n",
"# Render the conversation to one string (tokenize=False returns text, not ids)\n",
"convo = tokenizer.apply_chat_template(chat, tokenize=False)\n",
"print(\"💬 Raw Chat Template Output:\")\n",
"print(f\"{'-' * 50}\\n{convo}\\n{'-' * 50}\\n\")\n",
"\n",
"# 1. Show text splitting\n",
"prompt, response = adapter.split_prompt_assistant(convo)\n",
"print(\"📝 Text Split:\")\n",
"print(f\"Prompt:\\n{'-' * 50}\\n{prompt}\\n{'-' * 50}\")\n",
"print(f\"Response:\\n{'-' * 50}\\n{response}\\n{'-' * 50}\\n\")\n",
"\n",
"# 2. Get tokens and mask\n",
"encoding = tokenizer(convo, add_special_tokens=False)\n",
"input_ids = encoding.input_ids\n",
"tokens = tokenizer.convert_ids_to_tokens(input_ids)\n",
"mask = adapter.get_mask(convo, tokenizer)\n",
"\n",
"# The table below pairs mask entries with tokens by position, so the lengths must agree.\n",
"assert len(mask) == len(tokens), (\n",
"    f\"Mask length {len(mask)} does not match token count {len(tokens)}\"\n",
")\n",
"\n",
"# 3. Create detailed view (one row per token)\n",
"df = pd.DataFrame(\n",
"    {\n",
"        \"Position\": range(len(tokens)),\n",
"        \"Token\": tokens,\n",
"        \"Token ID\": input_ids,\n",
"        # Decode each id on its own to see the text a single token contributes.\n",
"        \"Text\": [tokenizer.decode([token_id]) for token_id in input_ids],\n",
"        \"Mask\": mask.tolist(),\n",
"    }\n",
")\n",
"\n",
"print(\"🔍 Token Analysis:\")\n",
"print(df.to_string(index=False))\n",
"\n",
"# 4. Quick Stats\n",
"num_ones = mask.sum().item()\n",
"print(\"\\n📊 Quick Stats:\")\n",
"print(f\"Total tokens: {len(tokens)}\")\n",
"print(f\"Masked tokens (1s): {num_ones}\")\n",
"print(f\"Unmasked tokens (0s): {len(mask) - num_ones}\")\n",
"\n",
"# 5. Show masked content only (rows whose Mask is 1)\n",
"print(\"\\n🎯 Masked Content (Response):\")\n",
"masked_df = df[df[\"Mask\"] == 1]\n",
"print(masked_df.to_string(index=False))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Inspect R1-Distill: just append the search text as a string?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect the loss mask from R1DistilTokenizerAdapter when extra text is appended\n",
"# to the rendered conversation as a plain string instead of being passed as a chat message.\n",
"# Mask == 1 marks tokens that contribute to the loss; Mask == 0 marks tokens that do not.\n",
"import sys\n",
"\n",
"# Make the parent directory importable; guarded so re-running the cell does not append duplicates.\n",
"if \"..\" not in sys.path:\n",
"    sys.path.append(\"..\")\n",
"\n",
"import pandas as pd\n",
"from transformers import AutoTokenizer\n",
"\n",
"from src.tokenizer_adapter import R1DistilTokenizerAdapter\n",
"\n",
"# Show every row and the full cell text so no token is truncated in the printed table.\n",
"pd.set_option(\"display.max_rows\", None)\n",
"pd.set_option(\"display.max_colwidth\", None)\n",
"\n",
"# Initialize with R1-Distil\n",
"tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/deepseek-R1-Distill-Qwen-1.5B\")\n",
"adapter = R1DistilTokenizerAdapter()\n",
"\n",
"# Example conversation using R1-Distil format.\n",
"# No \"ipython\" role message is included here; the extra text is appended to the\n",
"# rendered string further down instead.\n",
"chat = [\n",
"    {\n",
"        \"role\": \"system\",\n",
"        \"content\": \"You are a friendly chatbot who always responds in the style of a pirate\",\n",
"    },\n",
"    {\"role\": \"user\", \"content\": \"Hello, how are you?\"},\n",
"    {\"role\": \"assistant\", \"content\": \"I'm doing great. How can I help you today?\"},\n",
"    {\"role\": \"user\", \"content\": \"Hello, have you eanten?\"},\n",
"    {\"role\": \"assistant\", \"content\": \"No I'm hungry?\"},\n",
"]\n",
"\n",
"# Render the conversation to one string (tokenize=False returns text, not ids),\n",
"# which is why plain string concatenation works below.\n",
"convo = tokenizer.apply_chat_template(chat, tokenize=False)\n",
"\n",
"# Build the extra text and append it after the templated conversation.\n",
"think_and_search = \"I love cats\\nCat images\\n\"\n",
"search_results = \"Here are some cat images: cat1 cat2 cat3\"\n",
"search_template = \"\\n\\n{think_and_search}{search_results}\\n\\n\"\n",
"search_text = search_template.format(think_and_search=think_and_search, search_results=search_results)\n",
"\n",
"convo = convo + search_text\n",
"\n",
"print(\"💬 Raw Chat Template Output:\")\n",
"print(f\"{'-' * 50}\\n{convo}\\n{'-' * 50}\\n\")\n",
"\n",
"# 1. Show text splitting\n",
"prompt, response = adapter.split_prompt_assistant(convo)\n",
"print(\"📝 Text Split:\")\n",
"print(f\"Prompt:\\n{'-' * 50}\\n{prompt}\\n{'-' * 50}\")\n",
"print(f\"Response:\\n{'-' * 50}\\n{response}\\n{'-' * 50}\\n\")\n",
"\n",
"# 2. Get tokens and mask\n",
"encoding = tokenizer(convo, add_special_tokens=False)\n",
"input_ids = encoding.input_ids\n",
"tokens = tokenizer.convert_ids_to_tokens(input_ids)\n",
"mask = adapter.get_mask(convo, tokenizer)\n",
"\n",
"# The table below pairs mask entries with tokens by position, so the lengths must agree.\n",
"assert len(mask) == len(tokens), (\n",
"    f\"Mask length {len(mask)} does not match token count {len(tokens)}\"\n",
")\n",
"\n",
"# 3. Create detailed view (one row per token)\n",
"df = pd.DataFrame(\n",
"    {\n",
"        \"Position\": range(len(tokens)),\n",
"        \"Token\": tokens,\n",
"        \"Token ID\": input_ids,\n",
"        # Decode each id on its own to see the text a single token contributes.\n",
"        \"Text\": [tokenizer.decode([token_id]) for token_id in input_ids],\n",
"        \"Mask\": mask.tolist(),\n",
"    }\n",
")\n",
"\n",
"print(\"🔍 Token Analysis:\")\n",
"print(df.to_string(index=False))\n",
"\n",
"# 4. Quick Stats\n",
"num_ones = mask.sum().item()\n",
"print(\"\\n📊 Quick Stats:\")\n",
"print(f\"Total tokens: {len(tokens)}\")\n",
"print(f\"Masked tokens (1s): {num_ones}\")\n",
"print(f\"Unmasked tokens (0s): {len(mask) - num_ones}\")\n",
"\n",
"# 5. Show masked content only (rows whose Mask is 1)\n",
"print(\"\\n🎯 Masked Content (Response):\")\n",
"masked_df = df[df[\"Mask\"] == 1]\n",
"print(masked_df.to_string(index=False))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Inspect Qwen 2.5 Instruct"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect the loss mask produced by QwenTokenizerAdapter for a sample conversation.\n",
"# Mask == 1 marks tokens that contribute to the loss; Mask == 0 marks tokens that do not.\n",
"import sys\n",
"\n",
"# Make the parent directory importable; guarded so re-running the cell does not append duplicates.\n",
"if \"..\" not in sys.path:\n",
"    sys.path.append(\"..\")\n",
"\n",
"import pandas as pd\n",
"from transformers import AutoTokenizer\n",
"\n",
"from src.tokenizer_adapter import QwenTokenizerAdapter\n",
"\n",
"# Show every row and the full cell text so no token is truncated in the printed table.\n",
"pd.set_option(\"display.max_rows\", None)\n",
"pd.set_option(\"display.max_colwidth\", None)\n",
"\n",
"# Initialize\n",
"tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen2.5-1.5B-Instruct\")\n",
"adapter = QwenTokenizerAdapter()\n",
"\n",
"# Example conversation using chat template\n",
"chat = [\n",
"    {\n",
"        \"role\": \"system\",\n",
"        \"content\": \"You are a friendly chatbot who always responds in the style of a pirate\",\n",
"    },\n",
"    {\"role\": \"user\", \"content\": \"Hello, how are you?\"},\n",
"    {\"role\": \"assistant\", \"content\": \"I'm doing great. How can I help you today?\"},\n",
"    # NOTE(review): the original author reports that this role does not work in the chat template;\n",
"    # check whether the document text appears in the rendered output below.\n",
"    {\"role\": \"ipython\", \"content\": \"THIS IS THE DOCUMENT!!!\"},\n",
"    {\"role\": \"user\", \"content\": \"Hello, have you eanten?\"},\n",
"    {\"role\": \"assistant\", \"content\": \"No I'm hungry?\"},\n",
"]\n",
"\n",
"# Render the conversation to one string (tokenize=False returns text, not ids)\n",
"convo = tokenizer.apply_chat_template(chat, tokenize=False)\n",
"print(\"💬 Raw Chat Template Output:\")\n",
"print(f\"{'-' * 50}\\n{convo}\\n{'-' * 50}\\n\")\n",
"\n",
"# 1. Show text splitting\n",
"prompt, response = adapter.split_prompt_assistant(convo)\n",
"print(\"📝 Text Split:\")\n",
"print(f\"Prompt:\\n{'-' * 50}\\n{prompt}\\n{'-' * 50}\")\n",
"print(f\"Response:\\n{'-' * 50}\\n{response}\\n{'-' * 50}\\n\")\n",
"\n",
"# 2. Get tokens and mask\n",
"encoding = tokenizer(convo, add_special_tokens=False)\n",
"input_ids = encoding.input_ids\n",
"tokens = tokenizer.convert_ids_to_tokens(input_ids)\n",
"mask = adapter.get_mask(convo, tokenizer)\n",
"\n",
"# The table below pairs mask entries with tokens by position, so the lengths must agree.\n",
"assert len(mask) == len(tokens), (\n",
"    f\"Mask length {len(mask)} does not match token count {len(tokens)}\"\n",
")\n",
"\n",
"# 3. Create detailed view (one row per token)\n",
"df = pd.DataFrame(\n",
"    {\n",
"        \"Position\": range(len(tokens)),\n",
"        \"Token ID\": input_ids,\n",
"        \"Token\": tokens,\n",
"        # Decode each id on its own to see the text a single token contributes.\n",
"        \"Text\": [tokenizer.decode([token_id]) for token_id in input_ids],\n",
"        \"Mask\": mask.tolist(),\n",
"    }\n",
")\n",
"\n",
"print(\"🔍 Token Analysis:\")\n",
"print(df.to_string(index=False))\n",
"\n",
"# 4. Quick Stats\n",
"num_ones = mask.sum().item()\n",
"print(\"\\n📊 Quick Stats:\")\n",
"print(f\"Total tokens: {len(tokens)}\")\n",
"print(f\"Masked tokens (1s): {num_ones}\")\n",
"print(f\"Unmasked tokens (0s): {len(mask) - num_ones}\")\n",
"\n",
"# 5. Show masked content only (rows whose Mask is 1)\n",
"print(\"\\n🎯 Masked Content (Response):\")\n",
"masked_df = df[df[\"Mask\"] == 1]\n",
"print(masked_df.to_string(index=False))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deepsearch-py311",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}