You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
306 lines
11 KiB
306 lines
11 KiB
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Anti Dumb Mask Inspection\n",
|
|
"\n",
|
|
"- Mask = 1 means these tokens ARE used to calculate the loss. These are the tokens we want the model to learn/predict (the assistant's response).\n",
|
|
"- Mask = 0 means these tokens are NOT used to calculate the loss. These are the tokens we don't want the model to learn/predict (system message, user input, markers, etc.)."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Inspect Original Llama Autodidact"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Imports\n",
|
|
"import sys\n",
|
|
"\n",
|
|
"sys.path.append(\"..\")\n",
|
|
"\n",
|
|
"from transformers import AutoTokenizer\n",
|
|
"from src.tokenizer_adapter import LlamaTokenizerAdapter\n",
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"pd.set_option(\"display.max_rows\", None)\n",
|
|
"pd.set_option(\"display.max_colwidth\", None)\n",
|
|
"\n",
|
|
"# Initialize\n",
|
|
"tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/meta-Llama-3.1-8B-Instruct\")\n",
|
|
"adapter = LlamaTokenizerAdapter()\n",
|
|
"\n",
|
|
"# Example conversation using chat template\n",
|
|
"chat = [\n",
|
|
" {\n",
|
|
" \"role\": \"system\",\n",
|
|
" \"content\": \"You are a friendly chatbot who always responds in the style of a pirate\",\n",
|
|
" },\n",
|
|
" {\"role\": \"user\", \"content\": \"Hello, how are you?\"},\n",
|
|
" {\"role\": \"assistant\", \"content\": \"I'm doing great. How can I help you today?\"},\n",
|
|
" {\"role\": \"ipython\", \"content\": \"THIS IS THE DOCUMENT!!!\"}, # this shit doesn't work in chat template\n",
|
|
" {\"role\": \"user\", \"content\": \"Hello, have you eanten?\"},\n",
|
|
" {\"role\": \"assistant\", \"content\": \"No I'm hungry?\"},\n",
|
|
"]\n",
|
|
"\n",
|
|
"# Get the formatted conversation using chat template\n",
|
|
"convo = tokenizer.apply_chat_template(chat, tokenize=False)\n",
|
|
"print(\"💬 Raw Chat Template Output:\")\n",
|
|
"print(f\"{'-' * 50}\\n{convo}\\n{'-' * 50}\\n\")\n",
|
|
"\n",
|
|
"# 1. Show text splitting\n",
|
|
"prompt, response = adapter.split_prompt_assistant(convo)\n",
|
|
"print(\"🔍 Text Split:\")\n",
|
|
"print(f\"Prompt:\\n{'-' * 50}\\n{prompt}\\n{'-' * 50}\")\n",
|
|
"print(f\"Response:\\n{'-' * 50}\\n{response}\\n{'-' * 50}\\n\")\n",
|
|
"\n",
|
|
"# 2. Get tokens and mask\n",
|
|
"encoding = tokenizer(convo, add_special_tokens=False)\n",
|
|
"input_ids = encoding.input_ids\n",
|
|
"tokens = tokenizer.convert_ids_to_tokens(input_ids)\n",
|
|
"mask = adapter.get_mask(convo, tokenizer)\n",
|
|
"\n",
|
|
"# 3. Create detailed view\n",
|
|
"df = pd.DataFrame(\n",
|
|
" {\n",
|
|
" \"Position\": range(len(tokens)),\n",
|
|
" \"Token ID\": input_ids,\n",
|
|
" \"Token\": tokens,\n",
|
|
" \"Text\": [tokenizer.decode([id]) for id in input_ids],\n",
|
|
" \"Mask\": mask.tolist(),\n",
|
|
" }\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"📊 Token Analysis:\")\n",
|
|
"print(df.to_string(index=False))\n",
|
|
"\n",
|
|
"# 4. Quick Stats\n",
|
|
"print(\"\\n📈 Quick Stats:\")\n",
|
|
"print(f\"Total tokens: {len(tokens)}\")\n",
|
|
"print(f\"Masked tokens (1s): {mask.sum().item()}\")\n",
|
|
"print(f\"Unmasked tokens (0s): {len(mask) - mask.sum().item()}\")\n",
|
|
"\n",
|
|
"# 5. Show masked content only\n",
|
|
"print(\"\\n🎯 Masked Content (Response):\")\n",
|
|
"masked_df = df[df[\"Mask\"] == 1]\n",
|
|
"print(masked_df.to_string(index=False))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# ❌ Inspect R1-Distill (role = ipython didn't work)\n",
|
|
"The document content (the `ipython` message) is missing from the chat template output."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Imports\n",
|
|
"import sys\n",
|
|
"\n",
|
|
"sys.path.append(\"..\")\n",
|
|
"\n",
|
|
"\n",
|
|
"# Imports\n",
|
|
"from transformers import AutoTokenizer\n",
|
|
"from src.tokenizer_adapter import R1DistilTokenizerAdapter\n",
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"pd.set_option(\"display.max_rows\", None)\n",
|
|
"pd.set_option(\"display.max_colwidth\", None)\n",
|
|
"\n",
|
|
"# Initialize with R1-Distil\n",
|
|
"tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/deepseek-R1-Distill-Qwen-1.5B\")\n",
|
|
"adapter = R1DistilTokenizerAdapter()\n",
|
|
"\n",
|
|
"# Example conversation using R1-Distil format\n",
|
|
"chat = [\n",
|
|
" {\n",
|
|
" \"role\": \"system\",\n",
|
|
" \"content\": \"You are a friendly chatbot who always responds in the style of a pirate\",\n",
|
|
" },\n",
|
|
" {\"role\": \"user\", \"content\": \"Hello, how are you?\"},\n",
|
|
" {\"role\": \"assistant\", \"content\": \"I'm doing great. How can I help you today?\"},\n",
|
|
" {\"role\": \"ipython\", \"content\": \"THIS IS THE DOCUMENT!!!\"}, # this shit doesn't work in chat template\n",
|
|
" {\"role\": \"user\", \"content\": \"Hello, have you eanten?\"},\n",
|
|
" {\"role\": \"assistant\", \"content\": \"No I'm hungry?\"},\n",
|
|
"]\n",
|
|
"\n",
|
|
"# Get the formatted conversation using chat template\n",
|
|
"convo = tokenizer.apply_chat_template(chat, tokenize=False)\n",
|
|
"print(\"💬 Raw Chat Template Output:\")\n",
|
|
"print(f\"{'-' * 50}\\n{convo}\\n{'-' * 50}\\n\")\n",
|
|
"\n",
|
|
"# 1. Show text splitting\n",
|
|
"prompt, response = adapter.split_prompt_assistant(convo)\n",
|
|
"print(\"🔍 Text Split:\")\n",
|
|
"print(f\"Prompt:\\n{'-' * 50}\\n{prompt}\\n{'-' * 50}\")\n",
|
|
"print(f\"Response:\\n{'-' * 50}\\n{response}\\n{'-' * 50}\\n\")\n",
|
|
"\n",
|
|
"# 2. Get tokens and mask\n",
|
|
"encoding = tokenizer(convo, add_special_tokens=False)\n",
|
|
"input_ids = encoding.input_ids\n",
|
|
"tokens = tokenizer.convert_ids_to_tokens(input_ids)\n",
|
|
"mask = adapter.get_mask(convo, tokenizer)\n",
|
|
"\n",
|
|
"# 3. Create detailed view\n",
|
|
"df = pd.DataFrame(\n",
|
|
" {\n",
|
|
" \"Position\": range(len(tokens)),\n",
|
|
" \"Token\": tokens,\n",
|
|
" \"Token ID\": input_ids,\n",
|
|
" \"Text\": [tokenizer.decode([id]) for id in input_ids],\n",
|
|
" \"Mask\": mask.tolist(),\n",
|
|
" }\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"📊 Token Analysis:\")\n",
|
|
"print(df.to_string(index=False))\n",
|
|
"\n",
|
|
"# 4. Quick Stats\n",
|
|
"print(\"\\n📈 Quick Stats:\")\n",
|
|
"print(f\"Total tokens: {len(tokens)}\")\n",
|
|
"print(f\"Masked tokens (1s): {mask.sum().item()}\")\n",
|
|
"print(f\"Unmasked tokens (0s): {len(mask) - mask.sum().item()}\")\n",
|
|
"\n",
|
|
"# 5. Show masked content only\n",
|
|
"print(\"\\n🎯 Masked Content (Response):\")\n",
|
|
"masked_df = df[df[\"Mask\"] == 1]\n",
|
|
"print(masked_df.to_string(index=False))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Inspect R1-Distill: just append the search text as a string?"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Imports\n",
|
|
"import sys\n",
|
|
"\n",
|
|
"sys.path.append(\"..\")\n",
|
|
"\n",
|
|
"\n",
|
|
"# Imports\n",
|
|
"from transformers import AutoTokenizer\n",
|
|
"from src.tokenizer_adapter import R1DistilTokenizerAdapter\n",
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"pd.set_option(\"display.max_rows\", None)\n",
|
|
"pd.set_option(\"display.max_colwidth\", None)\n",
|
|
"\n",
|
|
"# Initialize with R1-Distil\n",
|
|
"tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/deepseek-R1-Distill-Qwen-1.5B\")\n",
|
|
"adapter = R1DistilTokenizerAdapter()\n",
|
|
"\n",
|
|
"# Example conversation using R1-Distil format\n",
|
|
"chat = [\n",
|
|
" {\n",
|
|
" \"role\": \"system\",\n",
|
|
" \"content\": \"You are a friendly chatbot who always responds in the style of a pirate\",\n",
|
|
" },\n",
|
|
" {\"role\": \"user\", \"content\": \"Hello, how are you?\"},\n",
|
|
" {\"role\": \"assistant\", \"content\": \"I'm doing great. How can I help you today?\"},\n",
|
|
" # {\"role\": \"ipython\", \"content\": \"THIS IS THE DOCUMENT!!!\"}, # this shit doesn't work in chat template\n",
|
|
" {\"role\": \"user\", \"content\": \"Hello, have you eanten?\"},\n",
|
|
" {\"role\": \"assistant\", \"content\": \"No I'm hungry?\"},\n",
|
|
"]\n",
|
|
"\n",
|
|
"# Get the formatted conversation using chat template\n",
|
|
"convo = tokenizer.apply_chat_template(chat, tokenize=False)\n",
|
|
"# print(type(convo)) # string\n",
|
|
"\n",
|
|
"\n",
|
|
"think_and_search = \"<think>I love cats</think>\\n<search>Cat images</search>\\n\"\n",
|
|
"search_results = \"Here are some cat images: cat1 cat2 cat3\"\n",
|
|
"search_template = \"\\n\\n{think_and_search}<information>{search_results}</information>\\n\\n\"\n",
|
|
"search_text = search_template.format(think_and_search=think_and_search, search_results=search_results)\n",
|
|
"\n",
|
|
"convo = convo + search_text\n",
|
|
"\n",
|
|
"print(\"💬 Raw Chat Template Output:\")\n",
|
|
"print(f\"{'-' * 50}\\n{convo}\\n{'-' * 50}\\n\")\n",
|
|
"\n",
|
|
"# 1. Show text splitting\n",
|
|
"prompt, response = adapter.split_prompt_assistant(convo)\n",
|
|
"print(\"🔍 Text Split:\")\n",
|
|
"print(f\"Prompt:\\n{'-' * 50}\\n{prompt}\\n{'-' * 50}\")\n",
|
|
"print(f\"Response:\\n{'-' * 50}\\n{response}\\n{'-' * 50}\\n\")\n",
|
|
"\n",
|
|
"# 2. Get tokens and mask\n",
|
|
"encoding = tokenizer(convo, add_special_tokens=False)\n",
|
|
"input_ids = encoding.input_ids\n",
|
|
"tokens = tokenizer.convert_ids_to_tokens(input_ids)\n",
|
|
"mask = adapter.get_mask(convo, tokenizer)\n",
|
|
"\n",
|
|
"# 3. Create detailed view\n",
|
|
"df = pd.DataFrame(\n",
|
|
" {\n",
|
|
" \"Position\": range(len(tokens)),\n",
|
|
" \"Token\": tokens,\n",
|
|
" \"Token ID\": input_ids,\n",
|
|
" \"Text\": [tokenizer.decode([id]) for id in input_ids],\n",
|
|
" \"Mask\": mask.tolist(),\n",
|
|
" }\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"📊 Token Analysis:\")\n",
|
|
"print(df.to_string(index=False))\n",
|
|
"\n",
|
|
"# 4. Quick Stats\n",
|
|
"print(\"\\n📈 Quick Stats:\")\n",
|
|
"print(f\"Total tokens: {len(tokens)}\")\n",
|
|
"print(f\"Masked tokens (1s): {mask.sum().item()}\")\n",
|
|
"print(f\"Unmasked tokens (0s): {len(mask) - mask.sum().item()}\")\n",
|
|
"\n",
|
|
"# 5. Show masked content only\n",
|
|
"print(\"\\n🎯 Masked Content (Response):\")\n",
|
|
"masked_df = df[df[\"Mask\"] == 1]\n",
|
|
"print(masked_df.to_string(index=False))"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "deepsearch-py311",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.11"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|