{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Anti Dumb Mask Inspection\n", "\n", "- Mask = 1 means these tokens ARE used to calculate the loss. These are the tokens we want the model to learn/predict (the assistant's response).\n", "-Mask = 0 means these tokens are NOT used to calculate the loss. These are the tokens we don't want the model to learn/predict (system message, user input, markers, etc.)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Inspect Original Llama Autodiact" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Imports\n", "import sys\n", "\n", "sys.path.append(\"..\")\n", "\n", "from transformers import AutoTokenizer\n", "from src.tokenizer_adapter import LlamaTokenizerAdapter\n", "import pandas as pd\n", "\n", "pd.set_option(\"display.max_rows\", None)\n", "pd.set_option(\"display.max_colwidth\", None)\n", "\n", "# Initialize\n", "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/meta-Llama-3.1-8B-Instruct\")\n", "adapter = LlamaTokenizerAdapter()\n", "\n", "# Example conversation using chat template\n", "chat = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": \"You are a friendly chatbot who always responds in the style of a pirate\",\n", " },\n", " {\"role\": \"user\", \"content\": \"Hello, how are you?\"},\n", " {\"role\": \"assistant\", \"content\": \"I'm doing great. How can I help you today?\"},\n", " {\"role\": \"ipython\", \"content\": \"THIS IS THE DOCUMENT!!!\"}, # this shit doesn't work in chat template\n", " {\"role\": \"user\", \"content\": \"Hello, have you eanten?\"},\n", " {\"role\": \"assistant\", \"content\": \"No I'm hungry?\"},\n", "]\n", "\n", "# Get the formatted conversation using chat template\n", "convo = tokenizer.apply_chat_template(chat, tokenize=False)\n", "print(\"šŸ’¬ Raw Chat Template Output:\")\n", "print(f\"{'-' * 50}\\n{convo}\\n{'-' * 50}\\n\")\n", "\n", "# 1. Show text splitting\n", "prompt, response = adapter.split_prompt_assistant(convo)\n", "print(\"šŸ” Text Split:\")\n", "print(f\"Prompt:\\n{'-' * 50}\\n{prompt}\\n{'-' * 50}\")\n", "print(f\"Response:\\n{'-' * 50}\\n{response}\\n{'-' * 50}\\n\")\n", "\n", "# 2. Get tokens and mask\n", "encoding = tokenizer(convo, add_special_tokens=False)\n", "input_ids = encoding.input_ids\n", "tokens = tokenizer.convert_ids_to_tokens(input_ids)\n", "mask = adapter.get_mask(convo, tokenizer)\n", "\n", "# 3. Create detailed view\n", "df = pd.DataFrame(\n", " {\n", " \"Position\": range(len(tokens)),\n", " \"Token ID\": input_ids,\n", " \"Token\": tokens,\n", " \"Text\": [tokenizer.decode([id]) for id in input_ids],\n", " \"Mask\": mask.tolist(),\n", " }\n", ")\n", "\n", "print(\"šŸ“Š Token Analysis:\")\n", "print(df.to_string(index=False))\n", "\n", "# 4. Quick Stats\n", "print(\"\\nšŸ“ˆ Quick Stats:\")\n", "print(f\"Total tokens: {len(tokens)}\")\n", "print(f\"Masked tokens (1s): {mask.sum().item()}\")\n", "print(f\"Unmasked tokens (0s): {len(mask) - mask.sum().item()}\")\n", "\n", "# 5. Show masked content only\n", "print(\"\\nšŸŽÆ Masked Content (Response):\")\n", "masked_df = df[df[\"Mask\"] == 1]\n", "print(masked_df.to_string(index=False))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# āŒ Inspect R1-Distill (role = ipython didn't work)\n", "the document content went missing." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Imports\n", "import sys\n", "\n", "sys.path.append(\"..\")\n", "\n", "\n", "# Imports\n", "from transformers import AutoTokenizer\n", "from src.tokenizer_adapter import R1DistilTokenizerAdapter\n", "import pandas as pd\n", "\n", "pd.set_option(\"display.max_rows\", None)\n", "pd.set_option(\"display.max_colwidth\", None)\n", "\n", "# Initialize with R1-Distil\n", "tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/deepseek-R1-Distill-Qwen-1.5B\")\n", "adapter = R1DistilTokenizerAdapter()\n", "\n", "# Example conversation using R1-Distil format\n", "chat = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": \"You are a friendly chatbot who always responds in the style of a pirate\",\n", " },\n", " {\"role\": \"user\", \"content\": \"Hello, how are you?\"},\n", " {\"role\": \"assistant\", \"content\": \"I'm doing great. How can I help you today?\"},\n", " {\"role\": \"ipython\", \"content\": \"THIS IS THE DOCUMENT!!!\"}, # this shit doesn't work in chat template\n", " {\"role\": \"user\", \"content\": \"Hello, have you eanten?\"},\n", " {\"role\": \"assistant\", \"content\": \"No I'm hungry?\"},\n", "]\n", "\n", "# Get the formatted conversation using chat template\n", "convo = tokenizer.apply_chat_template(chat, tokenize=False)\n", "print(\"šŸ’¬ Raw Chat Template Output:\")\n", "print(f\"{'-' * 50}\\n{convo}\\n{'-' * 50}\\n\")\n", "\n", "# 1. Show text splitting\n", "prompt, response = adapter.split_prompt_assistant(convo)\n", "print(\"šŸ” Text Split:\")\n", "print(f\"Prompt:\\n{'-' * 50}\\n{prompt}\\n{'-' * 50}\")\n", "print(f\"Response:\\n{'-' * 50}\\n{response}\\n{'-' * 50}\\n\")\n", "\n", "# 2. Get tokens and mask\n", "encoding = tokenizer(convo, add_special_tokens=False)\n", "input_ids = encoding.input_ids\n", "tokens = tokenizer.convert_ids_to_tokens(input_ids)\n", "mask = adapter.get_mask(convo, tokenizer)\n", "\n", "# 3. Create detailed view\n", "df = pd.DataFrame(\n", " {\n", " \"Position\": range(len(tokens)),\n", " \"Token\": tokens,\n", " \"Token ID\": input_ids,\n", " \"Text\": [tokenizer.decode([id]) for id in input_ids],\n", " \"Mask\": mask.tolist(),\n", " }\n", ")\n", "\n", "print(\"šŸ“Š Token Analysis:\")\n", "print(df.to_string(index=False))\n", "\n", "# 4. Quick Stats\n", "print(\"\\nšŸ“ˆ Quick Stats:\")\n", "print(f\"Total tokens: {len(tokens)}\")\n", "print(f\"Masked tokens (1s): {mask.sum().item()}\")\n", "print(f\"Unmasked tokens (0s): {len(mask) - mask.sum().item()}\")\n", "\n", "# 5. Show masked content only\n", "print(\"\\nšŸŽÆ Masked Content (Response):\")\n", "masked_df = df[df[\"Mask\"] == 1]\n", "print(masked_df.to_string(index=False))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Inspect R1-Distill just add string?" 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Imports\n", "import sys\n", "\n", "sys.path.append(\"..\")\n", "\n", "\n", "# Imports\n", "from transformers import AutoTokenizer\n", "from src.tokenizer_adapter import R1DistilTokenizerAdapter\n", "import pandas as pd\n", "\n", "pd.set_option(\"display.max_rows\", None)\n", "pd.set_option(\"display.max_colwidth\", None)\n", "\n", "# Initialize with R1-Distil\n", "tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/deepseek-R1-Distill-Qwen-1.5B\")\n", "adapter = R1DistilTokenizerAdapter()\n", "\n", "# Example conversation using R1-Distil format\n", "chat = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": \"You are a friendly chatbot who always responds in the style of a pirate\",\n", " },\n", " {\"role\": \"user\", \"content\": \"Hello, how are you?\"},\n", " {\"role\": \"assistant\", \"content\": \"I'm doing great. How can I help you today?\"},\n", " # {\"role\": \"ipython\", \"content\": \"THIS IS THE DOCUMENT!!!\"}, # this shit doesn't work in chat template\n", " {\"role\": \"user\", \"content\": \"Hello, have you eanten?\"},\n", " {\"role\": \"assistant\", \"content\": \"No I'm hungry?\"},\n", "]\n", "\n", "# Get the formatted conversation using chat template\n", "convo = tokenizer.apply_chat_template(chat, tokenize=False)\n", "# print(type(convo)) # string\n", "\n", "\n", "think_and_search = \"I love cats\\nCat images\\n\"\n", "search_results = \"Here are some cat images: cat1 cat2 cat3\"\n", "search_template = \"\\n\\n{think_and_search}{search_results}\\n\\n\"\n", "search_text = search_template.format(think_and_search=think_and_search, search_results=search_results)\n", "\n", "convo = convo + search_text\n", "\n", "print(\"šŸ’¬ Raw Chat Template Output:\")\n", "print(f\"{'-' * 50}\\n{convo}\\n{'-' * 50}\\n\")\n", "\n", "# 1. Show text splitting\n", "prompt, response = adapter.split_prompt_assistant(convo)\n", "print(\"šŸ” Text Split:\")\n", "print(f\"Prompt:\\n{'-' * 50}\\n{prompt}\\n{'-' * 50}\")\n", "print(f\"Response:\\n{'-' * 50}\\n{response}\\n{'-' * 50}\\n\")\n", "\n", "# 2. Get tokens and mask\n", "encoding = tokenizer(convo, add_special_tokens=False)\n", "input_ids = encoding.input_ids\n", "tokens = tokenizer.convert_ids_to_tokens(input_ids)\n", "mask = adapter.get_mask(convo, tokenizer)\n", "\n", "# 3. Create detailed view\n", "df = pd.DataFrame(\n", " {\n", " \"Position\": range(len(tokens)),\n", " \"Token\": tokens,\n", " \"Token ID\": input_ids,\n", " \"Text\": [tokenizer.decode([id]) for id in input_ids],\n", " \"Mask\": mask.tolist(),\n", " }\n", ")\n", "\n", "print(\"šŸ“Š Token Analysis:\")\n", "print(df.to_string(index=False))\n", "\n", "# 4. Quick Stats\n", "print(\"\\nšŸ“ˆ Quick Stats:\")\n", "print(f\"Total tokens: {len(tokens)}\")\n", "print(f\"Masked tokens (1s): {mask.sum().item()}\")\n", "print(f\"Unmasked tokens (0s): {len(mask) - mask.sum().item()}\")\n", "\n", "# 5. 
Show masked content only\n", "print(\"\\nšŸŽÆ Masked Content (Response):\")\n", "masked_df = df[df[\"Mask\"] == 1]\n", "print(masked_df.to_string(index=False))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Inspect Qwen 2.5 Instruct" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Imports\n", "import sys\n", "\n", "sys.path.append(\"..\")\n", "\n", "from transformers import AutoTokenizer\n", "from src.tokenizer_adapter import QwenTokenizerAdapter\n", "import pandas as pd\n", "\n", "pd.set_option(\"display.max_rows\", None)\n", "pd.set_option(\"display.max_colwidth\", None)\n", "\n", "# Initialize\n", "tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen2.5-1.5B-Instruct\")\n", "adapter = QwenTokenizerAdapter()\n", "\n", "# Example conversation using chat template\n", "chat = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": \"You are a friendly chatbot who always responds in the style of a pirate\",\n", " },\n", " {\"role\": \"user\", \"content\": \"Hello, how are you?\"},\n", " {\"role\": \"assistant\", \"content\": \"I'm doing great. How can I help you today?\"},\n", " {\"role\": \"ipython\", \"content\": \"THIS IS THE DOCUMENT!!!\"}, # this shit doesn't work in chat template\n", " {\"role\": \"user\", \"content\": \"Hello, have you eanten?\"},\n", " {\"role\": \"assistant\", \"content\": \"No I'm hungry?\"},\n", "]\n", "\n", "# Get the formatted conversation using chat template\n", "convo = tokenizer.apply_chat_template(chat, tokenize=False)\n", "print(\"šŸ’¬ Raw Chat Template Output:\")\n", "print(f\"{'-' * 50}\\n{convo}\\n{'-' * 50}\\n\")\n", "\n", "# 1. Show text splitting\n", "prompt, response = adapter.split_prompt_assistant(convo)\n", "print(\"šŸ” Text Split:\")\n", "print(f\"Prompt:\\n{'-' * 50}\\n{prompt}\\n{'-' * 50}\")\n", "print(f\"Response:\\n{'-' * 50}\\n{response}\\n{'-' * 50}\\n\")\n", "\n", "# 2. Get tokens and mask\n", "encoding = tokenizer(convo, add_special_tokens=False)\n", "input_ids = encoding.input_ids\n", "tokens = tokenizer.convert_ids_to_tokens(input_ids)\n", "mask = adapter.get_mask(convo, tokenizer)\n", "\n", "# 3. Create detailed view\n", "df = pd.DataFrame(\n", " {\n", " \"Position\": range(len(tokens)),\n", " \"Token ID\": input_ids,\n", " \"Token\": tokens,\n", " \"Text\": [tokenizer.decode([id]) for id in input_ids],\n", " \"Mask\": mask.tolist(),\n", " }\n", ")\n", "\n", "print(\"šŸ“Š Token Analysis:\")\n", "print(df.to_string(index=False))\n", "\n", "# 4. Quick Stats\n", "print(\"\\nšŸ“ˆ Quick Stats:\")\n", "print(f\"Total tokens: {len(tokens)}\")\n", "print(f\"Masked tokens (1s): {mask.sum().item()}\")\n", "print(f\"Unmasked tokens (0s): {len(mask) - mask.sum().item()}\")\n", "\n", "# 5. Show masked content only\n", "print(\"\\nšŸŽÆ Masked Content (Response):\")\n", "masked_df = df[df[\"Mask\"] == 1]\n", "print(masked_df.to_string(index=False))" ] } ], "metadata": { "kernelspec": { "display_name": "deepsearch-py311", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.11" } }, "nbformat": 4, "nbformat_minor": 2 }