|
|
@ -279,6 +279,91 @@
|
|
|
|
"masked_df = df[df[\"Mask\"] == 1]\n",
|
|
|
|
"masked_df = df[df[\"Mask\"] == 1]\n",
|
|
|
|
"print(masked_df.to_string(index=False))"
|
|
|
|
"print(masked_df.to_string(index=False))"
|
|
|
|
]
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"# Inspect Qwen 2.5 Instruct"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"# Imports\n",
|
|
|
|
|
|
|
|
"import sys\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"sys.path.append(\"..\")\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"from transformers import AutoTokenizer\n",
|
|
|
|
|
|
|
|
"from src.tokenizer_adapter import QwenTokenizerAdapter\n",
|
|
|
|
|
|
|
|
"import pandas as pd\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"pd.set_option(\"display.max_rows\", None)\n",
|
|
|
|
|
|
|
|
"pd.set_option(\"display.max_colwidth\", None)\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# Initialize\n",
|
|
|
|
|
|
|
|
"tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen2.5-1.5B-Instruct\")\n",
|
|
|
|
|
|
|
|
"adapter = QwenTokenizerAdapter()\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# Example conversation using chat template\n",
|
|
|
|
|
|
|
|
"chat = [\n",
|
|
|
|
|
|
|
|
" {\n",
|
|
|
|
|
|
|
|
" \"role\": \"system\",\n",
|
|
|
|
|
|
|
|
" \"content\": \"You are a friendly chatbot who always responds in the style of a pirate\",\n",
|
|
|
|
|
|
|
|
" },\n",
|
|
|
|
|
|
|
|
" {\"role\": \"user\", \"content\": \"Hello, how are you?\"},\n",
|
|
|
|
|
|
|
|
" {\"role\": \"assistant\", \"content\": \"I'm doing great. How can I help you today?\"},\n",
|
|
|
|
|
|
|
|
" {\"role\": \"ipython\", \"content\": \"THIS IS THE DOCUMENT!!!\"}, # this shit doesn't work in chat template\n",
|
|
|
|
|
|
|
|
" {\"role\": \"user\", \"content\": \"Hello, have you eanten?\"},\n",
|
|
|
|
|
|
|
|
" {\"role\": \"assistant\", \"content\": \"No I'm hungry?\"},\n",
|
|
|
|
|
|
|
|
"]\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# Get the formatted conversation using chat template\n",
|
|
|
|
|
|
|
|
"convo = tokenizer.apply_chat_template(chat, tokenize=False)\n",
|
|
|
|
|
|
|
|
"print(\"💬 Raw Chat Template Output:\")\n",
|
|
|
|
|
|
|
|
"print(f\"{'-' * 50}\\n{convo}\\n{'-' * 50}\\n\")\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# 1. Show text splitting\n",
|
|
|
|
|
|
|
|
"prompt, response = adapter.split_prompt_assistant(convo)\n",
|
|
|
|
|
|
|
|
"print(\"🔍 Text Split:\")\n",
|
|
|
|
|
|
|
|
"print(f\"Prompt:\\n{'-' * 50}\\n{prompt}\\n{'-' * 50}\")\n",
|
|
|
|
|
|
|
|
"print(f\"Response:\\n{'-' * 50}\\n{response}\\n{'-' * 50}\\n\")\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# 2. Get tokens and mask\n",
|
|
|
|
|
|
|
|
"encoding = tokenizer(convo, add_special_tokens=False)\n",
|
|
|
|
|
|
|
|
"input_ids = encoding.input_ids\n",
|
|
|
|
|
|
|
|
"tokens = tokenizer.convert_ids_to_tokens(input_ids)\n",
|
|
|
|
|
|
|
|
"mask = adapter.get_mask(convo, tokenizer)\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# 3. Create detailed view\n",
|
|
|
|
|
|
|
|
"df = pd.DataFrame(\n",
|
|
|
|
|
|
|
|
" {\n",
|
|
|
|
|
|
|
|
" \"Position\": range(len(tokens)),\n",
|
|
|
|
|
|
|
|
" \"Token ID\": input_ids,\n",
|
|
|
|
|
|
|
|
" \"Token\": tokens,\n",
|
|
|
|
|
|
|
|
" \"Text\": [tokenizer.decode([id]) for id in input_ids],\n",
|
|
|
|
|
|
|
|
" \"Mask\": mask.tolist(),\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
")\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"print(\"📊 Token Analysis:\")\n",
|
|
|
|
|
|
|
|
"print(df.to_string(index=False))\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# 4. Quick Stats\n",
|
|
|
|
|
|
|
|
"print(\"\\n📈 Quick Stats:\")\n",
|
|
|
|
|
|
|
|
"print(f\"Total tokens: {len(tokens)}\")\n",
|
|
|
|
|
|
|
|
"print(f\"Masked tokens (1s): {mask.sum().item()}\")\n",
|
|
|
|
|
|
|
|
"print(f\"Unmasked tokens (0s): {len(mask) - mask.sum().item()}\")\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# 5. Show masked content only\n",
|
|
|
|
|
|
|
|
"print(\"\\n🎯 Masked Content (Response):\")\n",
|
|
|
|
|
|
|
|
"masked_df = df[df[\"Mask\"] == 1]\n",
|
|
|
|
|
|
|
|
"print(masked_df.to_string(index=False))"
|
|
|
|
|
|
|
|
]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
],
|
|
|
|
],
|
|
|
|
"metadata": {
|
|
|
|
"metadata": {
|
|
|
|