# Anti Dumb Mask Inspection

- Mask = 1 means these tokens ARE used to calculate the loss. These are the tokens we want the model to learn/predict (the assistant's response).
- Mask = 0 means these tokens are NOT used to calculate the loss. These are the tokens we don't want the model to learn/predict (system message, user input, markers, etc.).

# Inspect Original Llama Autodidact

In [None]:
# Inspect the loss mask that LlamaTokenizerAdapter produces for a sample
# Llama 3.1 conversation rendered through the tokenizer's chat template.
import sys

# Add the parent directory to the import path; presumably this is what lets
# the `src` package below be imported from the notebook's location.
sys.path.append("..")

from transformers import AutoTokenizer
from src.tokenizer_adapter import LlamaTokenizerAdapter
import pandas as pd

# Never truncate rows or cell contents, so every token stays visible.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

# Initialize
tokenizer = AutoTokenizer.from_pretrained("meta-llama/meta-Llama-3.1-8B-Instruct")
adapter = LlamaTokenizerAdapter()

# Example conversation using chat template
chat = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a pirate",
    },
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
    # NOTE: original author's observation - the "ipython" role does not work
    # in the chat template.
    {"role": "ipython", "content": "THIS IS THE DOCUMENT!!!"},
    {"role": "user", "content": "Hello, have you eanten?"},
    {"role": "assistant", "content": "No I'm hungry?"},
]

separator = "-" * 50

# Render the conversation to a single string (no tokenization yet).
convo = tokenizer.apply_chat_template(chat, tokenize=False)
print("💬 Raw Chat Template Output:")
print(f"{separator}\n{convo}\n{separator}\n")

# 1. Show how the adapter splits the rendered text into prompt / response.
prompt, response = adapter.split_prompt_assistant(convo)
print("🔍 Text Split:")
print(f"Prompt:\n{separator}\n{prompt}\n{separator}")
print(f"Response:\n{separator}\n{response}\n{separator}\n")

# 2. Tokenize the rendered text and fetch the adapter's mask for it.
#    add_special_tokens=False: the rendered text is tokenized as-is.
encoding = tokenizer(convo, add_special_tokens=False)
input_ids = encoding.input_ids
tokens = tokenizer.convert_ids_to_tokens(input_ids)
mask = adapter.get_mask(convo, tokenizer)

# 3. One row per token: position, id, raw token, decoded text and mask value.
df = pd.DataFrame(
    {
        "Position": range(len(tokens)),
        "Token ID": input_ids,
        "Token": tokens,
        "Text": [tokenizer.decode([token_id]) for token_id in input_ids],
        "Mask": mask.tolist(),
    }
)

print("📊 Token Analysis:")
print(df.to_string(index=False))

# 4. Quick stats (the mask total is computed once and reused).
ones_count = mask.sum().item()
print("\n📈 Quick Stats:")
print(f"Total tokens: {len(tokens)}")
print(f"Masked tokens (1s): {ones_count}")
print(f"Unmasked tokens (0s): {len(mask) - ones_count}")

# 5. Show only the rows whose mask value is 1.
print("\n🎯 Masked Content (Response):")
masked_df = df[df["Mask"] == 1]
print(masked_df.to_string(index=False))

# āŒ Inspect R1-Distill (role = ipython didn't work)
the document content went missing.

In [None]:
# Inspect the loss mask that R1DistilTokenizerAdapter produces for a sample
# conversation that includes an "ipython" (document) turn.
import sys

# Add the parent directory to the import path; presumably this is what lets
# the `src` package below be imported from the notebook's location.
sys.path.append("..")


from transformers import AutoTokenizer
from src.tokenizer_adapter import R1DistilTokenizerAdapter
import pandas as pd

# Never truncate rows or cell contents, so every token stays visible.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

# Initialize with R1-Distil
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-R1-Distill-Qwen-1.5B")
adapter = R1DistilTokenizerAdapter()

# Example conversation using R1-Distil format
chat = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a pirate",
    },
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
    # NOTE: original author's observation - the "ipython" role does not work
    # in the chat template.
    {"role": "ipython", "content": "THIS IS THE DOCUMENT!!!"},
    {"role": "user", "content": "Hello, have you eanten?"},
    {"role": "assistant", "content": "No I'm hungry?"},
]

separator = "-" * 50

# Render the conversation to a single string (no tokenization yet).
convo = tokenizer.apply_chat_template(chat, tokenize=False)
print("💬 Raw Chat Template Output:")
print(f"{separator}\n{convo}\n{separator}\n")

# 1. Show how the adapter splits the rendered text into prompt / response.
prompt, response = adapter.split_prompt_assistant(convo)
print("🔍 Text Split:")
print(f"Prompt:\n{separator}\n{prompt}\n{separator}")
print(f"Response:\n{separator}\n{response}\n{separator}\n")

# 2. Tokenize the rendered text and fetch the adapter's mask for it.
#    add_special_tokens=False: the rendered text is tokenized as-is.
encoding = tokenizer(convo, add_special_tokens=False)
input_ids = encoding.input_ids
tokens = tokenizer.convert_ids_to_tokens(input_ids)
mask = adapter.get_mask(convo, tokenizer)

# 3. One row per token: position, raw token, id, decoded text and mask value.
df = pd.DataFrame(
    {
        "Position": range(len(tokens)),
        "Token": tokens,
        "Token ID": input_ids,
        "Text": [tokenizer.decode([token_id]) for token_id in input_ids],
        "Mask": mask.tolist(),
    }
)

print("📊 Token Analysis:")
print(df.to_string(index=False))

# 4. Quick stats (the mask total is computed once and reused).
ones_count = mask.sum().item()
print("\n📈 Quick Stats:")
print(f"Total tokens: {len(tokens)}")
print(f"Masked tokens (1s): {ones_count}")
print(f"Unmasked tokens (0s): {len(mask) - ones_count}")

# 5. Show only the rows whose mask value is 1.
print("\n🎯 Masked Content (Response):")
masked_df = df[df["Mask"] == 1]
print(masked_df.to_string(index=False))

# Inspect R1-Distill: just append a string?

In [None]:
# Inspect the loss mask that R1DistilTokenizerAdapter produces when extra
# text is appended to the rendered conversation string instead of being
# passed as a chat turn.
import sys

# Add the parent directory to the import path; presumably this is what lets
# the `src` package below be imported from the notebook's location.
sys.path.append("..")


from transformers import AutoTokenizer
from src.tokenizer_adapter import R1DistilTokenizerAdapter
import pandas as pd

# Never truncate rows or cell contents, so every token stays visible.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

# Initialize with R1-Distil
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-R1-Distill-Qwen-1.5B")
adapter = R1DistilTokenizerAdapter()

# Example conversation using R1-Distil format
chat = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a pirate",
    },
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
    # The "ipython" turn is disabled in this variant (original author's note:
    # it does not work in the chat template):
    # {"role": "ipython", "content": "THIS IS THE DOCUMENT!!!"},
    {"role": "user", "content": "Hello, have you eanten?"},
    {"role": "assistant", "content": "No I'm hungry?"},
]

separator = "-" * 50

# Render the conversation to a single string (no tokenization yet); with
# tokenize=False the result is a plain str, so it can be concatenated below.
convo = tokenizer.apply_chat_template(chat, tokenize=False)


# Build the extra text and append it directly to the rendered conversation.
think_and_search = "I love cats\nCat images\n"
search_results = "Here are some cat images: cat1 cat2 cat3"
search_template = "\n\n{think_and_search}{search_results}\n\n"
search_text = search_template.format(think_and_search=think_and_search, search_results=search_results)

convo = convo + search_text

print("💬 Raw Chat Template Output:")
print(f"{separator}\n{convo}\n{separator}\n")

# 1. Show how the adapter splits the combined text into prompt / response.
prompt, response = adapter.split_prompt_assistant(convo)
print("🔍 Text Split:")
print(f"Prompt:\n{separator}\n{prompt}\n{separator}")
print(f"Response:\n{separator}\n{response}\n{separator}\n")

# 2. Tokenize the combined text and fetch the adapter's mask for it.
#    add_special_tokens=False: the text is tokenized as-is.
encoding = tokenizer(convo, add_special_tokens=False)
input_ids = encoding.input_ids
tokens = tokenizer.convert_ids_to_tokens(input_ids)
mask = adapter.get_mask(convo, tokenizer)

# 3. One row per token: position, raw token, id, decoded text and mask value.
df = pd.DataFrame(
    {
        "Position": range(len(tokens)),
        "Token": tokens,
        "Token ID": input_ids,
        "Text": [tokenizer.decode([token_id]) for token_id in input_ids],
        "Mask": mask.tolist(),
    }
)

print("📊 Token Analysis:")
print(df.to_string(index=False))

# 4. Quick stats (the mask total is computed once and reused).
ones_count = mask.sum().item()
print("\n📈 Quick Stats:")
print(f"Total tokens: {len(tokens)}")
print(f"Masked tokens (1s): {ones_count}")
print(f"Unmasked tokens (0s): {len(mask) - ones_count}")

# 5. Show only the rows whose mask value is 1.
print("\n🎯 Masked Content (Response):")
masked_df = df[df["Mask"] == 1]
print(masked_df.to_string(index=False))

# Inspect Qwen 2.5 Instruct

In [None]:
# Inspect the loss mask that QwenTokenizerAdapter produces for a sample
# Qwen 2.5 Instruct conversation rendered through the chat template.
import sys

# Add the parent directory to the import path; presumably this is what lets
# the `src` package below be imported from the notebook's location.
sys.path.append("..")

from transformers import AutoTokenizer
from src.tokenizer_adapter import QwenTokenizerAdapter
import pandas as pd

# Never truncate rows or cell contents, so every token stays visible.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

# Initialize
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
adapter = QwenTokenizerAdapter()

# Example conversation using chat template
chat = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a pirate",
    },
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
    # NOTE: original author's observation - the "ipython" role does not work
    # in the chat template.
    {"role": "ipython", "content": "THIS IS THE DOCUMENT!!!"},
    {"role": "user", "content": "Hello, have you eanten?"},
    {"role": "assistant", "content": "No I'm hungry?"},
]

separator = "-" * 50

# Render the conversation to a single string (no tokenization yet).
convo = tokenizer.apply_chat_template(chat, tokenize=False)
print("💬 Raw Chat Template Output:")
print(f"{separator}\n{convo}\n{separator}\n")

# 1. Show how the adapter splits the rendered text into prompt / response.
prompt, response = adapter.split_prompt_assistant(convo)
print("🔍 Text Split:")
print(f"Prompt:\n{separator}\n{prompt}\n{separator}")
print(f"Response:\n{separator}\n{response}\n{separator}\n")

# 2. Tokenize the rendered text and fetch the adapter's mask for it.
#    add_special_tokens=False: the rendered text is tokenized as-is.
encoding = tokenizer(convo, add_special_tokens=False)
input_ids = encoding.input_ids
tokens = tokenizer.convert_ids_to_tokens(input_ids)
mask = adapter.get_mask(convo, tokenizer)

# 3. One row per token: position, id, raw token, decoded text and mask value.
df = pd.DataFrame(
    {
        "Position": range(len(tokens)),
        "Token ID": input_ids,
        "Token": tokens,
        "Text": [tokenizer.decode([token_id]) for token_id in input_ids],
        "Mask": mask.tolist(),
    }
)

print("📊 Token Analysis:")
print(df.to_string(index=False))

# 4. Quick stats (the mask total is computed once and reused).
ones_count = mask.sum().item()
print("\n📈 Quick Stats:")
print(f"Total tokens: {len(tokens)}")
print(f"Masked tokens (1s): {ones_count}")
print(f"Unmasked tokens (0s): {len(mask) - ones_count}")

# 5. Show only the rows whose mask value is 1.
print("\n🎯 Masked Content (Response):")
masked_df = df[df["Mask"] == 1]
print(masked_df.to_string(index=False))