test: add unit tests for agent, reward functions, and tokenizer adapters

3 months ago · 3910ef343a
parent 31dcbf5d8a
commit 3910ef343a
4 changed files with 930 additions and 0 deletions
--- a/tests/init.py
+++ b/tests/init.py
--- a/tests/test_agent.py
+++ b/tests/test_agent.py
@ -0,0 +1,68 @@
+"""Test agent functionality."""
+
+from transformers import LlamaTokenizerFast
+
+from src.agent import Agent
+from src.tokenizer_adapter import LlamaTokenizerAdapter, R1DistilTokenizerAdapter
+
+
+def mock_generate_fn(prompts):
+    """Mock generation function that returns simple responses."""
+
+    class MockResponse:
+        def __init__(self, text):
+            self.outputs = [type("obj", (object,), {"text": text})()]
+
+    return [MockResponse(f"Assistant: Test response for {i}") for i, _ in enumerate(prompts)]
+
+
+def test_llama_agent_response_mask_lengths():
+    """Test that response tokens and masks have the same length for Llama."""
+    # Test data
+    questions = ["What is Python?", "How to write tests?"]
+
+    # Setup Llama agent
+    tokenizer = LlamaTokenizerFast.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
+    agent = Agent(LlamaTokenizerAdapter())
+
+    # Run agent
+    outputs = agent.run_agent(
+        generate_fn=mock_generate_fn, tokenizer=tokenizer, questions=questions, max_generations=1, max_new_tokens=100
+    )
+
+    # Check lengths match for each example
+    for i, (tokens, mask) in enumerate(zip(outputs.response_tokens, outputs.response_masks)):
+        print(f"\nExample {i}:")
+        print(f"Question: {questions[i]}")
+        print(f"Response tokens length: {len(tokens)}")
+        print(f"Response mask length: {len(mask)}")
+
+        assert len(tokens) == len(mask), f"Mismatch in example {i}: tokens={len(tokens)}, mask={len(mask)}"
+        assert mask.sum().item() > 0, "Mask should have some 1s indicating response tokens"
+        assert all(x in [0, 1] for x in mask.tolist()), "Mask should only contain 0s and 1s"
+
+
+def test_r1_distil_agent_response_mask_lengths():
+    """Test that response tokens and masks have the same length for R1-Distil."""
+    # Test data
+    questions = ["What is Python?", "How to write tests?"]
+
+    # Setup R1-Distil agent
+    tokenizer = LlamaTokenizerFast.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
+    agent = Agent(R1DistilTokenizerAdapter())
+
+    # Run agent
+    outputs = agent.run_agent(
+        generate_fn=mock_generate_fn, tokenizer=tokenizer, questions=questions, max_generations=1, max_new_tokens=100
+    )
+
+    # Check lengths match for each example
+    for i, (tokens, mask) in enumerate(zip(outputs.response_tokens, outputs.response_masks)):
+        print(f"\nExample {i}:")
+        print(f"Question: {questions[i]}")
+        print(f"Response tokens length: {len(tokens)}")
+        print(f"Response mask length: {len(mask)}")
+
+        assert len(tokens) == len(mask), f"Mismatch in example {i}: tokens={len(tokens)}, mask={len(mask)}"
+        assert mask.sum().item() > 0, "Mask should have some 1s indicating response tokens"
+        assert all(x in [0, 1] for x in mask.tolist()), "Mask should only contain 0s and 1s"
--- a/tests/test_rewards.py
+++ b/tests/test_rewards.py
@ -0,0 +1,434 @@
+"""
+Test cases for reward functions in rewards.py
+"""
+
+import pytest
+
+from src.rewards import (
+    build_reward_correctness_fn,
+    reward_em_chunk,
+    reward_format,
+    reward_retry,
+)
+
+
+class MockResponse:
+    """Mock response class that simulates vLLM response"""
+
+    def __init__(self, text):
+        self.outputs = [type("obj", (object,), {"text": text})()]
+
+
+# Mock functions for testing
+def mock_vllm_generate_func(*args, **kwargs):
+    """Mock function that returns verification responses based on the input"""
+    # Check if the prompt contains "5" (wrong answer) or "4" (correct answer)
+    prompt = str(args[0]) if args else ""
+    if "5" in prompt:
+        return [MockResponse("No, the answer is incorrect")]  # Return False for wrong answer
+    return [MockResponse("Yes, the answer is correct")]  # Return True for correct answer
+
+
+class MockTokenizer:
+    """Mock tokenizer class that simulates the behavior of a real tokenizer"""
+
+    def __init__(self):
+        self.input_ids = [1, 2, 3]
+
+    def apply_chat_template(self, messages, tokenize=False, add_generation_prompt=True):
+        """Mock apply_chat_template method"""
+        # For testing, we just return a formatted string
+        return "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
+
+
+@pytest.fixture
+def reward_correctness_fn():
+    """Fixture to create reward correctness function"""
+    return build_reward_correctness_fn(mock_vllm_generate_func, MockTokenizer())
+
+
+def test_reward_correctness_basic(reward_correctness_fn):
+    """Test basic reward correctness functionality"""
+    prompts = ["What is 2+2?"]
+    completions = [{"messages": [{"content": "4"}]}]
+    reward_kwargs = {"answer": ["4"]}
+
+    rewards = reward_correctness_fn(prompts, completions, **reward_kwargs)
+    assert len(rewards) == 1  # Should return one verification result per answer
+    assert rewards[0] is True  # Should be True for correct answer
+
+
+def test_reward_correctness_wrong_answer(reward_correctness_fn):
+    """Test reward correctness with wrong answer"""
+    prompts = ["What is 2+2?"]
+    completions = [{"messages": [{"content": "5"}]}]
+    reward_kwargs = {"answer": ["4"]}
+
+    rewards = reward_correctness_fn(prompts, completions, **reward_kwargs)
+    assert len(rewards) == 1  # Should return one verification result per answer
+    assert rewards[0] is False  # Should be False for wrong answer
+
+
+def test_reward_format_correct():
+    """Test reward format with correct format"""
+    completions = [
+        {
+            "messages": [
+                {"role": "assistant", "content": "<think>\nSome reasoning\n</think>\n<answer>\nThe answer\n</answer>"}
+            ]
+        }
+    ]
+    rewards = reward_format([], completions)
+    assert rewards[0] == 1.0
+
+
+def test_reward_format_with_search():
+    """Test reward format with search tags only (no answer tags)"""
+    completions = [
+        {"messages": [{"role": "assistant", "content": "<think>\nSome reasoning\n</think>\n<search>query</search>"}]}
+    ]
+    rewards = reward_format([], completions)
+    assert rewards[0] == 1.0
+
+
+def test_reward_format_markdown_tags():
+    """Test reward format with markdown-styled tags"""
+    markdown_formats = [
+        {
+            "messages": [
+                {
+                    "role": "assistant",
+                    "content": "**<think>**\nSome reasoning\n**</think>**\n<answer>\nThe answer\n</answer>",
+                }
+            ]
+        },
+        {
+            "messages": [
+                {
+                    "role": "assistant",
+                    "content": "*<think>*\nSome reasoning\n*</think>*\n<answer>\nThe answer\n</answer>",
+                }
+            ]
+        },
+        {
+            "messages": [
+                {
+                    "role": "assistant",
+                    "content": "_<think>_\nSome reasoning\n_</think>_\n<answer>\nThe answer\n</answer>",
+                }
+            ]
+        },
+    ]
+
+    for completion in markdown_formats:
+        rewards = reward_format([], [completion])
+        assert rewards[0] == 0.0, f"Failed with: {completion['messages'][0]['content']}"
+
+
+def test_reward_format_information_tags():
+    """Test reward format with information tags"""
+    # Test different information tag variants
+    info_variants = [
+        "<information>Some info</information>",
+        "<info>Some info</info>",
+        "<Info>Some info</Info>",
+        "<INFORMATION>Some info</INFORMATION>",
+        "<INFO>Some info</INFO>",
+    ]
+
+    for info_tag in info_variants:
+        content = f"<think>\nSome reasoning\n</think>\n{info_tag}\n<answer>\nThe answer\n</answer>"
+        completions = [{"messages": [{"role": "assistant", "content": content}]}]
+        rewards = reward_format([], completions)
+        assert rewards[0] == 0.0, f"Failed to detect information tag: {info_tag}"
+
+
+def test_reward_format_real_example():
+    """Test reward format with a real-world example - should fail now since it has both search and answer tags"""
+    content = """<think>I need to search for Paul Walker's cars in Fast and Furious movies.</think>
+<search> Paul Walker's cars in Fast and Furious </search>
+
+From the information provided, it's clear that Paul Walker was a part of the "Fast and Furious" series, but the specific list of cars is not mentioned. Since I lack this particular detail, I will call a search engine to get the specific list of cars Paul Walker drove in the "Fast and Furious" movies.
+
+<search> list of cars paul walker drove in Fast and Furious </search>
+
+Based on the updated information, it seems the focus was on his career, financials, and family. However, I am still missing the specific list of cars he drove in the "Fast and Furious" movies. Since it appears that the information might not be contained within the accessed documents, and I have no further search queries to make, I will provide an answer based on the details I have.
+
+<answer> Charger </answer>"""
+
+    completions = [{"messages": [{"role": "assistant", "content": content}]}]
+    rewards = reward_format([], completions)
+    assert rewards[0] == 0.0, "Should reject responses with both search and answer tags"
+
+
+def test_reward_format_real_example_search_only():
+    """Test reward format with search-only format in a real-world example"""
+    content = """<think>I need to search for Paul Walker's cars in Fast and Furious movies.</think>
+<search> Paul Walker's cars in Fast and Furious </search>"""
+
+    completions = [{"messages": [{"role": "assistant", "content": content}]}]
+    rewards = reward_format([], completions)
+    assert rewards[0] == 1.0, "Should accept responses with only search tags"
+
+
+def test_reward_format_real_example_answer_only():
+    """Test reward format with answer-only format in a real-world example"""
+    content = """<think>Based on the information provided, it seems Paul Walker drove a Charger in the Fast and Furious series.</think>
+<answer> Charger </answer>"""
+
+    completions = [{"messages": [{"role": "assistant", "content": content}]}]
+    rewards = reward_format([], completions)
+    assert rewards[0] == 1.0, "Should accept responses with only answer tags"
+
+
+def test_reward_format_incorrect_tag_sequence():
+    """Test reward format with incorrect tag sequence - should now pass since sequence doesn't matter"""
+    formats = [
+        {
+            "messages": [
+                {"role": "assistant", "content": "<answer>\nThe answer\n</answer>\n<think>\nSome reasoning\n</think>"}
+            ]
+        },
+        {
+            "messages": [
+                {
+                    "role": "assistant",
+                    "content": "<search>query</search>\n<think>\nSome reasoning\n</think>",
+                }
+            ]
+        },
+    ]
+
+    for completion in formats:
+        rewards = reward_format([], [completion])
+        assert rewards[0] == 1.0, f"Failed with: {completion['messages'][0]['content']}"
+
+
+def test_reward_format_multiple_answers():
+    """Test reward format with multiple answer tags"""
+    completions = [
+        {
+            "messages": [
+                {
+                    "role": "assistant",
+                    "content": "<think>\nSome reasoning\n</think>\n<answer>\nFirst answer\n</answer>\n<answer>\nSecond answer\n</answer>",
+                }
+            ]
+        }
+    ]
+    rewards = reward_format([], completions)
+    assert rewards[0] == 0.0
+
+
+def test_reward_format_incomplete_tags():
+    """Test reward format with incomplete tags"""
+    incomplete_formats = [
+        {
+            "messages": [
+                {"role": "assistant", "content": "<think>\nMissing closing think tag\n<answer>\nThe answer\n</answer>"}
+            ]
+        },
+        {
+            "messages": [
+                {
+                    "role": "assistant",
+                    "content": "<think>\nSome reasoning\n</think>\n<answer>\nMissing closing answer tag",
+                }
+            ]
+        },
+        {
+            "messages": [
+                {
+                    "role": "assistant",
+                    "content": "Missing opening think tag\n</think>\n<answer>\nThe answer\n</answer>",
+                }
+            ]
+        },
+    ]
+
+    for completion in incomplete_formats:
+        rewards = reward_format([], [completion])
+        assert rewards[0] == 0.0, f"Failed with: {completion['messages'][0]['content']}"
+
+
+def test_reward_retry():
+    """Test reward retry functionality with progressive rewards up to 5 searches"""
+    # Test case with no searches
+    completions = [{"messages": [{"role": "assistant", "content": "No searches here"}]}]
+    rewards = reward_retry([], completions)
+    assert rewards[0] == 0.0, "Should get 0 reward for no searches"
+
+    # Test case with one search
+    completions = [
+        {
+            "messages": [
+                {
+                    "role": "assistant",
+                    "content": "<think>I need more information</think>\n<search>First query</search>",
+                }
+            ]
+        }
+    ]
+    rewards = reward_retry([], completions)
+    assert rewards[0] == 0.35, "Should get 0.35 reward for one search"
+
+    # Test case with three searches in different messages
+    completions = [
+        {
+            "messages": [
+                {
+                    "role": "assistant",
+                    "content": "<think>First search</think>\n<search>Query 1</search>",
+                },
+                {"role": "assistant", "content": "<think>Second search</think>\n<search>Query 2</search>"},
+                {"role": "assistant", "content": "<think>Third search</think>\n<search>Query 3</search>"},
+            ]
+        }
+    ]
+    rewards = reward_retry([], completions)
+    assert rewards[0] == 0.65, "Should get 0.65 reward for three searches"
+
+    # Test case with five searches in different messages
+    completions = [
+        {
+            "messages": [
+                {"role": "assistant", "content": "<think>Search 1</think>\n<search>Query 1</search>"},
+                {"role": "assistant", "content": "<think>Search 2</think>\n<search>Query 2</search>"},
+                {"role": "assistant", "content": "<think>Search 3</think>\n<search>Query 3</search>"},
+                {"role": "assistant", "content": "<think>Search 4</think>\n<search>Query 4</search>"},
+                {"role": "assistant", "content": "<think>Search 5</think>\n<search>Query 5</search>"},
+            ]
+        }
+    ]
+    rewards = reward_retry([], completions)
+    assert rewards[0] == 0.95, "Should get 0.95 reward for five searches"
+
+    # Test case with more than five searches
+    completions = [
+        {
+            "messages": [
+                {"role": "assistant", "content": "<think>Search 1</think>\n<search>Query 1</search>"},
+                {"role": "assistant", "content": "<think>Search 2</think>\n<search>Query 2</search>"},
+                {"role": "assistant", "content": "<think>Search 3</think>\n<search>Query 3</search>"},
+                {"role": "assistant", "content": "<think>Search 4</think>\n<search>Query 4</search>"},
+                {"role": "assistant", "content": "<think>Search 5</think>\n<search>Query 5</search>"},
+                {"role": "assistant", "content": "<think>Search 6</think>\n<search>Query 6</search>"},
+            ]
+        }
+    ]
+    rewards = reward_retry([], completions)
+    assert rewards[0] == 0.95, "Should cap at 0.95 reward for more than five searches"
+
+    # Test case with violation (multiple searches in one message)
+    completions = [
+        {
+            "messages": [
+                {
+                    "role": "assistant",
+                    "content": "<think>Multiple searches</think>\n<search>First query</search>\n<search>Second query</search>",
+                }
+            ]
+        }
+    ]
+    rewards = reward_retry([], completions)
+    assert rewards[0] == 0.25, "Should get penalized reward (0.5 * 0.5) for violation"
+
+
+def test_reward_em_chunk():
+    """Test reward EM chunk functionality with information tags"""
+    # Test case with matching content in ipython role
+    completions = [
+        {"messages": [{"role": "ipython", "content": "<information>This is the correct chunk content</information>"}]}
+    ]
+    reward_kwargs = {"chunk_content": ["This is the correct chunk content"]}
+
+    rewards = reward_em_chunk([], completions, **reward_kwargs)
+    assert len(rewards) == 1
+    assert rewards[0] == 1.0, "Should get reward 1.0 for exact match in ipython role"
+
+    # Test case with matching content in user role
+    completions = [
+        {"messages": [{"role": "user", "content": "<information>This is the correct chunk content</information>"}]}
+    ]
+    rewards = reward_em_chunk([], completions, **reward_kwargs)
+    assert rewards[0] == 1.0, "Should get reward 1.0 for exact match in user role"
+
+    # Test case with content not starting with <information> tag
+    completions = [{"messages": [{"role": "ipython", "content": "This is the correct chunk content"}]}]
+    rewards = reward_em_chunk([], completions, **reward_kwargs)
+    assert rewards[0] == 0.0, "Should get reward 0.0 for missing information tag"
+
+    # Test case with wrong role
+    completions = [
+        {
+            "messages": [
+                {"role": "assistant", "content": "<information>This is the correct chunk content</information>"}
+            ]
+        }
+    ]
+    rewards = reward_em_chunk([], completions, **reward_kwargs)
+    assert rewards[0] == 0.0, "Should get reward 0.0 for wrong role"
+
+    # Test case with multiple messages, only one matching
+    completions = [
+        {
+            "messages": [
+                {"role": "ipython", "content": "<information>Wrong content</information>"},
+                {"role": "user", "content": "<information>This is the correct chunk content</information>"},
+            ]
+        }
+    ]
+    rewards = reward_em_chunk([], completions, **reward_kwargs)
+    assert rewards[0] == 1.0, "Should get reward 1.0 if any message matches"
+
+
+def test_reward_em_chunk_no_chunk_content():
+    """Test reward EM chunk with no chunk content provided"""
+    completions = [{"messages": [{"role": "ipython", "content": "<information>Some content</information>"}]}]
+
+    with pytest.raises(ValueError, match="chunk_content must be provided"):
+        reward_em_chunk([], completions)
+
+
+def test_reward_em_chunk_multiple_chunks():
+    """Test reward EM chunk with multiple chunks to match"""
+    completions = [
+        {"messages": [{"role": "ipython", "content": "<information>First chunk content</information>"}]},
+        {"messages": [{"role": "user", "content": "<information>Second chunk content</information>"}]},
+    ]
+    reward_kwargs = {"chunk_content": ["First chunk content", "Second chunk content"]}
+
+    rewards = reward_em_chunk([], completions, **reward_kwargs)
+    assert len(rewards) == 2
+    assert rewards == [1.0, 1.0], "Should get reward 1.0 for each matching chunk"
+
+
+def test_reward_em_chunk_whitespace_handling():
+    """Test reward EM chunk handles whitespace properly"""
+    completions = [
+        {"messages": [{"role": "ipython", "content": "  <information>  Content with spaces  </information>  "}]}
+    ]
+    reward_kwargs = {"chunk_content": ["Content with spaces"]}
+
+    rewards = reward_em_chunk([], completions, **reward_kwargs)
+    assert rewards[0] == 1.0, "Should handle whitespace in content and tags"
+
+
+def test_reward_format_search_or_answer_not_both():
+    """Test that having both search and answer tags in the same message is not allowed"""
+    content = "<think>I need to search</think>\n<search>query</search>\n<answer>Final answer</answer>"
+    completions = [{"messages": [{"role": "assistant", "content": content}]}]
+    rewards = reward_format([], completions)
+    assert rewards[0] == 0.0, "Should reject messages with both search and answer tags"
+
+    # Verify that having just search tag is valid
+    content_search_only = "<think>I need to search</think>\n<search>query</search>"
+    completions = [{"messages": [{"role": "assistant", "content": content_search_only}]}]
+    rewards = reward_format([], completions)
+    assert rewards[0] == 1.0, "Should accept messages with just search tags"
+
+    # Verify that having just answer tag is valid
+    content_answer_only = "<think>I know the answer</think>\n<answer>Final answer</answer>"
+    completions = [{"messages": [{"role": "assistant", "content": content_answer_only}]}]
+    rewards = reward_format([], completions)
+    assert rewards[0] == 1.0, "Should accept messages with just answer tags"
--- a/tests/test_tokenizer_adapters.py
+++ b/tests/test_tokenizer_adapters.py
@ -0,0 +1,428 @@
+"""
+Test module for tokenizer adapters.
+"""
+
+import torch
+from transformers import LlamaTokenizerFast
+
+from src.config import logger
+from src.tokenizer_adapter import LlamaTokenizerAdapter, R1DistilTokenizerAdapter
+
+# Test conversation used across all tests
+TEST_CHAT = [
+    {
+        "role": "system",
+        "content": "You are a friendly chatbot who always responds in the style of a pirate",
+    },
+    {"role": "user", "content": "Hello, how are you?"},
+    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
+    {"role": "ipython", "content": "THIS IS THE DOCUMENT!!!"},
+    {"role": "user", "content": "Hello, have you eanten?"},
+    {"role": "assistant", "content": "No I'm hungry?"},
+]
+
+
+def test_llama_format():
+    """Test Llama tokenizer adapter format handling."""
+    # Setup
+    tokenizer = LlamaTokenizerFast.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
+    adapter = LlamaTokenizerAdapter()
+
+    # Get formatted conversation using chat template
+    convo = tokenizer.apply_chat_template(TEST_CHAT, tokenize=False)
+
+    # Test with marker included (training scenario)
+    prompt, response = adapter.split_prompt_assistant(convo)
+    assert prompt, "Prompt should not be empty"
+    assert response, "Response should not be empty"
+    assert "<|start_header_id|>assistant<|end_header_id|>" in prompt, (
+        "Prompt should contain assistant marker"
+    )  # Absolute Cinema I have no idea why.
+    assert "I'm doing great" in response, "Response should contain assistant's message"
+
+
+def test_r1_distil_format():
+    """Test R1-Distil tokenizer adapter format handling."""
+    # Setup
+    tokenizer = LlamaTokenizerFast.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
+    adapter = R1DistilTokenizerAdapter()
+
+    # Get formatted conversation using chat template
+    convo = tokenizer.apply_chat_template(TEST_CHAT, tokenize=False)
+
+    logger.debug("\n🔍 Testing R1Distil Format:")
+    logger.debug(f"Input conversation length: {len(convo)}")
+    logger.debug(f"Input conversation: {convo}")
+
+    # Test
+    try:
+        prompt, response = adapter.split_prompt_assistant(convo)
+        logger.debug("Successfully split into:")
+        logger.debug(f"Prompt length: {len(prompt)}")
+        logger.debug(f"Response length: {len(response)}")
+    except ValueError as e:
+        logger.debug(f"❌ Error splitting conversation: {str(e)}")
+        raise
+
+    assert prompt, "Prompt should not be empty"
+    assert response, "Response should not be empty"
+    # assert "assistant" not in prompt.lower(), "Prompt should not contain assistant response" dont ask me why idk. this is dumb
+    assert "I'm doing great" in response, "Response should contain assistant's message"
+
+
+def test_llama_mask():
+    """Test Llama tokenizer adapter mask generation."""
+    # Setup
+    tokenizer = LlamaTokenizerFast.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
+    adapter = LlamaTokenizerAdapter()
+
+    # Get formatted conversation using chat template
+    convo = tokenizer.apply_chat_template(TEST_CHAT, tokenize=False)
+
+    # Test
+    logger.debug("\n🔍 Testing Llama Mask Generation:")
+    logger.debug(f"Input conversation length: {len(convo)}")
+
+    # Get tokenization details
+    encoding = tokenizer(convo, add_special_tokens=False)
+    logger.debug(f"Tokenized length: {len(encoding.input_ids)}")
+    logger.debug(f"Input IDs: {encoding.input_ids}")
+
+    # Get mask
+    mask = adapter.get_mask(convo, tokenizer)
+    logger.debug(f"Generated mask shape: {mask.shape}")
+    logger.debug(f"Mask sum: {mask.sum().item()}")
+    logger.debug(f"Mask values: {mask.tolist()}")
+
+    assert isinstance(mask, torch.Tensor)
+    assert mask.dtype == torch.int
+    assert mask.dim() == 1
+    assert mask.sum().item() > 0
+    assert mask.max().item() == 1
+    assert mask.min().item() == 0
+
+    # Verify mask length matches token length
+    assert mask.shape[0] == len(encoding.input_ids), "Mask length must match token length"
+
+    # Verify assistant response is masked (not the marker)
+    start_header_id = tokenizer.convert_tokens_to_ids("<|start_header_id|>")
+    assistant_token = tokenizer.convert_tokens_to_ids("assistant")
+    end_header_id = tokenizer.convert_tokens_to_ids("<|end_header_id|>")
+
+    # Find the position of the assistant marker
+    input_ids = encoding.input_ids
+    i = 0
+    while i < len(input_ids) - 1:
+        if input_ids[i] == start_header_id and input_ids[i + 1] == assistant_token:
+            # Skip the marker and header
+            i += 2
+            while i < len(input_ids) and input_ids[i] != end_header_id:
+                i += 1
+            i += 2  # Skip end header
+            # Check if the response is masked
+            response_start = i
+            while i < len(input_ids) and input_ids[i] != tokenizer.convert_tokens_to_ids("<|eot_id|>"):
+                i += 1
+            response_end = i
+            assert mask[response_start:response_end].sum().item() > 0, "Assistant response should be masked"
+            logger.debug(f"Found assistant response at positions {response_start}:{response_end}")
+            logger.debug(f"Response mask values: {mask[response_start:response_end]}")
+            break
+        i += 1
+
+
+def test_r1_distil_mask():
+    """Test R1-Distil tokenizer adapter mask generation."""
+    # Setup
+    tokenizer = LlamaTokenizerFast.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
+    adapter = R1DistilTokenizerAdapter()
+
+    # Get formatted conversation using chat template
+    convo = tokenizer.apply_chat_template(TEST_CHAT, tokenize=False)
+
+    logger.debug("\n🔍 Testing R1Distil Mask:")
+    logger.debug(f"Input conversation length: {len(convo)}")
+    logger.debug(f"Input conversation: {convo}")
+
+    # Test
+    mask = adapter.get_mask(convo, tokenizer)
+    logger.debug(f"Generated mask shape: {mask.shape}")
+    logger.debug(f"Mask sum: {mask.sum().item()}")
+    logger.debug(f"Mask values: {mask.tolist()}")
+
+    assert isinstance(mask, torch.Tensor)
+    assert mask.dtype == torch.int
+    assert mask.dim() == 1
+    assert mask.sum().item() > 0
+    assert mask.max().item() == 1
+    assert mask.min().item() == 0
+
+    # Verify mask length matches token length
+    encoding = tokenizer(convo, add_special_tokens=False)
+    logger.debug(f"Token length: {len(encoding.input_ids)}")
+    logger.debug(f"Token IDs: {encoding.input_ids}")
+    assert mask.shape[0] == len(encoding.input_ids), "Mask length must match token length"
+
+
+def test_llama_mask_length():
+    """Test that mask length matches input_ids length for Llama format."""
+    # Setup
+    tokenizer = LlamaTokenizerFast.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
+    adapter = LlamaTokenizerAdapter()
+
+    # Get formatted conversation using chat template
+    convo = tokenizer.apply_chat_template(TEST_CHAT, tokenize=False)
+
+    # Get tokenization and mask
+    encoding = tokenizer(convo, add_special_tokens=False)
+    mask = adapter.get_mask(convo, tokenizer)
+
+    # Debug info
+    logger.debug("\n🔍 Testing Llama Mask Length:")
+    logger.debug(f"Token length: {len(encoding.input_ids)}")
+    logger.debug(f"Mask length: {len(mask)}")
+
+    # Verify lengths match
+    assert len(mask) == len(encoding.input_ids), (
+        f"Mask length ({len(mask)}) != input_ids length ({len(encoding.input_ids)})"
+    )
+
+
+def test_r1_distil_mask_length():
+    """Test that mask length matches input_ids length for R1-Distil format."""
+    # Setup
+    tokenizer = LlamaTokenizerFast.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
+    adapter = R1DistilTokenizerAdapter()
+
+    # Get formatted conversation using chat template
+    convo = tokenizer.apply_chat_template(TEST_CHAT, tokenize=False)
+
+    # Get tokenization and mask
+    encoding = tokenizer(convo, add_special_tokens=False)
+    mask = adapter.get_mask(convo, tokenizer)
+
+    # Debug info
+    logger.debug("\n🔍 Testing R1Distil Mask Length:")
+    logger.debug(f"Token length: {len(encoding.input_ids)}")
+    logger.debug(f"Mask length: {len(mask)}")
+
+    # Verify lengths match
+    assert len(mask) == len(encoding.input_ids), (
+        f"Mask length ({len(mask)}) != input_ids length ({len(encoding.input_ids)})"
+    )
+
+
+def test_llama_mask_correctness():
+    """Test that the mask is correctly applied to assistant responses for Llama format."""
+    # Setup
+    tokenizer = LlamaTokenizerFast.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
+    adapter = LlamaTokenizerAdapter()
+
+    # Get formatted conversation using chat template
+    convo = tokenizer.apply_chat_template(TEST_CHAT, tokenize=False)
+
+    # Get tokenization and mask
+    encoding = tokenizer(convo, add_special_tokens=False)
+    tokens = tokenizer.convert_ids_to_tokens(encoding.input_ids)
+    mask = adapter.get_mask(convo, tokenizer)
+
+    # Debug info
+    logger.debug(f"Total tokens: {len(tokens)}")
+    logger.debug(f"Masked tokens (1s): {mask.sum().item()}")
+    logger.debug(f"Unmasked tokens (0s): {len(mask) - mask.sum().item()}")
+
+    # Verify expected count of masked tokens
+    assert 15 <= mask.sum().item() <= 20, f"Expected between 15-20 masked tokens, got {mask.sum().item()}"
+
+    # Extract assistant responses from TEST_CHAT for verification
+    assistant_responses = [msg["content"] for msg in TEST_CHAT if msg["role"] == "assistant"]
+
+    # Verify each assistant response is masked
+    for response in assistant_responses:
+        # Find where this response occurs in the text
+        response_pos = convo.find(response)
+        if response_pos == -1:
+            continue
+
+        # Convert position in string to position in tokens
+        offset = len(tokenizer.encode(convo[:response_pos], add_special_tokens=False))
+        response_tokens = tokenizer.encode(response, add_special_tokens=False)
+
+        # Check if tokens in this response are masked
+        for i, token_id in enumerate(response_tokens):
+            token_pos = offset + i
+            if token_pos < len(mask):
+                # Check if token is masked - allow some flexibility at response boundaries
+                if i > 0 and i < len(response_tokens) - 1 and mask[token_pos] != 1:
+                    token_text = tokenizer.decode([token_id])
+                    assert False, f"Token '{token_text}' in assistant response is not masked"
+
+    # Verify system and user messages are NOT masked
+    for msg in TEST_CHAT:
+        if msg["role"] not in ["assistant"]:
+            content = msg["content"]
+            content_pos = convo.find(content)
+            if content_pos == -1:
+                continue
+
+            # Check a sample of tokens from each non-assistant message
+            offset = len(tokenizer.encode(convo[:content_pos], add_special_tokens=False))
+            content_tokens = tokenizer.encode(content, add_special_tokens=False)
+
+            # Check 3 tokens max to keep test simple
+            for i in range(min(3, len(content_tokens))):
+                token_pos = offset + i
+                if token_pos < len(mask) and mask[token_pos] == 1:
+                    token_text = tokenizer.decode([content_tokens[i]])
+                    assert False, f"Token '{token_text}' in non-assistant message is incorrectly masked"
+
+
+def test_r1_distil_mask_correctness():
+    """Test that the mask is correctly applied to assistant responses for R1-Distil format."""
+    # Setup
+    tokenizer = LlamaTokenizerFast.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
+    adapter = R1DistilTokenizerAdapter()
+
+    # Get formatted conversation using chat template
+    convo = tokenizer.apply_chat_template(TEST_CHAT, tokenize=False)
+
+    # Get tokenization and mask
+    encoding = tokenizer(convo, add_special_tokens=False)
+    tokens = tokenizer.convert_ids_to_tokens(encoding.input_ids)
+    mask = adapter.get_mask(convo, tokenizer)
+
+    # Debug info
+    logger.debug(f"Total tokens: {len(tokens)}")
+    logger.debug(f"Masked tokens (1s): {mask.sum().item()}")
+    logger.debug(f"Unmasked tokens (0s): {len(mask) - mask.sum().item()}")
+
+    # Verify expected count of masked tokens - adjusted for not masking end markers
+    assert 13 <= mask.sum().item() <= 17, f"Expected between 13-17 masked tokens, got {mask.sum().item()}"
+
+    # Extract assistant responses from TEST_CHAT for verification
+    assistant_responses = [msg["content"] for msg in TEST_CHAT if msg["role"] == "assistant"]
+
+    # Verify each assistant response is masked
+    for response in assistant_responses:
+        # Skip long responses to keep test simple
+        if len(response) > 50:
+            continue
+
+        # Find a unique portion of this response to locate it
+        unique_part = response[:20] if len(response) > 20 else response
+        response_pos = convo.find(unique_part)
+        if response_pos == -1:
+            continue
+
+        # Convert position in string to position in tokens
+        offset = len(tokenizer.encode(convo[:response_pos], add_special_tokens=False))
+        response_tokens = tokenizer.encode(unique_part, add_special_tokens=False)
+
+        # Check if tokens in this response are masked
+        masked_count = 0
+        for i, token_id in enumerate(response_tokens):
+            token_pos = offset + i
+            if token_pos < len(mask) and mask[token_pos] == 1:
+                masked_count += 1
+
+        # Verify that most of the response tokens are masked
+        assert masked_count >= len(response_tokens) * 0.8, f"Not enough tokens masked in '{unique_part}'"
+
+    # Verify system and user messages are NOT masked
+    for msg in TEST_CHAT:
+        if msg["role"] not in ["assistant"]:
+            content = msg["content"]
+            # Use a shorter substring to ensure we find it
+            content_sample = content[:15] if len(content) > 15 else content
+            content_pos = convo.find(content_sample)
+            if content_pos == -1:
+                continue
+
+            # Check a sample of tokens from each non-assistant message
+            offset = len(tokenizer.encode(convo[:content_pos], add_special_tokens=False))
+            content_tokens = tokenizer.encode(content_sample, add_special_tokens=False)
+
+            # Count masked tokens (should be very few or none)
+            masked_count = 0
+            for i in range(len(content_tokens)):
+                token_pos = offset + i
+                if token_pos < len(mask) and mask[token_pos] == 1:
+                    masked_count += 1
+
+            # Allow some flexibility but most tokens should not be masked
+            assert masked_count <= len(content_tokens) * 0.2, "Too many tokens masked in non-assistant message"
+
+
+def test_r1_distil_multi_turn():
+    """Test R1-Distil adapter with multi-turn conversations including search."""
+    # Setup
+    tokenizer = LlamaTokenizerFast.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
+    adapter = R1DistilTokenizerAdapter()
+
+    # Create a multi-turn conversation with search
+    multi_turn_chat = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant.",
+        },
+        {"role": "user", "content": "What's the capital of France?"},
+        {"role": "assistant", "content": "<search>capital of France</search>"},
+        {"role": "user", "content": "<information>Paris is the capital of France.</information>"},
+        {"role": "assistant", "content": "The capital of France is Paris."},
+    ]
+
+    # Get formatted conversation using chat template
+    convo = tokenizer.apply_chat_template(multi_turn_chat, tokenize=False)
+
+    logger.debug("\n🔍 Testing R1Distil Multi-turn:")
+    logger.debug(f"Multi-turn conversation length: {len(convo)}")
+    logger.debug(f"Multi-turn conversation: {convo[:200]}...")
+
+    # Get mask for the entire conversation
+    full_mask = adapter.get_mask(convo, tokenizer)
+
+    # Split into prompt and response
+    prompt_text, response_text = adapter.split_prompt_assistant(convo)
+
+    # Get tokens for prompt and response
+    prompt_tokens = tokenizer(prompt_text, add_special_tokens=False, return_tensors="pt")["input_ids"].squeeze()
+    response_tokens = tokenizer(response_text, add_special_tokens=False, return_tensors="pt")["input_ids"].squeeze()
+
+    # Slice the mask to match the tokens after prompt
+    prompt_len = prompt_tokens.shape[0]
+    response_mask = full_mask[prompt_len:]
+
+    # Debug info
+    logger.debug(f"Prompt tokens length: {len(prompt_tokens)}")
+    logger.debug(f"Response tokens length: {len(response_tokens)}")
+    logger.debug(f"Response mask length: {len(response_mask)}")
+    logger.debug(f"Response mask sum: {response_mask.sum().item()}")
+
+    # Verify response tokens length matches mask length
+    # Allow for small differences due to special token handling
+    token_mask_diff = abs(len(response_tokens) - len(response_mask))
+    assert token_mask_diff <= 5, f"Response tokens and mask length difference too large: {token_mask_diff}"
+
+    # If mask is longer, truncate to match response tokens
+    if len(response_mask) > len(response_tokens):
+        response_mask = response_mask[: len(response_tokens)]
+
+    # Get token IDs for markers to identify non-content tokens
+    end_marker_tokens = tokenizer(adapter.get_end_marker(), add_special_tokens=False).input_ids
+    assistant_marker_tokens = tokenizer(adapter.get_assistant_marker(), add_special_tokens=False).input_ids
+    special_token_count = len(end_marker_tokens) + len(assistant_marker_tokens)
+
+    # Verify the mask properly covers assistant responses
+    non_zero_mask = response_mask.sum().item()
+    assert non_zero_mask > 0, "Response mask should have non-zero values"
+
+    # Instead of requiring half of ALL tokens to be masked,
+    # we verify that we have a reasonable number of masked tokens
+    # after accounting for markers and special tokens
+    content_token_count = len(response_mask) - special_token_count
+    assert non_zero_mask > 0.2 * content_token_count, "Should have some reasonable amount of content tokens masked"
+
+    # Verify end markers are not masked
+    for i in range(len(response_tokens) - len(end_marker_tokens) + 1):
+        if response_tokens[i : i + len(end_marker_tokens)].tolist() == end_marker_tokens:
+            assert not response_mask[i : i + len(end_marker_tokens)].any(), "End markers should not be masked"