You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
69 lines
1.7 KiB
69 lines
1.7 KiB
# HuggingFaceTokenizer
|
|
|
|
import os
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
from swarms.tokenizers.r_tokenizers import HuggingFaceTokenizer
|
|
|
|
|
|
# Test class setup
|
|
@pytest.fixture
def hftokenizer():
    """Provide a ``HuggingFaceTokenizer`` built from ``<cwd>/modeldir``."""
    # The model directory is resolved against the current working directory,
    # so the fixture depends on where pytest is launched from.
    return HuggingFaceTokenizer(os.path.join(os.getcwd(), "modeldir"))
|
|
|
|
|
|
# testing __init__
|
|
@patch("os.path")
@patch("swarms.tokenizers.get_logger")
def test___init__(mock_get_logger, mock_path, hftokenizer):
    """Check the attributes exposed by a freshly constructed tokenizer."""
    # NOTE(review): pytest presumably resolves the `hftokenizer` fixture
    # before the patch decorators become active, in which case the return
    # values configured below cannot influence the tokenizer that was
    # already built -- confirm, and consider constructing it inside the
    # patched scope instead.
    mock_get_logger.return_value = "dummy_logger"
    mock_path.join.return_value = "dummy_path"
    mock_path.exists.return_value = False

    tokenizer = hftokenizer
    assert tokenizer.model_dir == "dummy_path"
    assert tokenizer.logger == "dummy_logger"
    assert tokenizer._maybe_decode_bytes is False
    assert tokenizer._prefix_space_tokens is None
|
|
|
|
|
|
# testing vocab_size property
|
|
def test_vocab_size(hftokenizer):
    """The ``vocab_size`` property reports the expected vocabulary size."""
    # NOTE(review): the expected size is tied to whatever model lives in
    # the fixture's model directory -- confirm which model that is.
    expected_size = 30522
    assert hftokenizer.vocab_size == expected_size
|
|
|
|
|
|
# testing bos_token_id property
|
|
def test_bos_token_id(hftokenizer):
    """The ``bos_token_id`` property returns the expected token id."""
    expected_id = 101
    assert hftokenizer.bos_token_id == expected_id
|
|
|
|
|
|
# testing eos_token_id property
|
|
def test_eos_token_id(hftokenizer):
    """The ``eos_token_id`` property returns the expected token id."""
    expected_id = 102
    assert hftokenizer.eos_token_id == expected_id
|
|
|
|
|
|
# testing prefix_space_tokens property
|
|
def test_prefix_space_tokens(hftokenizer):
    """The ``prefix_space_tokens`` property yields a non-empty collection."""
    prefix_tokens = hftokenizer.prefix_space_tokens
    assert len(prefix_tokens) > 0
|
|
|
|
|
|
# testing _maybe_add_prefix_space method
|
|
def test__maybe_add_prefix_space(hftokenizer):
    """Exercise ``_maybe_add_prefix_space`` with and without a leading space.

    Both calls are expected to return ``" is why"``: the first input already
    starts with a space, the second does not.
    """
    already_prefixed = hftokenizer._maybe_add_prefix_space(
        [101, 2003, 2010, 2050, 2001, 2339], " is why"
    )
    assert already_prefixed == " is why"

    newly_prefixed = hftokenizer._maybe_add_prefix_space(
        [2003, 2010, 2050, 2001, 2339], "is why"
    )
    assert newly_prefixed == " is why"
|
|
|
|
|
|
# continuing tests for other methods...
|