From f4c66926717892ff6318499b4024870b34d1b21a Mon Sep 17 00:00:00 2001
From: Kye
Date: Tue, 26 Mar 2024 10:08:58 -0700
Subject: [PATCH] [CLEANUP]

---
 pyproject.toml                                |  2 +-
 swarms/memory/__init__.py                     | 12 +--
 swarms/models/__init__.py                     | 26 ++----
 swarms/models/petals.py                       |  4 +-
 tests/tokenizers/test_anthropictokenizer.py   | 40 ---------
 tests/tokenizers/test_basetokenizer.py        | 46 -----------
 tests/tokenizers/test_coheretokenizer.py      | 37 ---------
 tests/tokenizers/test_huggingfacetokenizer.py | 68 ---------------
 tests/tokenizers/test_openaitokenizer.py      | 48 -----------
 .../tokenizers/test_sentencepiecetokenizer.py |  1 -
 tests/tokenizers/test_tokenizer.py            | 82 -------------------
 11 files changed, 13 insertions(+), 353 deletions(-)
 delete mode 100644 tests/tokenizers/test_anthropictokenizer.py
 delete mode 100644 tests/tokenizers/test_basetokenizer.py
 delete mode 100644 tests/tokenizers/test_coheretokenizer.py
 delete mode 100644 tests/tokenizers/test_huggingfacetokenizer.py
 delete mode 100644 tests/tokenizers/test_openaitokenizer.py
 delete mode 100644 tests/tokenizers/test_sentencepiecetokenizer.py
 delete mode 100644 tests/tokenizers/test_tokenizer.py

diff --git a/pyproject.toml b/pyproject.toml
index 07bd584e..1a7a63f3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,7 @@ classifiers = [
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
 torch = ">=2.1.1,<3.0"
-transformers = "4.39.0"
+transformers = ">= 4.39.0, <5.0.0"
 asyncio = ">=3.4.3,<4.0"
 einops = "0.7.0"
 langchain-core = "0.1.33"
diff --git a/swarms/memory/__init__.py b/swarms/memory/__init__.py
index 0dafab08..b92e35e1 100644
--- a/swarms/memory/__init__.py
+++ b/swarms/memory/__init__.py
@@ -1,21 +1,17 @@
 from swarms.memory.action_subtask import ActionSubtaskEntry
 from swarms.memory.base_db import AbstractDatabase
 from swarms.memory.base_vectordb import AbstractVectorDatabase
-from swarms.memory.chroma_db import ChromaDB
 from swarms.memory.dict_internal_memory import DictInternalMemory
 from swarms.memory.dict_shared_memory import DictSharedMemory
 from swarms.memory.short_term_memory import ShortTermMemory
-from swarms.memory.sqlite import SQLiteDB
 from swarms.memory.visual_memory import VisualShortTermMemory
 
 __all__ = [
-    "AbstractVectorDatabase",
     "AbstractDatabase",
-    "ShortTermMemory",
-    "SQLiteDB",
-    "VisualShortTermMemory",
+    "AbstractVectorDatabase",
     "ActionSubtaskEntry",
-    "ChromaDB",
     "DictInternalMemory",
     "DictSharedMemory",
-]
+    "ShortTermMemory",
+    "VisualShortTermMemory",
+]
\ No newline at end of file
diff --git a/swarms/models/__init__.py b/swarms/models/__init__.py
index 8400073f..f26a2eeb 100644
--- a/swarms/models/__init__.py
+++ b/swarms/models/__init__.py
@@ -1,8 +1,6 @@
 from swarms.models.base_embedding_model import BaseEmbeddingModel
 from swarms.models.base_llm import AbstractLLM  # noqa: E402
 from swarms.models.base_multimodal_model import BaseMultiModalModel
-from swarms.models.biogpt import BioGPT  # noqa: E402
-from swarms.models.clipq import CLIPQ  # noqa: E402
 from swarms.models.fire_function import FireFunctionCaller
 from swarms.models.fuyu import Fuyu  # noqa: E402
 from swarms.models.gemini import Gemini  # noqa: E402
@@ -52,12 +50,7 @@ from swarms.models.types import (  # noqa: E402
     TextModality,
     VideoModality,
 )
-
-# from swarms.models.ultralytics_model import UltralyticsModel
 from swarms.models.vilt import Vilt  # noqa: E402
-from swarms.models.wizard_storytelling import WizardLLMStoryTeller
-from swarms.models.zephyr import Zephyr  # noqa: E402
-from swarms.models.zeroscope import ZeroscopeTTV  # noqa: E402
 
 __all__ = [
     "AbstractLLM",
@@ -65,41 +58,34 @@ __all__ = [
     "AzureOpenAI",
     "BaseEmbeddingModel",
     "BaseMultiModalModel",
-    "BioGPT",
-    "CLIPQ",
     "Cohere",
     "FireFunctionCaller",
     "Fuyu",
-    "GPT4VisionAPI",
     "Gemini",
+    "GPT4VisionAPI",
     "HuggingfaceLLM",
     "Idefics",
     "Kosmos",
     "LayoutLMDocumentQA",
     "LavaMultiModal",
-    "Replicate",
-    "MPT7B",
     "Mistral",
     "Mixtral",
-    "MosaicML",
+    "MPT7B",
+    "MultimodalData",
     "Nougat",
     "OpenAI",
     "OpenAIChat",
     "OpenAITTS",
     "Petals",
     "QwenVLMultiModal",
+    "Replicate",
     "SamplingParams",
     "SamplingType",
     "SegmentAnythingMarkGenerator",
     "TextModality",
-    "TimmModel",
     "TogetherLLM",
     "Vilt",
-    "VideoModality",
-    "WizardLLMStoryTeller",
-    "Zephyr",
-    "ZeroscopeTTV",
     "AudioModality",
     "ImageModality",
-    "MultimodalData",
-]
+    "VideoModality",
+]
\ No newline at end of file
diff --git a/swarms/models/petals.py b/swarms/models/petals.py
index 7ceeef8b..699d7d9d 100644
--- a/swarms/models/petals.py
+++ b/swarms/models/petals.py
@@ -1,7 +1,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from swarms.models.base_llm import AbstractLLM
 
-
-class Petals:
+class Petals(AbstractLLM):
     """Petals Bloom models."""
 
     def __init__(
diff --git a/tests/tokenizers/test_anthropictokenizer.py b/tests/tokenizers/test_anthropictokenizer.py
deleted file mode 100644
index 14b2fd86..00000000
--- a/tests/tokenizers/test_anthropictokenizer.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# AnthropicTokenizer
-
-import pytest
-
-from swarms.tokenizers.anthropic_tokenizer import AnthropicTokenizer
-
-
-def test_post_init():
-    tokenizer = AnthropicTokenizer()
-    assert tokenizer.model == "claude-2.1"
-    assert tokenizer.max_tokens == 200000
-
-
-def test_default_max_tokens():
-    tokenizer = AnthropicTokenizer(model="claude")
-    assert tokenizer.default_max_tokens() == 100000
-
-
-@pytest.mark.parametrize(
-    "model,tokens", [("claude-2.1", 200000), ("claude", 100000)]
-)
-def test_default_max_tokens_models(model, tokens):
-    tokenizer = AnthropicTokenizer(model=model)
-    assert tokenizer.default_max_tokens() == tokens
-
-
-def test_count_tokens_string():
-    # Insert mock instantiation of anthropic client and its count_tokens function
-    text = "This is a test string."
-    tokenizer = AnthropicTokenizer()
-    tokens = tokenizer.count_tokens(text)
-    assert tokens == 5
-
-
-def test_count_tokens_list():
-    # Insert mock instantiation of anthropic client and its count_tokens function
-    text = ["This", "is", "a", "test", "string."]
-    tokenizer = AnthropicTokenizer()
-    with pytest.raises(ValueError):
-        tokenizer.count_tokens(text)
diff --git a/tests/tokenizers/test_basetokenizer.py b/tests/tokenizers/test_basetokenizer.py
deleted file mode 100644
index 3956d2de..00000000
--- a/tests/tokenizers/test_basetokenizer.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# BaseTokenizer
-
-import pytest
-
-from swarms.tokenizers.base_tokenizer import BaseTokenizer
-
-
-# 1. Fixture for BaseTokenizer instance.
-@pytest.fixture
-def base_tokenizer():
-    return BaseTokenizer(max_tokens=100)
-
-
-# 2. Tests for __post_init__.
-def test_post_init(base_tokenizer):
-    assert base_tokenizer.stop_sequences == ["<|Response|>"]
-    assert base_tokenizer.stop_token == "<|Response|>"
-
-
-# 3. Tests for count_tokens_left with different inputs.
-def test_count_tokens_left_with_positive_diff(
-    base_tokenizer, monkeypatch
-):
-    # Mocking count_tokens to return a specific value
-    monkeypatch.setattr(
-        "swarms.tokenizers.BaseTokenizer.count_tokens",
-        lambda x, y: 50,
-    )
-    assert base_tokenizer.count_tokens_left("some text") == 50
-
-
-def test_count_tokens_left_with_zero_diff(
-    base_tokenizer, monkeypatch
-):
-    monkeypatch.setattr(
-        "swarms.tokenizers.BaseTokenizer.count_tokens",
-        lambda x, y: 100,
-    )
-    assert base_tokenizer.count_tokens_left("some text") == 0
-
-
-# 4. Add tests for count_tokens. This method is an abstract one, so testing it
-# will be dependent on the actual implementation in the subclass. Here is just
-# a general idea how to test it (we assume that test_count_tokens is implemented in some subclass).
-def test_count_tokens(subclass_tokenizer_instance):
-    assert subclass_tokenizer_instance.count_tokens("some text") == 6
diff --git a/tests/tokenizers/test_coheretokenizer.py b/tests/tokenizers/test_coheretokenizer.py
deleted file mode 100644
index 2607cf9a..00000000
--- a/tests/tokenizers/test_coheretokenizer.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# CohereTokenizer
-
-from unittest.mock import MagicMock
-
-import pytest
-
-from swarms.tokenizers.cohere_tokenizer import CohereTokenizer
-
-
-@pytest.fixture
-def cohere_tokenizer():
-    mock_client = MagicMock()
-    mock_client.tokenize.return_value.tokens = [
-        "token1",
-        "token2",
-        "token3",
-    ]
-    return CohereTokenizer(model="", client=mock_client)
-
-
-def test_count_tokens_with_string(cohere_tokenizer):
-    tokens_count = cohere_tokenizer.count_tokens("valid string")
-    assert tokens_count == 3
-
-
-def test_count_tokens_with_non_string(cohere_tokenizer):
-    with pytest.raises(ValueError):
-        cohere_tokenizer.count_tokens(["invalid", "input"])
-
-
-def test_count_tokens_with_different_length(cohere_tokenizer):
-    cohere_tokenizer.client.tokenize.return_value.tokens = [
-        "token1",
-        "token2",
-    ]
-    tokens_count = cohere_tokenizer.count_tokens("valid string")
-    assert tokens_count == 2
diff --git a/tests/tokenizers/test_huggingfacetokenizer.py b/tests/tokenizers/test_huggingfacetokenizer.py
deleted file mode 100644
index 1eedb6e5..00000000
--- a/tests/tokenizers/test_huggingfacetokenizer.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# HuggingFaceTokenizer
-
-import os
-from unittest.mock import patch
-
-import pytest
-
-from swarms.tokenizers.r_tokenizers import HuggingFaceTokenizer
-
-
-# Test class setup
-@pytest.fixture
-def hftokenizer():
-    dir_path = os.path.join(os.getcwd(), "modeldir")
-    tokenizer = HuggingFaceTokenizer(dir_path)
-    return tokenizer
-
-
-# testing __init__
-@patch("os.path")
-@patch("swarms.tokenizers.get_logger")
-def test___init__(mock_get_logger, mock_path, hftokenizer):
-    mock_path.exists.return_value = False
-    mock_path.join.return_value = "dummy_path"
-    mock_get_logger.return_value = "dummy_logger"
-    assert hftokenizer.model_dir == "dummy_path"
-    assert hftokenizer.logger == "dummy_logger"
-    assert hftokenizer._maybe_decode_bytes is False
-    assert hftokenizer._prefix_space_tokens is None
-
-
-# testing vocab_size property
-def test_vocab_size(hftokenizer):
-    assert hftokenizer.vocab_size == 30522
-
-
-# testing bos_token_id property
-def test_bos_token_id(hftokenizer):
-    assert hftokenizer.bos_token_id == 101
-
-
-# testing eos_token_id property
-def test_eos_token_id(hftokenizer):
-    assert hftokenizer.eos_token_id == 102
-
-
-# testing prefix_space_tokens property
-def test_prefix_space_tokens(hftokenizer):
-    assert len(hftokenizer.prefix_space_tokens) > 0
-
-
-# testing _maybe_add_prefix_space method
-def test__maybe_add_prefix_space(hftokenizer):
-    assert (
-        hftokenizer._maybe_add_prefix_space(
-            [101, 2003, 2010, 2050, 2001, 2339], " is why"
-        )
-        == " is why"
-    )
-    assert (
-        hftokenizer._maybe_add_prefix_space(
-            [2003, 2010, 2050, 2001, 2339], "is why"
-        )
-        == " is why"
-    )
-
-
-# continuing tests for other methods...
diff --git a/tests/tokenizers/test_openaitokenizer.py b/tests/tokenizers/test_openaitokenizer.py
deleted file mode 100644
index 3c24748d..00000000
--- a/tests/tokenizers/test_openaitokenizer.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# OpenAITokenizer
-
-import pytest
-
-import swarms.tokenizers.openai_tokenizers as tokenizers
-
-
-@pytest.fixture()
-def openai_tokenizer():
-    return tokenizers.OpenAITokenizer("gpt-3")
-
-
-def test_init(openai_tokenizer):
-    assert openai_tokenizer.model == "gpt-3"
-
-
-def test_default_max_tokens(openai_tokenizer):
-    assert openai_tokenizer.default_max_tokens() == 4096
-
-
-@pytest.mark.parametrize(
-    "text, expected_output", [("Hello, world!", 3), (["Hello"], 4)]
-)
-def test_count_tokens_single(openai_tokenizer, text, expected_output):
-    assert (
-        openai_tokenizer.count_tokens(text, "gpt-3")
-        == expected_output
-    )
-
-
-@pytest.mark.parametrize(
-    "texts, expected_output",
-    [(["Hello, world!", "This is a test"], 6), (["Hello"], 4)],
-)
-def test_count_tokens_multiple(
-    openai_tokenizer, texts, expected_output
-):
-    assert (
-        openai_tokenizer.count_tokens(texts, "gpt-3")
-        == expected_output
-    )
-
-
-@pytest.mark.parametrize(
-    "text, expected_output", [("Hello, world!", 3), (["Hello"], 4)]
-)
-def test_len(openai_tokenizer, text, expected_output):
-    assert openai_tokenizer.len(text, "gpt-3") == expected_output
diff --git a/tests/tokenizers/test_sentencepiecetokenizer.py b/tests/tokenizers/test_sentencepiecetokenizer.py
deleted file mode 100644
index e3a0b917..00000000
--- a/tests/tokenizers/test_sentencepiecetokenizer.py
+++ /dev/null
@@ -1 +0,0 @@
-# SentencePieceTokenizer
diff --git a/tests/tokenizers/test_tokenizer.py b/tests/tokenizers/test_tokenizer.py
deleted file mode 100644
index b868f0a1..00000000
--- a/tests/tokenizers/test_tokenizer.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Tokenizer
-
-from unittest.mock import patch
-
-from swarms.tokenizers.r_tokenizers import Tokenizer
-
-
-def test_initializer_existing_model_file():
-    with patch("os.path.exists", return_value=True):
-        with patch(
-            "swarms.tokenizers.SentencePieceTokenizer"
-        ) as mock_model:
-            tokenizer = Tokenizer("tokenizers/my_model.model")
-            mock_model.assert_called_with("tokenizers/my_model.model")
-            assert tokenizer.model == mock_model.return_value
-
-
-def test_initializer_model_folder():
-    with patch("os.path.exists", side_effect=[False, True]):
-        with patch(
-            "swarms.tokenizers.HuggingFaceTokenizer"
-        ) as mock_model:
-            tokenizer = Tokenizer("my_model_directory")
-            mock_model.assert_called_with("my_model_directory")
-            assert tokenizer.model == mock_model.return_value
-
-
-def test_vocab_size():
-    with patch(
-        "swarms.tokenizers.SentencePieceTokenizer"
-    ) as mock_model:
-        tokenizer = Tokenizer("tokenizers/my_model.model")
-        assert (
-            tokenizer.vocab_size == mock_model.return_value.vocab_size
-        )
-
-
-def test_bos_token_id():
-    with patch(
-        "swarms.tokenizers.SentencePieceTokenizer"
-    ) as mock_model:
-        tokenizer = Tokenizer("tokenizers/my_model.model")
-        assert (
-            tokenizer.bos_token_id
-            == mock_model.return_value.bos_token_id
-        )
-
-
-def test_encode():
-    with patch(
-        "swarms.tokenizers.SentencePieceTokenizer"
-    ) as mock_model:
-        tokenizer = Tokenizer("tokenizers/my_model.model")
-        assert (
-            tokenizer.encode("hello")
-            == mock_model.return_value.encode.return_value
-        )
-
-
-def test_decode():
-    with patch(
-        "swarms.tokenizers.SentencePieceTokenizer"
-    ) as mock_model:
-        tokenizer = Tokenizer("tokenizers/my_model.model")
-        assert (
-            tokenizer.decode([1, 2, 3])
-            == mock_model.return_value.decode.return_value
-        )
-
-
-def test_call():
-    with patch(
-        "swarms.tokenizers.SentencePieceTokenizer"
-    ) as mock_model:
-        tokenizer = Tokenizer("tokenizers/my_model.model")
-        assert (
-            tokenizer("hello")
-            == mock_model.return_value.__call__.return_value
-        )
-
-
-# More tests can be added here