import logging
from unittest.mock import patch

import pytest
import torch

from swarms.models.huggingface import HuggingfaceLLM


# Fixture for the class instance
@pytest.fixture
def llm_instance():
    """Provide a HuggingfaceLLM built with only a ``model_id`` argument."""
    model_id = "NousResearch/Nous-Hermes-2-Vision-Alpha"
    return HuggingfaceLLM(model_id=model_id)


# Test for instantiation and attributes
def test_llm_initialization(llm_instance):
    """Check the model id and the expected default ``max_length`` of 500."""
    assert (
        llm_instance.model_id
        == "NousResearch/Nous-Hermes-2-Vision-Alpha"
    )
    assert llm_instance.max_length == 500
    # ... add more assertions for all default attributes


# Parameterized test for setting devices
@pytest.mark.parametrize("device", ["cpu", "cuda"])
def test_llm_set_device(llm_instance, device):
    """``set_device`` should store the requested device on the instance."""
    llm_instance.set_device(device)
    assert llm_instance.device == device


# Test exception during initialization with a bad model_id
def test_llm_bad_model_initialization():
    """Constructing with an unknown model id is expected to raise."""
    with pytest.raises(Exception):
        HuggingfaceLLM(model_id="unknown-model")


# # Mocking the tokenizer and model to test run method
# @patch("swarms.models.huggingface.AutoTokenizer.from_pretrained")
# @patch(
#     "swarms.models.huggingface.AutoModelForCausalLM.from_pretrained"
# )
# def test_llm_run(mock_model, mock_tokenizer, llm_instance):
#     mock_model.return_value.generate.return_value = "mocked output"
#     mock_tokenizer.return_value.encode.return_value = "mocked input"
#     result = llm_instance.run("test task")
#     assert result == "mocked output"


# Async test (requires pytest-asyncio plugin)
@pytest.mark.asyncio
async def test_llm_run_async(llm_instance):
    """``run_async`` should be awaitable and produce a string."""
    result = await llm_instance.run_async("test task")
    assert isinstance(result, str)


# Test for checking GPU availability
def test_llm_gpu_availability(llm_instance):
    """``gpu_available`` should agree with ``torch.cuda.is_available``."""
    # Assuming the test is running on a machine where the GPU availability
    # is known
    expected_result = torch.cuda.is_available()
    assert llm_instance.gpu_available() == expected_result


# Test for memory consumption reporting
def test_llm_memory_consumption(llm_instance):
    """``memory_consumption`` should report the patched CUDA counters."""
    # Mocking torch.cuda functions for consistent results
    with patch(
        "torch.cuda.memory_allocated", return_value=1024
    ), patch("torch.cuda.memory_reserved", return_value=2048):
        memory = llm_instance.memory_consumption()
    assert memory == {"allocated": 1024, "reserved": 2048}


# Test different initialization parameters
@pytest.mark.parametrize(
    "model_id, max_length",
    [
        ("NousResearch/Nous-Hermes-2-Vision-Alpha", 100),
        ("microsoft/Orca-2-13b", 200),
        (
            "berkeley-nest/Starling-LM-7B-alpha",
            None,
        ),  # None to check default behavior
    ],
)
def test_llm_initialization_params(model_id, max_length):
    """Check explicit ``max_length`` values and the default when omitted."""
    # Compare against None explicitly so the "use the default" case is
    # selected by the sentinel itself rather than by truthiness.
    if max_length is not None:
        instance = HuggingfaceLLM(
            model_id=model_id, max_length=max_length
        )
        assert instance.max_length == max_length
    else:
        instance = HuggingfaceLLM(model_id=model_id)
        assert (
            instance.max_length == 500
        )  # Assuming 500 is the default max_length


# Test for setting an invalid device
def test_llm_set_invalid_device(llm_instance):
    """An unrecognised device name is expected to raise ``ValueError``."""
    with pytest.raises(ValueError):
        llm_instance.set_device("quantum_processor")


# Mocking external API call to test run method without network
@patch("swarms.models.huggingface.HuggingfaceLLM.run")
def test_llm_run_without_network(mock_run, llm_instance):
    """With ``run`` patched, the call should return the mock's value."""
    mock_run.return_value = "mocked output"
    result = llm_instance.run("test task without network")
    assert result == "mocked output"


# Test handling of empty input for the run method
def test_llm_run_empty_input(llm_instance):
    """An empty task string is expected to raise ``ValueError``."""
    with pytest.raises(ValueError):
        llm_instance.run("")


# Test the generation with a provided seed for reproducibility
@patch("swarms.models.huggingface.HuggingfaceLLM.run")
def test_llm_run_with_seed(mock_run, llm_instance):
    """Set a seed, then call the patched ``run`` with that seed."""
    seed = 42
    llm_instance.set_seed(seed)
    # Assuming set_seed method affects the randomness in the model
    # You would typically ensure that setting the seed gives reproducible
    # results
    mock_run.return_value = "mocked deterministic output"
    result = llm_instance.run("test task", seed=seed)
    assert result == "mocked deterministic output"


# Test the output length is as expected
@patch("swarms.models.huggingface.HuggingfaceLLM.run")
def test_llm_run_output_length(mock_run, llm_instance):
    """Check the mocked output has no more words than ``max_length``."""
    input_text = "test task"
    llm_instance.max_length = 50  # set a max_length for the output
    mock_run.return_value = "mocked output" * 10  # some long text
    result = llm_instance.run(input_text)
    assert len(result.split()) <= llm_instance.max_length


# Test the tokenizer handling special tokens correctly
@patch("swarms.models.huggingface.HuggingfaceLLM._tokenizer.encode")
@patch("swarms.models.huggingface.HuggingfaceLLM._tokenizer.decode")
def test_llm_tokenizer_special_tokens(
    mock_decode, mock_encode, llm_instance
):
    """``run`` should encode once, decode once and return the decoded text."""
    # Decorators apply bottom-up, so the ``decode`` mock is the first
    # argument and the ``encode`` mock the second.
    mock_encode.return_value = "encoded input with special tokens"
    mock_decode.return_value = "decoded output with special tokens"
    result = llm_instance.run("test task with special tokens")
    mock_encode.assert_called_once()
    mock_decode.assert_called_once()
    assert "special tokens" in result


# Test for correct handling of timeouts
@patch("swarms.models.huggingface.HuggingfaceLLM.run")
def test_llm_timeout_handling(mock_run, llm_instance):
    """A ``TimeoutError`` raised by the patched ``run`` should propagate."""
    mock_run.side_effect = TimeoutError
    with pytest.raises(TimeoutError):
        llm_instance.run("test task with timeout")


# Test for response time within a threshold (performance test)
@patch("swarms.models.huggingface.HuggingfaceLLM.run")
def test_llm_response_time(mock_run, llm_instance):
    """The patched ``run`` call should finish in under one second."""
    import time

    mock_run.return_value = "mocked output"
    # perf_counter is monotonic, so the measured interval cannot be
    # skewed by wall-clock adjustments while the test runs.
    start_time = time.perf_counter()
    llm_instance.run("test task for response time")
    end_time = time.perf_counter()
    assert (
        end_time - start_time < 1
    )  # Assuming the response should be faster than 1 second


# Test the logging of a warning for long inputs
@patch("swarms.models.huggingface.logging.warning")
def test_llm_long_input_warning(mock_warning, llm_instance):
    """A very long input is expected to trigger exactly one warning."""
    long_input = "x" * 10000  # input longer than the typical limit
    llm_instance.run(long_input)
    mock_warning.assert_called_once()


# Test for run method behavior when model raises an exception
@patch(
    "swarms.models.huggingface.HuggingfaceLLM._model.generate",
    side_effect=RuntimeError,
)
def test_llm_run_model_exception(mock_generate, llm_instance):
    """A ``RuntimeError`` from ``generate`` should surface from ``run``."""
    with pytest.raises(RuntimeError):
        llm_instance.run("test task when model fails")


# Test the behavior when GPU is forced but not available
@patch("torch.cuda.is_available", return_value=False)
def test_llm_force_gpu_when_unavailable(mock_is_available, llm_instance):
    """Selecting CUDA while it is reported unavailable should raise."""
    with pytest.raises(EnvironmentError):
        llm_instance.set_device(
            "cuda"
        )  # Attempt to set CUDA when it's not available


# Test for proper cleanup after model use (releasing resources)
# The test takes a ``mock_tokenizer`` argument, so the tokenizer must be
# patched as well; without this decorator pytest looks for a fixture named
# ``mock_tokenizer`` and errors out. Decorators apply bottom-up:
# ``_model`` -> ``mock_model``, ``_tokenizer`` -> ``mock_tokenizer``.
# NOTE(review): assumes ``HuggingfaceLLM`` exposes ``_tokenizer`` next to
# ``_model`` (as the special-tokens test also does) -- confirm.
@patch("swarms.models.huggingface.HuggingfaceLLM._tokenizer")
@patch("swarms.models.huggingface.HuggingfaceLLM._model")
def test_llm_cleanup(mock_model, mock_tokenizer, llm_instance):
    """``cleanup`` is expected to call ``delete`` on model and tokenizer."""
    llm_instance.cleanup()
    # Assuming cleanup method is meant to free resources
    mock_model.delete.assert_called_once()
    mock_tokenizer.delete.assert_called_once()


# Test model's ability to handle multilingual input
@patch("swarms.models.huggingface.HuggingfaceLLM.run")
def test_llm_multilingual_input(mock_run, llm_instance):
    """The patched ``run`` should return a string for non-English input."""
    mock_run.return_value = "mocked multilingual output"
    multilingual_input = "Bonjour, ceci est un test multilingue."
    result = llm_instance.run(multilingual_input)
    assert isinstance(
        result, str
    )  # Simple check to ensure output is string type


# Test caching mechanism to prevent re-running the same inputs
@patch("swarms.models.huggingface.HuggingfaceLLM.run")
def test_llm_caching_mechanism(mock_run, llm_instance):
    """Repeated identical inputs should yield identical results."""
    input_text = "test caching mechanism"
    mock_run.return_value = "cached output"
    # Run the input twice
    first_run_result = llm_instance.run(input_text)
    second_run_result = llm_instance.run(input_text)
    # ``run`` itself is replaced by the mock, so any caching inside the
    # real method is bypassed and both calls reach the mock. Asserting a
    # single call here could never pass.
    # NOTE(review): to verify real caching, patch the layer beneath
    # ``run`` (e.g. the model's generate call) instead of ``run``.
    assert mock_run.call_count == 2
    assert first_run_result == second_run_result


# These tests are provided as examples. In real-world scenarios, you will need to adapt these tests to the actual logic of your `HuggingfaceLLM` class.
# For instance, "mock_model.delete.assert_called_once()" and similar lines are based on hypothetical methods and behaviors that you need to replace with actual implementations.


# Mock some functions and objects for testing
@pytest.fixture
def mock_huggingface_llm(monkeypatch):
    """Replace ``__init__``, ``load_model`` and ``run`` with no-op stubs.

    The stubs are installed on the ``HuggingfaceLLM`` class through
    ``monkeypatch``, so pytest restores the originals after each test.
    The fixture itself yields no value.
    """

    # Mock the model and tokenizer creation
    def mock_init(
        self,
        model_id,
        device="cpu",
        max_length=500,
        quantize=False,
        quantization_config=None,
        verbose=False,
        distributed=False,
        decoding=False,
        max_workers=5,
        repitition_penalty=1.3,
        no_repeat_ngram_size=5,
        temperature=0.7,
        top_k=40,
        top_p=0.8,
    ):
        # Accepts the constructor arguments but stores none of them.
        pass

    # Mock the model loading
    def mock_load_model(self):
        pass

    # Mock the model generation
    def mock_run(self, task):
        pass

    monkeypatch.setattr(HuggingfaceLLM, "__init__", mock_init)
    monkeypatch.setattr(HuggingfaceLLM, "load_model", mock_load_model)
    monkeypatch.setattr(HuggingfaceLLM, "run", mock_run)


# Basic tests for initialization and attribute settings
def test_init_huggingface_llm():
    """Every constructor argument should be readable back as an attribute."""
    # NOTE(review): this test does not request ``mock_huggingface_llm``,
    # so it calls the real constructor -- confirm that is intended.
    llm = HuggingfaceLLM(
        model_id="test_model",
        device="cuda",
        max_length=1000,
        quantize=True,
        quantization_config={"config_key": "config_value"},
        verbose=True,
        distributed=True,
        decoding=True,
        max_workers=3,
        repitition_penalty=1.5,
        no_repeat_ngram_size=4,
        temperature=0.8,
        top_k=50,
        top_p=0.7,
    )

    assert llm.model_id == "test_model"
    assert llm.device == "cuda"
    assert llm.max_length == 1000
    assert llm.quantize is True
    assert llm.quantization_config == {"config_key": "config_value"}
    assert llm.verbose is True
    assert llm.distributed is True
    assert llm.decoding is True
    assert llm.max_workers == 3
    assert llm.repitition_penalty == 1.5
    assert llm.no_repeat_ngram_size == 4
    assert llm.temperature == 0.8
    assert llm.top_k == 50
    assert llm.top_p == 0.7


# Test loading the model
def test_load_model(mock_huggingface_llm):
    """Smoke test: the stubbed ``load_model`` can be called without error."""
    llm = HuggingfaceLLM(model_id="test_model")
    llm.load_model()


# Test running the model
def test_run(mock_huggingface_llm):
    """Smoke test: the stubbed ``run`` accepts a prompt without error."""
    llm = HuggingfaceLLM(model_id="test_model")
    llm.run("Test prompt")


# Test for setting max_length
def test_llm_set_max_length(llm_instance):
    """``set_max_length`` should update ``max_length``."""
    new_max_length = 1000
    llm_instance.set_max_length(new_max_length)
    assert llm_instance.max_length == new_max_length


# Test for setting verbose
def test_llm_set_verbose(llm_instance):
    """``set_verbose(True)`` should leave ``verbose`` set to ``True``."""
    llm_instance.set_verbose(True)
    assert llm_instance.verbose is True


# Test for setting distributed
def test_llm_set_distributed(llm_instance):
    """``set_distributed(True)`` should leave ``distributed`` ``True``."""
    llm_instance.set_distributed(True)
    assert llm_instance.distributed is True


# Test for setting decoding
def test_llm_set_decoding(llm_instance):
    """``set_decoding(True)`` should leave ``decoding`` set to ``True``."""
    llm_instance.set_decoding(True)
    assert llm_instance.decoding is True


# Test for setting max_workers
def test_llm_set_max_workers(llm_instance):
    """``set_max_workers`` should update ``max_workers``."""
    new_max_workers = 10
    llm_instance.set_max_workers(new_max_workers)
    assert llm_instance.max_workers == new_max_workers


# Test for setting repitition_penalty
def test_llm_set_repitition_penalty(llm_instance):
    """``set_repitition_penalty`` should update ``repitition_penalty``."""
    new_repitition_penalty = 1.5
    llm_instance.set_repitition_penalty(new_repitition_penalty)
    assert llm_instance.repitition_penalty == new_repitition_penalty


# Test for setting no_repeat_ngram_size
def test_llm_set_no_repeat_ngram_size(llm_instance):
    """``set_no_repeat_ngram_size`` should update the stored size."""
    new_no_repeat_ngram_size = 6
    llm_instance.set_no_repeat_ngram_size(new_no_repeat_ngram_size)
    assert (
        llm_instance.no_repeat_ngram_size == new_no_repeat_ngram_size
    )


# Test for setting temperature
def test_llm_set_temperature(llm_instance):
    """``set_temperature`` should update ``temperature``."""
    new_temperature = 0.8
    llm_instance.set_temperature(new_temperature)
    assert llm_instance.temperature == new_temperature


# Test for setting top_k
def test_llm_set_top_k(llm_instance):
    """``set_top_k`` should update ``top_k``."""
    new_top_k = 50
    llm_instance.set_top_k(new_top_k)
    assert llm_instance.top_k == new_top_k


# Test for setting top_p
def test_llm_set_top_p(llm_instance):
    """``set_top_p`` should update ``top_p``."""
    new_top_p = 0.9
    llm_instance.set_top_p(new_top_p)
    assert llm_instance.top_p == new_top_p


# Test for setting quantize
def test_llm_set_quantize(llm_instance):
    """``set_quantize(True)`` should leave ``quantize`` set to ``True``."""
    llm_instance.set_quantize(True)
    assert llm_instance.quantize is True


# Test for setting quantization_config
def test_llm_set_quantization_config(llm_instance):
    """``set_quantization_config`` should store the given mapping."""
    new_quantization_config = {
        "load_in_4bit": False,
        "bnb_4bit_use_double_quant": False,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    }
    llm_instance.set_quantization_config(new_quantization_config)
    assert llm_instance.quantization_config == new_quantization_config


# Test for setting model_id
def test_llm_set_model_id(llm_instance):
    """``set_model_id`` should update ``model_id``."""
    new_model_id = "EleutherAI/gpt-neo-2.7B"
    llm_instance.set_model_id(new_model_id)
    assert llm_instance.model_id == new_model_id


# Test for setting model
@patch("swarms.models.huggingface.AutoModelForCausalLM.from_pretrained")
def test_llm_set_model(mock_model, llm_instance):
    """After ``set_model``, ``model`` should be the patched loader's value."""
    mock_model.return_value = "mocked model"
    # NOTE(review): the patched loader itself is passed in, so the
    # assertion only holds if ``set_model`` calls it -- confirm.
    llm_instance.set_model(mock_model)
    assert llm_instance.model == "mocked model"


# Test for setting tokenizer
@patch("swarms.models.huggingface.AutoTokenizer.from_pretrained")
def test_llm_set_tokenizer(mock_tokenizer, llm_instance):
    """After ``set_tokenizer``, ``tokenizer`` should be the loader's value."""
    mock_tokenizer.return_value = "mocked tokenizer"
    # NOTE(review): the patched loader itself is passed in, so the
    # assertion only holds if ``set_tokenizer`` calls it -- confirm.
    llm_instance.set_tokenizer(mock_tokenizer)
    assert llm_instance.tokenizer == "mocked tokenizer"


# Test for setting logger
def test_llm_set_logger(llm_instance):
    """``set_logger`` should store the supplied logger."""
    new_logger = logging.getLogger("test_logger")
    llm_instance.set_logger(new_logger)
    assert llm_instance.logger == new_logger


# Test for saving model
@patch("torch.save")
def test_llm_save_model(mock_save, llm_instance):
    """``save_model`` is expected to call ``torch.save`` exactly once."""
    llm_instance.save_model("path/to/save")
    mock_save.assert_called_once()


# Test for print_dashboard
@patch("builtins.print")
def test_llm_print_dashboard(mock_print, llm_instance):
    """``print_dashboard`` is expected to call ``print`` at least once."""
    llm_instance.print_dashboard("test task")
    mock_print.assert_called()


# Test for __call__ method
@patch("swarms.models.huggingface.HuggingfaceLLM.run")
def test_llm_call(mock_run, llm_instance):
    """Calling the instance should return what the patched ``run`` returns."""
    mock_run.return_value = "mocked output"
    result = llm_instance("test task")
    assert result == "mocked output"