diff --git a/docs/swarms/models/huggingface.md b/docs/swarms/models/huggingface.md
new file mode 100644
index 00000000..841d7a39
--- /dev/null
+++ b/docs/swarms/models/huggingface.md
@@ -0,0 +1,153 @@
+## `HuggingfaceLLM` Documentation
+
+### Introduction
+
+The `HuggingfaceLLM` class is designed for running inference with models from the Hugging Face Transformers library. This documentation provides an in-depth understanding of the class, its purpose, attributes, methods, and usage examples.
+
+#### Purpose
+
+The `HuggingfaceLLM` class serves the following purposes:
+
+1. Load pre-trained Hugging Face models and tokenizers.
+2. Generate text-based responses from the loaded model using a given prompt.
+3. Provide flexibility in device selection, quantization, and other configuration options.
+
+### Class Definition
+
+The `HuggingfaceLLM` class is defined as follows:
+
+```python
+class HuggingfaceLLM:
+    def __init__(
+        self,
+        model_id: str,
+        device: str = None,
+        max_length: int = 500,
+        quantize: bool = False,
+        quantization_config: dict = None,
+        verbose=False,
+        distributed=False,
+        decoding=False,
+    ):
+        # Attributes and initialization logic explained below
+        pass
+
+    def load_model(self):
+        # Method to load the pre-trained model and tokenizer
+        pass
+
+    def run(self, prompt_text: str):
+        # Method to generate text-based responses
+        pass
+
+    def __call__(self, prompt_text: str):
+        # Alternate method for generating text-based responses
+        pass
+```
+
+### Attributes
+
+| Attribute             | Description                                                                |
+|-----------------------|----------------------------------------------------------------------------|
+| `model_id`            | The ID of the pre-trained model to be used.                                 |
+| `device`              | The device on which the model runs (`'cuda'` for GPU or `'cpu'` for CPU).   |
+| `max_length`          | The maximum length of the generated text.                                   |
+| `quantize`            | A boolean indicating whether quantization should be used.                   |
+| `quantization_config` | A dictionary with configuration options for quantization.                   |
+| `verbose`             | A boolean indicating whether verbose logs should be printed.                |
+| `logger`              | An optional logger for logging messages (defaults to a basic logger).       |
+| `distributed`         | A boolean indicating whether distributed processing should be used.         |
+| `decoding`            | A boolean indicating whether to perform decoding during text generation.    |
+
+### Class Methods
+
+#### `__init__` Method
+
+The `__init__` method initializes an instance of the `HuggingfaceLLM` class with the specified parameters. The pre-trained model and tokenizer themselves are loaded lazily by `load_model` the first time a prompt is processed.
+
+- `model_id` (str): The ID of the pre-trained model to use.
+- `device` (str, optional): The device to run the model on ('cuda' or 'cpu').
+- `max_length` (int, optional): The maximum length of the generated text.
+- `quantize` (bool, optional): Whether to use quantization.
+- `quantization_config` (dict, optional): Configuration for quantization.
+- `verbose` (bool, optional): Whether to print verbose logs.
+- `logger` (logging.Logger, optional): The logger to use.
+- `distributed` (bool, optional): Whether to use distributed processing.
+- `decoding` (bool, optional): Whether to perform decoding during text generation.
+
+#### `load_model` Method
+
+The `load_model` method loads the pre-trained model and tokenizer specified by `model_id`.
+
+#### `run` and `__call__` Methods
+
+Both the `run` and `__call__` methods generate a text-based response from a given prompt.
+They accept a single parameter:
+
+- `prompt_text` (str): The text prompt to initiate text generation.
+
+The maximum length of the generated output is governed by the instance's `max_length` attribute set at initialization.
+
+### Usage Examples
+
+Here are three ways to use the `HuggingfaceLLM` class:
+
+#### Example 1: Basic Usage
+
+```python
+from swarms.models.huggingface import HuggingfaceLLM
+
+# Initialize the HuggingfaceLLM instance with a model ID
+model_id = "gpt2"
+inference = HuggingfaceLLM(model_id=model_id)
+
+# Generate text based on a prompt
+prompt_text = "Once upon a time"
+generated_text = inference(prompt_text)
+print(generated_text)
+```
+
+#### Example 2: Custom Configuration
+
+```python
+from swarms.models.huggingface import HuggingfaceLLM
+
+# Initialize with custom configuration
+custom_config = {
+    "quantize": True,
+    "quantization_config": {"load_in_4bit": True},
+    "verbose": True
+}
+inference = HuggingfaceLLM(model_id="gpt2", **custom_config)
+
+# Generate text based on a prompt
+prompt_text = "Tell me a joke"
+generated_text = inference(prompt_text)
+print(generated_text)
+```
+
+#### Example 3: Distributed Processing
+
+```python
+from swarms.models.huggingface import HuggingfaceLLM
+
+# Initialize for distributed processing
+inference = HuggingfaceLLM(model_id="gpt2-medium", distributed=True)
+
+# Generate text based on a prompt
+prompt_text = "Translate the following sentence to French"
+generated_text = inference(prompt_text)
+print(generated_text)
+```
+
+### Additional Information
+
+- The `HuggingfaceLLM` class provides the flexibility to load and use pre-trained models from the Hugging Face Transformers library.
+- Quantization can be enabled to reduce model size and inference time.
+- Distributed processing can be used for parallelized inference.
+- Verbose logging can help in debugging and understanding the text generation process.
+
+### References
+
+- [Hugging Face Transformers Documentation](https://huggingface.co/transformers/)
+- [PyTorch Documentation](https://pytorch.org/docs/stable/index.html)
+
+This documentation provides a comprehensive understanding of the `HuggingfaceLLM` class, its attributes, methods, and usage examples. Developers can use this class to perform text generation tasks efficiently using pre-trained models from the Hugging Face Transformers library.
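+
+#### Appendix: Explicit Device Selection
+
+Device selection is one of the configuration points listed under Purpose. The following is a minimal sketch of pinning inference to a specific device; it assumes the `device` argument accepts the plain `'cuda'`/`'cpu'` strings described in the attributes table above.
+
+```python
+import torch
+
+from swarms.models.huggingface import HuggingfaceLLM
+
+# Choose the device explicitly instead of relying on auto-detection
+device = "cuda" if torch.cuda.is_available() else "cpu"
+inference = HuggingfaceLLM(model_id="gpt2", device=device)
+
+prompt_text = "Summarize the plot of Hamlet in one sentence"
+print(inference(prompt_text))
+```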
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index 0b8083c9..d1b5f464 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -92,7 +92,7 @@ nav:
 - swarms.models:
   - Language:
     - Overview: "swarms/models/index.md"
-    - HuggingFaceLLM: "swarms/models/hf.md"
+    - HuggingFaceLLM: "swarms/models/huggingface.md"
     - Anthropic: "swarms/models/anthropic.md"
    - OpenAI: "swarms/models/openai.md"
     - Zephyr: "swarms/models/zephyr.md"
diff --git a/swarms/models/huggingface.py b/swarms/models/huggingface.py
index 97c87d5a..95ebca47 100644
--- a/swarms/models/huggingface.py
+++ b/swarms/models/huggingface.py
@@ -35,7 +35,7 @@ class HuggingfaceLLM:
         self,
         model_id: str,
         device: str = None,
-        max_length: int = 20,
+        max_length: int = 500,
         quantize: bool = False,
         quantization_config: dict = None,
         verbose=False,
@@ -83,6 +83,7 @@ class HuggingfaceLLM:
             raise
 
     def load_model(self):
+        """Load the model"""
         if not self.model or not self.tokenizer:
             try:
                 self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
@@ -103,7 +104,7 @@
             self.logger.error(f"Failed to load the model or the tokenizer: {error}")
             raise
 
-    def run(self, prompt_text: str, max_length: int = None):
+    def run(self, prompt_text: str):
         """
         Generate a response based on the prompt text.
 
@@ -116,7 +117,7 @@
         """
         self.load_model()
 
-        max_length = max_length if max_length else self.max_length
+        max_length = self.max_length
 
         try:
             inputs = self.tokenizer.encode(prompt_text, return_tensors="pt").to(
@@ -157,7 +158,7 @@
             self.logger.error(f"Failed to generate the text: {e}")
             raise
 
-    def __call__(self, prompt_text: str, max_length: int = None):
+    def __call__(self, prompt_text: str):
         """
         Generate a response based on the prompt text.
 
@@ -170,7 +171,7 @@
         """
         self.load_model()
 
-        max_length = max_length if max_length else self.max_length
+        max_length = self.max_length
 
         try:
             inputs = self.tokenizer.encode(prompt_text, return_tensors="pt").to(
@@ -210,4 +211,4 @@
             return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
         except Exception as e:
             self.logger.error(f"Failed to generate the text: {e}")
-            raise
\ No newline at end of file
+            raise
diff --git a/tests/models/huggingface.py b/tests/models/huggingface.py
new file mode 100644
index 00000000..46c7fa12
--- /dev/null
+++ b/tests/models/huggingface.py
@@ -0,0 +1,58 @@
+import pytest
+import torch
+from unittest.mock import Mock, patch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from swarms.models.huggingface import HuggingfaceLLM
+
+
+@pytest.fixture
+def huggingface_llm():
+    # Create an instance of HuggingfaceLLM for testing.
+    model_id = "gpt2"
+    return HuggingfaceLLM(model_id=model_id)
+
+
+def test_initialization(huggingface_llm):
+    # Test the initialization of the HuggingfaceLLM class.
+    assert huggingface_llm.model_id == "gpt2"
+    assert huggingface_llm.device in ["cpu", "cuda"]
+    assert huggingface_llm.max_length == 500
+    assert huggingface_llm.verbose is False
+    assert huggingface_llm.distributed is False
+    assert huggingface_llm.decoding is False
+    assert huggingface_llm.model is None
+    assert huggingface_llm.tokenizer is None
+
+
+def test_load_model(huggingface_llm):
+    # Test that loading populates the model and tokenizer.
+    huggingface_llm.load_model()
+    assert huggingface_llm.model is not None
+    assert huggingface_llm.tokenizer is not None
+
+
+def test_run(huggingface_llm):
+    # Test the run method of HuggingfaceLLM.
+    prompt_text = "Once upon a time"
+    generated_text = huggingface_llm.run(prompt_text)
+    assert isinstance(generated_text, str)
+    assert len(generated_text) > 0
+
+
+def test_call_method(huggingface_llm):
+    # Test the __call__ method of HuggingfaceLLM.
+    prompt_text = "Once upon a time"
+    generated_text = huggingface_llm(prompt_text)
+    assert isinstance(generated_text, str)
+    assert len(generated_text) > 0
+
+
+def test_load_model_failure():
+    # Test that a model loading failure is re-raised.
+    with patch(
+        "swarms.models.huggingface.AutoModelForCausalLM.from_pretrained",
+        side_effect=Exception("Model load failed"),
+    ):
+        with pytest.raises(Exception):
+            huggingface_llm = HuggingfaceLLM(model_id="gpt2")
+            huggingface_llm.load_model()
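+
+
+def test_run_reraises_generation_errors(huggingface_llm):
+    # A minimal sketch of a failure-path test. It assumes run() logs and then
+    # re-raises exceptions raised during generation, as the except blocks in
+    # swarms/models/huggingface.py indicate, and that load_model() skips
+    # downloading when both model and tokenizer are already set.
+    huggingface_llm.model = Mock()
+    huggingface_llm.tokenizer = Mock()
+    huggingface_llm.tokenizer.encode.side_effect = RuntimeError("tokenization failed")
+
+    with pytest.raises(RuntimeError):
+        huggingface_llm.run("Once upon a time")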