diff --git a/docs/swarms/models/huggingface.md b/docs/swarms/models/huggingface.md
new file mode 100644
index 00000000..841d7a39
--- /dev/null
+++ b/docs/swarms/models/huggingface.md
@@ -0,0 +1,153 @@
+## `HuggingfaceLLM` Documentation
+
+### Introduction
+
+The `HuggingfaceLLM` class is designed for running inference with models from the Hugging Face Transformers library. This documentation provides an in-depth understanding of the class, its purpose, attributes, methods, and usage examples.
+
+#### Purpose
+
+The `HuggingfaceLLM` class serves the following purposes:
+
+1. Load pre-trained Hugging Face models and tokenizers.
+2. Generate text-based responses from the loaded model using a given prompt.
+3. Provide flexibility in device selection, quantization, and other configuration options.
+
+### Class Definition
+
+The `HuggingfaceLLM` class is defined as follows:
+
+```python
+class HuggingfaceLLM:
+    def __init__(
+        self,
+        model_id: str,
+        device: str = None,
+        max_length: int = 500,
+        quantize: bool = False,
+        quantization_config: dict = None,
+        verbose=False,
+        distributed=False,
+        decoding=False,
+    ):
+        # Attributes and initialization logic explained below
+        pass
+
+    def load_model(self):
+        # Method to load the pre-trained model and tokenizer
+        pass
+
+    def run(self, prompt_text: str):
+        # Method to generate text-based responses
+        pass
+
+    def __call__(self, prompt_text: str):
+        # Alternate method for generating text-based responses
+        pass
+```
+
+### Attributes
+
+| Attribute             | Description                                                                |
+|-----------------------|----------------------------------------------------------------------------|
+| `model_id`            | The ID of the pre-trained model to be used.                                 |
+| `device`              | The device on which the model runs (`'cuda'` for GPU or `'cpu'` for CPU).   |
+| `max_length`          | The maximum length of the generated text.                                   |
+| `quantize`            | A boolean indicating whether quantization should be used.                   |
+| `quantization_config` | A dictionary with configuration options for quantization.                   |
+| `verbose`             | A boolean indicating whether verbose logs should be printed.                |
+| `logger`              | An optional logger for logging messages (defaults to a basic logger).       |
+| `distributed`         | A boolean indicating whether distributed processing should be used.         |
+| `decoding`            | A boolean indicating whether to perform decoding during text generation.    |
+
+### Class Methods
+
+#### `__init__` Method
+
+The `__init__` method initializes an instance of the `HuggingfaceLLM` class with the specified parameters. The pre-trained model and tokenizer themselves are loaded lazily by `load_model` the first time a prompt is processed.
+
+- `model_id` (str): The ID of the pre-trained model to use.
+- `device` (str, optional): The device to run the model on ('cuda' or 'cpu').
+- `max_length` (int, optional): The maximum length of the generated text.
+- `quantize` (bool, optional): Whether to use quantization.
+- `quantization_config` (dict, optional): Configuration for quantization.
+- `verbose` (bool, optional): Whether to print verbose logs.
+- `logger` (logging.Logger, optional): The logger to use.
+- `distributed` (bool, optional): Whether to use distributed processing.
+- `decoding` (bool, optional): Whether to perform decoding during text generation.
+
+#### `load_model` Method
+
+The `load_model` method loads the pre-trained model and tokenizer specified by `model_id`.
+
+#### `run` and `__call__` Methods
+
+Both the `run` and `__call__` methods generate a text-based response from a given prompt.
+They accept a single parameter:
+
+- `prompt_text` (str): The text prompt to initiate text generation.
+
+The maximum length of the generated output is governed by the instance's `max_length` attribute set at initialization.
+
+### Usage Examples
+
+Here are three ways to use the `HuggingfaceLLM` class:
+
+#### Example 1: Basic Usage
+
+```python
+from swarms.models.huggingface import HuggingfaceLLM
+
+# Initialize the HuggingfaceLLM instance with a model ID
+model_id = "gpt2"
+inference = HuggingfaceLLM(model_id=model_id)
+
+# Generate text based on a prompt
+prompt_text = "Once upon a time"
+generated_text = inference(prompt_text)
+print(generated_text)
+```
+
+#### Example 2: Custom Configuration
+
+```python
+from swarms.models.huggingface import HuggingfaceLLM
+
+# Initialize with custom configuration
+custom_config = {
+    "quantize": True,
+    "quantization_config": {"load_in_4bit": True},
+    "verbose": True
+}
+inference = HuggingfaceLLM(model_id="gpt2", **custom_config)
+
+# Generate text based on a prompt
+prompt_text = "Tell me a joke"
+generated_text = inference(prompt_text)
+print(generated_text)
+```
+
+#### Example 3: Distributed Processing
+
+```python
+from swarms.models.huggingface import HuggingfaceLLM
+
+# Initialize for distributed processing
+inference = HuggingfaceLLM(model_id="gpt2-medium", distributed=True)
+
+# Generate text based on a prompt
+prompt_text = "Translate the following sentence to French"
+generated_text = inference(prompt_text)
+print(generated_text)
+```
+
+### Additional Information
+
+- The `HuggingfaceLLM` class provides the flexibility to load and use pre-trained models from the Hugging Face Transformers library.
+- Quantization can be enabled to reduce model size and inference time.
+- Distributed processing can be used for parallelized inference.
+- Verbose logging can help in debugging and understanding the text generation process.
+
+### References
+
+- [Hugging Face Transformers Documentation](https://huggingface.co/transformers/)
+- [PyTorch Documentation](https://pytorch.org/docs/stable/index.html)
+
+This documentation provides a comprehensive understanding of the `HuggingfaceLLM` class, its attributes, methods, and usage examples. Developers can use this class to perform text generation tasks efficiently using pre-trained models from the Hugging Face Transformers library.
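+
+#### Appendix: Explicit Device Selection
+
+Device selection is one of the configuration points listed under Purpose. The following is a minimal sketch of pinning inference to a specific device; it assumes the `device` argument accepts the plain `'cuda'`/`'cpu'` strings described in the attributes table above.
+
+```python
+import torch
+
+from swarms.models.huggingface import HuggingfaceLLM
+
+# Choose the device explicitly instead of relying on auto-detection
+device = "cuda" if torch.cuda.is_available() else "cpu"
+inference = HuggingfaceLLM(model_id="gpt2", device=device)
+
+prompt_text = "Summarize the plot of Hamlet in one sentence"
+print(inference(prompt_text))
+```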
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index 0b8083c9..d1b5f464 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -92,7 +92,7 @@ nav:
 - swarms.models:
   - Language:
     - Overview: "swarms/models/index.md"
-    - HuggingFaceLLM: "swarms/models/hf.md"
+    - HuggingFaceLLM: "swarms/models/huggingface.md"
     - Anthropic: "swarms/models/anthropic.md"
    - OpenAI: "swarms/models/openai.md"
     - Zephyr: "swarms/models/zephyr.md"
diff --git a/swarms/models/huggingface.py b/swarms/models/huggingface.py
index 97c87d5a..95ebca47 100644
--- a/swarms/models/huggingface.py
+++ b/swarms/models/huggingface.py
@@ -35,7 +35,7 @@ class HuggingfaceLLM:
         self,
         model_id: str,
         device: str = None,
-        max_length: int = 20,
+        max_length: int = 500,
         quantize: bool = False,
         quantization_config: dict = None,
         verbose=False,
@@ -83,6 +83,7 @@ class HuggingfaceLLM:
             raise
 
     def load_model(self):
+        """Load the model"""
         if not self.model or not self.tokenizer:
             try:
                 self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
@@ -103,7 +104,7 @@
             self.logger.error(f"Failed to load the model or the tokenizer: {error}")
             raise
 
-    def run(self, prompt_text: str, max_length: int = None):
+    def run(self, prompt_text: str):
         """
         Generate a response based on the prompt text.
 
@@ -116,7 +117,7 @@
         """
         self.load_model()
 
-        max_length = max_length if max_length else self.max_length
+        max_length = self.max_length
 
         try:
             inputs = self.tokenizer.encode(prompt_text, return_tensors="pt").to(
@@ -157,7 +158,7 @@
             self.logger.error(f"Failed to generate the text: {e}")
             raise
 
-    def __call__(self, prompt_text: str, max_length: int = None):
+    def __call__(self, prompt_text: str):
         """
         Generate a response based on the prompt text.
 
@@ -170,7 +171,7 @@
         """
         self.load_model()
 
-        max_length = max_length if max_length else self.max_length
+        max_length = self.max_length
 
         try:
             inputs = self.tokenizer.encode(prompt_text, return_tensors="pt").to(
@@ -210,4 +211,4 @@
             return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
         except Exception as e:
             self.logger.error(f"Failed to generate the text: {e}")
-            raise
\ No newline at end of file
+            raise
diff --git a/tests/models/huggingface.py b/tests/models/huggingface.py
new file mode 100644
index 00000000..46c7fa12
--- /dev/null
+++ b/tests/models/huggingface.py
@@ -0,0 +1,58 @@
+import pytest
+import torch
+from unittest.mock import Mock, patch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from swarms.models.huggingface import HuggingfaceLLM
+
+
+@pytest.fixture
+def huggingface_llm():
+    # Create an instance of HuggingfaceLLM for testing.
+    model_id = "gpt2"
+    return HuggingfaceLLM(model_id=model_id)
+
+
+def test_initialization(huggingface_llm):
+    # Test the initialization of the HuggingfaceLLM class.
+    assert huggingface_llm.model_id == "gpt2"
+    assert huggingface_llm.device in ["cpu", "cuda"]
+    assert huggingface_llm.max_length == 500
+    assert huggingface_llm.verbose is False
+    assert huggingface_llm.distributed is False
+    assert huggingface_llm.decoding is False
+    assert huggingface_llm.model is None
+    assert huggingface_llm.tokenizer is None
+
+
+def test_load_model(huggingface_llm):
+    # Test that loading populates the model and tokenizer.
+    huggingface_llm.load_model()
+    assert huggingface_llm.model is not None
+    assert huggingface_llm.tokenizer is not None
+
+
+def test_run(huggingface_llm):
+    # Test the run method of HuggingfaceLLM.
+    prompt_text = "Once upon a time"
+    generated_text = huggingface_llm.run(prompt_text)
+    assert isinstance(generated_text, str)
+    assert len(generated_text) > 0
+
+
+def test_call_method(huggingface_llm):
+    # Test the __call__ method of HuggingfaceLLM.
+    prompt_text = "Once upon a time"
+    generated_text = huggingface_llm(prompt_text)
+    assert isinstance(generated_text, str)
+    assert len(generated_text) > 0
+
+
+def test_load_model_failure():
+    # Test that a model loading failure is re-raised.
+    with patch(
+        "swarms.models.huggingface.AutoModelForCausalLM.from_pretrained",
+        side_effect=Exception("Model load failed"),
+    ):
+        with pytest.raises(Exception):
+            huggingface_llm = HuggingfaceLLM(model_id="gpt2")
+            huggingface_llm.load_model()
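+
+
+def test_run_reraises_generation_errors(huggingface_llm):
+    # A minimal sketch of a failure-path test. It assumes run() logs and then
+    # re-raises exceptions raised during generation, as the except blocks in
+    # swarms/models/huggingface.py indicate, and that load_model() skips
+    # downloading when both model and tokenizer are already set.
+    huggingface_llm.model = Mock()
+    huggingface_llm.tokenizer = Mock()
+    huggingface_llm.tokenizer.encode.side_effect = RuntimeError("tokenization failed")
+
+    with pytest.raises(RuntimeError):
+        huggingface_llm.run("Once upon a time")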