[REFACTOR][HuggingfaceLLM]

pull/343/head
Kye 1 year ago
parent 41b858a91d
commit e8ca14f071

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "swarms"
-version = "3.2.7"
+version = "3.2.8"
 description = "Swarms - Pytorch"
 license = "MIT"
 authors = ["Kye Gomez <kye@apac.ai>"]

@@ -8,11 +8,11 @@ device_check = "cuda" if torch.cuda.is_available() else "cpu"
 class CogAgent(BaseMultiModalModel):
     """CogAgent
     Multi-modal conversational agent that can be used to chat with
     images and text. It is based on the CogAgent model from the
     ModelScope library.

     Attributes:
         model_name (str): The name of the model to be used
         tokenizer_name (str): The name of the tokenizer to be used
@@ -21,13 +21,14 @@ class CogAgent(BaseMultiModalModel):
         load_in_4bit (bool): Whether to load in 4-bit
         trust_remote_code (bool): Whether to trust remote code
         device (str): The device to be used

     Examples:
         >>> from swarms.models.cog_agent import CogAgent
         >>> cog_agent = CogAgent()
         >>> cog_agent.run("How are you?", "images/1.jpg")
         <s> I'm fine. How are you? </s>
     """

     def __init__(
         self,
         model_name: str = "ZhipuAI/cogagent-chat",
@@ -73,8 +74,8 @@ class CogAgent(BaseMultiModalModel):
         Args:
             task (str): The task to be performed
             img (str): The image path
         """
         image = Image.open(img).convert("RGB")
         input_by_model = self.model.build_conversation_input_ids(

@@ -3,18 +3,18 @@ import concurrent.futures
 import logging
 from typing import List, Tuple

 import torch
 from termcolor import colored
-from torch.nn.parallel import DistributedDataParallel as DDP
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     BitsAndBytesConfig,
 )

+from swarms.models.base_llm import AbstractLLM

-class HuggingfaceLLM:
+class HuggingfaceLLM(AbstractLLM):
     """
     A class for running inference on a given model.
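For context, HuggingfaceLLM now subclasses AbstractLLM from swarms.models.base_llm. That module is not part of this diff, so the following is only a rough sketch of the kind of base interface the refactor appears to rely on (a constructor that accepts *args/**kwargs plus run/__call__ hooks); the real class may differ.

# Hypothetical sketch only -- the actual AbstractLLM in swarms.models.base_llm may differ.
from abc import ABC, abstractmethod


class AbstractLLM(ABC):
    def __init__(self, *args, **kwargs):
        # Shared bookkeeping (settings, history, callbacks, ...) would live here,
        # which is why HuggingfaceLLM.__init__ now calls super().__init__().
        pass

    @abstractmethod
    def run(self, task: str, *args, **kwargs) -> str:
        """Generate text for a single prompt."""
        ...

    def __call__(self, task: str, *args, **kwargs) -> str:
        # Convenience wrapper so instances can be called directly.
        return self.run(task, *args, **kwargs)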
@@ -123,7 +123,6 @@ class HuggingfaceLLM:
         quantize: bool = False,
         quantization_config: dict = None,
         verbose=False,
-        # logger=None,
         distributed=False,
         decoding=False,
         max_workers: int = 5,
@@ -135,6 +134,7 @@ class HuggingfaceLLM:
         *args,
         **kwargs,
     ):
+        super().__init__(*args, **kwargs)
         self.logger = logging.getLogger(__name__)
         self.device = (
             device
@@ -174,16 +174,21 @@ class HuggingfaceLLM:
         try:
             self.tokenizer = AutoTokenizer.from_pretrained(
-                self.model_id, *args, **kwargs
+                self.model_id
             )
-            self.model = AutoModelForCausalLM.from_pretrained(
-                self.model_id,
-                quantization_config=bnb_config,
-                *args,
-                **kwargs,
-            )
-            self.model  # .to(self.device)
+
+            if quantize:
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    self.model_id,
+                    quantization_config=bnb_config,
+                    *args,
+                    **kwargs,
+                ).to(self.device)
+            else:
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    self.model_id, *args, **kwargs
+                ).to(self.device)
         except Exception as e:
             # self.logger.error(f"Failed to load the model or the tokenizer: {e}")
             # raise
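Model loading now happens once in __init__ and branches on quantize; bnb_config is presumably still built from quantization_config the way the removed load_model did. Below is a minimal sketch of a 4-bit setup a caller might pass in; the specific values are illustrative, not defaults of this class. As a caution, recent transformers releases reject .to(device) on bitsandbytes-quantized models and expect placement via device_map instead.

# Illustrative 4-bit quantization settings; values are examples, not library defaults.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}

# Roughly how such a dict becomes a loaded model (shown here with device_map,
# since bitsandbytes-quantized weights cannot be moved with .to()).
bnb_config = BitsAndBytesConfig(**quantization_config)
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",                          # placeholder model id
    quantization_config=bnb_config,
    device_map="auto",
)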
@@ -205,33 +210,6 @@ class HuggingfaceLLM:
         """Ashcnronous generate text for a given prompt"""
         return await asyncio.to_thread(self.run, task)

-    def load_model(self):
-        """Load the model"""
-        if not self.model or not self.tokenizer:
-            try:
-                self.tokenizer = AutoTokenizer.from_pretrained(
-                    self.model_id
-                )
-
-                bnb_config = (
-                    BitsAndBytesConfig(**self.quantization_config)
-                    if self.quantization_config
-                    else None
-                )
-
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    self.model_id, quantization_config=bnb_config
-                ).to(self.device)
-
-                if self.distributed:
-                    self.model = DDP(self.model)
-            except Exception as error:
-                self.logger.error(
-                    "Failed to load the model or the tokenizer:"
-                    f" {error}"
-                )
-                raise
-
     def concurrent_run(self, tasks: List[str], max_workers: int = 5):
         """Concurrently generate text for a list of prompts."""
         with concurrent.futures.ThreadPoolExecutor(
def concurrent_run(self, tasks: List[str], max_workers: int = 5): def concurrent_run(self, tasks: List[str], max_workers: int = 5):
"""Concurrently generate text for a list of prompts.""" """Concurrently generate text for a list of prompts."""
with concurrent.futures.ThreadPoolExecutor( with concurrent.futures.ThreadPoolExecutor(
@ -252,7 +230,7 @@ class HuggingfaceLLM:
results = [future.result() for future in futures] results = [future.result() for future in futures]
return results return results
def run(self, task: str): def run(self, task: str, *args, **kwargs):
""" """
Generate a response based on the prompt text. Generate a response based on the prompt text.
@@ -263,20 +241,12 @@ class HuggingfaceLLM:
         Returns:
         - Generated text (str).
         """
-        self.load_model()
-
-        max_length = self.max_length
-
-        self.print_dashboard(task)
-
         try:
             inputs = self.tokenizer.encode(task, return_tensors="pt")
-            # self.log.start()

             if self.decoding:
                 with torch.no_grad():
-                    for _ in range(max_length):
+                    for _ in range(self.max_length):
                         output_sequence = []

                         outputs = self.model.generate(
@@ -300,7 +270,11 @@ class HuggingfaceLLM:
             else:
                 with torch.no_grad():
                     outputs = self.model.generate(
-                        inputs, max_length=max_length, do_sample=True
+                        inputs,
+                        max_length=self.max_length,
+                        do_sample=True,
+                        *args,
+                        **kwargs,
                     )

             del inputs
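Because run() now forwards *args/**kwargs into model.generate, generation parameters can be supplied per call. A usage sketch, assuming the module path swarms.models.huggingface and that model_id and max_length are constructor arguments (only model_id is visible in this diff):

# Illustrative usage; model id and sampling values are examples only.
from swarms.models.huggingface import HuggingfaceLLM

llm = HuggingfaceLLM(model_id="gpt2", max_length=128)

# Extra keyword arguments now flow through run() into model.generate().
text = llm.run(
    "Write a haiku about distributed training.",
    temperature=0.7,
    top_p=0.9,
)
print(text)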
@@ -320,67 +294,8 @@ class HuggingfaceLLM:
             )
             raise

-    def __call__(self, task: str):
-        """
-        Generate a response based on the prompt text.
-
-        Args:
-        - task (str): Text to prompt the model.
-        - max_length (int): Maximum length of the response.
-
-        Returns:
-        - Generated text (str).
-        """
-        self.load_model()
-
-        max_length = self.max_length
-
-        self.print_dashboard(task)
-
-        try:
-            inputs = self.tokenizer.encode(
-                task, return_tensors="pt"
-            ).to(self.device)
-
-            # self.log.start()
-
-            if self.decoding:
-                with torch.no_grad():
-                    for _ in range(max_length):
-                        output_sequence = []
-
-                        outputs = self.model.generate(
-                            inputs,
-                            max_length=len(inputs) + 1,
-                            do_sample=True,
-                        )
-                        output_tokens = outputs[0][-1]
-                        output_sequence.append(output_tokens.item())
-
-                        # print token in real-time
-                        print(
-                            self.tokenizer.decode(
-                                [output_tokens],
-                                skip_special_tokens=True,
-                            ),
-                            end="",
-                            flush=True,
-                        )
-                        inputs = outputs
-            else:
-                with torch.no_grad():
-                    outputs = self.model.generate(
-                        inputs, max_length=max_length, do_sample=True
-                    )
-
-            del inputs
-            return self.tokenizer.decode(
-                outputs[0], skip_special_tokens=True
-            )
-        except Exception as e:
-            self.logger.error(f"Failed to generate the text: {e}")
-            raise
+    def __call__(self, task: str, *args, **kwargs):
+        return self.run(task, *args, **kwargs)

     async def __call_async__(self, task: str, *args, **kwargs) -> str:
         """Call the model asynchronously"""
