[REFACTOR][HuggingfaceLLM]

2 years ago · e8ca14f071
parent 41b858a91d
commit e8ca14f071
3 changed files with 33 additions and 117 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

 [tool.poetry]
 name = "swarms"
-version = "3.2.7"
+version = "3.2.8"
 description = "Swarms - Pytorch"
 license = "MIT"
 authors = ["Kye Gomez <kye@apac.ai>"]
--- a/swarms/models/cog_agent.py
+++ b/swarms/models/cog_agent.py
@ -8,11 +8,11 @@ device_check = "cuda" if torch.cuda.is_available() else "cpu"

 class CogAgent(BaseMultiModalModel):
    """CogAgent
-    
+
    Multi-modal conversational agent that can be used to chat with
    images and text. It is based on the CogAgent model from the
    ModelScope library.
-    
+
    Attributes:
        model_name (str): The name of the model to be used
        tokenizer_name (str): The name of the tokenizer to be used
@ -21,13 +21,14 @@ class CogAgent(BaseMultiModalModel):
        load_in_4bit (bool): Whether to load in 4-bit
        trust_remote_code (bool): Whether to trust remote code
        device (str): The device to be used
-    
+
    Examples:
        >>> from swarms.models.cog_agent import CogAgent
        >>> cog_agent = CogAgent()
        >>> cog_agent.run("How are you?", "images/1.jpg")
        <s> I'm fine. How are you? </s>
    """
+
    def __init__(
        self,
        model_name: str = "ZhipuAI/cogagent-chat",
@ -73,8 +74,8 @@ class CogAgent(BaseMultiModalModel):
        Args:
            task (str): The task to be performed
            img (str): The image path
-            
-        """ 
+
+        """
        image = Image.open(img).convert("RGB")

        input_by_model = self.model.build_conversation_input_ids(
--- a/swarms/models/huggingface.py
+++ b/swarms/models/huggingface.py
@ -3,18 +3,18 @@ import concurrent.futures
 import logging
 from typing import List, Tuple

-
 import torch
 from termcolor import colored
-from torch.nn.parallel import DistributedDataParallel as DDP
 from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
 )

+from swarms.models.base_llm import AbstractLLM
+

-class HuggingfaceLLM:
+class HuggingfaceLLM(AbstractLLM):
    """
    A class for running inference on a given model.

@ -123,7 +123,6 @@ class HuggingfaceLLM:
        quantize: bool = False,
        quantization_config: dict = None,
        verbose=False,
-        # logger=None,
        distributed=False,
        decoding=False,
        max_workers: int = 5,
@ -135,6 +134,7 @@ class HuggingfaceLLM:
        *args,
        **kwargs,
    ):
+        super().__init__(*args, **kwargs)
        self.logger = logging.getLogger(__name__)
        self.device = (
            device
@ -174,16 +174,21 @@ class HuggingfaceLLM:

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(
-                self.model_id, *args, **kwargs
-            )
-            self.model = AutoModelForCausalLM.from_pretrained(
-                self.model_id,
-                quantization_config=bnb_config,
-                *args,
-                **kwargs,
+                self.model_id
            )

-            self.model  # .to(self.device)
+            if quantize:
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    self.model_id,
+                    quantization_config=bnb_config,
+                    *args,
+                    **kwargs,
+                ).to(self.device)
+            else:
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    self.model_id, *args, **kwargs
+                ).to(self.device)
+
        except Exception as e:
            # self.logger.error(f"Failed to load the model or the tokenizer: {e}")
            # raise
@ -205,33 +210,6 @@ class HuggingfaceLLM:
        """Asynchronously generate text for a given prompt"""
        return await asyncio.to_thread(self.run, task)

-    def load_model(self):
-        """Load the model"""
-        if not self.model or not self.tokenizer:
-            try:
-                self.tokenizer = AutoTokenizer.from_pretrained(
-                    self.model_id
-                )
-
-                bnb_config = (
-                    BitsAndBytesConfig(**self.quantization_config)
-                    if self.quantization_config
-                    else None
-                )
-
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    self.model_id, quantization_config=bnb_config
-                ).to(self.device)
-
-                if self.distributed:
-                    self.model = DDP(self.model)
-            except Exception as error:
-                self.logger.error(
-                    "Failed to load the model or the tokenizer:"
-                    f" {error}"
-                )
-                raise
-
    def concurrent_run(self, tasks: List[str], max_workers: int = 5):
        """Concurrently generate text for a list of prompts."""
        with concurrent.futures.ThreadPoolExecutor(
@ -252,7 +230,7 @@ class HuggingfaceLLM:
            results = [future.result() for future in futures]
        return results

-    def run(self, task: str):
+    def run(self, task: str, *args, **kwargs):
        """
        Generate a response based on the prompt text.

@ -263,20 +241,12 @@ class HuggingfaceLLM:
        Returns:
        - Generated text (str).
        """
-        self.load_model()
-
-        max_length = self.max_length
-
-        self.print_dashboard(task)
-
        try:
            inputs = self.tokenizer.encode(task, return_tensors="pt")

-            # self.log.start()
-
            if self.decoding:
                with torch.no_grad():
-                    for _ in range(max_length):
+                    for _ in range(self.max_length):
                        output_sequence = []

                        outputs = self.model.generate(
@ -300,7 +270,11 @@ class HuggingfaceLLM:
            else:
                with torch.no_grad():
                    outputs = self.model.generate(
-                        inputs, max_length=max_length, do_sample=True
+                        inputs,
+                        max_length=self.max_length,
+                        do_sample=True,
+                        *args,
+                        **kwargs,
                    )

            del inputs
@ -320,67 +294,8 @@ class HuggingfaceLLM:
            )
            raise

-    def __call__(self, task: str):
-        """
-        Generate a response based on the prompt text.
-
-        Args:
-        - task (str): Text to prompt the model.
-        - max_length (int): Maximum length of the response.
-
-        Returns:
-        - Generated text (str).
-        """
-        self.load_model()
-
-        max_length = self.max_length
-
-        self.print_dashboard(task)
-
-        try:
-            inputs = self.tokenizer.encode(
-                task, return_tensors="pt"
-            ).to(self.device)
-
-            # self.log.start()
-
-            if self.decoding:
-                with torch.no_grad():
-                    for _ in range(max_length):
-                        output_sequence = []
-
-                        outputs = self.model.generate(
-                            inputs,
-                            max_length=len(inputs) + 1,
-                            do_sample=True,
-                        )
-                        output_tokens = outputs[0][-1]
-                        output_sequence.append(output_tokens.item())
-
-                        # print token in real-time
-                        print(
-                            self.tokenizer.decode(
-                                [output_tokens],
-                                skip_special_tokens=True,
-                            ),
-                            end="",
-                            flush=True,
-                        )
-                        inputs = outputs
-            else:
-                with torch.no_grad():
-                    outputs = self.model.generate(
-                        inputs, max_length=max_length, do_sample=True
-                    )
-
-            del inputs
-
-            return self.tokenizer.decode(
-                outputs[0], skip_special_tokens=True
-            )
-        except Exception as e:
-            self.logger.error(f"Failed to generate the text: {e}")
-            raise
+    def __call__(self, task: str, *args, **kwargs):
+        return self.run(task, *args, **kwargs)

    async def __call_async__(self, task: str, *args, **kwargs) -> str:
        """Call the model asynchronously""" ""