From e8ca14f071d765e2fdcc5d53d72ac592bbe1d810 Mon Sep 17 00:00:00 2001 From: Kye Date: Fri, 5 Jan 2024 12:29:24 -0500 Subject: [PATCH] [REFACTOR][HuggingfaceLLM] --- pyproject.toml | 2 +- swarms/models/cog_agent.py | 11 +-- swarms/models/huggingface.py | 137 +++++++---------------------------- 3 files changed, 33 insertions(+), 117 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a40d9530..0fa7b2db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "swarms" -version = "3.2.7" +version = "3.2.8" description = "Swarms - Pytorch" license = "MIT" authors = ["Kye Gomez "] diff --git a/swarms/models/cog_agent.py b/swarms/models/cog_agent.py index 7a3ff684..2d0d09e9 100644 --- a/swarms/models/cog_agent.py +++ b/swarms/models/cog_agent.py @@ -8,11 +8,11 @@ device_check = "cuda" if torch.cuda.is_available() else "cpu" class CogAgent(BaseMultiModalModel): """CogAgent - + Multi-modal conversational agent that can be used to chat with images and text. It is based on the CogAgent model from the ModelScope library. - + Attributes: model_name (str): The name of the model to be used tokenizer_name (str): The name of the tokenizer to be used @@ -21,13 +21,14 @@ class CogAgent(BaseMultiModalModel): load_in_4bit (bool): Whether to load in 4-bit trust_remote_code (bool): Whether to trust remote code device (str): The device to be used - + Examples: >>> from swarms.models.cog_agent import CogAgent >>> cog_agent = CogAgent() >>> cog_agent.run("How are you?", "images/1.jpg") I'm fine. How are you? """ + def __init__( self, model_name: str = "ZhipuAI/cogagent-chat", @@ -73,8 +74,8 @@ class CogAgent(BaseMultiModalModel): Args: task (str): The task to be performed img (str): The image path - - """ + + """ image = Image.open(img).convert("RGB") input_by_model = self.model.build_conversation_input_ids( diff --git a/swarms/models/huggingface.py b/swarms/models/huggingface.py index bbb39223..d9447f3c 100644 --- a/swarms/models/huggingface.py +++ b/swarms/models/huggingface.py @@ -3,18 +3,18 @@ import concurrent.futures import logging from typing import List, Tuple - import torch from termcolor import colored -from torch.nn.parallel import DistributedDataParallel as DDP from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, ) +from swarms.models.base_llm import AbstractLLM + -class HuggingfaceLLM: +class HuggingfaceLLM(AbstractLLM): """ A class for running inference on a given model. @@ -123,7 +123,6 @@ class HuggingfaceLLM: quantize: bool = False, quantization_config: dict = None, verbose=False, - # logger=None, distributed=False, decoding=False, max_workers: int = 5, @@ -135,6 +134,7 @@ class HuggingfaceLLM: *args, **kwargs, ): + super().__init__(*args, **kwargs) self.logger = logging.getLogger(__name__) self.device = ( device @@ -174,16 +174,21 @@ class HuggingfaceLLM: try: self.tokenizer = AutoTokenizer.from_pretrained( - self.model_id, *args, **kwargs - ) - self.model = AutoModelForCausalLM.from_pretrained( - self.model_id, - quantization_config=bnb_config, - *args, - **kwargs, + self.model_id ) - self.model # .to(self.device) + if quantize: + self.model = AutoModelForCausalLM.from_pretrained( + self.model_id, + quantization_config=bnb_config, + *args, + **kwargs, + ).to(self.device) + else: + self.model = AutoModelForCausalLM.from_pretrained( + self.model_id, *args, **kwargs + ).to(self.device) + except Exception as e: # self.logger.error(f"Failed to load the model or the tokenizer: {e}") # raise @@ -205,33 +210,6 @@ class HuggingfaceLLM: """Ashcnronous generate text for a given prompt""" return await asyncio.to_thread(self.run, task) - def load_model(self): - """Load the model""" - if not self.model or not self.tokenizer: - try: - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_id - ) - - bnb_config = ( - BitsAndBytesConfig(**self.quantization_config) - if self.quantization_config - else None - ) - - self.model = AutoModelForCausalLM.from_pretrained( - self.model_id, quantization_config=bnb_config - ).to(self.device) - - if self.distributed: - self.model = DDP(self.model) - except Exception as error: - self.logger.error( - "Failed to load the model or the tokenizer:" - f" {error}" - ) - raise - def concurrent_run(self, tasks: List[str], max_workers: int = 5): """Concurrently generate text for a list of prompts.""" with concurrent.futures.ThreadPoolExecutor( @@ -252,7 +230,7 @@ class HuggingfaceLLM: results = [future.result() for future in futures] return results - def run(self, task: str): + def run(self, task: str, *args, **kwargs): """ Generate a response based on the prompt text. @@ -263,20 +241,12 @@ class HuggingfaceLLM: Returns: - Generated text (str). """ - self.load_model() - - max_length = self.max_length - - self.print_dashboard(task) - try: inputs = self.tokenizer.encode(task, return_tensors="pt") - # self.log.start() - if self.decoding: with torch.no_grad(): - for _ in range(max_length): + for _ in range(self.max_length): output_sequence = [] outputs = self.model.generate( @@ -300,7 +270,11 @@ class HuggingfaceLLM: else: with torch.no_grad(): outputs = self.model.generate( - inputs, max_length=max_length, do_sample=True + inputs, + max_length=self.max_length, + do_sample=True, + *args, + **kwargs, ) del inputs @@ -320,67 +294,8 @@ class HuggingfaceLLM: ) raise - def __call__(self, task: str): - """ - Generate a response based on the prompt text. - - Args: - - task (str): Text to prompt the model. - - max_length (int): Maximum length of the response. - - Returns: - - Generated text (str). - """ - self.load_model() - - max_length = self.max_length - - self.print_dashboard(task) - - try: - inputs = self.tokenizer.encode( - task, return_tensors="pt" - ).to(self.device) - - # self.log.start() - - if self.decoding: - with torch.no_grad(): - for _ in range(max_length): - output_sequence = [] - - outputs = self.model.generate( - inputs, - max_length=len(inputs) + 1, - do_sample=True, - ) - output_tokens = outputs[0][-1] - output_sequence.append(output_tokens.item()) - - # print token in real-time - print( - self.tokenizer.decode( - [output_tokens], - skip_special_tokens=True, - ), - end="", - flush=True, - ) - inputs = outputs - else: - with torch.no_grad(): - outputs = self.model.generate( - inputs, max_length=max_length, do_sample=True - ) - - del inputs - - return self.tokenizer.decode( - outputs[0], skip_special_tokens=True - ) - except Exception as e: - self.logger.error(f"Failed to generate the text: {e}") - raise + def __call__(self, task: str, *args, **kwargs): + return self.run(task, *args, **kwargs) async def __call_async__(self, task: str, *args, **kwargs) -> str: """Call the model asynchronously""" ""