import torch
from termcolor import colored
from transformers import AutoProcessor, IdeficsForVisionText2Text
from typing import Optional

from swarms.models.base_multimodal_model import BaseMultiModalModel


def autodetect_device():
    """
    Autodetect the device to use for inference.

    Returns
    -------
    str
        The device to use for inference ("cuda" if available, else "cpu").
    """
    return "cuda" if torch.cuda.is_available() else "cpu"


class Idefics(BaseMultiModalModel):
    """
    A class for multimodal inference using pre-trained models from the Hugging Face Hub.

    Attributes
    ----------
    device : str
        The device to use for inference.
    model_name : str, optional
        The name of the pre-trained model (default is "HuggingFaceM4/idefics-9b-instruct").
    processor : transformers.PreTrainedProcessor
        The pre-trained processor.
    max_length : int
        The maximum length of the generated text.
    chat_history : list
        The chat history.

    Methods
    -------
    run(task, img=None)
        Generates text based on the provided prompt.
    chat(user_input)
        Engages in a continuous bidirectional conversation based on the user input.
    set_model_name(model_name)
        Changes the model name.
    set_device(device)
        Changes the device used for inference.
    set_max_length(max_length)
        Changes the maximum length of the generated text.
    clear_chat_history()
        Clears the chat history.

    # Usage
    ```
    from swarms.models import Idefics

    model = Idefics()

    user_input = (
        "User: What is in this image?"
        " https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"
    )
    response = model.chat(user_input)
    print(response)

    user_input = (
        "User: And who is that?"
        " https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052"
    )
    response = model.chat(user_input)
    print(response)

    model.set_model_name("new_model_name")
    model.set_device("cpu")
    model.set_max_length(200)
    model.clear_chat_history()
    ```
    """

    def __init__(
        self,
        model_name: Optional[str] = "HuggingFaceM4/idefics-9b-instruct",
        device: Optional[str] = None,
        torch_dtype=torch.bfloat16,
        max_length: int = 100,
        batched_mode: bool = True,
        *args,
        **kwargs,
    ):
        # Initialize the parent class
        super().__init__(*args, **kwargs)
        self.model_name = model_name
        self.max_length = max_length
        self.batched_mode = batched_mode
        self.chat_history = []

        # Fall back to the autodetected device when none is given explicitly
        self.device = device if device else autodetect_device()

        self.model = IdeficsForVisionText2Text.from_pretrained(
            model_name, torch_dtype=torch_dtype, *args, **kwargs
        ).to(self.device)

        self.processor = AutoProcessor.from_pretrained(
            model_name, *args, **kwargs
        )

    def run(
        self, task: str = None, img: str = None, *args, **kwargs
    ) -> str:
        """
        Generates text based on the provided prompt.

        Parameters
        ----------
        task : str
            The task (prompt) to perform. Image URLs may be embedded
            directly in the prompt text.
        img : str, optional
            An optional image reference.

        Whether prompts are processed in batched mode is controlled by
        ``self.batched_mode``: if True, all prompts are processed
        together; if False, only the first prompt is processed.

        Returns
        -------
        list
            A list of generated text strings.
        """
        try:
            inputs = (
                self.processor(
                    task,
                    *args,
                    add_end_of_utterance_token=False,
                    return_tensors="pt",
                    **kwargs,
                ).to(self.device)
                if self.batched_mode
                else self.processor(task, return_tensors="pt").to(
                    self.device
                )
            )

            # Stop generation at the end-of-utterance token
            exit_condition = self.processor.tokenizer(
                "<end_of_utterance>", add_special_tokens=False
            ).input_ids

            # Prevent the model from emitting raw image placeholder tokens
            bad_words_ids = self.processor.tokenizer(
                ["<image>", "<fake_token_around_image>"],
                add_special_tokens=False,
            ).input_ids

            generated_ids = self.model.generate(
                **inputs,
                eos_token_id=exit_condition,
                bad_words_ids=bad_words_ids,
                max_length=self.max_length,
            )
            return self.processor.batch_decode(
                generated_ids, skip_special_tokens=True
            )
        except Exception as error:
            print(
                colored(
                    f"Error in {self.__class__.__name__} pipeline:"
                    f" {error}",
                    "red",
                )
            )
            # Re-raise so callers see the failure instead of a silent None
            raise
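
# A minimal end-to-end sketch of the `run` method above, adapted from the
# usage block in the class docstring. It assumes the
# "HuggingFaceM4/idefics-9b-instruct" checkpoint can be downloaded and that
# the host has enough memory to load it; the prompt interleaves text with an
# image URL, following the IDEFICS instruct convention.
if __name__ == "__main__":
    model = Idefics(max_length=128)

    # Image URLs embedded in the prompt are resolved by the processor.
    response = model.run(
        "User: What is in this image?"
        " https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"
    )
    print(response)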