[Idefics -> BaseMultiModalModel] [Vilt -> BaseMultiModalModel]

2 years ago · 79d8f149b7
parent 4bef09a252
commit 79d8f149b7
4 changed files with 70 additions and 10 deletions
--- a/swarms/models/idefics.py
+++ b/swarms/models/idefics.py
@@ -102,9 +102,13 @@ class Idefics(BaseMultiModalModel):
            model_name, torch_dtype=torch_dtype, *args, **kwargs
        ).to(self.device)
-        self.processor = AutoProcessor.from_pretrained(model_name)
+        self.processor = AutoProcessor.from_pretrained(
            model_name, *args, **kwargs
        )
-    def run(self, task: str, *args, **kwargs) -> str:
+    def run(
        self, task: str = None, img: str = None, *args, **kwargs
    ) -> str:
        """
        Generates text based on the provided prompts.
--- a/swarms/models/openai_tts.py
+++ b/swarms/models/openai_tts.py
@@ -61,6 +61,8 @@ class OpenAITTS(AbstractLLM):
        chunk_size=1024 * 1024,
        autosave: bool = False,
        saved_filepath: str = None,
        *args,
        **kwargs,
    ):
        super().__init__()
        self.model_name = model_name
--- a/swarms/models/vilt.py
+++ b/swarms/models/vilt.py
@@ -1,9 +1,11 @@
 from transformers import ViltProcessor, ViltForQuestionAnswering
 import requests
 from PIL import Image
 from transformers import ViltForQuestionAnswering, ViltProcessor
 from swarms.models.base_multimodal_model import BaseMultiModalModel
-class Vilt:
+
 class Vilt(BaseMultiModalModel):
    """
    Vision-and-Language Transformer (ViLT) model fine-tuned on VQAv2.
    It was introduced in the paper ViLT: Vision-and-Language Transformer Without
@@ -21,15 +23,21 @@ class Vilt:
    """
-    def __init__(self):
+    def __init__(
        self,
        model_name: str = "dandelin/vilt-b32-finetuned-vqa",
        *args,
        **kwargs,
    ):
        super().__init__(model_name, *args, **kwargs)
        self.processor = ViltProcessor.from_pretrained(
-            "dandelin/vilt-b32-finetuned-vqa"
+            model_name, *args, **kwargs
        )
        self.model = ViltForQuestionAnswering.from_pretrained(
-            "dandelin/vilt-b32-finetuned-vqa"
+            model_name, *args, **kwargs
        )
-    def __call__(self, text: str, image_url: str):
+    def run(self, task: str = None, img: str = None, *args, **kwargs):
        """
        Run the model
@@ -38,9 +46,9 @@ class Vilt:
        """
        # Download the image
-        image = Image.open(requests.get(image_url, stream=True).raw)
+        image = Image.open(requests.get(img, stream=True).raw)
-        encoding = self.processor(image, text, return_tensors="pt")
+        encoding = self.processor(image, task, return_tensors="pt")
        # Forward pass
        outputs = self.model(**encoding)
--- a/swarms/structs/base.py
+++ b/swarms/structs/base.py
@@ -15,6 +15,52 @@ except ImportError as error:
 class BaseStructure(ABC):
    """Base structure.
    Attributes:
        name (Optional[str]): _description_
        description (Optional[str]): _description_
        save_metadata (bool): _description_
        save_artifact_path (Optional[str]): _description_
        save_metadata_path (Optional[str]): _description_
        save_error_path (Optional[str]): _description_
    Methods:
        run: _description_
        save_to_file: _description_
        load_from_file: _description_
        save_metadata: _description_
        load_metadata: _description_
        log_error: _description_
        save_artifact: _description_
        load_artifact: _description_
        log_event: _description_
        run_async: _description_
        save_metadata_async: _description_
        load_metadata_async: _description_
        log_error_async: _description_
        save_artifact_async: _description_
        load_artifact_async: _description_
        log_event_async: _description_
        asave_to_file: _description_
        aload_from_file: _description_
        run_in_thread: _description_
        save_metadata_in_thread: _description_
        run_concurrent: _description_
        compress_data: _description_
        decompres_data: _description_
        run_batched: _description_
        load_config: _description_
        backup_data: _description_
        monitor_resources: _description_
        run_with_resources: _description_
        run_with_resources_batched: _description_
    Examples:
    """
    def __init__(
        self,
        name: Optional[str] = None,