From e9bb8dcbf4f2f0ae3b23b10dd963df193c6a5001 Mon Sep 17 00:00:00 2001 From: Kye Date: Fri, 22 Dec 2023 23:38:35 -0500 Subject: [PATCH] [swarm.models][cleanup] --- swarms/models/kosmos2.py | 131 ------------------------------ swarms/models/kosmos_two.py | 87 ++++++++++---------- swarms/models/stable_diffusion.py | 12 +++ 3 files changed, 56 insertions(+), 174 deletions(-) delete mode 100644 swarms/models/kosmos2.py diff --git a/swarms/models/kosmos2.py b/swarms/models/kosmos2.py deleted file mode 100644 index 9a9a0de3..00000000 --- a/swarms/models/kosmos2.py +++ /dev/null @@ -1,131 +0,0 @@ -from typing import List, Tuple - -from PIL import Image -from pydantic import BaseModel, model_validator, validator -from transformers import AutoModelForVision2Seq, AutoProcessor - - -# Assuming the Detections class represents the output of the model prediction -class Detections(BaseModel): - xyxy: List[Tuple[float, float, float, float]] - class_id: List[int] - confidence: List[float] - - @model_validator - def check_length(cls, values): - assert ( - len(values.get("xyxy")) - == len(values.get("class_id")) - == len(values.get("confidence")) - ), "All fields must have the same length." - return values - - @validator( - "xyxy", "class_id", "confidence", pre=True, each_item=True - ) - def check_not_empty(cls, v): - if isinstance(v, list) and len(v) == 0: - raise ValueError("List must not be empty") - return v - - @classmethod - def empty(cls): - return cls(xyxy=[], class_id=[], confidence=[]) - - -class Kosmos2(BaseModel): - """ - Kosmos2 - - Args: - ------ - model: AutoModelForVision2Seq - processor: AutoProcessor - - Usage: - ------ - >>> from swarms import Kosmos2 - >>> from swarms.models.kosmos2 import Detections - >>> from PIL import Image - >>> model = Kosmos2.initialize() - >>> image = Image.open("path_to_image.jpg") - >>> detections = model(image) - >>> print(detections) - - """ - - model: AutoModelForVision2Seq - processor: AutoProcessor - - @classmethod - def initialize(cls): - model = AutoModelForVision2Seq.from_pretrained( - "ydshieh/kosmos-2-patch14-224", trust_remote_code=True - ) - processor = AutoProcessor.from_pretrained( - "ydshieh/kosmos-2-patch14-224", trust_remote_code=True - ) - return cls(model=model, processor=processor) - - def __call__(self, img: str) -> Detections: - image = Image.open(img) - prompt = "An image of" - - inputs = self.processor( - text=prompt, images=image, return_tensors="pt" - ) - outputs = self.model.generate( - **inputs, use_cache=True, max_new_tokens=64 - ) - - generated_text = self.processor.batch_decode( - outputs, skip_special_tokens=True - )[0] - - # The actual processing of generated_text to entities would go here - # For the purpose of this example, assume a mock function 'extract_entities' exists: - entities = self.extract_entities(generated_text) - - # Convert entities to detections format - detections = self.process_entities_to_detections( - entities, image - ) - return detections - - def extract_entities( - self, text: str - ) -> List[Tuple[str, Tuple[float, float, float, float]]]: - # Placeholder function for entity extraction - # This should be replaced with the actual method of extracting entities - return [] - - def process_entities_to_detections( - self, - entities: List[Tuple[str, Tuple[float, float, float, float]]], - image: Image.Image, - ) -> Detections: - if not entities: - return Detections.empty() - - class_ids = [0] * len( - entities - ) # Replace with actual class ID extraction logic - xyxys = [ - ( - e[1][0] * image.width, - e[1][1] * image.height, - e[1][2] * image.width, - e[1][3] * image.height, - ) - for e in entities - ] - confidences = [1.0] * len(entities) # Placeholder confidence - - return Detections( - xyxy=xyxys, class_id=class_ids, confidence=confidences - ) - - -# Usage: -# kosmos2 = Kosmos2.initialize() -# detections = kosmos2(img="path_to_image.jpg") diff --git a/swarms/models/kosmos_two.py b/swarms/models/kosmos_two.py index 3b1d4233..a0c5a86a 100644 --- a/swarms/models/kosmos_two.py +++ b/swarms/models/kosmos_two.py @@ -8,6 +8,8 @@ import torchvision.transforms as T from PIL import Image from transformers import AutoModelForVision2Seq, AutoProcessor +from swarms.models.base_multimodal_model import BaseMultimodalModel + # utils def is_overlapping(rect1, rect2): @@ -16,7 +18,7 @@ def is_overlapping(rect1, rect2): return not (x2 < x3 or x1 > x4 or y2 < y3 or y1 > y4) -class Kosmos: +class Kosmos(BaseMultimodalModel): """ Kosmos model by Yen-Chun Shieh @@ -35,9 +37,14 @@ class Kosmos: def __init__( self, model_name="ydshieh/kosmos-2-patch14-224", + max_new_tokens: int = 64, *args, **kwargs, ): + super(Kosmos, self).__init__(*args, **kwargs) + + self.max_new_tokens = max_new_tokens + self.model = AutoModelForVision2Seq.from_pretrained( model_name, trust_remote_code=True, *args, **kwargs ) @@ -45,81 +52,75 @@ class Kosmos: model_name, trust_remote_code=True, *args, **kwargs ) - def get_image(self, url): - """Image""" + def get_image(self, url: str): + """Get image from url + + Args: + url (str): url of image + + Returns: + _type_: _description_ + """ return Image.open(requests.get(url, stream=True).raw) - def run(self, prompt, image): - """Run Kosmos""" - inputs = self.processor( - text=prompt, images=image, return_tensors="pt" - ) - generated_ids = self.model.generate( - pixel_values=inputs["pixel_values"], - input_ids=inputs["input_ids"][:, :-1], - attention_mask=inputs["attention_mask"][:, :-1], - img_features=None, - img_attn_mask=inputs["img_attn_mask"][:, :-1], - use_cache=True, - max_new_tokens=64, - ) - generated_texts = self.processor.batch_decode( - generated_ids, - skip_special_tokens=True, - )[0] - processed_text, entities = ( - self.processor.post_process_generation(generated_texts) - ) + def run(self, task: str, image: str, *args, **kwargs): + """Run the model - def __call__(self, prompt, image): - """Run call""" + Args: + task (str): task to run + image (str): img url + """ inputs = self.processor( - text=prompt, images=image, return_tensors="pt" + text=task, images=image, return_tensors="pt" ) generated_ids = self.model.generate( pixel_values=inputs["pixel_values"], input_ids=inputs["input_ids"][:, :-1], attention_mask=inputs["attention_mask"][:, :-1], - img_features=None, + image_embeds=None, img_attn_mask=inputs["img_attn_mask"][:, :-1], use_cache=True, - max_new_tokens=64, + max_new_tokens=self.max_new_tokens, ) + generated_texts = self.processor.batch_decode( generated_ids, skip_special_tokens=True, )[0] + processed_text, entities = ( self.processor.post_process_generation(generated_texts) ) + return processed_text, entities + # tasks def multimodal_grounding(self, phrase, image_url): - prompt = f" {phrase} " - self.run(prompt, image_url) + task = f" {phrase} " + self.run(task, image_url) def referring_expression_comprehension(self, phrase, image_url): - prompt = f" {phrase} " - self.run(prompt, image_url) + task = f" {phrase} " + self.run(task, image_url) def referring_expression_generation(self, phrase, image_url): - prompt = ( + task = ( "" " It is" ) - self.run(prompt, image_url) + self.run(task, image_url) def grounded_vqa(self, question, image_url): - prompt = f" Question: {question} Answer:" - self.run(prompt, image_url) + task = f" Question: {question} Answer:" + self.run(task, image_url) def grounded_image_captioning(self, image_url): - prompt = " An image of" - self.run(prompt, image_url) + task = " An image of" + self.run(task, image_url) def grounded_image_captioning_detailed(self, image_url): - prompt = " Describe this image in detail" - self.run(prompt, image_url) + task = " Describe this image in detail" + self.run(task, image_url) def draw_entity_boxes_on_image( image, entities, show=False, save_path=None @@ -320,7 +321,7 @@ class Kosmos: return new_image - def generate_boxees(self, prompt, image_url): + def generate_boxees(self, task, image_url): image = self.get_image(image_url) - processed_text, entities = self.process_prompt(prompt, image) + processed_text, entities = self.process_task(task, image) self.draw_entity_boxes_on_image(image, entities, show=True) diff --git a/swarms/models/stable_diffusion.py b/swarms/models/stable_diffusion.py index 7b363d02..a0068531 100644 --- a/swarms/models/stable_diffusion.py +++ b/swarms/models/stable_diffusion.py @@ -140,6 +140,18 @@ class StableDiffusion: return image_paths def generate_and_move_image(self, prompt, iteration, folder_path): + """ + Generates an image based on the given prompt and moves it to the specified folder. + + Args: + prompt (str): The prompt used to generate the image. + iteration (int): The iteration number. + folder_path (str): The path to the folder where the image will be moved. + + Returns: + str: The path of the moved image. + + """ # Generate the image image_paths = self.run(prompt) if not image_paths: