Former-commit-id: 583d9384edde63090c59f56ac050ff984bed0178
pull/160/head
Kye 2 years ago
parent 6b2179e9a5
commit cd804d7c17

@@ -1300,6 +1300,56 @@ class VisualQuestionAnswering(BaseToolSet):
        return answer
#========================> handlers/image
import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor
# from core.prompts.file import IMAGE_PROMPT
from swarms.prompts.prompts import IMAGE_PROMPT
from swarms.utils.utils import BaseHandler
class ImageCaptioning(BaseHandler):
    def __init__(self, device):
        print("Initializing ImageCaptioning to %s" % device)
        self.device = device
        # Half precision on CUDA saves memory; CPU inference needs float32.
        self.torch_dtype = torch.float16 if "cuda" in device else torch.float32
        self.processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        )
        self.model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base", torch_dtype=self.torch_dtype
        ).to(self.device)

    def handle(self, filename: str):
        # Downscale so neither side exceeds 512px, keeping the aspect ratio,
        # and overwrite the file as an RGB PNG before captioning.
        img = Image.open(filename)
        width, height = img.size
        ratio = min(512 / width, 512 / height)
        width_new, height_new = (round(width * ratio), round(height * ratio))
        img = img.resize((width_new, height_new))
        img = img.convert("RGB")
        img.save(filename, "PNG")
        print(f"Resized image from {width}x{height} to {width_new}x{height_new}")

        inputs = self.processor(Image.open(filename), return_tensors="pt").to(
            self.device, self.torch_dtype
        )
        out = self.model.generate(**inputs)
        description = self.processor.decode(out[0], skip_special_tokens=True)
        print(
            f"\nProcessed ImageCaptioning, Input Image: {filename}, Output Text: {description}"
        )
        return IMAGE_PROMPT.format(filename=filename, description=description)
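
# Usage sketch (hedged): a minimal way this handler might be invoked. The
# device string and the "example.png" path are hypothetical placeholders.
if __name__ == "__main__":
    captioner = ImageCaptioning("cuda:0" if torch.cuda.is_available() else "cpu")
    # handle() resizes the file in place, captions it with BLIP, and returns
    # IMAGE_PROMPT filled with the filename and caption.
    print(captioner.handle("example.png"))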
#segment anything:
########################### MODELS

@@ -225,7 +225,7 @@ class AbstractUploader(ABC):
#========================= upload s3
import os
import boto3
@@ -262,7 +262,6 @@ class S3Uploader(AbstractUploader):
#========================= upload s3
#========================> upload/static
import os
import shutil
from pathlib import Path
@@ -291,11 +290,9 @@ class StaticUploader(AbstractUploader):
#========================> handlers/base #========================> handlers/base
import os
import shutil
import uuid
from enum import Enum
from pathlib import Path
from typing import Dict
import requests
@@ -402,7 +399,6 @@ class FileHandler:
#############===========================>
import pandas as pd
from swarms.prompts.prompts import DATAFRAME_PROMPT
@@ -425,77 +421,3 @@ class CsvToDataframe(BaseHandler):
# from autogpt.agent import Agent
# from swarms.agents.swarms import worker_node
# class MultiAgent(worker_node):
# def __init__(
# self,
# ai_name,
# memory,
# full_message_history,
# prompt,
# user_input,
# agent_id
# ):
# super().__init__(
# ai_name=ai_name,
# memory=memory,
# full_message_history=full_message_history,
# next_action_count=0,
# prompt=prompt,
# user_input=user_input,
# )
# self.agent_id = agent_id
# self.auditory_buffer = [] # holds the not-yet-processed parts of the conversation
# def receive_message(self, speaker, message):
# self.auditory_buffer.append((speaker.ai_name, message))
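
# A minimal, self-contained sketch of the message-buffer pattern the
# commented-out MultiAgent above suggests (every name here is hypothetical,
# not taken from this commit): each agent queues incoming messages and
# drains the queue when it next takes a turn.
class BufferedAgent:
    def __init__(self, ai_name: str, agent_id: int):
        self.ai_name = ai_name
        self.agent_id = agent_id
        self.auditory_buffer = []  # (speaker_name, message) pairs awaiting processing

    def receive_message(self, speaker_name: str, message: str):
        self.auditory_buffer.append((speaker_name, message))

    def drain_buffer(self) -> str:
        # Flatten the queued messages into one prompt fragment, then clear.
        transcript = "\n".join(f"{name}: {msg}" for name, msg in self.auditory_buffer)
        self.auditory_buffer.clear()
        return transcript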