gpt4vision api

pull/307/head
Kye 2 years ago committed by Zack
parent 80f288c832
commit 13c54d0b00

@ -1,34 +1,17 @@
# Description: This is an example of how to use the Agent class to run a multi-modal workflow
import os
from dotenv import load_dotenv
from swarms.structs import Flow
from swarms.models.gpt4_vision_api import GPT4VisionAPI
from swarms.structs import Agent
# Load the environment variables
load_dotenv()
# Get the API key from the environment
api_key = os.environ.get("OPENAI_API_KEY")
llm = GPT4VisionAPI()
# Initialize the language model
llm = GPT4VisionAPI(
openai_api_key=api_key,
max_tokens=500,
)
# Initialize the language model
task = "What is the color of the object?"
img = "images/swarms.jpeg"
## Initialize the workflow
agent = Agent(
flow = Flow(
llm=llm,
max_loops="auto",
autosave=True,
max_loops='auto',
dashboard=True,
multi_modal=True,
)
# Run the workflow on a task
out = agent.run(task=task, img=img)
print(out)
flow.run(task=task, img=img)
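For reference, the post-change example reads as follows when consolidated (a sketch assembled from the added lines above; assumes images/swarms.jpeg exists and OPENAI_API_KEY is set):

import os
from dotenv import load_dotenv
from swarms.models.gpt4_vision_api import GPT4VisionAPI
from swarms.structs import Agent

# Load the API key from the environment
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# Initialize the vision model and the multi-modal agent
llm = GPT4VisionAPI(openai_api_key=api_key, max_tokens=500)
agent = Agent(
    llm=llm,
    max_loops="auto",
    autosave=True,
    dashboard=True,
    multi_modal=True,
)

# Run the agent on a task and an image
out = agent.run(task="What is the color of the object?", img="images/swarms.jpeg")
print(out)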

@ -1,22 +1,3 @@
<<<<<<< HEAD
from swarms.structs import Agent
from swarms.models.gpt4_vision_api import GPT4VisionAPI
llm = GPT4VisionAPI()
task = "What is the color of the object?"
img = "images/swarms.jpeg"
## Initialize the workflow
agent = Agent(
llm=llm,
max_loops="auto",
dashboard=True,
)
agent.run(task=task, img=img)
=======
from swarms.structs import Flow
from swarms.models import Idefics
@ -50,4 +31,3 @@ out = flow.run(task)
# out = flow.print_history_and_memory()
# # out = flow.save_state("flow_state.json")
# print(out)
>>>>>>> fa52e094 (CLEAN UP: Flow and demo layouts)

@ -8,14 +8,7 @@ from swarms.models.openai_models import (
AzureOpenAI,
OpenAIChat,
) # noqa: E402
<<<<<<< HEAD
# from swarms.models.vllm import vLLM # noqa: E402
# from swarms.models.zephyr import Zephyr # noqa: E402
=======
from swarms.models.zephyr import Zephyr # noqa: E402
>>>>>>> 49c7b97c (code quality fixes: line length = 80)
from swarms.models.biogpt import BioGPT # noqa: E402
from swarms.models.huggingface import HuggingfaceLLM # noqa: E402
from swarms.models.wizard_storytelling import (
@ -30,11 +23,9 @@ from swarms.models.base_multimodal_model import (
from swarms.models.idefics import Idefics # noqa: E402
from swarms.models.vilt import Vilt # noqa: E402
from swarms.models.nougat import Nougat # noqa: E402
from swarms.models.layoutlm_document_qa import (
LayoutLMDocumentQA,
) # noqa: E402
from swarms.models.gpt4_vision_api import GPT4VisionAPI # noqa: E402
from swarms.models.openai_tts import OpenAITTS # noqa: E402
from swarms.models.layoutlm_document_qa import LayoutLMDocumentQA # noqa: E402
from swarms.models.gpt4_vision_api import GPT4VisionAPI # noqa: E402
# from swarms.models.gpt4v import GPT4Vision
# from swarms.models.dalle3 import Dalle3
# from swarms.models.distilled_whisperx import DistilWhisperModel # noqa: E402
@ -64,6 +55,5 @@ __all__ = [
# "Dalle3",
# "DistilWhisperModel",
"GPT4VisionAPI",
# "vLLM",
"OpenAITTS",
]
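With these exports in place, downstream code can import the model from the package root; a minimal sketch, assuming GPT4VisionAPI stays in __all__:

from swarms.models import GPT4VisionAPI

llm = GPT4VisionAPI()  # picks up OPENAI_API_KEY from the environment
print(llm.run("What is the color of the object?", "images/swarms.jpeg"))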

@ -1,36 +1,13 @@
import base64
import json
import logging
import os
from typing import Optional
import aiohttp
import requests
from dotenv import load_dotenv
from termcolor import colored
from swarms.models.base_multimodal_model import BaseMultiModalModel
try:
import cv2
except ImportError:
raise ImportError(
"OpenCV (cv2) is not installed. Please install it to use"
" the video features of this model."
)
# Load environment variables
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
gpt4_vision_system_prompt = """
You are a multi-modal autonomous agent. You are given a task and an image. You must generate a response to the task and image.
"""
class GPT4VisionAPI(BaseMultiModalModel):
class GPT4VisionAPI:
"""
GPT-4 Vision API
@ -40,9 +17,6 @@ class GPT4VisionAPI(BaseMultiModalModel):
----------
openai_api_key : str
The OpenAI API key. Defaults to the OPENAI_API_KEY environment variable.
max_tokens : int
The maximum number of tokens to generate. Defaults to 300.
Methods
-------
@ -60,282 +34,33 @@ class GPT4VisionAPI(BaseMultiModalModel):
>>> task = "What is the color of the object?"
>>> img = "https://i.imgur.com/2M2ZGwC.jpeg"
>>> llm.run(task, img)
"""
def __init__(
self,
openai_api_key: str = openai_api_key,
model_name: str = "gpt-4-vision-preview",
logging_enabled: bool = False,
max_workers: int = 10,
max_tokens: int = 300,
openai_proxy: str = "https://api.openai.com/v1/chat/completions",
beautify: bool = False,
streaming_enabled: Optional[bool] = False,
meta_prompt: Optional[bool] = False,
system_prompt: Optional[str] = gpt4_vision_system_prompt,
*args,
**kwargs,
openai_api_key: str = openai_api_key
):
super().__init__(*args, **kwargs)
super().__init__()
self.openai_api_key = openai_api_key
self.logging_enabled = logging_enabled
self.model_name = model_name
self.max_workers = max_workers
self.max_tokens = max_tokens
self.openai_proxy = openai_proxy
self.beautify = beautify
self.streaming_enabled = streaming_enabled
self.meta_prompt = meta_prompt
self.system_prompt = system_prompt
if self.logging_enabled:
logging.basicConfig(level=logging.DEBUG)
else:
# Disable debug logs for requests and urllib3
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
if self.meta_prompt:
self.system_prompt = self.meta_prompt_init()
def encode_image(self, img: str):
"""Encode image to base64."""
if not os.path.exists(img):
print(f"Image file not found: {img}")
return None
with open(img, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
def download_img_then_encode(self, img: str):
"""Download image from URL then encode image to base64 using requests"""
if not os.path.exists(img):
print(f"Image file not found: {img}")
return None
response = requests.get(img)
return base64.b64encode(response.content).decode("utf-8")
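Taken together, the two helpers cover local paths and URLs; the router below is a hypothetical convenience, not part of the class:

from swarms.models.gpt4_vision_api import GPT4VisionAPI

def encode_any(llm: GPT4VisionAPI, img: str):
    # Hypothetical helper: route URLs to the downloader, paths to the file encoder
    if img.startswith(("http://", "https://")):
        return llm.download_img_then_encode(img)
    return llm.encode_image(img)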
# Function to handle vision tasks
def run(self, task: str = None, img: str = None, *args, **kwargs):
def run(self, task: str, img: str):
"""Run the model."""
try:
base64_image = self.encode_image(img)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.openai_api_key}",
}
payload = {
"model": self.model_name,
"messages": [
{
"role": "system",
"content": [self.system_prompt],
},
{
"role": "user",
"content": [
{"type": "text", "text": task},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
},
},
],
},
],
"max_tokens": self.max_tokens,
}
response = requests.post(
self.openai_proxy, headers=headers, json=payload
)
out = response.json()
if "choices" in out and out["choices"]:
content = (
out["choices"][0]
.get("message", {})
.get("content", None)
)
if self.streaming_enabled:
content = self.stream_response(content)
return content
else:
print("No valid response in 'choices'")
return None
except Exception as error:
print(
f"Error with the request: {error}, make sure you"
" double check input types and positions"
)
return None
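Since run returns the message content or None, callers can branch on the result; a minimal sketch:

from swarms.models.gpt4_vision_api import GPT4VisionAPI

llm = GPT4VisionAPI(max_tokens=500)
answer = llm.run(task="What is the color of the object?", img="images/swarms.jpeg")
if answer is None:
    print("Request failed or returned no choices.")
else:
    print(answer)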
def video_prompt(self, frames):
"""
Generate the system prompt that accompanies a list of video frames.
The provided base64 frames are embedded directly in the prompt text.
Parameters
----------
frames : list
A list of base64 frames
Returns
-------
PROMPT : str
The system prompt
Examples
--------
>>> from swarms.models import GPT4VisionAPI
>>> llm = GPT4VisionAPI()
>>> video = "video.mp4"
>>> base64_frames = llm.process_video(video)
>>> prompt = llm.video_prompt(base64_frames)
>>> print(prompt)
"""
PROMPT = f"""
These are frames from a video that I want to upload. Generate a compelling description that I can upload along with the video:
{frames}
"""
return PROMPT
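Because {frames} is interpolated directly, the list's repr (including the base64 strings) ends up inside the prompt text, so the prompt can be very large; a sketch of the shape:

from swarms.models.gpt4_vision_api import GPT4VisionAPI

llm = GPT4VisionAPI()
prompt = llm.video_prompt(["<b64-frame-1>", "<b64-frame-2>"])
# prompt contains: "These are frames from a video ... ['<b64-frame-1>', '<b64-frame-2>']"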
def stream_response(self, content: str):
"""Stream the response of the output
Args:
content (str): _description_
"""
# Iterating over a string yields one character per chunk
for chunk in content:
print(chunk)
# Return the full text so callers that reassign the result keep it
return content
def process_video(self, video: str = None):
"""
Process a video into a list of base64 frames
Parameters
----------
video : str
The path to the video file
Returns
-------
base64_frames : list
A list of base64 frames
Examples
--------
>>> from swarms.models import GPT4VisionAPI
>>> llm = GPT4VisionAPI()
>>> video = "video.mp4"
>>> base64_frames = llm.process_video(video)
"""
video = cv2.VideoCapture(video)
base64_frames = []
while video.isOpened():
success, frame = video.read()
if not success:
break
_, buffer = cv2.imencode(".jpg", frame)
base64_frames.append(
base64.b64encode(buffer).decode("utf-8")
)
video.release()
print(len(base64_frames), "frames read.")
return base64_frames
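Every decoded frame is kept, so long videos produce large lists; callers are expected to sample them, as run_with_video does with frames[0::50]. A sketch of standalone use:

from swarms.models.gpt4_vision_api import GPT4VisionAPI

llm = GPT4VisionAPI()
frames = llm.process_video("video.mp4")  # list of base64-encoded JPEG frames
sampled = frames[0::50]                  # the same sampling run_with_video applies
print(len(frames), "frames read,", len(sampled), "kept after sampling")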
def run_with_video(
self,
task: str = None,
video: str = None,
*args,
**kwargs,
):
# Sample frames from the video; the task text is sent alongside them
frames = self.process_video(video)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {openai_api_key}",
}
payload = {
"model": self.model_name,
"messages": [
{
"role": "system",
"content": [self.system_prompt],
},
{
"role": "user",
"content": [
{"type": "text", "text": task},
*map(
lambda x: {"image": x, "resize": 768},
frames[0::50], # every 50th frame, to bound payload size
),
],
},
],
"max_tokens": self.max_tokens,
}
response = requests.post(
self.openai_proxy,
headers=headers,
json=payload,
)
out = response.json()
content = out["choices"][0]["message"]["content"]
if self.streaming_enabled:
content = self.stream_response(content)
if self.beautify:
content = colored(content, "cyan")
print(content)
else:
print(content)
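run_with_video prints the generated description rather than returning it; a minimal sketch of a call, assuming a local video.mp4:

from swarms.models.gpt4_vision_api import GPT4VisionAPI

llm = GPT4VisionAPI(beautify=True)  # cyan output via termcolor
llm.run_with_video(task="Describe this video for an upload caption.", video="video.mp4")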
def __call__(
self,
task: Optional[str] = None,
img: Optional[str] = None,
*args,
**kwargs,
):
"""Call the model
Args:
task (Optional[str], optional): _description_. Defaults to None.
img (Optional[str], optional): _description_. Defaults to None.
Raises:
error: _description_
"""
try:
base64_image = self.encode_image(img)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {openai_api_key}",
}
payload = {
"model": self.model_name,
"model": "gpt-4-vision-preview",
"messages": [
{
"role": "system",
"content": [self.system_prompt],
},
{
"role": "user",
"content": [
@ -347,56 +72,27 @@ class GPT4VisionAPI(BaseMultiModalModel):
},
},
],
},
}
],
"max_tokens": self.max_tokens,
"max_tokens": 300,
}
response = requests.post(
self.openai_proxy,
"https://api.openai.com/v1/chat/completions",
headers=headers,
json=payload,
)
out = response.json()
content = out["choices"][0]["message"]["content"]
if self.streaming_enabled:
content = self.stream_response(content)
if self.beautify:
content = colored(content, "cyan")
print(content)
else:
print(content)
out = out["choices"][0]["text"]
except Exception as error:
print(f"Error with the request: {error}")
raise error
# Async function to handle vision tasks
async def arun(
self,
task: Optional[str] = None,
img: Optional[str] = None,
):
"""
Asynchronously run the model
Overview:
---------
This method runs the model asynchronously on a single task and image.
Parameters:
----------
task : str
The task to run the model on.
img : str
The image to run the task on
"""
try:
def __call__(self, task: str, img: str):
"""Run the model."""
try:
base64_image = self.encode_image(img)
headers = {
"Content-Type": "application/json",
@ -418,57 +114,14 @@ class GPT4VisionAPI(BaseMultiModalModel):
],
}
],
"max_tokens": self.max_tokens,
"max_tokens": 300,
}
async with aiohttp.ClientSession() as session:
async with session.post(
self.openai_proxy,
headers=headers,
data=json.dumps(payload),
) as response:
out = await response.json()
content = out["choices"][0]["message"]["content"]
print(content)
return content
response = requests.post(
"https://api.openai.com/v1/chat/completions",
headers=headers,
json=payload,
)
return response.json()
except Exception as error:
print(f"Error with the request {error}")
print(f"Error with the request: {error}")
raise error
def health_check(self):
"""Health check for the GPT4Vision model"""
try:
response = requests.get(
"https://api.openai.com/v1/engines",
headers={"Authorization": f"Bearer {self.openai_api_key}"},
)
return response.status_code == 200
except requests.RequestException as error:
print(f"Health check failed: {error}")
return False
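health_check can be used to fail fast before issuing requests; a minimal sketch:

from swarms.models.gpt4_vision_api import GPT4VisionAPI

llm = GPT4VisionAPI()
if not llm.health_check():
    raise RuntimeError("OpenAI API unreachable; check network and API key.")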
def print_dashboard(self):
dashboard = colored(
f"""
GPT4Vision Dashboard
-------------------
Model: {self.model_name}
Max Workers: {self.max_workers}
OpenAIProxy: {self.openai_proxy}
""",
"green",
)
print(dashboard)
return dashboard
# def meta_prompt_init(self):
# """Meta Prompt
# Returns:
# _type_: _description_
# """
# META_PROMPT = """
# For any labels or markings on an image that you reference in your response, please
# enclose them in square brackets ([]) and list them explicitly. Do not use ranges; for
# example, instead of '1 - 4', list as '[1], [2], [3], [4]'. These labels could be
# numbers or letters and typically correspond to specific segments or parts of the image.
# """
# return META_PROMPT

@ -496,7 +496,7 @@ class Flow:
)
print(error)
def run(self, task: str, **kwargs):
def run(self, task: str, img: Optional[str] = None, **kwargs):
"""
Run the autonomous agent loop
@ -550,10 +550,17 @@ class Flow:
attempt = 0
while attempt < self.retry_attempts:
try:
response = self.llm(
task,
**kwargs,
)
if img:
response = self.llm(
task,
img,
**kwargs,
)
else:
response = self.llm(
task,
**kwargs,
)
# If code interpreter is enabled then run the code
if self.code_interpreter:
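With img threaded through Flow.run, the same flow handles text-only and multi-modal tasks; a sketch, assuming img defaults to None as in the signature above:

from swarms.structs import Flow
from swarms.models.gpt4_vision_api import GPT4VisionAPI

flow = Flow(llm=GPT4VisionAPI(), max_loops="auto")

# Image provided: forwarded to the underlying multi-modal model
flow.run(task="What is the color of the object?", img="images/swarms.jpeg")

# No image: falls back to the text-only call path
flow.run(task="Summarize your previous answer.")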
