gpt4vision api

pull/307/head
Kye 2 years ago committed by Zack
parent 80f288c832
commit 13c54d0b00

@@ -1,34 +1,17 @@
-# Description: This is an example of how to use the Agent class to run a multi-modal workflow
-import os
-from dotenv import load_dotenv
+from swarms.structs import Flow
 from swarms.models.gpt4_vision_api import GPT4VisionAPI
-from swarms.structs import Agent
-# Load the environment variables
-load_dotenv()
-# Get the API key from the environment
-api_key = os.environ.get("OPENAI_API_KEY")
-# Initialize the language model
-llm = GPT4VisionAPI(
-    openai_api_key=api_key,
-    max_tokens=500,
-)
-# Initialize the language model
+llm = GPT4VisionAPI()
 task = "What is the color of the object?"
 img = "images/swarms.jpeg"
 ## Initialize the workflow
-agent = Agent(
+flow = Flow(
     llm=llm,
-    max_loops="auto",
-    autosave=True,
+    max_loops='auto',
     dashboard=True,
-    multi_modal=True,
 )
-# Run the workflow on a task
-out = agent.run(task=task, img=img)
-print(out)
+flow.run(task=task, img=img)
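
For reference, the fuller variant shown on the removed side of this hunk condenses into one runnable sketch. This is illustrative only: it assumes the swarms version in use exposes Agent and GPT4VisionAPI with these keyword arguments, that OPENAI_API_KEY is available via a .env file, and that images/swarms.jpeg exists locally.

import os
from dotenv import load_dotenv
from swarms.models.gpt4_vision_api import GPT4VisionAPI
from swarms.structs import Agent

# Load OPENAI_API_KEY from a local .env file
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# Vision-capable model wrapper, configured as in the removed example
llm = GPT4VisionAPI(openai_api_key=api_key, max_tokens=500)

# Multi-modal agent loop around the model
agent = Agent(llm=llm, max_loops="auto", dashboard=True, multi_modal=True)
out = agent.run(task="What is the color of the object?", img="images/swarms.jpeg")
print(out)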

@@ -1,22 +1,3 @@
-<<<<<<< HEAD
-from swarms.structs import Agent
-from swarms.models.gpt4_vision_api import GPT4VisionAPI
-llm = GPT4VisionAPI()
-task = "What is the color of the object?"
-img = "images/swarms.jpeg"
-## Initialize the workflow
-agent = Agent(
-    llm=llm,
-    max_loops="auto",
-    dashboard=True,
-)
-agent.run(task=task, img=img)
-=======
 from swarms.structs import Flow
 from swarms.models import Idefics
@@ -50,4 +31,3 @@ out = flow.run(task)
 # out = flow.print_history_and_memory()
 # # out = flow.save_state("flow_state.json")
 # print(out)
->>>>>>> fa52e094 (CLEAN UP: Flow and demo layouts)

@@ -8,14 +8,7 @@ from swarms.models.openai_models import (
     AzureOpenAI,
     OpenAIChat,
 )  # noqa: E402
-<<<<<<< HEAD
-# from swarms.models.vllm import vLLM  # noqa: E402
-# from swarms.models.zephyr import Zephyr  # noqa: E402
-=======
 from swarms.models.zephyr import Zephyr  # noqa: E402
->>>>>>> 49c7b97c (code quality fixes: line length = 80)
 from swarms.models.biogpt import BioGPT  # noqa: E402
 from swarms.models.huggingface import HuggingfaceLLM  # noqa: E402
 from swarms.models.wizard_storytelling import (
@@ -30,11 +23,9 @@ from swarms.models.base_multimodal_model import (
 from swarms.models.idefics import Idefics  # noqa: E402
 from swarms.models.vilt import Vilt  # noqa: E402
 from swarms.models.nougat import Nougat  # noqa: E402
-from swarms.models.layoutlm_document_qa import (
-    LayoutLMDocumentQA,
-)  # noqa: E402
+from swarms.models.layoutlm_document_qa import LayoutLMDocumentQA  # noqa: E402
 from swarms.models.gpt4_vision_api import GPT4VisionAPI  # noqa: E402
-from swarms.models.openai_tts import OpenAITTS  # noqa: E402
 # from swarms.models.gpt4v import GPT4Vision
 # from swarms.models.dalle3 import Dalle3
 # from swarms.models.distilled_whisperx import DistilWhisperModel  # noqa: E402
@@ -64,6 +55,5 @@ __all__ = [
     # "Dalle3",
     # "DistilWhisperModel",
     "GPT4VisionAPI",
-    # "vLLM",
-    "OpenAITTS",
 ]
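
After this cleanup, GPT4VisionAPI is re-exported from the package root, so downstream code can import it directly. A minimal sketch, assuming OPENAI_API_KEY is set in the environment and the image path exists:

from swarms.models import GPT4VisionAPI

llm = GPT4VisionAPI()  # constructor defaults to the module-level OPENAI_API_KEY lookup
out = llm("What is the color of the object?", "images/swarms.jpeg")  # __call__ returns the raw JSON response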

@@ -1,36 +1,13 @@
 import base64
-import json
-import logging
 import os
-from typing import Optional
-import aiohttp
 import requests
 from dotenv import load_dotenv
-from termcolor import colored
-from swarms.models.base_multimodal_model import BaseMultiModalModel
-try:
-    import cv2
-except ImportError:
-    print(
-        "OpenCV not installed. Please install OpenCV to use this"
-        " model."
-    )
-    raise ImportError
 # Load environment variables
 load_dotenv()
 openai_api_key = os.getenv("OPENAI_API_KEY")
-gpt4_vision_system_prompt = """
-You are an multi-modal autonomous agent. You are given a task and an image. You must generate a response to the task and image.
-"""
-class GPT4VisionAPI(BaseMultiModalModel):
+class GPT4VisionAPI:
     """
     GPT-4 Vision API
@@ -40,9 +17,6 @@ class GPT4VisionAPI(BaseMultiModalModel):
     ----------
     openai_api_key : str
         The OpenAI API key. Defaults to the OPENAI_API_KEY environment variable.
-    max_tokens : int
-        The maximum number of tokens to generate. Defaults to 300.
     Methods
     -------
@@ -63,266 +37,21 @@ class GPT4VisionAPI(BaseMultiModalModel):
     """
     def __init__(
         self,
-        openai_api_key: str = openai_api_key,
-        model_name: str = "gpt-4-vision-preview",
-        logging_enabled: bool = False,
-        max_workers: int = 10,
-        max_tokens: str = 300,
-        openai_proxy: str = "https://api.openai.com/v1/chat/completions",
-        beautify: bool = False,
-        streaming_enabled: Optional[bool] = False,
-        meta_prompt: Optional[bool] = False,
-        system_prompt: Optional[str] = gpt4_vision_system_prompt,
-        *args,
-        **kwargs,
+        openai_api_key: str = openai_api_key
     ):
-        super(GPT4VisionAPI).__init__(*args, **kwargs)
+        super().__init__()
         self.openai_api_key = openai_api_key
-        self.logging_enabled = logging_enabled
-        self.model_name = model_name
-        self.max_workers = max_workers
-        self.max_tokens = max_tokens
-        self.openai_proxy = openai_proxy
-        self.beautify = beautify
-        self.streaming_enabled = streaming_enabled
-        self.meta_prompt = meta_prompt
-        self.system_prompt = system_prompt
-        if self.logging_enabled:
-            logging.basicConfig(level=logging.DEBUG)
-        else:
-            # Disable debug logs for requests and urllib3
-            logging.getLogger("requests").setLevel(logging.WARNING)
-            logging.getLogger("urllib3").setLevel(logging.WARNING)
-        if self.meta_prompt:
-            self.system_prompt = self.meta_prompt_init()
     def encode_image(self, img: str):
         """Encode image to base64."""
-        if not os.path.exists(img):
-            print(f"Image file not found: {img}")
-            return None
         with open(img, "rb") as image_file:
             return base64.b64encode(image_file.read()).decode("utf-8")
-    def download_img_then_encode(self, img: str):
-        """Download image from URL then encode image to base64 using requests"""
-        if not os.path.exists(img):
-            print(f"Image file not found: {img}")
-            return None
-        response = requests.get(img)
-        return base64.b64encode(response.content).decode("utf-8")
     # Function to handle vision tasks
-    def run(self, task: str = None, img: str = None, *args, **kwargs):
+    def run(self, task: str, img: str):
         """Run the model."""
try:
base64_image = self.encode_image(img)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.openai_api_key}",
}
payload = {
"model": self.model_name,
"messages": [
{
"role": "system",
"content": [self.system_prompt],
},
{
"role": "user",
"content": [
{"type": "text", "text": task},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
},
},
],
},
],
"max_tokens": self.max_tokens,
}
response = requests.post(
self.openai_proxy, headers=headers, json=payload
)
out = response.json()
if "choices" in out and out["choices"]:
content = (
out["choices"][0]
.get("message", {})
.get("content", None)
)
if self.streaming_enabled:
content = self.stream_response(content)
return content
else:
print("No valid response in 'choices'")
return None
except Exception as error:
print(
f"Error with the request: {error}, make sure you"
" double check input types and positions"
)
return None
def video_prompt(self, frames):
"""
SystemPrompt is a class that generates a prompt for the user to respond to.
The prompt is generated based on the current state of the system.
Parameters
----------
frames : list
A list of base64 frames
Returns
-------
PROMPT : str
The system prompt
Examples
--------
>>> from swarms.models import GPT4VisionAPI
>>> llm = GPT4VisionAPI()
>>> video = "video.mp4"
>>> base64_frames = llm.process_video(video)
>>> prompt = llm.video_prompt(base64_frames)
>>> print(prompt)
"""
PROMPT = f"""
These are frames from a video that I want to upload. Generate a compelling description that I can upload along with the video:
{frames}
"""
return PROMPT
def stream_response(self, content: str):
"""Stream the response of the output
Args:
content (str): _description_
"""
for chunk in content:
print(chunk)
def process_video(self, video: str = None):
"""
Process a video into a list of base64 frames
Parameters
----------
video : str
The path to the video file
Returns
-------
base64_frames : list
A list of base64 frames
Examples
--------
>>> from swarms.models import GPT4VisionAPI
>>> llm = GPT4VisionAPI()
>>> video = "video.mp4"
>>> base64_frames = llm.process_video(video)
"""
video = cv2.VideoCapture(video)
base64_frames = []
while video.isOpened():
success, frame = video.read()
if not success:
break
_, buffer = cv2.imencode(".jpg", frame)
base64_frames.append(
base64.b64encode(buffer).decode("utf-8")
)
video.release()
print(len(base64_frames), "frames read.")
return base64_frames
def run_with_video(
self,
task: str = None,
video: str = None,
*args,
**kwargs,
):
prompt = self.video_prompt(self.process_video(video))
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {openai_api_key}",
}
payload = {
"model": self.model_name,
"messages": [
{
"role": "system",
"content": [self.system_prompt],
},
{
"role": "user",
"content": [
(task,), # task
*map(
lambda x: {"image": x, "resize": 768},
prompt[0::50],
),
],
},
],
"max_tokens": self.max_tokens,
}
response = requests.post(
self.openai_proxy,
headers=headers,
json=payload,
)
out = response.json()
content = out["choices"][0]["message"]["content"]
if self.streaming_enabled:
content = self.stream_response(content)
else:
pass
if self.beautify:
content = colored(content, "cyan")
print(content)
else:
print(content)
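    # Illustrative usage note (not part of the file above): the intended video flow is to
    # sample the base64 frames returned by process_video and attach them to one request, e.g.
    #
    #     llm = GPT4VisionAPI()
    #     frames = llm.process_video("video.mp4")  # requires OpenCV (cv2)
    #     llm.run_with_video(task="Describe this clip", video="video.mp4")
    #
    # Note that run_with_video slices prompt[0::50], i.e. every 50th character of the prompt
    # string; sampling the frame list instead (frames[0::50]) is presumably what was intended.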
def __call__(
self,
task: Optional[str] = None,
img: Optional[str] = None,
*args,
**kwargs,
):
"""Call the model
Args:
task (Optional[str], optional): _description_. Defaults to None.
img (Optional[str], optional): _description_. Defaults to None.
Raises:
error: _description_
"""
         try:
             base64_image = self.encode_image(img)
             headers = {
@@ -330,12 +59,8 @@ class GPT4VisionAPI(BaseMultiModalModel):
                 "Authorization": f"Bearer {openai_api_key}",
             }
             payload = {
-                "model": self.model_name,
+                "model": "gpt-4-vision-preview",
                 "messages": [
-                    {
-                        "role": "system",
-                        "content": [self.system_prompt],
-                    },
                     {
                         "role": "user",
                         "content": [
@@ -347,55 +72,26 @@ class GPT4VisionAPI(BaseMultiModalModel):
                             },
                         },
                     ],
-                },
+                }
                 ],
-                "max_tokens": self.max_tokens,
+                "max_tokens": 300,
             }
             response = requests.post(
-                self.openai_proxy,
+                "https://api.openai.com/v1/chat/completions",
                 headers=headers,
                 json=payload,
             )
             out = response.json()
-            content = out["choices"][0]["message"]["content"]
-            if self.streaming_enabled:
-                content = self.stream_response(content)
-            else:
-                pass
-            if self.beautify:
-                content = colored(content, "cyan")
-                print(content)
-            else:
-                print(content)
-            out = out["choices"][0]["text"]
         except Exception as error:
             print(f"Error with the request: {error}")
             raise error
-    # Function to handle vision tasks
-    async def arun(
-        self,
-        task: Optional[str] = None,
-        img: Optional[str] = None,
-    ):
-        """
-        Asynchronously run the model
-        Overview:
-        ---------
-        This method is used to asynchronously run the model. It is used to run the model
-        on a single task and image.
-        Parameters:
-        ----------
-        task : str
-            The task to run the model on.
-        img : str
-            The image to run the task on
-        """
+    def __call__(self, task: str, img: str):
+        """Run the model."""
         try:
             base64_image = self.encode_image(img)
             headers = {
@@ -418,57 +114,14 @@ class GPT4VisionAPI(BaseMultiModalModel):
                         ],
                     }
                 ],
-                "max_tokens": self.max_tokens,
+                "max_tokens": 300,
             }
-            async with aiohttp.ClientSession() as session:
-                async with session.post(
-                    self.openai_proxy,
-                    headers=headers,
-                    data=json.dumps(payload),
-                ) as response:
-                    out = await response.json()
-                    content = out["choices"][0]["message"]["content"]
-                    print(content)
+            response = requests.post(
+                "https://api.openai.com/v1/chat/completions",
+                headers=headers,
+                json=payload,
+            )
+            return response.json()
         except Exception as error:
-            print(f"Error with the request {error}")
+            print(f"Error with the request: {error}")
             raise error
def health_check(self):
"""Health check for the GPT4Vision model"""
try:
response = requests.get(
"https://api.openai.com/v1/engines"
)
return response.status_code == 200
except requests.RequestException as error:
print(f"Health check failed: {error}")
return False
def print_dashboard(self):
dashboard = print(
colored(
f"""
GPT4Vision Dashboard
-------------------
Model: {self.model_name}
Max Workers: {self.max_workers}
OpenAIProxy: {self.openai_proxy}
""",
"green",
)
)
return dashboard
# def meta_prompt_init(self):
# """Meta Prompt
# Returns:
# _type_: _description_
# """
# META_PROMPT = """
# For any labels or markings on an image that you reference in your response, please
# enclose them in square brackets ([]) and list them explicitly. Do not use ranges; for
# example, instead of '1 - 4', list as '[1], [2], [3], [4]'. These labels could be
# numbers or letters and typically correspond to specific segments or parts of the image.
# """
# return META_PROMPT
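
To make the HTTP call that the slimmed-down run()/__call__ above performs concrete, here is a standalone sketch of the same request: base64-encode a local image, embed it as a data URL in a chat-completions payload, and POST it. The endpoint, model name, payload shape, and max_tokens value come from the code above; the helper name ask_about_image and the bare-bones error handling are illustrative only, and OPENAI_API_KEY is assumed to be set in the environment.

import base64
import os

import requests


def ask_about_image(task: str, img_path: str) -> dict:
    # Read and base64-encode the local image, as encode_image() does
    with open(img_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
    }
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": task},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                    },
                ],
            }
        ],
        "max_tokens": 300,
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
    )
    return response.json()  # raw chat-completions JSON, as returned by the new __call__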

@@ -496,7 +496,7 @@ class Flow:
             )
             print(error)
-    def run(self, task: str, **kwargs):
+    def run(self, task: str, img: Optional[str], **kwargs):
         """
         Run the autonomous agent loop
@@ -550,10 +550,17 @@ class Flow:
         attempt = 0
         while attempt < self.retry_attempts:
             try:
-                response = self.llm(
-                    task,
-                    **kwargs,
-                )
+                if img:
+                    response = self.llm(
+                        task,
+                        img,
+                        **kwargs,
+                    )
+                else:
+                    response = self.llm(
+                        task,
+                        **kwargs,
+                    )
                 # If code interpreter is enabled then run the code
                 if self.code_interpreter:
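
The img handling added to Flow.run above reduces to forwarding the image to the model only when one was supplied. A minimal standalone equivalent is sketched below; the helper name call_llm is hypothetical, and it assumes the model callable accepts (task, img) the way GPT4VisionAPI.__call__ does.

from typing import Any, Callable, Optional


def call_llm(llm: Callable[..., Any], task: str, img: Optional[str] = None, **kwargs: Any) -> Any:
    # Pass the image through only when the caller provided one
    if img:
        return llm(task, img, **kwargs)
    return llm(task, **kwargs)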
