gpt4vision api

pull/307/head
Kye 2 years ago committed by Zack
parent 80f288c832
commit 13c54d0b00

@@ -1,34 +1,17 @@
-# Description: This is an example of how to use the Agent class to run a multi-modal workflow
-import os
-from dotenv import load_dotenv
+from swarms.structs import Flow
 from swarms.models.gpt4_vision_api import GPT4VisionAPI
-from swarms.structs import Agent
-# Load the environment variables
-load_dotenv()
-# Get the API key from the environment
-api_key = os.environ.get("OPENAI_API_KEY")
-# Initialize the language model
-llm = GPT4VisionAPI(
-    openai_api_key=api_key,
-    max_tokens=500,
-)
-# Initialize the language model
+llm = GPT4VisionAPI()
 task = "What is the color of the object?"
 img = "images/swarms.jpeg"
 ## Initialize the workflow
-agent = Agent(
+flow = Flow(
     llm=llm,
-    max_loops="auto",
-    autosave=True,
+    max_loops='auto',
     dashboard=True,
-    multi_modal=True,
 )
-# Run the workflow on a task
-out = agent.run(task=task, img=img)
-print(out)
+flow.run(task=task, img=img)
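
For reference, the fuller variant shown on the removed side of this hunk condenses into one runnable sketch. This is illustrative only: it assumes the swarms version in use exposes Agent and GPT4VisionAPI with these keyword arguments, that OPENAI_API_KEY is available via a .env file, and that images/swarms.jpeg exists locally.

import os
from dotenv import load_dotenv
from swarms.models.gpt4_vision_api import GPT4VisionAPI
from swarms.structs import Agent

# Load OPENAI_API_KEY from a local .env file
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# Vision-capable model wrapper, configured as in the removed example
llm = GPT4VisionAPI(openai_api_key=api_key, max_tokens=500)

# Multi-modal agent loop around the model
agent = Agent(llm=llm, max_loops="auto", dashboard=True, multi_modal=True)
out = agent.run(task="What is the color of the object?", img="images/swarms.jpeg")
print(out)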

@@ -1,22 +1,3 @@
-<<<<<<< HEAD
-from swarms.structs import Agent
-from swarms.models.gpt4_vision_api import GPT4VisionAPI
-llm = GPT4VisionAPI()
-task = "What is the color of the object?"
-img = "images/swarms.jpeg"
-## Initialize the workflow
-agent = Agent(
-    llm=llm,
-    max_loops="auto",
-    dashboard=True,
-)
-agent.run(task=task, img=img)
-=======
 from swarms.structs import Flow
 from swarms.models import Idefics
@@ -50,4 +31,3 @@ out = flow.run(task)
 # out = flow.print_history_and_memory()
 # # out = flow.save_state("flow_state.json")
 # print(out)
->>>>>>> fa52e094 (CLEAN UP: Flow and demo layouts)

@@ -8,14 +8,7 @@ from swarms.models.openai_models import (
     AzureOpenAI,
     OpenAIChat,
 )  # noqa: E402
-<<<<<<< HEAD
-# from swarms.models.vllm import vLLM  # noqa: E402
-# from swarms.models.zephyr import Zephyr  # noqa: E402
-=======
 from swarms.models.zephyr import Zephyr  # noqa: E402
->>>>>>> 49c7b97c (code quality fixes: line length = 80)
 from swarms.models.biogpt import BioGPT  # noqa: E402
 from swarms.models.huggingface import HuggingfaceLLM  # noqa: E402
 from swarms.models.wizard_storytelling import (
@@ -30,11 +23,9 @@ from swarms.models.base_multimodal_model import (
 from swarms.models.idefics import Idefics  # noqa: E402
 from swarms.models.vilt import Vilt  # noqa: E402
 from swarms.models.nougat import Nougat  # noqa: E402
-from swarms.models.layoutlm_document_qa import (
-    LayoutLMDocumentQA,
-)  # noqa: E402
+from swarms.models.layoutlm_document_qa import LayoutLMDocumentQA  # noqa: E402
 from swarms.models.gpt4_vision_api import GPT4VisionAPI  # noqa: E402
-from swarms.models.openai_tts import OpenAITTS  # noqa: E402
 # from swarms.models.gpt4v import GPT4Vision
 # from swarms.models.dalle3 import Dalle3
 # from swarms.models.distilled_whisperx import DistilWhisperModel  # noqa: E402
@@ -64,6 +55,5 @@ __all__ = [
     # "Dalle3",
     # "DistilWhisperModel",
     "GPT4VisionAPI",
-    # "vLLM",
-    "OpenAITTS",
 ]
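
After this cleanup, GPT4VisionAPI is re-exported from the package root, so downstream code can import it directly. A minimal sketch, assuming OPENAI_API_KEY is set in the environment and the image path exists:

from swarms.models import GPT4VisionAPI

llm = GPT4VisionAPI()  # constructor defaults to the module-level OPENAI_API_KEY lookup
out = llm("What is the color of the object?", "images/swarms.jpeg")  # __call__ returns the raw JSON response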

@@ -1,36 +1,13 @@
 import base64
-import json
-import logging
 import os
-from typing import Optional
-import aiohttp
 import requests
 from dotenv import load_dotenv
-from termcolor import colored
-from swarms.models.base_multimodal_model import BaseMultiModalModel
-try:
-    import cv2
-except ImportError:
-    print(
-        "OpenCV not installed. Please install OpenCV to use this"
-        " model."
-    )
-    raise ImportError
 # Load environment variables
 load_dotenv()
 openai_api_key = os.getenv("OPENAI_API_KEY")
-gpt4_vision_system_prompt = """
-You are an multi-modal autonomous agent. You are given a task and an image. You must generate a response to the task and image.
-"""
-class GPT4VisionAPI(BaseMultiModalModel):
+class GPT4VisionAPI:
     """
     GPT-4 Vision API
@@ -40,9 +17,6 @@ class GPT4VisionAPI(BaseMultiModalModel):
     ----------
     openai_api_key : str
         The OpenAI API key. Defaults to the OPENAI_API_KEY environment variable.
-    max_tokens : int
-        The maximum number of tokens to generate. Defaults to 300.
     Methods
     -------
@@ -63,266 +37,21 @@ class GPT4VisionAPI(BaseMultiModalModel):
     """
     def __init__(
         self,
-        openai_api_key: str = openai_api_key,
-        model_name: str = "gpt-4-vision-preview",
-        logging_enabled: bool = False,
-        max_workers: int = 10,
-        max_tokens: str = 300,
-        openai_proxy: str = "https://api.openai.com/v1/chat/completions",
-        beautify: bool = False,
-        streaming_enabled: Optional[bool] = False,
-        meta_prompt: Optional[bool] = False,
-        system_prompt: Optional[str] = gpt4_vision_system_prompt,
-        *args,
-        **kwargs,
+        openai_api_key: str = openai_api_key
     ):
-        super(GPT4VisionAPI).__init__(*args, **kwargs)
+        super().__init__()
         self.openai_api_key = openai_api_key
-        self.logging_enabled = logging_enabled
-        self.model_name = model_name
-        self.max_workers = max_workers
-        self.max_tokens = max_tokens
-        self.openai_proxy = openai_proxy
-        self.beautify = beautify
-        self.streaming_enabled = streaming_enabled
-        self.meta_prompt = meta_prompt
-        self.system_prompt = system_prompt
-        if self.logging_enabled:
-            logging.basicConfig(level=logging.DEBUG)
-        else:
-            # Disable debug logs for requests and urllib3
-            logging.getLogger("requests").setLevel(logging.WARNING)
-            logging.getLogger("urllib3").setLevel(logging.WARNING)
-        if self.meta_prompt:
-            self.system_prompt = self.meta_prompt_init()
     def encode_image(self, img: str):
         """Encode image to base64."""
-        if not os.path.exists(img):
-            print(f"Image file not found: {img}")
-            return None
         with open(img, "rb") as image_file:
             return base64.b64encode(image_file.read()).decode("utf-8")
-    def download_img_then_encode(self, img: str):
-        """Download image from URL then encode image to base64 using requests"""
-        if not os.path.exists(img):
-            print(f"Image file not found: {img}")
-            return None
-        response = requests.get(img)
-        return base64.b64encode(response.content).decode("utf-8")
     # Function to handle vision tasks
-    def run(self, task: str = None, img: str = None, *args, **kwargs):
+    def run(self, task: str, img: str):
         """Run the model."""
try:
base64_image = self.encode_image(img)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.openai_api_key}",
}
payload = {
"model": self.model_name,
"messages": [
{
"role": "system",
"content": [self.system_prompt],
},
{
"role": "user",
"content": [
{"type": "text", "text": task},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
},
},
],
},
],
"max_tokens": self.max_tokens,
}
response = requests.post(
self.openai_proxy, headers=headers, json=payload
)
out = response.json()
if "choices" in out and out["choices"]:
content = (
out["choices"][0]
.get("message", {})
.get("content", None)
)
if self.streaming_enabled:
content = self.stream_response(content)
return content
else:
print("No valid response in 'choices'")
return None
except Exception as error:
print(
f"Error with the request: {error}, make sure you"
" double check input types and positions"
)
return None
def video_prompt(self, frames):
"""
SystemPrompt is a class that generates a prompt for the user to respond to.
The prompt is generated based on the current state of the system.
Parameters
----------
frames : list
A list of base64 frames
Returns
-------
PROMPT : str
The system prompt
Examples
--------
>>> from swarms.models import GPT4VisionAPI
>>> llm = GPT4VisionAPI()
>>> video = "video.mp4"
>>> base64_frames = llm.process_video(video)
>>> prompt = llm.video_prompt(base64_frames)
>>> print(prompt)
"""
PROMPT = f"""
These are frames from a video that I want to upload. Generate a compelling description that I can upload along with the video:
{frames}
"""
return PROMPT
def stream_response(self, content: str):
"""Stream the response of the output
Args:
content (str): _description_
"""
for chunk in content:
print(chunk)
def process_video(self, video: str = None):
"""
Process a video into a list of base64 frames
Parameters
----------
video : str
The path to the video file
Returns
-------
base64_frames : list
A list of base64 frames
Examples
--------
>>> from swarms.models import GPT4VisionAPI
>>> llm = GPT4VisionAPI()
>>> video = "video.mp4"
>>> base64_frames = llm.process_video(video)
"""
video = cv2.VideoCapture(video)
base64_frames = []
while video.isOpened():
success, frame = video.read()
if not success:
break
_, buffer = cv2.imencode(".jpg", frame)
base64_frames.append(
base64.b64encode(buffer).decode("utf-8")
)
video.release()
print(len(base64_frames), "frames read.")
return base64_frames
def run_with_video(
self,
task: str = None,
video: str = None,
*args,
**kwargs,
):
prompt = self.video_prompt(self.process_video(video))
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {openai_api_key}",
}
payload = {
"model": self.model_name,
"messages": [
{
"role": "system",
"content": [self.system_prompt],
},
{
"role": "user",
"content": [
(task,), # task
*map(
lambda x: {"image": x, "resize": 768},
prompt[0::50],
),
],
},
],
"max_tokens": self.max_tokens,
}
response = requests.post(
self.openai_proxy,
headers=headers,
json=payload,
)
out = response.json()
content = out["choices"][0]["message"]["content"]
if self.streaming_enabled:
content = self.stream_response(content)
else:
pass
if self.beautify:
content = colored(content, "cyan")
print(content)
else:
print(content)
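    # Illustrative usage note (not part of the file above): the intended video flow is to
    # sample the base64 frames returned by process_video and attach them to one request, e.g.
    #
    #     llm = GPT4VisionAPI()
    #     frames = llm.process_video("video.mp4")  # requires OpenCV (cv2)
    #     llm.run_with_video(task="Describe this clip", video="video.mp4")
    #
    # Note that run_with_video slices prompt[0::50], i.e. every 50th character of the prompt
    # string; sampling the frame list instead (frames[0::50]) is presumably what was intended.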
def __call__(
self,
task: Optional[str] = None,
img: Optional[str] = None,
*args,
**kwargs,
):
"""Call the model
Args:
task (Optional[str], optional): _description_. Defaults to None.
img (Optional[str], optional): _description_. Defaults to None.
Raises:
error: _description_
"""
         try:
             base64_image = self.encode_image(img)
             headers = {
@@ -330,12 +59,8 @@ class GPT4VisionAPI(BaseMultiModalModel):
                 "Authorization": f"Bearer {openai_api_key}",
             }
             payload = {
-                "model": self.model_name,
+                "model": "gpt-4-vision-preview",
                 "messages": [
-                    {
-                        "role": "system",
-                        "content": [self.system_prompt],
-                    },
                     {
                         "role": "user",
                         "content": [
@@ -347,55 +72,26 @@ class GPT4VisionAPI(BaseMultiModalModel):
                             },
                         },
                     ],
-                },
+                }
                 ],
-                "max_tokens": self.max_tokens,
+                "max_tokens": 300,
             }
             response = requests.post(
-                self.openai_proxy,
+                "https://api.openai.com/v1/chat/completions",
                 headers=headers,
                 json=payload,
             )
             out = response.json()
-            content = out["choices"][0]["message"]["content"]
-            if self.streaming_enabled:
-                content = self.stream_response(content)
-            else:
-                pass
-            if self.beautify:
-                content = colored(content, "cyan")
-                print(content)
-            else:
-                print(content)
-            out = out["choices"][0]["text"]
         except Exception as error:
             print(f"Error with the request: {error}")
             raise error
-    # Function to handle vision tasks
-    async def arun(
-        self,
-        task: Optional[str] = None,
-        img: Optional[str] = None,
-    ):
-        """
-        Asynchronously run the model
-        Overview:
-        ---------
-        This method is used to asynchronously run the model. It is used to run the model
-        on a single task and image.
-        Parameters:
-        ----------
-        task : str
-            The task to run the model on.
-        img : str
-            The image to run the task on
-        """
+    def __call__(self, task: str, img: str):
+        """Run the model."""
         try:
             base64_image = self.encode_image(img)
             headers = {
@@ -418,57 +114,14 @@ class GPT4VisionAPI(BaseMultiModalModel):
                         ],
                     }
                 ],
-                "max_tokens": self.max_tokens,
+                "max_tokens": 300,
             }
-            async with aiohttp.ClientSession() as session:
-                async with session.post(
-                    self.openai_proxy,
-                    headers=headers,
-                    data=json.dumps(payload),
-                ) as response:
-                    out = await response.json()
-                    content = out["choices"][0]["message"]["content"]
-                    print(content)
+            response = requests.post(
+                "https://api.openai.com/v1/chat/completions",
+                headers=headers,
+                json=payload,
+            )
+            return response.json()
         except Exception as error:
-            print(f"Error with the request {error}")
+            print(f"Error with the request: {error}")
             raise error
def health_check(self):
"""Health check for the GPT4Vision model"""
try:
response = requests.get(
"https://api.openai.com/v1/engines"
)
return response.status_code == 200
except requests.RequestException as error:
print(f"Health check failed: {error}")
return False
def print_dashboard(self):
dashboard = print(
colored(
f"""
GPT4Vision Dashboard
-------------------
Model: {self.model_name}
Max Workers: {self.max_workers}
OpenAIProxy: {self.openai_proxy}
""",
"green",
)
)
return dashboard
# def meta_prompt_init(self):
# """Meta Prompt
# Returns:
# _type_: _description_
# """
# META_PROMPT = """
# For any labels or markings on an image that you reference in your response, please
# enclose them in square brackets ([]) and list them explicitly. Do not use ranges; for
# example, instead of '1 - 4', list as '[1], [2], [3], [4]'. These labels could be
# numbers or letters and typically correspond to specific segments or parts of the image.
# """
# return META_PROMPT
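
To make the HTTP call that the slimmed-down run()/__call__ above performs concrete, here is a standalone sketch of the same request: base64-encode a local image, embed it as a data URL in a chat-completions payload, and POST it. The endpoint, model name, payload shape, and max_tokens value come from the code above; the helper name ask_about_image and the bare-bones error handling are illustrative only, and OPENAI_API_KEY is assumed to be set in the environment.

import base64
import os

import requests


def ask_about_image(task: str, img_path: str) -> dict:
    # Read and base64-encode the local image, as encode_image() does
    with open(img_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
    }
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": task},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                    },
                ],
            }
        ],
        "max_tokens": 300,
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
    )
    return response.json()  # raw chat-completions JSON, as returned by the new __call__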

@@ -496,7 +496,7 @@ class Flow:
             )
             print(error)
-    def run(self, task: str, **kwargs):
+    def run(self, task: str, img: Optional[str], **kwargs):
         """
         Run the autonomous agent loop
@@ -550,10 +550,17 @@ class Flow:
         attempt = 0
         while attempt < self.retry_attempts:
             try:
-                response = self.llm(
-                    task,
-                    **kwargs,
-                )
+                if img:
+                    response = self.llm(
+                        task,
+                        img,
+                        **kwargs,
+                    )
+                else:
+                    response = self.llm(
+                        task,
+                        **kwargs,
+                    )
                 # If code interpreter is enabled then run the code
                 if self.code_interpreter:
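
The img handling added to Flow.run above reduces to forwarding the image to the model only when one was supplied. A minimal standalone equivalent is sketched below; the helper name call_llm is hypothetical, and it assumes the model callable accepts (task, img) the way GPT4VisionAPI.__call__ does.

from typing import Any, Callable, Optional


def call_llm(llm: Callable[..., Any], task: str, img: Optional[str] = None, **kwargs: Any) -> Any:
    # Pass the image through only when the caller provided one
    if img:
        return llm(task, img, **kwargs)
    return llm(task, **kwargs)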
