parent
399099ef67
commit
9c3a292938
@ -0,0 +1,17 @@
|
|||||||
|
# Example: run a single vision question through a Flow backed by GPT-4 Vision.
from swarms.structs import Flow
from swarms.models.gpt4_vision_api import GPT4VisionAPI

# Vision-capable model that will answer the question about the image.
llm = GPT4VisionAPI()

# The question to ask, and the local image to ask it about.
task = "What is the color of the object?"
img = "images/swarms.jpeg"

## Initialize the workflow
flow = Flow(
    llm=llm,
    max_loops='auto',  # let the Flow decide how many reasoning loops to run
    dashboard=True,    # display the live dashboard while the flow runs
)

flow.run(task=task, img=img)
|
@ -1,33 +0,0 @@
|
|||||||
# Multi Modality Auto Agent: ask the Idefics vision-language model about
# an image referenced by URL, driven through a Flow.
from swarms.structs import Flow
from swarms.models import Idefics

# Idefics model, capped at 2000 generated tokens.
llm = Idefics(max_length=2000)

# Prompt pairing the user question with the image URL to analyse.
task = (
    "User: What is in this image?"
    " https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"
)

## Initialize the workflow
flow = Flow(
    llm=llm,
    max_loops=2,
    dashboard=True,
    # stopping_condition=None, # You can define a stopping condition as needed.
    # loop_interval=1,
    # retry_attempts=3,
    # retry_interval=1,
    # interactive=False, # Set to 'True' for interactive mode.
    # dynamic_temperature=False, # Set to 'True' for dynamic temperature handling.
)

# out = flow.load_state("flow_state.json")
# temp = flow.dynamic_temperature()
# filter = flow.add_response_filter("Trump")
out = flow.run(task)
# out = flow.validate_response(out)
# out = flow.analyze_feedback(out)
# out = flow.print_history_and_memory()
# # out = flow.save_state("flow_state.json")
# print(out)
|
|
@ -0,0 +1,127 @@
|
|||||||
|
import base64
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
# Load environment variables
|
||||||
|
load_dotenv()
|
||||||
|
openai_api_key = os.getenv("OPENAI_API_KEY")
|
||||||
|
|
||||||
|
class GPT4VisionAPI:
    """
    GPT-4 Vision API

    This class is a wrapper for the OpenAI API. It is used to run the
    GPT-4 Vision model.

    Parameters
    ----------
    openai_api_key : str, optional
        The OpenAI API key. When omitted, the OPENAI_API_KEY environment
        variable is read at instantiation time.
    max_tokens : int, optional
        Upper bound on the number of tokens the model may generate per
        request. Defaults to 300 (the value previously hard-coded).

    Methods
    -------
    encode_image(img: str)
        Encode image to base64.
    run(task: str, img: str)
        Run the model and return the generated text.
    __call__(task: str, img: str)
        Alias for run().

    Examples:
    ---------
    >>> from swarms.models import GPT4VisionAPI
    >>> llm = GPT4VisionAPI()
    >>> task = "What is the color of the object?"
    >>> img = "https://i.imgur.com/2M2ZGwC.jpeg"
    >>> llm.run(task, img)
    """

    # OpenAI chat-completions endpoint used for every vision request.
    API_URL = "https://api.openai.com/v1/chat/completions"

    def __init__(
        self,
        openai_api_key: str = None,
        max_tokens: int = 300,
    ):
        super().__init__()
        # Resolve the key here so the environment is consulted at
        # instantiation time, not at module-import time, and so an
        # explicitly passed key is actually used by the requests below
        # (the original built headers from a module-level global).
        self.openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
        self.max_tokens = max_tokens

    def encode_image(self, img: str):
        """Encode the image file at path *img* to a base64 string."""
        with open(img, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def _request_parts(self, task: str, img: str):
        """Build (headers, payload) for one vision request.

        Shared by run() and __call__ so the request format is defined in
        exactly one place (the original duplicated ~40 lines).
        """
        base64_image = self.encode_image(img)
        headers = {
            "Content-Type": "application/json",
            # Use the instance's key, not a module-level global.
            "Authorization": f"Bearer {self.openai_api_key}",
        }
        payload = {
            "model": "gpt-4-vision-preview",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": task},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            },
                        },
                    ],
                }
            ],
            "max_tokens": self.max_tokens,
        }
        return headers, payload

    # Function to handle vision tasks
    def run(self, task: str, img: str):
        """Run the model on (task, img) and return the generated text.

        Raises
        ------
        Exception
            Re-raises whatever the image encoding or HTTP request raised,
            after printing it.
        """
        try:
            headers, payload = self._request_parts(task, img)
            response = requests.post(
                self.API_URL,
                headers=headers,
                json=payload,
            )

            out = response.json()

            # Chat-completions responses carry the generated text under
            # choices[0]["message"]["content"]; the original read a
            # non-existent ["text"] key and never returned the result.
            return out["choices"][0]["message"]["content"]
        except Exception as error:
            print(f"Error with the request: {error}")
            raise error

    def __call__(self, task: str, img: str):
        """Run the model (alias for run())."""
        # Delegate so both entry points behave identically; the original
        # duplicated the request code here and returned the raw JSON while
        # run() returned nothing.
        return self.run(task, img)
|
Loading…
Reference in new issue