gpt4vision api

pull/190/head
Kye 1 year ago
parent 399099ef67
commit 9c3a292938

@@ -0,0 +1,17 @@
from swarms.structs import Flow
from swarms.models.gpt4_vision_api import GPT4VisionAPI

llm = GPT4VisionAPI()

task = "What is the color of the object?"
img = "images/swarms.jpeg"

## Initialize the workflow
flow = Flow(
    llm=llm,
    max_loops='auto',
    dashboard=True,
)

flow.run(task=task, img=img)

@@ -1,33 +0,0 @@
from swarms.structs import Flow
from swarms.models import Idefics

# Multi Modality Auto Agent
llm = Idefics(max_length=2000)

task = (
    "User: What is in this image?"
    " https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"
)

## Initialize the workflow
flow = Flow(
    llm=llm,
    max_loops=2,
    dashboard=True,
    # stopping_condition=None, # You can define a stopping condition as needed.
    # loop_interval=1,
    # retry_attempts=3,
    # retry_interval=1,
    # interactive=False, # Set to 'True' for interactive mode.
    # dynamic_temperature=False, # Set to 'True' for dynamic temperature handling.
)

# out = flow.load_state("flow_state.json")
# temp = flow.dynamic_temperature()
# filter = flow.add_response_filter("Trump")
out = flow.run(task)
# out = flow.validate_response(out)
# out = flow.analyze_feedback(out)
# out = flow.print_history_and_memory()
# # out = flow.save_state("flow_state.json")
# print(out)

@@ -25,6 +25,7 @@ from swarms.models.idefics import Idefics # noqa: E402
from swarms.models.vilt import Vilt # noqa: E402
from swarms.models.nougat import Nougat # noqa: E402
from swarms.models.layoutlm_document_qa import LayoutLMDocumentQA # noqa: E402
from swarms.models.gpt4_vision_api import GPT4VisionAPI # noqa: E402
# from swarms.models.gpt4v import GPT4Vision
# from swarms.models.dalle3 import Dalle3
@@ -49,4 +50,7 @@ __all__ = [
    "WizardLLMStoryTeller",
    # "GPT4Vision",
    # "Dalle3",
    # "DistilWhisperModel",
    "GPT4VisionAPI",
]
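With the export above in place, the wrapper is importable from the package root. A quick smoke test (assumes the package is installed and OPENAI_API_KEY is set; nothing is sent to the API at construction time):

from swarms.models import GPT4VisionAPI

llm = GPT4VisionAPI()  # picks up OPENAI_API_KEY loaded via dotenv / the environment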

@@ -0,0 +1,127 @@
import base64
import os

import requests
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")


class GPT4VisionAPI:
    """
    GPT-4 Vision API

    This class is a wrapper for the OpenAI API. It is used to run the GPT-4 Vision model.

    Parameters
    ----------
    openai_api_key : str
        The OpenAI API key. Defaults to the OPENAI_API_KEY environment variable.

    Methods
    -------
    encode_image(img: str)
        Encode image to base64.
    run(task: str, img: str)
        Run the model.
    __call__(task: str, img: str)
        Run the model.

    Examples:
    ---------
    >>> from swarms.models import GPT4VisionAPI
    >>> llm = GPT4VisionAPI()
    >>> task = "What is the color of the object?"
    >>> img = "https://i.imgur.com/2M2ZGwC.jpeg"
    >>> llm.run(task, img)
    """

    def __init__(self, openai_api_key: str = openai_api_key):
        super().__init__()
        self.openai_api_key = openai_api_key

    def encode_image(self, img: str):
        """Encode image to base64."""
        with open(img, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")
    # Function to handle vision tasks
    def run(self, task: str, img: str):
        """Run the model and return the text of the first completion choice."""
        try:
            base64_image = self.encode_image(img)
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.openai_api_key}",
            }
            payload = {
                "model": "gpt-4-vision-preview",
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": task},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}"
                                },
                            },
                        ],
                    }
                ],
                "max_tokens": 300,
            }
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=payload,
            )
            out = response.json()
            # The chat completions API nests the reply under message.content
            out = out["choices"][0]["message"]["content"]
            return out
        except Exception as error:
            print(f"Error with the request: {error}")
            raise error
    # Function to handle vision tasks
    def __call__(self, task: str, img: str):
        """Run the model and return the raw JSON response."""
        try:
            base64_image = self.encode_image(img)
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.openai_api_key}",
            }
            payload = {
                "model": "gpt-4-vision-preview",
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": task},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}"
                                },
                            },
                        ],
                    }
                ],
                "max_tokens": 300,
            }
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=payload,
            )
            return response.json()
        except Exception as error:
            print(f"Error with the request: {error}")
            raise error
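For reference, a minimal sketch of using the new wrapper directly, outside of a Flow. It assumes OPENAI_API_KEY is exported and that a JPEG exists at the local path shown; the variable names are illustrative only.

from swarms.models.gpt4_vision_api import GPT4VisionAPI

llm = GPT4VisionAPI()  # key is read from the environment by default
answer = llm.run(
    "What is the color of the object?",
    "images/swarms.jpeg",  # local file; it is base64-encoded and sent as a data URL
)
print(answer)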

@@ -496,7 +496,7 @@ class Flow:
)
print(error)
    def run(self, task: str, **kwargs):
    def run(self, task: str, img: Optional[str] = None, **kwargs):
        """
        Run the autonomous agent loop
@@ -550,10 +550,17 @@ class Flow:
        attempt = 0
        while attempt < self.retry_attempts:
            try:
                response = self.llm(
                    task,
                    **kwargs,
                )
                if img:
                    response = self.llm(
                        task,
                        img,
                        **kwargs,
                    )
                else:
                    response = self.llm(
                        task,
                        **kwargs,
                    )

                # If code interpreter is enabled then run the code
                if self.code_interpreter:

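For context, a short sketch of how the updated signature is meant to be called. The Flow construction mirrors the example at the top of this commit, and img stays optional, so existing text-only callers are unchanged; prompts and paths are placeholders.

from swarms.structs import Flow
from swarms.models.gpt4_vision_api import GPT4VisionAPI

flow = Flow(llm=GPT4VisionAPI(), max_loops="auto", dashboard=True)

# Text-only call: img defaults to None, behavior is the same as before
out = flow.run("Describe the scene in one sentence")

# Vision call: the image path is forwarded to the underlying multimodal LLM
out = flow.run("What is the color of the object?", img="images/swarms.jpeg")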