@@ -23,6 +23,11 @@ load_dotenv()
 openai_api_key = os.getenv("OPENAI_API_KEY")
 
+gpt4_vision_system_prompt = """
+You are a multi-modal autonomous agent. You are given a task and an image. You must generate a response to the task and image.
+"""
+
+
 class GPT4VisionAPI:
     """
     GPT-4 Vision API
@@ -67,8 +72,8 @@ class GPT4VisionAPI:
         openai_proxy: str = "https://api.openai.com/v1/chat/completions",
         beautify: bool = False,
         streaming_enabled: Optional[bool] = False,
-        meta_prompt: Optional[bool] = None,
-        system_prompt: Optional[str] = None,
+        meta_prompt: Optional[bool] = False,
+        system_prompt: Optional[str] = gpt4_vision_system_prompt,
         *args,
         **kwargs,
     ):
@@ -119,7 +124,7 @@ class GPT4VisionAPI:
             "Authorization": f"Bearer {openai_api_key}",
         }
         payload = {
-            "model": "gpt-4-vision-preview",
+            "model": self.model_name,
             "messages": [
                 {"role": "system", "content": [self.system_prompt]},
                 {
@@ -243,7 +248,13 @@ class GPT4VisionAPI:
         for img in base64_frames:
             base64.b64decode(img.encode("utf-8"))
 
-    def __call__(self, task: str, img: str):
+    def __call__(
+        self,
+        task: Optional[str] = None,
+        img: Optional[str] = None,
+        *args,
+        **kwargs,
+    ):
         """Run the model."""
         try:
             base64_image = self.encode_image(img)
@@ -252,7 +263,7 @@ class GPT4VisionAPI:
             "Authorization": f"Bearer {openai_api_key}",
         }
         payload = {
-            "model": "gpt-4-vision-preview",
+            "model": self.model_name,
             "messages": [
                 {"role": "system", "content": [self.system_prompt]},
                 {
@@ -437,16 +448,16 @@ class GPT4VisionAPI:
         )
         return dashboard
 
-    def meta_prompt_init(self):
-        """Meta Prompt
-
-        Returns:
-            _type_: _description_
-        """
-        META_PROMPT = """
-        For any labels or markings on an image that you reference in your response, please
-        enclose them in square brackets ([]) and list them explicitly. Do not use ranges; for
-        example, instead of '1 - 4', list as '[1], [2], [3], [4]'. These labels could be
-        numbers or letters and typically correspond to specific segments or parts of the image.
-        """
-        return META_PROMPT
+    # def meta_prompt_init(self):
+    #     """Meta Prompt
+
+    #     Returns:
+    #         _type_: _description_
+    #     """
+    #     META_PROMPT = """
+    #     For any labels or markings on an image that you reference in your response, please
+    #     enclose them in square brackets ([]) and list them explicitly. Do not use ranges; for
+    #     example, instead of '1 - 4', list as '[1], [2], [3], [4]'. These labels could be
+    #     numbers or letters and typically correspond to specific segments or parts of the image.
+    #     """
+    #     return META_PROMPT
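
Taken together, the patch gives system_prompt a real default (gpt4_vision_system_prompt), flips meta_prompt to default False, sends self.model_name in the request payload instead of the hard-coded "gpt-4-vision-preview", and loosens __call__ to optional keyword arguments. A minimal usage sketch of the updated interface follows; the import path and the model_name constructor argument are assumptions for illustration (the payload references self.model_name, so a matching parameter is presumed), and it presumes OPENAI_API_KEY is set in the environment.

# Usage sketch, not part of the patch. Import path and model_name are assumed;
# OPENAI_API_KEY must be exported so load_dotenv()/os.getenv can pick it up.
from swarms.models.gpt4_vision_api import GPT4VisionAPI

llm = GPT4VisionAPI(
    model_name="gpt-4-vision-preview",
    # system_prompt is omitted: it now defaults to gpt4_vision_system_prompt
    # meta_prompt is omitted: it now defaults to False
)

# __call__ now accepts optional keyword arguments instead of a fixed (task, img) pair.
response = llm(
    task="List every labeled part visible in the image.",
    img="path/to/diagram.png",
)
print(response)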