From 9a82be6943ddfc973fdb3392dda0996e0d9554ef Mon Sep 17 00:00:00 2001
From: Kye
Date: Tue, 28 Nov 2023 14:27:09 -0800
Subject: [PATCH] [BUG][OpenAIChat model name][GPT4VisionAPI][system prompt]

---
 example.py                       |  1 +
 multi_modal_auto_agent.py        | 16 +++++++++--
 swarms/models/gpt4_vision_api.py | 47 ++++++++++++++++++++------------
 swarms/models/openai_models.py   | 18 ++++++++++--
 4 files changed, 58 insertions(+), 24 deletions(-)

diff --git a/example.py b/example.py
index 7d6c5e03..bff19021 100644
--- a/example.py
+++ b/example.py
@@ -24,3 +24,4 @@ agent = Agent(llm=llm, max_loops=1, dashboard=True)
 
 # Run the workflow on a task
 out = agent.run("Generate a 10,000 word blog on health and wellness.")
+print(out)
\ No newline at end of file
diff --git a/multi_modal_auto_agent.py b/multi_modal_auto_agent.py
index e51f4ff5..e0fd7f06 100644
--- a/multi_modal_auto_agent.py
+++ b/multi_modal_auto_agent.py
@@ -1,11 +1,20 @@
-from swarms.structs import Agent
+import os
+
+from dotenv import load_dotenv
+
 from swarms.models.gpt4_vision_api import GPT4VisionAPI
 from swarms.prompts.multi_modal_autonomous_instruction_prompt import (
     MULTI_MODAL_AUTO_AGENT_SYSTEM_PROMPT_1,
 )
+from swarms.structs import Agent
+
+load_dotenv()
+api_key = os.environ.get("OPENAI_API_KEY")
 
-llm = GPT4VisionAPI()
+llm = GPT4VisionAPI(
+    openai_api_key=api_key,
+)
 
 task = "What is the color of the object?"
 img = "images/swarms.jpeg"
 
@@ -19,4 +28,5 @@ agent = Agent(
     dashboard=True,
 )
 
-agent.run(task=task, img=img)
+out = agent.run(task=task, img=img)
+print(out)
diff --git a/swarms/models/gpt4_vision_api.py b/swarms/models/gpt4_vision_api.py
index 7af82e59..27d53312 100644
--- a/swarms/models/gpt4_vision_api.py
+++ b/swarms/models/gpt4_vision_api.py
@@ -23,6 +23,11 @@ load_dotenv()
 
 openai_api_key = os.getenv("OPENAI_API_KEY")
 
+gpt4_vision_system_prompt = """
+You are a multi-modal autonomous agent. You are given a task and an image. You must generate a response to the task and image.
+
+"""
+
 class GPT4VisionAPI:
     """
     GPT-4 Vision API
@@ -67,8 +72,8 @@ class GPT4VisionAPI:
         openai_proxy: str = "https://api.openai.com/v1/chat/completions",
         beautify: bool = False,
         streaming_enabled: Optional[bool] = False,
-        meta_prompt: Optional[bool] = None,
-        system_prompt: Optional[str] = None,
+        meta_prompt: Optional[bool] = False,
+        system_prompt: Optional[str] = gpt4_vision_system_prompt,
         *args,
         **kwargs,
     ):
@@ -119,7 +124,7 @@ class GPT4VisionAPI:
             "Authorization": f"Bearer {openai_api_key}",
         }
         payload = {
-            "model": "gpt-4-vision-preview",
+            "model": self.model_name,
             "messages": [
                 {"role": "system", "content": [self.system_prompt]},
                 {
@@ -243,7 +248,13 @@ class GPT4VisionAPI:
         for img in base64_frames:
             base64.b64decode(img.encode("utf-8"))
 
-    def __call__(self, task: str, img: str):
+    def __call__(
+        self,
+        task: Optional[str] = None,
+        img: Optional[str] = None,
+        *args,
+        **kwargs,
+    ):
         """Run the model."""
         try:
             base64_image = self.encode_image(img)
@@ -252,7 +263,7 @@ class GPT4VisionAPI:
                 "Authorization": f"Bearer {openai_api_key}",
             }
             payload = {
-                "model": "gpt-4-vision-preview",
+                "model": self.model_name,
                 "messages": [
                     {"role": "system", "content": [self.system_prompt]},
                     {
@@ -437,16 +448,16 @@ class GPT4VisionAPI:
             )
             return dashboard
 
-    def meta_prompt_init(self):
-        """Meta Prompt
-
-        Returns:
-            _type_: _description_
-        """
-        META_PROMPT = """
-        For any labels or markings on an image that you reference in your response, please
-        enclose them in square brackets ([]) and list them explicitly. Do not use ranges; for
-        example, instead of '1 - 4', list as '[1], [2], [3], [4]'. These labels could be
-        numbers or letters and typically correspond to specific segments or parts of the image.
-        """
-        return META_PROMPT
+    # def meta_prompt_init(self):
+    #     """Meta Prompt
+
+    #     Returns:
+    #         _type_: _description_
+    #     """
+    #     META_PROMPT = """
+    #     For any labels or markings on an image that you reference in your response, please
+    #     enclose them in square brackets ([]) and list them explicitly. Do not use ranges; for
+    #     example, instead of '1 - 4', list as '[1], [2], [3], [4]'. These labels could be
+    #     numbers or letters and typically correspond to specific segments or parts of the image.
+    #     """
+    #     return META_PROMPT
diff --git a/swarms/models/openai_models.py b/swarms/models/openai_models.py
index 2fd86122..8d74ca2e 100644
--- a/swarms/models/openai_models.py
+++ b/swarms/models/openai_models.py
@@ -751,6 +751,21 @@ class OpenAIChat(BaseLLM):
     Any parameters that are valid to be passed to the openai.create call
     can be passed in, even if not explicitly saved on this class.
 
+    Args:
+
+        model_name: The model name to use.
+        model_kwargs: Any additional kwargs to pass to the model.
+        openai_api_key: The OpenAI API key to use.
+        openai_api_base: The OpenAI API base to use.
+        openai_proxy: The OpenAI proxy to use.
+        max_retries: The maximum number of retries to make when generating.
+        prefix_messages: The prefix messages to use.
+        streaming: Whether to stream the results or not.
+        allowed_special: Set of special tokens that are allowed.
+        disallowed_special: Set of special tokens that are not allowed.
+
+
     Example:
         .. code-block:: python
             from swarms.models import OpenAIChat
             openaichat = OpenAIChat(model_name="gpt-3.5-turbo")
@@ -761,12 +776,9 @@ class OpenAIChat(BaseLLM):
 
     client: Any #: :meta private:
     model_name: str = "gpt-3.5-turbo-1106"
-    """Model name to use."""
     model_kwargs: Dict[str, Any] = Field(default_factory=dict)
-    """Holds any model parameters valid for `create` call not explicitly specified."""
    openai_api_key: Optional[str] = None
     openai_api_base: Optional[str] = None
-    # to support explicit proxy for OpenAI
     openai_proxy: Optional[str] = None
     max_retries: int = 6
     """Maximum number of retries to make when generating."""
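
Usage after this patch: the request payload now reads the model from
self.model_name instead of the hardcoded "gpt-4-vision-preview" string, and
GPT4VisionAPI ships with a default system prompt. A minimal sketch of the
intended call pattern follows; it assumes model_name is accepted as a
constructor argument (the hunks above only show it being read as
self.model_name), and the model_name value shown is illustrative:

    import os

    from dotenv import load_dotenv

    from swarms.models.gpt4_vision_api import GPT4VisionAPI

    # Read OPENAI_API_KEY from .env, as in multi_modal_auto_agent.py above.
    load_dotenv()

    # system_prompt defaults to gpt4_vision_system_prompt; the model_name
    # argument here is an assumption for illustration, not part of this patch.
    llm = GPT4VisionAPI(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        model_name="gpt-4-vision-preview",
    )

    # __call__ now takes optional task/img keyword arguments.
    out = llm(task="What is the color of the object?", img="images/swarms.jpeg")
    print(out)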