"""Demo: ask a GPT-4 Vision backed Flow a question about a local image."""
from swarms.structs import Flow
from swarms.models.gpt4_vision_api import GPT4VisionAPI

# Vision-capable model wrapper, built with its default arguments.
llm = GPT4VisionAPI()

# The question for the agent and the image file it refers to.
task = "What is the color of the object?"
img = "images/swarms.jpeg"

# Wrap the model in a Flow; max_loops is set to "auto" and the dashboard
# is switched on.
flow = Flow(llm=llm, max_loops="auto", dashboard=True)

# Start the flow with the prompt and the image path.
flow.run(task=task, img=img)
class GPT4VisionAPI:
    """
    GPT-4 Vision API

    Thin wrapper around the OpenAI chat-completions endpoint. It sends one
    text prompt together with one image to the GPT-4 Vision model.

    Parameters
    ----------
    openai_api_key : str, optional
        The OpenAI API key. When omitted, the OPENAI_API_KEY environment
        variable is read at construction time.

    Methods
    -------
    encode_image(img: str)
        Encode a local image file to base64.
    run(task: str, img: str)
        Run the model and return the text of its reply.
    __call__(task: str, img: str)
        Run the model and return the full decoded JSON response.

    Examples:
    ---------
    >>> from swarms.models import GPT4VisionAPI
    >>> llm = GPT4VisionAPI()
    >>> task = "What is the color of the object?"
    >>> img = "https://i.imgur.com/2M2ZGwC.jpeg"
    >>> llm.run(task, img)


    """

    # Request constants shared by run() and __call__().
    API_URL = "https://api.openai.com/v1/chat/completions"
    MODEL_NAME = "gpt-4-vision-preview"
    MAX_TOKENS = 300

    def __init__(self, openai_api_key: str = None):
        super().__init__()
        # Resolve the key when the object is built rather than when the
        # module is imported, so a key exported later is still picked up.
        if openai_api_key is None:
            openai_api_key = os.getenv("OPENAI_API_KEY")
        self.openai_api_key = openai_api_key

    def encode_image(self, img: str):
        """Encode image to base64.

        Parameters
        ----------
        img : str
            Path of a local image file.

        Returns
        -------
        str
            The file's bytes, base64-encoded and decoded as UTF-8 text.

        Raises
        ------
        OSError
            If the file cannot be opened or read.
        """
        with open(img, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def _image_url(self, img: str) -> str:
        """Return the value for the ``image_url.url`` field of the request."""
        # Remote images are passed through untouched; the API fetches them
        # itself. Anything else is treated as a local file and inlined as
        # a base64 data URL.
        if img.startswith(("http://", "https://")):
            return img
        return f"data:image/jpeg;base64,{self.encode_image(img)}"

    def _build_headers(self) -> dict:
        """Return the HTTP headers, authenticating with this instance's key."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.openai_api_key}",
        }

    def _build_payload(self, task: str, img: str) -> dict:
        """Return the JSON body: one user message holding text and image."""
        return {
            "model": self.MODEL_NAME,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": task},
                        {
                            "type": "image_url",
                            "image_url": {"url": self._image_url(img)},
                        },
                    ],
                }
            ],
            "max_tokens": self.MAX_TOKENS,
        }

    def _request(self, task: str, img: str) -> dict:
        """POST the prompt and image, returning the decoded JSON response."""
        response = requests.post(
            self.API_URL,
            headers=self._build_headers(),
            json=self._build_payload(task, img),
        )
        return response.json()

    def run(self, task: str, img: str):
        """Run the model and return the text of its reply.

        Parameters
        ----------
        task : str
            The text prompt.
        img : str
            Path of a local image file, or an http(s) URL of an image.

        Returns
        -------
        str
            The content of the first choice's message.

        Raises
        ------
        Exception
            Any error raised while reading the image, sending the request
            or reading the reply (for example a response without
            ``choices``) is printed and re-raised unchanged.
        """
        try:
            out = self._request(task, img)
            # Chat-completions replies carry the text under
            # choices[i].message.content.
            return out["choices"][0]["message"]["content"]
        except Exception as error:
            print(f"Error with the request: {error}")
            raise

    def __call__(self, task: str, img: str):
        """Run the model and return the full decoded JSON response.

        Parameters
        ----------
        task : str
            The text prompt.
        img : str
            Path of a local image file, or an http(s) URL of an image.

        Returns
        -------
        dict
            The response body as decoded by ``response.json()``.

        Raises
        ------
        Exception
            Any error raised while reading the image or sending the
            request is printed and re-raised unchanged.
        """
        try:
            return self._request(task, img)
        except Exception as error:
            print(f"Error with the request: {error}")
            raise
99a3e587..2287273c 100644 --- a/swarms/structs/flow.py +++ b/swarms/structs/flow.py @@ -496,7 +496,7 @@ class Flow: ) print(error) - def run(self, task: str, **kwargs): + def run(self, task: str, img: Optional[str], **kwargs): """ Run the autonomous agent loop @@ -550,10 +550,17 @@ class Flow: attempt = 0 while attempt < self.retry_attempts: try: - response = self.llm( - task, - **kwargs, - ) + if img: + response = self.llm( + task, + img, + **kwargs, + ) + else: + response = self.llm( + task, + **kwargs, + ) # If code interpreter is enabled then run the code if self.code_interpreter: