diff --git a/multi_modal_auto_agent.py b/multi_modal_auto_agent.py index f5462847..1500349c 100644 --- a/multi_modal_auto_agent.py +++ b/multi_modal_auto_agent.py @@ -11,7 +11,7 @@ img = "images/swarms.jpeg" flow = Flow( llm=llm, max_loops="auto", - dashboard=True, + ) flow.run(task=task, img=img) diff --git a/swarms/models/gpt4_vision_api.py b/swarms/models/gpt4_vision_api.py index 87c76200..87436c55 100644 --- a/swarms/models/gpt4_vision_api.py +++ b/swarms/models/gpt4_vision_api.py @@ -18,7 +18,10 @@ class GPT4VisionAPI: ---------- openai_api_key : str The OpenAI API key. Defaults to the OPENAI_API_KEY environment variable. + max_tokens : int + The maximum number of tokens to generate. Defaults to 300. + Methods ------- encode_image(img: str) @@ -39,9 +42,10 @@ class GPT4VisionAPI: """ - def __init__(self, openai_api_key: str = openai_api_key): + def __init__(self, openai_api_key: str = openai_api_key, max_tokens: int = 300): super().__init__() self.openai_api_key = openai_api_key + self.max_tokens = max_tokens def encode_image(self, img: str): """Encode image to base64.""" @@ -75,7 +79,7 @@ class GPT4VisionAPI: ], } ], - "max_tokens": 300, + "max_tokens": self.max_tokens, } response = requests.post( "https://api.openai.com/v1/chat/completions", @@ -84,8 +88,8 @@ class GPT4VisionAPI: ) out = response.json() - - out = out["choices"][0]["text"] + content = out["choices"][0]["message"]["content"] + return content except Exception as error: print(f"Error with the request: {error}") raise error @@ -117,14 +121,18 @@ class GPT4VisionAPI: ], } ], - "max_tokens": 300, + "max_tokens": self.max_tokens, } response = requests.post( "https://api.openai.com/v1/chat/completions", headers=headers, json=payload, ) - return response.json() + + out = response.json() + content = out["choices"][0]["message"]["content"] + return content except Exception as error: print(f"Error with the request: {error}") raise error + # Function to handle vision tasks \ No newline at end of file diff 
--git a/tests/models/test_gpt4_vision_api.py b/tests/models/test_gpt4_vision_api.py index a40d6f63..df2379a8 100644 --- a/tests/models/test_gpt4_vision_api.py +++ b/tests/models/test_gpt4_vision_api.py @@ -25,7 +25,7 @@ def test_encode_image(vision_api): with patch( "builtins.open", mock_open(read_data=b"test_image_data"), create=True ): - encoded_image = vision_api.encode_image("test_image.jpg") + encoded_image = vision_api.encode_image(img) assert encoded_image == "dGVzdF9pbWFnZV9kYXRh" @@ -34,7 +34,7 @@ def test_run_success(vision_api): with patch( "requests.post", return_value=Mock(json=lambda: expected_response) ) as mock_post: - result = vision_api.run("What is this?", "test_image.jpg") + result = vision_api.run("What is this?", img) mock_post.assert_called_once() assert result == "This is the model's response." @@ -44,7 +44,7 @@ def test_run_request_error(vision_api): "requests.post", side_effect=RequestException("Request Error") ) as mock_post: with pytest.raises(RequestException): - vision_api.run("What is this?", "test_image.jpg") + vision_api.run("What is this?", img) def test_run_response_error(vision_api): @@ -53,7 +53,7 @@ def test_run_response_error(vision_api): "requests.post", return_value=Mock(json=lambda: expected_response) ) as mock_post: with pytest.raises(RuntimeError): - vision_api.run("What is this?", "test_image.jpg") + vision_api.run("What is this?", img) def test_call(vision_api): @@ -61,7 +61,7 @@ def test_call(vision_api): with patch( "requests.post", return_value=Mock(json=lambda: expected_response) ) as mock_post: - result = vision_api("What is this?", "test_image.jpg") + result = vision_api("What is this?", img) mock_post.assert_called_once() assert result == "This is the model's response."