From c657ed3aa4a90062e31033873853eb7655392823 Mon Sep 17 00:00:00 2001 From: harshalmore31 Date: Mon, 7 Jul 2025 20:42:47 +0530 Subject: [PATCH 1/5] Enhance vision processing by supporting direct URL usage and adding utility methods for processing info --- swarms/utils/litellm_wrapper.py | 269 ++++++++++++++++++++++---------- 1 file changed, 188 insertions(+), 81 deletions(-) diff --git a/swarms/utils/litellm_wrapper.py b/swarms/utils/litellm_wrapper.py index 063e6ce3..ed06fc1e 100644 --- a/swarms/utils/litellm_wrapper.py +++ b/swarms/utils/litellm_wrapper.py @@ -212,44 +212,62 @@ class LiteLLM: Process vision input specifically for Anthropic models. Handles Anthropic's specific image format requirements. """ - # Get base64 encoded image - image_url = get_image_base64(image) - - # Extract mime type from the data URI or use default - mime_type = "image/jpeg" # default - if "data:" in image_url and ";base64," in image_url: - mime_type = image_url.split(";base64,")[0].split("data:")[ - 1 - ] - - # Ensure mime type is one of the supported formats - supported_formats = [ - "image/jpeg", - "image/png", - "image/gif", - "image/webp", - ] - if mime_type not in supported_formats: - mime_type = ( - "image/jpeg" # fallback to jpeg if unsupported + # Check if we can use direct URL + if self._should_use_direct_url(image): + # Use direct URL without base64 conversion + messages.append( + { + "role": "user", + "content": [ + {"type": "text", "text": task}, + { + "type": "image_url", + "image_url": { + "url": image, + }, + }, + ], + } ) + else: + # Fall back to base64 conversion for local files + image_url = get_image_base64(image) + + # Extract mime type from the data URI or use default + mime_type = "image/jpeg" # default + if "data:" in image_url and ";base64," in image_url: + mime_type = image_url.split(";base64,")[0].split("data:")[ + 1 + ] + + # Ensure mime type is one of the supported formats + supported_formats = [ + "image/jpeg", + "image/png", + "image/gif", + "image/webp", + ] + if mime_type not in supported_formats: + mime_type = ( + "image/jpeg" # fallback to jpeg if unsupported + ) - # Construct Anthropic vision message - messages.append( - { - "role": "user", - "content": [ - {"type": "text", "text": task}, - { - "type": "image_url", - "image_url": { - "url": image_url, - "format": mime_type, + # Construct Anthropic vision message with base64 + messages.append( + { + "role": "user", + "content": [ + {"type": "text", "text": task}, + { + "type": "image_url", + "image_url": { + "url": image_url, + "format": mime_type, + }, }, - }, - ], - } - ) + ], + } + ) return messages @@ -260,21 +278,29 @@ class LiteLLM: Process vision input specifically for OpenAI models. Handles OpenAI's specific image format requirements. """ - # Get base64 encoded image with proper format - image_url = get_image_base64(image) + # Check if we can use direct URL + if self._should_use_direct_url(image): + # Use direct URL without base64 conversion + vision_message = { + "type": "image_url", + "image_url": {"url": image}, + } + else: + # Fall back to base64 conversion for local files + image_url = get_image_base64(image) - # Prepare vision message - vision_message = { - "type": "image_url", - "image_url": {"url": image_url}, - } + # Prepare vision message with base64 + vision_message = { + "type": "image_url", + "image_url": {"url": image_url}, + } - # Add format for specific models - extension = Path(image).suffix.lower() - mime_type = ( - f"image/{extension[1:]}" if extension else "image/jpeg" - ) - vision_message["image_url"]["format"] = mime_type + # Add format for specific models + extension = Path(image).suffix.lower() + mime_type = ( + f"image/{extension[1:]}" if extension else "image/jpeg" + ) + vision_message["image_url"]["format"] = mime_type # Append vision message messages.append( @@ -289,44 +315,86 @@ class LiteLLM: return messages + def _should_use_direct_url(self, image: str) -> bool: + """ + Determine if we should use direct URL passing instead of base64 conversion. + + Args: + image (str): The image source (URL or file path) + + Returns: + bool: True if we should use direct URL, False if we need base64 conversion + """ + # Only use direct URL for HTTP/HTTPS URLs + if not image.startswith(("http://", "https://")): + return False + + # Check if the model supports direct URL passing + # Most major providers (OpenAI, Anthropic, etc.) support direct URLs + model_lower = self.model_name.lower() + + # List of models/providers that support direct URL passing + url_supported_models = [ + "gpt-4", + "gpt-4o", + "gpt-4-vision", + "claude", + "anthropic", + "openai", + "gemini", + "vertex_ai", + ] + + # Check if any of the supported model patterns match + return any(pattern in model_lower for pattern in url_supported_models) + + def _is_local_model(self) -> bool: + """ + Check if the model is a local/custom model that might not support direct URLs. + + Returns: + bool: True if it's likely a local model + """ + model_lower = self.model_name.lower() + + # Indicators of local/custom models + local_indicators = [ + "localhost", + "127.0.0.1", + "local", + "custom", + "ollama", + "llama-cpp", + ] + + return any(indicator in model_lower for indicator in local_indicators) or \ + (self.base_url is not None and any(indicator in self.base_url.lower() for indicator in local_indicators)) + def vision_processing( self, task: str, image: str, messages: Optional[list] = None ): """ Process the image for the given task. Handles different image formats and model requirements. + + This method now intelligently chooses between: + 1. Direct URL passing (when model supports it and image is a URL) + 2. Base64 conversion (for local files or unsupported models) + + This approach reduces server load and improves performance by avoiding + unnecessary image downloads and base64 conversions when possible. """ - # # # Handle Anthropic models separately - # # if "anthropic" in self.model_name.lower() or "claude" in self.model_name.lower(): - # # messages = self.anthropic_vision_processing(task, image, messages) - # # return messages - - # # Get base64 encoded image with proper format - # image_url = get_image_base64(image) - - # # Prepare vision message - # vision_message = { - # "type": "image_url", - # "image_url": {"url": image_url}, - # } - - # # Add format for specific models - # extension = Path(image).suffix.lower() - # mime_type = f"image/{extension[1:]}" if extension else "image/jpeg" - # vision_message["image_url"]["format"] = mime_type - - # # Append vision message - # messages.append( - # { - # "role": "user", - # "content": [ - # {"type": "text", "text": task}, - # vision_message, - # ], - # } - # ) - - # return messages + logger.info(f"Processing image for model: {self.model_name}") + + # Log whether we're using direct URL or base64 conversion + if self._should_use_direct_url(image): + logger.info(f"Using direct URL passing for image: {image[:100]}...") + else: + if image.startswith(("http://", "https://")): + logger.info("Converting URL image to base64 (model doesn't support direct URLs)") + else: + logger.info("Converting local file to base64") + if ( "anthropic" in self.model_name.lower() or "claude" in self.model_name.lower() @@ -370,14 +438,25 @@ class LiteLLM: def check_if_model_supports_vision(self, img: str = None): """ - Check if the model supports vision. + Check if the model supports vision capabilities. + + This method uses LiteLLM's built-in supports_vision function to verify + that the model can handle image inputs before processing. + + Args: + img (str, optional): Image path/URL to validate against model capabilities + + Raises: + ValueError: If the model doesn't support vision and an image is provided """ if img is not None: out = supports_vision(model=self.model_name) if out is False: raise ValueError( - f"Model {self.model_name} does not support vision" + f"Model {self.model_name} does not support vision. " + f"Use a vision-capable model like gpt-4-vision-preview, " + f"claude-3-sonnet, or gemini-pro-vision." ) def run( @@ -619,3 +698,31 @@ class LiteLLM: f"Running {len(tasks)} tasks asynchronously in batches of {batch_size}" ) return await self._process_batch(tasks, batch_size) + + def get_vision_processing_info(self, image: str) -> dict: + """ + Get information about how the image will be processed for this model. + + This utility method helps users understand whether their image will be: + - Passed directly as URL (more efficient) + - Converted to base64 (fallback for unsupported models/local files) + + Args: + image (str): The image source (URL or file path) + + Returns: + dict: Information about the processing approach + """ + return { + "model_name": self.model_name, + "image_source": image, + "is_url": image.startswith(("http://", "https://")), + "is_local_file": not image.startswith(("http://", "https://", "data:")), + "will_use_direct_url": self._should_use_direct_url(image), + "supports_vision": supports_vision(model=self.model_name), + "processing_method": "direct_url" if self._should_use_direct_url(image) else "base64_conversion", + "benefits": { + "direct_url": "No server bandwidth/CPU usage for image processing", + "base64_conversion": "Works with local files and all model types" + } + } From 7ec57fe39e7c10bfaf9ef6aabbf938b6a64b3a79 Mon Sep 17 00:00:00 2001 From: harshalmore31 Date: Tue, 8 Jul 2025 21:31:52 +0530 Subject: [PATCH 2/5] Refactor vision processing methods to streamline direct URL handling and remove deprecated functionality --- swarms/utils/litellm_wrapper.py | 92 +++++++++------------------------ 1 file changed, 24 insertions(+), 68 deletions(-) diff --git a/swarms/utils/litellm_wrapper.py b/swarms/utils/litellm_wrapper.py index ed06fc1e..d9832edb 100644 --- a/swarms/utils/litellm_wrapper.py +++ b/swarms/utils/litellm_wrapper.py @@ -328,47 +328,22 @@ class LiteLLM: # Only use direct URL for HTTP/HTTPS URLs if not image.startswith(("http://", "https://")): return False - - # Check if the model supports direct URL passing - # Most major providers (OpenAI, Anthropic, etc.) support direct URLs - model_lower = self.model_name.lower() - - # List of models/providers that support direct URL passing - url_supported_models = [ - "gpt-4", - "gpt-4o", - "gpt-4-vision", - "claude", - "anthropic", - "openai", - "gemini", - "vertex_ai", - ] - - # Check if any of the supported model patterns match - return any(pattern in model_lower for pattern in url_supported_models) - - def _is_local_model(self) -> bool: - """ - Check if the model is a local/custom model that might not support direct URLs. - Returns: - bool: True if it's likely a local model - """ + # Check for local/custom models that might not support direct URLs model_lower = self.model_name.lower() + local_indicators = ["localhost", "127.0.0.1", "local", "custom", "ollama", "llama-cpp"] - # Indicators of local/custom models - local_indicators = [ - "localhost", - "127.0.0.1", - "local", - "custom", - "ollama", - "llama-cpp", - ] + is_local = any(indicator in model_lower for indicator in local_indicators) or \ + (self.base_url is not None and any(indicator in self.base_url.lower() for indicator in local_indicators)) - return any(indicator in model_lower for indicator in local_indicators) or \ - (self.base_url is not None and any(indicator in self.base_url.lower() for indicator in local_indicators)) + if is_local: + return False + + # Use LiteLLM's supports_vision to check if model supports vision and direct URLs + try: + return supports_vision(model=self.model_name) + except Exception: + return False def vision_processing( self, task: str, image: str, messages: Optional[list] = None @@ -571,12 +546,21 @@ class LiteLLM: """ return self.run(task, *args, **kwargs) - async def arun(self, task: str, *args, **kwargs): + async def arun( + self, + task: str, + audio: Optional[str] = None, + img: Optional[str] = None, + *args, + **kwargs + ): """ Run the LLM model asynchronously for the given task. Args: task (str): The task to run the model for. + audio (str, optional): Audio input if any. Defaults to None. + img (str, optional): Image input if any. Defaults to None. *args: Additional positional arguments. **kwargs: Additional keyword arguments. @@ -584,9 +568,9 @@ class LiteLLM: str: The content of the response from the model. """ try: - messages = self._prepare_messages(task) + messages = self._prepare_messages(task=task, img=img) - # Prepare common completion parameters + # Base completion parameters completion_params = { "model": self.model_name, "messages": messages, @@ -698,31 +682,3 @@ class LiteLLM: f"Running {len(tasks)} tasks asynchronously in batches of {batch_size}" ) return await self._process_batch(tasks, batch_size) - - def get_vision_processing_info(self, image: str) -> dict: - """ - Get information about how the image will be processed for this model. - - This utility method helps users understand whether their image will be: - - Passed directly as URL (more efficient) - - Converted to base64 (fallback for unsupported models/local files) - - Args: - image (str): The image source (URL or file path) - - Returns: - dict: Information about the processing approach - """ - return { - "model_name": self.model_name, - "image_source": image, - "is_url": image.startswith(("http://", "https://")), - "is_local_file": not image.startswith(("http://", "https://", "data:")), - "will_use_direct_url": self._should_use_direct_url(image), - "supports_vision": supports_vision(model=self.model_name), - "processing_method": "direct_url" if self._should_use_direct_url(image) else "base64_conversion", - "benefits": { - "direct_url": "No server bandwidth/CPU usage for image processing", - "base64_conversion": "Works with local files and all model types" - } - } From 81a60405273db865d7d1c5e3060bfc9878a50a5d Mon Sep 17 00:00:00 2001 From: harshalmore31 Date: Tue, 8 Jul 2025 21:37:31 +0530 Subject: [PATCH 3/5] fixes and clean up ! --- swarms/utils/litellm_wrapper.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/swarms/utils/litellm_wrapper.py b/swarms/utils/litellm_wrapper.py index d9832edb..3692cb65 100644 --- a/swarms/utils/litellm_wrapper.py +++ b/swarms/utils/litellm_wrapper.py @@ -429,9 +429,7 @@ class LiteLLM: if out is False: raise ValueError( - f"Model {self.model_name} does not support vision. " - f"Use a vision-capable model like gpt-4-vision-preview, " - f"claude-3-sonnet, or gemini-pro-vision." + f"Model {self.model_name} does not support vision" ) def run( @@ -570,7 +568,7 @@ class LiteLLM: try: messages = self._prepare_messages(task=task, img=img) - # Base completion parameters + # Prepare common completion parameters completion_params = { "model": self.model_name, "messages": messages, From 0105c958512eb8b2dfad619cf273f29d39a43cde Mon Sep 17 00:00:00 2001 From: harshalmore31 Date: Tue, 8 Jul 2025 21:39:14 +0530 Subject: [PATCH 4/5] cleanup ! --- swarms/utils/litellm_wrapper.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/swarms/utils/litellm_wrapper.py b/swarms/utils/litellm_wrapper.py index 3692cb65..52550800 100644 --- a/swarms/utils/litellm_wrapper.py +++ b/swarms/utils/litellm_wrapper.py @@ -544,21 +544,12 @@ class LiteLLM: """ return self.run(task, *args, **kwargs) - async def arun( - self, - task: str, - audio: Optional[str] = None, - img: Optional[str] = None, - *args, - **kwargs - ): + async def arun(self, task: str, *args, **kwargs): """ Run the LLM model asynchronously for the given task. Args: task (str): The task to run the model for. - audio (str, optional): Audio input if any. Defaults to None. - img (str, optional): Image input if any. Defaults to None. *args: Additional positional arguments. **kwargs: Additional keyword arguments. @@ -566,7 +557,7 @@ class LiteLLM: str: The content of the response from the model. """ try: - messages = self._prepare_messages(task=task, img=img) + messages = self._prepare_messages(task) # Prepare common completion parameters completion_params = { From 3d89664193f61bbe31504c85418a749851a68ed6 Mon Sep 17 00:00:00 2001 From: harshalmore31 Date: Tue, 8 Jul 2025 22:58:16 +0530 Subject: [PATCH 5/5] Add vision support tests and enhance URL processing validation --- tests/utils/test_litellm_wrapper.py | 113 ++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/tests/utils/test_litellm_wrapper.py b/tests/utils/test_litellm_wrapper.py index 02e79c9f..3a657bae 100644 --- a/tests/utils/test_litellm_wrapper.py +++ b/tests/utils/test_litellm_wrapper.py @@ -201,6 +201,119 @@ def run_test_suite(): except Exception as e: log_test_result("Batched Run", False, str(e)) + # Test 8: Vision Support Check + try: + logger.info("Testing vision support check") + llm = LiteLLM(model_name="gpt-4o") + # This should not raise an error for vision-capable models + llm.check_if_model_supports_vision(img="test.jpg") + log_test_result("Vision Support Check", True) + except Exception as e: + log_test_result("Vision Support Check", False, str(e)) + + # Test 9: Direct URL Processing + try: + logger.info("Testing direct URL processing") + llm = LiteLLM(model_name="gpt-4o") + test_url = "https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true" + should_use_direct = llm._should_use_direct_url(test_url) + assert isinstance(should_use_direct, bool) + log_test_result("Direct URL Processing", True) + except Exception as e: + log_test_result("Direct URL Processing", False, str(e)) + + # Test 10: Message Preparation with Image + try: + logger.info("Testing message preparation with image") + llm = LiteLLM(model_name="gpt-4o") + # Mock image URL to test message structure + test_img = "https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true" + messages = llm._prepare_messages("Describe this image", img=test_img) + assert isinstance(messages, list) + assert len(messages) >= 1 + # Check if image content is properly structured + user_message = next((msg for msg in messages if msg["role"] == "user"), None) + assert user_message is not None + log_test_result("Message Preparation with Image", True) + except Exception as e: + log_test_result("Message Preparation with Image", False, str(e)) + + # Test 11: Vision Processing Methods + try: + logger.info("Testing vision processing methods") + llm = LiteLLM(model_name="gpt-4o") + messages = [] + + # Test OpenAI vision processing + processed_messages = llm.openai_vision_processing( + "Describe this image", + "https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true", + messages.copy() + ) + assert isinstance(processed_messages, list) + assert len(processed_messages) > 0 + + # Test Anthropic vision processing + llm_anthropic = LiteLLM(model_name="claude-3-5-sonnet-20241022") + processed_messages_anthropic = llm_anthropic.anthropic_vision_processing( + "Describe this image", + "https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true", + messages.copy() + ) + assert isinstance(processed_messages_anthropic, list) + assert len(processed_messages_anthropic) > 0 + + log_test_result("Vision Processing Methods", True) + except Exception as e: + log_test_result("Vision Processing Methods", False, str(e)) + + # Test 12: Local vs URL Detection + try: + logger.info("Testing local vs URL detection") + llm = LiteLLM(model_name="gpt-4o") + + # Test URL detection + url_test = "https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true" + is_url_direct = llm._should_use_direct_url(url_test) + + # Test local file detection + local_test = "/path/to/local/image.jpg" + is_local_direct = llm._should_use_direct_url(local_test) + + # URLs should potentially use direct, local files should not + assert isinstance(is_url_direct, bool) + assert isinstance(is_local_direct, bool) + assert is_local_direct == False # Local files should never use direct URL + + log_test_result("Local vs URL Detection", True) + except Exception as e: + log_test_result("Local vs URL Detection", False, str(e)) + + # Test 13: Vision Message Structure + try: + logger.info("Testing vision message structure") + llm = LiteLLM(model_name="gpt-4o") + messages = [] + + # Test message structure for image input + result = llm.vision_processing( + task="What do you see?", + image="https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true", + messages=messages + ) + + assert isinstance(result, list) + assert len(result) > 0 + + # Verify the message contains both text and image components + user_msg = result[-1] # Last message should be user message + assert user_msg["role"] == "user" + assert "content" in user_msg + + log_test_result("Vision Message Structure", True) + except Exception as e: + log_test_result("Vision Message Structure", False, str(e)) + # Generate test report success_rate = (passed_tests / total_tests) * 100 logger.info("\n=== Test Suite Report ===")