From c657ed3aa4a90062e31033873853eb7655392823 Mon Sep 17 00:00:00 2001
From: harshalmore31 <harshalmore2468@gmail.com>
Date: Mon, 7 Jul 2025 20:42:47 +0530
Subject: [PATCH 1/5] Enhance vision processing by supporting direct URL usage
 and adding utility methods for processing info

---
 swarms/utils/litellm_wrapper.py | 269 ++++++++++++++++++++++----------
 1 file changed, 188 insertions(+), 81 deletions(-)

diff --git a/swarms/utils/litellm_wrapper.py b/swarms/utils/litellm_wrapper.py
index 063e6ce3..ed06fc1e 100644
--- a/swarms/utils/litellm_wrapper.py
+++ b/swarms/utils/litellm_wrapper.py
@@ -212,44 +212,62 @@ class LiteLLM:
         Process vision input specifically for Anthropic models.
         Handles Anthropic's specific image format requirements.
         """
-        # Get base64 encoded image
-        image_url = get_image_base64(image)
-
-        # Extract mime type from the data URI or use default
-        mime_type = "image/jpeg"  # default
-        if "data:" in image_url and ";base64," in image_url:
-            mime_type = image_url.split(";base64,")[0].split("data:")[
-                1
-            ]
-
-        # Ensure mime type is one of the supported formats
-        supported_formats = [
-            "image/jpeg",
-            "image/png",
-            "image/gif",
-            "image/webp",
-        ]
-        if mime_type not in supported_formats:
-            mime_type = (
-                "image/jpeg"  # fallback to jpeg if unsupported
+        # Check if we can use direct URL
+        if self._should_use_direct_url(image):
+            # Use direct URL without base64 conversion
+            messages.append(
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": task},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image,
+                            },
+                        },
+                    ],
+                }
             )
+        else:
+            # Fall back to base64 conversion for local files
+            image_url = get_image_base64(image)
+
+            # Extract mime type from the data URI or use default
+            mime_type = "image/jpeg"  # default
+            if "data:" in image_url and ";base64," in image_url:
+                mime_type = image_url.split(";base64,")[0].split("data:")[
+                    1
+                ]
+
+            # Ensure mime type is one of the supported formats
+            supported_formats = [
+                "image/jpeg",
+                "image/png",
+                "image/gif",
+                "image/webp",
+            ]
+            if mime_type not in supported_formats:
+                mime_type = (
+                    "image/jpeg"  # fallback to jpeg if unsupported
+                )
 
-        # Construct Anthropic vision message
-        messages.append(
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": task},
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url,
-                            "format": mime_type,
+            # Construct Anthropic vision message with base64
+            messages.append(
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": task},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_url,
+                                "format": mime_type,
+                            },
                         },
-                    },
-                ],
-            }
-        )
+                    ],
+                }
+            )
 
         return messages
 
@@ -260,21 +278,29 @@ class LiteLLM:
         Process vision input specifically for OpenAI models.
         Handles OpenAI's specific image format requirements.
         """
-        # Get base64 encoded image with proper format
-        image_url = get_image_base64(image)
+        # Check if we can use direct URL
+        if self._should_use_direct_url(image):
+            # Use direct URL without base64 conversion
+            vision_message = {
+                "type": "image_url",
+                "image_url": {"url": image},
+            }
+        else:
+            # Fall back to base64 conversion for local files
+            image_url = get_image_base64(image)
 
-        # Prepare vision message
-        vision_message = {
-            "type": "image_url",
-            "image_url": {"url": image_url},
-        }
+            # Prepare vision message with base64
+            vision_message = {
+                "type": "image_url",
+                "image_url": {"url": image_url},
+            }
 
-        # Add format for specific models
-        extension = Path(image).suffix.lower()
-        mime_type = (
-            f"image/{extension[1:]}" if extension else "image/jpeg"
-        )
-        vision_message["image_url"]["format"] = mime_type
+            # Add format for specific models
+            extension = Path(image).suffix.lower()
+            mime_type = (
+                f"image/{extension[1:]}" if extension else "image/jpeg"
+            )
+            vision_message["image_url"]["format"] = mime_type
 
         # Append vision message
         messages.append(
@@ -289,44 +315,86 @@ class LiteLLM:
 
         return messages
 
+    def _should_use_direct_url(self, image: str) -> bool:
+        """
+        Determine if we should use direct URL passing instead of base64 conversion.
+        
+        Args:
+            image (str): The image source (URL or file path)
+            
+        Returns:
+            bool: True if we should use direct URL, False if we need base64 conversion
+        """
+        # Only use direct URL for HTTP/HTTPS URLs
+        if not image.startswith(("http://", "https://")):
+            return False
+            
+        # Check if the model supports direct URL passing
+        # Most major providers (OpenAI, Anthropic, etc.) support direct URLs
+        model_lower = self.model_name.lower()
+        
+        # List of models/providers that support direct URL passing
+        url_supported_models = [
+            "gpt-4",
+            "gpt-4o", 
+            "gpt-4-vision",
+            "claude",
+            "anthropic",
+            "openai",
+            "gemini",
+            "vertex_ai",
+        ]
+        
+        # Check if any of the supported model patterns match
+        return any(pattern in model_lower for pattern in url_supported_models)
+
+    def _is_local_model(self) -> bool:
+        """
+        Check if the model is a local/custom model that might not support direct URLs.
+        
+        Returns:
+            bool: True if it's likely a local model
+        """
+        model_lower = self.model_name.lower()
+        
+        # Indicators of local/custom models
+        local_indicators = [
+            "localhost",
+            "127.0.0.1", 
+            "local",
+            "custom",
+            "ollama",
+            "llama-cpp",
+        ]
+        
+        return any(indicator in model_lower for indicator in local_indicators) or \
+               (self.base_url is not None and any(indicator in self.base_url.lower() for indicator in local_indicators))
+
     def vision_processing(
         self, task: str, image: str, messages: Optional[list] = None
     ):
         """
         Process the image for the given task.
         Handles different image formats and model requirements.
+        
+        This method now intelligently chooses between:
+        1. Direct URL passing (when model supports it and image is a URL)
+        2. Base64 conversion (for local files or unsupported models)
+        
+        This approach reduces server load and improves performance by avoiding
+        unnecessary image downloads and base64 conversions when possible.
         """
-        # # # Handle Anthropic models separately
-        # # if "anthropic" in self.model_name.lower() or "claude" in self.model_name.lower():
-        # #     messages = self.anthropic_vision_processing(task, image, messages)
-        # #     return messages
-
-        # # Get base64 encoded image with proper format
-        # image_url = get_image_base64(image)
-
-        # # Prepare vision message
-        # vision_message = {
-        #     "type": "image_url",
-        #     "image_url": {"url": image_url},
-        # }
-
-        # # Add format for specific models
-        # extension = Path(image).suffix.lower()
-        # mime_type = f"image/{extension[1:]}" if extension else "image/jpeg"
-        # vision_message["image_url"]["format"] = mime_type
-
-        # # Append vision message
-        # messages.append(
-        #     {
-        #         "role": "user",
-        #         "content": [
-        #             {"type": "text", "text": task},
-        #             vision_message,
-        #         ],
-        #     }
-        # )
-
-        # return messages
+        logger.info(f"Processing image for model: {self.model_name}")
+        
+        # Log whether we're using direct URL or base64 conversion
+        if self._should_use_direct_url(image):
+            logger.info(f"Using direct URL passing for image: {image[:100]}...")
+        else:
+            if image.startswith(("http://", "https://")):
+                logger.info("Converting URL image to base64 (model doesn't support direct URLs)")
+            else:
+                logger.info("Converting local file to base64")
+
         if (
             "anthropic" in self.model_name.lower()
             or "claude" in self.model_name.lower()
@@ -370,14 +438,25 @@ class LiteLLM:
 
     def check_if_model_supports_vision(self, img: str = None):
         """
-        Check if the model supports vision.
+        Check if the model supports vision capabilities.
+        
+        This method uses LiteLLM's built-in supports_vision function to verify
+        that the model can handle image inputs before processing.
+        
+        Args:
+            img (str, optional): Image path/URL to validate against model capabilities
+            
+        Raises:
+            ValueError: If the model doesn't support vision and an image is provided
         """
         if img is not None:
             out = supports_vision(model=self.model_name)
 
             if out is False:
                 raise ValueError(
-                    f"Model {self.model_name} does not support vision"
+                    f"Model {self.model_name} does not support vision. "
+                    f"Use a vision-capable model like gpt-4-vision-preview, "
+                    f"claude-3-sonnet, or gemini-pro-vision."
                 )
 
     def run(
@@ -619,3 +698,31 @@ class LiteLLM:
             f"Running {len(tasks)} tasks asynchronously in batches of {batch_size}"
         )
         return await self._process_batch(tasks, batch_size)
+
+    def get_vision_processing_info(self, image: str) -> dict:
+        """
+        Get information about how the image will be processed for this model.
+        
+        This utility method helps users understand whether their image will be:
+        - Passed directly as URL (more efficient)
+        - Converted to base64 (fallback for unsupported models/local files)
+        
+        Args:
+            image (str): The image source (URL or file path)
+            
+        Returns:
+            dict: Information about the processing approach
+        """
+        return {
+            "model_name": self.model_name,
+            "image_source": image,
+            "is_url": image.startswith(("http://", "https://")),
+            "is_local_file": not image.startswith(("http://", "https://", "data:")),
+            "will_use_direct_url": self._should_use_direct_url(image),
+            "supports_vision": supports_vision(model=self.model_name),
+            "processing_method": "direct_url" if self._should_use_direct_url(image) else "base64_conversion",
+            "benefits": {
+                "direct_url": "No server bandwidth/CPU usage for image processing",
+                "base64_conversion": "Works with local files and all model types"
+            }
+        }

From 7ec57fe39e7c10bfaf9ef6aabbf938b6a64b3a79 Mon Sep 17 00:00:00 2001
From: harshalmore31 <harshalmore2468@gmail.com>
Date: Tue, 8 Jul 2025 21:31:52 +0530
Subject: [PATCH 2/5] Refactor _should_use_direct_url to rely on
 supports_vision, inline the local-model check, and remove unused helpers

---
 swarms/utils/litellm_wrapper.py | 92 +++++++++------------------------
 1 file changed, 24 insertions(+), 68 deletions(-)

diff --git a/swarms/utils/litellm_wrapper.py b/swarms/utils/litellm_wrapper.py
index ed06fc1e..d9832edb 100644
--- a/swarms/utils/litellm_wrapper.py
+++ b/swarms/utils/litellm_wrapper.py
@@ -328,47 +328,22 @@ class LiteLLM:
         # Only use direct URL for HTTP/HTTPS URLs
         if not image.startswith(("http://", "https://")):
             return False
-            
-        # Check if the model supports direct URL passing
-        # Most major providers (OpenAI, Anthropic, etc.) support direct URLs
-        model_lower = self.model_name.lower()
-        
-        # List of models/providers that support direct URL passing
-        url_supported_models = [
-            "gpt-4",
-            "gpt-4o", 
-            "gpt-4-vision",
-            "claude",
-            "anthropic",
-            "openai",
-            "gemini",
-            "vertex_ai",
-        ]
-        
-        # Check if any of the supported model patterns match
-        return any(pattern in model_lower for pattern in url_supported_models)
-
-    def _is_local_model(self) -> bool:
-        """
-        Check if the model is a local/custom model that might not support direct URLs.
         
-        Returns:
-            bool: True if it's likely a local model
-        """
+        # Check for local/custom models that might not support direct URLs
         model_lower = self.model_name.lower()
+        local_indicators = ["localhost", "127.0.0.1", "local", "custom", "ollama", "llama-cpp"]
         
-        # Indicators of local/custom models
-        local_indicators = [
-            "localhost",
-            "127.0.0.1", 
-            "local",
-            "custom",
-            "ollama",
-            "llama-cpp",
-        ]
+        is_local = any(indicator in model_lower for indicator in local_indicators) or \
+                   (self.base_url is not None and any(indicator in self.base_url.lower() for indicator in local_indicators))
         
-        return any(indicator in model_lower for indicator in local_indicators) or \
-               (self.base_url is not None and any(indicator in self.base_url.lower() for indicator in local_indicators))
+        if is_local:
+            return False
+        
+        # Use LiteLLM's supports_vision to check if model supports vision and direct URLs
+        try:
+            return supports_vision(model=self.model_name)
+        except Exception:
+            return False
 
     def vision_processing(
         self, task: str, image: str, messages: Optional[list] = None
@@ -571,12 +546,21 @@ class LiteLLM:
         """
         return self.run(task, *args, **kwargs)
 
-    async def arun(self, task: str, *args, **kwargs):
+    async def arun(
+        self,
+        task: str,
+        audio: Optional[str] = None,
+        img: Optional[str] = None,
+        *args,
+        **kwargs
+    ):
         """
         Run the LLM model asynchronously for the given task.
 
         Args:
             task (str): The task to run the model for.
+            audio (str, optional): Audio input if any. Defaults to None.
+            img (str, optional): Image input if any. Defaults to None.
             *args: Additional positional arguments.
             **kwargs: Additional keyword arguments.
 
@@ -584,9 +568,9 @@ class LiteLLM:
             str: The content of the response from the model.
         """
         try:
-            messages = self._prepare_messages(task)
+            messages = self._prepare_messages(task=task, img=img)
 
-            # Prepare common completion parameters
+            # Base completion parameters
             completion_params = {
                 "model": self.model_name,
                 "messages": messages,
@@ -698,31 +682,3 @@ class LiteLLM:
             f"Running {len(tasks)} tasks asynchronously in batches of {batch_size}"
         )
         return await self._process_batch(tasks, batch_size)
-
-    def get_vision_processing_info(self, image: str) -> dict:
-        """
-        Get information about how the image will be processed for this model.
-        
-        This utility method helps users understand whether their image will be:
-        - Passed directly as URL (more efficient)
-        - Converted to base64 (fallback for unsupported models/local files)
-        
-        Args:
-            image (str): The image source (URL or file path)
-            
-        Returns:
-            dict: Information about the processing approach
-        """
-        return {
-            "model_name": self.model_name,
-            "image_source": image,
-            "is_url": image.startswith(("http://", "https://")),
-            "is_local_file": not image.startswith(("http://", "https://", "data:")),
-            "will_use_direct_url": self._should_use_direct_url(image),
-            "supports_vision": supports_vision(model=self.model_name),
-            "processing_method": "direct_url" if self._should_use_direct_url(image) else "base64_conversion",
-            "benefits": {
-                "direct_url": "No server bandwidth/CPU usage for image processing",
-                "base64_conversion": "Works with local files and all model types"
-            }
-        }

From 81a60405273db865d7d1c5e3060bfc9878a50a5d Mon Sep 17 00:00:00 2001
From: harshalmore31 <harshalmore2468@gmail.com>
Date: Tue, 8 Jul 2025 21:37:31 +0530
Subject: [PATCH 3/5] Restore original vision error message and comment wording

---
 swarms/utils/litellm_wrapper.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/swarms/utils/litellm_wrapper.py b/swarms/utils/litellm_wrapper.py
index d9832edb..3692cb65 100644
--- a/swarms/utils/litellm_wrapper.py
+++ b/swarms/utils/litellm_wrapper.py
@@ -429,9 +429,7 @@ class LiteLLM:
 
             if out is False:
                 raise ValueError(
-                    f"Model {self.model_name} does not support vision. "
-                    f"Use a vision-capable model like gpt-4-vision-preview, "
-                    f"claude-3-sonnet, or gemini-pro-vision."
+                    f"Model {self.model_name} does not support vision"
                 )
 
     def run(
@@ -570,7 +568,7 @@ class LiteLLM:
         try:
             messages = self._prepare_messages(task=task, img=img)
 
-            # Base completion parameters
+            # Prepare common completion parameters
             completion_params = {
                 "model": self.model_name,
                 "messages": messages,

From 0105c958512eb8b2dfad619cf273f29d39a43cde Mon Sep 17 00:00:00 2001
From: harshalmore31 <harshalmore2468@gmail.com>
Date: Tue, 8 Jul 2025 21:39:14 +0530
Subject: [PATCH 4/5] Revert arun to its original (task, *args, **kwargs) signature

---
 swarms/utils/litellm_wrapper.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/swarms/utils/litellm_wrapper.py b/swarms/utils/litellm_wrapper.py
index 3692cb65..52550800 100644
--- a/swarms/utils/litellm_wrapper.py
+++ b/swarms/utils/litellm_wrapper.py
@@ -544,21 +544,12 @@ class LiteLLM:
         """
         return self.run(task, *args, **kwargs)
 
-    async def arun(
-        self,
-        task: str,
-        audio: Optional[str] = None,
-        img: Optional[str] = None,
-        *args,
-        **kwargs
-    ):
+    async def arun(self, task: str, *args, **kwargs):
         """
         Run the LLM model asynchronously for the given task.
 
         Args:
             task (str): The task to run the model for.
-            audio (str, optional): Audio input if any. Defaults to None.
-            img (str, optional): Image input if any. Defaults to None.
             *args: Additional positional arguments.
             **kwargs: Additional keyword arguments.
 
@@ -566,7 +557,7 @@ class LiteLLM:
             str: The content of the response from the model.
         """
         try:
-            messages = self._prepare_messages(task=task, img=img)
+            messages = self._prepare_messages(task)
 
             # Prepare common completion parameters
             completion_params = {

From 3d89664193f61bbe31504c85418a749851a68ed6 Mon Sep 17 00:00:00 2001
From: harshalmore31 <harshalmore2468@gmail.com>
Date: Tue, 8 Jul 2025 22:58:16 +0530
Subject: [PATCH 5/5] Add vision support tests and enhance URL processing
 validation

---
 tests/utils/test_litellm_wrapper.py | 113 ++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)

diff --git a/tests/utils/test_litellm_wrapper.py b/tests/utils/test_litellm_wrapper.py
index 02e79c9f..3a657bae 100644
--- a/tests/utils/test_litellm_wrapper.py
+++ b/tests/utils/test_litellm_wrapper.py
@@ -201,6 +201,119 @@ def run_test_suite():
     except Exception as e:
         log_test_result("Batched Run", False, str(e))
 
+    # Test 8: Vision Support Check
+    try:
+        logger.info("Testing vision support check")
+        llm = LiteLLM(model_name="gpt-4o")
+        # This should not raise an error for vision-capable models
+        llm.check_if_model_supports_vision(img="test.jpg")
+        log_test_result("Vision Support Check", True)
+    except Exception as e:
+        log_test_result("Vision Support Check", False, str(e))
+
+    # Test 9: Direct URL Processing
+    try:
+        logger.info("Testing direct URL processing")
+        llm = LiteLLM(model_name="gpt-4o")
+        test_url = "https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true"
+        should_use_direct = llm._should_use_direct_url(test_url)
+        assert isinstance(should_use_direct, bool)
+        log_test_result("Direct URL Processing", True)
+    except Exception as e:
+        log_test_result("Direct URL Processing", False, str(e))
+
+    # Test 10: Message Preparation with Image
+    try:
+        logger.info("Testing message preparation with image")
+        llm = LiteLLM(model_name="gpt-4o")
+        # Mock image URL to test message structure
+        test_img = "https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true"
+        messages = llm._prepare_messages("Describe this image", img=test_img)
+        assert isinstance(messages, list)
+        assert len(messages) >= 1
+        # Check if image content is properly structured
+        user_message = next((msg for msg in messages if msg["role"] == "user"), None)
+        assert user_message is not None
+        log_test_result("Message Preparation with Image", True)
+    except Exception as e:
+        log_test_result("Message Preparation with Image", False, str(e))
+
+    # Test 11: Vision Processing Methods
+    try:
+        logger.info("Testing vision processing methods")
+        llm = LiteLLM(model_name="gpt-4o")
+        messages = []
+        
+        # Test OpenAI vision processing
+        processed_messages = llm.openai_vision_processing(
+            "Describe this image", 
+            "https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true", 
+            messages.copy()
+        )
+        assert isinstance(processed_messages, list)
+        assert len(processed_messages) > 0
+        
+        # Test Anthropic vision processing
+        llm_anthropic = LiteLLM(model_name="claude-3-5-sonnet-20241022")
+        processed_messages_anthropic = llm_anthropic.anthropic_vision_processing(
+            "Describe this image", 
+            "https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true", 
+            messages.copy()
+        )
+        assert isinstance(processed_messages_anthropic, list)
+        assert len(processed_messages_anthropic) > 0
+        
+        log_test_result("Vision Processing Methods", True)
+    except Exception as e:
+        log_test_result("Vision Processing Methods", False, str(e))
+
+    # Test 12: Local vs URL Detection
+    try:
+        logger.info("Testing local vs URL detection")
+        llm = LiteLLM(model_name="gpt-4o")
+        
+        # Test URL detection
+        url_test = "https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true"
+        is_url_direct = llm._should_use_direct_url(url_test)
+        
+        # Test local file detection
+        local_test = "/path/to/local/image.jpg"
+        is_local_direct = llm._should_use_direct_url(local_test)
+        
+        # URLs should potentially use direct, local files should not
+        assert isinstance(is_url_direct, bool)
+        assert isinstance(is_local_direct, bool)
+        assert is_local_direct == False  # Local files should never use direct URL
+        
+        log_test_result("Local vs URL Detection", True)
+    except Exception as e:
+        log_test_result("Local vs URL Detection", False, str(e))
+
+    # Test 13: Vision Message Structure
+    try:
+        logger.info("Testing vision message structure")
+        llm = LiteLLM(model_name="gpt-4o")
+        messages = []
+        
+        # Test message structure for image input
+        result = llm.vision_processing(
+            task="What do you see?",
+            image="https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true",
+            messages=messages
+        )
+        
+        assert isinstance(result, list)
+        assert len(result) > 0
+        
+        # Verify the message contains both text and image components
+        user_msg = result[-1]  # Last message should be user message
+        assert user_msg["role"] == "user"
+        assert "content" in user_msg
+        
+        log_test_result("Vision Message Structure", True)
+    except Exception as e:
+        log_test_result("Vision Message Structure", False, str(e))
+
     # Generate test report
     success_rate = (passed_tests / total_tests) * 100
     logger.info("\n=== Test Suite Report ===")