From a2c42fca54bc3dce45121f471195874531e017af Mon Sep 17 00:00:00 2001
From: Wxysnx <625024108@qq.com>
Date: Sat, 21 Jun 2025 17:48:31 +0800
Subject: [PATCH 1/2] Feed multiple images into the agent

---
 swarms/utils/litellm_wrapper.py | 253 ++++++++++++++++++--------------
 1 file changed, 139 insertions(+), 114 deletions(-)

diff --git a/swarms/utils/litellm_wrapper.py b/swarms/utils/litellm_wrapper.py
index 6aa5c7d3..d5ed3f60 100644
--- a/swarms/utils/litellm_wrapper.py
+++ b/swarms/utils/litellm_wrapper.py
@@ -1,5 +1,5 @@
 import traceback
-from typing import Optional
+from typing import Optional, List, Union
 import base64
 import requests
 from pathlib import Path
@@ -168,21 +168,33 @@ class LiteLLM:
             out = out.model_dump()
         return out
 
+    # Modification: _prepare_messages now accepts img as a string or a list of strings
     def _prepare_messages(
        self,
         task: str,
-        img: str = None,
+        img: Union[str, List[str]] = None,  # Modification: single image or list of images
     ):
         """
         Prepare the messages for the given task.
 
         Args:
             task (str): The task to prepare messages for.
+            img (Union[str, List[str]], optional): Single image input or list of image inputs. Defaults to None.
 
         Returns:
             list: A list of messages prepared for the task.
         """
-        self.check_if_model_supports_vision(img=img)
+        # Edit: Convert a single image string to a list for unified processing
+        image_list = []
+        if img is not None:
+            if isinstance(img, str):
+                image_list = [img]
+            else:
+                image_list = img
+
+        # Edit: Only run the vision check when there are images to process
+        if image_list:
+            self.check_if_model_supports_vision(image_list=image_list)
 
         # Initialize messages
         messages = []
@@ -194,148 +206,147 @@ class LiteLLM:
             )
 
         # Handle vision case
-        if img is not None:
+        if image_list:  # Modification: handle the image list
             messages = self.vision_processing(
-                task=task, image=img, messages=messages
+                task=task, images=image_list, messages=messages
             )
         else:
             messages.append({"role": "user", "content": task})
 
         return messages
 
+    # Modification: anthropic_vision_processing now handles multiple images
     def anthropic_vision_processing(
-        self, task: str, image: str, messages: list
+        self, task: str, images: List[str], messages: list
     ) -> list:
         """
         Process vision input specifically for Anthropic models.
         Handles Anthropic's specific image format requirements.
+
+        Args:
+            task (str): The task prompt
+            images (List[str]): List of image paths or URLs
+            messages (list): Current message list
+
+        Returns:
+            list: Updated messages list with images
         """
-        # Get base64 encoded image
-        image_url = get_image_base64(image)
-
-        # Extract mime type from the data URI or use default
-        mime_type = "image/jpeg"  # default
-        if "data:" in image_url and ";base64," in image_url:
-            mime_type = image_url.split(";base64,")[0].split("data:")[
-                1
-            ]
-
-        # Ensure mime type is one of the supported formats
-        supported_formats = [
-            "image/jpeg",
-            "image/png",
-            "image/gif",
-            "image/webp",
-        ]
-        if mime_type not in supported_formats:
-            mime_type = (
-                "image/jpeg"  # fallback to jpeg if unsupported
-            )
-
-        # Construct Anthropic vision message
-        messages.append(
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": task},
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url,
-                            "format": mime_type,
-                        },
-                    },
-                ],
-            }
-        )
-
+
+        content = [{"type": "text", "text": task}]
+
+        for image in images:
+
+            image_url = get_image_base64(image)
+
+
+            mime_type = "image/jpeg"
+            if "data:" in image_url and ";base64," in image_url:
+                mime_type = image_url.split(";base64,")[0].split("data:")[1]
+
+
+            supported_formats = [
+                "image/jpeg",
+                "image/png",
+                "image/gif",
+                "image/webp",
+            ]
+            if mime_type not in supported_formats:
+                mime_type = "image/jpeg"
+
+
+            content.append({
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url,
+                    "format": mime_type,
+                },
+            })
+
+
+        messages.append({
+            "role": "user",
+            "content": content,
+        })
+
         return messages
 
+    # Modification: openai_vision_processing now handles multiple images
     def openai_vision_processing(
-        self, task: str, image: str, messages: list
+        self, task: str, images: List[str], messages: list
     ) -> list:
         """
         Process vision input specifically for OpenAI models.
         Handles OpenAI's specific image format requirements.
+
+        Args:
+            task (str): The task prompt
+            images (List[str]): List of image paths or URLs
+            messages (list): Current message list
+
+        Returns:
+            list: Updated messages list with images
         """
-        # Get base64 encoded image with proper format
-        image_url = get_image_base64(image)
-
-        # Prepare vision message
-        vision_message = {
-            "type": "image_url",
-            "image_url": {"url": image_url},
-        }
-
-        # Add format for specific models
-        extension = Path(image).suffix.lower()
-        mime_type = (
-            f"image/{extension[1:]}" if extension else "image/jpeg"
-        )
-        vision_message["image_url"]["format"] = mime_type
-
-        # Append vision message
-        messages.append(
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": task},
-                    vision_message,
-                ],
-            }
-        )
-
+
+        content = [{"type": "text", "text": task}]
+
+
+        for image in images:
+
+            image_url = get_image_base64(image)
+
+
+            vision_message = {
+                "type": "image_url",
+                "image_url": {"url": image_url},
+            }
+
+
+            extension = Path(image).suffix.lower()
+            mime_type = f"image/{extension[1:]}" if extension else "image/jpeg"
+            vision_message["image_url"]["format"] = mime_type
+
+
+            content.append(vision_message)
+
+
+        messages.append({
+            "role": "user",
+            "content": content,
+        })
+
         return messages
 
+    # Modification: vision_processing now handles multiple images
     def vision_processing(
-        self, task: str, image: str, messages: Optional[list] = None
+        self, task: str, images: List[str], messages: Optional[list] = None
     ):
         """
-        Process the image for the given task.
+        Process the images for the given task.
         Handles different image formats and model requirements.
+
+        Args:
+            task (str): The task prompt
+            images (List[str]): List of image paths or URLs
+            messages (Optional[list], optional): Current messages list. Defaults to None.
+
+        Returns:
+            list: Updated messages with image content
         """
-        # # # Handle Anthropic models separately
-        # # if "anthropic" in self.model_name.lower() or "claude" in self.model_name.lower():
-        # #     messages = self.anthropic_vision_processing(task, image, messages)
-        # #     return messages
-
-        # # Get base64 encoded image with proper format
-        # image_url = get_image_base64(image)
-
-        # # Prepare vision message
-        # vision_message = {
-        #     "type": "image_url",
-        #     "image_url": {"url": image_url},
-        # }
-
-        # # Add format for specific models
-        # extension = Path(image).suffix.lower()
-        # mime_type = f"image/{extension[1:]}" if extension else "image/jpeg"
-        # vision_message["image_url"]["format"] = mime_type
-
-        # # Append vision message
-        # messages.append(
-        #     {
-        #         "role": "user",
-        #         "content": [
-        #             {"type": "text", "text": task},
-        #             vision_message,
-        #         ],
-        #     }
-        # )
-
-        # return messages
+        if messages is None:
+            messages = []
+
         if (
             "anthropic" in self.model_name.lower()
             or "claude" in self.model_name.lower()
         ):
             messages = self.anthropic_vision_processing(
-                task, image, messages
+                task, images, messages
             )
             return messages
         else:
             messages = self.openai_vision_processing(
-                task, image, messages
+                task, images, messages
             )
             return messages
 
@@ -366,23 +377,33 @@ class LiteLLM:
             }
         )
 
-    def check_if_model_supports_vision(self, img: str = None):
+    # Modification: check_if_model_supports_vision now supports image lists
+    def check_if_model_supports_vision(self, img: str = None, image_list: List[str] = None):
         """
         Check if the model supports vision.
+
+        Args:
+            img (str, optional): Single image path (for backward compatibility). Defaults to None.
+            image_list (List[str], optional): List of image paths. Defaults to None.
+
+        Raises:
+            ValueError: If the model does not support vision.
         """
-        if img is not None:
+        # If any images were supplied (single or multiple), check that the model supports vision
+        if img is not None or (image_list and len(image_list) > 0):
             out = supports_vision(model=self.model_name)
-
+
             if out is False:
                 raise ValueError(
                     f"Model {self.model_name} does not support vision"
                 )
 
+    # Modification: run now accepts img as a string or a list of strings
     def run(
         self,
         task: str,
         audio: Optional[str] = None,
-        img: Optional[str] = None,
+        img: Union[str, List[str]] = None,
         *args,
         **kwargs,
     ):
@@ -392,7 +413,7 @@
         Args:
             task (str): The task to run the model for.
             audio (str, optional): Audio input if any. Defaults to None.
-            img (str, optional): Image input if any. Defaults to None.
+            img (Union[str, List[str]], optional): Single image input or list of image inputs. Defaults to None.
             *args: Additional positional arguments.
             **kwargs: Additional keyword arguments.
 
@@ -403,6 +424,7 @@
             Exception: If there is an error in processing the request.
         """
         try:
+
             messages = self._prepare_messages(task=task, img=img)
 
             # Base completion parameters
@@ -486,12 +508,14 @@
         """
         return self.run(task, *args, **kwargs)
 
-    async def arun(self, task: str, *args, **kwargs):
+    # Modification: arun now accepts img as a string or a list of strings
+    async def arun(self, task: str, img: Union[str, List[str]] = None, *args, **kwargs):
         """
         Run the LLM model asynchronously for the given task.
 
         Args:
             task (str): The task to run the model for.
+            img (Union[str, List[str]], optional): Single image input or list of image inputs. Defaults to None.
             *args: Additional positional arguments.
             **kwargs: Additional keyword arguments.
 
@@ -499,7 +523,8 @@
             str: The content of the response from the model.
         """
         try:
-            messages = self._prepare_messages(task)
+
+            messages = self._prepare_messages(task=task, img=img)
 
             # Prepare common completion parameters
             completion_params = {
@@ -612,4 +637,4 @@
         logger.info(
             f"Running {len(tasks)} tasks asynchronously in batches of {batch_size}"
         )
-        return await self._process_batch(tasks, batch_size)
+        return await self._process_batch(tasks, batch_size)
\ No newline at end of file
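
A minimal usage sketch of what this first patch enables. This is not part of the patch itself: the model name and image paths are illustrative assumptions, and the constructor argument follows the self.model_name references visible in the diff.

    from swarms.utils.litellm_wrapper import LiteLLM

    # Assumption: "gpt-4o" stands in for any vision-capable model string
    # accepted by litellm; the image paths are hypothetical local files.
    llm = LiteLLM(model_name="gpt-4o")

    # Backward compatible: a single image path still works.
    print(llm.run(task="Describe this image.", img="photo1.jpg"))

    # New in this patch: a list of images is folded into one multi-part user message.
    print(llm.run(task="Compare these two photos.", img=["photo1.jpg", "photo2.png"]))
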
From c28dea745f70c876777b7f0c506b662e550a4ac9 Mon Sep 17 00:00:00 2001
From: 王祥宇 <625024108@qq.com>
Date: Sun, 29 Jun 2025 11:29:38 +0800
Subject: [PATCH 2/2] Update the litellm_wrapper.py file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 swarms/utils/litellm_wrapper.py | 49 ++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/swarms/utils/litellm_wrapper.py b/swarms/utils/litellm_wrapper.py
index d5ed3f60..96390825 100644
--- a/swarms/utils/litellm_wrapper.py
+++ b/swarms/utils/litellm_wrapper.py
@@ -1,3 +1,4 @@
+
 import traceback
 from typing import Optional, List, Union
 import base64
@@ -51,11 +52,24 @@ def get_audio_base64(audio_source: str) -> str:
     return encoded_string
 
 
-def get_image_base64(image_source: str) -> str:
+# Modification: update the signature and implementation to support list input
+def get_image_base64(image_source: Union[str, List[str]]) -> Union[str, List[str]]:
     """
     Convert image from a given source to a base64 encoded string.
     Handles URLs, local file paths, and data URIs.
+    Now supports both a single image path and a list of image paths.
+
+    Args:
+        image_source: String path to an image or a list of image paths
+
+    Returns:
+        A single base64 string or a list of base64 strings
     """
+    # Handle a list of images
+    if isinstance(image_source, list):
+        return [get_image_base64(single_image) for single_image in image_source]
+
+    # Handle a single image (original logic)
     # If already a data URI, return as is
     if image_source.startswith("data:image"):
         return image_source
@@ -234,17 +248,16 @@ class LiteLLM:
 
         content = [{"type": "text", "text": task}]
 
-        for image in images:
-
-            image_url = get_image_base64(image)
+        # Modification: use the updated get_image_base64 to process the image list
+        image_urls = get_image_base64(images)
+        if not isinstance(image_urls, list):
+            image_urls = [image_urls]
 
-
+        for i, image_url in enumerate(image_urls):
             mime_type = "image/jpeg"
             if "data:" in image_url and ";base64," in image_url:
                 mime_type = image_url.split(";base64,")[0].split("data:")[1]
 
-
             supported_formats = [
                 "image/jpeg",
                 "image/png",
@@ -254,7 +267,6 @@
             if mime_type not in supported_formats:
                 mime_type = "image/jpeg"
 
-
             content.append({
                 "type": "image_url",
                 "image_url": {
@@ -263,7 +275,6 @@
                 },
             })
 
-
         messages.append({
             "role": "user",
             "content": content,
@@ -290,26 +301,25 @@
 
         content = [{"type": "text", "text": task}]
 
-
-        for image in images:
-
-            image_url = get_image_base64(image)
-
+        # Modification: use the updated get_image_base64 to process the image list
+        image_urls = get_image_base64(images)
+        if not isinstance(image_urls, list):
+            image_urls = [image_urls]
 
+        for i, image_url in enumerate(image_urls):
             vision_message = {
                 "type": "image_url",
                 "image_url": {"url": image_url},
             }
 
-
-            extension = Path(image).suffix.lower()
+            # Get the original path of the corresponding image
+            original_image = images[i] if i < len(images) else images[0]
+            extension = Path(original_image).suffix.lower()
             mime_type = f"image/{extension[1:]}" if extension else "image/jpeg"
             vision_message["image_url"]["format"] = mime_type
 
-
            content.append(vision_message)
 
-
         messages.append({
             "role": "user",
             "content": content,
@@ -637,4 +647,5 @@
         logger.info(
             f"Running {len(tasks)} tasks asynchronously in batches of {batch_size}"
         )
-        return await self._process_batch(tasks, batch_size)
\ No newline at end of file
+        return await self._process_batch(tasks, batch_size)
+
\ No newline at end of file
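
For reference, a short sketch of how the reworked get_image_base64 dispatches after this second patch: a list input recurses element-wise, while a single path keeps the original behavior. The file names are hypothetical.

    from swarms.utils.litellm_wrapper import get_image_base64

    # Single path -> one "data:image/...;base64,..." string (original behavior).
    uri = get_image_base64("photo1.jpg")

    # List of paths -> list of data URIs, one per image, in the same order.
    uris = get_image_base64(["photo1.jpg", "photo2.png"])
    assert isinstance(uris, list) and len(uris) == 2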
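
And the approximate message structure the updated openai_vision_processing assembles for two images, written out by hand from the diff above rather than captured from a run; the base64 payloads are elided.

    # Rough shape of messages after _prepare_messages(task=..., img=["a.jpg", "b.png"])
    # on a non-Anthropic vision model.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Compare these two photos."},
                {"type": "image_url",
                 "image_url": {"url": "data:image/jpeg;base64,...", "format": "image/jpeg"}},
                {"type": "image_url",
                 "image_url": {"url": "data:image/png;base64,...", "format": "image/png"}},
            ],
        }
    ]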