From a2c42fca54bc3dce45121f471195874531e017af Mon Sep 17 00:00:00 2001
From: Wxysnx <625024108@qq.com>
Date: Sat, 21 Jun 2025 17:48:31 +0800
Subject: [PATCH 1/2] Feed multiple images into the agent

---
 swarms/utils/litellm_wrapper.py | 253 ++++++++++++++++++--------------
 1 file changed, 139 insertions(+), 114 deletions(-)

diff --git a/swarms/utils/litellm_wrapper.py b/swarms/utils/litellm_wrapper.py
index 6aa5c7d3..d5ed3f60 100644
--- a/swarms/utils/litellm_wrapper.py
+++ b/swarms/utils/litellm_wrapper.py
@@ -1,5 +1,5 @@
 import traceback
-from typing import Optional
+from typing import Optional, List, Union
 import base64
 import requests
 from pathlib import Path
@@ -168,21 +168,33 @@ class LiteLLM:
             out = out.model_dump()
         return out
 
+    # Modification: _prepare_messages now accepts img as a string or a list of strings
     def _prepare_messages(
        self,
         task: str,
-        img: str = None,
+        img: Union[str, List[str]] = None,  # Modification: single image or list of images
     ):
         """
         Prepare the messages for the given task.
 
         Args:
             task (str): The task to prepare messages for.
+            img (Union[str, List[str]], optional): Single image input or list of image inputs. Defaults to None.
 
         Returns:
             list: A list of messages prepared for the task.
         """
-        self.check_if_model_supports_vision(img=img)
+        # Edit: Convert a single image string to a list for unified processing
+        image_list = []
+        if img is not None:
+            if isinstance(img, str):
+                image_list = [img]
+            else:
+                image_list = img
+
+        # Edit: Only run the vision check when there are images to process
+        if image_list:
+            self.check_if_model_supports_vision(image_list=image_list)
 
         # Initialize messages
         messages = []
@@ -194,148 +206,147 @@ class LiteLLM:
             )
 
         # Handle vision case
-        if img is not None:
+        if image_list:  # Modification: handle the image list
             messages = self.vision_processing(
-                task=task, image=img, messages=messages
+                task=task, images=image_list, messages=messages
             )
         else:
             messages.append({"role": "user", "content": task})
 
         return messages
 
+    # Modification: anthropic_vision_processing now handles multiple images
     def anthropic_vision_processing(
-        self, task: str, image: str, messages: list
+        self, task: str, images: List[str], messages: list
     ) -> list:
         """
         Process vision input specifically for Anthropic models.
         Handles Anthropic's specific image format requirements.
+
+        Args:
+            task (str): The task prompt
+            images (List[str]): List of image paths or URLs
+            messages (list): Current message list
+
+        Returns:
+            list: Updated messages list with images
         """
-        # Get base64 encoded image
-        image_url = get_image_base64(image)
-
-        # Extract mime type from the data URI or use default
-        mime_type = "image/jpeg"  # default
-        if "data:" in image_url and ";base64," in image_url:
-            mime_type = image_url.split(";base64,")[0].split("data:")[
-                1
-            ]
-
-        # Ensure mime type is one of the supported formats
-        supported_formats = [
-            "image/jpeg",
-            "image/png",
-            "image/gif",
-            "image/webp",
-        ]
-        if mime_type not in supported_formats:
-            mime_type = (
-                "image/jpeg"  # fallback to jpeg if unsupported
-            )
-
-        # Construct Anthropic vision message
-        messages.append(
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": task},
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url,
-                            "format": mime_type,
-                        },
-                    },
-                ],
-            }
-        )
-
+
+        content = [{"type": "text", "text": task}]
+
+        for image in images:
+
+            image_url = get_image_base64(image)
+
+
+            mime_type = "image/jpeg"
+            if "data:" in image_url and ";base64," in image_url:
+                mime_type = image_url.split(";base64,")[0].split("data:")[1]
+
+
+            supported_formats = [
+                "image/jpeg",
+                "image/png",
+                "image/gif",
+                "image/webp",
+            ]
+            if mime_type not in supported_formats:
+                mime_type = "image/jpeg"
+
+
+            content.append({
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url,
+                    "format": mime_type,
+                },
+            })
+
+
+        messages.append({
+            "role": "user",
+            "content": content,
+        })
+
         return messages
 
+    # Modification: openai_vision_processing now handles multiple images
     def openai_vision_processing(
-        self, task: str, image: str, messages: list
+        self, task: str, images: List[str], messages: list
     ) -> list:
         """
         Process vision input specifically for OpenAI models.
         Handles OpenAI's specific image format requirements.
+
+        Args:
+            task (str): The task prompt
+            images (List[str]): List of image paths or URLs
+            messages (list): Current message list
+
+        Returns:
+            list: Updated messages list with images
         """
-        # Get base64 encoded image with proper format
-        image_url = get_image_base64(image)
-
-        # Prepare vision message
-        vision_message = {
-            "type": "image_url",
-            "image_url": {"url": image_url},
-        }
-
-        # Add format for specific models
-        extension = Path(image).suffix.lower()
-        mime_type = (
-            f"image/{extension[1:]}" if extension else "image/jpeg"
-        )
-        vision_message["image_url"]["format"] = mime_type
-
-        # Append vision message
-        messages.append(
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": task},
-                    vision_message,
-                ],
-            }
-        )
-
+
+        content = [{"type": "text", "text": task}]
+
+
+        for image in images:
+
+            image_url = get_image_base64(image)
+
+
+            vision_message = {
+                "type": "image_url",
+                "image_url": {"url": image_url},
+            }
+
+
+            extension = Path(image).suffix.lower()
+            mime_type = f"image/{extension[1:]}" if extension else "image/jpeg"
+            vision_message["image_url"]["format"] = mime_type
+
+
+            content.append(vision_message)
+
+
+        messages.append({
+            "role": "user",
+            "content": content,
+        })
+
         return messages
 
+    # Modification: vision_processing now handles multiple images
     def vision_processing(
-        self, task: str, image: str, messages: Optional[list] = None
+        self, task: str, images: List[str], messages: Optional[list] = None
     ):
         """
-        Process the image for the given task.
+        Process the images for the given task.
         Handles different image formats and model requirements.
+
+        Args:
+            task (str): The task prompt
+            images (List[str]): List of image paths or URLs
+            messages (Optional[list], optional): Current messages list. Defaults to None.
+
+        Returns:
+            list: Updated messages with image content
         """
-        # # # Handle Anthropic models separately
-        # # if "anthropic" in self.model_name.lower() or "claude" in self.model_name.lower():
-        # #     messages = self.anthropic_vision_processing(task, image, messages)
-        # #     return messages
-
-        # # Get base64 encoded image with proper format
-        # image_url = get_image_base64(image)
-
-        # # Prepare vision message
-        # vision_message = {
-        #     "type": "image_url",
-        #     "image_url": {"url": image_url},
-        # }
-
-        # # Add format for specific models
-        # extension = Path(image).suffix.lower()
-        # mime_type = f"image/{extension[1:]}" if extension else "image/jpeg"
-        # vision_message["image_url"]["format"] = mime_type
-
-        # # Append vision message
-        # messages.append(
-        #     {
-        #         "role": "user",
-        #         "content": [
-        #             {"type": "text", "text": task},
-        #             vision_message,
-        #         ],
-        #     }
-        # )
-
-        # return messages
+        if messages is None:
+            messages = []
+
         if (
             "anthropic" in self.model_name.lower()
             or "claude" in self.model_name.lower()
         ):
             messages = self.anthropic_vision_processing(
-                task, image, messages
+                task, images, messages
             )
             return messages
         else:
             messages = self.openai_vision_processing(
-                task, image, messages
+                task, images, messages
             )
             return messages
 
@@ -366,23 +377,33 @@ class LiteLLM:
             }
         )
 
-    def check_if_model_supports_vision(self, img: str = None):
+    # Modification: check_if_model_supports_vision now supports image lists
+    def check_if_model_supports_vision(self, img: str = None, image_list: List[str] = None):
         """
         Check if the model supports vision.
+
+        Args:
+            img (str, optional): Single image path (for backward compatibility). Defaults to None.
+            image_list (List[str], optional): List of image paths. Defaults to None.
+
+        Raises:
+            ValueError: If the model does not support vision.
         """
-        if img is not None:
+        # If any images were supplied (single or multiple), check that the model supports vision
+        if img is not None or (image_list and len(image_list) > 0):
             out = supports_vision(model=self.model_name)
-
+
             if out is False:
                 raise ValueError(
                     f"Model {self.model_name} does not support vision"
                 )
 
+    # Modification: run now accepts img as a string or a list of strings
     def run(
         self,
         task: str,
         audio: Optional[str] = None,
-        img: Optional[str] = None,
+        img: Union[str, List[str]] = None,
         *args,
         **kwargs,
     ):
@@ -392,7 +413,7 @@
         Args:
             task (str): The task to run the model for.
             audio (str, optional): Audio input if any. Defaults to None.
-            img (str, optional): Image input if any. Defaults to None.
+            img (Union[str, List[str]], optional): Single image input or list of image inputs. Defaults to None.
             *args: Additional positional arguments.
             **kwargs: Additional keyword arguments.
 
@@ -403,6 +424,7 @@
             Exception: If there is an error in processing the request.
         """
         try:
+
             messages = self._prepare_messages(task=task, img=img)
 
             # Base completion parameters
@@ -486,12 +508,14 @@
         """
         return self.run(task, *args, **kwargs)
 
-    async def arun(self, task: str, *args, **kwargs):
+    # Modification: arun now accepts img as a string or a list of strings
+    async def arun(self, task: str, img: Union[str, List[str]] = None, *args, **kwargs):
         """
         Run the LLM model asynchronously for the given task.
 
         Args:
             task (str): The task to run the model for.
+            img (Union[str, List[str]], optional): Single image input or list of image inputs. Defaults to None.
             *args: Additional positional arguments.
             **kwargs: Additional keyword arguments.
 
@@ -499,7 +523,8 @@
             str: The content of the response from the model.
         """
         try:
-            messages = self._prepare_messages(task)
+
+            messages = self._prepare_messages(task=task, img=img)
 
             # Prepare common completion parameters
             completion_params = {
@@ -612,4 +637,4 @@
         logger.info(
             f"Running {len(tasks)} tasks asynchronously in batches of {batch_size}"
         )
-        return await self._process_batch(tasks, batch_size)
+        return await self._process_batch(tasks, batch_size)
\ No newline at end of file
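
A minimal usage sketch of what this first patch enables. This is not part of the patch itself: the model name and image paths are illustrative assumptions, and the constructor argument follows the self.model_name references visible in the diff.

    from swarms.utils.litellm_wrapper import LiteLLM

    # Assumption: "gpt-4o" stands in for any vision-capable model string
    # accepted by litellm; the image paths are hypothetical local files.
    llm = LiteLLM(model_name="gpt-4o")

    # Backward compatible: a single image path still works.
    print(llm.run(task="Describe this image.", img="photo1.jpg"))

    # New in this patch: a list of images is folded into one multi-part user message.
    print(llm.run(task="Compare these two photos.", img=["photo1.jpg", "photo2.png"]))
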
From c28dea745f70c876777b7f0c506b662e550a4ac9 Mon Sep 17 00:00:00 2001
From: 王祥宇 <625024108@qq.com>
Date: Sun, 29 Jun 2025 11:29:38 +0800
Subject: [PATCH 2/2] Update the litellm_wrapper.py file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 swarms/utils/litellm_wrapper.py | 49 ++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/swarms/utils/litellm_wrapper.py b/swarms/utils/litellm_wrapper.py
index d5ed3f60..96390825 100644
--- a/swarms/utils/litellm_wrapper.py
+++ b/swarms/utils/litellm_wrapper.py
@@ -1,3 +1,4 @@
+
 import traceback
 from typing import Optional, List, Union
 import base64
@@ -51,11 +52,24 @@ def get_audio_base64(audio_source: str) -> str:
     return encoded_string
 
 
-def get_image_base64(image_source: str) -> str:
+# Modification: update the signature and implementation to support list input
+def get_image_base64(image_source: Union[str, List[str]]) -> Union[str, List[str]]:
     """
     Convert image from a given source to a base64 encoded string.
     Handles URLs, local file paths, and data URIs.
+    Now supports both a single image path and a list of image paths.
+
+    Args:
+        image_source: String path to an image or a list of image paths
+
+    Returns:
+        A single base64 string or a list of base64 strings
     """
+    # Handle a list of images
+    if isinstance(image_source, list):
+        return [get_image_base64(single_image) for single_image in image_source]
+
+    # Handle a single image (original logic)
     # If already a data URI, return as is
     if image_source.startswith("data:image"):
         return image_source
@@ -234,17 +248,16 @@ class LiteLLM:
 
         content = [{"type": "text", "text": task}]
 
-        for image in images:
-
-            image_url = get_image_base64(image)
+        # Modification: use the updated get_image_base64 to process the image list
+        image_urls = get_image_base64(images)
+        if not isinstance(image_urls, list):
+            image_urls = [image_urls]
 
-
+        for i, image_url in enumerate(image_urls):
             mime_type = "image/jpeg"
             if "data:" in image_url and ";base64," in image_url:
                 mime_type = image_url.split(";base64,")[0].split("data:")[1]
 
-
             supported_formats = [
                 "image/jpeg",
                 "image/png",
@@ -254,7 +267,6 @@
             if mime_type not in supported_formats:
                 mime_type = "image/jpeg"
 
-
             content.append({
                 "type": "image_url",
                 "image_url": {
@@ -263,7 +275,6 @@
                 },
             })
 
-
         messages.append({
             "role": "user",
             "content": content,
@@ -290,26 +301,25 @@
 
         content = [{"type": "text", "text": task}]
 
-
-        for image in images:
-
-            image_url = get_image_base64(image)
-
+        # Modification: use the updated get_image_base64 to process the image list
+        image_urls = get_image_base64(images)
+        if not isinstance(image_urls, list):
+            image_urls = [image_urls]
 
+        for i, image_url in enumerate(image_urls):
             vision_message = {
                 "type": "image_url",
                 "image_url": {"url": image_url},
             }
 
-
-            extension = Path(image).suffix.lower()
+            # Get the original path of the corresponding image
+            original_image = images[i] if i < len(images) else images[0]
+            extension = Path(original_image).suffix.lower()
             mime_type = f"image/{extension[1:]}" if extension else "image/jpeg"
             vision_message["image_url"]["format"] = mime_type
 
-
            content.append(vision_message)
 
-
         messages.append({
             "role": "user",
             "content": content,
@@ -637,4 +647,5 @@
         logger.info(
             f"Running {len(tasks)} tasks asynchronously in batches of {batch_size}"
         )
-        return await self._process_batch(tasks, batch_size)
\ No newline at end of file
+        return await self._process_batch(tasks, batch_size)
+
\ No newline at end of file
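
For reference, a short sketch of how the reworked get_image_base64 dispatches after this second patch: a list input recurses element-wise, while a single path keeps the original behavior. The file names are hypothetical.

    from swarms.utils.litellm_wrapper import get_image_base64

    # Single path -> one "data:image/...;base64,..." string (original behavior).
    uri = get_image_base64("photo1.jpg")

    # List of paths -> list of data URIs, one per image, in the same order.
    uris = get_image_base64(["photo1.jpg", "photo2.png"])
    assert isinstance(uris, list) and len(uris) == 2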
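
And the approximate message structure the updated openai_vision_processing assembles for two images, written out by hand from the diff above rather than captured from a run; the base64 payloads are elided.

    # Rough shape of messages after _prepare_messages(task=..., img=["a.jpg", "b.png"])
    # on a non-Anthropic vision model.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Compare these two photos."},
                {"type": "image_url",
                 "image_url": {"url": "data:image/jpeg;base64,...", "format": "image/jpeg"}},
                {"type": "image_url",
                 "image_url": {"url": "data:image/png;base64,...", "format": "image/png"}},
            ],
        }
    ]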