|
|
@ -212,44 +212,62 @@ class LiteLLM:
|
|
|
|
Process vision input specifically for Anthropic models.
|
|
|
|
Process vision input specifically for Anthropic models.
|
|
|
|
Handles Anthropic's specific image format requirements.
|
|
|
|
Handles Anthropic's specific image format requirements.
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
# Get base64 encoded image
|
|
|
|
# Check if we can use direct URL
|
|
|
|
image_url = get_image_base64(image)
|
|
|
|
if self._should_use_direct_url(image):
|
|
|
|
|
|
|
|
# Use direct URL without base64 conversion
|
|
|
|
# Extract mime type from the data URI or use default
|
|
|
|
messages.append(
|
|
|
|
mime_type = "image/jpeg" # default
|
|
|
|
{
|
|
|
|
if "data:" in image_url and ";base64," in image_url:
|
|
|
|
"role": "user",
|
|
|
|
mime_type = image_url.split(";base64,")[0].split("data:")[
|
|
|
|
"content": [
|
|
|
|
1
|
|
|
|
{"type": "text", "text": task},
|
|
|
|
]
|
|
|
|
{
|
|
|
|
|
|
|
|
"type": "image_url",
|
|
|
|
# Ensure mime type is one of the supported formats
|
|
|
|
"image_url": {
|
|
|
|
supported_formats = [
|
|
|
|
"url": image,
|
|
|
|
"image/jpeg",
|
|
|
|
},
|
|
|
|
"image/png",
|
|
|
|
},
|
|
|
|
"image/gif",
|
|
|
|
],
|
|
|
|
"image/webp",
|
|
|
|
}
|
|
|
|
]
|
|
|
|
|
|
|
|
if mime_type not in supported_formats:
|
|
|
|
|
|
|
|
mime_type = (
|
|
|
|
|
|
|
|
"image/jpeg" # fallback to jpeg if unsupported
|
|
|
|
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
# Fall back to base64 conversion for local files
|
|
|
|
|
|
|
|
image_url = get_image_base64(image)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Extract mime type from the data URI or use default
|
|
|
|
|
|
|
|
mime_type = "image/jpeg" # default
|
|
|
|
|
|
|
|
if "data:" in image_url and ";base64," in image_url:
|
|
|
|
|
|
|
|
mime_type = image_url.split(";base64,")[0].split("data:")[
|
|
|
|
|
|
|
|
1
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Ensure mime type is one of the supported formats
|
|
|
|
|
|
|
|
supported_formats = [
|
|
|
|
|
|
|
|
"image/jpeg",
|
|
|
|
|
|
|
|
"image/png",
|
|
|
|
|
|
|
|
"image/gif",
|
|
|
|
|
|
|
|
"image/webp",
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
if mime_type not in supported_formats:
|
|
|
|
|
|
|
|
mime_type = (
|
|
|
|
|
|
|
|
"image/jpeg" # fallback to jpeg if unsupported
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
# Construct Anthropic vision message
|
|
|
|
# Construct Anthropic vision message with base64
|
|
|
|
messages.append(
|
|
|
|
messages.append(
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"role": "user",
|
|
|
|
"role": "user",
|
|
|
|
"content": [
|
|
|
|
"content": [
|
|
|
|
{"type": "text", "text": task},
|
|
|
|
{"type": "text", "text": task},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"type": "image_url",
|
|
|
|
"type": "image_url",
|
|
|
|
"image_url": {
|
|
|
|
"image_url": {
|
|
|
|
"url": image_url,
|
|
|
|
"url": image_url,
|
|
|
|
"format": mime_type,
|
|
|
|
"format": mime_type,
|
|
|
|
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
],
|
|
|
|
],
|
|
|
|
}
|
|
|
|
}
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return messages
|
|
|
|
return messages
|
|
|
|
|
|
|
|
|
|
|
@ -260,21 +278,29 @@ class LiteLLM:
|
|
|
|
Process vision input specifically for OpenAI models.
|
|
|
|
Process vision input specifically for OpenAI models.
|
|
|
|
Handles OpenAI's specific image format requirements.
|
|
|
|
Handles OpenAI's specific image format requirements.
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
# Get base64 encoded image with proper format
|
|
|
|
# Check if we can use direct URL
|
|
|
|
image_url = get_image_base64(image)
|
|
|
|
if self._should_use_direct_url(image):
|
|
|
|
|
|
|
|
# Use direct URL without base64 conversion
|
|
|
|
# Prepare vision message
|
|
|
|
vision_message = {
|
|
|
|
vision_message = {
|
|
|
|
"type": "image_url",
|
|
|
|
"type": "image_url",
|
|
|
|
"image_url": {"url": image},
|
|
|
|
"image_url": {"url": image_url},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else:
|
|
|
|
|
|
|
|
# Fall back to base64 conversion for local files
|
|
|
|
# Add format for specific models
|
|
|
|
image_url = get_image_base64(image)
|
|
|
|
extension = Path(image).suffix.lower()
|
|
|
|
|
|
|
|
mime_type = (
|
|
|
|
# Prepare vision message with base64
|
|
|
|
f"image/{extension[1:]}" if extension else "image/jpeg"
|
|
|
|
vision_message = {
|
|
|
|
)
|
|
|
|
"type": "image_url",
|
|
|
|
vision_message["image_url"]["format"] = mime_type
|
|
|
|
"image_url": {"url": image_url},
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Add format for specific models
|
|
|
|
|
|
|
|
extension = Path(image).suffix.lower()
|
|
|
|
|
|
|
|
mime_type = (
|
|
|
|
|
|
|
|
f"image/{extension[1:]}" if extension else "image/jpeg"
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
vision_message["image_url"]["format"] = mime_type
|
|
|
|
|
|
|
|
|
|
|
|
# Append vision message
|
|
|
|
# Append vision message
|
|
|
|
messages.append(
|
|
|
|
messages.append(
|
|
|
@ -289,44 +315,61 @@ class LiteLLM:
|
|
|
|
|
|
|
|
|
|
|
|
return messages
|
|
|
|
return messages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _should_use_direct_url(self, image: str) -> bool:
    """
    Decide whether the image can be passed to the model as a plain URL.

    The answer is False (meaning: convert to base64 instead) when any of
    the following holds, checked in this order:

    1. ``image`` does not start with ``http://`` or ``https://``.
    2. ``self.model_name`` contains one of the local/custom markers.
    3. ``self.base_url`` is set and contains one of those markers.
    4. The ``supports_vision`` lookup raises.

    Otherwise the result of ``supports_vision`` for the model is returned.

    Args:
        image (str): The image source (URL or file path)

    Returns:
        bool: True if the direct URL should be used, False if base64
        conversion is needed
    """
    # Anything that is not an HTTP(S) URL has to be base64-encoded.
    is_remote = image.startswith(("http://", "https://"))
    if not is_remote:
        return False

    # Substrings hinting at a local/custom deployment that might not
    # support direct URLs.
    markers = (
        "localhost",
        "127.0.0.1",
        "local",
        "custom",
        "ollama",
        "llama-cpp",
    )

    def _looks_local(text):
        # Case-insensitive substring match against every marker.
        lowered = text.lower()
        for marker in markers:
            if marker in lowered:
                return True
        return False

    # The model name is checked first; the base URL is only consulted
    # when the model name itself does not look local.
    if _looks_local(self.model_name):
        return False

    endpoint = self.base_url
    if endpoint is not None and _looks_local(endpoint):
        return False

    # Defer to the vision-capability lookup; any failure there is
    # treated as "do not use the direct URL".
    try:
        return supports_vision(model=self.model_name)
    except Exception:
        return False
|
|
|
|
|
|
|
|
|
|
|
|
def vision_processing(
|
|
|
|
def vision_processing(
|
|
|
|
self, task: str, image: str, messages: Optional[list] = None
|
|
|
|
self, task: str, image: str, messages: Optional[list] = None
|
|
|
|
):
|
|
|
|
):
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
Process the image for the given task.
|
|
|
|
Process the image for the given task.
|
|
|
|
Handles different image formats and model requirements.
|
|
|
|
Handles different image formats and model requirements.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
This method now intelligently chooses between:
|
|
|
|
|
|
|
|
1. Direct URL passing (when model supports it and image is a URL)
|
|
|
|
|
|
|
|
2. Base64 conversion (for local files or unsupported models)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
This approach reduces server load and improves performance by avoiding
|
|
|
|
|
|
|
|
unnecessary image downloads and base64 conversions when possible.
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
# # # Handle Anthropic models separately
|
|
|
|
logger.info(f"Processing image for model: {self.model_name}")
|
|
|
|
# # if "anthropic" in self.model_name.lower() or "claude" in self.model_name.lower():
|
|
|
|
|
|
|
|
# # messages = self.anthropic_vision_processing(task, image, messages)
|
|
|
|
# Log whether we're using direct URL or base64 conversion
|
|
|
|
# # return messages
|
|
|
|
if self._should_use_direct_url(image):
|
|
|
|
|
|
|
|
logger.info(f"Using direct URL passing for image: {image[:100]}...")
|
|
|
|
# # Get base64 encoded image with proper format
|
|
|
|
else:
|
|
|
|
# image_url = get_image_base64(image)
|
|
|
|
if image.startswith(("http://", "https://")):
|
|
|
|
|
|
|
|
logger.info("Converting URL image to base64 (model doesn't support direct URLs)")
|
|
|
|
# # Prepare vision message
|
|
|
|
else:
|
|
|
|
# vision_message = {
|
|
|
|
logger.info("Converting local file to base64")
|
|
|
|
# "type": "image_url",
|
|
|
|
|
|
|
|
# "image_url": {"url": image_url},
|
|
|
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# # Add format for specific models
|
|
|
|
|
|
|
|
# extension = Path(image).suffix.lower()
|
|
|
|
|
|
|
|
# mime_type = f"image/{extension[1:]}" if extension else "image/jpeg"
|
|
|
|
|
|
|
|
# vision_message["image_url"]["format"] = mime_type
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# # Append vision message
|
|
|
|
|
|
|
|
# messages.append(
|
|
|
|
|
|
|
|
# {
|
|
|
|
|
|
|
|
# "role": "user",
|
|
|
|
|
|
|
|
# "content": [
|
|
|
|
|
|
|
|
# {"type": "text", "text": task},
|
|
|
|
|
|
|
|
# vision_message,
|
|
|
|
|
|
|
|
# ],
|
|
|
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# return messages
|
|
|
|
|
|
|
|
if (
|
|
|
|
if (
|
|
|
|
"anthropic" in self.model_name.lower()
|
|
|
|
"anthropic" in self.model_name.lower()
|
|
|
|
or "claude" in self.model_name.lower()
|
|
|
|
or "claude" in self.model_name.lower()
|
|
|
@ -370,7 +413,16 @@ class LiteLLM:
|
|
|
|
|
|
|
|
|
|
|
|
def check_if_model_supports_vision(self, img: str = None):
|
|
|
|
def check_if_model_supports_vision(self, img: str = None):
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
Check if the model supports vision.
|
|
|
|
Check if the model supports vision capabilities.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
This method uses LiteLLM's built-in supports_vision function to verify
|
|
|
|
|
|
|
|
that the model can handle image inputs before processing.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
|
|
img (str, optional): Image path/URL to validate against model capabilities
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Raises:
|
|
|
|
|
|
|
|
ValueError: If the model doesn't support vision and an image is provided
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
if img is not None:
|
|
|
|
if img is not None:
|
|
|
|
out = supports_vision(model=self.model_name)
|
|
|
|
out = supports_vision(model=self.model_name)
|
|
|
|