Merge pull request #944 from harshalmore31/improve/vision

Enhance vision processing by supporting direct URL usage and adding utility methods
pull/942/merge
Kye Gomez 2 days ago committed by GitHub
commit d18723fd99
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -212,7 +212,25 @@ class LiteLLM:
Process vision input specifically for Anthropic models. Process vision input specifically for Anthropic models.
Handles Anthropic's specific image format requirements. Handles Anthropic's specific image format requirements.
""" """
# Get base64 encoded image # Check if we can use direct URL
if self._should_use_direct_url(image):
# Use direct URL without base64 conversion
messages.append(
{
"role": "user",
"content": [
{"type": "text", "text": task},
{
"type": "image_url",
"image_url": {
"url": image,
},
},
],
}
)
else:
# Fall back to base64 conversion for local files
image_url = get_image_base64(image) image_url = get_image_base64(image)
# Extract mime type from the data URI or use default # Extract mime type from the data URI or use default
@ -234,7 +252,7 @@ class LiteLLM:
"image/jpeg" # fallback to jpeg if unsupported "image/jpeg" # fallback to jpeg if unsupported
) )
# Construct Anthropic vision message # Construct Anthropic vision message with base64
messages.append( messages.append(
{ {
"role": "user", "role": "user",
@ -260,10 +278,18 @@ class LiteLLM:
Process vision input specifically for OpenAI models. Process vision input specifically for OpenAI models.
Handles OpenAI's specific image format requirements. Handles OpenAI's specific image format requirements.
""" """
# Get base64 encoded image with proper format # Check if we can use direct URL
if self._should_use_direct_url(image):
# Use direct URL without base64 conversion
vision_message = {
"type": "image_url",
"image_url": {"url": image},
}
else:
# Fall back to base64 conversion for local files
image_url = get_image_base64(image) image_url = get_image_base64(image)
# Prepare vision message # Prepare vision message with base64
vision_message = { vision_message = {
"type": "image_url", "type": "image_url",
"image_url": {"url": image_url}, "image_url": {"url": image_url},
@ -289,44 +315,61 @@ class LiteLLM:
return messages return messages
def _should_use_direct_url(self, image: str) -> bool:
"""
Determine if we should use direct URL passing instead of base64 conversion.
Args:
image (str): The image source (URL or file path)
Returns:
bool: True if we should use direct URL, False if we need base64 conversion
"""
# Only use direct URL for HTTP/HTTPS URLs
if not image.startswith(("http://", "https://")):
return False
# Check for local/custom models that might not support direct URLs
model_lower = self.model_name.lower()
local_indicators = ["localhost", "127.0.0.1", "local", "custom", "ollama", "llama-cpp"]
is_local = any(indicator in model_lower for indicator in local_indicators) or \
(self.base_url is not None and any(indicator in self.base_url.lower() for indicator in local_indicators))
if is_local:
return False
# Use LiteLLM's supports_vision to check if model supports vision and direct URLs
try:
return supports_vision(model=self.model_name)
except Exception:
return False
def vision_processing( def vision_processing(
self, task: str, image: str, messages: Optional[list] = None self, task: str, image: str, messages: Optional[list] = None
): ):
""" """
Process the image for the given task. Process the image for the given task.
Handles different image formats and model requirements. Handles different image formats and model requirements.
This method now intelligently chooses between:
1. Direct URL passing (when model supports it and image is a URL)
2. Base64 conversion (for local files or unsupported models)
This approach reduces server load and improves performance by avoiding
unnecessary image downloads and base64 conversions when possible.
""" """
# # # Handle Anthropic models separately logger.info(f"Processing image for model: {self.model_name}")
# # if "anthropic" in self.model_name.lower() or "claude" in self.model_name.lower():
# # messages = self.anthropic_vision_processing(task, image, messages) # Log whether we're using direct URL or base64 conversion
# # return messages if self._should_use_direct_url(image):
logger.info(f"Using direct URL passing for image: {image[:100]}...")
# # Get base64 encoded image with proper format else:
# image_url = get_image_base64(image) if image.startswith(("http://", "https://")):
logger.info("Converting URL image to base64 (model doesn't support direct URLs)")
# # Prepare vision message else:
# vision_message = { logger.info("Converting local file to base64")
# "type": "image_url",
# "image_url": {"url": image_url},
# }
# # Add format for specific models
# extension = Path(image).suffix.lower()
# mime_type = f"image/{extension[1:]}" if extension else "image/jpeg"
# vision_message["image_url"]["format"] = mime_type
# # Append vision message
# messages.append(
# {
# "role": "user",
# "content": [
# {"type": "text", "text": task},
# vision_message,
# ],
# }
# )
# return messages
if ( if (
"anthropic" in self.model_name.lower() "anthropic" in self.model_name.lower()
or "claude" in self.model_name.lower() or "claude" in self.model_name.lower()
@ -370,7 +413,16 @@ class LiteLLM:
def check_if_model_supports_vision(self, img: str = None): def check_if_model_supports_vision(self, img: str = None):
""" """
Check if the model supports vision. Check if the model supports vision capabilities.
This method uses LiteLLM's built-in supports_vision function to verify
that the model can handle image inputs before processing.
Args:
img (str, optional): Image path/URL to validate against model capabilities
Raises:
ValueError: If the model doesn't support vision and an image is provided
""" """
if img is not None: if img is not None:
out = supports_vision(model=self.model_name) out = supports_vision(model=self.model_name)

@ -201,6 +201,119 @@ def run_test_suite():
except Exception as e: except Exception as e:
log_test_result("Batched Run", False, str(e)) log_test_result("Batched Run", False, str(e))
# Test 8: Vision Support Check
# Verifies that a vision-capable model passes the capability check
# without raising when an image is supplied.
try:
    logger.info("Testing vision support check")
    llm = LiteLLM(model_name="gpt-4o")
    # This should not raise an error for vision-capable models
    llm.check_if_model_supports_vision(img="test.jpg")
    log_test_result("Vision Support Check", True)
except Exception as e:
    log_test_result("Vision Support Check", False, str(e))

# Test 9: Direct URL Processing
# _should_use_direct_url must always return a bool for a URL input.
try:
    logger.info("Testing direct URL processing")
    llm = LiteLLM(model_name="gpt-4o")
    test_url = "https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true"
    should_use_direct = llm._should_use_direct_url(test_url)
    assert isinstance(should_use_direct, bool)
    log_test_result("Direct URL Processing", True)
except Exception as e:
    log_test_result("Direct URL Processing", False, str(e))

# Test 10: Message Preparation with Image
# Checks that _prepare_messages produces a non-empty message list
# containing a user-role entry when an image is attached.
try:
    logger.info("Testing message preparation with image")
    llm = LiteLLM(model_name="gpt-4o")
    # Mock image URL to test message structure
    test_img = "https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true"
    messages = llm._prepare_messages("Describe this image", img=test_img)
    assert isinstance(messages, list)
    assert len(messages) >= 1
    # Check if image content is properly structured
    user_message = next((msg for msg in messages if msg["role"] == "user"), None)
    assert user_message is not None
    log_test_result("Message Preparation with Image", True)
except Exception as e:
    log_test_result("Message Preparation with Image", False, str(e))

# Test 11: Vision Processing Methods
# Exercises both the OpenAI and Anthropic provider-specific
# vision-processing paths with a remote image URL.
try:
    logger.info("Testing vision processing methods")
    llm = LiteLLM(model_name="gpt-4o")
    messages = []

    # Test OpenAI vision processing
    processed_messages = llm.openai_vision_processing(
        "Describe this image",
        "https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true",
        messages.copy(),
    )
    assert isinstance(processed_messages, list)
    assert len(processed_messages) > 0

    # Test Anthropic vision processing
    llm_anthropic = LiteLLM(model_name="claude-3-5-sonnet-20241022")
    processed_messages_anthropic = llm_anthropic.anthropic_vision_processing(
        "Describe this image",
        "https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true",
        messages.copy(),
    )
    assert isinstance(processed_messages_anthropic, list)
    assert len(processed_messages_anthropic) > 0
    log_test_result("Vision Processing Methods", True)
except Exception as e:
    log_test_result("Vision Processing Methods", False, str(e))

# Test 12: Local vs URL Detection
# A local file path must never be passed through as a direct URL.
try:
    logger.info("Testing local vs URL detection")
    llm = LiteLLM(model_name="gpt-4o")

    # Test URL detection
    url_test = "https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true"
    is_url_direct = llm._should_use_direct_url(url_test)

    # Test local file detection
    local_test = "/path/to/local/image.jpg"
    is_local_direct = llm._should_use_direct_url(local_test)

    # URLs should potentially use direct, local files should not
    assert isinstance(is_url_direct, bool)
    assert isinstance(is_local_direct, bool)
    # Local files should never use direct URL
    # (fixed: compare with `is False`, not `== False` — PEP 8 / E712)
    assert is_local_direct is False
    log_test_result("Local vs URL Detection", True)
except Exception as e:
    log_test_result("Local vs URL Detection", False, str(e))

# Test 13: Vision Message Structure
# End-to-end check that vision_processing appends a well-formed
# user message carrying the image content.
try:
    logger.info("Testing vision message structure")
    llm = LiteLLM(model_name="gpt-4o")
    messages = []

    # Test message structure for image input
    result = llm.vision_processing(
        task="What do you see?",
        image="https://github.com/kyegomez/swarms/blob/master/swarms_logo_new.png?raw=true",
        messages=messages,
    )
    assert isinstance(result, list)
    assert len(result) > 0

    # Verify the message contains both text and image components
    user_msg = result[-1]  # Last message should be user message
    assert user_msg["role"] == "user"
    assert "content" in user_msg
    log_test_result("Vision Message Structure", True)
except Exception as e:
    log_test_result("Vision Message Structure", False, str(e))
# Generate test report # Generate test report
success_rate = (passed_tests / total_tests) * 100 success_rate = (passed_tests / total_tests) * 100
logger.info("\n=== Test Suite Report ===") logger.info("\n=== Test Suite Report ===")

Loading…
Cancel
Save