Fix empty image URLs in multimodal browsing causing litellm.BadRequestError (#9214)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
This commit is contained in:
Graham Neubig 2025-06-20 12:44:11 -07:00 committed by GitHub
parent 8badcb7b35
commit bda0a64a3d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 418 additions and 20 deletions

View File

@ -56,6 +56,18 @@ class ConversationMemory:
self.agent_config = config
self.prompt_manager = prompt_manager
@staticmethod
def _is_valid_image_url(url: str | None) -> bool:
"""Check if an image URL is valid and non-empty.
Args:
url: The image URL to validate
Returns:
True if the URL is valid, False otherwise
"""
return bool(url and url.strip())
def process_events(
self,
condensed_history: list[Event],
@ -380,7 +392,27 @@ class ConversationMemory:
# Add image URLs if available and vision is active
if vision_is_active and obs.image_urls:
content.append(ImageContent(image_urls=obs.image_urls))
# Filter out empty or invalid image URLs
valid_image_urls = [
url for url in obs.image_urls if self._is_valid_image_url(url)
]
invalid_count = len(obs.image_urls) - len(valid_image_urls)
if valid_image_urls:
content.append(ImageContent(image_urls=valid_image_urls))
if invalid_count > 0:
# Add text indicating some images were filtered
content[
0
].text += f'\n\nNote: {invalid_count} invalid or empty image(s) were filtered from this output. The agent may need to use alternative methods to access visual information.'
else:
logger.debug(
'IPython observation has image URLs but none are valid'
)
# Add text indicating all images were filtered
content[
0
].text += f'\n\nNote: All {len(obs.image_urls)} image(s) in this output were invalid or empty and have been filtered. The agent should use alternative methods to access visual information.'
message = Message(role='user', content=content)
elif isinstance(obs, FileEditObservation):
@ -398,25 +430,42 @@ class ConversationMemory:
and vision_is_active
):
text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
message = Message(
role='user',
content=[
TextContent(text=text),
ImageContent(
image_urls=[
# show set of marks if it exists
# otherwise, show raw screenshot when using vision-supported model
obs.set_of_marks
if obs.set_of_marks is not None
and len(obs.set_of_marks) > 0
else obs.screenshot
]
),
],
)
logger.debug(
f'Vision enabled for browsing, showing {"set of marks" if obs.set_of_marks and len(obs.set_of_marks) > 0 else "screenshot"}'
)
# Determine which image to use and validate it
image_url = None
if obs.set_of_marks is not None and len(obs.set_of_marks) > 0:
image_url = obs.set_of_marks
image_type = 'set of marks'
elif obs.screenshot is not None and len(obs.screenshot) > 0:
image_url = obs.screenshot
image_type = 'screenshot'
# Create message content with text
content = [TextContent(text=text)]
# Only add ImageContent if we have a valid image URL
if self._is_valid_image_url(image_url):
content.append(ImageContent(image_urls=[image_url]))
logger.debug(f'Vision enabled for browsing, showing {image_type}')
else:
if image_url:
logger.warning(
f'Invalid image URL format for {image_type}: {image_url[:50]}...'
)
# Add text indicating the image was filtered
content[
0
].text += f'\n\nNote: The {image_type} for this webpage was invalid or empty and has been filtered. The agent should use alternative methods to access visual information about the webpage.'
else:
logger.debug(
'Vision enabled for browsing, but no valid image available'
)
# Add text indicating no image was available
content[
0
].text += '\n\nNote: No visual information (screenshot or set of marks) is available for this webpage. The agent should rely on the text content above.'
message = Message(role='user', content=content)
else:
message = Message(
role='user',

View File

@ -0,0 +1,267 @@
"""Test for fixing empty image URL issue in multimodal browsing."""
from openhands.core.config.agent_config import AgentConfig
from openhands.core.message import ImageContent
from openhands.events.observation.browse import BrowserOutputObservation
from openhands.memory.conversation_memory import ConversationMemory
from openhands.utils.prompt import PromptManager
def test_empty_image_url_handling():
"""Test that empty image URLs are properly filtered out and notification text is added."""
# Create a browser observation with empty screenshot and set_of_marks
browser_obs = BrowserOutputObservation(
url='https://example.com',
trigger_by_action='browse_interactive',
screenshot='', # Empty screenshot
set_of_marks='', # Empty set_of_marks
content='Some webpage content',
)
# Create conversation memory with vision enabled
agent_config = AgentConfig(enable_som_visual_browsing=True)
prompt_manager = PromptManager(
prompt_dir='openhands/agenthub/codeact_agent/prompts'
)
conv_memory = ConversationMemory(agent_config, prompt_manager)
# Process the observation with vision enabled
messages = conv_memory._process_observation(
obs=browser_obs,
tool_call_id_to_message={},
max_message_chars=None,
vision_is_active=True,
enable_som_visual_browsing=True,
current_index=0,
events=[],
)
# Check that no empty image URLs are included
has_image_content = False
has_notification_text = False
for message in messages:
for content in message.content:
if isinstance(content, ImageContent):
has_image_content = True
# All image URLs should be non-empty and valid
for url in content.image_urls:
assert url != '', 'Empty image URL should be filtered out'
assert url is not None, 'None image URL should be filtered out'
# Should start with data: prefix for base64 images
if url: # Only check if URL is not empty
assert url.startswith('data:'), (
f'Invalid image URL format: {url}'
)
elif hasattr(content, 'text'):
# Check for notification text about missing visual information
if (
'No visual information' in content.text
or 'has been filtered' in content.text
):
has_notification_text = True
# Should not have image content but should have notification text
assert not has_image_content, 'Should not have ImageContent for empty images'
assert has_notification_text, (
'Should have notification text about missing visual information'
)
def test_valid_image_url_handling():
"""Test that valid image URLs are properly handled."""
# Create a browser observation with valid base64 image data
valid_base64_image = 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=='
browser_obs = BrowserOutputObservation(
url='https://example.com',
trigger_by_action='browse_interactive',
screenshot=valid_base64_image,
set_of_marks=valid_base64_image,
content='Some webpage content',
)
# Create conversation memory with vision enabled
agent_config = AgentConfig(enable_som_visual_browsing=True)
prompt_manager = PromptManager(
prompt_dir='openhands/agenthub/codeact_agent/prompts'
)
conv_memory = ConversationMemory(agent_config, prompt_manager)
# Process the observation with vision enabled
messages = conv_memory._process_observation(
obs=browser_obs,
tool_call_id_to_message={},
max_message_chars=None,
vision_is_active=True,
enable_som_visual_browsing=True,
current_index=0,
events=[],
)
# Check that valid image URLs are included
found_image_content = False
for message in messages:
for content in message.content:
if isinstance(content, ImageContent):
found_image_content = True
# Should have at least one valid image URL
assert len(content.image_urls) > 0, 'Should have at least one image URL'
for url in content.image_urls:
assert url != '', 'Image URL should not be empty'
assert url.startswith('data:image/'), (
f'Invalid image URL format: {url}'
)
assert found_image_content, 'Should have found ImageContent with valid URLs'
def test_mixed_image_url_handling():
"""Test handling of mixed valid and invalid image URLs."""
# Create a browser observation with one empty and one valid image
valid_base64_image = 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=='
browser_obs = BrowserOutputObservation(
url='https://example.com',
trigger_by_action='browse_interactive',
screenshot='', # Empty screenshot
set_of_marks=valid_base64_image, # Valid set_of_marks
content='Some webpage content',
)
# Create conversation memory with vision enabled
agent_config = AgentConfig(enable_som_visual_browsing=True)
prompt_manager = PromptManager(
prompt_dir='openhands/agenthub/codeact_agent/prompts'
)
conv_memory = ConversationMemory(agent_config, prompt_manager)
# Process the observation with vision enabled
messages = conv_memory._process_observation(
obs=browser_obs,
tool_call_id_to_message={},
max_message_chars=None,
vision_is_active=True,
enable_som_visual_browsing=True,
current_index=0,
events=[],
)
# Check that only valid image URLs are included
found_image_content = False
for message in messages:
for content in message.content:
if isinstance(content, ImageContent):
found_image_content = True
# Should have exactly one valid image URL (set_of_marks)
assert len(content.image_urls) == 1, (
f'Should have exactly one image URL, got {len(content.image_urls)}'
)
url = content.image_urls[0]
assert url == valid_base64_image, (
f'Should use the valid image URL: {url}'
)
assert found_image_content, 'Should have found ImageContent with valid URL'
def test_ipython_empty_image_url_handling():
"""Test that empty image URLs in IPython observations are properly filtered with notification text."""
from openhands.events.observation.commands import IPythonRunCellObservation
# Create an IPython observation with empty image URLs
ipython_obs = IPythonRunCellObservation(
content='Some output',
code='print("hello")',
image_urls=['', None, ''], # Empty and None image URLs
)
# Create conversation memory with vision enabled
agent_config = AgentConfig(enable_som_visual_browsing=True)
prompt_manager = PromptManager(
prompt_dir='openhands/agenthub/codeact_agent/prompts'
)
conv_memory = ConversationMemory(agent_config, prompt_manager)
# Process the observation with vision enabled
messages = conv_memory._process_observation(
obs=ipython_obs,
tool_call_id_to_message={},
max_message_chars=None,
vision_is_active=True,
enable_som_visual_browsing=True,
current_index=0,
events=[],
)
# Check that no empty image URLs are included and notification text is added
has_image_content = False
has_notification_text = False
for message in messages:
for content in message.content:
if isinstance(content, ImageContent):
has_image_content = True
elif hasattr(content, 'text'):
# Check for notification text about filtered images
if 'invalid or empty and have been filtered' in content.text:
has_notification_text = True
# Should not have image content but should have notification text
assert not has_image_content, 'Should not have ImageContent for empty images'
assert has_notification_text, 'Should have notification text about filtered images'
def test_ipython_mixed_image_url_handling():
"""Test handling of mixed valid and invalid image URLs in IPython observations."""
from openhands.events.observation.commands import IPythonRunCellObservation
# Create an IPython observation with mixed image URLs
valid_base64_image = 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=='
ipython_obs = IPythonRunCellObservation(
content='Some output',
code='print("hello")',
image_urls=['', valid_base64_image, None], # Mix of empty, valid, and None
)
# Create conversation memory with vision enabled
agent_config = AgentConfig(enable_som_visual_browsing=True)
prompt_manager = PromptManager(
prompt_dir='openhands/agenthub/codeact_agent/prompts'
)
conv_memory = ConversationMemory(agent_config, prompt_manager)
# Process the observation with vision enabled
messages = conv_memory._process_observation(
obs=ipython_obs,
tool_call_id_to_message={},
max_message_chars=None,
vision_is_active=True,
enable_som_visual_browsing=True,
current_index=0,
events=[],
)
# Check that only valid image URLs are included and notification text is added
found_image_content = False
has_notification_text = False
for message in messages:
for content in message.content:
if isinstance(content, ImageContent):
found_image_content = True
# Should have exactly one valid image URL
assert len(content.image_urls) == 1, (
f'Should have exactly one image URL, got {len(content.image_urls)}'
)
url = content.image_urls[0]
assert url == valid_base64_image, (
f'Should use the valid image URL: {url}'
)
elif hasattr(content, 'text'):
# Check for notification text about filtered images
if 'invalid or empty image(s) were filtered' in content.text:
has_notification_text = True
assert found_image_content, 'Should have found ImageContent with valid URL'
assert has_notification_text, 'Should have notification text about filtered images'

View File

@ -0,0 +1,82 @@
"""Test for ImageContent serialization behavior.
Note: Image URL filtering now happens at the conversation memory level,
not at the ImageContent serialization level. These tests verify that
ImageContent correctly serializes whatever URLs it's given.
"""
from openhands.core.message import ImageContent
def test_image_content_serializes_all_urls():
"""Test that ImageContent serializes all URLs it's given, including empty ones."""
# Create ImageContent with mixed valid and invalid URLs
image_content = ImageContent(
image_urls=[
'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==',
'', # Empty string
' ', # Whitespace only
'data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/2wBDAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/wAARCAABAAEDASIAAhEBAxEB/8QAFQABAQAAAAAAAAAAAAAAAAAAAAv/xAAUEAEAAAAAAAAAAAAAAAAAAAAA/8QAFQEBAQAAAAAAAAAAAAAAAAAAAAX/xAAUEQEAAAAAAAAAAAAAAAAAAAAA/9oADAMBAAIRAxEAPwA/wA==',
]
)
# Serialize the content
serialized = image_content.model_dump()
# Should serialize all URLs, including empty ones (filtering happens upstream)
assert len(serialized) == 4, (
f'Expected 4 URLs (including empty), got {len(serialized)}'
)
expected_urls = [
'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==',
'',
' ',
'data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/2wBDAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/wAARCAABAAEDASIAAhEBAxEB/8QAFQABAQAAAAAAAAAAAAAAAAAAAAv/xAAUEAEAAAAAAAAAAAAAAAAAAAAA/8QAFQEBAQAAAAAAAAAAAAAAAAAAAAX/xAAUEQEAAAAAAAAAAAAAAAAAAAAA/9oADAMBAAIRAxEAPwA/wA==',
]
for i, item in enumerate(serialized):
assert item['type'] == 'image_url'
assert 'image_url' in item
assert 'url' in item['image_url']
assert item['image_url']['url'] == expected_urls[i]
def test_image_content_serializes_empty_urls():
"""Test that ImageContent serializes empty URLs (filtering happens upstream)."""
# Create ImageContent with only empty URLs
image_content = ImageContent(image_urls=['', ' '])
# Serialize the content
serialized = image_content.model_dump()
# Should serialize all URLs, even empty ones
assert len(serialized) == 2, f'Expected 2 URLs, got {serialized}'
assert serialized[0]['image_url']['url'] == ''
assert serialized[1]['image_url']['url'] == ' '
def test_image_content_all_valid_urls():
"""Test that ImageContent preserves all valid URLs."""
valid_urls = [
'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==',
'data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/2wBDAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/wAARCAABAAEDASIAAhEBAxEB/8QAFQABAQAAAAAAAAAAAAAAAAAAAAv/xAAUEAEAAAAAAAAAAAAAAAAAAAAA/8QAFQEBAQAAAAAAAAAAAAAAAAAAAAX/xAAUEQEAAAAAAAAAAAAAAAAAAAAA/9oADAMBAAIRAxEAPwA/wA==',
]
# Create ImageContent with valid URLs
image_content = ImageContent(image_urls=valid_urls)
# Serialize the content
serialized = image_content.model_dump()
# Should preserve all valid URLs
assert len(serialized) == len(valid_urls), (
f'Expected {len(valid_urls)} URLs, got {len(serialized)}'
)
for i, item in enumerate(serialized):
assert item['type'] == 'image_url'
assert item['image_url']['url'] == valid_urls[i]