mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
Fix empty image URLs in multimodal browsing causing litellm.BadRequestError (#9214)
Co-authored-by: openhands <openhands@all-hands.dev> Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
This commit is contained in:
parent
8badcb7b35
commit
bda0a64a3d
@ -56,6 +56,18 @@ class ConversationMemory:
|
||||
self.agent_config = config
|
||||
self.prompt_manager = prompt_manager
|
||||
|
||||
@staticmethod
|
||||
def _is_valid_image_url(url: str | None) -> bool:
|
||||
"""Check if an image URL is valid and non-empty.
|
||||
|
||||
Args:
|
||||
url: The image URL to validate
|
||||
|
||||
Returns:
|
||||
True if the URL is valid, False otherwise
|
||||
"""
|
||||
return bool(url and url.strip())
|
||||
|
||||
def process_events(
|
||||
self,
|
||||
condensed_history: list[Event],
|
||||
@ -380,7 +392,27 @@ class ConversationMemory:
|
||||
|
||||
# Add image URLs if available and vision is active
|
||||
if vision_is_active and obs.image_urls:
|
||||
content.append(ImageContent(image_urls=obs.image_urls))
|
||||
# Filter out empty or invalid image URLs
|
||||
valid_image_urls = [
|
||||
url for url in obs.image_urls if self._is_valid_image_url(url)
|
||||
]
|
||||
invalid_count = len(obs.image_urls) - len(valid_image_urls)
|
||||
|
||||
if valid_image_urls:
|
||||
content.append(ImageContent(image_urls=valid_image_urls))
|
||||
if invalid_count > 0:
|
||||
# Add text indicating some images were filtered
|
||||
content[
|
||||
0
|
||||
].text += f'\n\nNote: {invalid_count} invalid or empty image(s) were filtered from this output. The agent may need to use alternative methods to access visual information.'
|
||||
else:
|
||||
logger.debug(
|
||||
'IPython observation has image URLs but none are valid'
|
||||
)
|
||||
# Add text indicating all images were filtered
|
||||
content[
|
||||
0
|
||||
].text += f'\n\nNote: All {len(obs.image_urls)} image(s) in this output were invalid or empty and have been filtered. The agent should use alternative methods to access visual information.'
|
||||
|
||||
message = Message(role='user', content=content)
|
||||
elif isinstance(obs, FileEditObservation):
|
||||
@ -398,25 +430,42 @@ class ConversationMemory:
|
||||
and vision_is_active
|
||||
):
|
||||
text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
|
||||
message = Message(
|
||||
role='user',
|
||||
content=[
|
||||
TextContent(text=text),
|
||||
ImageContent(
|
||||
image_urls=[
|
||||
# show set of marks if it exists
|
||||
# otherwise, show raw screenshot when using vision-supported model
|
||||
obs.set_of_marks
|
||||
if obs.set_of_marks is not None
|
||||
and len(obs.set_of_marks) > 0
|
||||
else obs.screenshot
|
||||
]
|
||||
),
|
||||
],
|
||||
)
|
||||
logger.debug(
|
||||
f'Vision enabled for browsing, showing {"set of marks" if obs.set_of_marks and len(obs.set_of_marks) > 0 else "screenshot"}'
|
||||
)
|
||||
|
||||
# Determine which image to use and validate it
|
||||
image_url = None
|
||||
if obs.set_of_marks is not None and len(obs.set_of_marks) > 0:
|
||||
image_url = obs.set_of_marks
|
||||
image_type = 'set of marks'
|
||||
elif obs.screenshot is not None and len(obs.screenshot) > 0:
|
||||
image_url = obs.screenshot
|
||||
image_type = 'screenshot'
|
||||
|
||||
# Create message content with text
|
||||
content = [TextContent(text=text)]
|
||||
|
||||
# Only add ImageContent if we have a valid image URL
|
||||
if self._is_valid_image_url(image_url):
|
||||
content.append(ImageContent(image_urls=[image_url]))
|
||||
logger.debug(f'Vision enabled for browsing, showing {image_type}')
|
||||
else:
|
||||
if image_url:
|
||||
logger.warning(
|
||||
f'Invalid image URL format for {image_type}: {image_url[:50]}...'
|
||||
)
|
||||
# Add text indicating the image was filtered
|
||||
content[
|
||||
0
|
||||
].text += f'\n\nNote: The {image_type} for this webpage was invalid or empty and has been filtered. The agent should use alternative methods to access visual information about the webpage.'
|
||||
else:
|
||||
logger.debug(
|
||||
'Vision enabled for browsing, but no valid image available'
|
||||
)
|
||||
# Add text indicating no image was available
|
||||
content[
|
||||
0
|
||||
].text += '\n\nNote: No visual information (screenshot or set of marks) is available for this webpage. The agent should rely on the text content above.'
|
||||
|
||||
message = Message(role='user', content=content)
|
||||
else:
|
||||
message = Message(
|
||||
role='user',
|
||||
|
||||
267
tests/unit/test_empty_image_url_fix_v2.py
Normal file
267
tests/unit/test_empty_image_url_fix_v2.py
Normal file
@ -0,0 +1,267 @@
|
||||
"""Test for fixing empty image URL issue in multimodal browsing."""
|
||||
|
||||
from openhands.core.config.agent_config import AgentConfig
|
||||
from openhands.core.message import ImageContent
|
||||
from openhands.events.observation.browse import BrowserOutputObservation
|
||||
from openhands.memory.conversation_memory import ConversationMemory
|
||||
from openhands.utils.prompt import PromptManager
|
||||
|
||||
|
||||
def test_empty_image_url_handling():
|
||||
"""Test that empty image URLs are properly filtered out and notification text is added."""
|
||||
|
||||
# Create a browser observation with empty screenshot and set_of_marks
|
||||
browser_obs = BrowserOutputObservation(
|
||||
url='https://example.com',
|
||||
trigger_by_action='browse_interactive',
|
||||
screenshot='', # Empty screenshot
|
||||
set_of_marks='', # Empty set_of_marks
|
||||
content='Some webpage content',
|
||||
)
|
||||
|
||||
# Create conversation memory with vision enabled
|
||||
agent_config = AgentConfig(enable_som_visual_browsing=True)
|
||||
prompt_manager = PromptManager(
|
||||
prompt_dir='openhands/agenthub/codeact_agent/prompts'
|
||||
)
|
||||
conv_memory = ConversationMemory(agent_config, prompt_manager)
|
||||
|
||||
# Process the observation with vision enabled
|
||||
messages = conv_memory._process_observation(
|
||||
obs=browser_obs,
|
||||
tool_call_id_to_message={},
|
||||
max_message_chars=None,
|
||||
vision_is_active=True,
|
||||
enable_som_visual_browsing=True,
|
||||
current_index=0,
|
||||
events=[],
|
||||
)
|
||||
|
||||
# Check that no empty image URLs are included
|
||||
has_image_content = False
|
||||
has_notification_text = False
|
||||
for message in messages:
|
||||
for content in message.content:
|
||||
if isinstance(content, ImageContent):
|
||||
has_image_content = True
|
||||
# All image URLs should be non-empty and valid
|
||||
for url in content.image_urls:
|
||||
assert url != '', 'Empty image URL should be filtered out'
|
||||
assert url is not None, 'None image URL should be filtered out'
|
||||
# Should start with data: prefix for base64 images
|
||||
if url: # Only check if URL is not empty
|
||||
assert url.startswith('data:'), (
|
||||
f'Invalid image URL format: {url}'
|
||||
)
|
||||
elif hasattr(content, 'text'):
|
||||
# Check for notification text about missing visual information
|
||||
if (
|
||||
'No visual information' in content.text
|
||||
or 'has been filtered' in content.text
|
||||
):
|
||||
has_notification_text = True
|
||||
|
||||
# Should not have image content but should have notification text
|
||||
assert not has_image_content, 'Should not have ImageContent for empty images'
|
||||
assert has_notification_text, (
|
||||
'Should have notification text about missing visual information'
|
||||
)
|
||||
|
||||
|
||||
def test_valid_image_url_handling():
|
||||
"""Test that valid image URLs are properly handled."""
|
||||
|
||||
# Create a browser observation with valid base64 image data
|
||||
valid_base64_image = ''
|
||||
|
||||
browser_obs = BrowserOutputObservation(
|
||||
url='https://example.com',
|
||||
trigger_by_action='browse_interactive',
|
||||
screenshot=valid_base64_image,
|
||||
set_of_marks=valid_base64_image,
|
||||
content='Some webpage content',
|
||||
)
|
||||
|
||||
# Create conversation memory with vision enabled
|
||||
agent_config = AgentConfig(enable_som_visual_browsing=True)
|
||||
prompt_manager = PromptManager(
|
||||
prompt_dir='openhands/agenthub/codeact_agent/prompts'
|
||||
)
|
||||
conv_memory = ConversationMemory(agent_config, prompt_manager)
|
||||
|
||||
# Process the observation with vision enabled
|
||||
messages = conv_memory._process_observation(
|
||||
obs=browser_obs,
|
||||
tool_call_id_to_message={},
|
||||
max_message_chars=None,
|
||||
vision_is_active=True,
|
||||
enable_som_visual_browsing=True,
|
||||
current_index=0,
|
||||
events=[],
|
||||
)
|
||||
|
||||
# Check that valid image URLs are included
|
||||
found_image_content = False
|
||||
for message in messages:
|
||||
for content in message.content:
|
||||
if isinstance(content, ImageContent):
|
||||
found_image_content = True
|
||||
# Should have at least one valid image URL
|
||||
assert len(content.image_urls) > 0, 'Should have at least one image URL'
|
||||
for url in content.image_urls:
|
||||
assert url != '', 'Image URL should not be empty'
|
||||
assert url.startswith(''
|
||||
|
||||
browser_obs = BrowserOutputObservation(
|
||||
url='https://example.com',
|
||||
trigger_by_action='browse_interactive',
|
||||
screenshot='', # Empty screenshot
|
||||
set_of_marks=valid_base64_image, # Valid set_of_marks
|
||||
content='Some webpage content',
|
||||
)
|
||||
|
||||
# Create conversation memory with vision enabled
|
||||
agent_config = AgentConfig(enable_som_visual_browsing=True)
|
||||
prompt_manager = PromptManager(
|
||||
prompt_dir='openhands/agenthub/codeact_agent/prompts'
|
||||
)
|
||||
conv_memory = ConversationMemory(agent_config, prompt_manager)
|
||||
|
||||
# Process the observation with vision enabled
|
||||
messages = conv_memory._process_observation(
|
||||
obs=browser_obs,
|
||||
tool_call_id_to_message={},
|
||||
max_message_chars=None,
|
||||
vision_is_active=True,
|
||||
enable_som_visual_browsing=True,
|
||||
current_index=0,
|
||||
events=[],
|
||||
)
|
||||
|
||||
# Check that only valid image URLs are included
|
||||
found_image_content = False
|
||||
for message in messages:
|
||||
for content in message.content:
|
||||
if isinstance(content, ImageContent):
|
||||
found_image_content = True
|
||||
# Should have exactly one valid image URL (set_of_marks)
|
||||
assert len(content.image_urls) == 1, (
|
||||
f'Should have exactly one image URL, got {len(content.image_urls)}'
|
||||
)
|
||||
url = content.image_urls[0]
|
||||
assert url == valid_base64_image, (
|
||||
f'Should use the valid image URL: {url}'
|
||||
)
|
||||
|
||||
assert found_image_content, 'Should have found ImageContent with valid URL'
|
||||
|
||||
|
||||
def test_ipython_empty_image_url_handling():
|
||||
"""Test that empty image URLs in IPython observations are properly filtered with notification text."""
|
||||
from openhands.events.observation.commands import IPythonRunCellObservation
|
||||
|
||||
# Create an IPython observation with empty image URLs
|
||||
ipython_obs = IPythonRunCellObservation(
|
||||
content='Some output',
|
||||
code='print("hello")',
|
||||
image_urls=['', None, ''], # Empty and None image URLs
|
||||
)
|
||||
|
||||
# Create conversation memory with vision enabled
|
||||
agent_config = AgentConfig(enable_som_visual_browsing=True)
|
||||
prompt_manager = PromptManager(
|
||||
prompt_dir='openhands/agenthub/codeact_agent/prompts'
|
||||
)
|
||||
conv_memory = ConversationMemory(agent_config, prompt_manager)
|
||||
|
||||
# Process the observation with vision enabled
|
||||
messages = conv_memory._process_observation(
|
||||
obs=ipython_obs,
|
||||
tool_call_id_to_message={},
|
||||
max_message_chars=None,
|
||||
vision_is_active=True,
|
||||
enable_som_visual_browsing=True,
|
||||
current_index=0,
|
||||
events=[],
|
||||
)
|
||||
|
||||
# Check that no empty image URLs are included and notification text is added
|
||||
has_image_content = False
|
||||
has_notification_text = False
|
||||
for message in messages:
|
||||
for content in message.content:
|
||||
if isinstance(content, ImageContent):
|
||||
has_image_content = True
|
||||
elif hasattr(content, 'text'):
|
||||
# Check for notification text about filtered images
|
||||
if 'invalid or empty and have been filtered' in content.text:
|
||||
has_notification_text = True
|
||||
|
||||
# Should not have image content but should have notification text
|
||||
assert not has_image_content, 'Should not have ImageContent for empty images'
|
||||
assert has_notification_text, 'Should have notification text about filtered images'
|
||||
|
||||
|
||||
def test_ipython_mixed_image_url_handling():
|
||||
"""Test handling of mixed valid and invalid image URLs in IPython observations."""
|
||||
from openhands.events.observation.commands import IPythonRunCellObservation
|
||||
|
||||
# Create an IPython observation with mixed image URLs
|
||||
valid_base64_image = ''
|
||||
ipython_obs = IPythonRunCellObservation(
|
||||
content='Some output',
|
||||
code='print("hello")',
|
||||
image_urls=['', valid_base64_image, None], # Mix of empty, valid, and None
|
||||
)
|
||||
|
||||
# Create conversation memory with vision enabled
|
||||
agent_config = AgentConfig(enable_som_visual_browsing=True)
|
||||
prompt_manager = PromptManager(
|
||||
prompt_dir='openhands/agenthub/codeact_agent/prompts'
|
||||
)
|
||||
conv_memory = ConversationMemory(agent_config, prompt_manager)
|
||||
|
||||
# Process the observation with vision enabled
|
||||
messages = conv_memory._process_observation(
|
||||
obs=ipython_obs,
|
||||
tool_call_id_to_message={},
|
||||
max_message_chars=None,
|
||||
vision_is_active=True,
|
||||
enable_som_visual_browsing=True,
|
||||
current_index=0,
|
||||
events=[],
|
||||
)
|
||||
|
||||
# Check that only valid image URLs are included and notification text is added
|
||||
found_image_content = False
|
||||
has_notification_text = False
|
||||
for message in messages:
|
||||
for content in message.content:
|
||||
if isinstance(content, ImageContent):
|
||||
found_image_content = True
|
||||
# Should have exactly one valid image URL
|
||||
assert len(content.image_urls) == 1, (
|
||||
f'Should have exactly one image URL, got {len(content.image_urls)}'
|
||||
)
|
||||
url = content.image_urls[0]
|
||||
assert url == valid_base64_image, (
|
||||
f'Should use the valid image URL: {url}'
|
||||
)
|
||||
elif hasattr(content, 'text'):
|
||||
# Check for notification text about filtered images
|
||||
if 'invalid or empty image(s) were filtered' in content.text:
|
||||
has_notification_text = True
|
||||
|
||||
assert found_image_content, 'Should have found ImageContent with valid URL'
|
||||
assert has_notification_text, 'Should have notification text about filtered images'
|
||||
82
tests/unit/test_image_content_validation.py
Normal file
82
tests/unit/test_image_content_validation.py
Normal file
@ -0,0 +1,82 @@
|
||||
"""Test for ImageContent serialization behavior.
|
||||
|
||||
Note: Image URL filtering now happens at the conversation memory level,
|
||||
not at the ImageContent serialization level. These tests verify that
|
||||
ImageContent correctly serializes whatever URLs it's given.
|
||||
"""
|
||||
|
||||
from openhands.core.message import ImageContent
|
||||
|
||||
|
||||
def test_image_content_serializes_all_urls():
|
||||
"""Test that ImageContent serializes all URLs it's given, including empty ones."""
|
||||
|
||||
# Create ImageContent with mixed valid and invalid URLs
|
||||
image_content = ImageContent(
|
||||
image_urls=[
|
||||
'',
|
||||
'', # Empty string
|
||||
' ', # Whitespace only
|
||||
'',
|
||||
]
|
||||
)
|
||||
|
||||
# Serialize the content
|
||||
serialized = image_content.model_dump()
|
||||
|
||||
# Should serialize all URLs, including empty ones (filtering happens upstream)
|
||||
assert len(serialized) == 4, (
|
||||
f'Expected 4 URLs (including empty), got {len(serialized)}'
|
||||
)
|
||||
|
||||
expected_urls = [
|
||||
'',
|
||||
'',
|
||||
' ',
|
||||
'',
|
||||
]
|
||||
|
||||
for i, item in enumerate(serialized):
|
||||
assert item['type'] == 'image_url'
|
||||
assert 'image_url' in item
|
||||
assert 'url' in item['image_url']
|
||||
assert item['image_url']['url'] == expected_urls[i]
|
||||
|
||||
|
||||
def test_image_content_serializes_empty_urls():
|
||||
"""Test that ImageContent serializes empty URLs (filtering happens upstream)."""
|
||||
|
||||
# Create ImageContent with only empty URLs
|
||||
image_content = ImageContent(image_urls=['', ' '])
|
||||
|
||||
# Serialize the content
|
||||
serialized = image_content.model_dump()
|
||||
|
||||
# Should serialize all URLs, even empty ones
|
||||
assert len(serialized) == 2, f'Expected 2 URLs, got {serialized}'
|
||||
assert serialized[0]['image_url']['url'] == ''
|
||||
assert serialized[1]['image_url']['url'] == ' '
|
||||
|
||||
|
||||
def test_image_content_all_valid_urls():
|
||||
"""Test that ImageContent preserves all valid URLs."""
|
||||
|
||||
valid_urls = [
|
||||
'',
|
||||
'',
|
||||
]
|
||||
|
||||
# Create ImageContent with valid URLs
|
||||
image_content = ImageContent(image_urls=valid_urls)
|
||||
|
||||
# Serialize the content
|
||||
serialized = image_content.model_dump()
|
||||
|
||||
# Should preserve all valid URLs
|
||||
assert len(serialized) == len(valid_urls), (
|
||||
f'Expected {len(valid_urls)} URLs, got {len(serialized)}'
|
||||
)
|
||||
|
||||
for i, item in enumerate(serialized):
|
||||
assert item['type'] == 'image_url'
|
||||
assert item['image_url']['url'] == valid_urls[i]
|
||||
Loading…
x
Reference in New Issue
Block a user