Fix empty image URLs in multimodal browsing causing litellm.BadRequestError (#9214)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
This commit is contained in:
Graham Neubig
2025-06-20 12:44:11 -07:00
committed by GitHub
parent 8badcb7b35
commit bda0a64a3d
3 changed files with 418 additions and 20 deletions

View File

@@ -56,6 +56,18 @@ class ConversationMemory:
self.agent_config = config
self.prompt_manager = prompt_manager
@staticmethod
def _is_valid_image_url(url: str | None) -> bool:
"""Check if an image URL is valid and non-empty.
Args:
url: The image URL to validate
Returns:
True if the URL is valid, False otherwise
"""
return bool(url and url.strip())
def process_events(
self,
condensed_history: list[Event],
@@ -380,7 +392,27 @@ class ConversationMemory:
# Add image URLs if available and vision is active
if vision_is_active and obs.image_urls:
content.append(ImageContent(image_urls=obs.image_urls))
# Filter out empty or invalid image URLs
valid_image_urls = [
url for url in obs.image_urls if self._is_valid_image_url(url)
]
invalid_count = len(obs.image_urls) - len(valid_image_urls)
if valid_image_urls:
content.append(ImageContent(image_urls=valid_image_urls))
if invalid_count > 0:
# Add text indicating some images were filtered
content[
0
].text += f'\n\nNote: {invalid_count} invalid or empty image(s) were filtered from this output. The agent may need to use alternative methods to access visual information.'
else:
logger.debug(
'IPython observation has image URLs but none are valid'
)
# Add text indicating all images were filtered
content[
0
].text += f'\n\nNote: All {len(obs.image_urls)} image(s) in this output were invalid or empty and have been filtered. The agent should use alternative methods to access visual information.'
message = Message(role='user', content=content)
elif isinstance(obs, FileEditObservation):
@@ -398,25 +430,42 @@ class ConversationMemory:
and vision_is_active
):
text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
message = Message(
role='user',
content=[
TextContent(text=text),
ImageContent(
image_urls=[
# show set of marks if it exists
# otherwise, show raw screenshot when using vision-supported model
obs.set_of_marks
if obs.set_of_marks is not None
and len(obs.set_of_marks) > 0
else obs.screenshot
]
),
],
)
logger.debug(
f'Vision enabled for browsing, showing {"set of marks" if obs.set_of_marks and len(obs.set_of_marks) > 0 else "screenshot"}'
)
# Determine which image to use and validate it
image_url = None
if obs.set_of_marks is not None and len(obs.set_of_marks) > 0:
image_url = obs.set_of_marks
image_type = 'set of marks'
elif obs.screenshot is not None and len(obs.screenshot) > 0:
image_url = obs.screenshot
image_type = 'screenshot'
# Create message content with text
content = [TextContent(text=text)]
# Only add ImageContent if we have a valid image URL
if self._is_valid_image_url(image_url):
content.append(ImageContent(image_urls=[image_url]))
logger.debug(f'Vision enabled for browsing, showing {image_type}')
else:
if image_url:
logger.warning(
f'Invalid image URL format for {image_type}: {image_url[:50]}...'
)
# Add text indicating the image was filtered
content[
0
].text += f'\n\nNote: The {image_type} for this webpage was invalid or empty and has been filtered. The agent should use alternative methods to access visual information about the webpage.'
else:
logger.debug(
'Vision enabled for browsing, but no valid image available'
)
# Add text indicating no image was available
content[
0
].text += '\n\nNote: No visual information (screenshot or set of marks) is available for this webpage. The agent should rely on the text content above.'
message = Message(role='user', content=content)
else:
message = Message(
role='user',