Fix empty image URLs in multimodal browsing causing litellm.BadRequestError (#9214)

Co-authored-by: openhands <openhands@all-hands.dev> Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
2025-12-26 05:48:36 +08:00 · 2025-06-20 12:44:11 -07:00 · 2025-06-20 12:44:11 -07:00 · bda0a64a3d
commit bda0a64a3d
parent 8badcb7b35
3 changed files with 418 additions and 20 deletions
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@ -56,6 +56,18 @@ class ConversationMemory:
        self.agent_config = config
        self.prompt_manager = prompt_manager

+    @staticmethod
+    def _is_valid_image_url(url: str | None) -> bool:
+        """Check if an image URL is valid and non-empty.
+
+        Args:
+            url: The image URL to validate
+
+        Returns:
+            True if the URL is valid, False otherwise
+        """
+        return bool(url and url.strip())
+
    def process_events(
        self,
        condensed_history: list[Event],
@ -380,7 +392,27 @@ class ConversationMemory:

            # Add image URLs if available and vision is active
            if vision_is_active and obs.image_urls:
-                content.append(ImageContent(image_urls=obs.image_urls))
+                # Filter out empty or invalid image URLs
+                valid_image_urls = [
+                    url for url in obs.image_urls if self._is_valid_image_url(url)
+                ]
+                invalid_count = len(obs.image_urls) - len(valid_image_urls)
+
+                if valid_image_urls:
+                    content.append(ImageContent(image_urls=valid_image_urls))
+                    if invalid_count > 0:
+                        # Add text indicating some images were filtered
+                        content[
+                            0
+                        ].text += f'\n\nNote: {invalid_count} invalid or empty image(s) were filtered from this output. The agent may need to use alternative methods to access visual information.'
+                else:
+                    logger.debug(
+                        'IPython observation has image URLs but none are valid'
+                    )
+                    # Add text indicating all images were filtered
+                    content[
+                        0
+                    ].text += f'\n\nNote: All {len(obs.image_urls)} image(s) in this output were invalid or empty and have been filtered. The agent should use alternative methods to access visual information.'

            message = Message(role='user', content=content)
        elif isinstance(obs, FileEditObservation):
@ -398,25 +430,42 @@ class ConversationMemory:
                and vision_is_active
            ):
                text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
-                message = Message(
-                    role='user',
-                    content=[
-                        TextContent(text=text),
-                        ImageContent(
-                            image_urls=[
-                                # show set of marks if it exists
-                                # otherwise, show raw screenshot when using vision-supported model
-                                obs.set_of_marks
-                                if obs.set_of_marks is not None
-                                and len(obs.set_of_marks) > 0
-                                else obs.screenshot
-                            ]
-                        ),
-                    ],
-                )
-                logger.debug(
-                    f'Vision enabled for browsing, showing {"set of marks" if obs.set_of_marks and len(obs.set_of_marks) > 0 else "screenshot"}'
-                )
+
+                # Determine which image to use and validate it
+                image_url = None
+                if obs.set_of_marks is not None and len(obs.set_of_marks) > 0:
+                    image_url = obs.set_of_marks
+                    image_type = 'set of marks'
+                elif obs.screenshot is not None and len(obs.screenshot) > 0:
+                    image_url = obs.screenshot
+                    image_type = 'screenshot'
+
+                # Create message content with text
+                content = [TextContent(text=text)]
+
+                # Only add ImageContent if we have a valid image URL
+                if self._is_valid_image_url(image_url):
+                    content.append(ImageContent(image_urls=[image_url]))
+                    logger.debug(f'Vision enabled for browsing, showing {image_type}')
+                else:
+                    if image_url:
+                        logger.warning(
+                            f'Invalid image URL format for {image_type}: {image_url[:50]}...'
+                        )
+                        # Add text indicating the image was filtered
+                        content[
+                            0
+                        ].text += f'\n\nNote: The {image_type} for this webpage was invalid or empty and has been filtered. The agent should use alternative methods to access visual information about the webpage.'
+                    else:
+                        logger.debug(
+                            'Vision enabled for browsing, but no valid image available'
+                        )
+                        # Add text indicating no image was available
+                        content[
+                            0
+                        ].text += '\n\nNote: No visual information (screenshot or set of marks) is available for this webpage. The agent should rely on the text content above.'
+
+                message = Message(role='user', content=content)
            else:
                message = Message(
                    role='user',
--- a/tests/unit/test_empty_image_url_fix_v2.py
+++ b/tests/unit/test_empty_image_url_fix_v2.py
@ -0,0 +1,267 @@
+"""Test for fixing empty image URL issue in multimodal browsing."""
+
+from openhands.core.config.agent_config import AgentConfig
+from openhands.core.message import ImageContent
+from openhands.events.observation.browse import BrowserOutputObservation
+from openhands.memory.conversation_memory import ConversationMemory
+from openhands.utils.prompt import PromptManager
+
+
+def test_empty_image_url_handling():
+    """Test that empty image URLs are properly filtered out and notification text is added."""
+
+    # Create a browser observation with empty screenshot and set_of_marks
+    browser_obs = BrowserOutputObservation(
+        url='https://example.com',
+        trigger_by_action='browse_interactive',
+        screenshot='',  # Empty screenshot
+        set_of_marks='',  # Empty set_of_marks
+        content='Some webpage content',
+    )
+
+    # Create conversation memory with vision enabled
+    agent_config = AgentConfig(enable_som_visual_browsing=True)
+    prompt_manager = PromptManager(
+        prompt_dir='openhands/agenthub/codeact_agent/prompts'
+    )
+    conv_memory = ConversationMemory(agent_config, prompt_manager)
+
+    # Process the observation with vision enabled
+    messages = conv_memory._process_observation(
+        obs=browser_obs,
+        tool_call_id_to_message={},
+        max_message_chars=None,
+        vision_is_active=True,
+        enable_som_visual_browsing=True,
+        current_index=0,
+        events=[],
+    )
+
+    # Check that no empty image URLs are included
+    has_image_content = False
+    has_notification_text = False
+    for message in messages:
+        for content in message.content:
+            if isinstance(content, ImageContent):
+                has_image_content = True
+                # All image URLs should be non-empty and valid
+                for url in content.image_urls:
+                    assert url != '', 'Empty image URL should be filtered out'
+                    assert url is not None, 'None image URL should be filtered out'
+                    # Should start with data: prefix for base64 images
+                    if url:  # Only check if URL is not empty
+                        assert url.startswith('data:'), (
+                            f'Invalid image URL format: {url}'
+                        )
+            elif hasattr(content, 'text'):
+                # Check for notification text about missing visual information
+                if (
+                    'No visual information' in content.text
+                    or 'has been filtered' in content.text
+                ):
+                    has_notification_text = True
+
+    # Should not have image content but should have notification text
+    assert not has_image_content, 'Should not have ImageContent for empty images'
+    assert has_notification_text, (
+        'Should have notification text about missing visual information'
+    )
+
+
+def test_valid_image_url_handling():
+    """Test that valid image URLs are properly handled."""
+
+    # Create a browser observation with valid base64 image data
+    valid_base64_image = 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=='
+
+    browser_obs = BrowserOutputObservation(
+        url='https://example.com',
+        trigger_by_action='browse_interactive',
+        screenshot=valid_base64_image,
+        set_of_marks=valid_base64_image,
+        content='Some webpage content',
+    )
+
+    # Create conversation memory with vision enabled
+    agent_config = AgentConfig(enable_som_visual_browsing=True)
+    prompt_manager = PromptManager(
+        prompt_dir='openhands/agenthub/codeact_agent/prompts'
+    )
+    conv_memory = ConversationMemory(agent_config, prompt_manager)
+
+    # Process the observation with vision enabled
+    messages = conv_memory._process_observation(
+        obs=browser_obs,
+        tool_call_id_to_message={},
+        max_message_chars=None,
+        vision_is_active=True,
+        enable_som_visual_browsing=True,
+        current_index=0,
+        events=[],
+    )
+
+    # Check that valid image URLs are included
+    found_image_content = False
+    for message in messages:
+        for content in message.content:
+            if isinstance(content, ImageContent):
+                found_image_content = True
+                # Should have at least one valid image URL
+                assert len(content.image_urls) > 0, 'Should have at least one image URL'
+                for url in content.image_urls:
+                    assert url != '', 'Image URL should not be empty'
+                    assert url.startswith('data:image/'), (
+                        f'Invalid image URL format: {url}'
+                    )
+
+    assert found_image_content, 'Should have found ImageContent with valid URLs'
+
+
+def test_mixed_image_url_handling():
+    """Test handling of mixed valid and invalid image URLs."""
+
+    # Create a browser observation with one empty and one valid image
+    valid_base64_image = 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=='
+
+    browser_obs = BrowserOutputObservation(
+        url='https://example.com',
+        trigger_by_action='browse_interactive',
+        screenshot='',  # Empty screenshot
+        set_of_marks=valid_base64_image,  # Valid set_of_marks
+        content='Some webpage content',
+    )
+
+    # Create conversation memory with vision enabled
+    agent_config = AgentConfig(enable_som_visual_browsing=True)
+    prompt_manager = PromptManager(
+        prompt_dir='openhands/agenthub/codeact_agent/prompts'
+    )
+    conv_memory = ConversationMemory(agent_config, prompt_manager)
+
+    # Process the observation with vision enabled
+    messages = conv_memory._process_observation(
+        obs=browser_obs,
+        tool_call_id_to_message={},
+        max_message_chars=None,
+        vision_is_active=True,
+        enable_som_visual_browsing=True,
+        current_index=0,
+        events=[],
+    )
+
+    # Check that only valid image URLs are included
+    found_image_content = False
+    for message in messages:
+        for content in message.content:
+            if isinstance(content, ImageContent):
+                found_image_content = True
+                # Should have exactly one valid image URL (set_of_marks)
+                assert len(content.image_urls) == 1, (
+                    f'Should have exactly one image URL, got {len(content.image_urls)}'
+                )
+                url = content.image_urls[0]
+                assert url == valid_base64_image, (
+                    f'Should use the valid image URL: {url}'
+                )
+
+    assert found_image_content, 'Should have found ImageContent with valid URL'
+
+
+def test_ipython_empty_image_url_handling():
+    """Test that empty image URLs in IPython observations are properly filtered with notification text."""
+    from openhands.events.observation.commands import IPythonRunCellObservation
+
+    # Create an IPython observation with empty image URLs
+    ipython_obs = IPythonRunCellObservation(
+        content='Some output',
+        code='print("hello")',
+        image_urls=['', None, ''],  # Empty and None image URLs
+    )
+
+    # Create conversation memory with vision enabled
+    agent_config = AgentConfig(enable_som_visual_browsing=True)
+    prompt_manager = PromptManager(
+        prompt_dir='openhands/agenthub/codeact_agent/prompts'
+    )
+    conv_memory = ConversationMemory(agent_config, prompt_manager)
+
+    # Process the observation with vision enabled
+    messages = conv_memory._process_observation(
+        obs=ipython_obs,
+        tool_call_id_to_message={},
+        max_message_chars=None,
+        vision_is_active=True,
+        enable_som_visual_browsing=True,
+        current_index=0,
+        events=[],
+    )
+
+    # Check that no empty image URLs are included and notification text is added
+    has_image_content = False
+    has_notification_text = False
+    for message in messages:
+        for content in message.content:
+            if isinstance(content, ImageContent):
+                has_image_content = True
+            elif hasattr(content, 'text'):
+                # Check for notification text about filtered images
+                if 'invalid or empty and have been filtered' in content.text:
+                    has_notification_text = True
+
+    # Should not have image content but should have notification text
+    assert not has_image_content, 'Should not have ImageContent for empty images'
+    assert has_notification_text, 'Should have notification text about filtered images'
+
+
+def test_ipython_mixed_image_url_handling():
+    """Test handling of mixed valid and invalid image URLs in IPython observations."""
+    from openhands.events.observation.commands import IPythonRunCellObservation
+
+    # Create an IPython observation with mixed image URLs
+    valid_base64_image = 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=='
+    ipython_obs = IPythonRunCellObservation(
+        content='Some output',
+        code='print("hello")',
+        image_urls=['', valid_base64_image, None],  # Mix of empty, valid, and None
+    )
+
+    # Create conversation memory with vision enabled
+    agent_config = AgentConfig(enable_som_visual_browsing=True)
+    prompt_manager = PromptManager(
+        prompt_dir='openhands/agenthub/codeact_agent/prompts'
+    )
+    conv_memory = ConversationMemory(agent_config, prompt_manager)
+
+    # Process the observation with vision enabled
+    messages = conv_memory._process_observation(
+        obs=ipython_obs,
+        tool_call_id_to_message={},
+        max_message_chars=None,
+        vision_is_active=True,
+        enable_som_visual_browsing=True,
+        current_index=0,
+        events=[],
+    )
+
+    # Check that only valid image URLs are included and notification text is added
+    found_image_content = False
+    has_notification_text = False
+    for message in messages:
+        for content in message.content:
+            if isinstance(content, ImageContent):
+                found_image_content = True
+                # Should have exactly one valid image URL
+                assert len(content.image_urls) == 1, (
+                    f'Should have exactly one image URL, got {len(content.image_urls)}'
+                )
+                url = content.image_urls[0]
+                assert url == valid_base64_image, (
+                    f'Should use the valid image URL: {url}'
+                )
+            elif hasattr(content, 'text'):
+                # Check for notification text about filtered images
+                if 'invalid or empty image(s) were filtered' in content.text:
+                    has_notification_text = True
+
+    assert found_image_content, 'Should have found ImageContent with valid URL'
+    assert has_notification_text, 'Should have notification text about filtered images'
--- a/tests/unit/test_image_content_validation.py
+++ b/tests/unit/test_image_content_validation.py
@ -0,0 +1,82 @@
+"""Test for ImageContent serialization behavior.
+
+Note: Image URL filtering now happens at the conversation memory level,
+not at the ImageContent serialization level. These tests verify that
+ImageContent correctly serializes whatever URLs it's given.
+"""
+
+from openhands.core.message import ImageContent
+
+
+def test_image_content_serializes_all_urls():
+    """Test that ImageContent serializes all URLs it's given, including empty ones."""
+
+    # Create ImageContent with mixed valid and invalid URLs
+    image_content = ImageContent(
+        image_urls=[
+            'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==',
+            '',  # Empty string
+            '   ',  # Whitespace only
+            'data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/2wBDAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/wAARCAABAAEDASIAAhEBAxEB/8QAFQABAQAAAAAAAAAAAAAAAAAAAAv/xAAUEAEAAAAAAAAAAAAAAAAAAAAA/8QAFQEBAQAAAAAAAAAAAAAAAAAAAAX/xAAUEQEAAAAAAAAAAAAAAAAAAAAA/9oADAMBAAIRAxEAPwA/wA==',
+        ]
+    )
+
+    # Serialize the content
+    serialized = image_content.model_dump()
+
+    # Should serialize all URLs, including empty ones (filtering happens upstream)
+    assert len(serialized) == 4, (
+        f'Expected 4 URLs (including empty), got {len(serialized)}'
+    )
+
+    expected_urls = [
+        'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==',
+        '',
+        '   ',
+        'data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/2wBDAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/wAARCAABAAEDASIAAhEBAxEB/8QAFQABAQAAAAAAAAAAAAAAAAAAAAv/xAAUEAEAAAAAAAAAAAAAAAAAAAAA/8QAFQEBAQAAAAAAAAAAAAAAAAAAAAX/xAAUEQEAAAAAAAAAAAAAAAAAAAAA/9oADAMBAAIRAxEAPwA/wA==',
+    ]
+
+    for i, item in enumerate(serialized):
+        assert item['type'] == 'image_url'
+        assert 'image_url' in item
+        assert 'url' in item['image_url']
+        assert item['image_url']['url'] == expected_urls[i]
+
+
+def test_image_content_serializes_empty_urls():
+    """Test that ImageContent serializes empty URLs (filtering happens upstream)."""
+
+    # Create ImageContent with only empty URLs
+    image_content = ImageContent(image_urls=['', '   '])
+
+    # Serialize the content
+    serialized = image_content.model_dump()
+
+    # Should serialize all URLs, even empty ones
+    assert len(serialized) == 2, f'Expected 2 URLs, got {serialized}'
+    assert serialized[0]['image_url']['url'] == ''
+    assert serialized[1]['image_url']['url'] == '   '
+
+
+def test_image_content_all_valid_urls():
+    """Test that ImageContent preserves all valid URLs."""
+
+    valid_urls = [
+        'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==',
+        'data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/2wBDAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/wAARCAABAAEDASIAAhEBAxEB/8QAFQABAQAAAAAAAAAAAAAAAAAAAAv/xAAUEAEAAAAAAAAAAAAAAAAAAAAA/8QAFQEBAQAAAAAAAAAAAAAAAAAAAAX/xAAUEQEAAAAAAAAAAAAAAAAAAAAA/9oADAMBAAIRAxEAPwA/wA==',
+    ]
+
+    # Create ImageContent with valid URLs
+    image_content = ImageContent(image_urls=valid_urls)
+
+    # Serialize the content
+    serialized = image_content.model_dump()
+
+    # Should preserve all valid URLs
+    assert len(serialized) == len(valid_urls), (
+        f'Expected {len(valid_urls)} URLs, got {len(serialized)}'
+    )
+
+    for i, item in enumerate(serialized):
+        assert item['type'] == 'image_url'
+        assert item['image_url']['url'] == valid_urls[i]