Remove image content filtering in ConversationMemory (#10645)

2026-03-22 13:47:19 +08:00 · 2025-08-27 22:28:09 +07:00
parent 77b5c6b161
commit 93ef1b0cda
2 changed files with 35 additions and 29 deletions
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -302,10 +302,12 @@ class ConversationMemory:
        elif isinstance(action, MessageAction):
            role = 'user' if action.source == 'user' else 'assistant'
            content = [TextContent(text=action.content or '')]
-            if vision_is_active and action.image_urls:
+            if action.image_urls:
                if role == 'user':
                    for idx, url in enumerate(action.image_urls):
-                        content.append(TextContent(text=f'Image {idx + 1}:'))
+                        # Only add descriptive text if vision is active
+                        if vision_is_active:
+                            content.append(TextContent(text=f'Image {idx + 1}:'))
                        content.append(ImageContent(image_urls=[url]))
                else:
                    content.append(ImageContent(image_urls=action.image_urls))
@@ -414,8 +416,8 @@ class ConversationMemory:
            # Create message content with text
            content: list[TextContent | ImageContent] = [TextContent(text=text)]

-            # Add image URLs if available and vision is active
-            if vision_is_active and obs.image_urls:
+            # Add image URLs if available
+            if obs.image_urls:
                # Filter out empty or invalid image URLs
                valid_image_urls = [
                    url for url in obs.image_urls if self._is_valid_image_url(url)
@@ -424,7 +426,8 @@ class ConversationMemory:

                if valid_image_urls:
                    content.append(ImageContent(image_urls=valid_image_urls))
-                    if invalid_count > 0:
+                    # Only add explanatory text if vision is active
+                    if vision_is_active and invalid_count > 0:
                        # Add text indicating some images were filtered
                        content[
                            0
@@ -433,10 +436,12 @@ class ConversationMemory:
                    logger.debug(
                        'IPython observation has image URLs but none are valid'
                    )
-                    # Add text indicating all images were filtered
-                    content[
-                        0
-                    ].text += f'\n\nNote: All {len(obs.image_urls)} image(s) in this output were invalid or empty and have been filtered. The agent should use alternative methods to access visual information.'  # type: ignore[union-attr]
+                    # Only add explanatory text if vision is active
+                    if vision_is_active:
+                        # Add text indicating all images were filtered
+                        content[
+                            0
+                        ].text += f'\n\nNote: All {len(obs.image_urls)} image(s) in this output were invalid or empty and have been filtered. The agent should use alternative methods to access visual information.'  # type: ignore[union-attr]

            message = Message(role='user', content=content)
        elif isinstance(obs, FileEditObservation):
@@ -448,15 +453,21 @@ class ConversationMemory:
            )  # Content is already truncated by openhands-aci
        elif isinstance(obs, BrowserOutputObservation):
            text = obs.content
+            content = [TextContent(text=text)]
            if (
                obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
                and enable_som_visual_browsing
-                and vision_is_active
            ):
-                text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. However, the Accessibility tree contains information from the entire webpage.)\n'
+                # Only add descriptive text if vision is active
+                if vision_is_active:
+                    # We know content[0] is TextContent since we just created it above
+                    text_content = content[0]
+                    assert isinstance(text_content, TextContent)
+                    text_content.text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. However, the Accessibility tree contains information from the entire webpage.)\n'

                # Determine which image to use and validate it
                image_url = None
+                image_type = None
                if obs.set_of_marks is not None and len(obs.set_of_marks) > 0:
                    image_url = obs.set_of_marks
                    image_type = 'set of marks'
@@ -464,38 +475,29 @@ class ConversationMemory:
                    image_url = obs.screenshot
                    image_type = 'screenshot'

-                # Create message content with text
-                content = [TextContent(text=text)]
-
-                # Only add ImageContent if we have a valid image URL
+                # Add ImageContent whenever the image URL is valid, regardless of vision_is_active
                if self._is_valid_image_url(image_url):
                    content.append(ImageContent(image_urls=[image_url]))  # type: ignore[list-item]
-                    logger.debug(f'Vision enabled for browsing, showing {image_type}')
+                    logger.debug(f'Adding {image_type} for browsing')
                else:
-                    if image_url:
+                    if vision_is_active and image_url:
                        logger.warning(
                            f'Invalid image URL format for {image_type}: {image_url[:50]}...'
                        )
-                        # Add text indicating the image was filtered
+                        # Add text indicating the image was filtered (only if vision is active)
                        content[
                            0
                        ].text += f'\n\nNote: The {image_type} for this webpage was invalid or empty and has been filtered. The agent should use alternative methods to access visual information about the webpage.'  # type: ignore[union-attr]
-                    else:
+                    elif vision_is_active and not image_url:
                        logger.debug(
                            'Vision enabled for browsing, but no valid image available'
                        )
-                        # Add text indicating no image was available
+                        # Add text indicating no image was available (only if vision is active)
                        content[
                            0
                        ].text += '\n\nNote: No visual information (screenshot or set of marks) is available for this webpage. The agent should rely on the text content above.'  # type: ignore[union-attr]

-                message = Message(role='user', content=content)
-            else:
-                message = Message(
-                    role='user',
-                    content=[TextContent(text=text)],
-                )
-                logger.debug('Vision disabled for browsing, showing text')
+            message = Message(role='user', content=content)
        elif isinstance(obs, AgentDelegateObservation):
            text = truncate_content(
                obs.outputs.get('content', obs.content),
--- a/tests/unit/memory/test_conversation_memory.py
+++ b/tests/unit/memory/test_conversation_memory.py
@@ -1574,8 +1574,12 @@ def test_process_ipython_observation_with_vision_disabled(
        vision_is_active=False,
    )

-    # Check that the message contains only text content
+    # Check that the message contains both text and image content
+    # (ImageContent is always included; filtering happens at the Message serialization level)
    assert len(messages) == 1
    message = messages[0]
-    assert len(message.content) == 1
+    assert len(message.content) == 2
    assert isinstance(message.content[0], TextContent)
+    assert isinstance(message.content[1], ImageContent)
+    # Check that NO explanatory text about filtered images was added when vision is disabled
+    assert 'invalid or empty image(s) were filtered' not in message.content[0].text