Remove image content filtering in ConversationMemory (#10645)

This commit is contained in:
Ryan H. Tran
2025-08-27 22:28:09 +07:00
committed by GitHub
parent 77b5c6b161
commit 93ef1b0cda
2 changed files with 35 additions and 29 deletions

View File

@@ -302,10 +302,12 @@ class ConversationMemory:
elif isinstance(action, MessageAction):
role = 'user' if action.source == 'user' else 'assistant'
content = [TextContent(text=action.content or '')]
if vision_is_active and action.image_urls:
if action.image_urls:
if role == 'user':
for idx, url in enumerate(action.image_urls):
content.append(TextContent(text=f'Image {idx + 1}:'))
# Only add descriptive text if vision is active
if vision_is_active:
content.append(TextContent(text=f'Image {idx + 1}:'))
content.append(ImageContent(image_urls=[url]))
else:
content.append(ImageContent(image_urls=action.image_urls))
@@ -414,8 +416,8 @@ class ConversationMemory:
# Create message content with text
content: list[TextContent | ImageContent] = [TextContent(text=text)]
# Add image URLs if available and vision is active
if vision_is_active and obs.image_urls:
# Add image URLs if available
if obs.image_urls:
# Filter out empty or invalid image URLs
valid_image_urls = [
url for url in obs.image_urls if self._is_valid_image_url(url)
@@ -424,7 +426,8 @@ class ConversationMemory:
if valid_image_urls:
content.append(ImageContent(image_urls=valid_image_urls))
if invalid_count > 0:
# Only add explanatory text if vision is active
if vision_is_active and invalid_count > 0:
# Add text indicating some images were filtered
content[
0
@@ -433,10 +436,12 @@ class ConversationMemory:
logger.debug(
'IPython observation has image URLs but none are valid'
)
# Add text indicating all images were filtered
content[
0
].text += f'\n\nNote: All {len(obs.image_urls)} image(s) in this output were invalid or empty and have been filtered. The agent should use alternative methods to access visual information.' # type: ignore[union-attr]
# Only add explanatory text if vision is active
if vision_is_active:
# Add text indicating all images were filtered
content[
0
].text += f'\n\nNote: All {len(obs.image_urls)} image(s) in this output were invalid or empty and have been filtered. The agent should use alternative methods to access visual information.' # type: ignore[union-attr]
message = Message(role='user', content=content)
elif isinstance(obs, FileEditObservation):
@@ -448,15 +453,21 @@ class ConversationMemory:
) # Content is already truncated by openhands-aci
elif isinstance(obs, BrowserOutputObservation):
text = obs.content
content = [TextContent(text=text)]
if (
obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
and enable_som_visual_browsing
and vision_is_active
):
text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. However, the Accessibility tree contains information from the entire webpage.)\n'
# Only add descriptive text if vision is active
if vision_is_active:
# We know content[0] is TextContent since we just created it above
text_content = content[0]
assert isinstance(text_content, TextContent)
text_content.text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. However, the Accessibility tree contains information from the entire webpage.)\n'
# Determine which image to use and validate it
image_url = None
image_type = None
if obs.set_of_marks is not None and len(obs.set_of_marks) > 0:
image_url = obs.set_of_marks
image_type = 'set of marks'
@@ -464,38 +475,29 @@ class ConversationMemory:
image_url = obs.screenshot
image_type = 'screenshot'
# Create message content with text
content = [TextContent(text=text)]
# Only add ImageContent if we have a valid image URL
# Always add ImageContent if we have a valid image URL
if self._is_valid_image_url(image_url):
content.append(ImageContent(image_urls=[image_url])) # type: ignore[list-item]
logger.debug(f'Vision enabled for browsing, showing {image_type}')
logger.debug(f'Adding {image_type} for browsing')
else:
if image_url:
if vision_is_active and image_url:
logger.warning(
f'Invalid image URL format for {image_type}: {image_url[:50]}...'
)
# Add text indicating the image was filtered
# Add text indicating the image was filtered (only if vision is active)
content[
0
].text += f'\n\nNote: The {image_type} for this webpage was invalid or empty and has been filtered. The agent should use alternative methods to access visual information about the webpage.' # type: ignore[union-attr]
else:
elif vision_is_active and not image_url:
logger.debug(
'Vision enabled for browsing, but no valid image available'
)
# Add text indicating no image was available
# Add text indicating no image was available (only if vision is active)
content[
0
].text += '\n\nNote: No visual information (screenshot or set of marks) is available for this webpage. The agent should rely on the text content above.' # type: ignore[union-attr]
message = Message(role='user', content=content)
else:
message = Message(
role='user',
content=[TextContent(text=text)],
)
logger.debug('Vision disabled for browsing, showing text')
message = Message(role='user', content=content)
elif isinstance(obs, AgentDelegateObservation):
text = truncate_content(
obs.outputs.get('content', obs.content),

View File

@@ -1574,8 +1574,12 @@ def test_process_ipython_observation_with_vision_disabled(
vision_is_active=False,
)
# Check that the message contains only text content
# Check that the message contains both text and image content
# (ImageContent is always included, filtering happens at Message serialization level)
assert len(messages) == 1
message = messages[0]
assert len(message.content) == 1
assert len(message.content) == 2
assert isinstance(message.content[0], TextContent)
assert isinstance(message.content[1], ImageContent)
# Check that NO explanatory text about filtered images was added when vision is disabled
assert 'invalid or empty image(s) were filtered' not in message.content[0].text