Mirror of https://github.com/OpenHands/OpenHands.git (synced 2026-03-22 13:47:19 +08:00)
Remove image content filtering in ConversationMemory (#10645)
This commit is contained in:
@@ -302,10 +302,12 @@ class ConversationMemory:
|
||||
elif isinstance(action, MessageAction):
|
||||
role = 'user' if action.source == 'user' else 'assistant'
|
||||
content = [TextContent(text=action.content or '')]
|
||||
if vision_is_active and action.image_urls:
|
||||
if action.image_urls:
|
||||
if role == 'user':
|
||||
for idx, url in enumerate(action.image_urls):
|
||||
content.append(TextContent(text=f'Image {idx + 1}:'))
|
||||
# Only add descriptive text if vision is active
|
||||
if vision_is_active:
|
||||
content.append(TextContent(text=f'Image {idx + 1}:'))
|
||||
content.append(ImageContent(image_urls=[url]))
|
||||
else:
|
||||
content.append(ImageContent(image_urls=action.image_urls))
|
||||
@@ -414,8 +416,8 @@ class ConversationMemory:
|
||||
# Create message content with text
|
||||
content: list[TextContent | ImageContent] = [TextContent(text=text)]
|
||||
|
||||
# Add image URLs if available and vision is active
|
||||
if vision_is_active and obs.image_urls:
|
||||
# Add image URLs if available
|
||||
if obs.image_urls:
|
||||
# Filter out empty or invalid image URLs
|
||||
valid_image_urls = [
|
||||
url for url in obs.image_urls if self._is_valid_image_url(url)
|
||||
@@ -424,7 +426,8 @@ class ConversationMemory:
|
||||
|
||||
if valid_image_urls:
|
||||
content.append(ImageContent(image_urls=valid_image_urls))
|
||||
if invalid_count > 0:
|
||||
# Only add explanatory text if vision is active
|
||||
if vision_is_active and invalid_count > 0:
|
||||
# Add text indicating some images were filtered
|
||||
content[
|
||||
0
|
||||
@@ -433,10 +436,12 @@ class ConversationMemory:
|
||||
logger.debug(
|
||||
'IPython observation has image URLs but none are valid'
|
||||
)
|
||||
# Add text indicating all images were filtered
|
||||
content[
|
||||
0
|
||||
].text += f'\n\nNote: All {len(obs.image_urls)} image(s) in this output were invalid or empty and have been filtered. The agent should use alternative methods to access visual information.' # type: ignore[union-attr]
|
||||
# Only add explanatory text if vision is active
|
||||
if vision_is_active:
|
||||
# Add text indicating all images were filtered
|
||||
content[
|
||||
0
|
||||
].text += f'\n\nNote: All {len(obs.image_urls)} image(s) in this output were invalid or empty and have been filtered. The agent should use alternative methods to access visual information.' # type: ignore[union-attr]
|
||||
|
||||
message = Message(role='user', content=content)
|
||||
elif isinstance(obs, FileEditObservation):
|
||||
@@ -448,15 +453,21 @@ class ConversationMemory:
|
||||
) # Content is already truncated by openhands-aci
|
||||
elif isinstance(obs, BrowserOutputObservation):
|
||||
text = obs.content
|
||||
content = [TextContent(text=text)]
|
||||
if (
|
||||
obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
|
||||
and enable_som_visual_browsing
|
||||
and vision_is_active
|
||||
):
|
||||
text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. However, the Accessibility tree contains information from the entire webpage.)\n'
|
||||
# Only add descriptive text if vision is active
|
||||
if vision_is_active:
|
||||
# We know content[0] is TextContent since we just created it above
|
||||
text_content = content[0]
|
||||
assert isinstance(text_content, TextContent)
|
||||
text_content.text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. However, the Accessibility tree contains information from the entire webpage.)\n'
|
||||
|
||||
# Determine which image to use and validate it
|
||||
image_url = None
|
||||
image_type = None
|
||||
if obs.set_of_marks is not None and len(obs.set_of_marks) > 0:
|
||||
image_url = obs.set_of_marks
|
||||
image_type = 'set of marks'
|
||||
@@ -464,38 +475,29 @@ class ConversationMemory:
|
||||
image_url = obs.screenshot
|
||||
image_type = 'screenshot'
|
||||
|
||||
# Create message content with text
|
||||
content = [TextContent(text=text)]
|
||||
|
||||
# Only add ImageContent if we have a valid image URL
|
||||
# Always add ImageContent if we have a valid image URL
|
||||
if self._is_valid_image_url(image_url):
|
||||
content.append(ImageContent(image_urls=[image_url])) # type: ignore[list-item]
|
||||
logger.debug(f'Vision enabled for browsing, showing {image_type}')
|
||||
logger.debug(f'Adding {image_type} for browsing')
|
||||
else:
|
||||
if image_url:
|
||||
if vision_is_active and image_url:
|
||||
logger.warning(
|
||||
f'Invalid image URL format for {image_type}: {image_url[:50]}...'
|
||||
)
|
||||
# Add text indicating the image was filtered
|
||||
# Add text indicating the image was filtered (only if vision is active)
|
||||
content[
|
||||
0
|
||||
].text += f'\n\nNote: The {image_type} for this webpage was invalid or empty and has been filtered. The agent should use alternative methods to access visual information about the webpage.' # type: ignore[union-attr]
|
||||
else:
|
||||
elif vision_is_active and not image_url:
|
||||
logger.debug(
|
||||
'Vision enabled for browsing, but no valid image available'
|
||||
)
|
||||
# Add text indicating no image was available
|
||||
# Add text indicating no image was available (only if vision is active)
|
||||
content[
|
||||
0
|
||||
].text += '\n\nNote: No visual information (screenshot or set of marks) is available for this webpage. The agent should rely on the text content above.' # type: ignore[union-attr]
|
||||
|
||||
message = Message(role='user', content=content)
|
||||
else:
|
||||
message = Message(
|
||||
role='user',
|
||||
content=[TextContent(text=text)],
|
||||
)
|
||||
logger.debug('Vision disabled for browsing, showing text')
|
||||
message = Message(role='user', content=content)
|
||||
elif isinstance(obs, AgentDelegateObservation):
|
||||
text = truncate_content(
|
||||
obs.outputs.get('content', obs.content),
|
||||
|
||||
@@ -1574,8 +1574,12 @@ def test_process_ipython_observation_with_vision_disabled(
|
||||
vision_is_active=False,
|
||||
)
|
||||
|
||||
# Check that the message contains only text content
|
||||
# Check that the message contains both text and image content
|
||||
# (ImageContent is always included, filtering happens at Message serialization level)
|
||||
assert len(messages) == 1
|
||||
message = messages[0]
|
||||
assert len(message.content) == 1
|
||||
assert len(message.content) == 2
|
||||
assert isinstance(message.content[0], TextContent)
|
||||
assert isinstance(message.content[1], ImageContent)
|
||||
# Check that NO explanatory text about filtered images was added when vision is disabled
|
||||
assert 'invalid or empty image(s) were filtered' not in message.content[0].text
|
||||
|
||||
Reference in New Issue
Block a user