Fix issue #8372: Implement browser screenshot saving functionality (#8383)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Graham Neubig <neubig@gmail.com>
This commit is contained in:
Xingyao Wang 2025-05-11 15:51:18 +08:00 committed by GitHub
parent a17c57d82e
commit 3d02c0c3a3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 124 additions and 49 deletions

View File

@ -14,6 +14,7 @@ class BrowserOutputObservation(Observation):
url: str
trigger_by_action: str
screenshot: str = field(repr=False, default='') # don't show in repr
screenshot_path: str | None = field(default=None) # path to saved screenshot file
set_of_marks: str = field(default='', repr=False) # don't show in repr
error: bool = False
observation: str = ObservationType.BROWSE
@ -49,6 +50,8 @@ class BrowserOutputObservation(Observation):
f'Last browser action error: {self.last_browser_action_error}\n'
f'Focused element bid: {self.focused_element_bid}\n'
)
if self.screenshot_path:
ret += f'Screenshot saved to: {self.screenshot_path}\n'
ret += '--- Agent Observation ---\n'
ret += self.get_agent_obs_text()
return ret
@ -57,7 +60,14 @@ class BrowserOutputObservation(Observation):
"""Get a concise text that will be shown to the agent."""
if self.trigger_by_action == ActionType.BROWSE_INTERACTIVE:
text = f'[Current URL: {self.url}]\n'
text += f'[Focused element bid: {self.focused_element_bid}]\n\n'
text += f'[Focused element bid: {self.focused_element_bid}]\n'
# Add screenshot path information if available
if self.screenshot_path:
text += f'[Screenshot saved to: {self.screenshot_path}]\n'
text += '\n'
if self.error:
text += (
'================ BEGIN error message ===============\n'
@ -85,6 +95,7 @@ class BrowserOutputObservation(Observation):
elif self.trigger_by_action == ActionType.BROWSE:
text = f'[Current URL: {self.url}]\n'
if self.error:
text += (
'================ BEGIN error message ===============\n'

View File

@ -602,7 +602,7 @@ class ActionExecutor:
'Browser functionality is not supported on Windows.'
)
await self._ensure_browser_ready()
return await browse(action, self.browser)
return await browse(action, self.browser, self.initial_cwd)
async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation:
if self.browser is None:
@ -610,7 +610,7 @@ class ActionExecutor:
'Browser functionality is not supported on Windows.'
)
await self._ensure_browser_ready()
return await browse(action, self.browser)
return await browse(action, self.browser, self.initial_cwd)
def close(self):
self.memory_monitor.stop_monitoring()

View File

@ -0,0 +1,31 @@
import io
import base64
from PIL import Image
import numpy as np
def image_to_png_base64_url(
    image: np.ndarray | Image.Image, add_data_prefix: bool = False
) -> str:
    """Encode an image as a base64 PNG string.

    Args:
        image: Either a numpy array (turned into a PIL image with
            ``Image.fromarray``) or an existing PIL image.
        add_data_prefix: When true, the result is prefixed with
            ``data:image/png;base64,`` so it forms a data URL.

    Returns:
        The base64 text of the PNG bytes, with or without the data-URL prefix.
    """
    pil_image = Image.fromarray(image) if isinstance(image, np.ndarray) else image

    # Images in an alpha-carrying mode are converted to plain RGB before saving.
    if pil_image.mode in ('RGBA', 'LA'):
        pil_image = pil_image.convert('RGB')

    png_buffer = io.BytesIO()
    pil_image.save(png_buffer, format='PNG')
    encoded = base64.b64encode(png_buffer.getvalue()).decode()

    prefix = 'data:image/png;base64,' if add_data_prefix else ''
    return f'{prefix}{encoded}'
def png_base64_url_to_image(png_base64_url: str) -> Image.Image:
    """Decode a base64 image string, optionally a data URL, into a PIL image.

    Args:
        png_base64_url: Either raw base64 text or a ``<prefix>,<base64>``
            string such as a ``data:image/png;base64,...`` URL.

    Returns:
        The image opened by PIL from the decoded bytes.
    """
    pieces = png_base64_url.split(',')
    # Exactly one comma means "<prefix>,<payload>"; any other shape is decoded
    # as-is, treating the whole input as the payload.
    payload = pieces[1] if len(pieces) == 2 else png_base64_url
    raw_bytes = base64.b64decode(payload)
    return Image.open(io.BytesIO(raw_bytes))

View File

@ -1,6 +1,4 @@
import atexit
import base64
import io
import json
import multiprocessing
import time
@ -9,20 +7,18 @@ import uuid
import browsergym.core # noqa F401 (we register the openended task as a gym environment)
import gymnasium as gym
import html2text
import numpy as np
import tenacity
from browsergym.utils.obs import flatten_dom_to_str, overlay_som
from PIL import Image
from openhands.core.exceptions import BrowserInitException
from openhands.core.logger import openhands_logger as logger
from openhands.utils.shutdown_listener import should_continue, should_exit
from openhands.utils.tenacity_stop import stop_if_should_exit
from openhands.runtime.browser.base64 import image_to_png_base64_url
BROWSER_EVAL_GET_GOAL_ACTION = 'GET_EVAL_GOAL'
BROWSER_EVAL_GET_REWARDS_ACTION = 'GET_EVAL_REWARDS'
class BrowserEnv:
def __init__(self, browsergym_eval_env: str | None = None):
self.html_text_converter = self.get_html_text_converter()
@ -165,13 +161,13 @@ class BrowserEnv:
html_str = flatten_dom_to_str(obs['dom_object'])
obs['text_content'] = self.html_text_converter.handle(html_str)
# make observation serializable
obs['set_of_marks'] = self.image_to_png_base64_url(
obs['set_of_marks'] = image_to_png_base64_url(
overlay_som(
obs['screenshot'], obs.get('extra_element_properties', {})
),
add_data_prefix=True,
)
obs['screenshot'] = self.image_to_png_base64_url(
obs['screenshot'] = image_to_png_base64_url(
obs['screenshot'], add_data_prefix=True
)
obs['active_page_index'] = obs['active_page_index'].item()
@ -226,41 +222,3 @@ class BrowserEnv:
self.browser_side.close()
except Exception as e:
logger.error(f'Encountered an error when closing browser env: {e}')
@staticmethod
def image_to_png_base64_url(
    image: np.ndarray | Image.Image, add_data_prefix: bool = False
) -> str:
    """Return the given image as base64-encoded PNG text.

    A numpy array is first wrapped with ``Image.fromarray``; a PIL image is
    used directly. With ``add_data_prefix`` set, the text is returned as a
    ``data:image/png;base64,`` URL, otherwise as bare base64.
    """
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    # Modes that carry an alpha channel are converted to RGB before encoding.
    needs_rgb = image.mode in ('RGBA', 'LA')
    if needs_rgb:
        image = image.convert('RGB')

    stream = io.BytesIO()
    image.save(stream, format='PNG')
    b64_text = base64.b64encode(stream.getvalue()).decode()

    if add_data_prefix:
        return f'data:image/png;base64,{b64_text}'
    return f'{b64_text}'
@staticmethod
def image_to_jpg_base64_url(
    image: np.ndarray | Image.Image, add_data_prefix: bool = False
) -> str:
    """Return the given image as base64-encoded JPEG text.

    A numpy array is first wrapped with ``Image.fromarray``; a PIL image is
    used directly. With ``add_data_prefix`` set, the text is returned as a
    ``data:image/jpeg;base64,`` URL, otherwise as bare base64.
    """
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    # Modes that carry an alpha channel are converted to RGB before encoding.
    needs_rgb = image.mode in ('RGBA', 'LA')
    if needs_rgb:
        image = image.convert('RGB')

    stream = io.BytesIO()
    image.save(stream, format='JPEG')
    b64_text = base64.b64encode(stream.getvalue()).decode()

    if add_data_prefix:
        return f'data:image/jpeg;base64,{b64_text}'
    return f'{b64_text}'

View File

@ -1,15 +1,23 @@
import base64
import datetime
import os
from pathlib import Path
from PIL import Image
from openhands.core.exceptions import BrowserUnavailableException
from openhands.core.schema import ActionType
from openhands.events.action import BrowseInteractiveAction, BrowseURLAction
from openhands.events.observation import BrowserOutputObservation
from openhands.runtime.browser.base64 import png_base64_url_to_image
from openhands.runtime.browser.browser_env import BrowserEnv
from openhands.utils.async_utils import call_sync_from_async
async def browse(
action: BrowseURLAction | BrowseInteractiveAction, browser: BrowserEnv | None
action: BrowseURLAction | BrowseInteractiveAction,
browser: BrowserEnv | None,
workspace_dir: str | None = None,
) -> BrowserOutputObservation:
if browser is None:
raise BrowserUnavailableException()
@ -31,10 +39,50 @@ async def browse(
try:
# obs provided by BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/env.py#L396
obs = await call_sync_from_async(browser.step, action_str)
# Save screenshot if workspace_dir is provided
screenshot_path = None
if workspace_dir is not None and obs.get('screenshot'):
# Create screenshots directory if it doesn't exist
screenshots_dir = Path(workspace_dir) / '.browser_screenshots'
screenshots_dir.mkdir(exist_ok=True)
# Generate a filename based on timestamp
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')
screenshot_filename = f'screenshot_{timestamp}.png'
screenshot_path = str(screenshots_dir / screenshot_filename)
# Direct image saving from base64 data without using PIL's Image.open
# This approach bypasses potential encoding issues that might occur when
# converting between different image representations, ensuring the raw PNG
# data from the browser is saved directly to disk.
# Extract the base64 data
base64_data = obs.get('screenshot', '')
if ',' in base64_data:
base64_data = base64_data.split(',')[1]
try:
# Decode base64 directly to binary
image_data = base64.b64decode(base64_data)
# Write binary data directly to file
with open(screenshot_path, 'wb') as f:
f.write(image_data)
# Verify the image was saved correctly by opening it
# This is just a verification step and can be removed in production
Image.open(screenshot_path).verify()
except Exception:
# If direct saving fails, fall back to the original method
image = png_base64_url_to_image(obs.get('screenshot'))
image.save(screenshot_path, format='PNG', optimize=True)
return BrowserOutputObservation(
content=obs['text_content'], # text content of the page
url=obs.get('url', ''), # URL of the page
screenshot=obs.get('screenshot', None), # base64-encoded screenshot, png
screenshot_path=screenshot_path, # path to saved screenshot file
set_of_marks=obs.get(
'set_of_marks', None
), # base64-encoded Set-of-Marks annotated screenshot, png,
@ -60,6 +108,7 @@ async def browse(
return BrowserOutputObservation(
content=str(e),
screenshot='',
screenshot_path=None,
error=True,
last_browser_action_error=str(e),
url=asked_url if action.action == ActionType.BROWSE else '',

View File

@ -117,7 +117,20 @@ def test_read_pdf_browse(temp_dir, runtime_cls, run_as_openhands):
observation_text = str(obs)
assert '[Action executed successfully.]' in observation_text
assert 'Canvas' in observation_text
assert (
'Screenshot saved to: /workspace/.browser_screenshots/screenshot_'
in observation_text
)
# Check the /workspace/.browser_screenshots folder
action_cmd = CmdRunAction(command='ls /workspace/.browser_screenshots')
logger.info(action_cmd, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_cmd)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert isinstance(obs, CmdOutputObservation)
assert obs.exit_code == 0
assert 'screenshot_' in obs.content
assert '.png' in obs.content
finally:
_close_test_runtime(runtime)
@ -169,6 +182,19 @@ def test_read_png_browse(temp_dir, runtime_cls, run_as_openhands):
observation_text = str(obs)
assert '[Action executed successfully.]' in observation_text
assert 'File Viewer - test_image.png' in observation_text
assert (
'Screenshot saved to: /workspace/.browser_screenshots/screenshot_'
in observation_text
)
# Check the /workspace/.browser_screenshots folder
action_cmd = CmdRunAction(command='ls /workspace/.browser_screenshots')
logger.info(action_cmd, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_cmd)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert isinstance(obs, CmdOutputObservation)
assert obs.exit_code == 0
assert 'screenshot_' in obs.content
assert '.png' in obs.content
finally:
_close_test_runtime(runtime)