From 3d02c0c3a3985c40ae863ee4c48e2d7b5917f3fe Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 11 May 2025 15:51:18 +0800 Subject: [PATCH] Fix issue #8372: Implement browser screenshot saving functionality (#8383) Co-authored-by: openhands Co-authored-by: Graham Neubig --- openhands/events/observation/browse.py | 13 ++++- openhands/runtime/action_execution_server.py | 4 +- openhands/runtime/browser/base64.py | 31 ++++++++++++ openhands/runtime/browser/browser_env.py | 48 ++---------------- openhands/runtime/browser/utils.py | 51 +++++++++++++++++++- tests/runtime/test_browsing.py | 26 ++++++++++ 6 files changed, 124 insertions(+), 49 deletions(-) create mode 100644 openhands/runtime/browser/base64.py diff --git a/openhands/events/observation/browse.py b/openhands/events/observation/browse.py index da268a16e8..4474cfcb66 100644 --- a/openhands/events/observation/browse.py +++ b/openhands/events/observation/browse.py @@ -14,6 +14,7 @@ class BrowserOutputObservation(Observation): url: str trigger_by_action: str screenshot: str = field(repr=False, default='') # don't show in repr + screenshot_path: str | None = field(default=None) # path to saved screenshot file set_of_marks: str = field(default='', repr=False) # don't show in repr error: bool = False observation: str = ObservationType.BROWSE @@ -49,6 +50,8 @@ class BrowserOutputObservation(Observation): f'Last browser action error: {self.last_browser_action_error}\n' f'Focused element bid: {self.focused_element_bid}\n' ) + if self.screenshot_path: + ret += f'Screenshot saved to: {self.screenshot_path}\n' ret += '--- Agent Observation ---\n' ret += self.get_agent_obs_text() return ret @@ -57,7 +60,14 @@ class BrowserOutputObservation(Observation): """Get a concise text that will be shown to the agent.""" if self.trigger_by_action == ActionType.BROWSE_INTERACTIVE: text = f'[Current URL: {self.url}]\n' - text += f'[Focused element bid: {self.focused_element_bid}]\n\n' + text += f'[Focused element bid: {self.focused_element_bid}]\n' + + # Add screenshot path information if available + if self.screenshot_path: + text += f'[Screenshot saved to: {self.screenshot_path}]\n' + + text += '\n' + if self.error: text += ( '================ BEGIN error message ===============\n' @@ -85,6 +95,7 @@ class BrowserOutputObservation(Observation): elif self.trigger_by_action == ActionType.BROWSE: text = f'[Current URL: {self.url}]\n' + if self.error: text += ( '================ BEGIN error message ===============\n' diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py index 3c82d51c09..ebd21e5634 100644 --- a/openhands/runtime/action_execution_server.py +++ b/openhands/runtime/action_execution_server.py @@ -602,7 +602,7 @@ class ActionExecutor: 'Browser functionality is not supported on Windows.' ) await self._ensure_browser_ready() - return await browse(action, self.browser) + return await browse(action, self.browser, self.initial_cwd) async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation: if self.browser is None: @@ -610,7 +610,7 @@ class ActionExecutor: 'Browser functionality is not supported on Windows.' ) await self._ensure_browser_ready() - return await browse(action, self.browser) + return await browse(action, self.browser, self.initial_cwd) def close(self): self.memory_monitor.stop_monitoring() diff --git a/openhands/runtime/browser/base64.py b/openhands/runtime/browser/base64.py new file mode 100644 index 0000000000..94890e73c8 --- /dev/null +++ b/openhands/runtime/browser/base64.py @@ -0,0 +1,31 @@ +import io +import base64 +from PIL import Image +import numpy as np + +def image_to_png_base64_url( + image: np.ndarray | Image.Image, add_data_prefix: bool = False +) -> str: + """Convert a numpy array to a base64 encoded png image url.""" + if isinstance(image, np.ndarray): + image = Image.fromarray(image) + if image.mode in ('RGBA', 'LA'): + image = image.convert('RGB') + buffered = io.BytesIO() + image.save(buffered, format='PNG') + + image_base64 = base64.b64encode(buffered.getvalue()).decode() + return ( + f'data:image/png;base64,{image_base64}' + if add_data_prefix + else f'{image_base64}' + ) + +def png_base64_url_to_image(png_base64_url: str) -> Image.Image: + """Convert a base64 encoded png image url to a PIL Image.""" + splited = png_base64_url.split(',') + if len(splited) == 2: + base64_data = splited[1] + else: + base64_data = png_base64_url + return Image.open(io.BytesIO(base64.b64decode(base64_data))) diff --git a/openhands/runtime/browser/browser_env.py b/openhands/runtime/browser/browser_env.py index 7880d52165..e7087a1458 100644 --- a/openhands/runtime/browser/browser_env.py +++ b/openhands/runtime/browser/browser_env.py @@ -1,6 +1,4 @@ import atexit -import base64 -import io import json import multiprocessing import time @@ -9,20 +7,18 @@ import uuid import browsergym.core # noqa F401 (we register the openended task as a gym environment) import gymnasium as gym import html2text -import numpy as np import tenacity from browsergym.utils.obs import flatten_dom_to_str, overlay_som -from PIL import Image from openhands.core.exceptions import BrowserInitException from openhands.core.logger import openhands_logger as logger from openhands.utils.shutdown_listener import should_continue, should_exit from openhands.utils.tenacity_stop import stop_if_should_exit +from openhands.runtime.browser.base64 import image_to_png_base64_url BROWSER_EVAL_GET_GOAL_ACTION = 'GET_EVAL_GOAL' BROWSER_EVAL_GET_REWARDS_ACTION = 'GET_EVAL_REWARDS' - class BrowserEnv: def __init__(self, browsergym_eval_env: str | None = None): self.html_text_converter = self.get_html_text_converter() @@ -165,13 +161,13 @@ class BrowserEnv: html_str = flatten_dom_to_str(obs['dom_object']) obs['text_content'] = self.html_text_converter.handle(html_str) # make observation serializable - obs['set_of_marks'] = self.image_to_png_base64_url( + obs['set_of_marks'] = image_to_png_base64_url( overlay_som( obs['screenshot'], obs.get('extra_element_properties', {}) ), add_data_prefix=True, ) - obs['screenshot'] = self.image_to_png_base64_url( + obs['screenshot'] = image_to_png_base64_url( obs['screenshot'], add_data_prefix=True ) obs['active_page_index'] = obs['active_page_index'].item() @@ -226,41 +222,3 @@ class BrowserEnv: self.browser_side.close() except Exception as e: logger.error(f'Encountered an error when closing browser env: {e}') - - @staticmethod - def image_to_png_base64_url( - image: np.ndarray | Image.Image, add_data_prefix: bool = False - ) -> str: - """Convert a numpy array to a base64 encoded png image url.""" - if isinstance(image, np.ndarray): - image = Image.fromarray(image) - if image.mode in ('RGBA', 'LA'): - image = image.convert('RGB') - buffered = io.BytesIO() - image.save(buffered, format='PNG') - - image_base64 = base64.b64encode(buffered.getvalue()).decode() - return ( - f'data:image/png;base64,{image_base64}' - if add_data_prefix - else f'{image_base64}' - ) - - @staticmethod - def image_to_jpg_base64_url( - image: np.ndarray | Image.Image, add_data_prefix: bool = False - ) -> str: - """Convert a numpy array to a base64 encoded jpeg image url.""" - if isinstance(image, np.ndarray): - image = Image.fromarray(image) - if image.mode in ('RGBA', 'LA'): - image = image.convert('RGB') - buffered = io.BytesIO() - image.save(buffered, format='JPEG') - - image_base64 = base64.b64encode(buffered.getvalue()).decode() - return ( - f'data:image/jpeg;base64,{image_base64}' - if add_data_prefix - else f'{image_base64}' - ) diff --git a/openhands/runtime/browser/utils.py b/openhands/runtime/browser/utils.py index b029ac0841..ca9fe84143 100644 --- a/openhands/runtime/browser/utils.py +++ b/openhands/runtime/browser/utils.py @@ -1,15 +1,23 @@ +import base64 +import datetime import os +from pathlib import Path + +from PIL import Image from openhands.core.exceptions import BrowserUnavailableException from openhands.core.schema import ActionType from openhands.events.action import BrowseInteractiveAction, BrowseURLAction from openhands.events.observation import BrowserOutputObservation +from openhands.runtime.browser.base64 import png_base64_url_to_image from openhands.runtime.browser.browser_env import BrowserEnv from openhands.utils.async_utils import call_sync_from_async async def browse( - action: BrowseURLAction | BrowseInteractiveAction, browser: BrowserEnv | None + action: BrowseURLAction | BrowseInteractiveAction, + browser: BrowserEnv | None, + workspace_dir: str | None = None, ) -> BrowserOutputObservation: if browser is None: raise BrowserUnavailableException() @@ -31,10 +39,50 @@ async def browse( try: # obs provided by BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/env.py#L396 obs = await call_sync_from_async(browser.step, action_str) + + # Save screenshot if workspace_dir is provided + screenshot_path = None + if workspace_dir is not None and obs.get('screenshot'): + # Create screenshots directory if it doesn't exist + screenshots_dir = Path(workspace_dir) / '.browser_screenshots' + screenshots_dir.mkdir(exist_ok=True) + + # Generate a filename based on timestamp + timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f') + screenshot_filename = f'screenshot_{timestamp}.png' + screenshot_path = str(screenshots_dir / screenshot_filename) + + # Direct image saving from base64 data without using PIL's Image.open + # This approach bypasses potential encoding issues that might occur when + # converting between different image representations, ensuring the raw PNG + # data from the browser is saved directly to disk. + + # Extract the base64 data + base64_data = obs.get('screenshot', '') + if ',' in base64_data: + base64_data = base64_data.split(',')[1] + + try: + # Decode base64 directly to binary + image_data = base64.b64decode(base64_data) + + # Write binary data directly to file + with open(screenshot_path, 'wb') as f: + f.write(image_data) + + # Verify the image was saved correctly by opening it + # This is just a verification step and can be removed in production + Image.open(screenshot_path).verify() + except Exception: + # If direct saving fails, fall back to the original method + image = png_base64_url_to_image(obs.get('screenshot')) + image.save(screenshot_path, format='PNG', optimize=True) + return BrowserOutputObservation( content=obs['text_content'], # text content of the page url=obs.get('url', ''), # URL of the page screenshot=obs.get('screenshot', None), # base64-encoded screenshot, png + screenshot_path=screenshot_path, # path to saved screenshot file set_of_marks=obs.get( 'set_of_marks', None ), # base64-encoded Set-of-Marks annotated screenshot, png, @@ -60,6 +108,7 @@ async def browse( return BrowserOutputObservation( content=str(e), screenshot='', + screenshot_path=None, error=True, last_browser_action_error=str(e), url=asked_url if action.action == ActionType.BROWSE else '', diff --git a/tests/runtime/test_browsing.py b/tests/runtime/test_browsing.py index b8937f5c9f..79b46b93cb 100644 --- a/tests/runtime/test_browsing.py +++ b/tests/runtime/test_browsing.py @@ -117,7 +117,20 @@ def test_read_pdf_browse(temp_dir, runtime_cls, run_as_openhands): observation_text = str(obs) assert '[Action executed successfully.]' in observation_text assert 'Canvas' in observation_text + assert ( + 'Screenshot saved to: /workspace/.browser_screenshots/screenshot_' + in observation_text + ) + # Check the /workspace/.browser_screenshots folder + action_cmd = CmdRunAction(command='ls /workspace/.browser_screenshots') + logger.info(action_cmd, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action_cmd) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert isinstance(obs, CmdOutputObservation) + assert obs.exit_code == 0 + assert 'screenshot_' in obs.content + assert '.png' in obs.content finally: _close_test_runtime(runtime) @@ -169,6 +182,19 @@ def test_read_png_browse(temp_dir, runtime_cls, run_as_openhands): observation_text = str(obs) assert '[Action executed successfully.]' in observation_text assert 'File Viewer - test_image.png' in observation_text + assert ( + 'Screenshot saved to: /workspace/.browser_screenshots/screenshot_' + in observation_text + ) + # Check the /workspace/.browser_screenshots folder + action_cmd = CmdRunAction(command='ls /workspace/.browser_screenshots') + logger.info(action_cmd, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action_cmd) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert isinstance(obs, CmdOutputObservation) + assert obs.exit_code == 0 + assert 'screenshot_' in obs.content + assert '.png' in obs.content finally: _close_test_runtime(runtime)