Fix issue #8372: Implement browser screenshot saving functionality (#8383)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Graham Neubig <neubig@gmail.com>
2025-12-26 05:48:36 +08:00 · 2025-05-11 15:51:18 +08:00 · 2025-05-11 15:51:18 +08:00 · 3d02c0c3a3
commit 3d02c0c3a3
parent a17c57d82e
6 changed files with 124 additions and 49 deletions
--- a/openhands/events/observation/browse.py
+++ b/openhands/events/observation/browse.py
@ -14,6 +14,7 @@ class BrowserOutputObservation(Observation):
    url: str
    trigger_by_action: str
    screenshot: str = field(repr=False, default='')  # don't show in repr
+    screenshot_path: str | None = field(default=None)  # path to saved screenshot file
    set_of_marks: str = field(default='', repr=False)  # don't show in repr
    error: bool = False
    observation: str = ObservationType.BROWSE
@ -49,6 +50,8 @@ class BrowserOutputObservation(Observation):
            f'Last browser action error: {self.last_browser_action_error}\n'
            f'Focused element bid: {self.focused_element_bid}\n'
        )
+        if self.screenshot_path:
+            ret += f'Screenshot saved to: {self.screenshot_path}\n'
        ret += '--- Agent Observation ---\n'
        ret += self.get_agent_obs_text()
        return ret
@ -57,7 +60,14 @@ class BrowserOutputObservation(Observation):
        """Get a concise text that will be shown to the agent."""
        if self.trigger_by_action == ActionType.BROWSE_INTERACTIVE:
            text = f'[Current URL: {self.url}]\n'
-            text += f'[Focused element bid: {self.focused_element_bid}]\n\n'
+            text += f'[Focused element bid: {self.focused_element_bid}]\n'
+
+            # Add screenshot path information if available
+            if self.screenshot_path:
+                text += f'[Screenshot saved to: {self.screenshot_path}]\n'
+
+            text += '\n'
+
            if self.error:
                text += (
                    '================ BEGIN error message ===============\n'
@ -85,6 +95,7 @@ class BrowserOutputObservation(Observation):

        elif self.trigger_by_action == ActionType.BROWSE:
            text = f'[Current URL: {self.url}]\n'
+
            if self.error:
                text += (
                    '================ BEGIN error message ===============\n'
--- a/openhands/runtime/action_execution_server.py
+++ b/openhands/runtime/action_execution_server.py
@ -602,7 +602,7 @@ class ActionExecutor:
                'Browser functionality is not supported on Windows.'
            )
        await self._ensure_browser_ready()
-        return await browse(action, self.browser)
+        return await browse(action, self.browser, self.initial_cwd)

    async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation:
        if self.browser is None:
@ -610,7 +610,7 @@ class ActionExecutor:
                'Browser functionality is not supported on Windows.'
            )
        await self._ensure_browser_ready()
-        return await browse(action, self.browser)
+        return await browse(action, self.browser, self.initial_cwd)

    def close(self):
        self.memory_monitor.stop_monitoring()
--- a/openhands/runtime/browser/base64.py
+++ b/openhands/runtime/browser/base64.py
@ -0,0 +1,31 @@
+import io
+import base64
+from PIL import Image
+import numpy as np
+
+def image_to_png_base64_url(
+    image: np.ndarray | Image.Image, add_data_prefix: bool = False
+) -> str:
+    """Convert a numpy array or PIL image to a base64-encoded PNG string or data URL."""
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+    if image.mode in ('RGBA', 'LA'):
+        image = image.convert('RGB')
+    buffered = io.BytesIO()
+    image.save(buffered, format='PNG')
+
+    image_base64 = base64.b64encode(buffered.getvalue()).decode()
+    return (
+        f'data:image/png;base64,{image_base64}'
+        if add_data_prefix
+        else f'{image_base64}'
+    )
+
+def png_base64_url_to_image(png_base64_url: str) -> Image.Image:
+    """Convert a base64 encoded png image url to a PIL Image."""
+    splited = png_base64_url.split(',')
+    if len(splited) == 2:
+        base64_data = splited[1]
+    else:
+        base64_data = png_base64_url
+    return Image.open(io.BytesIO(base64.b64decode(base64_data)))
--- a/openhands/runtime/browser/browser_env.py
+++ b/openhands/runtime/browser/browser_env.py
@ -1,6 +1,4 @@
 import atexit
-import base64
-import io
 import json
 import multiprocessing
 import time
@ -9,20 +7,18 @@ import uuid
 import browsergym.core  # noqa F401 (we register the openended task as a gym environment)
 import gymnasium as gym
 import html2text
-import numpy as np
 import tenacity
 from browsergym.utils.obs import flatten_dom_to_str, overlay_som
-from PIL import Image

 from openhands.core.exceptions import BrowserInitException
 from openhands.core.logger import openhands_logger as logger
 from openhands.utils.shutdown_listener import should_continue, should_exit
 from openhands.utils.tenacity_stop import stop_if_should_exit
+from openhands.runtime.browser.base64 import image_to_png_base64_url

 BROWSER_EVAL_GET_GOAL_ACTION = 'GET_EVAL_GOAL'
 BROWSER_EVAL_GET_REWARDS_ACTION = 'GET_EVAL_REWARDS'

-
 class BrowserEnv:
    def __init__(self, browsergym_eval_env: str | None = None):
        self.html_text_converter = self.get_html_text_converter()
@ -165,13 +161,13 @@ class BrowserEnv:
                    html_str = flatten_dom_to_str(obs['dom_object'])
                    obs['text_content'] = self.html_text_converter.handle(html_str)
                    # make observation serializable
-                    obs['set_of_marks'] = self.image_to_png_base64_url(
+                    obs['set_of_marks'] = image_to_png_base64_url(
                        overlay_som(
                            obs['screenshot'], obs.get('extra_element_properties', {})
                        ),
                        add_data_prefix=True,
                    )
-                    obs['screenshot'] = self.image_to_png_base64_url(
+                    obs['screenshot'] = image_to_png_base64_url(
                        obs['screenshot'], add_data_prefix=True
                    )
                    obs['active_page_index'] = obs['active_page_index'].item()
@ -226,41 +222,3 @@ class BrowserEnv:
            self.browser_side.close()
        except Exception as e:
            logger.error(f'Encountered an error when closing browser env: {e}')
-
-    @staticmethod
-    def image_to_png_base64_url(
-        image: np.ndarray | Image.Image, add_data_prefix: bool = False
-    ) -> str:
-        """Convert a numpy array to a base64 encoded png image url."""
-        if isinstance(image, np.ndarray):
-            image = Image.fromarray(image)
-        if image.mode in ('RGBA', 'LA'):
-            image = image.convert('RGB')
-        buffered = io.BytesIO()
-        image.save(buffered, format='PNG')
-
-        image_base64 = base64.b64encode(buffered.getvalue()).decode()
-        return (
-            f'data:image/png;base64,{image_base64}'
-            if add_data_prefix
-            else f'{image_base64}'
-        )
-
-    @staticmethod
-    def image_to_jpg_base64_url(
-        image: np.ndarray | Image.Image, add_data_prefix: bool = False
-    ) -> str:
-        """Convert a numpy array to a base64 encoded jpeg image url."""
-        if isinstance(image, np.ndarray):
-            image = Image.fromarray(image)
-        if image.mode in ('RGBA', 'LA'):
-            image = image.convert('RGB')
-        buffered = io.BytesIO()
-        image.save(buffered, format='JPEG')
-
-        image_base64 = base64.b64encode(buffered.getvalue()).decode()
-        return (
-            f'data:image/jpeg;base64,{image_base64}'
-            if add_data_prefix
-            else f'{image_base64}'
-        )
--- a/openhands/runtime/browser/utils.py
+++ b/openhands/runtime/browser/utils.py
@ -1,15 +1,23 @@
+import base64
+import datetime
 import os
+from pathlib import Path
+
+from PIL import Image

 from openhands.core.exceptions import BrowserUnavailableException
 from openhands.core.schema import ActionType
 from openhands.events.action import BrowseInteractiveAction, BrowseURLAction
 from openhands.events.observation import BrowserOutputObservation
+from openhands.runtime.browser.base64 import png_base64_url_to_image
 from openhands.runtime.browser.browser_env import BrowserEnv
 from openhands.utils.async_utils import call_sync_from_async


 async def browse(
-    action: BrowseURLAction | BrowseInteractiveAction, browser: BrowserEnv | None
+    action: BrowseURLAction | BrowseInteractiveAction,
+    browser: BrowserEnv | None,
+    workspace_dir: str | None = None,
 ) -> BrowserOutputObservation:
    if browser is None:
        raise BrowserUnavailableException()
@ -31,10 +39,50 @@ async def browse(
    try:
        # obs provided by BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/env.py#L396
        obs = await call_sync_from_async(browser.step, action_str)
+
+        # Save screenshot if workspace_dir is provided
+        screenshot_path = None
+        if workspace_dir is not None and obs.get('screenshot'):
+            # Create screenshots directory if it doesn't exist
+            screenshots_dir = Path(workspace_dir) / '.browser_screenshots'
+            screenshots_dir.mkdir(exist_ok=True)
+
+            # Generate a filename based on timestamp
+            timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')
+            screenshot_filename = f'screenshot_{timestamp}.png'
+            screenshot_path = str(screenshots_dir / screenshot_filename)
+
+            # Write the decoded base64 bytes straight to disk instead of re-encoding
+            # the image through PIL. This avoids any changes that a decode/re-encode
+            # round trip could introduce, so the raw PNG data from the browser is
+            # saved as-is. PIL is used below only to verify the file and as a fallback.
+
+            # Extract the base64 data
+            base64_data = obs.get('screenshot', '')
+            if ',' in base64_data:
+                base64_data = base64_data.split(',')[1]
+
+            try:
+                # Decode base64 directly to binary
+                image_data = base64.b64decode(base64_data)
+
+                # Write binary data directly to file
+                with open(screenshot_path, 'wb') as f:
+                    f.write(image_data)
+
+                # Verify the image was saved correctly by opening it
+                # This is just a verification step and can be removed in production
+                Image.open(screenshot_path).verify()
+            except Exception:
+                # If the direct write or verification fails, re-save the image via PIL
+                image = png_base64_url_to_image(obs.get('screenshot'))
+                image.save(screenshot_path, format='PNG', optimize=True)
+
        return BrowserOutputObservation(
            content=obs['text_content'],  # text content of the page
            url=obs.get('url', ''),  # URL of the page
            screenshot=obs.get('screenshot', None),  # base64-encoded screenshot, png
+            screenshot_path=screenshot_path,  # path to saved screenshot file
            set_of_marks=obs.get(
                'set_of_marks', None
            ),  # base64-encoded Set-of-Marks annotated screenshot, png,
@ -60,6 +108,7 @@ async def browse(
        return BrowserOutputObservation(
            content=str(e),
            screenshot='',
+            screenshot_path=None,
            error=True,
            last_browser_action_error=str(e),
            url=asked_url if action.action == ActionType.BROWSE else '',
--- a/tests/runtime/test_browsing.py
+++ b/tests/runtime/test_browsing.py
@ -117,7 +117,20 @@ def test_read_pdf_browse(temp_dir, runtime_cls, run_as_openhands):
        observation_text = str(obs)
        assert '[Action executed successfully.]' in observation_text
        assert 'Canvas' in observation_text
+        assert (
+            'Screenshot saved to: /workspace/.browser_screenshots/screenshot_'
+            in observation_text
+        )

+        # Check the /workspace/.browser_screenshots folder
+        action_cmd = CmdRunAction(command='ls /workspace/.browser_screenshots')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert 'screenshot_' in obs.content
+        assert '.png' in obs.content
    finally:
        _close_test_runtime(runtime)

@ -169,6 +182,19 @@ def test_read_png_browse(temp_dir, runtime_cls, run_as_openhands):
        observation_text = str(obs)
        assert '[Action executed successfully.]' in observation_text
        assert 'File Viewer - test_image.png' in observation_text
+        assert (
+            'Screenshot saved to: /workspace/.browser_screenshots/screenshot_'
+            in observation_text
+        )

+        # Check the /workspace/.browser_screenshots folder
+        action_cmd = CmdRunAction(command='ls /workspace/.browser_screenshots')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert 'screenshot_' in obs.content
+        assert '.png' in obs.content
    finally:
        _close_test_runtime(runtime)