[OH-Versa] Add remaining browsing & GAIA eval improvement (#9015)

Co-authored-by: openhands <openhands@all-hands.dev> Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
2025-12-26 05:48:36 +08:00 · 2025-06-25 12:36:15 +07:00 · 2025-06-25 12:36:15 +07:00 · dfa54673d2
commit dfa54673d2
parent 76914e3c26
16 changed files with 383 additions and 29 deletions
--- a/evaluation/benchmarks/gaia/run_infer.py
+++ b/evaluation/benchmarks/gaia/run_infer.py
@ -3,13 +3,20 @@ import copy
 import functools
 import os
 import re
+import shutil
+import zipfile

 import huggingface_hub
 import pandas as pd
 from datasets import load_dataset
+from PIL import Image
 from pydantic import SecretStr

 from evaluation.benchmarks.gaia.scorer import question_scorer
+from evaluation.benchmarks.gaia.utils import (
+    image_to_jpg_base64_url,
+    image_to_png_base64_url,
+)
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
@ -97,27 +104,44 @@ def initialize_runtime(
    if instance['file_name'] != '':
        # if this question comes with a file, we need to save it to the workspace
        assert metadata.data_split is not None
+        extension_name = instance['file_name'].split('.')[-1]
        src_file = os.path.join(
            DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
        )
        assert os.path.exists(src_file)
-        dest_file = os.path.join('/workspace', instance['file_name'])
-        runtime.copy_to(src_file, dest_file)
+        if extension_name == 'zip':
+            temp_dir = os.path.join(
+                DATASET_CACHE_DIR, '2023', metadata.data_split, 'tmp_file'
+            )
+            os.makedirs(temp_dir, exist_ok=True)
+            with zipfile.ZipFile(src_file, 'r') as zip_ref:
+                zip_ref.extractall(temp_dir)
+            for root, dirs, files in os.walk(temp_dir):
+                for file in files:
+                    dest_file = '/workspace'
+                    runtime.copy_to(os.path.join(root, file), dest_file)
+            shutil.rmtree(temp_dir)
+        elif extension_name not in ['jpg', 'png']:
+            dest_file = '/workspace'
+            runtime.copy_to(src_file, dest_file)

-        # rename to file.extension_name
-        extension_name = instance['file_name'].split('.')[-1]
-        action = CmdRunAction(
-            command=f'mv /workspace/{instance["file_name"]} /workspace/file.{extension_name}'
-        )
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        assert obs.exit_code == 0
+            # rename to file.extension_name
+            action = CmdRunAction(
+                command=f'mv /workspace/{instance["file_name"]} /workspace/file.{extension_name}'
+            )
+            logger.info(action, extra={'msg_type': 'ACTION'})
+            obs = runtime.run_action(action)
+            assert obs.exit_code == 0

    action = CmdRunAction(command='cd /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    assert obs.exit_code == 0

+    action = CmdRunAction(
+        command='apt-get update && apt-get install -y ffmpeg && apt-get install -y ffprobe'
+    )
+    runtime.run_action(action)
    logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')


@ -151,8 +175,31 @@ Here is the task:
        task_question=instance['Question'],
    )
    logger.info(f'Instruction: {instruction}')
+    image_urls = []
    if dest_file:
-        instruction += f'\n\nThe mentioned file is provided in the workspace at: {dest_file.split("/")[-1]}'
+        if extension_name not in ['jpg', 'png', 'zip']:
+            instruction += f'To solve this task you will have to use the attached file provided in the workspace at location: {dest_file}\n\n'
+        elif extension_name == 'zip':
+            filenames = []
+            src_file = os.path.join(
+                DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
+            )
+            with zipfile.ZipFile(src_file, 'r') as zip_ref:
+                filenames = zip_ref.namelist()
+
+            filenames = [f'/workspace/{file}' for file in filenames]
+            filenames = ', '.join(filenames)
+            instruction += f'To solve this task you will have to use the attached files provided in the workspace at locations: {filenames}\n\n'
+        else:  # Image files: jpg, png
+            src_file = os.path.join(
+                DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
+            )
+            instruction += 'Image: To solve this task you will have to use the image shown below.\n\n'
+            image = Image.open(src_file)
+            if extension_name == 'jpg':
+                image_urls.append(image_to_jpg_base64_url(image))
+            else:
+                image_urls.append(image_to_png_base64_url(image))

    instruction += """IMPORTANT: When seeking information from a website, REFRAIN from arbitrary URL navigation. You should utilize the designated search engine tool with precise keywords to obtain relevant URLs or use the specific website's search interface. DO NOT navigate directly to specific URLs as they may not exist.\n\nFor example: if you want to search for a research paper on Arxiv, either use the search engine tool with specific keywords or navigate to arxiv.org and then use its interface.\n"""
    instruction += 'IMPORTANT: You should NEVER ask for Human Help.\n'
@ -174,7 +221,9 @@ Here is the task:
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            initial_user_action=MessageAction(content=instruction),
+            initial_user_action=MessageAction(
+                content=instruction, image_urls=image_urls
+            ),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                metadata.agent_class
--- a/evaluation/benchmarks/gaia/utils.py
+++ b/evaluation/benchmarks/gaia/utils.py
@ -0,0 +1,43 @@
+import base64
+import io
+
+import numpy as np
+from PIL import Image
+
+
+def image_to_png_base64_url(
+    image: np.ndarray | Image.Image, add_data_prefix: bool = True
+):
+    """Encode a numpy array or PIL image as base64 PNG; returns a data URL unless add_data_prefix is False."""
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+    if image.mode in ('RGBA', 'LA'):  # images with an alpha channel are converted to plain RGB
+        image = image.convert('RGB')
+    buffered = io.BytesIO()  # encode in memory; nothing is written to disk
+    image.save(buffered, format='PNG')

+    image_base64 = base64.b64encode(buffered.getvalue()).decode()
+    return (
+        f'data:image/png;base64,{image_base64}'
+        if add_data_prefix
+        else f'{image_base64}'
+    )
+
+
+def image_to_jpg_base64_url(
+    image: np.ndarray | Image.Image, add_data_prefix: bool = True
+):
+    """Encode a numpy array or PIL image as base64 JPEG; returns a data URL unless add_data_prefix is False."""
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+    if image.mode in ('RGBA', 'LA'):  # images with an alpha channel are converted to plain RGB
+        image = image.convert('RGB')
+    buffered = io.BytesIO()  # encode in memory; nothing is written to disk
+    image.save(buffered, format='JPEG')

+    image_base64 = base64.b64encode(buffered.getvalue()).decode()
+    return (
+        f'data:image/jpeg;base64,{image_base64}'
+        if add_data_prefix
+        else f'{image_base64}'
+    )
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@ -109,7 +109,7 @@ def codeact_user_response(
 ) -> str:
    encaps_str = (
        (
-            'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
+            'Your final answer MUST be encapsulated within <solution> and </solution>.\n'
            'For example: The answer to the question is <solution> 42 </solution>.\n'
        )
        if encapsulate_solution
@ -117,7 +117,7 @@ def codeact_user_response(
    )
    msg = (
        'Please continue working on the task on whatever approach you think is suitable.\n'
-        'If you think you have solved the task, please first send your answer to user through message and then finish the interaction.\n'
+        'When you think you have solved the question, please use the finish tool and include your final answer in the message parameter of the finish tool.\n'
        f'{encaps_str}'
        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
    )
--- a/openhands/core/schema/observation.py
+++ b/openhands/core/schema/observation.py
@ -52,3 +52,6 @@ class ObservationType(str, Enum):

    MCP = 'mcp'
    """Result of a MCP Server operation"""
+
+    DOWNLOAD = 'download'
+    """Result of downloading/opening a file via the browser"""
--- a/openhands/events/observation/__init__.py
+++ b/openhands/events/observation/__init__.py
@ -16,6 +16,7 @@ from openhands.events.observation.empty import (
    NullObservation,
 )
 from openhands.events.observation.error import ErrorObservation
+from openhands.events.observation.file_download import FileDownloadObservation
 from openhands.events.observation.files import (
    FileEditObservation,
    FileReadObservation,
@ -46,4 +47,5 @@ __all__ = [
    'RecallObservation',
    'RecallType',
    'MCPObservation',
+    'FileDownloadObservation',
 ]
--- a/openhands/events/observation/browse.py
+++ b/openhands/events/observation/browse.py
@ -32,6 +32,7 @@ class BrowserOutputObservation(Observation):
    last_browser_action: str = ''
    last_browser_action_error: str = ''
    focused_element_bid: str = ''
+    filter_visible_only: bool = False

    @property
    def message(self) -> str:
--- a/openhands/events/observation/file_download.py
+++ b/openhands/events/observation/file_download.py
@ -0,0 +1,21 @@
+from dataclasses import dataclass
+
+from openhands.core.schema import ObservationType
+from openhands.events.observation.observation import Observation
+
+
+@dataclass
+class FileDownloadObservation(Observation):  # observation that carries the path of a downloaded file
+    file_path: str  # location of the downloaded file; echoed by message and __str__
+    observation: str = ObservationType.DOWNLOAD  # observation-type tag for this class

+    @property
+    def message(self) -> str:  # one-line summary naming the file location
+        return f'Downloaded the file at location: {self.file_path}'

+    def __str__(self) -> str:  # multi-line rendering: a bold header line, then the file location
+        ret = (
+            '**FileDownloadObservation**\n'
+            f'Location of downloaded file: {self.file_path}\n'
+        )
+        return ret
--- a/openhands/events/serialization/observation.py
+++ b/openhands/events/serialization/observation.py
@ -20,6 +20,7 @@ from openhands.events.observation.empty import (
    NullObservation,
 )
 from openhands.events.observation.error import ErrorObservation
+from openhands.events.observation.file_download import FileDownloadObservation
 from openhands.events.observation.files import (
    FileEditObservation,
    FileReadObservation,
@ -47,6 +48,7 @@ observations = (
    AgentThinkObservation,
    RecallObservation,
    MCPObservation,
+    FileDownloadObservation,
 )

 OBSERVATION_TYPE_TO_CLASS = {
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@ -28,6 +28,7 @@ from openhands.events.observation import (
    AgentThinkObservation,
    BrowserOutputObservation,
    CmdOutputObservation,
+    FileDownloadObservation,
    FileEditObservation,
    FileReadObservation,
    IPythonRunCellObservation,
@ -288,7 +289,12 @@ class ConversationMemory:
            role = 'user' if action.source == 'user' else 'assistant'
            content = [TextContent(text=action.content or '')]
            if vision_is_active and action.image_urls:
-                content.append(ImageContent(image_urls=action.image_urls))
+                if role == 'user':
+                    for idx, url in enumerate(action.image_urls):
+                        content.append(TextContent(text=f'Image {idx + 1}:'))
+                        content.append(ImageContent(image_urls=[url]))
+                else:
+                    content.append(ImageContent(image_urls=action.image_urls))
            if role not in ('user', 'system', 'assistant', 'tool'):
                raise ValueError(f'Invalid role: {role}')
            return [
@ -339,6 +345,7 @@ class ConversationMemory:
        - AgentDelegateObservation: Formats results from delegated agent tasks
        - ErrorObservation: Formats error messages from failed actions
        - UserRejectObservation: Formats user rejection messages
+        - FileDownloadObservation: Formats the result of a browsing action that opened/downloaded a file

        In function calling mode, observations with tool_call_metadata are stored in
        tool_call_id_to_message for later processing instead of being returned immediately.
@ -429,7 +436,7 @@ class ConversationMemory:
                and enable_som_visual_browsing
                and vision_is_active
            ):
-                text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
+                text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. However, the Accessibility tree contains information from the entire webpage.)\n'

                # Determine which image to use and validate it
                image_url = None
@ -492,6 +499,9 @@ class ConversationMemory:
        elif isinstance(obs, AgentCondensationObservation):
            text = truncate_content(obs.content, max_message_chars)
            message = Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, FileDownloadObservation):
+            text = truncate_content(obs.content, max_message_chars)
+            message = Message(role='user', content=[TextContent(text=text)])
        elif (
            isinstance(obs, RecallObservation)
            and self.agent_config.enable_prompt_extensions
--- a/openhands/runtime/action_execution_server.py
+++ b/openhands/runtime/action_execution_server.py
@ -20,6 +20,7 @@ from contextlib import asynccontextmanager
 from pathlib import Path
 from zipfile import ZipFile

+import puremagic
 from binaryornot.check import is_binary
 from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile
 from fastapi.exceptions import RequestValidationError
@ -51,6 +52,7 @@ from openhands.events.event import FileEditSource, FileReadSource
 from openhands.events.observation import (
    CmdOutputObservation,
    ErrorObservation,
+    FileDownloadObservation,
    FileEditObservation,
    FileReadObservation,
    FileWriteObservation,
@ -193,6 +195,8 @@ class ActionExecutor:
        self.start_time = time.time()
        self.last_execution_time = self.start_time
        self._initialized = False
+        self.downloaded_files: list[str] = []
+        self.downloads_directory = '/workspace/.downloads'

        self.max_memory_gb: int | None = None
        if _override_max_memory_gb := os.environ.get('RUNTIME_MAX_MEMORY_GB', None):
@ -603,7 +607,45 @@ class ActionExecutor:
                'Browser functionality is not supported on Windows.'
            )
        await self._ensure_browser_ready()
-        return await browse(action, self.browser, self.initial_cwd)
+        browser_observation = await browse(action, self.browser, self.initial_cwd)
+        if not browser_observation.error:
+            return browser_observation
+        else:
+            curr_files = os.listdir(self.downloads_directory)
+            new_download = False
+            for file in curr_files:
+                if file not in self.downloaded_files:
+                    new_download = True
+                    self.downloaded_files.append(file)
+                    break  # FIXME: assuming only one file will be downloaded for simplicity
+
+            if not new_download:
+                return browser_observation
+            else:
+                # A new file is downloaded in self.downloads_directory, shift file to /workspace
+                src_path = os.path.join(
+                    self.downloads_directory, self.downloaded_files[-1]
+                )
+                # Guess extension of file using puremagic and add it to tgt_path file name
+                file_ext = ''
+                try:
+                    guesses = puremagic.magic_file(src_path)
+                    if len(guesses) > 0:
+                        ext = guesses[0].extension.strip()
+                        if len(ext) > 0:
+                            file_ext = ext
+                except Exception as _:
+                    pass
+
+                tgt_path = os.path.join(
+                    '/workspace', f'file_{len(self.downloaded_files)}{file_ext}'
+                )
+                shutil.copy(src_path, tgt_path)
+                file_download_obs = FileDownloadObservation(
+                    content=f'Execution of the previous action {action.browser_actions} resulted in a file download. The downloaded file is saved at location: {tgt_path}',
+                    file_path=tgt_path,
+                )
+                return file_download_obs

    def close(self):
        self.memory_monitor.stop_monitoring()
--- a/openhands/runtime/browser/browser_env.py
+++ b/openhands/runtime/browser/browser_env.py
@ -94,6 +94,9 @@ class BrowserEnv:
                headless=True,
                disable_env_checker=True,
                tags_to_mark='all',
+                timeout=100000,
+                pw_context_kwargs={'accept_downloads': True},
+                pw_chromium_kwargs={'downloads_path': '/workspace/.downloads/'},
            )
        obs, info = env.reset()

@ -105,6 +108,7 @@ class BrowserEnv:
        if self.eval_mode:
            self.eval_goal = obs['goal']
            if 'goal_object' in obs:
+                obs['goal_object'] = list(obs['goal_object'])
                if len(obs['goal_object']) > 0:
                    self.eval_goal = obs['goal_object'][0]['text']
                for message in obs['goal_object']:
@ -182,7 +186,7 @@ class BrowserEnv:
                    pass
                return

-    def step(self, action_str: str, timeout: float = 100) -> dict:
+    def step(self, action_str: str, timeout: float = 120) -> dict:
        """Execute an action in the browser environment and return the observation."""
        unique_request_id = str(uuid.uuid4())
        self.agent_side.send((unique_request_id, {'action': action_str}))
--- a/openhands/runtime/browser/utils.py
+++ b/openhands/runtime/browser/utils.py
@ -59,13 +59,22 @@ def get_agent_obs_text(obs: BrowserOutputObservation) -> str:
            cur_axtree_txt = get_axtree_str(
                obs.axtree_object,
                obs.extra_element_properties,
-                filter_visible_only=False,
-            )
-            text += (
-                f'============== BEGIN accessibility tree ==============\n'
-                f'{cur_axtree_txt}\n'
-                f'============== END accessibility tree ==============\n'
+                filter_visible_only=obs.filter_visible_only,
            )
+            if not obs.filter_visible_only:
+                text += (
+                    f'Accessibility tree of the COMPLETE webpage:\nNote: [bid] is the unique alpha-numeric identifier at the beginning of lines for each element in the AXTree. Always use bid to refer to elements in your actions.\n'
+                    f'============== BEGIN accessibility tree ==============\n'
+                    f'{cur_axtree_txt}\n'
+                    f'============== END accessibility tree ==============\n'
+                )
+            else:
+                text += (
+                    f'Accessibility tree of the VISIBLE portion of the webpage (accessibility tree of complete webpage is too large and you may need to scroll to view remaining portion of the webpage):\nNote: [bid] is the unique alpha-numeric identifier at the beginning of lines for each element in the AXTree. Always use bid to refer to elements in your actions.\n'
+                    f'============== BEGIN accessibility tree ==============\n'
+                    f'{cur_axtree_txt}\n'
+                    f'============== END accessibility tree ==============\n'
+                )
        except Exception as e:
            text += f'\n[Error encountered when processing the accessibility tree: {e}]'
        return text
--- a/openhands/runtime/utils/runtime_templates/Dockerfile.j2
+++ b/openhands/runtime/utils/runtime_templates/Dockerfile.j2
@ -15,7 +15,7 @@ ENV POETRY_VIRTUALENVS_PATH=/openhands/poetry \
 # Install base system dependencies
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
-        wget curl ca-certificates sudo apt-utils git jq tmux build-essential ripgrep \
+        wget curl ca-certificates sudo apt-utils git jq tmux build-essential ripgrep ffmpeg \
        {%- if 'ubuntu' in base_image and (base_image.endswith(':latest') or base_image.endswith(':24.04')) -%}
        libgl1 \
        {%- else %}
--- a/tests/runtime/test_aci_edit.py
+++ b/tests/runtime/test_aci_edit.py
@ -59,7 +59,9 @@ def test_view_directory(temp_dir, runtime_cls, run_as_openhands):
            obs.content
            == f"""Here's the files and directories up to 2 levels deep in {config.workspace_mount_path_in_sandbox}, excluding hidden items:
 {config.workspace_mount_path_in_sandbox}/
-{config.workspace_mount_path_in_sandbox}/test.txt"""
+{config.workspace_mount_path_in_sandbox}/test.txt
+
+1 hidden files/directories in this directory are excluded. You can use 'ls -la /workspace' to see them."""  # The hidden dir is the /workspace/.downloads
        )

    finally:
--- a/tests/runtime/test_browsing.py
+++ b/tests/runtime/test_browsing.py
@ -14,6 +14,7 @@ from openhands.events.action import (
 from openhands.events.observation import (
    BrowserOutputObservation,
    CmdOutputObservation,
+    FileDownloadObservation,
 )

 # ============================================================================================================================
@ -215,3 +216,168 @@ def test_read_png_browse(temp_dir, runtime_cls, run_as_openhands):
        assert '.png' in obs.content
    finally:
        _close_test_runtime(runtime)
+
+
+@pytest.mark.skipif(
+    os.environ.get('TEST_RUNTIME') == 'cli',
+    reason='CLIRuntime does not support browsing actions',
+)
+def test_download_file(temp_dir, runtime_cls, run_as_openhands):
+    """Test that browsing to a served PDF yields a FileDownloadObservation and a renamed copy in /workspace."""
+    runtime, config = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Minimal PDF content for testing (each object dictionary is delimited by << and >>)
+        pdf_content = b"""%PDF-1.4
+        1 0 obj
+        <<
+        /Type /Catalog
+        /Pages 2 0 R
+        >>
+        endobj
+        2 0 obj
+        <<
+        /Type /Pages
+        /Kids [3 0 R]
+        /Count 1
+        >>
+        endobj
+        3 0 obj
+        <<
+        /Type /Page
+        /Parent 2 0 R
+        /MediaBox [0 0 612 792]
+        >>
+        endobj
+        xref
+        0 4
+        0000000000 65535 f
+        0000000010 00000 n
+        0000000053 00000 n
+        0000000125 00000 n
+        trailer
+        <<
+        /Size 4
+        /Root 1 0 R
+        >>
+        startxref
+        212
+        %%EOF"""

+        test_file_name = 'test_download.pdf'
+        test_file_path = os.path.join(temp_dir, test_file_name)
+        with open(test_file_path, 'wb') as f:
+            f.write(pdf_content)

+        # Copy the file to the sandbox
+        sandbox_dir = config.workspace_mount_path_in_sandbox
+        runtime.copy_to(test_file_path, sandbox_dir)

+        # Create a simple HTML page with a download link
+        html_content = f"""
+        <!DOCTYPE html>
+        <html>
+        <head>
+            <title>Download Test</title>
+        </head>
+        <body>
+            <h1>Download Test Page</h1>
+            <p>Click the link below to download the test file:</p>
+            <a href="/{test_file_name}" download="{test_file_name}" id="download-link">Download Test File</a>
+        </body>
+        </html>
+        """

+        html_file_path = os.path.join(temp_dir, 'download_test.html')
+        with open(html_file_path, 'w') as f:
+            f.write(html_content)

+        # Copy the HTML file to the sandbox
+        runtime.copy_to(html_file_path, sandbox_dir)

+        # Verify the files exist in the sandbox
+        action_cmd = CmdRunAction(command='ls -alh')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert test_file_name in obs.content
+        assert 'download_test.html' in obs.content

+        # Ensure downloads directory exists
+        action_cmd = CmdRunAction(command='mkdir -p /workspace/.downloads')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0

+        # Start HTTP server
+        action_cmd = CmdRunAction(
+            command='python3 -m http.server 8000 > server.log 2>&1 &'
+        )
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0

+        # Wait for server to start
+        action_cmd = CmdRunAction(command='sleep 2')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

+        # Browse to the HTML page
+        action_browse = BrowseURLAction(url='http://localhost:8000/download_test.html')
+        logger.info(action_browse, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_browse)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

+        # Verify the browser observation
+        assert isinstance(obs, BrowserOutputObservation)
+        assert 'http://localhost:8000/download_test.html' in obs.url
+        assert not obs.error
+        assert 'Download Test Page' in obs.content

+        # Go to the PDF file url directly - this should trigger download
+        file_url = f'http://localhost:8000/{test_file_name}'
+        action_browse = BrowseInteractiveAction(
+            browser_actions=f'goto("{file_url}")',
+        )
+        logger.info(action_browse, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_browse)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

+        # Verify the browser observation after navigating to PDF file
+        downloaded_file_name = 'file_1.pdf'  # expected name of the renamed copy of the first download
+        assert isinstance(obs, FileDownloadObservation)
+        assert 'Location of downloaded file:' in str(obs)
+        assert downloaded_file_name in str(obs)  # File is renamed

+        # Wait for download to complete
+        action_cmd = CmdRunAction(command='sleep 3')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

+        # Check if the file was downloaded
+        action_cmd = CmdRunAction(command='ls -la /workspace')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert downloaded_file_name in obs.content

+        # Clean up
+        action_cmd = CmdRunAction(command='pkill -f "python3 -m http.server" || true')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

+        action_cmd = CmdRunAction(command='rm -f server.log')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    finally:
+        _close_test_runtime(runtime)
--- a/tests/runtime/test_mcp_action.py
+++ b/tests/runtime/test_mcp_action.py
@ -160,7 +160,7 @@ async def test_fetch_mcp_via_stdio(temp_dir, runtime_cls, run_as_openhands):
    assert result_json['content'][0]['type'] == 'text'
    assert (
        result_json['content'][0]['text']
-        == 'Contents of http://localhost:8000/:\n---\n\n* <server.log>\n\n---'
+        == 'Contents of http://localhost:8000/:\n---\n\n* <.downloads/>\n* <server.log>\n\n---'
    )

    runtime.close()
@ -269,7 +269,7 @@ async def test_both_stdio_and_sse_mcp(
        assert result_json['content'][0]['type'] == 'text'
        assert (
            result_json['content'][0]['text']
-            == 'Contents of http://localhost:8000/:\n---\n\n* <server.log>\n\n---'
+            == 'Contents of http://localhost:8000/:\n---\n\n* <.downloads/>\n* <server.log>\n\n---'
        )
    finally:
        if runtime:
@ -354,7 +354,7 @@ async def test_microagent_and_one_stdio_mcp_in_config(
        assert result_json['content'][0]['type'] == 'text'
        assert (
            result_json['content'][0]['text']
-            == 'Contents of http://localhost:8000/:\n---\n\n* <server.log>\n\n---'
+            == 'Contents of http://localhost:8000/:\n---\n\n* <.downloads/>\n* <server.log>\n\n---'
        )
    finally:
        if runtime: