diff --git a/.github/workflows/ghcr-build.yml b/.github/workflows/ghcr-build.yml index 5c2e6d111d..7948291d6e 100644 --- a/.github/workflows/ghcr-build.yml +++ b/.github/workflows/ghcr-build.yml @@ -291,7 +291,7 @@ jobs: SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \ TEST_IN_CI=true \ RUN_AS_OPENHANDS=false \ - poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime + poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py - name: Upload coverage to Codecov uses: codecov/codecov-action@v4 env: @@ -368,7 +368,7 @@ jobs: SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \ TEST_IN_CI=true \ RUN_AS_OPENHANDS=true \ - poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime + poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py - name: Upload coverage to Codecov uses: codecov/codecov-action@v4 env: diff --git a/poetry.lock b/poetry.lock index d97ef683fe..85831d58dc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -601,43 +601,6 @@ urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version > [package.extras] crt = ["awscrt (==0.22.0)"] -[[package]] -name = "browsergym" -version = "0.10.2" -description = "BrowserGym: a gym environment for web task automation in the Chromium browser" -optional = false -python-versions = ">3.7" -files = [ - {file = "browsergym-0.10.2-py3-none-any.whl", hash = "sha256:9581d1d1f1fcd1cf35266cf30c881d60c147a0d374b3491eeaebb07d9690f868"}, - {file = "browsergym-0.10.2.tar.gz", hash = "sha256:3cdd7520cca857421aa7ec0a965968df4bcef721299a424397f86d7cad078ab0"}, -] - -[package.dependencies] -browsergym-assistantbench = "0.10.2" -browsergym-core = "0.10.2" -browsergym-experiments = "0.10.2" -browsergym-miniwob = "0.10.2" -browsergym-visualwebarena = "0.10.2" -browsergym-webarena = "0.10.2" -browsergym-workarena = ">=0.4.1" - -[[package]] -name = "browsergym-assistantbench" -version = "0.10.2" -description = "AssistantBench benchmark for BrowserGym" -optional = false -python-versions = ">3.7" -files = [ - {file = "browsergym_assistantbench-0.10.2-py3-none-any.whl", hash = "sha256:af0d3a3e23686066b070feca38f8740262bed6d65ccf9098f393334a005987c0"}, - {file = "browsergym_assistantbench-0.10.2.tar.gz", hash = "sha256:de18eb7c010403d5d467b927b4713b56f6e97a59493bee4c42599d4d7cb54dce"}, -] - -[package.dependencies] -browsergym-core = "0.10.2" -datasets = "*" -numpy = "*" -scipy = "*" - [[package]] name = "browsergym-core" version = "0.10.2" @@ -658,22 +621,6 @@ pillow = ">=10.1" playwright = ">=1.39,<2.0" pyparsing = ">=3" -[[package]] -name = "browsergym-experiments" -version = "0.10.2" -description = "Experimentation tools for BrowserGym" -optional = false -python-versions = ">3.7" -files = [ - {file = "browsergym_experiments-0.10.2-py3-none-any.whl", hash = "sha256:60a626b3159ef63b5ff72a6c8156c8f3cf82a9278dfc5a9d3ece39c2b1913595"}, - {file = "browsergym_experiments-0.10.2.tar.gz", hash = "sha256:b49bc27f315ad12014ff21580c7c7aca6489ca4106e7ab46502f716674efa236"}, -] - -[package.dependencies] -browsergym-core = "0.10.2" -dataclasses-json = "*" -tiktoken = ">=0.4" - [[package]] name = "browsergym-miniwob" version = "0.10.2" @@ -688,22 +635,6 @@ files = [ [package.dependencies] browsergym-core = "0.10.2" -[[package]] -name = "browsergym-visualwebarena" -version = "0.10.2" -description = "VisualWebArena benchmark for BrowserGym" -optional = false -python-versions = ">3.7" -files = [ - {file = "browsergym_visualwebarena-0.10.2-py3-none-any.whl", hash = "sha256:87c913ccd4d12a79c625b5c4d9ead7e0bc50b298d19e413204bb586a67736d83"}, - {file = "browsergym_visualwebarena-0.10.2.tar.gz", hash = "sha256:5f84a4f33a21106c9b650cecb0362b78af2546d9927255828c273fe800d776a1"}, -] - -[package.dependencies] -browsergym-core = "0.10.2" -libvisualwebarena = "0.0.14" -requests = "*" - [[package]] name = "browsergym-webarena" version = "0.10.2" @@ -719,26 +650,6 @@ files = [ browsergym-core = "0.10.2" libwebarena = "0.0.3" -[[package]] -name = "browsergym-workarena" -version = "0.4.1" -description = "WorkArena benchmark for BrowserGym" -optional = false -python-versions = ">3.7" -files = [ - {file = "browsergym_workarena-0.4.1-py3-none-any.whl", hash = "sha256:b8f04b2e3801fd32962b7d99f0685c507b258841e2b4bfdb46d041091d2f1b89"}, - {file = "browsergym_workarena-0.4.1.tar.gz", hash = "sha256:ba2958d804b80836c7f81360d66b99c6c655c5070eddc5fae9c1c88306a23403"}, -] - -[package.dependencies] -browsergym-core = ">=0.2" -english-words = ">=2.0.1" -faker = ">=24.8.0" -numpy = ">=1.14" -requests = ">=2.31" -tenacity = ">=8.2.3" -tqdm = ">=4.66.2" - [[package]] name = "build" version = "1.2.2.post1" @@ -1659,16 +1570,6 @@ typing-extensions = ">=4.8.0" urllib3 = ">=1.25.3" websockets = ">=11.0.3" -[[package]] -name = "english-words" -version = "2.0.1" -description = "Generate sets of english words by combining different word lists" -optional = false -python-versions = "*" -files = [ - {file = "english-words-2.0.1.tar.gz", hash = "sha256:a4105c57493bb757a3d8973fcf8e1dc05e7ca09c836dff467c3fb445f84bc43d"}, -] - [[package]] name = "evaluate" version = "0.4.3" @@ -1732,21 +1633,6 @@ files = [ [package.extras] tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"] -[[package]] -name = "faker" -version = "33.0.0" -description = "Faker is a Python package that generates fake data for you." -optional = false -python-versions = ">=3.8" -files = [ - {file = "Faker-33.0.0-py3-none-any.whl", hash = "sha256:68e5580cb6b4226710886e595eabc13127149d6e71e9d1db65506a7fbe2c7fce"}, - {file = "faker-33.0.0.tar.gz", hash = "sha256:9b01019c1ddaf2253ca2308c0472116e993f4ad8fc9905f82fa965e0c6f932e9"}, -] - -[package.dependencies] -python-dateutil = ">=2.4" -typing-extensions = "*" - [[package]] name = "farama-notifications" version = "0.0.4" @@ -3099,39 +2985,6 @@ files = [ [package.extras] all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] -[[package]] -name = "imageio" -version = "2.36.0" -description = "Library for reading and writing a wide range of image, video, scientific, and volumetric data formats." -optional = false -python-versions = ">=3.9" -files = [ - {file = "imageio-2.36.0-py3-none-any.whl", hash = "sha256:471f1eda55618ee44a3c9960911c35e647d9284c68f077e868df633398f137f0"}, - {file = "imageio-2.36.0.tar.gz", hash = "sha256:1c8f294db862c256e9562354d65aa54725b8dafed7f10f02bb3ec20ec1678850"}, -] - -[package.dependencies] -numpy = "*" -pillow = ">=8.3.2" - -[package.extras] -all-plugins = ["astropy", "av", "imageio-ffmpeg", "numpy (>2)", "pillow-heif", "psutil", "rawpy", "tifffile"] -all-plugins-pypy = ["av", "imageio-ffmpeg", "pillow-heif", "psutil", "tifffile"] -build = ["wheel"] -dev = ["black", "flake8", "fsspec[github]", "pytest", "pytest-cov"] -docs = ["numpydoc", "pydata-sphinx-theme", "sphinx (<6)"] -ffmpeg = ["imageio-ffmpeg", "psutil"] -fits = ["astropy"] -full = ["astropy", "av", "black", "flake8", "fsspec[github]", "gdal", "imageio-ffmpeg", "itk", "numpy (>2)", "numpydoc", "pillow-heif", "psutil", "pydata-sphinx-theme", "pytest", "pytest-cov", "rawpy", "sphinx (<6)", "tifffile", "wheel"] -gdal = ["gdal"] -itk = ["itk"] -linting = ["black", "flake8"] -pillow-heif = ["pillow-heif"] -pyav = ["av"] -rawpy = ["numpy (>2)", "rawpy"] -test = ["fsspec[github]", "pytest", "pytest-cov"] -tifffile = ["tifffile"] - [[package]] name = "importlib-metadata" version = "7.1.0" @@ -3885,52 +3738,6 @@ websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.dev0 || >=0.43.dev0" [package.extras] adal = ["adal (>=1.0.2)"] -[[package]] -name = "lazy-loader" -version = "0.4" -description = "Makes it easy to load subpackages and functions on demand." -optional = false -python-versions = ">=3.7" -files = [ - {file = "lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc"}, - {file = "lazy_loader-0.4.tar.gz", hash = "sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1"}, -] - -[package.dependencies] -packaging = "*" - -[package.extras] -dev = ["changelist (==0.5)"] -lint = ["pre-commit (==3.7.0)"] -test = ["pytest (>=7.4)", "pytest-cov (>=4.1)"] - -[[package]] -name = "libvisualwebarena" -version = "0.0.14" -description = "This is an unofficial, use-at-your-own risks port of the visualwebarena benchmark, for use as a standalone library package." -optional = false -python-versions = "<4,>=3.7" -files = [ - {file = "libvisualwebarena-0.0.14-py3-none-any.whl", hash = "sha256:636b06ca1d52f1a363503b5b563492e83f2482efaf85bb26b69744565a499f0f"}, - {file = "libvisualwebarena-0.0.14.tar.gz", hash = "sha256:7e660179f60f1df8d884204f2b742a2117e7fe050823d839ca5744ea1c0709a7"}, -] - -[package.dependencies] -aiolimiter = "*" -beartype = "0.12.0" -evaluate = "*" -flask = "*" -gymnasium = "*" -nltk = "*" -openai = ">=1" -Pillow = "*" -playwright = ">=1.32,<1.40" -scikit-image = ">=0.16" -text-generation = "*" -tiktoken = "*" -transformers = "*" -types-tqdm = "*" - [[package]] name = "libwebarena" version = "0.0.3" @@ -8114,54 +7921,6 @@ files = [ attrs = ">=18.0.0" pathspec = ">=0.10.1" -[[package]] -name = "scikit-image" -version = "0.24.0" -description = "Image processing in Python" -optional = false -python-versions = ">=3.9" -files = [ - {file = "scikit_image-0.24.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cb3bc0264b6ab30b43c4179ee6156bc18b4861e78bb329dd8d16537b7bbf827a"}, - {file = "scikit_image-0.24.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:9c7a52e20cdd760738da38564ba1fed7942b623c0317489af1a598a8dedf088b"}, - {file = "scikit_image-0.24.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93f46e6ce42e5409f4d09ce1b0c7f80dd7e4373bcec635b6348b63e3c886eac8"}, - {file = "scikit_image-0.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39ee0af13435c57351a3397eb379e72164ff85161923eec0c38849fecf1b4764"}, - {file = "scikit_image-0.24.0-cp310-cp310-win_amd64.whl", hash = "sha256:7ac7913b028b8aa780ffae85922894a69e33d1c0bf270ea1774f382fe8bf95e7"}, - {file = "scikit_image-0.24.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:272909e02a59cea3ed4aa03739bb88df2625daa809f633f40b5053cf09241831"}, - {file = "scikit_image-0.24.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:190ebde80b4470fe8838764b9b15f232a964f1a20391663e31008d76f0c696f7"}, - {file = "scikit_image-0.24.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59c98cc695005faf2b79904e4663796c977af22586ddf1b12d6af2fa22842dc2"}, - {file = "scikit_image-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa27b3a0dbad807b966b8db2d78da734cb812ca4787f7fbb143764800ce2fa9c"}, - {file = "scikit_image-0.24.0-cp311-cp311-win_amd64.whl", hash = "sha256:dacf591ac0c272a111181afad4b788a27fe70d213cfddd631d151cbc34f8ca2c"}, - {file = "scikit_image-0.24.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6fccceb54c9574590abcddc8caf6cefa57c13b5b8b4260ab3ff88ad8f3c252b3"}, - {file = "scikit_image-0.24.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:ccc01e4760d655aab7601c1ba7aa4ddd8b46f494ac46ec9c268df6f33ccddf4c"}, - {file = "scikit_image-0.24.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18836a18d3a7b6aca5376a2d805f0045826bc6c9fc85331659c33b4813e0b563"}, - {file = "scikit_image-0.24.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8579bda9c3f78cb3b3ed8b9425213c53a25fa7e994b7ac01f2440b395babf660"}, - {file = "scikit_image-0.24.0-cp312-cp312-win_amd64.whl", hash = "sha256:82ab903afa60b2da1da2e6f0c8c65e7c8868c60a869464c41971da929b3e82bc"}, - {file = "scikit_image-0.24.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ef04360eda372ee5cd60aebe9be91258639c86ae2ea24093fb9182118008d009"}, - {file = "scikit_image-0.24.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:e9aadb442360a7e76f0c5c9d105f79a83d6df0e01e431bd1d5757e2c5871a1f3"}, - {file = "scikit_image-0.24.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e37de6f4c1abcf794e13c258dc9b7d385d5be868441de11c180363824192ff7"}, - {file = "scikit_image-0.24.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4688c18bd7ec33c08d7bf0fd19549be246d90d5f2c1d795a89986629af0a1e83"}, - {file = "scikit_image-0.24.0-cp39-cp39-win_amd64.whl", hash = "sha256:56dab751d20b25d5d3985e95c9b4e975f55573554bd76b0aedf5875217c93e69"}, - {file = "scikit_image-0.24.0.tar.gz", hash = "sha256:5d16efe95da8edbeb363e0c4157b99becbd650a60b77f6e3af5768b66cf007ab"}, -] - -[package.dependencies] -imageio = ">=2.33" -lazy-loader = ">=0.4" -networkx = ">=2.8" -numpy = ">=1.23" -packaging = ">=21" -pillow = ">=9.1" -scipy = ">=1.9" -tifffile = ">=2022.8.12" - -[package.extras] -build = ["Cython (>=3.0.4)", "build", "meson-python (>=0.15)", "ninja", "numpy (>=2.0.0rc1)", "packaging (>=21)", "pythran", "setuptools (>=67)", "spin (==0.8)", "wheel"] -data = ["pooch (>=1.6.0)"] -developer = ["ipython", "pre-commit", "tomli"] -docs = ["PyWavelets (>=1.1.1)", "dask[array] (>=2022.9.2)", "ipykernel", "ipywidgets", "kaleido", "matplotlib (>=3.6)", "myst-parser", "numpydoc (>=1.7)", "pandas (>=1.5)", "plotly (>=5.10)", "pooch (>=1.6)", "pydata-sphinx-theme (>=0.15.2)", "pytest-doctestplus", "pytest-runner", "scikit-learn (>=1.1)", "seaborn (>=0.11)", "sphinx (>=7.3)", "sphinx-copybutton", "sphinx-gallery (>=0.14)", "sphinx_design (>=0.5)", "tifffile (>=2022.8.12)"] -optional = ["PyWavelets (>=1.1.1)", "SimpleITK", "astropy (>=5.0)", "cloudpickle (>=0.2.1)", "dask[array] (>=2021.1.0)", "matplotlib (>=3.6)", "pooch (>=1.6.0)", "pyamg", "scikit-learn (>=1.1)"] -test = ["asv", "numpydoc (>=1.7)", "pooch (>=1.6.0)", "pytest (>=7.0)", "pytest-cov (>=2.11.0)", "pytest-doctestplus", "pytest-faulthandler", "pytest-localserver"] - [[package]] name = "scikit-learn" version = "1.5.2" @@ -8843,28 +8602,6 @@ files = [ {file = "threadpoolctl-3.5.0.tar.gz", hash = "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107"}, ] -[[package]] -name = "tifffile" -version = "2024.9.20" -description = "Read and write TIFF files" -optional = false -python-versions = ">=3.10" -files = [ - {file = "tifffile-2024.9.20-py3-none-any.whl", hash = "sha256:c54dc85bc1065d972cb8a6ffb3181389d597876aa80177933459733e4ed243dd"}, - {file = "tifffile-2024.9.20.tar.gz", hash = "sha256:3fbf3be2f995a7051a8ae05a4be70c96fc0789f22ed6f1c4104c973cf68a640b"}, -] - -[package.dependencies] -numpy = "*" - -[package.extras] -all = ["defusedxml", "fsspec", "imagecodecs (>=2023.8.12)", "lxml", "matplotlib", "zarr"] -codecs = ["imagecodecs (>=2023.8.12)"] -plot = ["matplotlib"] -test = ["cmapfile", "czifile", "dask", "defusedxml", "fsspec", "imagecodecs", "lfdfiles", "lxml", "ndtiff", "oiffile", "psdtags", "pytest", "roifile", "xarray", "zarr"] -xml = ["defusedxml", "lxml"] -zarr = ["fsspec", "zarr"] - [[package]] name = "tiktoken" version = "0.8.0" @@ -10350,4 +10087,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "ff370b7b5077720b73fe3b90cc1b7fb9c7a262bfbd35885bb717369061e8a466" +content-hash = "56a80082afb76e518239060855598921d94a0373123b2d9222cf8c7b6238b7ad" diff --git a/pyproject.toml b/pyproject.toml index ec148baadc..6f15945e22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ uvicorn = "*" types-toml = "*" numpy = "*" json-repair = "*" -browsergym = "0.10.2" # integrate browsergym as the browsing interface +browsergym-core = "0.10.2" # integrate browsergym-core as the browsing interface html2text = "*" e2b = ">=0.17.1,<1.1.0" pexpect = "*" @@ -63,6 +63,7 @@ opentelemetry-exporter-otlp-proto-grpc = "1.25.0" modal = "^0.66.26" runloop-api-client = "0.10.0" pygithub = "^2.5.0" +joblib = "*" openhands-aci = "^0.1.1" python-socketio = "^5.11.4" redis = "^5.2.0" @@ -142,6 +143,8 @@ gdown = "*" matplotlib = "*" seaborn = "*" tabulate = "*" +browsergym-webarena = "0.10.2" +browsergym-miniwob = "0.10.2" [tool.poetry-dynamic-versioning] enable = true diff --git a/tests/runtime/test_browsergym_envs.py b/tests/runtime/test_browsergym_envs.py new file mode 100644 index 0000000000..426ecacaf5 --- /dev/null +++ b/tests/runtime/test_browsergym_envs.py @@ -0,0 +1,73 @@ +import json + +import pytest + +from openhands.core.logger import openhands_logger as logger +from openhands.events.action.browse import BrowseInteractiveAction +from openhands.events.observation.browse import BrowserOutputObservation +from tests.runtime.conftest import _close_test_runtime, _load_runtime + + +def has_miniwob(): + try: + import importlib.util + + # try to find this browser environment, if it was installed + spec = importlib.util.find_spec('browsergym.miniwob') + if spec is None: + return False + + # try to import this environment + importlib.util.module_from_spec(spec) + return True + except ImportError: + return False + + +@pytest.mark.skipif( + not has_miniwob(), + reason='Requires browsergym-miniwob package to be installed', +) +def test_browsergym_eval_env(runtime_cls, temp_dir): + runtime = _load_runtime( + temp_dir, + runtime_cls=runtime_cls, + run_as_openhands=False, # need root permission to access file + base_container_image='xingyaoww/od-eval-miniwob:v1.0', + browsergym_eval_env='browsergym/miniwob.choose-list', + force_rebuild_runtime=True, + ) + from openhands.runtime.browser.browser_env import ( + BROWSER_EVAL_GET_GOAL_ACTION, + BROWSER_EVAL_GET_REWARDS_ACTION, + ) + + # Test browse + action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + + assert isinstance(obs, BrowserOutputObservation) + assert not obs.error + assert 'Select' in obs.content + assert 'from the list and click Submit' in obs.content + + # Make sure the browser can produce observation in eval env + action = BrowseInteractiveAction(browser_actions='noop()') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert ( + obs.url.strip() + == 'file:///miniwob-plusplus/miniwob/html/miniwob/choose-list.html' + ) + + # Make sure the rewards are working + action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert json.loads(obs.content) == [0.0] + + _close_test_runtime(runtime) diff --git a/tests/runtime/test_browsing.py b/tests/runtime/test_browsing.py index f24e37cd06..6097c89190 100644 --- a/tests/runtime/test_browsing.py +++ b/tests/runtime/test_browsing.py @@ -1,12 +1,9 @@ """Browsing-related tests for the EventStreamRuntime, which connects to the ActionExecutor running in the sandbox.""" -import json - from conftest import _close_test_runtime, _load_runtime from openhands.core.logger import openhands_logger as logger from openhands.events.action import ( - BrowseInteractiveAction, BrowseURLAction, CmdRunAction, ) @@ -16,7 +13,8 @@ from openhands.events.observation import ( ) # ============================================================================================================================ -# Browsing tests +# Browsing tests, without evaluation (poetry install --without evaluation) +# For eval environments, tests need to run with poetry install # ============================================================================================================================ PY3_FOR_TESTING = '/openhands/micromamba/bin/micromamba run -n openhands python3' @@ -66,48 +64,3 @@ def test_simple_browse(temp_dir, runtime_cls, run_as_openhands): assert obs.exit_code == 0 _close_test_runtime(runtime) - - -def test_browsergym_eval_env(runtime_cls, temp_dir): - runtime = _load_runtime( - temp_dir, - runtime_cls=runtime_cls, - run_as_openhands=False, # need root permission to access file - base_container_image='xingyaoww/od-eval-miniwob:v1.0', - browsergym_eval_env='browsergym/miniwob.choose-list', - force_rebuild_runtime=True, - ) - from openhands.runtime.browser.browser_env import ( - BROWSER_EVAL_GET_GOAL_ACTION, - BROWSER_EVAL_GET_REWARDS_ACTION, - ) - - # Test browse - action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION) - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - - assert isinstance(obs, BrowserOutputObservation) - assert not obs.error - assert 'Select' in obs.content - assert 'from the list and click Submit' in obs.content - - # Make sure the browser can produce observation in eva[l - action = BrowseInteractiveAction(browser_actions='noop()') - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert ( - obs.url.strip() - == 'file:///miniwob-plusplus/miniwob/html/miniwob/choose-list.html' - ) - - # Make sure the rewards are working - action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION) - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert json.loads(obs.content) == [0.0] - - _close_test_runtime(runtime)