mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
* move multi-line bash tests to test_runtime; support multi-line bash for esruntime; * add testcase to handle PS2 prompt * use bashlex for bash parsing to handle multi-line commands; add testcases for multi-line commands * revert ghcr runtime change * Apply stash * fix run as other user; make test async; * fix test runtime for run as od * add run-as-devin to all the runtime tests * handle the case when username is root * move all run-as-devin tests from sandbox; only tests a few cases on different user to save time; * move over multi-line echo related tests to test_runtime * fix user-specific jupyter by fixing the pypoetry virtualenv folder * make plugin's init async; chdir at initialization of jupyter plugin; move ipy simple testcase to test runtime; * support agentskills import in move tests for jupyter pwd tests; overload `add_env_vars` for EventStreamRuntime to update env var also in Jupyter; make agentskills read env var lazily, in case env var is updated; * fix ServerRuntime agentskills issue * move agnostic image test to test_runtime * merge runtime tests in CI * fix enable auto lint as env var * update warning message * update warning message * test for different container images * change parsing output as debug * add exception handling for update_pwd_decorator * fix unit test indentation * add plugins as default input to Runtime class; remove init_sandbox_plugins; implement add_env_var (include jupyter) in the base class; * fix server runtime auto lint * Revert "add exception handling for update_pwd_decorator" This reverts commit 2b668b1506e02145cb8f87e321aad62febca3d50. * tries to print debugging info for agentskills * explictly setting uid (try fix permission issue) * Revert "tries to print debugging info for agentskills" This reverts commit 8be4c86756f0e3fc62957b327ba2ac4999c419de. 
* set sandbox user id during testing to hopefully fix the permission issue * add browser tools for server runtime * try to debug for old pwd * update debug cmd * only test agnostic runtime when TEST_RUNTIME is Server * fix temp dir mkdir * load TEST_RUNTIME at the beginning * remove ipython tests * only log to file when DEBUG * default logging to project root * temporarily remove log to file * fix LLM logger dir * fix logger * make set pwd an optional aux action * fix prev pwd * fix infinity recursion * simplify * do not import the whole od library to avoid logger folder by jupyter * fix browsing * increase timeout * attempt to fix agentskills yet again * clean up in testcases, since CI maybe run as non-root * add _cause attribute for event.id * remove parent * add a bunch of debugging statement again for CI :( * fix temp_dir fixture * change all temp dir to follow pytest's tmp_path_factory * remove extra bracket * clean up error printing a bit * jupyter chdir to self.config.workspace_mount_path_in_sandbox on initialization * jupyter chdir to self.config.workspace_mount_path_in_sandbox on initialization * add typing for tmp dir fixture * clear the directory before running the test to avoid weird CI temp dir * remove agnostic test case for server runtime * Revert "remove agnostic test case for server runtime" This reverts commit 30e2181c3fc1410e69596c2dcd06be01f1d016b3. 
* disable agnostic tests in CI * fix test * make sure plugin arg is not passed when no plugin is specified; remove redundant on_event function; * move mock prompt * rename runtime * remove extra logging * refactor run_controller's interface; support multiple runtime for integration test; filter out hostname for prompt * uncomment other tests * pass the right runtime to controller * log runtime when start * uncomment tests * improve symbol filters * add intergration test prompts that seemd ok * add integration test workflow * add python3 to default ubuntu image * symlink python and fix permission to jupyter pip * add retry for jupyter execute server * fix jupyter pip install; add post-process for jupyter pip install; simplify init by add agent_skills path to PYTHONPATH; add testcase to tests jupyter pip install; * fix bug * use ubuntu:22.04 for eventstream integration tests * add todo * update testcase * remove redundant code * fix unit test * reduce dependency for runtime * try making llama-index an optional dependency that's not installed by default * remove pip install since it seemd not needed * log ipython execution; await write message since it returns a future * update ipy testcase * do not install llama-index in CI * do not install llama-index in the app docker as well * set sandbox container image in the integration test script * log plugins & env var for runtime * update conftest for sha256 * add git * remove all non-alphanumeric chalracters * add working ipy module tests! 
* default to use host network * remove is_async from browser to make thing a little more reliable; retry loading browser when error; * add sleep to wait a bit for http server * kill http server before regenerate browsing tests * fix browsing * only set sandbox container image if undefined * skip empty config value * update evaluation to use the latest run_controller * revert logger in execute_server to be compatible with server runtime * revert logging level to fix jupyter * set logger level * revert the logging * chmod for workspace to fix permission * support getting timeout from action * update test for server runtime * try to fix file permission * fix test_cmd_run_action_serialization_deserialization test (added timeout) * poetry: pip 24.2, torch 2.2.2 * revert adding pip to pyproject.toml * add build to dependencies in pyproject.toml * forgot poetry lock --no-update * fix a DelegatorAgent prompt_002.log (timeout) * fix a DelegatorAgent prompt_003.log (timeout) * couple more timeout attribs in prompt files * some more prompt files * prompts galore * add clarification comment for timeout * default timeout to config * add assert * update integraton tests for eventstream * update integration tests * fix timeout for action<->dict * remove redundant on_event * default to use instance image * update run_controller interface * add logging for copy * refactor swe_bench for the new design * fix action execution timeout * updatelock * remove build sandbox locally * fix runtime * use plain for-loop for single process * remove extra print * get swebench inference working * print whole `test_result` dict * got swebench patch post-process working * update swe-bench evaluation readme * refactor using shared reset_logger function * move messy swebench prompt to a different file * support the ability to specify whether to keep prompt * support the ability to specify whether to keep prompt * fix dockerfile * fix import and remove unnecessary strip logic * fix action 
serialization * get agentbench running * remove extra ls for agent bench * fix agentbench metric * factor out common documentation for eval * update biocoder doc * remove swe_env_box since it is no longer needed * get biocoder working * add func timeout for bird * fix jupyter pwd with ~ as user name * fix jupyter pwd with ~ as user name * get bird working * get browsing evaluation working * make eda runnable * fix id column * fix eda run_infer * unify eval output using a structured format; make swebench coompatible with that format; update client source code for every swebench run; do not inject testcmd for swebench * standardize existing benchs for the new eval output * set update source code = true * get gaia standardized * fix gaia * gorilla refactored but stuck at language.so to test * refactor and make gpqa work * refactor humanevalfix and get it working * refactor logic reasoning and get it working * refactor browser env so it works with eventstream runtime for eval * add initial version of miniwob refactor * fix browsergym environment * get miniwob working!! 
* allowing injecting additional dependency to OD runtime docker image * allowing injecting additional dependency to OD runtime docker image * support logic reasoning with pre-injected dependency * get mint working * update runtime build * fix mint docker * add test for keep_prompt; add missing await close for some tests * update integration tests for eventstream runtime * fix integration tests for server runtime * refactor ml bench and toolqa * refactor webarena * fix default factory * Update run_infer.py * add APIError to retry * increase timeout for swebench * make sure to hide api key when dump eval output * update the behavior of put source code to put files instead of tarball * add dishash to dependency * sendintr when timeout * fix dockerfile copy * reduce timeout * use dirhash to avoid repeat building for update source * fix runtime_build testcase * add dir_hash to docker build pipeline * revert api error * update poetry lock * add retries for swebench run infer * fix git patch * update poetry lock * adjust config order * fix mount volumns * enforce all eval to use "instance_id" * remove file store from runtime * make file_store public inside eventstream * move the runtime logic inside `main` out * support using async function for process_instance_fn * refactor run_infer with the create_time * fix file store * Update evaluation/toolqa/utils.py Co-authored-by: Graham Neubig <neubig@gmail.com> * fix typo --------- Co-authored-by: tobitege <tobitege@gmx.de> Co-authored-by: super-dainiu <78588128+super-dainiu@users.noreply.github.com> Co-authored-by: Graham Neubig <neubig@gmail.com>
213 lines
7.8 KiB
Python
213 lines
7.8 KiB
Python
import asyncio
|
|
import atexit
|
|
import copy
|
|
import json
|
|
import os
|
|
from abc import abstractmethod
|
|
from typing import Any, Optional
|
|
|
|
from opendevin.core.config import AppConfig, SandboxConfig
|
|
from opendevin.core.logger import opendevin_logger as logger
|
|
from opendevin.events import EventSource, EventStream, EventStreamSubscriber
|
|
from opendevin.events.action import (
|
|
Action,
|
|
ActionConfirmationStatus,
|
|
BrowseInteractiveAction,
|
|
BrowseURLAction,
|
|
CmdRunAction,
|
|
FileReadAction,
|
|
FileWriteAction,
|
|
IPythonRunCellAction,
|
|
)
|
|
from opendevin.events.event import Event
|
|
from opendevin.events.observation import (
|
|
CmdOutputObservation,
|
|
ErrorObservation,
|
|
NullObservation,
|
|
Observation,
|
|
UserRejectObservation,
|
|
)
|
|
from opendevin.events.serialization.action import ACTION_TYPE_TO_CLASS
|
|
from opendevin.runtime.plugins import JupyterRequirement, PluginRequirement
|
|
from opendevin.runtime.tools import RuntimeTool
|
|
|
|
|
|
def _default_env_vars(sandbox_config: SandboxConfig) -> dict[str, str]:
    """Build the environment variables that every sandbox starts with.

    Every variable of the current process whose name starts with
    ``SANDBOX_ENV_`` is forwarded with that prefix removed, e.g.
    ``SANDBOX_ENV_FOO=bar`` becomes ``FOO=bar``.  When
    ``sandbox_config.enable_auto_lint`` is truthy, ``ENABLE_AUTO_LINT`` is
    set to ``'true'`` (replacing any forwarded variable of the same name).

    Args:
        sandbox_config: Sandbox settings; only ``enable_auto_lint`` is read.

    Returns:
        A new dict mapping sandbox variable names to their values.
    """
    prefix = 'SANDBOX_ENV_'
    forwarded = {
        name.removeprefix(prefix): value
        for name, value in os.environ.items()
        if name.startswith(prefix)
    }
    if sandbox_config.enable_auto_lint:
        forwarded['ENABLE_AUTO_LINT'] = 'true'
    return forwarded
|
|
|
|
|
|
class Runtime:
    """The runtime is how the agent interacts with the external environment.

    This includes a bash sandbox, a browser, and filesystem interactions.

    sid is the session id, which is used to identify the current user session.

    NOTE: this class has no ``abc.ABC`` base, so the ``@abstractmethod``
    markers below are not enforced when instantiating; they only flag the
    methods a concrete runtime is expected to override.
    """

    # Session id identifying the current user session.
    sid: str
    # Env vars applied by `ainit`; computed per instance in `__init__`.
    DEFAULT_ENV_VARS: dict[str, str]

    def __init__(
        self,
        config: AppConfig,
        event_stream: EventStream,
        sid: str = 'default',
        plugins: list[PluginRequirement] | None = None,
    ):
        """Subscribe to the event stream and snapshot the configuration.

        Args:
            config: Application config; a deep copy is stored on ``self.config``.
            event_stream: Stream whose events are delivered to `on_event` and
                to which resulting observations are added.
            sid: Session id.
            plugins: Plugin requirements; ``None`` or an empty list both
                result in ``self.plugins == []``.
        """
        self.sid = sid
        self.event_stream = event_stream
        self.event_stream.subscribe(EventStreamSubscriber.RUNTIME, self.on_event)
        self.plugins = plugins or []

        self.config = copy.deepcopy(config)
        self.DEFAULT_ENV_VARS = _default_env_vars(config.sandbox)
        # Make sure `close` gets a chance to run at interpreter shutdown.
        atexit.register(self.close_sync)
        logger.debug(f'Runtime `{sid}` config:\n{self.config}')

    async def ainit(self, env_vars: dict[str, str] | None = None) -> None:
        """
        Initialize the runtime (asynchronously).

        This method should be called after the runtime's constructor.

        The default env vars are added first and the provided ``env_vars``
        second, so a provided variable is exported after a default one of
        the same name.
        """
        if self.DEFAULT_ENV_VARS:
            logger.debug(f'Adding default env vars: {self.DEFAULT_ENV_VARS}')
            await self.add_env_vars(self.DEFAULT_ENV_VARS)
        if env_vars is not None:
            logger.debug(f'Adding provided env vars: {env_vars}')
            await self.add_env_vars(env_vars)

    async def close(self) -> None:
        """Release runtime resources; a no-op in the base class."""
        pass

    def close_sync(self) -> None:
        """Run `close` from synchronous code (registered with ``atexit``).

        Without a running event loop in the current thread, `close` is run to
        completion via ``asyncio.run``.  With a running loop it can only be
        scheduled on that loop; it is not awaited here.
        """
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No running event loop, use asyncio.run()
            asyncio.run(self.close())
        else:
            # get_running_loop() only ever returns a loop that is running, so
            # scheduling a task is the sole option.  Keep a reference: the
            # loop holds tasks weakly and an unreferenced task may be
            # garbage-collected before it finishes.
            self._close_task = loop.create_task(self.close())

    # ====================================================================
    # Methods we plan to deprecate when we move to new EventStreamRuntime
    # ====================================================================

    def init_runtime_tools(
        self,
        runtime_tools: list[RuntimeTool],
        runtime_tools_config: Optional[dict[RuntimeTool, Any]] = None,
    ) -> None:
        """Not implemented in the base class; always raises NotImplementedError."""
        # TODO: deprecate this method when we move to the new EventStreamRuntime
        raise NotImplementedError('This method is not implemented in the base class.')

    # ====================================================================

    async def add_env_vars(self, env_vars: dict[str, str]) -> None:
        """Set environment variables in the IPython kernel and the bash shell.

        When a `JupyterRequirement` is among ``self.plugins`` the variables
        are first assigned to ``os.environ`` through `run_ipython`.  They are
        then exported in bash through `run`.  An empty mapping is a no-op.

        Args:
            env_vars: Mapping of variable name to value.  Names are inserted
                into the ``export`` command verbatim, so they must be valid
                shell identifiers.

        Raises:
            RuntimeError: If the bash command does not yield a
                `CmdOutputObservation` with exit code 0.
        """
        # Function-scope import keeps this method self-contained.
        import shlex

        if not env_vars:
            return

        # Add env vars to the IPython shell (if Jupyter is used)
        if any(isinstance(plugin, JupyterRequirement) for plugin in self.plugins):
            # json.dumps renders both key and value as double-quoted literals
            # whose escape sequences are also valid Python string escapes.
            assignments = ''.join(
                f'os.environ[{json.dumps(key)}] = {json.dumps(value)}\n'
                for key, value in env_vars.items()
            )
            code = f'import os\n{assignments}\n'
            obs = await self.run_ipython(IPythonRunCellAction(code))
            logger.info(f'Added env vars to IPython: code={code}, obs={obs}')

        # Add env vars to the Bash shell
        exports = []
        for key, value in env_vars.items():
            # JSON escaping is not shell quoting: inside bash double quotes
            # `$`, `$(...)` and backticks are still expanded, and JSON's `\n`
            # / `\uXXXX` escapes are kept literally.  shlex.quote yields a
            # word that bash reads back as exactly the original text.
            # Non-string values keep their previous JSON rendering.
            text = value if isinstance(value, str) else json.dumps(value)
            exports.append(f'export {key}={shlex.quote(text)};')
        cmd = ' '.join(exports)
        logger.debug(f'Adding env var: {cmd}')
        obs = await self.run(CmdRunAction(cmd))
        if not isinstance(obs, CmdOutputObservation) or obs.exit_code != 0:
            raise RuntimeError(
                f'Failed to add env vars [{env_vars}] to environment: {obs.content}'
            )

    async def on_event(self, event: Event) -> None:
        """Event-stream callback: run incoming actions and publish the result.

        Events that are not `Action` instances are ignored.  The resulting
        observation gets ``_cause`` set to the action's id and is added to the
        event stream with the action's source, or `EventSource.AGENT` when
        the action has none.
        """
        if not isinstance(event, Action):
            return
        # set timeout to default if not set
        if event.timeout is None:
            event.timeout = self.config.sandbox.timeout
        assert event.timeout is not None
        observation = await self.run_action(event)
        observation._cause = event.id  # type: ignore[attr-defined]
        source = event.source if event.source else EventSource.AGENT
        self.event_stream.add_event(observation, source)  # type: ignore[arg-type]

    async def run_action(self, action: Action) -> Observation:
        """Run an action and return the resulting observation.

        If the action is not runnable in any runtime, a NullObservation is returned.
        If the action is still awaiting user confirmation, a NullObservation is returned.
        If the action is not supported by the current runtime, an ErrorObservation is returned.
        If the user rejected the action, a UserRejectObservation is returned.
        Otherwise the action is dispatched to the method named after its
        ``action`` type (e.g. `run`, `read`, `browse`).
        """
        if not action.runnable:
            return NullObservation('')
        # Only some action classes carry `is_confirmed`; treat the rest as
        # having no confirmation state.
        confirmation = getattr(action, 'is_confirmed', None)
        if confirmation == ActionConfirmationStatus.AWAITING_CONFIRMATION:
            return NullObservation('')
        action_type = action.action  # type: ignore[attr-defined]
        if action_type not in ACTION_TYPE_TO_CLASS:
            return ErrorObservation(f'Action {action_type} does not exist.')
        if not hasattr(self, action_type):
            return ErrorObservation(
                f'Action {action_type} is not supported in the current runtime.'
            )
        if confirmation == ActionConfirmationStatus.REJECTED:
            return UserRejectObservation(
                'Action has been rejected by the user! Waiting for further user input.'
            )
        handler = getattr(self, action_type)
        observation = await handler(action)
        return observation

    @abstractmethod
    async def copy_to(self, host_src: str, sandbox_dest: str, recursive: bool = False):
        """Subclass hook; the base class only raises NotImplementedError.

        NOTE(review): judging by the parameter names this copies a host path
        into the sandbox -- confirm against the concrete runtimes.
        """
        raise NotImplementedError('This method is not implemented in the base class.')

    # ====================================================================
    # Implement these methods in the subclass
    # ====================================================================

    @abstractmethod
    async def run(self, action: CmdRunAction) -> Observation:
        """Handle a `CmdRunAction` (used by `add_env_vars` for bash exports)."""
        pass

    @abstractmethod
    async def run_ipython(self, action: IPythonRunCellAction) -> Observation:
        """Handle an `IPythonRunCellAction` (used by `add_env_vars` for IPython)."""
        pass

    @abstractmethod
    async def read(self, action: FileReadAction) -> Observation:
        """Handle a `FileReadAction`."""
        pass

    @abstractmethod
    async def write(self, action: FileWriteAction) -> Observation:
        """Handle a `FileWriteAction`."""
        pass

    @abstractmethod
    async def browse(self, action: BrowseURLAction) -> Observation:
        """Handle a `BrowseURLAction`."""
        pass

    @abstractmethod
    async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation:
        """Handle a `BrowseInteractiveAction`."""
        pass
|