From 395c1ea9e3f593848c53b11dd896b60a3b8279b1 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Mon, 3 Mar 2025 00:19:25 +0100 Subject: [PATCH] [Refactor] split runtime initialization (create, connect, init) in cli scripts (#7036) --- evaluation/benchmarks/EDA/run_infer.py | 2 + .../benchmarks/agent_bench/run_infer.py | 2 + .../benchmarks/aider_bench/run_infer.py | 3 +- evaluation/benchmarks/biocoder/run_infer.py | 2 + evaluation/benchmarks/bird/run_infer.py | 2 + .../browsing_delegation/run_infer.py | 2 + .../benchmarks/commit0_bench/run_infer.py | 2 + .../benchmarks/discoverybench/run_infer.py | 2 + evaluation/benchmarks/gaia/run_infer.py | 2 + evaluation/benchmarks/gorilla/run_infer.py | 2 + evaluation/benchmarks/gpqa/run_infer.py | 2 + .../benchmarks/humanevalfix/run_infer.py | 2 + .../benchmarks/logic_reasoning/run_infer.py | 2 + evaluation/benchmarks/miniwob/run_infer.py | 2 + evaluation/benchmarks/mint/run_infer.py | 2 + evaluation/benchmarks/ml_bench/run_infer.py | 2 + .../benchmarks/scienceagentbench/run_infer.py | 2 + evaluation/benchmarks/swe_bench/run_infer.py | 2 + .../benchmarks/the_agent_company/run_infer.py | 3 +- evaluation/benchmarks/toolqa/run_infer.py | 2 + .../benchmarks/visualwebarena/run_infer.py | 3 ++ evaluation/benchmarks/webarena/run_infer.py | 2 + evaluation/integration_tests/run_infer.py | 2 + openhands/core/cli.py | 16 +++++- openhands/core/main.py | 13 ++++- openhands/core/setup.py | 50 +++++++++++++------ 26 files changed, 107 insertions(+), 21 deletions(-) diff --git a/evaluation/benchmarks/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py index 636a52e2bd..f216a86ff8 100644 --- a/evaluation/benchmarks/EDA/run_infer.py +++ b/evaluation/benchmarks/EDA/run_infer.py @@ -24,6 +24,7 @@ from openhands.core.config import ( from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import MessageAction +from openhands.utils.async_utils import call_async_from_sync game = None @@ -121,6 +122,7 @@ def process_instance( # Here's how you can run the agent (similar to the `main` function) and get the final task state runtime = create_runtime(config) + call_async_from_sync(runtime.connect) state: State | None = asyncio.run( run_controller( diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py index 68cf2ff793..a78e402395 100644 --- a/evaluation/benchmarks/agent_bench/run_infer.py +++ b/evaluation/benchmarks/agent_bench/run_infer.py @@ -34,6 +34,7 @@ from openhands.core.main import create_runtime, run_controller from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync def get_config( @@ -210,6 +211,7 @@ def process_instance( # ============================================= runtime: Runtime = create_runtime(config) + call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance=instance) diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index 9c848f67b1..0d97496acd 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -34,6 +34,7 @@ from openhands.core.main import create_runtime, run_controller from openhands.events.action import CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync # Configure visibility of unit tests to the Agent. USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'false').lower() == 'true' @@ -203,7 +204,7 @@ def process_instance( # ============================================= runtime: Runtime = create_runtime(config) - + call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance=instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py index b0a06a6ece..f1c98ed066 100644 --- a/evaluation/benchmarks/biocoder/run_infer.py +++ b/evaluation/benchmarks/biocoder/run_infer.py @@ -31,6 +31,7 @@ from openhands.core.main import create_runtime, run_controller from openhands.events.action import CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': functools.partial( @@ -274,6 +275,7 @@ def process_instance( instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class] runtime = create_runtime(config) + call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py index 4cdd29862f..1c56deb967 100644 --- a/evaluation/benchmarks/bird/run_infer.py +++ b/evaluation/benchmarks/bird/run_infer.py @@ -34,6 +34,7 @@ from openhands.core.main import create_runtime, run_controller from openhands.events.action import CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync def codeact_user_response(state: State) -> str: @@ -399,6 +400,7 @@ def process_instance( instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class] runtime = create_runtime(config) + call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state diff --git a/evaluation/benchmarks/browsing_delegation/run_infer.py b/evaluation/benchmarks/browsing_delegation/run_infer.py index 5f3ee99d74..0ef080dbca 100644 --- a/evaluation/benchmarks/browsing_delegation/run_infer.py +++ b/evaluation/benchmarks/browsing_delegation/run_infer.py @@ -25,6 +25,7 @@ from openhands.core.config import ( from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import MessageAction +from openhands.utils.async_utils import call_async_from_sync # Only CodeActAgent can delegate to BrowsingAgent SUPPORTED_AGENT_CLS = {'CodeActAgent'} @@ -74,6 +75,7 @@ def process_instance( ) runtime = create_runtime(config) + call_async_from_sync(runtime.connect) state: State | None = asyncio.run( run_controller( diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py index cf6148975b..63d394a029 100644 --- a/evaluation/benchmarks/commit0_bench/run_infer.py +++ b/evaluation/benchmarks/commit0_bench/run_infer.py @@ -35,6 +35,7 @@ from openhands.events.action import CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation, ErrorObservation from openhands.events.serialization.event import event_to_dict from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync from openhands.utils.shutdown_listener import sleep_if_should_continue USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true' @@ -394,6 +395,7 @@ def process_instance( logger.info(f'Starting evaluation for instance {instance.instance_id}.') runtime = create_runtime(config) + call_async_from_sync(runtime.connect) try: initialize_runtime(runtime, instance) diff --git a/evaluation/benchmarks/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py index f3fdadab8e..d91d01194d 100644 --- a/evaluation/benchmarks/discoverybench/run_infer.py +++ b/evaluation/benchmarks/discoverybench/run_infer.py @@ -34,6 +34,7 @@ from openhands.core.main import create_runtime, run_controller from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync EVALUATION_LLM = 'gpt-4-1106-preview' @@ -281,6 +282,7 @@ def process_instance( # Here's how you can run the agent (similar to the `main` function) and get the final task state runtime = create_runtime(config) + call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance.data_files) state: State | None = asyncio.run( diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py index e0e5ed0363..e63026e813 100644 --- a/evaluation/benchmarks/gaia/run_infer.py +++ b/evaluation/benchmarks/gaia/run_infer.py @@ -31,6 +31,7 @@ from openhands.core.main import create_runtime, run_controller from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync DATASET_CACHE_DIR = os.path.join(os.path.dirname(__file__), 'data') @@ -148,6 +149,7 @@ def process_instance( logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'}) runtime = create_runtime(config) + call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state diff --git a/evaluation/benchmarks/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py index 22b42a8545..e856fa267c 100644 --- a/evaluation/benchmarks/gorilla/run_infer.py +++ b/evaluation/benchmarks/gorilla/run_infer.py @@ -26,6 +26,7 @@ from openhands.core.config import ( from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import MessageAction +from openhands.utils.async_utils import call_async_from_sync AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': codeact_user_response, @@ -82,6 +83,7 @@ def process_instance( # Here's how you can run the agent (similar to the `main` function) and get the final task state runtime = create_runtime(config) + call_async_from_sync(runtime.connect) state: State | None = asyncio.run( run_controller( config=config, diff --git a/evaluation/benchmarks/gpqa/run_infer.py b/evaluation/benchmarks/gpqa/run_infer.py index 0f19755c34..e297e3fb9e 100644 --- a/evaluation/benchmarks/gpqa/run_infer.py +++ b/evaluation/benchmarks/gpqa/run_infer.py @@ -49,6 +49,7 @@ from openhands.events.action import ( MessageAction, ) from openhands.events.observation import Observation +from openhands.utils.async_utils import call_async_from_sync ACTION_FORMAT = """ < Runtime: """Create a runtime for the agent to run on. - config: The app config. - sid: (optional) The session id. IMPORTANT: please don't set this unless you know what you're doing. - Set it to incompatible value will cause unexpected behavior on RemoteRuntime. - headless_mode: Whether the agent is run in headless mode. `create_runtime` is typically called within evaluation scripts, - where we don't want to have the VSCode UI open, so it defaults to True. - selected_repository: (optional) The GitHub repository to use. - github_token: (optional) The GitHub token to use. + Args: + config: The app config. + sid: (optional) The session id. IMPORTANT: please don't set this unless you know what you're doing. + Set it to incompatible value will cause unexpected behavior on RemoteRuntime. + headless_mode: Whether the agent is run in headless mode. `create_runtime` is typically called within evaluation scripts, + where we don't want to have the VSCode UI open, so it defaults to True. + agent: (optional) The agent instance to use for configuring the runtime. + + Returns: + The created Runtime instance (not yet connected or initialized). """ # if sid is provided on the command line, use it as the name of the event stream # otherwise generate it on the basis of the configured jwt_secret @@ -74,8 +74,30 @@ def create_runtime( headless_mode=headless_mode, ) - call_async_from_sync(runtime.connect) + logger.debug( + f'Runtime created with plugins: {[plugin.name for plugin in runtime.plugins]}' + ) + return runtime + + +def initialize_repository_for_runtime( + runtime: Runtime, + agent: Agent | None = None, + selected_repository: str | None = None, + github_token: SecretStr | None = None, +) -> str | None: + """Initialize the repository for the runtime. + + Args: + runtime: The runtime to initialize the repository for. + agent: (optional) The agent to load microagents for. + selected_repository: (optional) The GitHub repository to use. + github_token: (optional) The GitHub token to use. + + Returns: + The repository directory path if a repository was cloned, None otherwise. + """ # clone selected repository if provided repo_directory = None github_token = ( @@ -98,11 +120,7 @@ def create_runtime( agent.prompt_manager.load_microagents(microagents) agent.prompt_manager.set_repository_info(selected_repository, repo_directory) - logger.debug( - f'Runtime initialized with plugins: {[plugin.name for plugin in runtime.plugins]}' - ) - - return runtime + return repo_directory def create_agent(config: AppConfig) -> Agent: