mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
fix(eval): add runtime.connect to all eval harness (#4565)
This commit is contained in:
parent
7340b78962
commit
1f23dc89b6
@ -134,9 +134,11 @@ To create an evaluation workflow for your benchmark, follow these steps:
|
||||
|
||||
4. Create a function to process each instance:
|
||||
```python
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
|
||||
config = get_config(instance, metadata)
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
initialize_runtime(runtime, instance)
|
||||
|
||||
instruction = get_instruction(instance, metadata)
|
||||
|
||||
@ -23,6 +23,7 @@ from openhands.core.config import (
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import MessageAction
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
game = None
|
||||
|
||||
@ -119,6 +120,7 @@ def process_instance(
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
|
||||
state: State | None = asyncio.run(
|
||||
run_controller(
|
||||
|
||||
@ -33,6 +33,7 @@ from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
|
||||
from openhands.events.observation import CmdOutputObservation
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
|
||||
def get_config(
|
||||
@ -210,6 +211,7 @@ def process_instance(
|
||||
# =============================================
|
||||
|
||||
runtime: Runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
|
||||
initialize_runtime(runtime, instance=instance)
|
||||
|
||||
|
||||
@ -33,6 +33,7 @@ from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import CmdRunAction, MessageAction
|
||||
from openhands.events.observation import CmdOutputObservation
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
# Configure visibility of unit tests to the Agent.
|
||||
USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'false').lower() == 'true'
|
||||
@ -207,6 +208,7 @@ def process_instance(
|
||||
# =============================================
|
||||
|
||||
runtime: Runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
|
||||
initialize_runtime(runtime, instance=instance)
|
||||
|
||||
|
||||
@ -30,6 +30,7 @@ from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import CmdRunAction, MessageAction
|
||||
from openhands.events.observation import CmdOutputObservation
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': functools.partial(
|
||||
@ -275,7 +276,7 @@ def process_instance(
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
|
||||
|
||||
runtime = create_runtime(config)
|
||||
|
||||
call_async_from_sync(runtime.connect)
|
||||
initialize_runtime(runtime, instance)
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
|
||||
@ -33,6 +33,7 @@ from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import CmdRunAction, MessageAction
|
||||
from openhands.events.observation import CmdOutputObservation
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
|
||||
def codeact_user_response(state: State) -> str:
|
||||
@ -403,6 +404,7 @@ def process_instance(
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
|
||||
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
initialize_runtime(runtime, instance)
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
|
||||
@ -29,6 +29,7 @@ from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
|
||||
from openhands.events.observation import CmdOutputObservation
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
DATASET_CACHE_DIR = os.path.join(os.path.dirname(__file__), 'data')
|
||||
|
||||
@ -142,6 +143,7 @@ def process_instance(
|
||||
logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
initialize_runtime(runtime, instance)
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
|
||||
@ -25,6 +25,7 @@ from openhands.core.config import (
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import MessageAction
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
@ -81,6 +82,7 @@ def process_instance(
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
state: State | None = asyncio.run(
|
||||
run_controller(
|
||||
config=config,
|
||||
|
||||
@ -48,6 +48,7 @@ from openhands.events.action import (
|
||||
MessageAction,
|
||||
)
|
||||
from openhands.events.observation import Observation
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
ACTION_FORMAT = """
|
||||
<<FINAL_ANSWER||
|
||||
@ -215,7 +216,7 @@ Ok now its time to start solving the question. Good luck!
|
||||
"""
|
||||
|
||||
runtime = create_runtime(config)
|
||||
|
||||
call_async_from_sync(runtime.connect)
|
||||
state: State | None = asyncio.run(
|
||||
run_controller(
|
||||
config=config,
|
||||
|
||||
@ -38,6 +38,7 @@ from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import CmdRunAction, MessageAction
|
||||
from openhands.events.observation import CmdOutputObservation
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
IMPORT_HELPER = {
|
||||
'python': [
|
||||
@ -233,6 +234,7 @@ def process_instance(
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
initialize_runtime(runtime, instance)
|
||||
state: State | None = asyncio.run(
|
||||
run_controller(
|
||||
|
||||
@ -25,6 +25,7 @@ from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import MessageAction
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
FAKE_RESPONSES = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
@ -101,6 +102,7 @@ def process_instance(
|
||||
# =============================================
|
||||
|
||||
runtime: Runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
|
||||
test_class.initialize_runtime(runtime)
|
||||
|
||||
|
||||
@ -30,6 +30,7 @@ from openhands.events.action import (
|
||||
)
|
||||
from openhands.events.observation import CmdOutputObservation
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
@ -202,6 +203,7 @@ def process_instance(
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
|
||||
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
initialize_runtime(runtime, instance)
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
|
||||
@ -35,6 +35,7 @@ from openhands.runtime.browser.browser_env import (
|
||||
BROWSER_EVAL_GET_GOAL_ACTION,
|
||||
BROWSER_EVAL_GET_REWARDS_ACTION,
|
||||
)
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
|
||||
|
||||
@ -127,6 +128,7 @@ def process_instance(
|
||||
logger.info(f'Starting evaluation for instance {env_id}.')
|
||||
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
task_str = initialize_runtime(runtime)
|
||||
state: State | None = asyncio.run(
|
||||
run_controller(
|
||||
|
||||
@ -33,6 +33,7 @@ from openhands.events.action import (
|
||||
)
|
||||
from openhands.events.observation import CmdOutputObservation
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
|
||||
def codeact_user_response_mint(state: State, task: Task, task_config: dict[str, int]):
|
||||
@ -176,6 +177,7 @@ def process_instance(
|
||||
)
|
||||
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
initialize_runtime(runtime)
|
||||
|
||||
state: State | None = asyncio.run(
|
||||
|
||||
@ -42,6 +42,7 @@ from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import CmdRunAction, MessageAction
|
||||
from openhands.events.observation import CmdOutputObservation
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
config = load_app_config()
|
||||
|
||||
@ -233,6 +234,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
|
||||
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
initialize_runtime(runtime, instance)
|
||||
|
||||
# Run the agent
|
||||
|
||||
@ -28,6 +28,7 @@ from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.core.main import create_runtime
|
||||
from openhands.events.action import CmdRunAction
|
||||
from openhands.events.observation import CmdOutputObservation
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
# TODO: migrate all swe-bench docker to ghcr.io/openhands
|
||||
DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
|
||||
@ -128,7 +129,7 @@ def process_instance(
|
||||
)
|
||||
|
||||
runtime = create_runtime(config)
|
||||
|
||||
call_async_from_sync(runtime.connect)
|
||||
# Get patch and save it to /tmp/patch.diff
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
# Patch file
|
||||
|
||||
@ -35,6 +35,7 @@ from openhands.events.observation import CmdOutputObservation, ErrorObservation
|
||||
from openhands.events.serialization.event import event_to_dict
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.runtime.utils.shutdown_listener import sleep_if_should_continue
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
|
||||
USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
|
||||
@ -380,6 +381,7 @@ def process_instance(
|
||||
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
|
||||
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
|
||||
try:
|
||||
initialize_runtime(runtime, instance)
|
||||
|
||||
@ -26,6 +26,7 @@ from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import CmdRunAction, MessageAction
|
||||
from openhands.events.observation import CmdOutputObservation
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
@ -103,6 +104,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
|
||||
logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
initialize_runtime(runtime)
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
|
||||
@ -35,6 +35,7 @@ from openhands.runtime.browser.browser_env import (
|
||||
BROWSER_EVAL_GET_GOAL_ACTION,
|
||||
BROWSER_EVAL_GET_REWARDS_ACTION,
|
||||
)
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
|
||||
|
||||
@ -143,6 +144,7 @@ def process_instance(
|
||||
logger.info(f'Starting evaluation for instance {env_id}.')
|
||||
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
task_str = initialize_runtime(runtime)
|
||||
|
||||
state: State | None = asyncio.run(
|
||||
|
||||
@ -122,7 +122,7 @@ async def run_controller(
|
||||
|
||||
if runtime is None:
|
||||
runtime = create_runtime(config, sid=sid)
|
||||
await runtime.connect()
|
||||
await runtime.connect()
|
||||
|
||||
event_stream = runtime.event_stream
|
||||
# restore cli session if enabled
|
||||
|
||||
@ -4,6 +4,7 @@ This module monitors the app for shutdown signals
|
||||
|
||||
import asyncio
|
||||
import signal
|
||||
import threading
|
||||
import time
|
||||
from types import FrameType
|
||||
|
||||
@ -29,8 +30,11 @@ def _register_signal_handlers():
|
||||
if _should_exit is not None:
|
||||
return
|
||||
_should_exit = False
|
||||
for sig in HANDLED_SIGNALS:
|
||||
_register_signal_handler(sig)
|
||||
|
||||
# Check if we're in the main thread of the main interpreter
|
||||
if threading.current_thread() is threading.main_thread():
|
||||
for sig in HANDLED_SIGNALS:
|
||||
_register_signal_handler(sig)
|
||||
|
||||
|
||||
def should_exit() -> bool:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user