diff --git a/docs/modules/usage/how-to/evaluation-harness.md b/docs/modules/usage/how-to/evaluation-harness.md
index 7209e9a567..32717675e3 100644
--- a/docs/modules/usage/how-to/evaluation-harness.md
+++ b/docs/modules/usage/how-to/evaluation-harness.md
@@ -136,7 +136,7 @@ To create an evaluation workflow for your benchmark, follow these steps:
 ```python
 def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
     config = get_config(instance, metadata)
-    runtime = create_runtime(config, sid=instance.instance_id)
+    runtime = create_runtime(config)
     initialize_runtime(runtime, instance)
 
     instruction = get_instruction(instance, metadata)
diff --git a/evaluation/EDA/run_infer.py b/evaluation/EDA/run_infer.py
index 2607ba10f8..81c7455e00 100644
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -118,7 +118,7 @@ def process_instance(
     instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    runtime = create_runtime(config, sid=instance['text'].strip())
+    runtime = create_runtime(config)
 
     state: State | None = asyncio.run(
         run_controller(
diff --git a/evaluation/agent_bench/run_infer.py b/evaluation/agent_bench/run_infer.py
index 3b50c526de..b851f86fa4 100644
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -209,7 +209,7 @@ def process_instance(
     # create sandbox and run the agent
     # =============================================
 
-    runtime: Runtime = create_runtime(config, sid=instance.instance_id)
+    runtime: Runtime = create_runtime(config)
 
     initialize_runtime(runtime, instance=instance)
 
diff --git a/evaluation/aider_bench/run_infer.py b/evaluation/aider_bench/run_infer.py
index fde483d0be..b4698a7c69 100644
--- a/evaluation/aider_bench/run_infer.py
+++ b/evaluation/aider_bench/run_infer.py
@@ -203,7 +203,7 @@ def process_instance(
     # create sandbox and run the agent
     # =============================================
 
-    runtime: Runtime = create_runtime(config, sid=str(instance.instance_id))
+    runtime: Runtime = create_runtime(config)
 
     initialize_runtime(runtime, instance=instance)
 
diff --git a/evaluation/biocoder/run_infer.py b/evaluation/biocoder/run_infer.py
index 5732f8ac65..3574089291 100644
--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -274,10 +274,7 @@ def process_instance(
     # NOTE: You can actually set slightly different instruction for different agents
     instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
 
-    # use a session id for concurrent evaluation
-    sid = instance.instance_id.replace('/', '__')
-
-    runtime = create_runtime(config, sid=sid)
+    runtime = create_runtime(config)
 
     initialize_runtime(runtime, instance)
 
diff --git a/evaluation/bird/run_infer.py b/evaluation/bird/run_infer.py
index 36d5a4a23b..aae58cc7d5 100644
--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -402,7 +402,7 @@ def process_instance(
     # NOTE: You can actually set slightly different instruction for different agents
     instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
 
-    runtime = create_runtime(config, sid=instance_id)
+    runtime = create_runtime(config)
     initialize_runtime(runtime, instance)
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
diff --git a/evaluation/browsing_delegation/run_infer.py b/evaluation/browsing_delegation/run_infer.py
index c84dac7bf9..c9fe2ebd18 100644
--- a/evaluation/browsing_delegation/run_infer.py
+++ b/evaluation/browsing_delegation/run_infer.py
@@ -72,7 +72,7 @@ def process_instance(
         f'NOTE: You should copy the "query" as is into the tag. DO NOT change ANYTHING in the query.'
     )
 
-    runtime = create_runtime(config, sid=instance.instance_id)
+    runtime = create_runtime(config)
 
     state: State | None = asyncio.run(
         run_controller(
diff --git a/evaluation/gaia/run_infer.py b/evaluation/gaia/run_infer.py
index f460372cc8..9f6f3884f5 100644
--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -141,7 +141,7 @@ def process_instance(
     instruction += AGENT_CLS_TO_INST_SUFFIX.get(metadata.agent_class, '')
     logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
 
-    runtime = create_runtime(config, sid=instance['instance_id'])
+    runtime = create_runtime(config)
     initialize_runtime(runtime, instance)
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
diff --git a/evaluation/gorilla/run_infer.py b/evaluation/gorilla/run_infer.py
index 092c58e610..ac49a8078d 100644
--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -80,7 +80,7 @@ def process_instance(
     # logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    runtime = create_runtime(config, sid=instance_id)
+    runtime = create_runtime(config)
     state: State | None = asyncio.run(
         run_controller(
             config=config,
diff --git a/evaluation/gpqa/run_infer.py b/evaluation/gpqa/run_infer.py
index dc90b88795..fe7ff4bf1c 100644
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -214,7 +214,7 @@ Again do not quit without reporting the answer first.
 
 Ok now its time to start solving the question. Good luck!
 """
-    runtime = create_runtime(config, sid=f'gptq_{str(instance.instance_id)}')
+    runtime = create_runtime(config)
 
     state: State | None = asyncio.run(
         run_controller(
diff --git a/evaluation/humanevalfix/run_infer.py b/evaluation/humanevalfix/run_infer.py
index 4bd3663c86..c6d643f94e 100644
--- a/evaluation/humanevalfix/run_infer.py
+++ b/evaluation/humanevalfix/run_infer.py
@@ -232,7 +232,7 @@ def process_instance(
     instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    runtime = create_runtime(config, sid=sid)
+    runtime = create_runtime(config)
     initialize_runtime(runtime, instance)
     state: State | None = asyncio.run(
         run_controller(
diff --git a/evaluation/logic_reasoning/run_infer.py b/evaluation/logic_reasoning/run_infer.py
index 2f0a306154..7fa6a5bb50 100644
--- a/evaluation/logic_reasoning/run_infer.py
+++ b/evaluation/logic_reasoning/run_infer.py
@@ -201,10 +201,7 @@ def process_instance(
     # NOTE: You can actually set slightly different instruction for different agents
     instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
 
-    # use a session id for concurrent evaluation
-    sid = instance['instance_id']
-
-    runtime = create_runtime(config, sid=sid)
+    runtime = create_runtime(config)
     initialize_runtime(runtime, instance)
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
diff --git a/evaluation/miniwob/run_infer.py b/evaluation/miniwob/run_infer.py
index 63a306ac81..1df7dac028 100644
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -126,7 +126,7 @@ def process_instance(
     else:
         logger.info(f'Starting evaluation for instance {env_id}.')
 
-    runtime = create_runtime(config, sid=env_id)
+    runtime = create_runtime(config)
     task_str = initialize_runtime(runtime)
     state: State | None = asyncio.run(
         run_controller(
diff --git a/evaluation/mint/run_infer.py b/evaluation/mint/run_infer.py
index fe2fd04203..481336d59d 100644
--- a/evaluation/mint/run_infer.py
+++ b/evaluation/mint/run_infer.py
@@ -175,7 +175,7 @@ def process_instance(
         },
     )
 
-    runtime = create_runtime(config, sid=instance.instance_id)
+    runtime = create_runtime(config)
     initialize_runtime(runtime)
 
     state: State | None = asyncio.run(
diff --git a/evaluation/ml_bench/run_infer.py b/evaluation/ml_bench/run_infer.py
index 0c487b4ce7..671c6350a4 100644
--- a/evaluation/ml_bench/run_infer.py
+++ b/evaluation/ml_bench/run_infer.py
@@ -211,9 +211,6 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
     else:
         logger.info(f'Starting evaluation for instance {instance["instance_id"]}.')
 
-    # Create a sandbox, using the instance ID and PID as the session ID to avoid conflicts
-    sid = str(instance['instance_id'])
-
     repo_url = instance['github']
     repo_name = repo_url.split('/')[-1]
     task_path = os.path.join('/workspace', repo_name, instance['path'][2:])
@@ -235,7 +232,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
     )
     instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
 
-    runtime = create_runtime(config, sid=sid)
+    runtime = create_runtime(config)
     initialize_runtime(runtime, instance)
 
     # Run the agent
diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py
index 525bc17e97..14429fc985 100644
--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@@ -127,7 +127,7 @@ def process_instance(
         test_result=instance['test_result'],
     )
 
-    runtime = create_runtime(config, sid=instance_id)
+    runtime = create_runtime(config)
 
     # Get patch and save it to /tmp/patch.diff
     with tempfile.TemporaryDirectory() as temp_dir:
diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
index 835f80ef55..9e4f1c4165 100644
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -365,7 +365,7 @@ def process_instance(
     else:
         logger.info(f'Starting evaluation for instance {instance.instance_id}.')
 
-    runtime = create_runtime(config, sid=instance.instance_id)
+    runtime = create_runtime(config)
 
     try:
         initialize_runtime(runtime, instance)
diff --git a/evaluation/toolqa/run_infer.py b/evaluation/toolqa/run_infer.py
index 2a9dba4114..8b3ebdc58b 100644
--- a/evaluation/toolqa/run_infer.py
+++ b/evaluation/toolqa/run_infer.py
@@ -102,7 +102,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
     instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
     logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
 
-    runtime = create_runtime(config, sid=qid)
+    runtime = create_runtime(config)
     initialize_runtime(runtime)
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
diff --git a/evaluation/webarena/run_infer.py b/evaluation/webarena/run_infer.py
index 159f49a800..26637e00be 100644
--- a/evaluation/webarena/run_infer.py
+++ b/evaluation/webarena/run_infer.py
@@ -142,7 +142,7 @@ def process_instance(
     else:
         logger.info(f'Starting evaluation for instance {env_id}.')
 
-    runtime = create_runtime(config, sid=env_id)
+    runtime = create_runtime(config)
     task_str = initialize_runtime(runtime)
 
     state: State | None = asyncio.run(
diff --git a/openhands/core/main.py b/openhands/core/main.py
index adda15aeab..b0702c943b 100644
--- a/openhands/core/main.py
+++ b/openhands/core/main.py
@@ -211,7 +211,7 @@ def generate_sid(config: AppConfig, session_name: str | None = None) -> str:
     jwt_secret = config.jwt_secret
 
     hash_str = hashlib.sha256(f'{session_name}{jwt_secret}'.encode('utf-8')).hexdigest()
-    return f'{session_name}_{hash_str[:16]}'
+    return f'{session_name}-{hash_str[:16]}'
 
 
 if __name__ == '__main__':
diff --git a/openhands/runtime/remote/runtime.py b/openhands/runtime/remote/runtime.py
index 5e2c35af8d..085a160597 100644
--- a/openhands/runtime/remote/runtime.py
+++ b/openhands/runtime/remote/runtime.py
@@ -126,7 +126,7 @@ class RemoteRuntime(Runtime):
                 timeout=5,
             )
         except Exception as e:
-            logger.error(f'Error while looking for remote runtime: {e}')
+            logger.debug(f'Error while looking for remote runtime: {e}')
             return False
 
         if response.status_code == 200: