[eval] stop setting sid in eval (#4311)

2025-12-26 05:48:36 +08:00 · 2024-10-09 22:47:27 -05:00 · 2024-10-09 22:47:27 -05:00 · b23c7aab5a
commit b23c7aab5a
parent a6993b7bf5
21 changed files with 21 additions and 30 deletions
--- a/docs/modules/usage/how-to/evaluation-harness.md
+++ b/docs/modules/usage/how-to/evaluation-harness.md
@@ -136,7 +136,7 @@ To create an evaluation workflow for your benchmark, follow these steps:
   ```python
   def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
       config = get_config(instance, metadata)
-       runtime = create_runtime(config, sid=instance.instance_id)
+       runtime = create_runtime(config)
       initialize_runtime(runtime, instance)

       instruction = get_instruction(instance, metadata)
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -118,7 +118,7 @@ def process_instance(
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    runtime = create_runtime(config, sid=instance['text'].strip())
+    runtime = create_runtime(config)

    state: State | None = asyncio.run(
        run_controller(
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -209,7 +209,7 @@ def process_instance(
    # create sandbox and run the agent
    # =============================================

-    runtime: Runtime = create_runtime(config, sid=instance.instance_id)
+    runtime: Runtime = create_runtime(config)

    initialize_runtime(runtime, instance=instance)

--- a/evaluation/aider_bench/run_infer.py
+++ b/evaluation/aider_bench/run_infer.py
@@ -203,7 +203,7 @@ def process_instance(
    # create sandbox and run the agent
    # =============================================

-    runtime: Runtime = create_runtime(config, sid=str(instance.instance_id))
+    runtime: Runtime = create_runtime(config)

    initialize_runtime(runtime, instance=instance)

--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -274,10 +274,7 @@ def process_instance(
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

-    # use a session id for concurrent evaluation
-    sid = instance.instance_id.replace('/', '__')
-
-    runtime = create_runtime(config, sid=sid)
+    runtime = create_runtime(config)

    initialize_runtime(runtime, instance)

--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -402,7 +402,7 @@ def process_instance(
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

-    runtime = create_runtime(config, sid=instance_id)
+    runtime = create_runtime(config)
    initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
--- a/evaluation/browsing_delegation/run_infer.py
+++ b/evaluation/browsing_delegation/run_infer.py
@@ -72,7 +72,7 @@ def process_instance(
        f'NOTE: You should copy the "query" as is into the <execute_browse> tag. DO NOT change ANYTHING in the query.'
    )

-    runtime = create_runtime(config, sid=instance.instance_id)
+    runtime = create_runtime(config)

    state: State | None = asyncio.run(
        run_controller(
--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -141,7 +141,7 @@ def process_instance(
    instruction += AGENT_CLS_TO_INST_SUFFIX.get(metadata.agent_class, '')
    logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

-    runtime = create_runtime(config, sid=instance['instance_id'])
+    runtime = create_runtime(config)
    initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -80,7 +80,7 @@ def process_instance(
    # logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    runtime = create_runtime(config, sid=instance_id)
+    runtime = create_runtime(config)
    state: State | None = asyncio.run(
        run_controller(
            config=config,
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -214,7 +214,7 @@ Again do not quit without reporting the answer first.
 Ok now its time to start solving the question. Good luck!
 """

-    runtime = create_runtime(config, sid=f'gptq_{str(instance.instance_id)}')
+    runtime = create_runtime(config)

    state: State | None = asyncio.run(
        run_controller(
--- a/evaluation/humanevalfix/run_infer.py
+++ b/evaluation/humanevalfix/run_infer.py
@@ -232,7 +232,7 @@ def process_instance(
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    runtime = create_runtime(config, sid=sid)
+    runtime = create_runtime(config)
    initialize_runtime(runtime, instance)
    state: State | None = asyncio.run(
        run_controller(
--- a/evaluation/logic_reasoning/run_infer.py
+++ b/evaluation/logic_reasoning/run_infer.py
@@ -201,10 +201,7 @@ def process_instance(
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

-    # use a session id for concurrent evaluation
-    sid = instance['instance_id']
-
-    runtime = create_runtime(config, sid=sid)
+    runtime = create_runtime(config)
    initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -126,7 +126,7 @@ def process_instance(
    else:
        logger.info(f'Starting evaluation for instance {env_id}.')

-    runtime = create_runtime(config, sid=env_id)
+    runtime = create_runtime(config)
    task_str = initialize_runtime(runtime)
    state: State | None = asyncio.run(
        run_controller(
--- a/evaluation/mint/run_infer.py
+++ b/evaluation/mint/run_infer.py
@@ -175,7 +175,7 @@ def process_instance(
        },
    )

-    runtime = create_runtime(config, sid=instance.instance_id)
+    runtime = create_runtime(config)
    initialize_runtime(runtime)

    state: State | None = asyncio.run(
--- a/evaluation/ml_bench/run_infer.py
+++ b/evaluation/ml_bench/run_infer.py
@@ -211,9 +211,6 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
    else:
        logger.info(f'Starting evaluation for instance {instance["instance_id"]}.')

-    # Create a sandbox, using the instance ID and PID as the session ID to avoid conflicts
-    sid = str(instance['instance_id'])
-
    repo_url = instance['github']
    repo_name = repo_url.split('/')[-1]
    task_path = os.path.join('/workspace', repo_name, instance['path'][2:])
@@ -235,7 +232,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
    )
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

-    runtime = create_runtime(config, sid=sid)
+    runtime = create_runtime(config)
    initialize_runtime(runtime, instance)

    # Run the agent
--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@@ -127,7 +127,7 @@ def process_instance(
            test_result=instance['test_result'],
        )

-    runtime = create_runtime(config, sid=instance_id)
+    runtime = create_runtime(config)

    # Get patch and save it to /tmp/patch.diff
    with tempfile.TemporaryDirectory() as temp_dir:
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -365,7 +365,7 @@ def process_instance(
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

-    runtime = create_runtime(config, sid=instance.instance_id)
+    runtime = create_runtime(config)

    try:
        initialize_runtime(runtime, instance)
--- a/evaluation/toolqa/run_infer.py
+++ b/evaluation/toolqa/run_infer.py
@@ -102,7 +102,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
    logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

-    runtime = create_runtime(config, sid=qid)
+    runtime = create_runtime(config)
    initialize_runtime(runtime)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
--- a/evaluation/webarena/run_infer.py
+++ b/evaluation/webarena/run_infer.py
@@ -142,7 +142,7 @@ def process_instance(
    else:
        logger.info(f'Starting evaluation for instance {env_id}.')

-    runtime = create_runtime(config, sid=env_id)
+    runtime = create_runtime(config)
    task_str = initialize_runtime(runtime)

    state: State | None = asyncio.run(
--- a/openhands/core/main.py
+++ b/openhands/core/main.py
@@ -211,7 +211,7 @@ def generate_sid(config: AppConfig, session_name: str | None = None) -> str:
    jwt_secret = config.jwt_secret

    hash_str = hashlib.sha256(f'{session_name}{jwt_secret}'.encode('utf-8')).hexdigest()
-    return f'{session_name}_{hash_str[:16]}'
+    return f'{session_name}-{hash_str[:16]}'


 if __name__ == '__main__':
--- a/openhands/runtime/remote/runtime.py
+++ b/openhands/runtime/remote/runtime.py
@@ -126,7 +126,7 @@ class RemoteRuntime(Runtime):
                timeout=5,
            )
        except Exception as e:
-            logger.error(f'Error while looking for remote runtime: {e}')
+            logger.debug(f'Error while looking for remote runtime: {e}')
            return False

        if response.status_code == 200: