rename commit0_bench to commit0 (#7124)

Nan Jiang 2025-03-05 18:55:39 -08:00 committed by GitHub
parent f12e9e94f7
commit ec087993f1
3 changed files with 19 additions and 17 deletions


@@ -23,10 +23,10 @@ Make sure your Docker daemon is running, and you have ample disk space (at least
When the `run_infer.sh` script is started, it will automatically pull the `lite` split of Commit0. For example, for instance ID `commit-0/minitorch`, it will try to pull our pre-built Docker image `wentingzhao/minitorch` from DockerHub. This image will be used to create an OpenHands runtime image in which the agent will operate.
```bash
-./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
+./evaluation/benchmarks/commit0/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
# Example
-./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 16 100 8 wentingzhao/commit0_combined test
+./evaluation/benchmarks/commit0/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 16 100 8 wentingzhao/commit0_combined test
```
where `model_config` is mandatory, and the rest are optional.
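For reference, here is a minimal sketch of the instance-ID-to-image mapping described above. The `base_image_for` helper and the hard-coded `wentingzhao` namespace are illustrative assumptions for this sketch, not part of the benchmark's actual code:

```python
# Illustrative sketch only: maps a Commit0 instance ID to the DockerHub image
# that run_infer.sh pulls, assuming the image name is simply the repo part of
# the instance ID under the "wentingzhao" namespace (as in the example above).
def base_image_for(instance_id: str, namespace: str = 'wentingzhao') -> str:
    repo_name = instance_id.split('/')[1]
    return f'{namespace}/{repo_name}'


print(base_image_for('commit-0/minitorch'))  # wentingzhao/minitorch
```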
@@ -53,7 +53,7 @@ Let's say you'd like to run 10 instances using `llm.eval_sonnet` and CodeActAgen
then your command would be:
```bash
-./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test
+./evaluation/benchmarks/commit0/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test
```
### Run Inference on `RemoteRuntime`
@@ -62,11 +62,11 @@ This is in beta. Fill out [this form](https://docs.google.com/forms/d/e/1FAIpQLS
```bash
-./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
+./evaluation/benchmarks/commit0/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
# Example - This runs evaluation on CodeActAgent for 10 instances from the "wentingzhao/commit0_combined" test set, with a maximum of 30 iterations per instance and 1 worker running in parallel
ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="docker.io/wentingzhao" \
-./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test
+./evaluation/benchmarks/commit0/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test
```
To clean up all existing runtimes you've already started, run:


@@ -301,16 +301,6 @@ def complete_runtime(
pytest_exit_code = obs.content.strip()
# logger.info(f'Pytest exit code: {pytest_exit_code}')
-# Read the test report
-action = CmdRunAction(command='cat report.json')
-action.set_hard_timeout(600)
-logger.info(action, extra={'msg_type': 'ACTION'})
-obs = runtime.run_action(action)
-# logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-assert_and_raise(
-    isinstance(obs, CmdOutputObservation),
-    f'Failed to read test report: {str(obs)}',
-)
# Get test IDs from instance
repo_name = instance['repo'].split('/')[1]
repo_name = repo_name.replace('.', '-')
@@ -321,8 +311,20 @@ def complete_runtime(
# logger.info(obs, extra={'msg_type': 'OBSERVATION'})
test_ids = obs.content.strip().split('\n')
+# Read the test report
+action = CmdRunAction(command='cat report.json')
+action.set_hard_timeout(600)
+logger.info(action, extra={'msg_type': 'ACTION'})
+obs = runtime.run_action(action)
+# logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+assert_and_raise(
+    isinstance(obs, CmdOutputObservation),
+    f'Failed to read test report: {str(obs)}',
+)
+json_report = obs.content.strip()
try:
-report = json.loads(obs.content)
+report = json.loads(json_report)
tests = {x['nodeid']: x['call'] for x in report['tests'] if 'call' in x}
# Calculate test statistics
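The moved block above ends in the same JSON-parsing step as before; here is a minimal, self-contained sketch of that parsing logic, assuming a pytest-json-report-style `report.json`. The sample payload is invented purely for illustration, not output from a real Commit0 run:

```python
import json

# Hypothetical report.json contents, shaped like pytest-json-report output;
# invented here only to exercise the parsing step from complete_runtime.
json_report = """
{
  "tests": [
    {"nodeid": "tests/test_ops.py::test_add", "call": {"outcome": "passed"}},
    {"nodeid": "tests/test_ops.py::test_mul", "call": {"outcome": "failed"}},
    {"nodeid": "tests/test_ops.py::test_skip", "setup": {"outcome": "skipped"}}
  ]
}
"""

report = json.loads(json_report)
# Keep only tests that reached the call phase, keyed by node ID,
# mirroring the dict comprehension in the diff above.
tests = {x['nodeid']: x['call'] for x in report['tests'] if 'call' in x}
print(tests)  # the skipped test never reached 'call', so it is dropped
```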


@@ -84,7 +84,7 @@ fi
function run_eval() {
local eval_note=$1
COMMAND="poetry run python evaluation/benchmarks/commit0_bench/run_infer.py \
COMMAND="poetry run python evaluation/benchmarks/commit0/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations $MAX_ITER \