rename commit0_bench to commit0 (#7124)
parent f12e9e94f7
commit ec087993f1
@@ -23,10 +23,10 @@ Make sure your Docker daemon is running, and you have ample disk space (at least

When the `run_infer.sh` script is started, it will automatically pull the `lite` split in Commit0. For example, for instance ID `commit-0/minitorch`, it will try to pull our pre-built docker image `wentingzhao/minitorch` from DockerHub. This image will be used to create an OpenHands runtime image in which the agent will operate.

```bash
-./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
+./evaluation/benchmarks/commit0/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]

# Example
-./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 16 100 8 wentingzhao/commit0_combined test
+./evaluation/benchmarks/commit0/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 16 100 8 wentingzhao/commit0_combined test
```

where `model_config` is mandatory, and the rest are optional.
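To make the image-pulling step above concrete: a minimal Python sketch of how an instance ID such as `commit-0/minitorch` could map onto the `wentingzhao/minitorch` DockerHub image named in the README. The helper and the naming rule are illustrative assumptions, not the benchmark's actual code.

```python
# Hypothetical helper: derive the DockerHub image for a Commit0 instance.
# Assumes the `wentingzhao/<repo>` naming shown in the example above;
# the real run_infer.py may derive names differently.
def instance_to_image(instance_id: str, prefix: str = 'wentingzhao') -> str:
    """Map 'commit-0/minitorch' -> 'wentingzhao/minitorch'."""
    repo = instance_id.split('/')[1]
    return f'{prefix}/{repo}'

assert instance_to_image('commit-0/minitorch') == 'wentingzhao/minitorch'
```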
@@ -53,7 +53,7 @@ Let's say you'd like to run 10 instances using `llm.eval_sonnet` and CodeActAgen

then your command would be:

```bash
-./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test
+./evaluation/benchmarks/commit0/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test
```

### Run Inference on `RemoteRuntime`
@@ -62,11 +62,11 @@ This is in beta. Fill out [this form](https://docs.google.com/forms/d/e/1FAIpQLS

```bash
-./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
+./evaluation/benchmarks/commit0/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]

# Example - this runs evaluation with CodeActAgent on 10 instances from the "wentingzhao/commit0_combined" test set, with a maximum of 30 iterations per instance and 1 worker running in parallel
ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="docker.io/wentingzhao" \
-./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test
+./evaluation/benchmarks/commit0/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test
```

To clean up all existing runtimes you've already started, run:
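The example relies on four environment variables to route evaluation through the remote runtime. A minimal pre-flight check, assuming only the variable names shown above (this check is not part of the benchmark scripts):

```python
# Hypothetical pre-flight check for the remote-runtime variables used above.
import os

REQUIRED = [
    'ALLHANDS_API_KEY',                # beta access key from the sign-up form
    'SANDBOX_REMOTE_RUNTIME_API_URL',  # e.g. https://runtime.eval.all-hands.dev
    'EVAL_DOCKER_IMAGE_PREFIX',        # e.g. docker.io/wentingzhao
]

def check_remote_runtime_env() -> None:
    # RUNTIME=remote is what switches evaluation off the local Docker runtime.
    if os.environ.get('RUNTIME') != 'remote':
        raise RuntimeError('Set RUNTIME=remote to use the remote runtime.')
    missing = [name for name in REQUIRED if not os.environ.get(name)]
    if missing:
        raise RuntimeError(f'Missing environment variables: {missing}')
```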
@@ -301,16 +301,6 @@ def complete_runtime(
    pytest_exit_code = obs.content.strip()
    # logger.info(f'Pytest exit code: {pytest_exit_code}')

-    # Read the test report
-    action = CmdRunAction(command='cat report.json')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        isinstance(obs, CmdOutputObservation),
-        f'Failed to read test report: {str(obs)}',
-    )
    # Get test IDs from instance
    repo_name = instance['repo'].split('/')[1]
    repo_name = repo_name.replace('.', '-')
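The two context lines above normalize the repository half of the instance ID for later use. The same transformation in isolation (the dotted repo name is a hypothetical example):

```python
# Mirrors the normalization in the context lines above: keep the repo part
# of 'commit-0/<repo>' and replace dots with dashes.
instance = {'repo': 'commit-0/some.dotted.repo'}  # hypothetical instance

repo_name = instance['repo'].split('/')[1]   # 'some.dotted.repo'
repo_name = repo_name.replace('.', '-')      # 'some-dotted-repo'
assert repo_name == 'some-dotted-repo'
```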
@@ -321,8 +311,20 @@ def complete_runtime(
    # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    test_ids = obs.content.strip().split('\n')

+    # Read the test report
+    action = CmdRunAction(command='cat report.json')
+    action.set_hard_timeout(600)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation),
+        f'Failed to read test report: {str(obs)}',
+    )
+    json_report = obs.content.strip()
+
    try:
-        report = json.loads(obs.content)
+        report = json.loads(json_report)
        tests = {x['nodeid']: x['call'] for x in report['tests'] if 'call' in x}

        # Calculate test statistics
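The relocated block reads `report.json` (a pytest-json-report file) into `json_report` before parsing, and the `tests` comprehension keeps only entries that reached the `call` phase. A self-contained sketch of that parsing plus the kind of tally the trailing comment refers to; the sample report and the pass count are illustrative assumptions, not the benchmark's exact computation:

```python
import json

# Illustrative pytest-json-report payload; real files carry more fields.
json_report = '''{
  "tests": [
    {"nodeid": "test_a.py::test_ok",  "call": {"outcome": "passed"}},
    {"nodeid": "test_a.py::test_bad", "call": {"outcome": "failed"}},
    {"nodeid": "test_b.py::test_skipped_before_call"}
  ]
}'''

report = json.loads(json_report)
# Same shape as the diff: drop entries that never reached the 'call' phase.
tests = {x['nodeid']: x['call'] for x in report['tests'] if 'call' in x}

# One plausible statistic (assumed, not the exact code in run_infer.py):
passed = sum(1 for call in tests.values() if call['outcome'] == 'passed')
print(f'{passed}/{len(tests)} tests passed')  # -> 1/2 tests passed
```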
@@ -84,7 +84,7 @@ fi

function run_eval() {
  local eval_note=$1
-  COMMAND="poetry run python evaluation/benchmarks/commit0_bench/run_infer.py \
+  COMMAND="poetry run python evaluation/benchmarks/commit0/run_infer.py \
    --agent-cls $AGENT \
    --llm-config $MODEL_CONFIG \
    --max-iterations $MAX_ITER \
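`run_eval` stitches the script's variables into a single `poetry run` command. A rough Python equivalent of launching the same entry point, restricted to the flags visible in this hunk (the real script appends more flags than shown here):

```python
# Hypothetical Python equivalent of the shell COMMAND assembly above,
# limited to the flags visible in this hunk.
import subprocess

def run_eval(agent: str, model_config: str, max_iter: int) -> None:
    command = [
        'poetry', 'run', 'python',
        'evaluation/benchmarks/commit0/run_infer.py',
        '--agent-cls', agent,
        '--llm-config', model_config,
        '--max-iterations', str(max_iter),
        # ...the real run_eval appends further flags here...
    ]
    subprocess.run(command, check=True)
```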