From 678436da30333240f01780f3fa5ac162b0a1d13a Mon Sep 17 00:00:00 2001 From: OpenHands Date: Mon, 25 Nov 2024 08:35:52 -0500 Subject: [PATCH] Fix issue #5222: [Refactor]: Refactor the evaluation directory (#5223) Co-authored-by: Engel Nyst --- .github/workflows/eval-runner.yml | 6 +-- .../usage/how-to/evaluation-harness.md | 2 +- .../usage/how-to/evaluation-harness.md | 2 +- .../usage/how-to/evaluation-harness.md | 2 +- evaluation/README.md | 38 ++++++++++--------- evaluation/{ => benchmarks}/EDA/README.md | 4 +- evaluation/{ => benchmarks}/EDA/game.py | 0 evaluation/{ => benchmarks}/EDA/run_infer.py | 2 +- .../{ => benchmarks}/EDA/scripts/run_infer.sh | 2 +- .../{ => benchmarks}/agent_bench/README.md | 6 +-- .../{ => benchmarks}/agent_bench/__init__.py | 0 .../{ => benchmarks}/agent_bench/helper.py | 0 .../{ => benchmarks}/agent_bench/run_infer.py | 2 +- .../agent_bench/scripts/run_infer.sh | 2 +- .../agent_bench/scripts/summarise_results.py | 0 .../{ => benchmarks}/aider_bench/README.md | 14 +++---- .../aider_bench/create_dataset.py | 0 .../{ => benchmarks}/aider_bench/helper.py | 0 .../{ => benchmarks}/aider_bench/run_infer.py | 2 +- .../aider_bench/scripts/run_infer.sh | 2 +- .../aider_bench/scripts/summarize_results.py | 0 .../{ => benchmarks}/biocoder/README.md | 4 +- .../{ => benchmarks}/biocoder/run_infer.py | 2 +- .../biocoder/scripts/run_infer.sh | 2 +- .../scripts/setup/copy_changed_code.py | 0 .../biocoder/scripts/setup/remove_code.py | 0 evaluation/{ => benchmarks}/biocoder/utils.py | 0 evaluation/{ => benchmarks}/bird/README.md | 2 +- evaluation/{ => benchmarks}/bird/__init__.py | 0 evaluation/{ => benchmarks}/bird/run_infer.py | 0 .../bird/scripts/run_infer.sh | 2 +- .../browsing_delegation/README.md | 2 +- .../browsing_delegation/run_infer.py | 0 .../browsing_delegation/scripts/run_infer.sh | 2 +- .../{ => benchmarks}/commit0_bench/README.md | 12 +++--- .../commit0_bench/run_infer.py | 0 .../scripts/cleanup_remote_runtime.sh | 0 .../commit0_bench/scripts/run_infer.sh | 2 +- .../{ => benchmarks}/discoverybench/README.md | 4 +- .../discoverybench/eval_utils/README.md | 0 .../discoverybench/eval_utils/__init__.py | 0 .../eval_utils/eval_w_subhypo_gen.py | 0 .../discoverybench/eval_utils/lm_utils.py | 0 .../eval_utils/openai_helpers.py | 0 .../eval_utils/openai_semantic_gen_prompts.py | 0 .../eval_utils/response_parser.py | 0 .../discoverybench/run_infer.py | 4 +- .../discoverybench/scripts/run_infer.sh | 2 +- evaluation/{ => benchmarks}/gaia/README.md | 10 ++--- evaluation/{ => benchmarks}/gaia/get_score.py | 0 evaluation/{ => benchmarks}/gaia/run_infer.py | 2 +- evaluation/{ => benchmarks}/gaia/scorer.py | 0 .../gaia/scripts/run_infer.sh | 2 +- evaluation/{ => benchmarks}/gorilla/README.md | 4 +- .../{ => benchmarks}/gorilla/ast_eval_hf.py | 0 .../{ => benchmarks}/gorilla/ast_eval_tf.py | 0 .../{ => benchmarks}/gorilla/ast_eval_th.py | 0 .../{ => benchmarks}/gorilla/run_infer.py | 2 +- .../gorilla/scripts/run_infer.sh | 2 +- evaluation/{ => benchmarks}/gorilla/utils.py | 0 evaluation/{ => benchmarks}/gpqa/README.md | 2 +- evaluation/{ => benchmarks}/gpqa/__init__.py | 0 evaluation/{ => benchmarks}/gpqa/run_infer.py | 0 .../gpqa/scripts/run_infer.sh | 2 +- .../{ => benchmarks}/humanevalfix/README.md | 2 +- .../{ => benchmarks}/humanevalfix/__init__.py | 0 .../humanevalfix/run_infer.py | 0 .../humanevalfix/scripts/run_infer.sh | 2 +- .../logic_reasoning/.cache_program/facts.kfb | 0 .../logic_reasoning/.cache_program/rules.krb | 0 .../logic_reasoning/Dockerfile | 0 
.../logic_reasoning/README.md | 2 +- .../logic_reasoning/__init__.py | 0 .../logic_reasoning/instruction.txt | 0 .../logic_reasoning/logic_inference.py | 0 .../logic_reasoning/run_infer.py | 0 .../logic_reasoning/scripts/run_infer.sh | 2 +- .../{ => benchmarks}/miniwob/Dockerfile | 0 evaluation/{ => benchmarks}/miniwob/README.md | 8 ++-- .../miniwob/get_avg_reward.py | 0 .../{ => benchmarks}/miniwob/run_infer.py | 0 .../miniwob/scripts/run_infer.sh | 2 +- evaluation/{ => benchmarks}/mint/.gitignore | 0 evaluation/{ => benchmarks}/mint/Dockerfile | 0 evaluation/{ => benchmarks}/mint/README.md | 6 +-- .../{ => benchmarks}/mint/config_variables.py | 0 evaluation/{ => benchmarks}/mint/datatypes.py | 0 evaluation/{ => benchmarks}/mint/env.py | 0 .../{ => benchmarks}/mint/prompts/__init__.py | 0 .../mint/prompts/template_with_tool.txt | 0 .../{ => benchmarks}/mint/requirements.txt | 0 evaluation/{ => benchmarks}/mint/run_infer.py | 8 ++-- .../mint/scripts/run_infer.sh | 0 .../{ => benchmarks}/mint/tasks/__init__.py | 6 +-- .../{ => benchmarks}/mint/tasks/base.py | 0 .../{ => benchmarks}/mint/tasks/codegen.py | 2 +- .../humaneval/with_tool.txt | 0 .../in_context_examples/mbpp/with_tool.txt | 0 .../reasoning/with_tool.txt | 0 .../{ => benchmarks}/mint/tasks/reasoning.py | 0 evaluation/{ => benchmarks}/mint/utils.py | 0 .../{ => benchmarks}/ml_bench/README.md | 14 +++---- .../{ => benchmarks}/ml_bench/__init__.py | 0 .../{ => benchmarks}/ml_bench/run_analysis.py | 0 .../{ => benchmarks}/ml_bench/run_infer.py | 0 .../ml_bench/scripts/cleanup.sh | 0 .../ml_bench/scripts/run_analysis.sh | 2 +- .../ml_bench/scripts/run_infer.sh | 2 +- .../ml_bench/scripts/summarise_results.py | 0 .../scienceagentbench/Dockerfile | 0 .../scienceagentbench/Dockerfile.evaluator | 0 .../scienceagentbench/README.md | 4 +- .../scienceagentbench/post_proc.py | 0 .../scienceagentbench/run_infer.py | 0 .../scienceagentbench/scripts/run_infer.sh | 2 +- .../{ => benchmarks}/swe_bench/README.md | 34 ++++++++--------- .../{ => benchmarks}/swe_bench/__init__.py | 0 .../{ => benchmarks}/swe_bench/eval_infer.py | 2 +- .../examples/example_agent_output.jsonl | 0 .../examples/example_model_output.json | 0 .../{ => benchmarks}/swe_bench/prompt.py | 0 .../{ => benchmarks}/swe_bench/run_infer.py | 2 +- .../scripts/cleanup_remote_runtime.sh | 0 .../all-swebench-full-instance-images.txt | 0 .../all-swebench-lite-instance-images.txt | 0 .../scripts/docker/pull_all_eval_docker.sh | 0 .../docker/push_docker_instance_images.py | 2 +- .../scripts/docker/push_eval_docker.sh | 0 .../swe_bench/scripts/eval/compare_outputs.py | 0 ...onvert_oh_folder_to_swebench_submission.sh | 2 +- .../scripts/eval/convert_oh_output_to_md.py | 2 +- .../eval/convert_oh_output_to_swe_json.py | 2 +- .../scripts/eval/download_gold_patch.py | 0 .../scripts/eval/summarize_outputs.py | 0 .../scripts/eval/update_output_with_eval.py | 0 .../swe_bench/scripts/eval_infer.sh | 4 +- .../swe_bench/scripts/eval_infer_remote.sh | 4 +- .../swe_bench/scripts/run_infer.sh | 2 +- .../scripts/setup/compare_patch_filename.py | 0 .../scripts/setup/instance_swe_entry.sh | 0 .../scripts/setup/prepare_swe_utils.sh | 2 +- .../swe_bench/scripts/setup/swe_entry.sh | 0 evaluation/{ => benchmarks}/toolqa/Dockerfile | 0 evaluation/{ => benchmarks}/toolqa/README.md | 4 +- .../{ => benchmarks}/toolqa/run_infer.py | 2 +- .../toolqa/scripts/run_infer.sh | 2 +- evaluation/{ => benchmarks}/toolqa/utils.py | 0 .../{ => benchmarks}/webarena/README.md | 4 +- .../{ => benchmarks}/webarena/__init__.py | 0 
.../webarena/get_success_rate.py | 0 .../{ => benchmarks}/webarena/run_infer.py | 0 .../webarena/scripts/run_infer.sh | 4 +- 152 files changed, 147 insertions(+), 143 deletions(-) rename evaluation/{ => benchmarks}/EDA/README.md (88%) rename evaluation/{ => benchmarks}/EDA/game.py (100%) rename evaluation/{ => benchmarks}/EDA/run_infer.py (99%) rename evaluation/{ => benchmarks}/EDA/scripts/run_infer.sh (95%) rename evaluation/{ => benchmarks}/agent_bench/README.md (80%) rename evaluation/{ => benchmarks}/agent_bench/__init__.py (100%) rename evaluation/{ => benchmarks}/agent_bench/helper.py (100%) rename evaluation/{ => benchmarks}/agent_bench/run_infer.py (99%) rename evaluation/{ => benchmarks}/agent_bench/scripts/run_infer.sh (84%) rename evaluation/{ => benchmarks}/agent_bench/scripts/summarise_results.py (100%) rename evaluation/{ => benchmarks}/aider_bench/README.md (80%) rename evaluation/{ => benchmarks}/aider_bench/create_dataset.py (100%) rename evaluation/{ => benchmarks}/aider_bench/helper.py (100%) rename evaluation/{ => benchmarks}/aider_bench/run_infer.py (99%) rename evaluation/{ => benchmarks}/aider_bench/scripts/run_infer.sh (89%) rename evaluation/{ => benchmarks}/aider_bench/scripts/summarize_results.py (100%) rename evaluation/{ => benchmarks}/biocoder/README.md (92%) rename evaluation/{ => benchmarks}/biocoder/run_infer.py (99%) rename evaluation/{ => benchmarks}/biocoder/scripts/run_infer.sh (92%) rename evaluation/{ => benchmarks}/biocoder/scripts/setup/copy_changed_code.py (100%) rename evaluation/{ => benchmarks}/biocoder/scripts/setup/remove_code.py (100%) rename evaluation/{ => benchmarks}/biocoder/utils.py (100%) rename evaluation/{ => benchmarks}/bird/README.md (99%) rename evaluation/{ => benchmarks}/bird/__init__.py (100%) rename evaluation/{ => benchmarks}/bird/run_infer.py (100%) rename evaluation/{ => benchmarks}/bird/scripts/run_infer.sh (92%) rename evaluation/{ => benchmarks}/browsing_delegation/README.md (91%) rename evaluation/{ => benchmarks}/browsing_delegation/run_infer.py (100%) rename evaluation/{ => benchmarks}/browsing_delegation/scripts/run_infer.sh (90%) rename evaluation/{ => benchmarks}/commit0_bench/README.md (81%) rename evaluation/{ => benchmarks}/commit0_bench/run_infer.py (100%) rename evaluation/{ => benchmarks}/commit0_bench/scripts/cleanup_remote_runtime.sh (100%) rename evaluation/{ => benchmarks}/commit0_bench/scripts/run_infer.sh (97%) rename evaluation/{ => benchmarks}/discoverybench/README.md (90%) rename evaluation/{ => benchmarks}/discoverybench/eval_utils/README.md (100%) rename evaluation/{ => benchmarks}/discoverybench/eval_utils/__init__.py (100%) rename evaluation/{ => benchmarks}/discoverybench/eval_utils/eval_w_subhypo_gen.py (100%) rename evaluation/{ => benchmarks}/discoverybench/eval_utils/lm_utils.py (100%) rename evaluation/{ => benchmarks}/discoverybench/eval_utils/openai_helpers.py (100%) rename evaluation/{ => benchmarks}/discoverybench/eval_utils/openai_semantic_gen_prompts.py (100%) rename evaluation/{ => benchmarks}/discoverybench/eval_utils/response_parser.py (100%) rename evaluation/{ => benchmarks}/discoverybench/run_infer.py (99%) rename evaluation/{ => benchmarks}/discoverybench/scripts/run_infer.sh (91%) rename evaluation/{ => benchmarks}/gaia/README.md (77%) rename evaluation/{ => benchmarks}/gaia/get_score.py (100%) rename evaluation/{ => benchmarks}/gaia/run_infer.py (99%) rename evaluation/{ => benchmarks}/gaia/scorer.py (100%) rename evaluation/{ => benchmarks}/gaia/scripts/run_infer.sh (93%) 
rename evaluation/{ => benchmarks}/gorilla/README.md (88%) rename evaluation/{ => benchmarks}/gorilla/ast_eval_hf.py (100%) rename evaluation/{ => benchmarks}/gorilla/ast_eval_tf.py (100%) rename evaluation/{ => benchmarks}/gorilla/ast_eval_th.py (100%) rename evaluation/{ => benchmarks}/gorilla/run_infer.py (98%) rename evaluation/{ => benchmarks}/gorilla/scripts/run_infer.sh (93%) rename evaluation/{ => benchmarks}/gorilla/utils.py (100%) rename evaluation/{ => benchmarks}/gpqa/README.md (94%) rename evaluation/{ => benchmarks}/gpqa/__init__.py (100%) rename evaluation/{ => benchmarks}/gpqa/run_infer.py (100%) rename evaluation/{ => benchmarks}/gpqa/scripts/run_infer.sh (93%) rename evaluation/{ => benchmarks}/humanevalfix/README.md (99%) rename evaluation/{ => benchmarks}/humanevalfix/__init__.py (100%) rename evaluation/{ => benchmarks}/humanevalfix/run_infer.py (100%) rename evaluation/{ => benchmarks}/humanevalfix/scripts/run_infer.sh (97%) rename evaluation/{ => benchmarks}/logic_reasoning/.cache_program/facts.kfb (100%) rename evaluation/{ => benchmarks}/logic_reasoning/.cache_program/rules.krb (100%) rename evaluation/{ => benchmarks}/logic_reasoning/Dockerfile (100%) rename evaluation/{ => benchmarks}/logic_reasoning/README.md (83%) rename evaluation/{ => benchmarks}/logic_reasoning/__init__.py (100%) rename evaluation/{ => benchmarks}/logic_reasoning/instruction.txt (100%) rename evaluation/{ => benchmarks}/logic_reasoning/logic_inference.py (100%) rename evaluation/{ => benchmarks}/logic_reasoning/run_infer.py (100%) rename evaluation/{ => benchmarks}/logic_reasoning/scripts/run_infer.sh (92%) rename evaluation/{ => benchmarks}/miniwob/Dockerfile (100%) rename evaluation/{ => benchmarks}/miniwob/README.md (79%) rename evaluation/{ => benchmarks}/miniwob/get_avg_reward.py (100%) rename evaluation/{ => benchmarks}/miniwob/run_infer.py (100%) rename evaluation/{ => benchmarks}/miniwob/scripts/run_infer.sh (86%) rename evaluation/{ => benchmarks}/mint/.gitignore (100%) rename evaluation/{ => benchmarks}/mint/Dockerfile (100%) rename evaluation/{ => benchmarks}/mint/README.md (85%) rename evaluation/{ => benchmarks}/mint/config_variables.py (100%) rename evaluation/{ => benchmarks}/mint/datatypes.py (100%) rename evaluation/{ => benchmarks}/mint/env.py (100%) rename evaluation/{ => benchmarks}/mint/prompts/__init__.py (100%) rename evaluation/{ => benchmarks}/mint/prompts/template_with_tool.txt (100%) rename evaluation/{ => benchmarks}/mint/requirements.txt (100%) rename evaluation/{ => benchmarks}/mint/run_infer.py (97%) rename evaluation/{ => benchmarks}/mint/scripts/run_infer.sh (100%) rename evaluation/{ => benchmarks}/mint/tasks/__init__.py (50%) rename evaluation/{ => benchmarks}/mint/tasks/base.py (100%) rename evaluation/{ => benchmarks}/mint/tasks/codegen.py (98%) rename evaluation/{ => benchmarks}/mint/tasks/in_context_examples/humaneval/with_tool.txt (100%) rename evaluation/{ => benchmarks}/mint/tasks/in_context_examples/mbpp/with_tool.txt (100%) rename evaluation/{ => benchmarks}/mint/tasks/in_context_examples/reasoning/with_tool.txt (100%) rename evaluation/{ => benchmarks}/mint/tasks/reasoning.py (100%) rename evaluation/{ => benchmarks}/mint/utils.py (100%) rename evaluation/{ => benchmarks}/ml_bench/README.md (88%) rename evaluation/{ => benchmarks}/ml_bench/__init__.py (100%) rename evaluation/{ => benchmarks}/ml_bench/run_analysis.py (100%) rename evaluation/{ => benchmarks}/ml_bench/run_infer.py (100%) rename evaluation/{ => 
benchmarks}/ml_bench/scripts/cleanup.sh (100%) rename evaluation/{ => benchmarks}/ml_bench/scripts/run_analysis.sh (84%) rename evaluation/{ => benchmarks}/ml_bench/scripts/run_infer.sh (93%) rename evaluation/{ => benchmarks}/ml_bench/scripts/summarise_results.py (100%) rename evaluation/{ => benchmarks}/scienceagentbench/Dockerfile (100%) rename evaluation/{ => benchmarks}/scienceagentbench/Dockerfile.evaluator (100%) rename evaluation/{ => benchmarks}/scienceagentbench/README.md (90%) rename evaluation/{ => benchmarks}/scienceagentbench/post_proc.py (100%) rename evaluation/{ => benchmarks}/scienceagentbench/run_infer.py (100%) rename evaluation/{ => benchmarks}/scienceagentbench/scripts/run_infer.sh (92%) rename evaluation/{ => benchmarks}/swe_bench/README.md (80%) rename evaluation/{ => benchmarks}/swe_bench/__init__.py (100%) rename evaluation/{ => benchmarks}/swe_bench/eval_infer.py (99%) rename evaluation/{ => benchmarks}/swe_bench/examples/example_agent_output.jsonl (100%) rename evaluation/{ => benchmarks}/swe_bench/examples/example_model_output.json (100%) rename evaluation/{ => benchmarks}/swe_bench/prompt.py (100%) rename evaluation/{ => benchmarks}/swe_bench/run_infer.py (99%) rename evaluation/{ => benchmarks}/swe_bench/scripts/cleanup_remote_runtime.sh (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/docker/all-swebench-full-instance-images.txt (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/docker/all-swebench-lite-instance-images.txt (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/docker/pull_all_eval_docker.sh (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/docker/push_docker_instance_images.py (96%) rename evaluation/{ => benchmarks}/swe_bench/scripts/docker/push_eval_docker.sh (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval/compare_outputs.py (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh (87%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval/convert_oh_output_to_md.py (97%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py (93%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval/download_gold_patch.py (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval/summarize_outputs.py (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval/update_output_with_eval.py (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval_infer.sh (95%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval_infer_remote.sh (83%) rename evaluation/{ => benchmarks}/swe_bench/scripts/run_infer.sh (97%) rename evaluation/{ => benchmarks}/swe_bench/scripts/setup/compare_patch_filename.py (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/setup/instance_swe_entry.sh (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/setup/prepare_swe_utils.sh (93%) rename evaluation/{ => benchmarks}/swe_bench/scripts/setup/swe_entry.sh (100%) rename evaluation/{ => benchmarks}/toolqa/Dockerfile (100%) rename evaluation/{ => benchmarks}/toolqa/README.md (88%) rename evaluation/{ => benchmarks}/toolqa/run_infer.py (98%) rename evaluation/{ => benchmarks}/toolqa/scripts/run_infer.sh (95%) rename evaluation/{ => benchmarks}/toolqa/utils.py (100%) rename evaluation/{ => benchmarks}/webarena/README.md (91%) rename evaluation/{ => benchmarks}/webarena/__init__.py (100%) rename evaluation/{ => benchmarks}/webarena/get_success_rate.py (100%) rename evaluation/{ => benchmarks}/webarena/run_infer.py 
(100%) rename evaluation/{ => benchmarks}/webarena/scripts/run_infer.sh (87%) diff --git a/.github/workflows/eval-runner.yml b/.github/workflows/eval-runner.yml index 6f1c225efe..9b2576a264 100644 --- a/.github/workflows/eval-runner.yml +++ b/.github/workflows/eval-runner.yml @@ -84,12 +84,12 @@ jobs: EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images run: | - poetry run ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test + poetry run ./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test OUTPUT_FOLDER=$(find evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Lite-test/CodeActAgent -name "deepseek-chat_maxiter_50_N_*-no-hint-run_1" -type d | head -n 1) echo "OUTPUT_FOLDER for SWE-bench evaluation: $OUTPUT_FOLDER" - poetry run ./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FOLDER/output.jsonl $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test + poetry run ./evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FOLDER/output.jsonl $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test - poetry run ./evaluation/swe_bench/scripts/eval/summarize_outputs.py $OUTPUT_FOLDER/output.jsonl > summarize_outputs.log 2>&1 + poetry run ./evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py $OUTPUT_FOLDER/output.jsonl > summarize_outputs.log 2>&1 echo "SWEBENCH_REPORT<<EOF" >> $GITHUB_ENV cat summarize_outputs.log >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV diff --git a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md index 3f19105399..b215a1ca51 100644 --- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md +++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md @@ -76,7 +76,7 @@ La fonction `run_controller()` est le cœur de l'exécution d'OpenHands. Elle g ## Le moyen le plus simple de commencer : Explorer les benchmarks existants -Nous vous encourageons à examiner les différents benchmarks d'évaluation disponibles dans le [répertoire `evaluation/`](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation) de notre dépôt. +Nous vous encourageons à examiner les différents benchmarks d'évaluation disponibles dans le [répertoire `evaluation/benchmarks/`](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation/benchmarks) de notre dépôt. Pour intégrer votre propre benchmark, nous vous suggérons de commencer par celui qui ressemble le plus à vos besoins. Cette approche peut considérablement rationaliser votre processus d'intégration, vous permettant de vous appuyer sur les structures existantes et de les adapter à vos exigences spécifiques.
diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md index eb99a30ea3..dc41e0fa1c 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md @@ -73,7 +73,7 @@ OpenHands 的主要入口点在 `openhands/core/main.py` 中。以下是它工 ## 入门最简单的方法:探索现有基准 -我们鼓励您查看我们仓库的 [`evaluation/` 目录](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation)中提供的各种评估基准。 +我们鼓励您查看我们仓库的 [`evaluation/benchmarks/` 目录](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation/benchmarks)中提供的各种评估基准。 要集成您自己的基准,我们建议从最接近您需求的基准开始。这种方法可以显著简化您的集成过程,允许您在现有结构的基础上进行构建并使其适应您的特定要求。 diff --git a/docs/modules/usage/how-to/evaluation-harness.md b/docs/modules/usage/how-to/evaluation-harness.md index e4d1e5d15b..339783ea8d 100644 --- a/docs/modules/usage/how-to/evaluation-harness.md +++ b/docs/modules/usage/how-to/evaluation-harness.md @@ -73,7 +73,7 @@ The `run_controller()` function is the core of OpenHands's execution. It manages ## Easiest way to get started: Exploring Existing Benchmarks -We encourage you to review the various evaluation benchmarks available in the [`evaluation/` directory](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation) of our repository. +We encourage you to review the various evaluation benchmarks available in the [`evaluation/benchmarks/` directory](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation/benchmarks) of our repository. To integrate your own benchmark, we suggest starting with the one that most closely resembles your needs. This approach can significantly streamline your integration process, allowing you to build upon existing structures and adapt them to your specific requirements. 
diff --git a/evaluation/README.md b/evaluation/README.md index 8be0822875..8ef9bcce65 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -46,28 +46,32 @@ The OpenHands evaluation harness supports a wide variety of benchmarks across so ### Software Engineering -- SWE-Bench: [`evaluation/swe_bench`](./swe_bench) -- HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix) -- BIRD: [`evaluation/bird`](./bird) -- BioCoder: [`evaluation/ml_bench`](./ml_bench) -- ML-Bench: [`evaluation/ml_bench`](./ml_bench) -- APIBench: [`evaluation/gorilla`](./gorilla/) -- ToolQA: [`evaluation/toolqa`](./toolqa/) -- AiderBench: [`evaluation/aider_bench`](./aider_bench/) +- SWE-Bench: [`evaluation/benchmarks/swe_bench`](./benchmarks/swe_bench) +- HumanEvalFix: [`evaluation/benchmarks/humanevalfix`](./benchmarks/humanevalfix) +- BIRD: [`evaluation/benchmarks/bird`](./benchmarks/bird) +- BioCoder: [`evaluation/benchmarks/biocoder`](./benchmarks/biocoder) +- ML-Bench: [`evaluation/benchmarks/ml_bench`](./benchmarks/ml_bench) +- APIBench: [`evaluation/benchmarks/gorilla`](./benchmarks/gorilla/) +- ToolQA: [`evaluation/benchmarks/toolqa`](./benchmarks/toolqa/) +- AiderBench: [`evaluation/benchmarks/aider_bench`](./benchmarks/aider_bench/) +- Commit0: [`evaluation/benchmarks/commit0_bench`](./benchmarks/commit0_bench/) +- DiscoveryBench: [`evaluation/benchmarks/discoverybench`](./benchmarks/discoverybench/) ### Web Browsing -- WebArena: [`evaluation/webarena`](./webarena/) -- MiniWob++: [`evaluation/miniwob`](./miniwob/) +- WebArena: [`evaluation/benchmarks/webarena`](./benchmarks/webarena/) +- MiniWob++: [`evaluation/benchmarks/miniwob`](./benchmarks/miniwob/) +- Browsing Delegation: [`evaluation/benchmarks/browsing_delegation`](./benchmarks/browsing_delegation/) ### Misc. Assistance -- GAIA: [`evaluation/gaia`](./gaia) -- GPQA: [`evaluation/gpqa`](./gpqa) -- AgentBench: [`evaluation/agent_bench`](./agent_bench) -- MINT: [`evaluation/mint`](./mint) -- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA) -- ProofWriter: [`evaluation/logic_reasoning`](./logic_reasoning) +- GAIA: [`evaluation/benchmarks/gaia`](./benchmarks/gaia) +- GPQA: [`evaluation/benchmarks/gpqa`](./benchmarks/gpqa) +- AgentBench: [`evaluation/benchmarks/agent_bench`](./benchmarks/agent_bench) +- MINT: [`evaluation/benchmarks/mint`](./benchmarks/mint) +- Entity deduction Arena (EDA): [`evaluation/benchmarks/EDA`](./benchmarks/EDA) +- ProofWriter: [`evaluation/benchmarks/logic_reasoning`](./benchmarks/logic_reasoning) +- ScienceAgentBench: [`evaluation/benchmarks/scienceagentbench`](./benchmarks/scienceagentbench) ## Result Visualization @@ -79,7 +83,7 @@ You can start your own fork of [our huggingface evaluation outputs](https://hugg To learn more about how to integrate your benchmark into OpenHands, check out [tutorial here](https://docs.all-hands.dev/modules/usage/how-to/evaluation-harness). Briefly, -- Each subfolder contains a specific benchmark or experiment. For example, `evaluation/swe_bench` should contain +- Each subfolder contains a specific benchmark or experiment. For example, `evaluation/benchmarks/swe_bench` should contain all the preprocessing/evaluation/analysis scripts. - Raw data and experimental records should not be stored within this repo. - For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization.
diff --git a/evaluation/EDA/README.md b/evaluation/benchmarks/EDA/README.md similarity index 88% rename from evaluation/EDA/README.md rename to evaluation/benchmarks/EDA/README.md index a8a2e4fbf0..fee875c5dd 100644 --- a/evaluation/EDA/README.md +++ b/evaluation/benchmarks/EDA/README.md @@ -12,7 +12,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ```bash export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate another party of conversation) -./evaluation/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit] +./evaluation/benchmarks/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit] ``` where `model_config` is mandatory, while `git-version`, `agent`, `dataset` and `eval_limit` are optional. @@ -33,7 +33,7 @@ to `CodeActAgent`. For example, ```bash -./evaluation/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent things +./evaluation/benchmarks/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent things ``` ## Reference diff --git a/evaluation/EDA/game.py b/evaluation/benchmarks/EDA/game.py similarity index 100% rename from evaluation/EDA/game.py rename to evaluation/benchmarks/EDA/game.py diff --git a/evaluation/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py similarity index 99% rename from evaluation/EDA/run_infer.py rename to evaluation/benchmarks/EDA/run_infer.py index 2549207392..cce795e954 100644 --- a/evaluation/EDA/run_infer.py +++ b/evaluation/benchmarks/EDA/run_infer.py @@ -4,7 +4,7 @@ import os import pandas as pd from datasets import load_dataset -from evaluation.EDA.game import Q20Game, Q20GameCelebrity +from evaluation.benchmarks.EDA.game import Q20Game, Q20GameCelebrity from evaluation.utils.shared import ( EvalMetadata, EvalOutput, diff --git a/evaluation/EDA/scripts/run_infer.sh b/evaluation/benchmarks/EDA/scripts/run_infer.sh similarity index 95% rename from evaluation/EDA/scripts/run_infer.sh rename to evaluation/benchmarks/EDA/scripts/run_infer.sh index afa9eaa7b2..a803073f73 100755 --- a/evaluation/EDA/scripts/run_infer.sh +++ b/evaluation/benchmarks/EDA/scripts/run_infer.sh @@ -43,7 +43,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "DATASET: $DATASET" -COMMAND="poetry run python evaluation/EDA/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --dataset $DATASET \ diff --git a/evaluation/agent_bench/README.md b/evaluation/benchmarks/agent_bench/README.md similarity index 80% rename from evaluation/agent_bench/README.md rename to evaluation/benchmarks/agent_bench/README.md index 1133a09a5c..e8a1e3dc95 100644 --- a/evaluation/agent_bench/README.md +++ b/evaluation/benchmarks/agent_bench/README.md @@ -9,7 +9,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Start the evaluation ```bash -./evaluation/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] +./evaluation/benchmarks/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] ``` - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your @@ -25,7 +25,7 @@ in order to use `eval_limit`, you must also set `agent`. Following is the basic command to start the evaluation. -You can update the arguments in the script `evaluation/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on. 
+You can update the arguments in the script `evaluation/benchmarks/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on. - `--agent-cls`, the agent to use. For example, `CodeActAgent`. - `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`. @@ -34,5 +34,5 @@ You can update the arguments in the script `evaluation/agent_bench/scripts/run_i - `--eval-n-limit`: the number of examples to evaluate. For example, `100`. ```bash -./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1 +./evaluation/benchmarks/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1 ``` diff --git a/evaluation/agent_bench/__init__.py b/evaluation/benchmarks/agent_bench/__init__.py similarity index 100% rename from evaluation/agent_bench/__init__.py rename to evaluation/benchmarks/agent_bench/__init__.py diff --git a/evaluation/agent_bench/helper.py b/evaluation/benchmarks/agent_bench/helper.py similarity index 100% rename from evaluation/agent_bench/helper.py rename to evaluation/benchmarks/agent_bench/helper.py diff --git a/evaluation/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py similarity index 99% rename from evaluation/agent_bench/run_infer.py rename to evaluation/benchmarks/agent_bench/run_infer.py index acdf60fe48..693718357a 100644 --- a/evaluation/agent_bench/run_infer.py +++ b/evaluation/benchmarks/agent_bench/run_infer.py @@ -7,7 +7,7 @@ from typing import Any import pandas as pd from datasets import load_dataset -from evaluation.agent_bench.helper import ( +from evaluation.benchmarks.agent_bench.helper import ( FAKE_RESPONSES, INST_SUFFIXES, compare_results, diff --git a/evaluation/agent_bench/scripts/run_infer.sh b/evaluation/benchmarks/agent_bench/scripts/run_infer.sh similarity index 84% rename from evaluation/agent_bench/scripts/run_infer.sh rename to evaluation/benchmarks/agent_bench/scripts/run_infer.sh index 713e420d53..16e98b074b 100755 --- a/evaluation/agent_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/agent_bench/scripts/run_infer.sh @@ -26,7 +26,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="export PYTHONPATH=evaluation/agent_bench:\$PYTHONPATH && poetry run python evaluation/agent_bench/run_infer.py \ +COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/agent_bench/scripts/summarise_results.py b/evaluation/benchmarks/agent_bench/scripts/summarise_results.py similarity index 100% rename from evaluation/agent_bench/scripts/summarise_results.py rename to evaluation/benchmarks/agent_bench/scripts/summarise_results.py diff --git a/evaluation/aider_bench/README.md b/evaluation/benchmarks/aider_bench/README.md similarity index 80% rename from evaluation/aider_bench/README.md rename to evaluation/benchmarks/aider_bench/README.md index 07b782a256..965fc06d7e 100644 --- a/evaluation/aider_bench/README.md +++ b/evaluation/benchmarks/aider_bench/README.md @@ -16,7 +16,7 @@ development environment and LLM. 
## Start the evaluation ```bash -./evaluation/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] +./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] ``` - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for @@ -42,7 +42,7 @@ export SKIP_NUM=12 # skip the first 12 instances from the dataset Following is the basic command to start the evaluation. You can update the arguments in the script -`evaluation/aider_bench/scripts/run_infer.sh`, such as `--max-iterations`, +`evaluation/benchmarks/aider_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on: - `--agent-cls`, the agent to use. For example, `CodeActAgent`. @@ -53,7 +53,7 @@ You can update the arguments in the script - `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`. ```bash -./evaluation/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" +./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -61,25 +61,25 @@ You can update the arguments in the script This is in limited beta. Contact Xingyao over slack if you want to try this out! ```bash -./evaluation/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] +./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] # Example - This runs evaluation on CodeActAgent for 133 instances on aider_bench test set, with 2 workers running in parallel export ALLHANDS_API_KEY="YOUR-API-KEY" export RUNTIME=remote export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" -./evaluation/aider_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 133 2 +./evaluation/benchmarks/aider_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 133 2 ``` ## Summarize Results ```bash -poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] +poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] ``` Full example: ```bash -poetry run python ./evaluation/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl +poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl ``` This will list the instances that passed and the instances that failed. 
For each diff --git a/evaluation/aider_bench/create_dataset.py b/evaluation/benchmarks/aider_bench/create_dataset.py similarity index 100% rename from evaluation/aider_bench/create_dataset.py rename to evaluation/benchmarks/aider_bench/create_dataset.py diff --git a/evaluation/aider_bench/helper.py b/evaluation/benchmarks/aider_bench/helper.py similarity index 100% rename from evaluation/aider_bench/helper.py rename to evaluation/benchmarks/aider_bench/helper.py diff --git a/evaluation/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py similarity index 99% rename from evaluation/aider_bench/run_infer.py rename to evaluation/benchmarks/aider_bench/run_infer.py index c6e5bbb9db..f7796c7696 100644 --- a/evaluation/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -7,7 +7,7 @@ from typing import Any import pandas as pd from datasets import load_dataset -from evaluation.aider_bench.helper import ( +from evaluation.benchmarks.aider_bench.helper import ( FAKE_RESPONSES, INST_SUFFIXES, INSTRUCTIONS_ADDENDUM, diff --git a/evaluation/aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh similarity index 89% rename from evaluation/aider_bench/scripts/run_infer.sh rename to evaluation/benchmarks/aider_bench/scripts/run_infer.sh index 0afc060f36..0b3824ceae 100755 --- a/evaluation/aider_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh @@ -39,7 +39,7 @@ if [ "$USE_UNIT_TESTS" = true ]; then EVAL_NOTE=$EVAL_NOTE-w-test fi -COMMAND="export PYTHONPATH=evaluation/aider_bench:\$PYTHONPATH && poetry run python evaluation/aider_bench/run_infer.py \ +COMMAND="export PYTHONPATH=evaluation/benchmarks/aider_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/aider_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/aider_bench/scripts/summarize_results.py b/evaluation/benchmarks/aider_bench/scripts/summarize_results.py similarity index 100% rename from evaluation/aider_bench/scripts/summarize_results.py rename to evaluation/benchmarks/aider_bench/scripts/summarize_results.py diff --git a/evaluation/biocoder/README.md b/evaluation/benchmarks/biocoder/README.md similarity index 92% rename from evaluation/biocoder/README.md rename to evaluation/benchmarks/biocoder/README.md index ad9cf55f53..035f2d20bf 100644 --- a/evaluation/biocoder/README.md +++ b/evaluation/benchmarks/biocoder/README.md @@ -21,7 +21,7 @@ To reproduce this image, please see the Dockerfile_Openopenhands in the `biocode ```bash -./evaluation/biocoder/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] +./evaluation/benchmarks/biocoder/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] ``` where `model_config` is mandatory, while `git-version`, `agent`, `dataset` and `eval_limit` are optional. 
@@ -43,7 +43,7 @@ with current OpenHands version, then your command would be: ## Examples ```bash -./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 HEAD CodeActAgent 1 +./evaluation/benchmarks/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 HEAD CodeActAgent 1 ``` ## Reference diff --git a/evaluation/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py similarity index 99% rename from evaluation/biocoder/run_infer.py rename to evaluation/benchmarks/biocoder/run_infer.py index 68bbf892d5..f5cdd44471 100644 --- a/evaluation/biocoder/run_infer.py +++ b/evaluation/benchmarks/biocoder/run_infer.py @@ -8,7 +8,7 @@ from typing import Any import pandas as pd from datasets import load_dataset -from evaluation.biocoder.utils import BiocoderData +from evaluation.benchmarks.biocoder.utils import BiocoderData from evaluation.utils.shared import ( EvalMetadata, EvalOutput, diff --git a/evaluation/biocoder/scripts/run_infer.sh b/evaluation/benchmarks/biocoder/scripts/run_infer.sh similarity index 92% rename from evaluation/biocoder/scripts/run_infer.sh rename to evaluation/benchmarks/biocoder/scripts/run_infer.sh index b2ae17f98c..61fddb6211 100755 --- a/evaluation/biocoder/scripts/run_infer.sh +++ b/evaluation/benchmarks/biocoder/scripts/run_infer.sh @@ -28,7 +28,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "DATASET: $DATASET" -COMMAND="poetry run python evaluation/biocoder/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/biocoder/scripts/setup/copy_changed_code.py b/evaluation/benchmarks/biocoder/scripts/setup/copy_changed_code.py similarity index 100% rename from evaluation/biocoder/scripts/setup/copy_changed_code.py rename to evaluation/benchmarks/biocoder/scripts/setup/copy_changed_code.py diff --git a/evaluation/biocoder/scripts/setup/remove_code.py b/evaluation/benchmarks/biocoder/scripts/setup/remove_code.py similarity index 100% rename from evaluation/biocoder/scripts/setup/remove_code.py rename to evaluation/benchmarks/biocoder/scripts/setup/remove_code.py diff --git a/evaluation/biocoder/utils.py b/evaluation/benchmarks/biocoder/utils.py similarity index 100% rename from evaluation/biocoder/utils.py rename to evaluation/benchmarks/biocoder/utils.py diff --git a/evaluation/bird/README.md b/evaluation/benchmarks/bird/README.md similarity index 99% rename from evaluation/bird/README.md rename to evaluation/benchmarks/bird/README.md index 8f63423527..90e3fa300c 100644 --- a/evaluation/bird/README.md +++ b/evaluation/benchmarks/bird/README.md @@ -9,7 +9,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Run Inference on Bird ```bash -./evaluation/bird/scripts/run_infer.sh [model_config] [git-version] +./evaluation/benchmarks/bird/scripts/run_infer.sh [model_config] [git-version] ``` - `model_config`, e.g. 
`eval_gpt4_1106_preview`, is the config group name for your diff --git a/evaluation/bird/__init__.py b/evaluation/benchmarks/bird/__init__.py similarity index 100% rename from evaluation/bird/__init__.py rename to evaluation/benchmarks/bird/__init__.py diff --git a/evaluation/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py similarity index 100% rename from evaluation/bird/run_infer.py rename to evaluation/benchmarks/bird/run_infer.py diff --git a/evaluation/bird/scripts/run_infer.sh b/evaluation/benchmarks/bird/scripts/run_infer.sh similarity index 92% rename from evaluation/bird/scripts/run_infer.sh rename to evaluation/benchmarks/bird/scripts/run_infer.sh index b2e2c64c42..bf69d9d50b 100755 --- a/evaluation/bird/scripts/run_infer.sh +++ b/evaluation/benchmarks/bird/scripts/run_infer.sh @@ -26,7 +26,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/bird/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 5 \ diff --git a/evaluation/browsing_delegation/README.md b/evaluation/benchmarks/browsing_delegation/README.md similarity index 91% rename from evaluation/browsing_delegation/README.md rename to evaluation/benchmarks/browsing_delegation/README.md index 92e9410971..a06170f8b9 100644 --- a/evaluation/browsing_delegation/README.md +++ b/evaluation/benchmarks/browsing_delegation/README.md @@ -12,7 +12,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Run Inference ```bash -./evaluation/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] +./evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] # e.g., ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview_llm HEAD CodeActAgent 300 ``` diff --git a/evaluation/browsing_delegation/run_infer.py b/evaluation/benchmarks/browsing_delegation/run_infer.py similarity index 100% rename from evaluation/browsing_delegation/run_infer.py rename to evaluation/benchmarks/browsing_delegation/run_infer.py diff --git a/evaluation/browsing_delegation/scripts/run_infer.sh b/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh similarity index 90% rename from evaluation/browsing_delegation/scripts/run_infer.sh rename to evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh index c33768dc01..30607ca333 100755 --- a/evaluation/browsing_delegation/scripts/run_infer.sh +++ b/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh @@ -28,7 +28,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG" EVAL_NOTE="$AGENT_VERSION" -COMMAND="poetry run python evaluation/browsing_delegation/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 1 \ diff --git a/evaluation/commit0_bench/README.md b/evaluation/benchmarks/commit0_bench/README.md similarity index 81% rename from evaluation/commit0_bench/README.md rename to evaluation/benchmarks/commit0_bench/README.md index fdfd5812a8..78b58b0213 100644 --- a/evaluation/commit0_bench/README.md +++ b/evaluation/benchmarks/commit0_bench/README.md @@ -24,10 +24,10 @@ Make sure your Docker daemon is running, and you have ample disk space (at least When the `run_infer.sh` script is started, it will automatically pull the `lite` split in Commit0. 
For example, for instance ID `commit-0/minitorch`, it will try to pull our pre-build docker image `wentingzhao/minitorch` from DockerHub. This image will be used create an OpenHands runtime image where the agent will operate on. ```bash -./evaluation/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example -./evaluation/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 16 100 8 wentingzhao/commit0_combined test +./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 16 100 8 wentingzhao/commit0_combined test ``` where `model_config` is mandatory, and the rest are optional. @@ -56,7 +56,7 @@ Let's say you'd like to run 10 instances using `llm.eval_sonnet` and CodeActAgen then your command would be: ```bash -./evaluation/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test +./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -64,17 +64,17 @@ then your command would be: This is in limited beta. Contact Xingyao over slack if you want to try this out! ```bash -./evaluation/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example - This runs evaluation on CodeActAgent for 10 instances on "wentingzhao/commit0_combined"'s test set, with max 30 iteration per instances, with 1 number of workers running in parallel ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="docker.io/wentingzhao" \ -./evaluation/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test +./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test ``` To clean-up all existing runtime you've already started, run: ```bash -ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/commit0_bench/scripts/cleanup_remote_runtime.sh +ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/commit0_bench/scripts/cleanup_remote_runtime.sh ``` ### Specify a subset of tasks to run infer diff --git a/evaluation/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py similarity index 100% rename from evaluation/commit0_bench/run_infer.py rename to evaluation/benchmarks/commit0_bench/run_infer.py diff --git a/evaluation/commit0_bench/scripts/cleanup_remote_runtime.sh b/evaluation/benchmarks/commit0_bench/scripts/cleanup_remote_runtime.sh similarity index 100% rename from evaluation/commit0_bench/scripts/cleanup_remote_runtime.sh rename to evaluation/benchmarks/commit0_bench/scripts/cleanup_remote_runtime.sh diff --git a/evaluation/commit0_bench/scripts/run_infer.sh b/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh similarity index 97% rename from 
evaluation/commit0_bench/scripts/run_infer.sh rename to evaluation/benchmarks/commit0_bench/scripts/run_infer.sh index d362a09667..227a5ff05e 100755 --- a/evaluation/commit0_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh @@ -91,7 +91,7 @@ fi function run_eval() { local eval_note=$1 - COMMAND="poetry run python evaluation/commit0_bench/run_infer.py \ + COMMAND="poetry run python evaluation/benchmarks/commit0_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations $MAX_ITER \ diff --git a/evaluation/discoverybench/README.md b/evaluation/benchmarks/discoverybench/README.md similarity index 90% rename from evaluation/discoverybench/README.md rename to evaluation/benchmarks/discoverybench/README.md index a0d8994709..daf5cc34bb 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/benchmarks/discoverybench/README.md @@ -16,7 +16,7 @@ 2. Execute the bash script to start DiscoveryBench Evaluation ``` -./evaluation/discoverybench/scripts/run_infer.sh [YOUR MODEL CONFIG] +./evaluation/benchmarks/discoverybench/scripts/run_infer.sh [YOUR MODEL CONFIG] ``` Replace `[YOUR MODEL CONFIG]` with any model the model that you have set up in `config.toml` @@ -27,7 +27,7 @@ When the `run_infer.sh` script is started, it will automatically pull the latest ``` -./evaluation/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] +./evaluation/benchmarks/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] ``` - `MODEL_CONFIG`: Name of the model you want to evaluate with diff --git a/evaluation/discoverybench/eval_utils/README.md b/evaluation/benchmarks/discoverybench/eval_utils/README.md similarity index 100% rename from evaluation/discoverybench/eval_utils/README.md rename to evaluation/benchmarks/discoverybench/eval_utils/README.md diff --git a/evaluation/discoverybench/eval_utils/__init__.py b/evaluation/benchmarks/discoverybench/eval_utils/__init__.py similarity index 100% rename from evaluation/discoverybench/eval_utils/__init__.py rename to evaluation/benchmarks/discoverybench/eval_utils/__init__.py diff --git a/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py b/evaluation/benchmarks/discoverybench/eval_utils/eval_w_subhypo_gen.py similarity index 100% rename from evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py rename to evaluation/benchmarks/discoverybench/eval_utils/eval_w_subhypo_gen.py diff --git a/evaluation/discoverybench/eval_utils/lm_utils.py b/evaluation/benchmarks/discoverybench/eval_utils/lm_utils.py similarity index 100% rename from evaluation/discoverybench/eval_utils/lm_utils.py rename to evaluation/benchmarks/discoverybench/eval_utils/lm_utils.py diff --git a/evaluation/discoverybench/eval_utils/openai_helpers.py b/evaluation/benchmarks/discoverybench/eval_utils/openai_helpers.py similarity index 100% rename from evaluation/discoverybench/eval_utils/openai_helpers.py rename to evaluation/benchmarks/discoverybench/eval_utils/openai_helpers.py diff --git a/evaluation/discoverybench/eval_utils/openai_semantic_gen_prompts.py b/evaluation/benchmarks/discoverybench/eval_utils/openai_semantic_gen_prompts.py similarity index 100% rename from evaluation/discoverybench/eval_utils/openai_semantic_gen_prompts.py rename to evaluation/benchmarks/discoverybench/eval_utils/openai_semantic_gen_prompts.py diff --git a/evaluation/discoverybench/eval_utils/response_parser.py 
b/evaluation/benchmarks/discoverybench/eval_utils/response_parser.py similarity index 100% rename from evaluation/discoverybench/eval_utils/response_parser.py rename to evaluation/benchmarks/discoverybench/eval_utils/response_parser.py diff --git a/evaluation/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py similarity index 99% rename from evaluation/discoverybench/run_infer.py rename to evaluation/benchmarks/discoverybench/run_infer.py index 7cfd2dbac7..6d8dcbd89b 100644 --- a/evaluation/discoverybench/run_infer.py +++ b/evaluation/benchmarks/discoverybench/run_infer.py @@ -5,10 +5,10 @@ import os import git import pandas as pd -from evaluation.discoverybench.eval_utils.eval_w_subhypo_gen import ( +from evaluation.benchmarks.discoverybench.eval_utils.eval_w_subhypo_gen import ( run_eval_gold_vs_gen_NL_hypo_workflow, ) -from evaluation.discoverybench.eval_utils.response_parser import ( +from evaluation.benchmarks.discoverybench.eval_utils.response_parser import ( extract_gen_hypo_from_logs, ) from evaluation.utils.shared import ( diff --git a/evaluation/discoverybench/scripts/run_infer.sh b/evaluation/benchmarks/discoverybench/scripts/run_infer.sh similarity index 91% rename from evaluation/discoverybench/scripts/run_infer.sh rename to evaluation/benchmarks/discoverybench/scripts/run_infer.sh index 8b9fffd7c5..e12b9c1398 100755 --- a/evaluation/discoverybench/scripts/run_infer.sh +++ b/evaluation/benchmarks/discoverybench/scripts/run_infer.sh @@ -29,7 +29,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/discoverybench/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/gaia/README.md b/evaluation/benchmarks/gaia/README.md similarity index 77% rename from evaluation/gaia/README.md rename to evaluation/benchmarks/gaia/README.md index bf1c701328..f592e5f711 100644 --- a/evaluation/gaia/README.md +++ b/evaluation/benchmarks/gaia/README.md @@ -10,11 +10,11 @@ Please follow instruction [here](../README.md#setup) to setup your local develop We are using the GAIA dataset hosted on [Hugging Face](https://huggingface.co/datasets/gaia-benchmark/GAIA). Please accept the terms and make sure to have logged in on your computer by `huggingface-cli login` before running the evaluation. -Following is the basic command to start the evaluation. Here we are evaluating on the validation set for the `2023_all` split. You can adjust `./evaluation/gaia/scripts/run_infer.sh` to change the subset you want to evaluate on. +Following is the basic command to start the evaluation. Here we are evaluating on the validation set for the `2023_all` split. You can adjust `./evaluation/benchmarks/gaia/scripts/run_infer.sh` to change the subset you want to evaluate on. ```bash -./evaluation/gaia/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [gaia_subset] -# e.g., ./evaluation/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 300 +./evaluation/benchmarks/gaia/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [gaia_subset] +# e.g., ./evaluation/benchmarks/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 300 ``` where `model_config` is mandatory, while `git-version`, `agent`, `eval_limit` and `gaia_subset` are optional. @@ -35,13 +35,13 @@ to `CodeActAgent`. 
For example, ```bash -./evaluation/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 10 +./evaluation/benchmarks/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 10 ``` ## Get score Then you can get stats by running the following command: ```bash -python ./evaluation/gaia/get_score.py \ +python ./evaluation/benchmarks/gaia/get_score.py \ --file ``` diff --git a/evaluation/gaia/get_score.py b/evaluation/benchmarks/gaia/get_score.py similarity index 100% rename from evaluation/gaia/get_score.py rename to evaluation/benchmarks/gaia/get_score.py diff --git a/evaluation/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py similarity index 99% rename from evaluation/gaia/run_infer.py rename to evaluation/benchmarks/gaia/run_infer.py index 1fa0c00e6d..fb6d4b3db0 100644 --- a/evaluation/gaia/run_infer.py +++ b/evaluation/benchmarks/gaia/run_infer.py @@ -7,7 +7,7 @@ import huggingface_hub import pandas as pd from datasets import load_dataset -from evaluation.gaia.scorer import question_scorer +from evaluation.benchmarks.gaia.scorer import question_scorer from evaluation.utils.shared import ( EvalMetadata, EvalOutput, diff --git a/evaluation/gaia/scorer.py b/evaluation/benchmarks/gaia/scorer.py similarity index 100% rename from evaluation/gaia/scorer.py rename to evaluation/benchmarks/gaia/scorer.py diff --git a/evaluation/gaia/scripts/run_infer.sh b/evaluation/benchmarks/gaia/scripts/run_infer.sh similarity index 93% rename from evaluation/gaia/scripts/run_infer.sh rename to evaluation/benchmarks/gaia/scripts/run_infer.sh index aedfe01a0c..5ad012d07d 100755 --- a/evaluation/gaia/scripts/run_infer.sh +++ b/evaluation/benchmarks/gaia/scripts/run_infer.sh @@ -35,7 +35,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "LEVELS: $LEVELS" -COMMAND="poetry run python ./evaluation/gaia/run_infer.py \ +COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/gorilla/README.md b/evaluation/benchmarks/gorilla/README.md similarity index 88% rename from evaluation/gorilla/README.md rename to evaluation/benchmarks/gorilla/README.md index 4cd90f554a..c6f1cde55b 100644 --- a/evaluation/gorilla/README.md +++ b/evaluation/benchmarks/gorilla/README.md @@ -11,7 +11,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop Make sure your Docker daemon is running, then run this bash script: ```bash -./evaluation/gorilla/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [hubs] +./evaluation/benchmarks/gorilla/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [hubs] ``` where `model_config` is mandatory, while all other arguments are optional. 
@@ -35,5 +35,5 @@ Note: in order to use `eval_limit`, you must also set `agent`; in order to use ` For example, ```bash -./evaluation/gorilla/scripts/run_infer.sh llm 0.6.2 CodeActAgent 10 th +./evaluation/benchmarks/gorilla/scripts/run_infer.sh llm 0.6.2 CodeActAgent 10 th ``` diff --git a/evaluation/gorilla/ast_eval_hf.py b/evaluation/benchmarks/gorilla/ast_eval_hf.py similarity index 100% rename from evaluation/gorilla/ast_eval_hf.py rename to evaluation/benchmarks/gorilla/ast_eval_hf.py diff --git a/evaluation/gorilla/ast_eval_tf.py b/evaluation/benchmarks/gorilla/ast_eval_tf.py similarity index 100% rename from evaluation/gorilla/ast_eval_tf.py rename to evaluation/benchmarks/gorilla/ast_eval_tf.py diff --git a/evaluation/gorilla/ast_eval_th.py b/evaluation/benchmarks/gorilla/ast_eval_th.py similarity index 100% rename from evaluation/gorilla/ast_eval_th.py rename to evaluation/benchmarks/gorilla/ast_eval_th.py diff --git a/evaluation/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py similarity index 98% rename from evaluation/gorilla/run_infer.py rename to evaluation/benchmarks/gorilla/run_infer.py index aa932a388f..6f5b6c9d43 100644 --- a/evaluation/gorilla/run_infer.py +++ b/evaluation/benchmarks/gorilla/run_infer.py @@ -5,7 +5,7 @@ import os import pandas as pd import requests -from evaluation.gorilla.utils import encode_question, get_data_for_hub +from evaluation.benchmarks.gorilla.utils import encode_question, get_data_for_hub from evaluation.utils.shared import ( EvalMetadata, EvalOutput, diff --git a/evaluation/gorilla/scripts/run_infer.sh b/evaluation/benchmarks/gorilla/scripts/run_infer.sh similarity index 93% rename from evaluation/gorilla/scripts/run_infer.sh rename to evaluation/benchmarks/gorilla/scripts/run_infer.sh index c39bd74bf2..4542444443 100755 --- a/evaluation/gorilla/scripts/run_infer.sh +++ b/evaluation/benchmarks/gorilla/scripts/run_infer.sh @@ -33,7 +33,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "HUBS: $HUBS" -COMMAND="poetry run python evaluation/gorilla/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/gorilla/utils.py b/evaluation/benchmarks/gorilla/utils.py similarity index 100% rename from evaluation/gorilla/utils.py rename to evaluation/benchmarks/gorilla/utils.py diff --git a/evaluation/gpqa/README.md b/evaluation/benchmarks/gpqa/README.md similarity index 94% rename from evaluation/gpqa/README.md rename to evaluation/benchmarks/gpqa/README.md index b96c5913cb..235b9ab9b2 100644 --- a/evaluation/gpqa/README.md +++ b/evaluation/benchmarks/gpqa/README.md @@ -23,7 +23,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop 'gpqa_main', 'gqpa_diamond', 'gpqa_experts', 'gpqa_extended' -- data split options From the root of the OpenHands repo, run the following command: ```bash -./evaluation/gpqa/scripts/run_infer.sh [model_config_name] [git-version] [num_samples_eval] [data_split] [AgentClass] +./evaluation/benchmarks/gpqa/scripts/run_infer.sh [model_config_name] [git-version] [num_samples_eval] [data_split] [AgentClass] ``` You can replace `model_config_name` with any model you set up in `config.toml`. 
diff --git a/evaluation/gpqa/__init__.py b/evaluation/benchmarks/gpqa/__init__.py similarity index 100% rename from evaluation/gpqa/__init__.py rename to evaluation/benchmarks/gpqa/__init__.py diff --git a/evaluation/gpqa/run_infer.py b/evaluation/benchmarks/gpqa/run_infer.py similarity index 100% rename from evaluation/gpqa/run_infer.py rename to evaluation/benchmarks/gpqa/run_infer.py diff --git a/evaluation/gpqa/scripts/run_infer.sh b/evaluation/benchmarks/gpqa/scripts/run_infer.sh similarity index 93% rename from evaluation/gpqa/scripts/run_infer.sh rename to evaluation/benchmarks/gpqa/scripts/run_infer.sh index b45435e631..ec5a61dbbb 100755 --- a/evaluation/gpqa/scripts/run_infer.sh +++ b/evaluation/benchmarks/gpqa/scripts/run_infer.sh @@ -33,7 +33,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/gpqa/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/humanevalfix/README.md b/evaluation/benchmarks/humanevalfix/README.md similarity index 99% rename from evaluation/humanevalfix/README.md rename to evaluation/benchmarks/humanevalfix/README.md index b887f57ac9..5f3ae58ee2 100644 --- a/evaluation/humanevalfix/README.md +++ b/evaluation/benchmarks/humanevalfix/README.md @@ -9,7 +9,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Run Inference on HumanEvalFix ```bash -./evaluation/humanevalfix/scripts/run_infer.sh eval_gpt4_1106_preview +./evaluation/benchmarks/humanevalfix/scripts/run_infer.sh eval_gpt4_1106_preview ``` You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`. 
diff --git a/evaluation/humanevalfix/__init__.py b/evaluation/benchmarks/humanevalfix/__init__.py similarity index 100% rename from evaluation/humanevalfix/__init__.py rename to evaluation/benchmarks/humanevalfix/__init__.py diff --git a/evaluation/humanevalfix/run_infer.py b/evaluation/benchmarks/humanevalfix/run_infer.py similarity index 100% rename from evaluation/humanevalfix/run_infer.py rename to evaluation/benchmarks/humanevalfix/run_infer.py diff --git a/evaluation/humanevalfix/scripts/run_infer.sh b/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh similarity index 97% rename from evaluation/humanevalfix/scripts/run_infer.sh rename to evaluation/benchmarks/humanevalfix/scripts/run_infer.sh index f63e13d16a..b0b30628eb 100755 --- a/evaluation/humanevalfix/scripts/run_infer.sh +++ b/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh @@ -64,7 +64,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/humanevalfix/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/logic_reasoning/.cache_program/facts.kfb b/evaluation/benchmarks/logic_reasoning/.cache_program/facts.kfb similarity index 100% rename from evaluation/logic_reasoning/.cache_program/facts.kfb rename to evaluation/benchmarks/logic_reasoning/.cache_program/facts.kfb diff --git a/evaluation/logic_reasoning/.cache_program/rules.krb b/evaluation/benchmarks/logic_reasoning/.cache_program/rules.krb similarity index 100% rename from evaluation/logic_reasoning/.cache_program/rules.krb rename to evaluation/benchmarks/logic_reasoning/.cache_program/rules.krb diff --git a/evaluation/logic_reasoning/Dockerfile b/evaluation/benchmarks/logic_reasoning/Dockerfile similarity index 100% rename from evaluation/logic_reasoning/Dockerfile rename to evaluation/benchmarks/logic_reasoning/Dockerfile diff --git a/evaluation/logic_reasoning/README.md b/evaluation/benchmarks/logic_reasoning/README.md similarity index 83% rename from evaluation/logic_reasoning/README.md rename to evaluation/benchmarks/logic_reasoning/README.md index 79faae4fe0..d4e4d3e9a5 100644 --- a/evaluation/logic_reasoning/README.md +++ b/evaluation/benchmarks/logic_reasoning/README.md @@ -10,5 +10,5 @@ Please follow instruction [here](../README.md#setup) to setup your local develop The following code will run inference on the first example of the ProofWriter dataset, ```bash -./evaluation/logic_reasoning/scripts/run_infer.sh eval_gpt4_1106_preview_llm ProofWriter +./evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh eval_gpt4_1106_preview_llm ProofWriter ``` diff --git a/evaluation/logic_reasoning/__init__.py b/evaluation/benchmarks/logic_reasoning/__init__.py similarity index 100% rename from evaluation/logic_reasoning/__init__.py rename to evaluation/benchmarks/logic_reasoning/__init__.py diff --git a/evaluation/logic_reasoning/instruction.txt b/evaluation/benchmarks/logic_reasoning/instruction.txt similarity index 100% rename from evaluation/logic_reasoning/instruction.txt rename to evaluation/benchmarks/logic_reasoning/instruction.txt diff --git a/evaluation/logic_reasoning/logic_inference.py b/evaluation/benchmarks/logic_reasoning/logic_inference.py similarity index 100% rename from evaluation/logic_reasoning/logic_inference.py rename to evaluation/benchmarks/logic_reasoning/logic_inference.py diff --git 
a/evaluation/logic_reasoning/run_infer.py b/evaluation/benchmarks/logic_reasoning/run_infer.py similarity index 100% rename from evaluation/logic_reasoning/run_infer.py rename to evaluation/benchmarks/logic_reasoning/run_infer.py diff --git a/evaluation/logic_reasoning/scripts/run_infer.sh b/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh similarity index 92% rename from evaluation/logic_reasoning/scripts/run_infer.sh rename to evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh index 4c064c102c..40c244d18b 100755 --- a/evaluation/logic_reasoning/scripts/run_infer.sh +++ b/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh @@ -34,7 +34,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/logic_reasoning/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --dataset $DATASET \ diff --git a/evaluation/miniwob/Dockerfile b/evaluation/benchmarks/miniwob/Dockerfile similarity index 100% rename from evaluation/miniwob/Dockerfile rename to evaluation/benchmarks/miniwob/Dockerfile diff --git a/evaluation/miniwob/README.md b/evaluation/benchmarks/miniwob/README.md similarity index 79% rename from evaluation/miniwob/README.md rename to evaluation/benchmarks/miniwob/README.md index a462232649..5535e45a7d 100644 --- a/evaluation/miniwob/README.md +++ b/evaluation/benchmarks/miniwob/README.md @@ -13,7 +13,7 @@ Access with browser the above MiniWoB URLs and see if they load correctly. ## Run Evaluation ```sh -./evaluation/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval +./evaluation/benchmarks/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -21,13 +21,13 @@ Access with browser the above MiniWoB URLs and see if they load correctly. This is in limited beta. Contact Xingyao over slack if you want to try this out! 
```bash -./evaluation/miniwob/scripts/run_infer.sh [model_config] [git-version] [agent] [note] [eval_limit] [num_workers] +./evaluation/benchmarks/miniwob/scripts/run_infer.sh [model_config] [git-version] [agent] [note] [eval_limit] [num_workers] # Example - This runs evaluation on BrowsingAgent for 125 instances on miniwob, with 2 workers running in parallel export ALLHANDS_API_KEY="YOUR-API-KEY" export RUNTIME=remote export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" -./evaluation/miniwob/scripts/run_infer.sh llm.eval HEAD BrowsingAgent "" 125 2 +./evaluation/benchmarks/miniwob/scripts/run_infer.sh llm.eval HEAD BrowsingAgent "" 125 2 ``` Results will be in `evaluation/evaluation_outputs/outputs/miniwob/` @@ -35,7 +35,7 @@ Results will be in `evaluation/evaluation_outputs/outputs/miniwob/` To calculate the average reward, run: ```sh -poetry run python evaluation/miniwob/get_success_rate.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl +poetry run python evaluation/benchmarks/miniwob/get_success_rate.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl ``` ## Submit your evaluation results diff --git a/evaluation/miniwob/get_avg_reward.py b/evaluation/benchmarks/miniwob/get_avg_reward.py similarity index 100% rename from evaluation/miniwob/get_avg_reward.py rename to evaluation/benchmarks/miniwob/get_avg_reward.py diff --git a/evaluation/miniwob/run_infer.py b/evaluation/benchmarks/miniwob/run_infer.py similarity index 100% rename from evaluation/miniwob/run_infer.py rename to evaluation/benchmarks/miniwob/run_infer.py diff --git a/evaluation/miniwob/scripts/run_infer.sh b/evaluation/benchmarks/miniwob/scripts/run_infer.sh similarity index 86% rename from evaluation/miniwob/scripts/run_infer.sh rename to evaluation/benchmarks/miniwob/scripts/run_infer.sh index ece7cafbe7..8f997e29c3 100755 --- a/evaluation/miniwob/scripts/run_infer.sh +++ b/evaluation/benchmarks/miniwob/scripts/run_infer.sh @@ -33,7 +33,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG" EVAL_NOTE="${AGENT_VERSION}_${NOTE}" -COMMAND="export PYTHONPATH=evaluation/miniwob:\$PYTHONPATH && poetry run python evaluation/miniwob/run_infer.py \ +COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/mint/.gitignore b/evaluation/benchmarks/mint/.gitignore similarity index 100% rename from evaluation/mint/.gitignore rename to evaluation/benchmarks/mint/.gitignore diff --git a/evaluation/mint/Dockerfile b/evaluation/benchmarks/mint/Dockerfile similarity index 100% rename from evaluation/mint/Dockerfile rename to evaluation/benchmarks/mint/Dockerfile diff --git a/evaluation/mint/README.md b/evaluation/benchmarks/mint/README.md similarity index 85% rename from evaluation/mint/README.md rename to evaluation/benchmarks/mint/README.md index 950996cc49..bfaeb713bc 100644 --- a/evaluation/mint/README.md +++ b/evaluation/benchmarks/mint/README.md @@ -6,7 +6,7 @@ We support evaluation of the [Eurus subset focus on math and code reasoning](htt ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. 
## Start the evaluation @@ -15,7 +15,7 @@ We are using the MINT dataset hosted on [Hugging Face](https://huggingface.co/da Following is the basic command to start the evaluation. Currently, the only agent supported with MINT is `CodeActAgent`. ```bash -./evaluation/mint/scripts/run_infer.sh [model_config] [git-version] [subset] [eval_limit] +./evaluation/benchmarks/mint/scripts/run_infer.sh [model_config] [git-version] [subset] [eval_limit] ``` where `model_config` is mandatory, while others are optional. @@ -34,7 +34,7 @@ Note: in order to use `eval_limit`, you must also set `subset`. For example, ```bash -./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3 +./evaluation/benchmarks/mint/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3 ``` ## Reference diff --git a/evaluation/mint/config_variables.py b/evaluation/benchmarks/mint/config_variables.py similarity index 100% rename from evaluation/mint/config_variables.py rename to evaluation/benchmarks/mint/config_variables.py diff --git a/evaluation/mint/datatypes.py b/evaluation/benchmarks/mint/datatypes.py similarity index 100% rename from evaluation/mint/datatypes.py rename to evaluation/benchmarks/mint/datatypes.py diff --git a/evaluation/mint/env.py b/evaluation/benchmarks/mint/env.py similarity index 100% rename from evaluation/mint/env.py rename to evaluation/benchmarks/mint/env.py diff --git a/evaluation/mint/prompts/__init__.py b/evaluation/benchmarks/mint/prompts/__init__.py similarity index 100% rename from evaluation/mint/prompts/__init__.py rename to evaluation/benchmarks/mint/prompts/__init__.py diff --git a/evaluation/mint/prompts/template_with_tool.txt b/evaluation/benchmarks/mint/prompts/template_with_tool.txt similarity index 100% rename from evaluation/mint/prompts/template_with_tool.txt rename to evaluation/benchmarks/mint/prompts/template_with_tool.txt diff --git a/evaluation/mint/requirements.txt b/evaluation/benchmarks/mint/requirements.txt similarity index 100% rename from evaluation/mint/requirements.txt rename to evaluation/benchmarks/mint/requirements.txt diff --git a/evaluation/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py similarity index 97% rename from evaluation/mint/run_infer.py rename to evaluation/benchmarks/mint/run_infer.py index 7f6985fc2a..4414e1c462 100644 --- a/evaluation/mint/run_infer.py +++ b/evaluation/benchmarks/mint/run_infer.py @@ -6,10 +6,10 @@ from typing import Any import pandas as pd from datasets import load_dataset -from evaluation.mint.datatypes import TaskState -from evaluation.mint.env import SimplifiedEnv -from evaluation.mint.prompts import ToolPromptTemplate -from evaluation.mint.tasks import Task +from evaluation.benchmarks.mint.datatypes import TaskState +from evaluation.benchmarks.mint.env import SimplifiedEnv +from evaluation.benchmarks.mint.prompts import ToolPromptTemplate +from evaluation.benchmarks.mint.tasks import Task from evaluation.utils.shared import ( EvalMetadata, EvalOutput, diff --git a/evaluation/mint/scripts/run_infer.sh b/evaluation/benchmarks/mint/scripts/run_infer.sh similarity index 100% rename from evaluation/mint/scripts/run_infer.sh rename to evaluation/benchmarks/mint/scripts/run_infer.sh diff --git a/evaluation/mint/tasks/__init__.py b/evaluation/benchmarks/mint/tasks/__init__.py similarity index 50% rename from evaluation/mint/tasks/__init__.py rename to evaluation/benchmarks/mint/tasks/__init__.py index 4f6ac721ac..96c628f854 100644 --- a/evaluation/mint/tasks/__init__.py +++ 
b/evaluation/benchmarks/mint/tasks/__init__.py @@ -1,6 +1,6 @@ -from evaluation.mint.tasks.base import Task -from evaluation.mint.tasks.codegen import HumanEvalTask, MBPPTask -from evaluation.mint.tasks.reasoning import ( +from evaluation.benchmarks.mint.tasks.base import Task +from evaluation.benchmarks.mint.tasks.codegen import HumanEvalTask, MBPPTask +from evaluation.benchmarks.mint.tasks.reasoning import ( MultipleChoiceTask, ReasoningTask, TheoremqaTask, diff --git a/evaluation/mint/tasks/base.py b/evaluation/benchmarks/mint/tasks/base.py similarity index 100% rename from evaluation/mint/tasks/base.py rename to evaluation/benchmarks/mint/tasks/base.py diff --git a/evaluation/mint/tasks/codegen.py b/evaluation/benchmarks/mint/tasks/codegen.py similarity index 98% rename from evaluation/mint/tasks/codegen.py rename to evaluation/benchmarks/mint/tasks/codegen.py index 8a80594ce4..cbd127ac0e 100644 --- a/evaluation/mint/tasks/codegen.py +++ b/evaluation/benchmarks/mint/tasks/codegen.py @@ -2,7 +2,7 @@ import logging from utils import check_correctness -from evaluation.mint.tasks.base import Task +from evaluation.benchmarks.mint.tasks.base import Task LOGGER = logging.getLogger('MINT') diff --git a/evaluation/mint/tasks/in_context_examples/humaneval/with_tool.txt b/evaluation/benchmarks/mint/tasks/in_context_examples/humaneval/with_tool.txt similarity index 100% rename from evaluation/mint/tasks/in_context_examples/humaneval/with_tool.txt rename to evaluation/benchmarks/mint/tasks/in_context_examples/humaneval/with_tool.txt diff --git a/evaluation/mint/tasks/in_context_examples/mbpp/with_tool.txt b/evaluation/benchmarks/mint/tasks/in_context_examples/mbpp/with_tool.txt similarity index 100% rename from evaluation/mint/tasks/in_context_examples/mbpp/with_tool.txt rename to evaluation/benchmarks/mint/tasks/in_context_examples/mbpp/with_tool.txt diff --git a/evaluation/mint/tasks/in_context_examples/reasoning/with_tool.txt b/evaluation/benchmarks/mint/tasks/in_context_examples/reasoning/with_tool.txt similarity index 100% rename from evaluation/mint/tasks/in_context_examples/reasoning/with_tool.txt rename to evaluation/benchmarks/mint/tasks/in_context_examples/reasoning/with_tool.txt diff --git a/evaluation/mint/tasks/reasoning.py b/evaluation/benchmarks/mint/tasks/reasoning.py similarity index 100% rename from evaluation/mint/tasks/reasoning.py rename to evaluation/benchmarks/mint/tasks/reasoning.py diff --git a/evaluation/mint/utils.py b/evaluation/benchmarks/mint/utils.py similarity index 100% rename from evaluation/mint/utils.py rename to evaluation/benchmarks/mint/utils.py diff --git a/evaluation/ml_bench/README.md b/evaluation/benchmarks/ml_bench/README.md similarity index 88% rename from evaluation/ml_bench/README.md rename to evaluation/benchmarks/ml_bench/README.md index 0ad9cca8f7..528edddc14 100644 --- a/evaluation/ml_bench/README.md +++ b/evaluation/benchmarks/ml_bench/README.md @@ -19,8 +19,8 @@ Please follow instruction [here](../README.md#setup) to setup your local develop To run the evaluation on the ML-Bench dataset, use the following command: ```bash -./evaluation/ml_bench/scripts/run_infer.sh [model_config] [git-version] [split] [agent] [eval_limit] -# e.g., ./evaluation/ml_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 full CodeActAgent 10 +./evaluation/benchmarks/ml_bench/scripts/run_infer.sh [model_config] [git-version] [split] [agent] [eval_limit] +# e.g., ./evaluation/benchmarks/ml_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 full CodeActAgent 10 
``` You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`. @@ -30,8 +30,8 @@ You can replace `eval_gpt4_1106_preview` with any model you set up in `config.to To score the evaluation output, use the following command: ```bash -./evaluation/ml_bench/scripts/summarise_results.py [eval_output_dir] -# e.g., ./evaluation/ml_bench/scripts/summarise_results.py evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5 +./evaluation/benchmarks/ml_bench/scripts/summarise_results.py [eval_output_dir] +# e.g., ./evaluation/benchmarks/ml_bench/scripts/summarise_results.py evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5 ``` ## Run Error Analysis on ML-Bench @@ -39,8 +39,8 @@ To score the evaluation output, use the following command: To run error analysis on the ML-Bench dataset, use the following command: ```bash -./evaluation/ml_bench/scripts/run_analysis.sh [eval_output_dir] [model_config] -# e.g., ./evaluation/ml_bench/scripts/run_analysis.sh evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5/output.jsonl eval_gpt4_1106_preview +./evaluation/benchmarks/ml_bench/scripts/run_analysis.sh [eval_output_dir] [model_config] +# e.g., ./evaluation/benchmarks/ml_bench/scripts/run_analysis.sh evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5/output.jsonl eval_gpt4_1106_preview ``` This command generates a report on the evaluation output and provides insights into the agent's performance. @@ -105,7 +105,7 @@ The `metrics` field contains the parsed evaluation metrics from the `eval_output ## Customization -You can customize the evaluation script by modifying the `evaluation/ml_bench/run_infer.py` file. This script handles loading the ML-Bench dataset, running the agent on each task instance, and saving the evaluation outputs. +You can customize the evaluation script by modifying the `evaluation/benchmarks/ml_bench/run_infer.py` file. This script handles loading the ML-Bench dataset, running the agent on each task instance, and saving the evaluation outputs. Feel free to adjust the configuration, logging, and output formatting to suit your needs. 
diff --git a/evaluation/ml_bench/__init__.py b/evaluation/benchmarks/ml_bench/__init__.py similarity index 100% rename from evaluation/ml_bench/__init__.py rename to evaluation/benchmarks/ml_bench/__init__.py diff --git a/evaluation/ml_bench/run_analysis.py b/evaluation/benchmarks/ml_bench/run_analysis.py similarity index 100% rename from evaluation/ml_bench/run_analysis.py rename to evaluation/benchmarks/ml_bench/run_analysis.py diff --git a/evaluation/ml_bench/run_infer.py b/evaluation/benchmarks/ml_bench/run_infer.py similarity index 100% rename from evaluation/ml_bench/run_infer.py rename to evaluation/benchmarks/ml_bench/run_infer.py diff --git a/evaluation/ml_bench/scripts/cleanup.sh b/evaluation/benchmarks/ml_bench/scripts/cleanup.sh similarity index 100% rename from evaluation/ml_bench/scripts/cleanup.sh rename to evaluation/benchmarks/ml_bench/scripts/cleanup.sh diff --git a/evaluation/ml_bench/scripts/run_analysis.sh b/evaluation/benchmarks/ml_bench/scripts/run_analysis.sh similarity index 84% rename from evaluation/ml_bench/scripts/run_analysis.sh rename to evaluation/benchmarks/ml_bench/scripts/run_analysis.sh index 8571fe70f3..d5fe6365ca 100644 --- a/evaluation/ml_bench/scripts/run_analysis.sh +++ b/evaluation/benchmarks/ml_bench/scripts/run_analysis.sh @@ -17,7 +17,7 @@ fi echo "MODEL_CONFIG: $MODEL_CONFIG" echo "RESULT_FILE: $RESULT_FILE" -COMMAND="poetry run python evaluation/ml_bench/run_analysis.py \ +COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_analysis.py \ --llm-config $MODEL_CONFIG \ --json_file_path $RESULT_FILE" diff --git a/evaluation/ml_bench/scripts/run_infer.sh b/evaluation/benchmarks/ml_bench/scripts/run_infer.sh similarity index 93% rename from evaluation/ml_bench/scripts/run_infer.sh rename to evaluation/benchmarks/ml_bench/scripts/run_infer.sh index 4ecbae514a..97ff0003fc 100755 --- a/evaluation/ml_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/ml_bench/scripts/run_infer.sh @@ -32,7 +32,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/ml_bench/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/ml_bench/scripts/summarise_results.py b/evaluation/benchmarks/ml_bench/scripts/summarise_results.py similarity index 100% rename from evaluation/ml_bench/scripts/summarise_results.py rename to evaluation/benchmarks/ml_bench/scripts/summarise_results.py diff --git a/evaluation/scienceagentbench/Dockerfile b/evaluation/benchmarks/scienceagentbench/Dockerfile similarity index 100% rename from evaluation/scienceagentbench/Dockerfile rename to evaluation/benchmarks/scienceagentbench/Dockerfile diff --git a/evaluation/scienceagentbench/Dockerfile.evaluator b/evaluation/benchmarks/scienceagentbench/Dockerfile.evaluator similarity index 100% rename from evaluation/scienceagentbench/Dockerfile.evaluator rename to evaluation/benchmarks/scienceagentbench/Dockerfile.evaluator diff --git a/evaluation/scienceagentbench/README.md b/evaluation/benchmarks/scienceagentbench/README.md similarity index 90% rename from evaluation/scienceagentbench/README.md rename to evaluation/benchmarks/scienceagentbench/README.md index 3182c2e117..4d97917721 100644 --- a/evaluation/scienceagentbench/README.md +++ b/evaluation/benchmarks/scienceagentbench/README.md @@ -13,10 +13,10 @@ To prevent benchmark data contamination, we only provide the annotation sheet 
on ## Run Inference on ScienceAgentBench ```bash -./evaluation/scienceagentbench/scripts/run_infer.sh [model_config] [git-version] [use_knowledge] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh [model_config] [git-version] [use_knowledge] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example -./evaluation/scienceagentbench/scripts/run_infer.sh llm.eval_gpt4o 0.9.3 +./evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh llm.eval_gpt4o 0.9.3 ``` where `model_config` is mandatory, and the rest are optional. diff --git a/evaluation/scienceagentbench/post_proc.py b/evaluation/benchmarks/scienceagentbench/post_proc.py similarity index 100% rename from evaluation/scienceagentbench/post_proc.py rename to evaluation/benchmarks/scienceagentbench/post_proc.py diff --git a/evaluation/scienceagentbench/run_infer.py b/evaluation/benchmarks/scienceagentbench/run_infer.py similarity index 100% rename from evaluation/scienceagentbench/run_infer.py rename to evaluation/benchmarks/scienceagentbench/run_infer.py diff --git a/evaluation/scienceagentbench/scripts/run_infer.sh b/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh similarity index 92% rename from evaluation/scienceagentbench/scripts/run_infer.sh rename to evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh index 7667e57237..970f10ed2f 100755 --- a/evaluation/scienceagentbench/scripts/run_infer.sh +++ b/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh @@ -32,7 +32,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/scienceagentbench/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --use_knowledge $USE_KNOWLEDGE \ diff --git a/evaluation/swe_bench/README.md b/evaluation/benchmarks/swe_bench/README.md similarity index 80% rename from evaluation/swe_bench/README.md rename to evaluation/benchmarks/swe_bench/README.md index 147d2a35ea..b69a738955 100644 --- a/evaluation/swe_bench/README.md +++ b/evaluation/benchmarks/swe_bench/README.md @@ -27,10 +27,10 @@ Make sure your Docker daemon is running, and you have ample disk space (at least When the `run_infer.sh` script is started, it will automatically pull the relevant SWE-Bench images. For example, for instance ID `django_django-11011`, it will try to pull our pre-build docker image `sweb.eval.x86_64.django_s_django-11011` from DockerHub. This image will be used create an OpenHands runtime image where the agent will operate on. ```bash -./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example -./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test +./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test ``` where `model_config` is mandatory, and the rest are optional. 
@@ -62,7 +62,7 @@ Let's say you'd like to run 10 instances using `llm.eval_gpt4_1106_preview` and then your command would be: ```bash -./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10 +./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10 ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -70,23 +70,23 @@ then your command would be: This is in limited beta. Contact Xingyao over slack if you want to try this out! ```bash -./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example - This runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with max 30 iteration per instances, with 16 number of workers running in parallel ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \ -./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test +./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test ``` To clean-up all existing runtime you've already started, run: ```bash -ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh +ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh ``` ### Specify a subset of tasks to run infer If you would like to specify a list of tasks you'd like to benchmark on, you could -create a `config.toml` under `./evaluation/swe_bench/` folder, and put a list +create a `config.toml` under `./evaluation/benchmarks/swe_bench/` folder, and put a list attribute named `selected_ids`, e.g. 
```toml @@ -105,19 +105,19 @@ After running the inference, you will obtain a `output.jsonl` (by default it wil **(Recommended for reproducibility)** If you have extra local space (e.g., 200GB), you can try pull the [instance-level docker images](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level) we've prepared by running: ```bash -evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh instance +evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh instance ``` If you want to save disk space a bit (e.g., with ~50GB free disk space), while speeding up the image pre-build process, you can pull the environment-level docker images: ```bash -evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh env +evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh env ``` If you want to evaluate on the full SWE-Bench test set: ```bash -evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh instance full +evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh instance full ``` ### Run evaluation @@ -136,10 +136,10 @@ NOTE, you should have already pulled the instance-level OR env-level docker imag Then you can run the following: ```bash -./evaluation/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split] +./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split] # Example -./evaluation/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl +./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl ``` The script now accepts optional arguments: @@ -150,10 +150,10 @@ The script now accepts optional arguments: For example, to evaluate a specific instance with a custom dataset and split: ```bash -./evaluation/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL instance_123 princeton-nlp/SWE-bench test +./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL instance_123 princeton-nlp/SWE-bench test ``` -> You can also pass in a JSONL with [SWE-Bench format](https://github.com/princeton-nlp/SWE-bench/blob/main/tutorials/evaluation.md#-creating-predictions) to `./evaluation/swe_bench/scripts/eval_infer.sh`, where each line is a JSON of `{"model_patch": "XXX", "model_name_or_path": "YYY", "instance_id": "ZZZ"}`. +> You can also pass in a JSONL with [SWE-Bench format](https://github.com/princeton-nlp/SWE-bench/blob/main/tutorials/evaluation.md#-creating-predictions) to `./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh`, where each line is a JSON of `{"model_patch": "XXX", "model_name_or_path": "YYY", "instance_id": "ZZZ"}`. The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/` with the following files/directory: @@ -166,17 +166,17 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be This is in limited beta. Contact Xingyao over slack if you want to try this out! 
```bash -./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers] +./evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers] # Example - This evaluates patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 number of workers running in parallel ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \ -evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test" +evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test" ``` To clean-up all existing runtimes that you've already started, run: ```bash -ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh +ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh ``` diff --git a/evaluation/swe_bench/__init__.py b/evaluation/benchmarks/swe_bench/__init__.py similarity index 100% rename from evaluation/swe_bench/__init__.py rename to evaluation/benchmarks/swe_bench/__init__.py diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py similarity index 99% rename from evaluation/swe_bench/eval_infer.py rename to evaluation/benchmarks/swe_bench/eval_infer.py index d40f984fca..95f65245f2 100644 --- a/evaluation/swe_bench/eval_infer.py +++ b/evaluation/benchmarks/swe_bench/eval_infer.py @@ -12,7 +12,7 @@ from swebench.harness.run_evaluation import ( from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec from swebench.harness.utils import load_swebench_dataset -from evaluation.swe_bench.run_infer import get_instance_docker_image +from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image from evaluation.utils.shared import ( EvalMetadata, EvalOutput, diff --git a/evaluation/swe_bench/examples/example_agent_output.jsonl b/evaluation/benchmarks/swe_bench/examples/example_agent_output.jsonl similarity index 100% rename from evaluation/swe_bench/examples/example_agent_output.jsonl rename to evaluation/benchmarks/swe_bench/examples/example_agent_output.jsonl diff --git a/evaluation/swe_bench/examples/example_model_output.json b/evaluation/benchmarks/swe_bench/examples/example_model_output.json similarity index 100% rename from evaluation/swe_bench/examples/example_model_output.json rename to evaluation/benchmarks/swe_bench/examples/example_model_output.json diff --git a/evaluation/swe_bench/prompt.py b/evaluation/benchmarks/swe_bench/prompt.py similarity index 100% rename from evaluation/swe_bench/prompt.py rename to evaluation/benchmarks/swe_bench/prompt.py diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py similarity index 99% rename from evaluation/swe_bench/run_infer.py rename to evaluation/benchmarks/swe_bench/run_infer.py index 9cb9dd77f4..3ffc08d29b 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/benchmarks/swe_bench/run_infer.py @@ -9,7 +9,7 @@ import toml from datasets import load_dataset import 
openhands.agenthub -from evaluation.swe_bench.prompt import CODEACT_SWE_PROMPT +from evaluation.benchmarks.swe_bench.prompt import CODEACT_SWE_PROMPT from evaluation.utils.shared import ( EvalException, EvalMetadata, diff --git a/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh b/evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh similarity index 100% rename from evaluation/swe_bench/scripts/cleanup_remote_runtime.sh rename to evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh diff --git a/evaluation/swe_bench/scripts/docker/all-swebench-full-instance-images.txt b/evaluation/benchmarks/swe_bench/scripts/docker/all-swebench-full-instance-images.txt similarity index 100% rename from evaluation/swe_bench/scripts/docker/all-swebench-full-instance-images.txt rename to evaluation/benchmarks/swe_bench/scripts/docker/all-swebench-full-instance-images.txt diff --git a/evaluation/swe_bench/scripts/docker/all-swebench-lite-instance-images.txt b/evaluation/benchmarks/swe_bench/scripts/docker/all-swebench-lite-instance-images.txt similarity index 100% rename from evaluation/swe_bench/scripts/docker/all-swebench-lite-instance-images.txt rename to evaluation/benchmarks/swe_bench/scripts/docker/all-swebench-lite-instance-images.txt diff --git a/evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh b/evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh similarity index 100% rename from evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh rename to evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh diff --git a/evaluation/swe_bench/scripts/docker/push_docker_instance_images.py b/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py similarity index 96% rename from evaluation/swe_bench/scripts/docker/push_docker_instance_images.py rename to evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py index 20fb1b94c0..52e2ea4cb1 100644 --- a/evaluation/swe_bench/scripts/docker/push_docker_instance_images.py +++ b/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py @@ -32,7 +32,7 @@ from tqdm import tqdm from openhands.core.logger import openhands_logger as logger logger.setLevel('ERROR') -from evaluation.swe_bench.run_infer import get_instance_docker_image # noqa +from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image # noqa parser = argparse.ArgumentParser() parser.add_argument('--dataset', type=str, default='princeton-nlp/SWE-bench_Lite') diff --git a/evaluation/swe_bench/scripts/docker/push_eval_docker.sh b/evaluation/benchmarks/swe_bench/scripts/docker/push_eval_docker.sh similarity index 100% rename from evaluation/swe_bench/scripts/docker/push_eval_docker.sh rename to evaluation/benchmarks/swe_bench/scripts/docker/push_eval_docker.sh diff --git a/evaluation/swe_bench/scripts/eval/compare_outputs.py b/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py similarity index 100% rename from evaluation/swe_bench/scripts/eval/compare_outputs.py rename to evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py diff --git a/evaluation/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh similarity index 87% rename from evaluation/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh rename to evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh index 
8bbaa6ddce..044f9972f4 100755 --- a/evaluation/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh +++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh @@ -5,7 +5,7 @@ NEW_FOLDER_PATH=${FOLDER_PATH}.swebench_submission mkdir -p $NEW_FOLDER_PATH # Build all_preds.jsonl -poetry run python evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl +poetry run python evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl mv $FOLDER_PATH/output.swebench.jsonl $NEW_FOLDER_PATH/all_preds.jsonl # Build trajs/ diff --git a/evaluation/swe_bench/scripts/eval/convert_oh_output_to_md.py b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py similarity index 97% rename from evaluation/swe_bench/scripts/eval/convert_oh_output_to_md.py rename to evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py index 17a375ee3b..8e9fc407d9 100755 --- a/evaluation/swe_bench/scripts/eval/convert_oh_output_to_md.py +++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py @@ -8,7 +8,7 @@ import os import pandas as pd from tqdm import tqdm -from evaluation.swe_bench.eval_infer import process_git_patch +from evaluation.benchmarks.swe_bench.eval_infer import process_git_patch from openhands.events.serialization import event_from_dict tqdm.pandas() diff --git a/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py similarity index 93% rename from evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py rename to evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py index 5006d3dde3..f333012f48 100644 --- a/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py +++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py @@ -3,7 +3,7 @@ import os import pandas as pd -from evaluation.swe_bench.eval_infer import process_git_patch +from evaluation.benchmarks.swe_bench.eval_infer import process_git_patch parser = argparse.ArgumentParser() parser.add_argument('oh_output_file', type=str) diff --git a/evaluation/swe_bench/scripts/eval/download_gold_patch.py b/evaluation/benchmarks/swe_bench/scripts/eval/download_gold_patch.py similarity index 100% rename from evaluation/swe_bench/scripts/eval/download_gold_patch.py rename to evaluation/benchmarks/swe_bench/scripts/eval/download_gold_patch.py diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py similarity index 100% rename from evaluation/swe_bench/scripts/eval/summarize_outputs.py rename to evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py diff --git a/evaluation/swe_bench/scripts/eval/update_output_with_eval.py b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py similarity index 100% rename from evaluation/swe_bench/scripts/eval/update_output_with_eval.py rename to evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py diff --git a/evaluation/swe_bench/scripts/eval_infer.sh b/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh similarity index 95% rename from evaluation/swe_bench/scripts/eval_infer.sh rename to evaluation/benchmarks/swe_bench/scripts/eval_infer.sh index 8e263e10ca..13ef271671 100755 --- a/evaluation/swe_bench/scripts/eval_infer.sh +++ 
b/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh @@ -58,7 +58,7 @@ else # ==== Convert OH format to SWE-bench format ==== echo "Merged output file with fine-grained report will be saved to $FILE_DIR" - poetry run python3 evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH + poetry run python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH # replace .jsonl with .swebench.jsonl in filename SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl} echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL" @@ -125,7 +125,7 @@ if [ -z "$INSTANCE_ID" ]; then mv $REPORT_PATH $RESULT_OUTPUT_DIR/report.json fi - poetry run python evaluation/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH + poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH else echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID" diff --git a/evaluation/swe_bench/scripts/eval_infer_remote.sh b/evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh similarity index 83% rename from evaluation/swe_bench/scripts/eval_infer_remote.sh rename to evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh index dead194ef2..6828097836 100755 --- a/evaluation/swe_bench/scripts/eval_infer_remote.sh +++ b/evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh @@ -28,7 +28,7 @@ fi echo "... Evaluating on $INPUT_FILE ..." -COMMAND="poetry run python evaluation/swe_bench/eval_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/swe_bench/eval_infer.py \ --eval-num-workers $NUM_WORKERS \ --input-file $INPUT_FILE \ --dataset $DATASET \ @@ -43,4 +43,4 @@ fi eval $COMMAND # update the output with evaluation results -poetry run python evaluation/swe_bench/scripts/eval/update_output_with_eval.py $INPUT_FILE +poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $INPUT_FILE diff --git a/evaluation/swe_bench/scripts/run_infer.sh b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh similarity index 97% rename from evaluation/swe_bench/scripts/run_infer.sh rename to evaluation/benchmarks/swe_bench/scripts/run_infer.sh index 520003635a..a27bd7cdbb 100755 --- a/evaluation/swe_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh @@ -84,7 +84,7 @@ fi function run_eval() { local eval_note=$1 - COMMAND="poetry run python evaluation/swe_bench/run_infer.py \ + COMMAND="poetry run python evaluation/benchmarks/swe_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations $MAX_ITER \ diff --git a/evaluation/swe_bench/scripts/setup/compare_patch_filename.py b/evaluation/benchmarks/swe_bench/scripts/setup/compare_patch_filename.py similarity index 100% rename from evaluation/swe_bench/scripts/setup/compare_patch_filename.py rename to evaluation/benchmarks/swe_bench/scripts/setup/compare_patch_filename.py diff --git a/evaluation/swe_bench/scripts/setup/instance_swe_entry.sh b/evaluation/benchmarks/swe_bench/scripts/setup/instance_swe_entry.sh similarity index 100% rename from evaluation/swe_bench/scripts/setup/instance_swe_entry.sh rename to evaluation/benchmarks/swe_bench/scripts/setup/instance_swe_entry.sh diff --git a/evaluation/swe_bench/scripts/setup/prepare_swe_utils.sh b/evaluation/benchmarks/swe_bench/scripts/setup/prepare_swe_utils.sh similarity index 93% rename from evaluation/swe_bench/scripts/setup/prepare_swe_utils.sh rename to 
evaluation/benchmarks/swe_bench/scripts/setup/prepare_swe_utils.sh index bc1f4c03b7..7091b6f586 100755 --- a/evaluation/swe_bench/scripts/setup/prepare_swe_utils.sh +++ b/evaluation/benchmarks/swe_bench/scripts/setup/prepare_swe_utils.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -EVAL_WORKSPACE="evaluation/swe_bench/eval_workspace" +EVAL_WORKSPACE="evaluation/benchmarks/swe_bench/eval_workspace" mkdir -p $EVAL_WORKSPACE # 1. Prepare REPO diff --git a/evaluation/swe_bench/scripts/setup/swe_entry.sh b/evaluation/benchmarks/swe_bench/scripts/setup/swe_entry.sh similarity index 100% rename from evaluation/swe_bench/scripts/setup/swe_entry.sh rename to evaluation/benchmarks/swe_bench/scripts/setup/swe_entry.sh diff --git a/evaluation/toolqa/Dockerfile b/evaluation/benchmarks/toolqa/Dockerfile similarity index 100% rename from evaluation/toolqa/Dockerfile rename to evaluation/benchmarks/toolqa/Dockerfile diff --git a/evaluation/toolqa/README.md b/evaluation/benchmarks/toolqa/README.md similarity index 88% rename from evaluation/toolqa/README.md rename to evaluation/benchmarks/toolqa/README.md index 07f74645e2..eda478f448 100644 --- a/evaluation/toolqa/README.md +++ b/evaluation/benchmarks/toolqa/README.md @@ -11,7 +11,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop Make sure your Docker daemon is running, then run this bash script: ```bash -bash evaluation/toolqa/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [dataset] [hardness] [wolfram_alpha_appid] +bash evaluation/benchmarks/toolqa/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [dataset] [hardness] [wolfram_alpha_appid] ``` where `model_config` is mandatory, while all other arguments are optional. @@ -40,5 +40,5 @@ Let's say you'd like to run 10 instances using `llm` and CodeActAgent on `coffee then your command would be: ```bash -bash evaluation/toolqa/scripts/run_infer.sh llm CodeActAgent 10 coffee easy +bash evaluation/benchmarks/toolqa/scripts/run_infer.sh llm CodeActAgent 10 coffee easy ``` diff --git a/evaluation/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py similarity index 98% rename from evaluation/toolqa/run_infer.py rename to evaluation/benchmarks/toolqa/run_infer.py index a7c5242d2f..c99f15a89a 100644 --- a/evaluation/toolqa/run_infer.py +++ b/evaluation/benchmarks/toolqa/run_infer.py @@ -4,7 +4,7 @@ from typing import Any import pandas as pd -from evaluation.toolqa.utils import encode_question, eval_answer, get_data +from evaluation.benchmarks.toolqa.utils import encode_question, eval_answer, get_data from evaluation.utils.shared import ( EvalMetadata, EvalOutput, diff --git a/evaluation/toolqa/scripts/run_infer.sh b/evaluation/benchmarks/toolqa/scripts/run_infer.sh similarity index 95% rename from evaluation/toolqa/scripts/run_infer.sh rename to evaluation/benchmarks/toolqa/scripts/run_infer.sh index 2af978e76a..bfe3471f4f 100755 --- a/evaluation/toolqa/scripts/run_infer.sh +++ b/evaluation/benchmarks/toolqa/scripts/run_infer.sh @@ -47,7 +47,7 @@ echo "DATASET: $DATASET" echo "HARDNESS: $HARDNESS" echo "WOLFRAM_APPID: $WOLFRAM_APPID" -COMMAND="poetry run python evaluation/toolqa/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/toolqa/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/toolqa/utils.py b/evaluation/benchmarks/toolqa/utils.py similarity index 100% rename from evaluation/toolqa/utils.py rename to evaluation/benchmarks/toolqa/utils.py diff --git 
a/evaluation/webarena/README.md b/evaluation/benchmarks/webarena/README.md similarity index 91% rename from evaluation/webarena/README.md rename to evaluation/benchmarks/webarena/README.md index e81f92c592..3e403d5a7f 100644 --- a/evaluation/webarena/README.md +++ b/evaluation/benchmarks/webarena/README.md @@ -24,7 +24,7 @@ Follow the WebArena environment setup guide carefully, and make sure the URL fie ```bash export WEBARENA_BASE_URL= export OPENAI_API_KEY="yourkey" # this key is required for some WebArena validators that utilize LLMs -bash evaluation/webarena/scripts/run_infer.sh +bash evaluation/benchmarks/webarena/scripts/run_infer.sh ``` Results will be in `evaluation/evaluation_outputs/outputs/webarena/` @@ -32,7 +32,7 @@ Results will be in `evaluation/evaluation_outputs/outputs/webarena/` To calculate the success rate, run: ```sh -poetry run python evaluation/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl +poetry run python evaluation/benchmarks/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl ``` ## Submit your evaluation results diff --git a/evaluation/webarena/__init__.py b/evaluation/benchmarks/webarena/__init__.py similarity index 100% rename from evaluation/webarena/__init__.py rename to evaluation/benchmarks/webarena/__init__.py diff --git a/evaluation/webarena/get_success_rate.py b/evaluation/benchmarks/webarena/get_success_rate.py similarity index 100% rename from evaluation/webarena/get_success_rate.py rename to evaluation/benchmarks/webarena/get_success_rate.py diff --git a/evaluation/webarena/run_infer.py b/evaluation/benchmarks/webarena/run_infer.py similarity index 100% rename from evaluation/webarena/run_infer.py rename to evaluation/benchmarks/webarena/run_infer.py diff --git a/evaluation/webarena/scripts/run_infer.sh b/evaluation/benchmarks/webarena/scripts/run_infer.sh similarity index 87% rename from evaluation/webarena/scripts/run_infer.sh rename to evaluation/benchmarks/webarena/scripts/run_infer.sh index c5b2c1ecd0..22372b82d7 100755 --- a/evaluation/webarena/scripts/run_infer.sh +++ b/evaluation/benchmarks/webarena/scripts/run_infer.sh @@ -4,7 +4,7 @@ set -eo pipefail source "evaluation/utils/version_control.sh" # configure webarena websites and environment -source evaluation/webarena/scripts/webarena_env.sh +source evaluation/benchmarks/webarena/scripts/webarena_env.sh # configure browsing agent export USE_NAV="false" @@ -35,7 +35,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG" EVAL_NOTE="$AGENT_VERSION" -COMMAND="poetry run python evaluation/webarena/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 15 \