From 678436da30333240f01780f3fa5ac162b0a1d13a Mon Sep 17 00:00:00 2001 From: OpenHands Date: Mon, 25 Nov 2024 08:35:52 -0500 Subject: [PATCH] Fix issue #5222: [Refactor]: Refactor the evaluation directory (#5223) Co-authored-by: Engel Nyst --- .github/workflows/eval-runner.yml | 6 +-- .../usage/how-to/evaluation-harness.md | 2 +- .../usage/how-to/evaluation-harness.md | 2 +- .../usage/how-to/evaluation-harness.md | 2 +- evaluation/README.md | 38 ++++++++++--------- evaluation/{ => benchmarks}/EDA/README.md | 4 +- evaluation/{ => benchmarks}/EDA/game.py | 0 evaluation/{ => benchmarks}/EDA/run_infer.py | 2 +- .../{ => benchmarks}/EDA/scripts/run_infer.sh | 2 +- .../{ => benchmarks}/agent_bench/README.md | 6 +-- .../{ => benchmarks}/agent_bench/__init__.py | 0 .../{ => benchmarks}/agent_bench/helper.py | 0 .../{ => benchmarks}/agent_bench/run_infer.py | 2 +- .../agent_bench/scripts/run_infer.sh | 2 +- .../agent_bench/scripts/summarise_results.py | 0 .../{ => benchmarks}/aider_bench/README.md | 14 +++---- .../aider_bench/create_dataset.py | 0 .../{ => benchmarks}/aider_bench/helper.py | 0 .../{ => benchmarks}/aider_bench/run_infer.py | 2 +- .../aider_bench/scripts/run_infer.sh | 2 +- .../aider_bench/scripts/summarize_results.py | 0 .../{ => benchmarks}/biocoder/README.md | 4 +- .../{ => benchmarks}/biocoder/run_infer.py | 2 +- .../biocoder/scripts/run_infer.sh | 2 +- .../scripts/setup/copy_changed_code.py | 0 .../biocoder/scripts/setup/remove_code.py | 0 evaluation/{ => benchmarks}/biocoder/utils.py | 0 evaluation/{ => benchmarks}/bird/README.md | 2 +- evaluation/{ => benchmarks}/bird/__init__.py | 0 evaluation/{ => benchmarks}/bird/run_infer.py | 0 .../bird/scripts/run_infer.sh | 2 +- .../browsing_delegation/README.md | 2 +- .../browsing_delegation/run_infer.py | 0 .../browsing_delegation/scripts/run_infer.sh | 2 +- .../{ => benchmarks}/commit0_bench/README.md | 12 +++--- .../commit0_bench/run_infer.py | 0 .../scripts/cleanup_remote_runtime.sh | 0 .../commit0_bench/scripts/run_infer.sh | 2 +- .../{ => benchmarks}/discoverybench/README.md | 4 +- .../discoverybench/eval_utils/README.md | 0 .../discoverybench/eval_utils/__init__.py | 0 .../eval_utils/eval_w_subhypo_gen.py | 0 .../discoverybench/eval_utils/lm_utils.py | 0 .../eval_utils/openai_helpers.py | 0 .../eval_utils/openai_semantic_gen_prompts.py | 0 .../eval_utils/response_parser.py | 0 .../discoverybench/run_infer.py | 4 +- .../discoverybench/scripts/run_infer.sh | 2 +- evaluation/{ => benchmarks}/gaia/README.md | 10 ++--- evaluation/{ => benchmarks}/gaia/get_score.py | 0 evaluation/{ => benchmarks}/gaia/run_infer.py | 2 +- evaluation/{ => benchmarks}/gaia/scorer.py | 0 .../gaia/scripts/run_infer.sh | 2 +- evaluation/{ => benchmarks}/gorilla/README.md | 4 +- .../{ => benchmarks}/gorilla/ast_eval_hf.py | 0 .../{ => benchmarks}/gorilla/ast_eval_tf.py | 0 .../{ => benchmarks}/gorilla/ast_eval_th.py | 0 .../{ => benchmarks}/gorilla/run_infer.py | 2 +- .../gorilla/scripts/run_infer.sh | 2 +- evaluation/{ => benchmarks}/gorilla/utils.py | 0 evaluation/{ => benchmarks}/gpqa/README.md | 2 +- evaluation/{ => benchmarks}/gpqa/__init__.py | 0 evaluation/{ => benchmarks}/gpqa/run_infer.py | 0 .../gpqa/scripts/run_infer.sh | 2 +- .../{ => benchmarks}/humanevalfix/README.md | 2 +- .../{ => benchmarks}/humanevalfix/__init__.py | 0 .../humanevalfix/run_infer.py | 0 .../humanevalfix/scripts/run_infer.sh | 2 +- .../logic_reasoning/.cache_program/facts.kfb | 0 .../logic_reasoning/.cache_program/rules.krb | 0 .../logic_reasoning/Dockerfile | 0 
.../logic_reasoning/README.md | 2 +- .../logic_reasoning/__init__.py | 0 .../logic_reasoning/instruction.txt | 0 .../logic_reasoning/logic_inference.py | 0 .../logic_reasoning/run_infer.py | 0 .../logic_reasoning/scripts/run_infer.sh | 2 +- .../{ => benchmarks}/miniwob/Dockerfile | 0 evaluation/{ => benchmarks}/miniwob/README.md | 8 ++-- .../miniwob/get_avg_reward.py | 0 .../{ => benchmarks}/miniwob/run_infer.py | 0 .../miniwob/scripts/run_infer.sh | 2 +- evaluation/{ => benchmarks}/mint/.gitignore | 0 evaluation/{ => benchmarks}/mint/Dockerfile | 0 evaluation/{ => benchmarks}/mint/README.md | 6 +-- .../{ => benchmarks}/mint/config_variables.py | 0 evaluation/{ => benchmarks}/mint/datatypes.py | 0 evaluation/{ => benchmarks}/mint/env.py | 0 .../{ => benchmarks}/mint/prompts/__init__.py | 0 .../mint/prompts/template_with_tool.txt | 0 .../{ => benchmarks}/mint/requirements.txt | 0 evaluation/{ => benchmarks}/mint/run_infer.py | 8 ++-- .../mint/scripts/run_infer.sh | 0 .../{ => benchmarks}/mint/tasks/__init__.py | 6 +-- .../{ => benchmarks}/mint/tasks/base.py | 0 .../{ => benchmarks}/mint/tasks/codegen.py | 2 +- .../humaneval/with_tool.txt | 0 .../in_context_examples/mbpp/with_tool.txt | 0 .../reasoning/with_tool.txt | 0 .../{ => benchmarks}/mint/tasks/reasoning.py | 0 evaluation/{ => benchmarks}/mint/utils.py | 0 .../{ => benchmarks}/ml_bench/README.md | 14 +++---- .../{ => benchmarks}/ml_bench/__init__.py | 0 .../{ => benchmarks}/ml_bench/run_analysis.py | 0 .../{ => benchmarks}/ml_bench/run_infer.py | 0 .../ml_bench/scripts/cleanup.sh | 0 .../ml_bench/scripts/run_analysis.sh | 2 +- .../ml_bench/scripts/run_infer.sh | 2 +- .../ml_bench/scripts/summarise_results.py | 0 .../scienceagentbench/Dockerfile | 0 .../scienceagentbench/Dockerfile.evaluator | 0 .../scienceagentbench/README.md | 4 +- .../scienceagentbench/post_proc.py | 0 .../scienceagentbench/run_infer.py | 0 .../scienceagentbench/scripts/run_infer.sh | 2 +- .../{ => benchmarks}/swe_bench/README.md | 34 ++++++++--------- .../{ => benchmarks}/swe_bench/__init__.py | 0 .../{ => benchmarks}/swe_bench/eval_infer.py | 2 +- .../examples/example_agent_output.jsonl | 0 .../examples/example_model_output.json | 0 .../{ => benchmarks}/swe_bench/prompt.py | 0 .../{ => benchmarks}/swe_bench/run_infer.py | 2 +- .../scripts/cleanup_remote_runtime.sh | 0 .../all-swebench-full-instance-images.txt | 0 .../all-swebench-lite-instance-images.txt | 0 .../scripts/docker/pull_all_eval_docker.sh | 0 .../docker/push_docker_instance_images.py | 2 +- .../scripts/docker/push_eval_docker.sh | 0 .../swe_bench/scripts/eval/compare_outputs.py | 0 ...onvert_oh_folder_to_swebench_submission.sh | 2 +- .../scripts/eval/convert_oh_output_to_md.py | 2 +- .../eval/convert_oh_output_to_swe_json.py | 2 +- .../scripts/eval/download_gold_patch.py | 0 .../scripts/eval/summarize_outputs.py | 0 .../scripts/eval/update_output_with_eval.py | 0 .../swe_bench/scripts/eval_infer.sh | 4 +- .../swe_bench/scripts/eval_infer_remote.sh | 4 +- .../swe_bench/scripts/run_infer.sh | 2 +- .../scripts/setup/compare_patch_filename.py | 0 .../scripts/setup/instance_swe_entry.sh | 0 .../scripts/setup/prepare_swe_utils.sh | 2 +- .../swe_bench/scripts/setup/swe_entry.sh | 0 evaluation/{ => benchmarks}/toolqa/Dockerfile | 0 evaluation/{ => benchmarks}/toolqa/README.md | 4 +- .../{ => benchmarks}/toolqa/run_infer.py | 2 +- .../toolqa/scripts/run_infer.sh | 2 +- evaluation/{ => benchmarks}/toolqa/utils.py | 0 .../{ => benchmarks}/webarena/README.md | 4 +- .../{ => benchmarks}/webarena/__init__.py | 0 
.../webarena/get_success_rate.py | 0 .../{ => benchmarks}/webarena/run_infer.py | 0 .../webarena/scripts/run_infer.sh | 4 +- 152 files changed, 147 insertions(+), 143 deletions(-) rename evaluation/{ => benchmarks}/EDA/README.md (88%) rename evaluation/{ => benchmarks}/EDA/game.py (100%) rename evaluation/{ => benchmarks}/EDA/run_infer.py (99%) rename evaluation/{ => benchmarks}/EDA/scripts/run_infer.sh (95%) rename evaluation/{ => benchmarks}/agent_bench/README.md (80%) rename evaluation/{ => benchmarks}/agent_bench/__init__.py (100%) rename evaluation/{ => benchmarks}/agent_bench/helper.py (100%) rename evaluation/{ => benchmarks}/agent_bench/run_infer.py (99%) rename evaluation/{ => benchmarks}/agent_bench/scripts/run_infer.sh (84%) rename evaluation/{ => benchmarks}/agent_bench/scripts/summarise_results.py (100%) rename evaluation/{ => benchmarks}/aider_bench/README.md (80%) rename evaluation/{ => benchmarks}/aider_bench/create_dataset.py (100%) rename evaluation/{ => benchmarks}/aider_bench/helper.py (100%) rename evaluation/{ => benchmarks}/aider_bench/run_infer.py (99%) rename evaluation/{ => benchmarks}/aider_bench/scripts/run_infer.sh (89%) rename evaluation/{ => benchmarks}/aider_bench/scripts/summarize_results.py (100%) rename evaluation/{ => benchmarks}/biocoder/README.md (92%) rename evaluation/{ => benchmarks}/biocoder/run_infer.py (99%) rename evaluation/{ => benchmarks}/biocoder/scripts/run_infer.sh (92%) rename evaluation/{ => benchmarks}/biocoder/scripts/setup/copy_changed_code.py (100%) rename evaluation/{ => benchmarks}/biocoder/scripts/setup/remove_code.py (100%) rename evaluation/{ => benchmarks}/biocoder/utils.py (100%) rename evaluation/{ => benchmarks}/bird/README.md (99%) rename evaluation/{ => benchmarks}/bird/__init__.py (100%) rename evaluation/{ => benchmarks}/bird/run_infer.py (100%) rename evaluation/{ => benchmarks}/bird/scripts/run_infer.sh (92%) rename evaluation/{ => benchmarks}/browsing_delegation/README.md (91%) rename evaluation/{ => benchmarks}/browsing_delegation/run_infer.py (100%) rename evaluation/{ => benchmarks}/browsing_delegation/scripts/run_infer.sh (90%) rename evaluation/{ => benchmarks}/commit0_bench/README.md (81%) rename evaluation/{ => benchmarks}/commit0_bench/run_infer.py (100%) rename evaluation/{ => benchmarks}/commit0_bench/scripts/cleanup_remote_runtime.sh (100%) rename evaluation/{ => benchmarks}/commit0_bench/scripts/run_infer.sh (97%) rename evaluation/{ => benchmarks}/discoverybench/README.md (90%) rename evaluation/{ => benchmarks}/discoverybench/eval_utils/README.md (100%) rename evaluation/{ => benchmarks}/discoverybench/eval_utils/__init__.py (100%) rename evaluation/{ => benchmarks}/discoverybench/eval_utils/eval_w_subhypo_gen.py (100%) rename evaluation/{ => benchmarks}/discoverybench/eval_utils/lm_utils.py (100%) rename evaluation/{ => benchmarks}/discoverybench/eval_utils/openai_helpers.py (100%) rename evaluation/{ => benchmarks}/discoverybench/eval_utils/openai_semantic_gen_prompts.py (100%) rename evaluation/{ => benchmarks}/discoverybench/eval_utils/response_parser.py (100%) rename evaluation/{ => benchmarks}/discoverybench/run_infer.py (99%) rename evaluation/{ => benchmarks}/discoverybench/scripts/run_infer.sh (91%) rename evaluation/{ => benchmarks}/gaia/README.md (77%) rename evaluation/{ => benchmarks}/gaia/get_score.py (100%) rename evaluation/{ => benchmarks}/gaia/run_infer.py (99%) rename evaluation/{ => benchmarks}/gaia/scorer.py (100%) rename evaluation/{ => benchmarks}/gaia/scripts/run_infer.sh (93%) 
rename evaluation/{ => benchmarks}/gorilla/README.md (88%) rename evaluation/{ => benchmarks}/gorilla/ast_eval_hf.py (100%) rename evaluation/{ => benchmarks}/gorilla/ast_eval_tf.py (100%) rename evaluation/{ => benchmarks}/gorilla/ast_eval_th.py (100%) rename evaluation/{ => benchmarks}/gorilla/run_infer.py (98%) rename evaluation/{ => benchmarks}/gorilla/scripts/run_infer.sh (93%) rename evaluation/{ => benchmarks}/gorilla/utils.py (100%) rename evaluation/{ => benchmarks}/gpqa/README.md (94%) rename evaluation/{ => benchmarks}/gpqa/__init__.py (100%) rename evaluation/{ => benchmarks}/gpqa/run_infer.py (100%) rename evaluation/{ => benchmarks}/gpqa/scripts/run_infer.sh (93%) rename evaluation/{ => benchmarks}/humanevalfix/README.md (99%) rename evaluation/{ => benchmarks}/humanevalfix/__init__.py (100%) rename evaluation/{ => benchmarks}/humanevalfix/run_infer.py (100%) rename evaluation/{ => benchmarks}/humanevalfix/scripts/run_infer.sh (97%) rename evaluation/{ => benchmarks}/logic_reasoning/.cache_program/facts.kfb (100%) rename evaluation/{ => benchmarks}/logic_reasoning/.cache_program/rules.krb (100%) rename evaluation/{ => benchmarks}/logic_reasoning/Dockerfile (100%) rename evaluation/{ => benchmarks}/logic_reasoning/README.md (83%) rename evaluation/{ => benchmarks}/logic_reasoning/__init__.py (100%) rename evaluation/{ => benchmarks}/logic_reasoning/instruction.txt (100%) rename evaluation/{ => benchmarks}/logic_reasoning/logic_inference.py (100%) rename evaluation/{ => benchmarks}/logic_reasoning/run_infer.py (100%) rename evaluation/{ => benchmarks}/logic_reasoning/scripts/run_infer.sh (92%) rename evaluation/{ => benchmarks}/miniwob/Dockerfile (100%) rename evaluation/{ => benchmarks}/miniwob/README.md (79%) rename evaluation/{ => benchmarks}/miniwob/get_avg_reward.py (100%) rename evaluation/{ => benchmarks}/miniwob/run_infer.py (100%) rename evaluation/{ => benchmarks}/miniwob/scripts/run_infer.sh (86%) rename evaluation/{ => benchmarks}/mint/.gitignore (100%) rename evaluation/{ => benchmarks}/mint/Dockerfile (100%) rename evaluation/{ => benchmarks}/mint/README.md (85%) rename evaluation/{ => benchmarks}/mint/config_variables.py (100%) rename evaluation/{ => benchmarks}/mint/datatypes.py (100%) rename evaluation/{ => benchmarks}/mint/env.py (100%) rename evaluation/{ => benchmarks}/mint/prompts/__init__.py (100%) rename evaluation/{ => benchmarks}/mint/prompts/template_with_tool.txt (100%) rename evaluation/{ => benchmarks}/mint/requirements.txt (100%) rename evaluation/{ => benchmarks}/mint/run_infer.py (97%) rename evaluation/{ => benchmarks}/mint/scripts/run_infer.sh (100%) rename evaluation/{ => benchmarks}/mint/tasks/__init__.py (50%) rename evaluation/{ => benchmarks}/mint/tasks/base.py (100%) rename evaluation/{ => benchmarks}/mint/tasks/codegen.py (98%) rename evaluation/{ => benchmarks}/mint/tasks/in_context_examples/humaneval/with_tool.txt (100%) rename evaluation/{ => benchmarks}/mint/tasks/in_context_examples/mbpp/with_tool.txt (100%) rename evaluation/{ => benchmarks}/mint/tasks/in_context_examples/reasoning/with_tool.txt (100%) rename evaluation/{ => benchmarks}/mint/tasks/reasoning.py (100%) rename evaluation/{ => benchmarks}/mint/utils.py (100%) rename evaluation/{ => benchmarks}/ml_bench/README.md (88%) rename evaluation/{ => benchmarks}/ml_bench/__init__.py (100%) rename evaluation/{ => benchmarks}/ml_bench/run_analysis.py (100%) rename evaluation/{ => benchmarks}/ml_bench/run_infer.py (100%) rename evaluation/{ => 
benchmarks}/ml_bench/scripts/cleanup.sh (100%) rename evaluation/{ => benchmarks}/ml_bench/scripts/run_analysis.sh (84%) rename evaluation/{ => benchmarks}/ml_bench/scripts/run_infer.sh (93%) rename evaluation/{ => benchmarks}/ml_bench/scripts/summarise_results.py (100%) rename evaluation/{ => benchmarks}/scienceagentbench/Dockerfile (100%) rename evaluation/{ => benchmarks}/scienceagentbench/Dockerfile.evaluator (100%) rename evaluation/{ => benchmarks}/scienceagentbench/README.md (90%) rename evaluation/{ => benchmarks}/scienceagentbench/post_proc.py (100%) rename evaluation/{ => benchmarks}/scienceagentbench/run_infer.py (100%) rename evaluation/{ => benchmarks}/scienceagentbench/scripts/run_infer.sh (92%) rename evaluation/{ => benchmarks}/swe_bench/README.md (80%) rename evaluation/{ => benchmarks}/swe_bench/__init__.py (100%) rename evaluation/{ => benchmarks}/swe_bench/eval_infer.py (99%) rename evaluation/{ => benchmarks}/swe_bench/examples/example_agent_output.jsonl (100%) rename evaluation/{ => benchmarks}/swe_bench/examples/example_model_output.json (100%) rename evaluation/{ => benchmarks}/swe_bench/prompt.py (100%) rename evaluation/{ => benchmarks}/swe_bench/run_infer.py (99%) rename evaluation/{ => benchmarks}/swe_bench/scripts/cleanup_remote_runtime.sh (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/docker/all-swebench-full-instance-images.txt (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/docker/all-swebench-lite-instance-images.txt (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/docker/pull_all_eval_docker.sh (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/docker/push_docker_instance_images.py (96%) rename evaluation/{ => benchmarks}/swe_bench/scripts/docker/push_eval_docker.sh (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval/compare_outputs.py (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh (87%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval/convert_oh_output_to_md.py (97%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py (93%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval/download_gold_patch.py (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval/summarize_outputs.py (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval/update_output_with_eval.py (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval_infer.sh (95%) rename evaluation/{ => benchmarks}/swe_bench/scripts/eval_infer_remote.sh (83%) rename evaluation/{ => benchmarks}/swe_bench/scripts/run_infer.sh (97%) rename evaluation/{ => benchmarks}/swe_bench/scripts/setup/compare_patch_filename.py (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/setup/instance_swe_entry.sh (100%) rename evaluation/{ => benchmarks}/swe_bench/scripts/setup/prepare_swe_utils.sh (93%) rename evaluation/{ => benchmarks}/swe_bench/scripts/setup/swe_entry.sh (100%) rename evaluation/{ => benchmarks}/toolqa/Dockerfile (100%) rename evaluation/{ => benchmarks}/toolqa/README.md (88%) rename evaluation/{ => benchmarks}/toolqa/run_infer.py (98%) rename evaluation/{ => benchmarks}/toolqa/scripts/run_infer.sh (95%) rename evaluation/{ => benchmarks}/toolqa/utils.py (100%) rename evaluation/{ => benchmarks}/webarena/README.md (91%) rename evaluation/{ => benchmarks}/webarena/__init__.py (100%) rename evaluation/{ => benchmarks}/webarena/get_success_rate.py (100%) rename evaluation/{ => benchmarks}/webarena/run_infer.py 
(100%) rename evaluation/{ => benchmarks}/webarena/scripts/run_infer.sh (87%) diff --git a/.github/workflows/eval-runner.yml b/.github/workflows/eval-runner.yml index 6f1c225efe..9b2576a264 100644 --- a/.github/workflows/eval-runner.yml +++ b/.github/workflows/eval-runner.yml @@ -84,12 +84,12 @@ jobs: EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images run: | - poetry run ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test + poetry run ./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test OUTPUT_FOLDER=$(find evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Lite-test/CodeActAgent -name "deepseek-chat_maxiter_50_N_*-no-hint-run_1" -type d | head -n 1) echo "OUTPUT_FOLDER for SWE-bench evaluation: $OUTPUT_FOLDER" - poetry run ./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FOLDER/output.jsonl $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test + poetry run ./evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FOLDER/output.jsonl $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test - poetry run ./evaluation/swe_bench/scripts/eval/summarize_outputs.py $OUTPUT_FOLDER/output.jsonl > summarize_outputs.log 2>&1 + poetry run ./evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py $OUTPUT_FOLDER/output.jsonl > summarize_outputs.log 2>&1 echo "SWEBENCH_REPORT<<EOF" >> $GITHUB_ENV cat summarize_outputs.log >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV diff --git a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md index 3f19105399..b215a1ca51 100644 --- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md +++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md @@ -76,7 +76,7 @@ La fonction `run_controller()` est le cœur de l'exécution d'OpenHands. Elle g ## Le moyen le plus simple de commencer : Explorer les benchmarks existants -Nous vous encourageons à examiner les différents benchmarks d'évaluation disponibles dans le [répertoire `evaluation/`](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation) de notre dépôt. +Nous vous encourageons à examiner les différents benchmarks d'évaluation disponibles dans le [répertoire `evaluation/benchmarks/`](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation/benchmarks) de notre dépôt. Pour intégrer votre propre benchmark, nous vous suggérons de commencer par celui qui ressemble le plus à vos besoins. Cette approche peut considérablement rationaliser votre processus d'intégration, vous permettant de vous appuyer sur les structures existantes et de les adapter à vos exigences spécifiques.
diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md index eb99a30ea3..dc41e0fa1c 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md @@ -73,7 +73,7 @@ OpenHands 的主要入口点在 `openhands/core/main.py` 中。以下是它工 ## 入门最简单的方法:探索现有基准 -我们鼓励您查看我们仓库的 [`evaluation/` 目录](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation)中提供的各种评估基准。 +我们鼓励您查看我们仓库的 [`evaluation/benchmarks/` 目录](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation/benchmarks)中提供的各种评估基准。 要集成您自己的基准,我们建议从最接近您需求的基准开始。这种方法可以显著简化您的集成过程,允许您在现有结构的基础上进行构建并使其适应您的特定要求。 diff --git a/docs/modules/usage/how-to/evaluation-harness.md b/docs/modules/usage/how-to/evaluation-harness.md index e4d1e5d15b..339783ea8d 100644 --- a/docs/modules/usage/how-to/evaluation-harness.md +++ b/docs/modules/usage/how-to/evaluation-harness.md @@ -73,7 +73,7 @@ The `run_controller()` function is the core of OpenHands's execution. It manages ## Easiest way to get started: Exploring Existing Benchmarks -We encourage you to review the various evaluation benchmarks available in the [`evaluation/` directory](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation) of our repository. +We encourage you to review the various evaluation benchmarks available in the [`evaluation/benchmarks/` directory](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation/benchmarks) of our repository. To integrate your own benchmark, we suggest starting with the one that most closely resembles your needs. This approach can significantly streamline your integration process, allowing you to build upon existing structures and adapt them to your specific requirements. 
diff --git a/evaluation/README.md b/evaluation/README.md index 8be0822875..8ef9bcce65 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -46,28 +46,32 @@ The OpenHands evaluation harness supports a wide variety of benchmarks across so ### Software Engineering -- SWE-Bench: [`evaluation/swe_bench`](./swe_bench) -- HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix) -- BIRD: [`evaluation/bird`](./bird) -- BioCoder: [`evaluation/ml_bench`](./ml_bench) -- ML-Bench: [`evaluation/ml_bench`](./ml_bench) -- APIBench: [`evaluation/gorilla`](./gorilla/) -- ToolQA: [`evaluation/toolqa`](./toolqa/) -- AiderBench: [`evaluation/aider_bench`](./aider_bench/) +- SWE-Bench: [`evaluation/benchmarks/swe_bench`](./benchmarks/swe_bench) +- HumanEvalFix: [`evaluation/benchmarks/humanevalfix`](./benchmarks/humanevalfix) +- BIRD: [`evaluation/benchmarks/bird`](./benchmarks/bird) +- BioCoder: [`evaluation/benchmarks/biocoder`](./benchmarks/biocoder) +- ML-Bench: [`evaluation/benchmarks/ml_bench`](./benchmarks/ml_bench) +- APIBench: [`evaluation/benchmarks/gorilla`](./benchmarks/gorilla/) +- ToolQA: [`evaluation/benchmarks/toolqa`](./benchmarks/toolqa/) +- AiderBench: [`evaluation/benchmarks/aider_bench`](./benchmarks/aider_bench/) +- Commit0: [`evaluation/benchmarks/commit0_bench`](./benchmarks/commit0_bench/) +- DiscoveryBench: [`evaluation/benchmarks/discoverybench`](./benchmarks/discoverybench/) ### Web Browsing -- WebArena: [`evaluation/webarena`](./webarena/) -- MiniWob++: [`evaluation/miniwob`](./miniwob/) +- WebArena: [`evaluation/benchmarks/webarena`](./benchmarks/webarena/) +- MiniWob++: [`evaluation/benchmarks/miniwob`](./benchmarks/miniwob/) +- Browsing Delegation: [`evaluation/benchmarks/browsing_delegation`](./benchmarks/browsing_delegation/) ### Misc. Assistance -- GAIA: [`evaluation/gaia`](./gaia) -- GPQA: [`evaluation/gpqa`](./gpqa) -- AgentBench: [`evaluation/agent_bench`](./agent_bench) -- MINT: [`evaluation/mint`](./mint) -- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA) -- ProofWriter: [`evaluation/logic_reasoning`](./logic_reasoning) +- GAIA: [`evaluation/benchmarks/gaia`](./benchmarks/gaia) +- GPQA: [`evaluation/benchmarks/gpqa`](./benchmarks/gpqa) +- AgentBench: [`evaluation/benchmarks/agent_bench`](./benchmarks/agent_bench) +- MINT: [`evaluation/benchmarks/mint`](./benchmarks/mint) +- Entity deduction Arena (EDA): [`evaluation/benchmarks/EDA`](./benchmarks/EDA) +- ProofWriter: [`evaluation/benchmarks/logic_reasoning`](./benchmarks/logic_reasoning) +- ScienceAgentBench: [`evaluation/benchmarks/scienceagentbench`](./benchmarks/scienceagentbench) ## Result Visualization @@ -79,7 +83,7 @@ You can start your own fork of [our huggingface evaluation outputs](https://hugg To learn more about how to integrate your benchmark into OpenHands, check out [tutorial here](https://docs.all-hands.dev/modules/usage/how-to/evaluation-harness). Briefly, -- Each subfolder contains a specific benchmark or experiment. For example, `evaluation/swe_bench` should contain +- Each subfolder contains a specific benchmark or experiment. For example, `evaluation/benchmarks/swe_bench` should contain all the preprocessing/evaluation/analysis scripts. - Raw data and experimental records should not be stored within this repo. - For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization.
diff --git a/evaluation/EDA/README.md b/evaluation/benchmarks/EDA/README.md similarity index 88% rename from evaluation/EDA/README.md rename to evaluation/benchmarks/EDA/README.md index a8a2e4fbf0..fee875c5dd 100644 --- a/evaluation/EDA/README.md +++ b/evaluation/benchmarks/EDA/README.md @@ -12,7 +12,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ```bash export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate another party of conversation) -./evaluation/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit] +./evaluation/benchmarks/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit] ``` where `model_config` is mandatory, while `git-version`, `agent`, `dataset` and `eval_limit` are optional. @@ -33,7 +33,7 @@ to `CodeActAgent`. For example, ```bash -./evaluation/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent things +./evaluation/benchmarks/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent things ``` ## Reference diff --git a/evaluation/EDA/game.py b/evaluation/benchmarks/EDA/game.py similarity index 100% rename from evaluation/EDA/game.py rename to evaluation/benchmarks/EDA/game.py diff --git a/evaluation/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py similarity index 99% rename from evaluation/EDA/run_infer.py rename to evaluation/benchmarks/EDA/run_infer.py index 2549207392..cce795e954 100644 --- a/evaluation/EDA/run_infer.py +++ b/evaluation/benchmarks/EDA/run_infer.py @@ -4,7 +4,7 @@ import os import pandas as pd from datasets import load_dataset -from evaluation.EDA.game import Q20Game, Q20GameCelebrity +from evaluation.benchmarks.EDA.game import Q20Game, Q20GameCelebrity from evaluation.utils.shared import ( EvalMetadata, EvalOutput, diff --git a/evaluation/EDA/scripts/run_infer.sh b/evaluation/benchmarks/EDA/scripts/run_infer.sh similarity index 95% rename from evaluation/EDA/scripts/run_infer.sh rename to evaluation/benchmarks/EDA/scripts/run_infer.sh index afa9eaa7b2..a803073f73 100755 --- a/evaluation/EDA/scripts/run_infer.sh +++ b/evaluation/benchmarks/EDA/scripts/run_infer.sh @@ -43,7 +43,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "DATASET: $DATASET" -COMMAND="poetry run python evaluation/EDA/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --dataset $DATASET \ diff --git a/evaluation/agent_bench/README.md b/evaluation/benchmarks/agent_bench/README.md similarity index 80% rename from evaluation/agent_bench/README.md rename to evaluation/benchmarks/agent_bench/README.md index 1133a09a5c..e8a1e3dc95 100644 --- a/evaluation/agent_bench/README.md +++ b/evaluation/benchmarks/agent_bench/README.md @@ -9,7 +9,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Start the evaluation ```bash -./evaluation/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] +./evaluation/benchmarks/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] ``` - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your @@ -25,7 +25,7 @@ in order to use `eval_limit`, you must also set `agent`. Following is the basic command to start the evaluation. -You can update the arguments in the script `evaluation/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on. 
+You can update the arguments in the script `evaluation/benchmarks/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on. - `--agent-cls`, the agent to use. For example, `CodeActAgent`. - `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`. @@ -34,5 +34,5 @@ You can update the arguments in the script `evaluation/agent_bench/scripts/run_i - `--eval-n-limit`: the number of examples to evaluate. For example, `100`. ```bash -./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1 +./evaluation/benchmarks/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1 ``` diff --git a/evaluation/agent_bench/__init__.py b/evaluation/benchmarks/agent_bench/__init__.py similarity index 100% rename from evaluation/agent_bench/__init__.py rename to evaluation/benchmarks/agent_bench/__init__.py diff --git a/evaluation/agent_bench/helper.py b/evaluation/benchmarks/agent_bench/helper.py similarity index 100% rename from evaluation/agent_bench/helper.py rename to evaluation/benchmarks/agent_bench/helper.py diff --git a/evaluation/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py similarity index 99% rename from evaluation/agent_bench/run_infer.py rename to evaluation/benchmarks/agent_bench/run_infer.py index acdf60fe48..693718357a 100644 --- a/evaluation/agent_bench/run_infer.py +++ b/evaluation/benchmarks/agent_bench/run_infer.py @@ -7,7 +7,7 @@ from typing import Any import pandas as pd from datasets import load_dataset -from evaluation.agent_bench.helper import ( +from evaluation.benchmarks.agent_bench.helper import ( FAKE_RESPONSES, INST_SUFFIXES, compare_results, diff --git a/evaluation/agent_bench/scripts/run_infer.sh b/evaluation/benchmarks/agent_bench/scripts/run_infer.sh similarity index 84% rename from evaluation/agent_bench/scripts/run_infer.sh rename to evaluation/benchmarks/agent_bench/scripts/run_infer.sh index 713e420d53..16e98b074b 100755 --- a/evaluation/agent_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/agent_bench/scripts/run_infer.sh @@ -26,7 +26,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="export PYTHONPATH=evaluation/agent_bench:\$PYTHONPATH && poetry run python evaluation/agent_bench/run_infer.py \ +COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/agent_bench/scripts/summarise_results.py b/evaluation/benchmarks/agent_bench/scripts/summarise_results.py similarity index 100% rename from evaluation/agent_bench/scripts/summarise_results.py rename to evaluation/benchmarks/agent_bench/scripts/summarise_results.py diff --git a/evaluation/aider_bench/README.md b/evaluation/benchmarks/aider_bench/README.md similarity index 80% rename from evaluation/aider_bench/README.md rename to evaluation/benchmarks/aider_bench/README.md index 07b782a256..965fc06d7e 100644 --- a/evaluation/aider_bench/README.md +++ b/evaluation/benchmarks/aider_bench/README.md @@ -16,7 +16,7 @@ development environment and LLM. 
## Start the evaluation ```bash -./evaluation/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] +./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] ``` - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for @@ -42,7 +42,7 @@ export SKIP_NUM=12 # skip the first 12 instances from the dataset Following is the basic command to start the evaluation. You can update the arguments in the script -`evaluation/aider_bench/scripts/run_infer.sh`, such as `--max-iterations`, +`evaluation/benchmarks/aider_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on: - `--agent-cls`, the agent to use. For example, `CodeActAgent`. @@ -53,7 +53,7 @@ You can update the arguments in the script - `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`. ```bash -./evaluation/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" +./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -61,25 +61,25 @@ You can update the arguments in the script This is in limited beta. Contact Xingyao over slack if you want to try this out! ```bash -./evaluation/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] +./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] # Example - This runs evaluation on CodeActAgent for 133 instances on aider_bench test set, with 2 workers running in parallel export ALLHANDS_API_KEY="YOUR-API-KEY" export RUNTIME=remote export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" -./evaluation/aider_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 133 2 +./evaluation/benchmarks/aider_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 133 2 ``` ## Summarize Results ```bash -poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] +poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] ``` Full example: ```bash -poetry run python ./evaluation/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl +poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl ``` This will list the instances that passed and the instances that failed. 
For each diff --git a/evaluation/aider_bench/create_dataset.py b/evaluation/benchmarks/aider_bench/create_dataset.py similarity index 100% rename from evaluation/aider_bench/create_dataset.py rename to evaluation/benchmarks/aider_bench/create_dataset.py diff --git a/evaluation/aider_bench/helper.py b/evaluation/benchmarks/aider_bench/helper.py similarity index 100% rename from evaluation/aider_bench/helper.py rename to evaluation/benchmarks/aider_bench/helper.py diff --git a/evaluation/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py similarity index 99% rename from evaluation/aider_bench/run_infer.py rename to evaluation/benchmarks/aider_bench/run_infer.py index c6e5bbb9db..f7796c7696 100644 --- a/evaluation/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -7,7 +7,7 @@ from typing import Any import pandas as pd from datasets import load_dataset -from evaluation.aider_bench.helper import ( +from evaluation.benchmarks.aider_bench.helper import ( FAKE_RESPONSES, INST_SUFFIXES, INSTRUCTIONS_ADDENDUM, diff --git a/evaluation/aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh similarity index 89% rename from evaluation/aider_bench/scripts/run_infer.sh rename to evaluation/benchmarks/aider_bench/scripts/run_infer.sh index 0afc060f36..0b3824ceae 100755 --- a/evaluation/aider_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh @@ -39,7 +39,7 @@ if [ "$USE_UNIT_TESTS" = true ]; then EVAL_NOTE=$EVAL_NOTE-w-test fi -COMMAND="export PYTHONPATH=evaluation/aider_bench:\$PYTHONPATH && poetry run python evaluation/aider_bench/run_infer.py \ +COMMAND="export PYTHONPATH=evaluation/benchmarks/aider_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/aider_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/aider_bench/scripts/summarize_results.py b/evaluation/benchmarks/aider_bench/scripts/summarize_results.py similarity index 100% rename from evaluation/aider_bench/scripts/summarize_results.py rename to evaluation/benchmarks/aider_bench/scripts/summarize_results.py diff --git a/evaluation/biocoder/README.md b/evaluation/benchmarks/biocoder/README.md similarity index 92% rename from evaluation/biocoder/README.md rename to evaluation/benchmarks/biocoder/README.md index ad9cf55f53..035f2d20bf 100644 --- a/evaluation/biocoder/README.md +++ b/evaluation/benchmarks/biocoder/README.md @@ -21,7 +21,7 @@ To reproduce this image, please see the Dockerfile_Openopenhands in the `biocode ```bash -./evaluation/biocoder/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] +./evaluation/benchmarks/biocoder/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] ``` where `model_config` is mandatory, while `git-version`, `agent`, `dataset` and `eval_limit` are optional. 
@@ -43,7 +43,7 @@ with current OpenHands version, then your command would be: ## Examples ```bash -./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 HEAD CodeActAgent 1 +./evaluation/benchmarks/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 HEAD CodeActAgent 1 ``` ## Reference diff --git a/evaluation/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py similarity index 99% rename from evaluation/biocoder/run_infer.py rename to evaluation/benchmarks/biocoder/run_infer.py index 68bbf892d5..f5cdd44471 100644 --- a/evaluation/biocoder/run_infer.py +++ b/evaluation/benchmarks/biocoder/run_infer.py @@ -8,7 +8,7 @@ from typing import Any import pandas as pd from datasets import load_dataset -from evaluation.biocoder.utils import BiocoderData +from evaluation.benchmarks.biocoder.utils import BiocoderData from evaluation.utils.shared import ( EvalMetadata, EvalOutput, diff --git a/evaluation/biocoder/scripts/run_infer.sh b/evaluation/benchmarks/biocoder/scripts/run_infer.sh similarity index 92% rename from evaluation/biocoder/scripts/run_infer.sh rename to evaluation/benchmarks/biocoder/scripts/run_infer.sh index b2ae17f98c..61fddb6211 100755 --- a/evaluation/biocoder/scripts/run_infer.sh +++ b/evaluation/benchmarks/biocoder/scripts/run_infer.sh @@ -28,7 +28,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "DATASET: $DATASET" -COMMAND="poetry run python evaluation/biocoder/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/biocoder/scripts/setup/copy_changed_code.py b/evaluation/benchmarks/biocoder/scripts/setup/copy_changed_code.py similarity index 100% rename from evaluation/biocoder/scripts/setup/copy_changed_code.py rename to evaluation/benchmarks/biocoder/scripts/setup/copy_changed_code.py diff --git a/evaluation/biocoder/scripts/setup/remove_code.py b/evaluation/benchmarks/biocoder/scripts/setup/remove_code.py similarity index 100% rename from evaluation/biocoder/scripts/setup/remove_code.py rename to evaluation/benchmarks/biocoder/scripts/setup/remove_code.py diff --git a/evaluation/biocoder/utils.py b/evaluation/benchmarks/biocoder/utils.py similarity index 100% rename from evaluation/biocoder/utils.py rename to evaluation/benchmarks/biocoder/utils.py diff --git a/evaluation/bird/README.md b/evaluation/benchmarks/bird/README.md similarity index 99% rename from evaluation/bird/README.md rename to evaluation/benchmarks/bird/README.md index 8f63423527..90e3fa300c 100644 --- a/evaluation/bird/README.md +++ b/evaluation/benchmarks/bird/README.md @@ -9,7 +9,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Run Inference on Bird ```bash -./evaluation/bird/scripts/run_infer.sh [model_config] [git-version] +./evaluation/benchmarks/bird/scripts/run_infer.sh [model_config] [git-version] ``` - `model_config`, e.g. 
`eval_gpt4_1106_preview`, is the config group name for your diff --git a/evaluation/bird/__init__.py b/evaluation/benchmarks/bird/__init__.py similarity index 100% rename from evaluation/bird/__init__.py rename to evaluation/benchmarks/bird/__init__.py diff --git a/evaluation/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py similarity index 100% rename from evaluation/bird/run_infer.py rename to evaluation/benchmarks/bird/run_infer.py diff --git a/evaluation/bird/scripts/run_infer.sh b/evaluation/benchmarks/bird/scripts/run_infer.sh similarity index 92% rename from evaluation/bird/scripts/run_infer.sh rename to evaluation/benchmarks/bird/scripts/run_infer.sh index b2e2c64c42..bf69d9d50b 100755 --- a/evaluation/bird/scripts/run_infer.sh +++ b/evaluation/benchmarks/bird/scripts/run_infer.sh @@ -26,7 +26,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/bird/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 5 \ diff --git a/evaluation/browsing_delegation/README.md b/evaluation/benchmarks/browsing_delegation/README.md similarity index 91% rename from evaluation/browsing_delegation/README.md rename to evaluation/benchmarks/browsing_delegation/README.md index 92e9410971..a06170f8b9 100644 --- a/evaluation/browsing_delegation/README.md +++ b/evaluation/benchmarks/browsing_delegation/README.md @@ -12,7 +12,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Run Inference ```bash -./evaluation/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] +./evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] # e.g., ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview_llm HEAD CodeActAgent 300 ``` diff --git a/evaluation/browsing_delegation/run_infer.py b/evaluation/benchmarks/browsing_delegation/run_infer.py similarity index 100% rename from evaluation/browsing_delegation/run_infer.py rename to evaluation/benchmarks/browsing_delegation/run_infer.py diff --git a/evaluation/browsing_delegation/scripts/run_infer.sh b/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh similarity index 90% rename from evaluation/browsing_delegation/scripts/run_infer.sh rename to evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh index c33768dc01..30607ca333 100755 --- a/evaluation/browsing_delegation/scripts/run_infer.sh +++ b/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh @@ -28,7 +28,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG" EVAL_NOTE="$AGENT_VERSION" -COMMAND="poetry run python evaluation/browsing_delegation/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 1 \ diff --git a/evaluation/commit0_bench/README.md b/evaluation/benchmarks/commit0_bench/README.md similarity index 81% rename from evaluation/commit0_bench/README.md rename to evaluation/benchmarks/commit0_bench/README.md index fdfd5812a8..78b58b0213 100644 --- a/evaluation/commit0_bench/README.md +++ b/evaluation/benchmarks/commit0_bench/README.md @@ -24,10 +24,10 @@ Make sure your Docker daemon is running, and you have ample disk space (at least When the `run_infer.sh` script is started, it will automatically pull the `lite` split in Commit0. 
For example, for instance ID `commit-0/minitorch`, it will try to pull our pre-build docker image `wentingzhao/minitorch` from DockerHub. This image will be used create an OpenHands runtime image where the agent will operate on. ```bash -./evaluation/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example -./evaluation/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 16 100 8 wentingzhao/commit0_combined test +./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 16 100 8 wentingzhao/commit0_combined test ``` where `model_config` is mandatory, and the rest are optional. @@ -56,7 +56,7 @@ Let's say you'd like to run 10 instances using `llm.eval_sonnet` and CodeActAgen then your command would be: ```bash -./evaluation/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test +./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -64,17 +64,17 @@ then your command would be: This is in limited beta. Contact Xingyao over slack if you want to try this out! ```bash -./evaluation/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example - This runs evaluation on CodeActAgent for 10 instances on "wentingzhao/commit0_combined"'s test set, with max 30 iteration per instances, with 1 number of workers running in parallel ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="docker.io/wentingzhao" \ -./evaluation/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test +./evaluation/benchmarks/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test ``` To clean-up all existing runtime you've already started, run: ```bash -ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/commit0_bench/scripts/cleanup_remote_runtime.sh +ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/commit0_bench/scripts/cleanup_remote_runtime.sh ``` ### Specify a subset of tasks to run infer diff --git a/evaluation/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py similarity index 100% rename from evaluation/commit0_bench/run_infer.py rename to evaluation/benchmarks/commit0_bench/run_infer.py diff --git a/evaluation/commit0_bench/scripts/cleanup_remote_runtime.sh b/evaluation/benchmarks/commit0_bench/scripts/cleanup_remote_runtime.sh similarity index 100% rename from evaluation/commit0_bench/scripts/cleanup_remote_runtime.sh rename to evaluation/benchmarks/commit0_bench/scripts/cleanup_remote_runtime.sh diff --git a/evaluation/commit0_bench/scripts/run_infer.sh b/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh similarity index 97% rename from 
evaluation/commit0_bench/scripts/run_infer.sh rename to evaluation/benchmarks/commit0_bench/scripts/run_infer.sh index d362a09667..227a5ff05e 100755 --- a/evaluation/commit0_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh @@ -91,7 +91,7 @@ fi function run_eval() { local eval_note=$1 - COMMAND="poetry run python evaluation/commit0_bench/run_infer.py \ + COMMAND="poetry run python evaluation/benchmarks/commit0_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations $MAX_ITER \ diff --git a/evaluation/discoverybench/README.md b/evaluation/benchmarks/discoverybench/README.md similarity index 90% rename from evaluation/discoverybench/README.md rename to evaluation/benchmarks/discoverybench/README.md index a0d8994709..daf5cc34bb 100644 --- a/evaluation/discoverybench/README.md +++ b/evaluation/benchmarks/discoverybench/README.md @@ -16,7 +16,7 @@ 2. Execute the bash script to start DiscoveryBench Evaluation ``` -./evaluation/discoverybench/scripts/run_infer.sh [YOUR MODEL CONFIG] +./evaluation/benchmarks/discoverybench/scripts/run_infer.sh [YOUR MODEL CONFIG] ``` Replace `[YOUR MODEL CONFIG]` with any model the model that you have set up in `config.toml` @@ -27,7 +27,7 @@ When the `run_infer.sh` script is started, it will automatically pull the latest ``` -./evaluation/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] +./evaluation/benchmarks/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] ``` - `MODEL_CONFIG`: Name of the model you want to evaluate with diff --git a/evaluation/discoverybench/eval_utils/README.md b/evaluation/benchmarks/discoverybench/eval_utils/README.md similarity index 100% rename from evaluation/discoverybench/eval_utils/README.md rename to evaluation/benchmarks/discoverybench/eval_utils/README.md diff --git a/evaluation/discoverybench/eval_utils/__init__.py b/evaluation/benchmarks/discoverybench/eval_utils/__init__.py similarity index 100% rename from evaluation/discoverybench/eval_utils/__init__.py rename to evaluation/benchmarks/discoverybench/eval_utils/__init__.py diff --git a/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py b/evaluation/benchmarks/discoverybench/eval_utils/eval_w_subhypo_gen.py similarity index 100% rename from evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py rename to evaluation/benchmarks/discoverybench/eval_utils/eval_w_subhypo_gen.py diff --git a/evaluation/discoverybench/eval_utils/lm_utils.py b/evaluation/benchmarks/discoverybench/eval_utils/lm_utils.py similarity index 100% rename from evaluation/discoverybench/eval_utils/lm_utils.py rename to evaluation/benchmarks/discoverybench/eval_utils/lm_utils.py diff --git a/evaluation/discoverybench/eval_utils/openai_helpers.py b/evaluation/benchmarks/discoverybench/eval_utils/openai_helpers.py similarity index 100% rename from evaluation/discoverybench/eval_utils/openai_helpers.py rename to evaluation/benchmarks/discoverybench/eval_utils/openai_helpers.py diff --git a/evaluation/discoverybench/eval_utils/openai_semantic_gen_prompts.py b/evaluation/benchmarks/discoverybench/eval_utils/openai_semantic_gen_prompts.py similarity index 100% rename from evaluation/discoverybench/eval_utils/openai_semantic_gen_prompts.py rename to evaluation/benchmarks/discoverybench/eval_utils/openai_semantic_gen_prompts.py diff --git a/evaluation/discoverybench/eval_utils/response_parser.py 
b/evaluation/benchmarks/discoverybench/eval_utils/response_parser.py similarity index 100% rename from evaluation/discoverybench/eval_utils/response_parser.py rename to evaluation/benchmarks/discoverybench/eval_utils/response_parser.py diff --git a/evaluation/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py similarity index 99% rename from evaluation/discoverybench/run_infer.py rename to evaluation/benchmarks/discoverybench/run_infer.py index 7cfd2dbac7..6d8dcbd89b 100644 --- a/evaluation/discoverybench/run_infer.py +++ b/evaluation/benchmarks/discoverybench/run_infer.py @@ -5,10 +5,10 @@ import os import git import pandas as pd -from evaluation.discoverybench.eval_utils.eval_w_subhypo_gen import ( +from evaluation.benchmarks.discoverybench.eval_utils.eval_w_subhypo_gen import ( run_eval_gold_vs_gen_NL_hypo_workflow, ) -from evaluation.discoverybench.eval_utils.response_parser import ( +from evaluation.benchmarks.discoverybench.eval_utils.response_parser import ( extract_gen_hypo_from_logs, ) from evaluation.utils.shared import ( diff --git a/evaluation/discoverybench/scripts/run_infer.sh b/evaluation/benchmarks/discoverybench/scripts/run_infer.sh similarity index 91% rename from evaluation/discoverybench/scripts/run_infer.sh rename to evaluation/benchmarks/discoverybench/scripts/run_infer.sh index 8b9fffd7c5..e12b9c1398 100755 --- a/evaluation/discoverybench/scripts/run_infer.sh +++ b/evaluation/benchmarks/discoverybench/scripts/run_infer.sh @@ -29,7 +29,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/discoverybench/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/gaia/README.md b/evaluation/benchmarks/gaia/README.md similarity index 77% rename from evaluation/gaia/README.md rename to evaluation/benchmarks/gaia/README.md index bf1c701328..f592e5f711 100644 --- a/evaluation/gaia/README.md +++ b/evaluation/benchmarks/gaia/README.md @@ -10,11 +10,11 @@ Please follow instruction [here](../README.md#setup) to setup your local develop We are using the GAIA dataset hosted on [Hugging Face](https://huggingface.co/datasets/gaia-benchmark/GAIA). Please accept the terms and make sure to have logged in on your computer by `huggingface-cli login` before running the evaluation. -Following is the basic command to start the evaluation. Here we are evaluating on the validation set for the `2023_all` split. You can adjust `./evaluation/gaia/scripts/run_infer.sh` to change the subset you want to evaluate on. +Following is the basic command to start the evaluation. Here we are evaluating on the validation set for the `2023_all` split. You can adjust `./evaluation/benchmarks/gaia/scripts/run_infer.sh` to change the subset you want to evaluate on. ```bash -./evaluation/gaia/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [gaia_subset] -# e.g., ./evaluation/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 300 +./evaluation/benchmarks/gaia/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [gaia_subset] +# e.g., ./evaluation/benchmarks/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 300 ``` where `model_config` is mandatory, while `git-version`, `agent`, `eval_limit` and `gaia_subset` are optional. @@ -35,13 +35,13 @@ to `CodeActAgent`. 
For example, ```bash -./evaluation/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 10 +./evaluation/benchmarks/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 10 ``` ## Get score Then you can get stats by running the following command: ```bash -python ./evaluation/gaia/get_score.py \ +python ./evaluation/benchmarks/gaia/get_score.py \ --file ``` diff --git a/evaluation/gaia/get_score.py b/evaluation/benchmarks/gaia/get_score.py similarity index 100% rename from evaluation/gaia/get_score.py rename to evaluation/benchmarks/gaia/get_score.py diff --git a/evaluation/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py similarity index 99% rename from evaluation/gaia/run_infer.py rename to evaluation/benchmarks/gaia/run_infer.py index 1fa0c00e6d..fb6d4b3db0 100644 --- a/evaluation/gaia/run_infer.py +++ b/evaluation/benchmarks/gaia/run_infer.py @@ -7,7 +7,7 @@ import huggingface_hub import pandas as pd from datasets import load_dataset -from evaluation.gaia.scorer import question_scorer +from evaluation.benchmarks.gaia.scorer import question_scorer from evaluation.utils.shared import ( EvalMetadata, EvalOutput, diff --git a/evaluation/gaia/scorer.py b/evaluation/benchmarks/gaia/scorer.py similarity index 100% rename from evaluation/gaia/scorer.py rename to evaluation/benchmarks/gaia/scorer.py diff --git a/evaluation/gaia/scripts/run_infer.sh b/evaluation/benchmarks/gaia/scripts/run_infer.sh similarity index 93% rename from evaluation/gaia/scripts/run_infer.sh rename to evaluation/benchmarks/gaia/scripts/run_infer.sh index aedfe01a0c..5ad012d07d 100755 --- a/evaluation/gaia/scripts/run_infer.sh +++ b/evaluation/benchmarks/gaia/scripts/run_infer.sh @@ -35,7 +35,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "LEVELS: $LEVELS" -COMMAND="poetry run python ./evaluation/gaia/run_infer.py \ +COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/gorilla/README.md b/evaluation/benchmarks/gorilla/README.md similarity index 88% rename from evaluation/gorilla/README.md rename to evaluation/benchmarks/gorilla/README.md index 4cd90f554a..c6f1cde55b 100644 --- a/evaluation/gorilla/README.md +++ b/evaluation/benchmarks/gorilla/README.md @@ -11,7 +11,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop Make sure your Docker daemon is running, then run this bash script: ```bash -./evaluation/gorilla/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [hubs] +./evaluation/benchmarks/gorilla/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [hubs] ``` where `model_config` is mandatory, while all other arguments are optional. 
@@ -35,5 +35,5 @@ Note: in order to use `eval_limit`, you must also set `agent`; in order to use ` For example, ```bash -./evaluation/gorilla/scripts/run_infer.sh llm 0.6.2 CodeActAgent 10 th +./evaluation/benchmarks/gorilla/scripts/run_infer.sh llm 0.6.2 CodeActAgent 10 th ``` diff --git a/evaluation/gorilla/ast_eval_hf.py b/evaluation/benchmarks/gorilla/ast_eval_hf.py similarity index 100% rename from evaluation/gorilla/ast_eval_hf.py rename to evaluation/benchmarks/gorilla/ast_eval_hf.py diff --git a/evaluation/gorilla/ast_eval_tf.py b/evaluation/benchmarks/gorilla/ast_eval_tf.py similarity index 100% rename from evaluation/gorilla/ast_eval_tf.py rename to evaluation/benchmarks/gorilla/ast_eval_tf.py diff --git a/evaluation/gorilla/ast_eval_th.py b/evaluation/benchmarks/gorilla/ast_eval_th.py similarity index 100% rename from evaluation/gorilla/ast_eval_th.py rename to evaluation/benchmarks/gorilla/ast_eval_th.py diff --git a/evaluation/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py similarity index 98% rename from evaluation/gorilla/run_infer.py rename to evaluation/benchmarks/gorilla/run_infer.py index aa932a388f..6f5b6c9d43 100644 --- a/evaluation/gorilla/run_infer.py +++ b/evaluation/benchmarks/gorilla/run_infer.py @@ -5,7 +5,7 @@ import os import pandas as pd import requests -from evaluation.gorilla.utils import encode_question, get_data_for_hub +from evaluation.benchmarks.gorilla.utils import encode_question, get_data_for_hub from evaluation.utils.shared import ( EvalMetadata, EvalOutput, diff --git a/evaluation/gorilla/scripts/run_infer.sh b/evaluation/benchmarks/gorilla/scripts/run_infer.sh similarity index 93% rename from evaluation/gorilla/scripts/run_infer.sh rename to evaluation/benchmarks/gorilla/scripts/run_infer.sh index c39bd74bf2..4542444443 100755 --- a/evaluation/gorilla/scripts/run_infer.sh +++ b/evaluation/benchmarks/gorilla/scripts/run_infer.sh @@ -33,7 +33,7 @@ echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "HUBS: $HUBS" -COMMAND="poetry run python evaluation/gorilla/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/gorilla/utils.py b/evaluation/benchmarks/gorilla/utils.py similarity index 100% rename from evaluation/gorilla/utils.py rename to evaluation/benchmarks/gorilla/utils.py diff --git a/evaluation/gpqa/README.md b/evaluation/benchmarks/gpqa/README.md similarity index 94% rename from evaluation/gpqa/README.md rename to evaluation/benchmarks/gpqa/README.md index b96c5913cb..235b9ab9b2 100644 --- a/evaluation/gpqa/README.md +++ b/evaluation/benchmarks/gpqa/README.md @@ -23,7 +23,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop 'gpqa_main', 'gqpa_diamond', 'gpqa_experts', 'gpqa_extended' -- data split options From the root of the OpenHands repo, run the following command: ```bash -./evaluation/gpqa/scripts/run_infer.sh [model_config_name] [git-version] [num_samples_eval] [data_split] [AgentClass] +./evaluation/benchmarks/gpqa/scripts/run_infer.sh [model_config_name] [git-version] [num_samples_eval] [data_split] [AgentClass] ``` You can replace `model_config_name` with any model you set up in `config.toml`. 
diff --git a/evaluation/gpqa/__init__.py b/evaluation/benchmarks/gpqa/__init__.py similarity index 100% rename from evaluation/gpqa/__init__.py rename to evaluation/benchmarks/gpqa/__init__.py diff --git a/evaluation/gpqa/run_infer.py b/evaluation/benchmarks/gpqa/run_infer.py similarity index 100% rename from evaluation/gpqa/run_infer.py rename to evaluation/benchmarks/gpqa/run_infer.py diff --git a/evaluation/gpqa/scripts/run_infer.sh b/evaluation/benchmarks/gpqa/scripts/run_infer.sh similarity index 93% rename from evaluation/gpqa/scripts/run_infer.sh rename to evaluation/benchmarks/gpqa/scripts/run_infer.sh index b45435e631..ec5a61dbbb 100755 --- a/evaluation/gpqa/scripts/run_infer.sh +++ b/evaluation/benchmarks/gpqa/scripts/run_infer.sh @@ -33,7 +33,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/gpqa/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/humanevalfix/README.md b/evaluation/benchmarks/humanevalfix/README.md similarity index 99% rename from evaluation/humanevalfix/README.md rename to evaluation/benchmarks/humanevalfix/README.md index b887f57ac9..5f3ae58ee2 100644 --- a/evaluation/humanevalfix/README.md +++ b/evaluation/benchmarks/humanevalfix/README.md @@ -9,7 +9,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop ## Run Inference on HumanEvalFix ```bash -./evaluation/humanevalfix/scripts/run_infer.sh eval_gpt4_1106_preview +./evaluation/benchmarks/humanevalfix/scripts/run_infer.sh eval_gpt4_1106_preview ``` You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`. 
diff --git a/evaluation/humanevalfix/__init__.py b/evaluation/benchmarks/humanevalfix/__init__.py similarity index 100% rename from evaluation/humanevalfix/__init__.py rename to evaluation/benchmarks/humanevalfix/__init__.py diff --git a/evaluation/humanevalfix/run_infer.py b/evaluation/benchmarks/humanevalfix/run_infer.py similarity index 100% rename from evaluation/humanevalfix/run_infer.py rename to evaluation/benchmarks/humanevalfix/run_infer.py diff --git a/evaluation/humanevalfix/scripts/run_infer.sh b/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh similarity index 97% rename from evaluation/humanevalfix/scripts/run_infer.sh rename to evaluation/benchmarks/humanevalfix/scripts/run_infer.sh index f63e13d16a..b0b30628eb 100755 --- a/evaluation/humanevalfix/scripts/run_infer.sh +++ b/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh @@ -64,7 +64,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/humanevalfix/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/logic_reasoning/.cache_program/facts.kfb b/evaluation/benchmarks/logic_reasoning/.cache_program/facts.kfb similarity index 100% rename from evaluation/logic_reasoning/.cache_program/facts.kfb rename to evaluation/benchmarks/logic_reasoning/.cache_program/facts.kfb diff --git a/evaluation/logic_reasoning/.cache_program/rules.krb b/evaluation/benchmarks/logic_reasoning/.cache_program/rules.krb similarity index 100% rename from evaluation/logic_reasoning/.cache_program/rules.krb rename to evaluation/benchmarks/logic_reasoning/.cache_program/rules.krb diff --git a/evaluation/logic_reasoning/Dockerfile b/evaluation/benchmarks/logic_reasoning/Dockerfile similarity index 100% rename from evaluation/logic_reasoning/Dockerfile rename to evaluation/benchmarks/logic_reasoning/Dockerfile diff --git a/evaluation/logic_reasoning/README.md b/evaluation/benchmarks/logic_reasoning/README.md similarity index 83% rename from evaluation/logic_reasoning/README.md rename to evaluation/benchmarks/logic_reasoning/README.md index 79faae4fe0..d4e4d3e9a5 100644 --- a/evaluation/logic_reasoning/README.md +++ b/evaluation/benchmarks/logic_reasoning/README.md @@ -10,5 +10,5 @@ Please follow instruction [here](../README.md#setup) to setup your local develop The following code will run inference on the first example of the ProofWriter dataset, ```bash -./evaluation/logic_reasoning/scripts/run_infer.sh eval_gpt4_1106_preview_llm ProofWriter +./evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh eval_gpt4_1106_preview_llm ProofWriter ``` diff --git a/evaluation/logic_reasoning/__init__.py b/evaluation/benchmarks/logic_reasoning/__init__.py similarity index 100% rename from evaluation/logic_reasoning/__init__.py rename to evaluation/benchmarks/logic_reasoning/__init__.py diff --git a/evaluation/logic_reasoning/instruction.txt b/evaluation/benchmarks/logic_reasoning/instruction.txt similarity index 100% rename from evaluation/logic_reasoning/instruction.txt rename to evaluation/benchmarks/logic_reasoning/instruction.txt diff --git a/evaluation/logic_reasoning/logic_inference.py b/evaluation/benchmarks/logic_reasoning/logic_inference.py similarity index 100% rename from evaluation/logic_reasoning/logic_inference.py rename to evaluation/benchmarks/logic_reasoning/logic_inference.py diff --git 
a/evaluation/logic_reasoning/run_infer.py b/evaluation/benchmarks/logic_reasoning/run_infer.py similarity index 100% rename from evaluation/logic_reasoning/run_infer.py rename to evaluation/benchmarks/logic_reasoning/run_infer.py diff --git a/evaluation/logic_reasoning/scripts/run_infer.sh b/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh similarity index 92% rename from evaluation/logic_reasoning/scripts/run_infer.sh rename to evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh index 4c064c102c..40c244d18b 100755 --- a/evaluation/logic_reasoning/scripts/run_infer.sh +++ b/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh @@ -34,7 +34,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/logic_reasoning/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --dataset $DATASET \ diff --git a/evaluation/miniwob/Dockerfile b/evaluation/benchmarks/miniwob/Dockerfile similarity index 100% rename from evaluation/miniwob/Dockerfile rename to evaluation/benchmarks/miniwob/Dockerfile diff --git a/evaluation/miniwob/README.md b/evaluation/benchmarks/miniwob/README.md similarity index 79% rename from evaluation/miniwob/README.md rename to evaluation/benchmarks/miniwob/README.md index a462232649..5535e45a7d 100644 --- a/evaluation/miniwob/README.md +++ b/evaluation/benchmarks/miniwob/README.md @@ -13,7 +13,7 @@ Access with browser the above MiniWoB URLs and see if they load correctly. ## Run Evaluation ```sh -./evaluation/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval +./evaluation/benchmarks/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -21,13 +21,13 @@ Access with browser the above MiniWoB URLs and see if they load correctly. This is in limited beta. Contact Xingyao over slack if you want to try this out! 
```bash -./evaluation/miniwob/scripts/run_infer.sh [model_config] [git-version] [agent] [note] [eval_limit] [num_workers] +./evaluation/benchmarks/miniwob/scripts/run_infer.sh [model_config] [git-version] [agent] [note] [eval_limit] [num_workers] # Example - This runs evaluation on BrowsingAgent for 125 instances on miniwob, with 2 workers running in parallel export ALLHANDS_API_KEY="YOUR-API-KEY" export RUNTIME=remote export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" -./evaluation/miniwob/scripts/run_infer.sh llm.eval HEAD BrowsingAgent "" 125 2 +./evaluation/benchmarks/miniwob/scripts/run_infer.sh llm.eval HEAD BrowsingAgent "" 125 2 ``` Results will be in `evaluation/evaluation_outputs/outputs/miniwob/` @@ -35,7 +35,7 @@ Results will be in `evaluation/evaluation_outputs/outputs/miniwob/` To calculate the average reward, run: ```sh -poetry run python evaluation/miniwob/get_success_rate.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl +poetry run python evaluation/benchmarks/miniwob/get_success_rate.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl ``` ## Submit your evaluation results diff --git a/evaluation/miniwob/get_avg_reward.py b/evaluation/benchmarks/miniwob/get_avg_reward.py similarity index 100% rename from evaluation/miniwob/get_avg_reward.py rename to evaluation/benchmarks/miniwob/get_avg_reward.py diff --git a/evaluation/miniwob/run_infer.py b/evaluation/benchmarks/miniwob/run_infer.py similarity index 100% rename from evaluation/miniwob/run_infer.py rename to evaluation/benchmarks/miniwob/run_infer.py diff --git a/evaluation/miniwob/scripts/run_infer.sh b/evaluation/benchmarks/miniwob/scripts/run_infer.sh similarity index 86% rename from evaluation/miniwob/scripts/run_infer.sh rename to evaluation/benchmarks/miniwob/scripts/run_infer.sh index ece7cafbe7..8f997e29c3 100755 --- a/evaluation/miniwob/scripts/run_infer.sh +++ b/evaluation/benchmarks/miniwob/scripts/run_infer.sh @@ -33,7 +33,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG" EVAL_NOTE="${AGENT_VERSION}_${NOTE}" -COMMAND="export PYTHONPATH=evaluation/miniwob:\$PYTHONPATH && poetry run python evaluation/miniwob/run_infer.py \ +COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/mint/.gitignore b/evaluation/benchmarks/mint/.gitignore similarity index 100% rename from evaluation/mint/.gitignore rename to evaluation/benchmarks/mint/.gitignore diff --git a/evaluation/mint/Dockerfile b/evaluation/benchmarks/mint/Dockerfile similarity index 100% rename from evaluation/mint/Dockerfile rename to evaluation/benchmarks/mint/Dockerfile diff --git a/evaluation/mint/README.md b/evaluation/benchmarks/mint/README.md similarity index 85% rename from evaluation/mint/README.md rename to evaluation/benchmarks/mint/README.md index 950996cc49..bfaeb713bc 100644 --- a/evaluation/mint/README.md +++ b/evaluation/benchmarks/mint/README.md @@ -6,7 +6,7 @@ We support evaluation of the [Eurus subset focus on math and code reasoning](htt ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. 
## Start the evaluation @@ -15,7 +15,7 @@ We are using the MINT dataset hosted on [Hugging Face](https://huggingface.co/da Following is the basic command to start the evaluation. Currently, the only agent supported with MINT is `CodeActAgent`. ```bash -./evaluation/mint/scripts/run_infer.sh [model_config] [git-version] [subset] [eval_limit] +./evaluation/benchmarks/mint/scripts/run_infer.sh [model_config] [git-version] [subset] [eval_limit] ``` where `model_config` is mandatory, while others are optional. @@ -34,7 +34,7 @@ Note: in order to use `eval_limit`, you must also set `subset`. For example, ```bash -./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3 +./evaluation/benchmarks/mint/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3 ``` ## Reference diff --git a/evaluation/mint/config_variables.py b/evaluation/benchmarks/mint/config_variables.py similarity index 100% rename from evaluation/mint/config_variables.py rename to evaluation/benchmarks/mint/config_variables.py diff --git a/evaluation/mint/datatypes.py b/evaluation/benchmarks/mint/datatypes.py similarity index 100% rename from evaluation/mint/datatypes.py rename to evaluation/benchmarks/mint/datatypes.py diff --git a/evaluation/mint/env.py b/evaluation/benchmarks/mint/env.py similarity index 100% rename from evaluation/mint/env.py rename to evaluation/benchmarks/mint/env.py diff --git a/evaluation/mint/prompts/__init__.py b/evaluation/benchmarks/mint/prompts/__init__.py similarity index 100% rename from evaluation/mint/prompts/__init__.py rename to evaluation/benchmarks/mint/prompts/__init__.py diff --git a/evaluation/mint/prompts/template_with_tool.txt b/evaluation/benchmarks/mint/prompts/template_with_tool.txt similarity index 100% rename from evaluation/mint/prompts/template_with_tool.txt rename to evaluation/benchmarks/mint/prompts/template_with_tool.txt diff --git a/evaluation/mint/requirements.txt b/evaluation/benchmarks/mint/requirements.txt similarity index 100% rename from evaluation/mint/requirements.txt rename to evaluation/benchmarks/mint/requirements.txt diff --git a/evaluation/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py similarity index 97% rename from evaluation/mint/run_infer.py rename to evaluation/benchmarks/mint/run_infer.py index 7f6985fc2a..4414e1c462 100644 --- a/evaluation/mint/run_infer.py +++ b/evaluation/benchmarks/mint/run_infer.py @@ -6,10 +6,10 @@ from typing import Any import pandas as pd from datasets import load_dataset -from evaluation.mint.datatypes import TaskState -from evaluation.mint.env import SimplifiedEnv -from evaluation.mint.prompts import ToolPromptTemplate -from evaluation.mint.tasks import Task +from evaluation.benchmarks.mint.datatypes import TaskState +from evaluation.benchmarks.mint.env import SimplifiedEnv +from evaluation.benchmarks.mint.prompts import ToolPromptTemplate +from evaluation.benchmarks.mint.tasks import Task from evaluation.utils.shared import ( EvalMetadata, EvalOutput, diff --git a/evaluation/mint/scripts/run_infer.sh b/evaluation/benchmarks/mint/scripts/run_infer.sh similarity index 100% rename from evaluation/mint/scripts/run_infer.sh rename to evaluation/benchmarks/mint/scripts/run_infer.sh diff --git a/evaluation/mint/tasks/__init__.py b/evaluation/benchmarks/mint/tasks/__init__.py similarity index 50% rename from evaluation/mint/tasks/__init__.py rename to evaluation/benchmarks/mint/tasks/__init__.py index 4f6ac721ac..96c628f854 100644 --- a/evaluation/mint/tasks/__init__.py +++ 
b/evaluation/benchmarks/mint/tasks/__init__.py @@ -1,6 +1,6 @@ -from evaluation.mint.tasks.base import Task -from evaluation.mint.tasks.codegen import HumanEvalTask, MBPPTask -from evaluation.mint.tasks.reasoning import ( +from evaluation.benchmarks.mint.tasks.base import Task +from evaluation.benchmarks.mint.tasks.codegen import HumanEvalTask, MBPPTask +from evaluation.benchmarks.mint.tasks.reasoning import ( MultipleChoiceTask, ReasoningTask, TheoremqaTask, diff --git a/evaluation/mint/tasks/base.py b/evaluation/benchmarks/mint/tasks/base.py similarity index 100% rename from evaluation/mint/tasks/base.py rename to evaluation/benchmarks/mint/tasks/base.py diff --git a/evaluation/mint/tasks/codegen.py b/evaluation/benchmarks/mint/tasks/codegen.py similarity index 98% rename from evaluation/mint/tasks/codegen.py rename to evaluation/benchmarks/mint/tasks/codegen.py index 8a80594ce4..cbd127ac0e 100644 --- a/evaluation/mint/tasks/codegen.py +++ b/evaluation/benchmarks/mint/tasks/codegen.py @@ -2,7 +2,7 @@ import logging from utils import check_correctness -from evaluation.mint.tasks.base import Task +from evaluation.benchmarks.mint.tasks.base import Task LOGGER = logging.getLogger('MINT') diff --git a/evaluation/mint/tasks/in_context_examples/humaneval/with_tool.txt b/evaluation/benchmarks/mint/tasks/in_context_examples/humaneval/with_tool.txt similarity index 100% rename from evaluation/mint/tasks/in_context_examples/humaneval/with_tool.txt rename to evaluation/benchmarks/mint/tasks/in_context_examples/humaneval/with_tool.txt diff --git a/evaluation/mint/tasks/in_context_examples/mbpp/with_tool.txt b/evaluation/benchmarks/mint/tasks/in_context_examples/mbpp/with_tool.txt similarity index 100% rename from evaluation/mint/tasks/in_context_examples/mbpp/with_tool.txt rename to evaluation/benchmarks/mint/tasks/in_context_examples/mbpp/with_tool.txt diff --git a/evaluation/mint/tasks/in_context_examples/reasoning/with_tool.txt b/evaluation/benchmarks/mint/tasks/in_context_examples/reasoning/with_tool.txt similarity index 100% rename from evaluation/mint/tasks/in_context_examples/reasoning/with_tool.txt rename to evaluation/benchmarks/mint/tasks/in_context_examples/reasoning/with_tool.txt diff --git a/evaluation/mint/tasks/reasoning.py b/evaluation/benchmarks/mint/tasks/reasoning.py similarity index 100% rename from evaluation/mint/tasks/reasoning.py rename to evaluation/benchmarks/mint/tasks/reasoning.py diff --git a/evaluation/mint/utils.py b/evaluation/benchmarks/mint/utils.py similarity index 100% rename from evaluation/mint/utils.py rename to evaluation/benchmarks/mint/utils.py diff --git a/evaluation/ml_bench/README.md b/evaluation/benchmarks/ml_bench/README.md similarity index 88% rename from evaluation/ml_bench/README.md rename to evaluation/benchmarks/ml_bench/README.md index 0ad9cca8f7..528edddc14 100644 --- a/evaluation/ml_bench/README.md +++ b/evaluation/benchmarks/ml_bench/README.md @@ -19,8 +19,8 @@ Please follow instruction [here](../README.md#setup) to setup your local develop To run the evaluation on the ML-Bench dataset, use the following command: ```bash -./evaluation/ml_bench/scripts/run_infer.sh [model_config] [git-version] [split] [agent] [eval_limit] -# e.g., ./evaluation/ml_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 full CodeActAgent 10 +./evaluation/benchmarks/ml_bench/scripts/run_infer.sh [model_config] [git-version] [split] [agent] [eval_limit] +# e.g., ./evaluation/benchmarks/ml_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 full CodeActAgent 10 
``` You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`. @@ -30,8 +30,8 @@ You can replace `eval_gpt4_1106_preview` with any model you set up in `config.to To score the evaluation output, use the following command: ```bash -./evaluation/ml_bench/scripts/summarise_results.py [eval_output_dir] -# e.g., ./evaluation/ml_bench/scripts/summarise_results.py evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5 +./evaluation/benchmarks/ml_bench/scripts/summarise_results.py [eval_output_dir] +# e.g., ./evaluation/benchmarks/ml_bench/scripts/summarise_results.py evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5 ``` ## Run Error Analysis on ML-Bench @@ -39,8 +39,8 @@ To score the evaluation output, use the following command: To run error analysis on the ML-Bench dataset, use the following command: ```bash -./evaluation/ml_bench/scripts/run_analysis.sh [eval_output_dir] [model_config] -# e.g., ./evaluation/ml_bench/scripts/run_analysis.sh evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5/output.jsonl eval_gpt4_1106_preview +./evaluation/benchmarks/ml_bench/scripts/run_analysis.sh [eval_output_dir] [model_config] +# e.g., ./evaluation/benchmarks/ml_bench/scripts/run_analysis.sh evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5/output.jsonl eval_gpt4_1106_preview ``` This command generates a report on the evaluation output and provides insights into the agent's performance. @@ -105,7 +105,7 @@ The `metrics` field contains the parsed evaluation metrics from the `eval_output ## Customization -You can customize the evaluation script by modifying the `evaluation/ml_bench/run_infer.py` file. This script handles loading the ML-Bench dataset, running the agent on each task instance, and saving the evaluation outputs. +You can customize the evaluation script by modifying the `evaluation/benchmarks/ml_bench/run_infer.py` file. This script handles loading the ML-Bench dataset, running the agent on each task instance, and saving the evaluation outputs. Feel free to adjust the configuration, logging, and output formatting to suit your needs. 
diff --git a/evaluation/ml_bench/__init__.py b/evaluation/benchmarks/ml_bench/__init__.py similarity index 100% rename from evaluation/ml_bench/__init__.py rename to evaluation/benchmarks/ml_bench/__init__.py diff --git a/evaluation/ml_bench/run_analysis.py b/evaluation/benchmarks/ml_bench/run_analysis.py similarity index 100% rename from evaluation/ml_bench/run_analysis.py rename to evaluation/benchmarks/ml_bench/run_analysis.py diff --git a/evaluation/ml_bench/run_infer.py b/evaluation/benchmarks/ml_bench/run_infer.py similarity index 100% rename from evaluation/ml_bench/run_infer.py rename to evaluation/benchmarks/ml_bench/run_infer.py diff --git a/evaluation/ml_bench/scripts/cleanup.sh b/evaluation/benchmarks/ml_bench/scripts/cleanup.sh similarity index 100% rename from evaluation/ml_bench/scripts/cleanup.sh rename to evaluation/benchmarks/ml_bench/scripts/cleanup.sh diff --git a/evaluation/ml_bench/scripts/run_analysis.sh b/evaluation/benchmarks/ml_bench/scripts/run_analysis.sh similarity index 84% rename from evaluation/ml_bench/scripts/run_analysis.sh rename to evaluation/benchmarks/ml_bench/scripts/run_analysis.sh index 8571fe70f3..d5fe6365ca 100644 --- a/evaluation/ml_bench/scripts/run_analysis.sh +++ b/evaluation/benchmarks/ml_bench/scripts/run_analysis.sh @@ -17,7 +17,7 @@ fi echo "MODEL_CONFIG: $MODEL_CONFIG" echo "RESULT_FILE: $RESULT_FILE" -COMMAND="poetry run python evaluation/ml_bench/run_analysis.py \ +COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_analysis.py \ --llm-config $MODEL_CONFIG \ --json_file_path $RESULT_FILE" diff --git a/evaluation/ml_bench/scripts/run_infer.sh b/evaluation/benchmarks/ml_bench/scripts/run_infer.sh similarity index 93% rename from evaluation/ml_bench/scripts/run_infer.sh rename to evaluation/benchmarks/ml_bench/scripts/run_infer.sh index 4ecbae514a..97ff0003fc 100755 --- a/evaluation/ml_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/ml_bench/scripts/run_infer.sh @@ -32,7 +32,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/ml_bench/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ diff --git a/evaluation/ml_bench/scripts/summarise_results.py b/evaluation/benchmarks/ml_bench/scripts/summarise_results.py similarity index 100% rename from evaluation/ml_bench/scripts/summarise_results.py rename to evaluation/benchmarks/ml_bench/scripts/summarise_results.py diff --git a/evaluation/scienceagentbench/Dockerfile b/evaluation/benchmarks/scienceagentbench/Dockerfile similarity index 100% rename from evaluation/scienceagentbench/Dockerfile rename to evaluation/benchmarks/scienceagentbench/Dockerfile diff --git a/evaluation/scienceagentbench/Dockerfile.evaluator b/evaluation/benchmarks/scienceagentbench/Dockerfile.evaluator similarity index 100% rename from evaluation/scienceagentbench/Dockerfile.evaluator rename to evaluation/benchmarks/scienceagentbench/Dockerfile.evaluator diff --git a/evaluation/scienceagentbench/README.md b/evaluation/benchmarks/scienceagentbench/README.md similarity index 90% rename from evaluation/scienceagentbench/README.md rename to evaluation/benchmarks/scienceagentbench/README.md index 3182c2e117..4d97917721 100644 --- a/evaluation/scienceagentbench/README.md +++ b/evaluation/benchmarks/scienceagentbench/README.md @@ -13,10 +13,10 @@ To prevent benchmark data contamination, we only provide the annotation sheet 
on ## Run Inference on ScienceAgentBench ```bash -./evaluation/scienceagentbench/scripts/run_infer.sh [model_config] [git-version] [use_knowledge] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh [model_config] [git-version] [use_knowledge] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example -./evaluation/scienceagentbench/scripts/run_infer.sh llm.eval_gpt4o 0.9.3 +./evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh llm.eval_gpt4o 0.9.3 ``` where `model_config` is mandatory, and the rest are optional. diff --git a/evaluation/scienceagentbench/post_proc.py b/evaluation/benchmarks/scienceagentbench/post_proc.py similarity index 100% rename from evaluation/scienceagentbench/post_proc.py rename to evaluation/benchmarks/scienceagentbench/post_proc.py diff --git a/evaluation/scienceagentbench/run_infer.py b/evaluation/benchmarks/scienceagentbench/run_infer.py similarity index 100% rename from evaluation/scienceagentbench/run_infer.py rename to evaluation/benchmarks/scienceagentbench/run_infer.py diff --git a/evaluation/scienceagentbench/scripts/run_infer.sh b/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh similarity index 92% rename from evaluation/scienceagentbench/scripts/run_infer.sh rename to evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh index 7667e57237..970f10ed2f 100755 --- a/evaluation/scienceagentbench/scripts/run_infer.sh +++ b/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh @@ -32,7 +32,7 @@ echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -COMMAND="poetry run python evaluation/scienceagentbench/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --use_knowledge $USE_KNOWLEDGE \ diff --git a/evaluation/swe_bench/README.md b/evaluation/benchmarks/swe_bench/README.md similarity index 80% rename from evaluation/swe_bench/README.md rename to evaluation/benchmarks/swe_bench/README.md index 147d2a35ea..b69a738955 100644 --- a/evaluation/swe_bench/README.md +++ b/evaluation/benchmarks/swe_bench/README.md @@ -27,10 +27,10 @@ Make sure your Docker daemon is running, and you have ample disk space (at least When the `run_infer.sh` script is started, it will automatically pull the relevant SWE-Bench images. For example, for instance ID `django_django-11011`, it will try to pull our pre-build docker image `sweb.eval.x86_64.django_s_django-11011` from DockerHub. This image will be used create an OpenHands runtime image where the agent will operate on. ```bash -./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example -./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test +./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test ``` where `model_config` is mandatory, and the rest are optional. 
@@ -62,7 +62,7 @@ Let's say you'd like to run 10 instances using `llm.eval_gpt4_1106_preview` and then your command would be: ```bash -./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10 +./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10 ``` ### Run Inference on `RemoteRuntime` (experimental) @@ -70,23 +70,23 @@ then your command would be: This is in limited beta. Contact Xingyao over slack if you want to try this out! ```bash -./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] +./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] # Example - This runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with max 30 iteration per instances, with 16 number of workers running in parallel ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \ -./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test +./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test ``` To clean-up all existing runtime you've already started, run: ```bash -ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh +ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh ``` ### Specify a subset of tasks to run infer If you would like to specify a list of tasks you'd like to benchmark on, you could -create a `config.toml` under `./evaluation/swe_bench/` folder, and put a list +create a `config.toml` under `./evaluation/benchmarks/swe_bench/` folder, and put a list attribute named `selected_ids`, e.g. 
```toml @@ -105,19 +105,19 @@ After running the inference, you will obtain a `output.jsonl` (by default it wil **(Recommended for reproducibility)** If you have extra local space (e.g., 200GB), you can try pull the [instance-level docker images](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level) we've prepared by running: ```bash -evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh instance +evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh instance ``` If you want to save disk space a bit (e.g., with ~50GB free disk space), while speeding up the image pre-build process, you can pull the environment-level docker images: ```bash -evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh env +evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh env ``` If you want to evaluate on the full SWE-Bench test set: ```bash -evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh instance full +evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh instance full ``` ### Run evaluation @@ -136,10 +136,10 @@ NOTE, you should have already pulled the instance-level OR env-level docker imag Then you can run the following: ```bash -./evaluation/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split] +./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split] # Example -./evaluation/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl +./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl ``` The script now accepts optional arguments: @@ -150,10 +150,10 @@ The script now accepts optional arguments: For example, to evaluate a specific instance with a custom dataset and split: ```bash -./evaluation/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL instance_123 princeton-nlp/SWE-bench test +./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL instance_123 princeton-nlp/SWE-bench test ``` -> You can also pass in a JSONL with [SWE-Bench format](https://github.com/princeton-nlp/SWE-bench/blob/main/tutorials/evaluation.md#-creating-predictions) to `./evaluation/swe_bench/scripts/eval_infer.sh`, where each line is a JSON of `{"model_patch": "XXX", "model_name_or_path": "YYY", "instance_id": "ZZZ"}`. +> You can also pass in a JSONL with [SWE-Bench format](https://github.com/princeton-nlp/SWE-bench/blob/main/tutorials/evaluation.md#-creating-predictions) to `./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh`, where each line is a JSON of `{"model_patch": "XXX", "model_name_or_path": "YYY", "instance_id": "ZZZ"}`. The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/` with the following files/directory: @@ -166,17 +166,17 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be This is in limited beta. Contact Xingyao over slack if you want to try this out! 
```bash -./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers] +./evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers] # Example - This evaluates patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 number of workers running in parallel ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \ -evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test" +evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test" ``` To clean-up all existing runtimes that you've already started, run: ```bash -ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh +ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh ``` diff --git a/evaluation/swe_bench/__init__.py b/evaluation/benchmarks/swe_bench/__init__.py similarity index 100% rename from evaluation/swe_bench/__init__.py rename to evaluation/benchmarks/swe_bench/__init__.py diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py similarity index 99% rename from evaluation/swe_bench/eval_infer.py rename to evaluation/benchmarks/swe_bench/eval_infer.py index d40f984fca..95f65245f2 100644 --- a/evaluation/swe_bench/eval_infer.py +++ b/evaluation/benchmarks/swe_bench/eval_infer.py @@ -12,7 +12,7 @@ from swebench.harness.run_evaluation import ( from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec from swebench.harness.utils import load_swebench_dataset -from evaluation.swe_bench.run_infer import get_instance_docker_image +from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image from evaluation.utils.shared import ( EvalMetadata, EvalOutput, diff --git a/evaluation/swe_bench/examples/example_agent_output.jsonl b/evaluation/benchmarks/swe_bench/examples/example_agent_output.jsonl similarity index 100% rename from evaluation/swe_bench/examples/example_agent_output.jsonl rename to evaluation/benchmarks/swe_bench/examples/example_agent_output.jsonl diff --git a/evaluation/swe_bench/examples/example_model_output.json b/evaluation/benchmarks/swe_bench/examples/example_model_output.json similarity index 100% rename from evaluation/swe_bench/examples/example_model_output.json rename to evaluation/benchmarks/swe_bench/examples/example_model_output.json diff --git a/evaluation/swe_bench/prompt.py b/evaluation/benchmarks/swe_bench/prompt.py similarity index 100% rename from evaluation/swe_bench/prompt.py rename to evaluation/benchmarks/swe_bench/prompt.py diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py similarity index 99% rename from evaluation/swe_bench/run_infer.py rename to evaluation/benchmarks/swe_bench/run_infer.py index 9cb9dd77f4..3ffc08d29b 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/benchmarks/swe_bench/run_infer.py @@ -9,7 +9,7 @@ import toml from datasets import load_dataset import 
openhands.agenthub -from evaluation.swe_bench.prompt import CODEACT_SWE_PROMPT +from evaluation.benchmarks.swe_bench.prompt import CODEACT_SWE_PROMPT from evaluation.utils.shared import ( EvalException, EvalMetadata, diff --git a/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh b/evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh similarity index 100% rename from evaluation/swe_bench/scripts/cleanup_remote_runtime.sh rename to evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh diff --git a/evaluation/swe_bench/scripts/docker/all-swebench-full-instance-images.txt b/evaluation/benchmarks/swe_bench/scripts/docker/all-swebench-full-instance-images.txt similarity index 100% rename from evaluation/swe_bench/scripts/docker/all-swebench-full-instance-images.txt rename to evaluation/benchmarks/swe_bench/scripts/docker/all-swebench-full-instance-images.txt diff --git a/evaluation/swe_bench/scripts/docker/all-swebench-lite-instance-images.txt b/evaluation/benchmarks/swe_bench/scripts/docker/all-swebench-lite-instance-images.txt similarity index 100% rename from evaluation/swe_bench/scripts/docker/all-swebench-lite-instance-images.txt rename to evaluation/benchmarks/swe_bench/scripts/docker/all-swebench-lite-instance-images.txt diff --git a/evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh b/evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh similarity index 100% rename from evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh rename to evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh diff --git a/evaluation/swe_bench/scripts/docker/push_docker_instance_images.py b/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py similarity index 96% rename from evaluation/swe_bench/scripts/docker/push_docker_instance_images.py rename to evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py index 20fb1b94c0..52e2ea4cb1 100644 --- a/evaluation/swe_bench/scripts/docker/push_docker_instance_images.py +++ b/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py @@ -32,7 +32,7 @@ from tqdm import tqdm from openhands.core.logger import openhands_logger as logger logger.setLevel('ERROR') -from evaluation.swe_bench.run_infer import get_instance_docker_image # noqa +from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image # noqa parser = argparse.ArgumentParser() parser.add_argument('--dataset', type=str, default='princeton-nlp/SWE-bench_Lite') diff --git a/evaluation/swe_bench/scripts/docker/push_eval_docker.sh b/evaluation/benchmarks/swe_bench/scripts/docker/push_eval_docker.sh similarity index 100% rename from evaluation/swe_bench/scripts/docker/push_eval_docker.sh rename to evaluation/benchmarks/swe_bench/scripts/docker/push_eval_docker.sh diff --git a/evaluation/swe_bench/scripts/eval/compare_outputs.py b/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py similarity index 100% rename from evaluation/swe_bench/scripts/eval/compare_outputs.py rename to evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py diff --git a/evaluation/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh similarity index 87% rename from evaluation/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh rename to evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh index 
8bbaa6ddce..044f9972f4 100755 --- a/evaluation/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh +++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh @@ -5,7 +5,7 @@ NEW_FOLDER_PATH=${FOLDER_PATH}.swebench_submission mkdir -p $NEW_FOLDER_PATH # Build all_preds.jsonl -poetry run python evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl +poetry run python evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl mv $FOLDER_PATH/output.swebench.jsonl $NEW_FOLDER_PATH/all_preds.jsonl # Build trajs/ diff --git a/evaluation/swe_bench/scripts/eval/convert_oh_output_to_md.py b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py similarity index 97% rename from evaluation/swe_bench/scripts/eval/convert_oh_output_to_md.py rename to evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py index 17a375ee3b..8e9fc407d9 100755 --- a/evaluation/swe_bench/scripts/eval/convert_oh_output_to_md.py +++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py @@ -8,7 +8,7 @@ import os import pandas as pd from tqdm import tqdm -from evaluation.swe_bench.eval_infer import process_git_patch +from evaluation.benchmarks.swe_bench.eval_infer import process_git_patch from openhands.events.serialization import event_from_dict tqdm.pandas() diff --git a/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py similarity index 93% rename from evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py rename to evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py index 5006d3dde3..f333012f48 100644 --- a/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py +++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py @@ -3,7 +3,7 @@ import os import pandas as pd -from evaluation.swe_bench.eval_infer import process_git_patch +from evaluation.benchmarks.swe_bench.eval_infer import process_git_patch parser = argparse.ArgumentParser() parser.add_argument('oh_output_file', type=str) diff --git a/evaluation/swe_bench/scripts/eval/download_gold_patch.py b/evaluation/benchmarks/swe_bench/scripts/eval/download_gold_patch.py similarity index 100% rename from evaluation/swe_bench/scripts/eval/download_gold_patch.py rename to evaluation/benchmarks/swe_bench/scripts/eval/download_gold_patch.py diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py similarity index 100% rename from evaluation/swe_bench/scripts/eval/summarize_outputs.py rename to evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py diff --git a/evaluation/swe_bench/scripts/eval/update_output_with_eval.py b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py similarity index 100% rename from evaluation/swe_bench/scripts/eval/update_output_with_eval.py rename to evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py diff --git a/evaluation/swe_bench/scripts/eval_infer.sh b/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh similarity index 95% rename from evaluation/swe_bench/scripts/eval_infer.sh rename to evaluation/benchmarks/swe_bench/scripts/eval_infer.sh index 8e263e10ca..13ef271671 100755 --- a/evaluation/swe_bench/scripts/eval_infer.sh +++ 
b/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh @@ -58,7 +58,7 @@ else # ==== Convert OH format to SWE-bench format ==== echo "Merged output file with fine-grained report will be saved to $FILE_DIR" - poetry run python3 evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH + poetry run python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH # replace .jsonl with .swebench.jsonl in filename SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl} echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL" @@ -125,7 +125,7 @@ if [ -z "$INSTANCE_ID" ]; then mv $REPORT_PATH $RESULT_OUTPUT_DIR/report.json fi - poetry run python evaluation/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH + poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH else echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID" diff --git a/evaluation/swe_bench/scripts/eval_infer_remote.sh b/evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh similarity index 83% rename from evaluation/swe_bench/scripts/eval_infer_remote.sh rename to evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh index dead194ef2..6828097836 100755 --- a/evaluation/swe_bench/scripts/eval_infer_remote.sh +++ b/evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh @@ -28,7 +28,7 @@ fi echo "... Evaluating on $INPUT_FILE ..." -COMMAND="poetry run python evaluation/swe_bench/eval_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/swe_bench/eval_infer.py \ --eval-num-workers $NUM_WORKERS \ --input-file $INPUT_FILE \ --dataset $DATASET \ @@ -43,4 +43,4 @@ fi eval $COMMAND # update the output with evaluation results -poetry run python evaluation/swe_bench/scripts/eval/update_output_with_eval.py $INPUT_FILE +poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $INPUT_FILE diff --git a/evaluation/swe_bench/scripts/run_infer.sh b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh similarity index 97% rename from evaluation/swe_bench/scripts/run_infer.sh rename to evaluation/benchmarks/swe_bench/scripts/run_infer.sh index 520003635a..a27bd7cdbb 100755 --- a/evaluation/swe_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh @@ -84,7 +84,7 @@ fi function run_eval() { local eval_note=$1 - COMMAND="poetry run python evaluation/swe_bench/run_infer.py \ + COMMAND="poetry run python evaluation/benchmarks/swe_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations $MAX_ITER \ diff --git a/evaluation/swe_bench/scripts/setup/compare_patch_filename.py b/evaluation/benchmarks/swe_bench/scripts/setup/compare_patch_filename.py similarity index 100% rename from evaluation/swe_bench/scripts/setup/compare_patch_filename.py rename to evaluation/benchmarks/swe_bench/scripts/setup/compare_patch_filename.py diff --git a/evaluation/swe_bench/scripts/setup/instance_swe_entry.sh b/evaluation/benchmarks/swe_bench/scripts/setup/instance_swe_entry.sh similarity index 100% rename from evaluation/swe_bench/scripts/setup/instance_swe_entry.sh rename to evaluation/benchmarks/swe_bench/scripts/setup/instance_swe_entry.sh diff --git a/evaluation/swe_bench/scripts/setup/prepare_swe_utils.sh b/evaluation/benchmarks/swe_bench/scripts/setup/prepare_swe_utils.sh similarity index 93% rename from evaluation/swe_bench/scripts/setup/prepare_swe_utils.sh rename to 
evaluation/benchmarks/swe_bench/scripts/setup/prepare_swe_utils.sh index bc1f4c03b7..7091b6f586 100755 --- a/evaluation/swe_bench/scripts/setup/prepare_swe_utils.sh +++ b/evaluation/benchmarks/swe_bench/scripts/setup/prepare_swe_utils.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -EVAL_WORKSPACE="evaluation/swe_bench/eval_workspace" +EVAL_WORKSPACE="evaluation/benchmarks/swe_bench/eval_workspace" mkdir -p $EVAL_WORKSPACE # 1. Prepare REPO diff --git a/evaluation/swe_bench/scripts/setup/swe_entry.sh b/evaluation/benchmarks/swe_bench/scripts/setup/swe_entry.sh similarity index 100% rename from evaluation/swe_bench/scripts/setup/swe_entry.sh rename to evaluation/benchmarks/swe_bench/scripts/setup/swe_entry.sh diff --git a/evaluation/toolqa/Dockerfile b/evaluation/benchmarks/toolqa/Dockerfile similarity index 100% rename from evaluation/toolqa/Dockerfile rename to evaluation/benchmarks/toolqa/Dockerfile diff --git a/evaluation/toolqa/README.md b/evaluation/benchmarks/toolqa/README.md similarity index 88% rename from evaluation/toolqa/README.md rename to evaluation/benchmarks/toolqa/README.md index 07f74645e2..eda478f448 100644 --- a/evaluation/toolqa/README.md +++ b/evaluation/benchmarks/toolqa/README.md @@ -11,7 +11,7 @@ Please follow instruction [here](../README.md#setup) to setup your local develop Make sure your Docker daemon is running, then run this bash script: ```bash -bash evaluation/toolqa/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [dataset] [hardness] [wolfram_alpha_appid] +bash evaluation/benchmarks/toolqa/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [dataset] [hardness] [wolfram_alpha_appid] ``` where `model_config` is mandatory, while all other arguments are optional. @@ -40,5 +40,5 @@ Let's say you'd like to run 10 instances using `llm` and CodeActAgent on `coffee then your command would be: ```bash -bash evaluation/toolqa/scripts/run_infer.sh llm CodeActAgent 10 coffee easy +bash evaluation/benchmarks/toolqa/scripts/run_infer.sh llm CodeActAgent 10 coffee easy ``` diff --git a/evaluation/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py similarity index 98% rename from evaluation/toolqa/run_infer.py rename to evaluation/benchmarks/toolqa/run_infer.py index a7c5242d2f..c99f15a89a 100644 --- a/evaluation/toolqa/run_infer.py +++ b/evaluation/benchmarks/toolqa/run_infer.py @@ -4,7 +4,7 @@ from typing import Any import pandas as pd -from evaluation.toolqa.utils import encode_question, eval_answer, get_data +from evaluation.benchmarks.toolqa.utils import encode_question, eval_answer, get_data from evaluation.utils.shared import ( EvalMetadata, EvalOutput, diff --git a/evaluation/toolqa/scripts/run_infer.sh b/evaluation/benchmarks/toolqa/scripts/run_infer.sh similarity index 95% rename from evaluation/toolqa/scripts/run_infer.sh rename to evaluation/benchmarks/toolqa/scripts/run_infer.sh index 2af978e76a..bfe3471f4f 100755 --- a/evaluation/toolqa/scripts/run_infer.sh +++ b/evaluation/benchmarks/toolqa/scripts/run_infer.sh @@ -47,7 +47,7 @@ echo "DATASET: $DATASET" echo "HARDNESS: $HARDNESS" echo "WOLFRAM_APPID: $WOLFRAM_APPID" -COMMAND="poetry run python evaluation/toolqa/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/toolqa/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 30 \ diff --git a/evaluation/toolqa/utils.py b/evaluation/benchmarks/toolqa/utils.py similarity index 100% rename from evaluation/toolqa/utils.py rename to evaluation/benchmarks/toolqa/utils.py diff --git 
a/evaluation/webarena/README.md b/evaluation/benchmarks/webarena/README.md similarity index 91% rename from evaluation/webarena/README.md rename to evaluation/benchmarks/webarena/README.md index e81f92c592..3e403d5a7f 100644 --- a/evaluation/webarena/README.md +++ b/evaluation/benchmarks/webarena/README.md @@ -24,7 +24,7 @@ Follow the WebArena environment setup guide carefully, and make sure the URL fie ```bash export WEBARENA_BASE_URL= export OPENAI_API_KEY="yourkey" # this key is required for some WebArena validators that utilize LLMs -bash evaluation/webarena/scripts/run_infer.sh +bash evaluation/benchmarks/webarena/scripts/run_infer.sh ``` Results will be in `evaluation/evaluation_outputs/outputs/webarena/` @@ -32,7 +32,7 @@ Results will be in `evaluation/evaluation_outputs/outputs/webarena/` To calculate the success rate, run: ```sh -poetry run python evaluation/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl +poetry run python evaluation/benchmarks/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl ``` ## Submit your evaluation results diff --git a/evaluation/webarena/__init__.py b/evaluation/benchmarks/webarena/__init__.py similarity index 100% rename from evaluation/webarena/__init__.py rename to evaluation/benchmarks/webarena/__init__.py diff --git a/evaluation/webarena/get_success_rate.py b/evaluation/benchmarks/webarena/get_success_rate.py similarity index 100% rename from evaluation/webarena/get_success_rate.py rename to evaluation/benchmarks/webarena/get_success_rate.py diff --git a/evaluation/webarena/run_infer.py b/evaluation/benchmarks/webarena/run_infer.py similarity index 100% rename from evaluation/webarena/run_infer.py rename to evaluation/benchmarks/webarena/run_infer.py diff --git a/evaluation/webarena/scripts/run_infer.sh b/evaluation/benchmarks/webarena/scripts/run_infer.sh similarity index 87% rename from evaluation/webarena/scripts/run_infer.sh rename to evaluation/benchmarks/webarena/scripts/run_infer.sh index c5b2c1ecd0..22372b82d7 100755 --- a/evaluation/webarena/scripts/run_infer.sh +++ b/evaluation/benchmarks/webarena/scripts/run_infer.sh @@ -4,7 +4,7 @@ set -eo pipefail source "evaluation/utils/version_control.sh" # configure webarena websites and environment -source evaluation/webarena/scripts/webarena_env.sh +source evaluation/benchmarks/webarena/scripts/webarena_env.sh # configure browsing agent export USE_NAV="false" @@ -35,7 +35,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG" EVAL_NOTE="$AGENT_VERSION" -COMMAND="poetry run python evaluation/webarena/run_infer.py \ +COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 15 \