diff --git a/evaluation/benchmarks/EDA/scripts/run_infer.sh b/evaluation/benchmarks/EDA/scripts/run_infer.sh index a803073f73..9897ad3c61 100755 --- a/evaluation/benchmarks/EDA/scripts/run_infer.sh +++ b/evaluation/benchmarks/EDA/scripts/run_infer.sh @@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then AGENT="CodeActAgent" fi -get_agent_version +get_openhands_version if [ -z "$DATASET" ]; then echo "Dataset not specified, use default 'things'" @@ -34,12 +34,9 @@ if [ -z "$OPENAI_API_KEY" ]; then exit 1 fi -# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands -# We need to track the version of Agent in the evaluation to make sure results are comparable -AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)") echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "DATASET: $DATASET" @@ -51,7 +48,7 @@ COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \ --max-iterations 20 \ --OPENAI_API_KEY $OPENAI_API_KEY \ --eval-num-workers $NUM_WORKERS \ - --eval-note ${AGENT_VERSION}_${DATASET}" + --eval-note ${OPENHANDS_VERSION}_${DATASET}" if [ -n "$EVAL_LIMIT" ]; then echo "EVAL_LIMIT: $EVAL_LIMIT" diff --git a/evaluation/benchmarks/agent_bench/scripts/run_infer.sh b/evaluation/benchmarks/agent_bench/scripts/run_infer.sh index 16e98b074b..6a22cdcc45 100755 --- a/evaluation/benchmarks/agent_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/agent_bench/scripts/run_infer.sh @@ -20,10 +20,10 @@ if [ -z "$AGENT" ]; then AGENT="CodeActAgent" fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python 
evaluation/benchmarks/agent_bench/run_infer.py \ @@ -31,7 +31,7 @@ COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poe --llm-config $MODEL_CONFIG \ --max-iterations 30 \ --eval-num-workers $NUM_WORKERS \ - --eval-note $AGENT_VERSION" + --eval-note $OPENHANDS_VERSION" if [ -n "$EVAL_LIMIT" ]; then echo "EVAL_LIMIT: $EVAL_LIMIT" diff --git a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh index 0b3824ceae..34249e94c5 100755 --- a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh @@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then AGENT="CodeActAgent" fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -EVAL_NOTE=$AGENT_VERSION +EVAL_NOTE=$OPENHANDS_VERSION # Default to NOT use unit tests. if [ -z "$USE_UNIT_TESTS" ]; then diff --git a/evaluation/benchmarks/biocoder/scripts/run_infer.sh b/evaluation/benchmarks/biocoder/scripts/run_infer.sh index 61fddb6211..76c4f007dc 100755 --- a/evaluation/benchmarks/biocoder/scripts/run_infer.sh +++ b/evaluation/benchmarks/biocoder/scripts/run_infer.sh @@ -21,10 +21,10 @@ if [ -z "$AGENT" ]; then AGENT="CodeActAgent" fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "DATASET: $DATASET" @@ -33,7 +33,7 @@ COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ --eval-num-workers $NUM_WORKERS \ - --eval-note ${AGENT_VERSION}_${DATASET}" + --eval-note ${OPENHANDS_VERSION}_${DATASET}" if [ -n "$EVAL_LIMIT" ]; then echo "EVAL_LIMIT: $EVAL_LIMIT" diff --git a/evaluation/benchmarks/bird/scripts/run_infer.sh b/evaluation/benchmarks/bird/scripts/run_infer.sh 
index bf69d9d50b..835f511652 100755 --- a/evaluation/benchmarks/bird/scripts/run_infer.sh +++ b/evaluation/benchmarks/bird/scripts/run_infer.sh @@ -20,10 +20,10 @@ if [ -z "$AGENT" ]; then AGENT="CodeActAgent" fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \ @@ -31,7 +31,7 @@ COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \ --llm-config $MODEL_CONFIG \ --max-iterations 5 \ --eval-num-workers $NUM_WORKERS \ - --eval-note $AGENT_VERSION" \ + --eval-note $OPENHANDS_VERSION" \ if [ -n "$EVAL_LIMIT" ]; then echo "EVAL_LIMIT: $EVAL_LIMIT" diff --git a/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh b/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh index 30607ca333..78d19fe1b7 100755 --- a/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh +++ b/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh @@ -20,13 +20,13 @@ if [ -z "$AGENT" ]; then AGENT="CodeActAgent" fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -EVAL_NOTE="$AGENT_VERSION" +EVAL_NOTE="$OPENHANDS_VERSION" COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \ --agent-cls $AGENT \ diff --git a/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh b/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh index 227a5ff05e..93df2208b0 100755 --- a/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh @@ -61,10 +61,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE" export RUN_WITH_BROWSING=$RUN_WITH_BROWSING echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING" -get_agent_version +get_openhands_version echo "AGENT: 
$AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "DATASET: $DATASET" echo "HF SPLIT: $SPLIT" @@ -75,7 +75,7 @@ if [ -z "$USE_HINT_TEXT" ]; then export USE_HINT_TEXT=false fi echo "USE_HINT_TEXT: $USE_HINT_TEXT" -EVAL_NOTE="$AGENT_VERSION" +EVAL_NOTE="$OPENHANDS_VERSION" # if not using Hint, add -no-hint to the eval note if [ "$USE_HINT_TEXT" = false ]; then EVAL_NOTE="$EVAL_NOTE-no-hint" diff --git a/evaluation/benchmarks/discoverybench/scripts/run_infer.sh b/evaluation/benchmarks/discoverybench/scripts/run_infer.sh index e12b9c1398..0c693a7579 100755 --- a/evaluation/benchmarks/discoverybench/scripts/run_infer.sh +++ b/evaluation/benchmarks/discoverybench/scripts/run_infer.sh @@ -23,10 +23,10 @@ if [ -z "$AGENT" ]; then AGENT="CodeActAgent" fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \ @@ -35,7 +35,7 @@ COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \ --max-iterations 10 \ --max-chars 10000000 \ --eval-num-workers $NUM_WORKERS \ - --eval-note $AGENT_VERSION" + --eval-note $OPENHANDS_VERSION" if [ -n "$EVAL_LIMIT" ]; then echo "EVAL_LIMIT: $EVAL_LIMIT" diff --git a/evaluation/benchmarks/gaia/scripts/run_infer.sh b/evaluation/benchmarks/gaia/scripts/run_infer.sh index 5ad012d07d..4b2f8f73df 100755 --- a/evaluation/benchmarks/gaia/scripts/run_infer.sh +++ b/evaluation/benchmarks/gaia/scripts/run_infer.sh @@ -21,17 +21,17 @@ if [ -z "$AGENT" ]; then AGENT="CodeActAgent" fi -get_agent_version +get_openhands_version if [ -z "$LEVELS" ]; then LEVELS="2023_level1" echo "Levels not specified, use default $LEVELS" fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: 
$OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "LEVELS: $LEVELS" @@ -42,7 +42,7 @@ COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \ --level $LEVELS \ --data-split validation \ --eval-num-workers $NUM_WORKERS \ - --eval-note ${AGENT_VERSION}_${LEVELS}" + --eval-note ${OPENHANDS_VERSION}_${LEVELS}" if [ -n "$EVAL_LIMIT" ]; then echo "EVAL_LIMIT: $EVAL_LIMIT" diff --git a/evaluation/benchmarks/gorilla/scripts/run_infer.sh b/evaluation/benchmarks/gorilla/scripts/run_infer.sh index 4542444443..2efcdc9579 100755 --- a/evaluation/benchmarks/gorilla/scripts/run_infer.sh +++ b/evaluation/benchmarks/gorilla/scripts/run_infer.sh @@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then AGENT="CodeActAgent" fi -get_agent_version +get_openhands_version if [ -z "$HUBS" ]; then HUBS="hf,torch,tf" @@ -29,7 +29,7 @@ if [ -z "$HUBS" ]; then fi echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "HUBS: $HUBS" @@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \ --hubs $HUBS \ --data-split validation \ --eval-num-workers $NUM_WORKERS \ - --eval-note ${AGENT_VERSION}_${LEVELS}" + --eval-note ${OPENHANDS_VERSION}_${HUBS}" if [ -n "$EVAL_LIMIT" ]; then echo "EVAL_LIMIT: $EVAL_LIMIT" diff --git a/evaluation/benchmarks/gpqa/scripts/run_infer.sh b/evaluation/benchmarks/gpqa/scripts/run_infer.sh index ec5a61dbbb..dbd7cda98f 100755 --- a/evaluation/benchmarks/gpqa/scripts/run_infer.sh +++ b/evaluation/benchmarks/gpqa/scripts/run_infer.sh @@ -27,10 +27,10 @@ if [ -z "$DATA_SPLIT" ]; then DATA_SPLIT="gpqa_diamond" fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \ @@ -39,7 +39,7 @@ COMMAND="poetry run python
evaluation/benchmarks/gpqa/run_infer.py \ --max-iterations 10 \ --eval-num-workers $NUM_WORKERS \ --data-split $DATA_SPLIT \ - --eval-note $AGENT_VERSION" + --eval-note $OPENHANDS_VERSION" if [ -n "$EVAL_LIMIT" ]; then echo "EVAL_LIMIT: $EVAL_LIMIT" diff --git a/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh b/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh index b0b30628eb..bf36e92bc0 100755 --- a/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh +++ b/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh @@ -58,10 +58,10 @@ if [ -z "$AGENT" ]; then AGENT="CodeActAgent" fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \ @@ -69,7 +69,7 @@ COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ --eval-num-workers $NUM_WORKERS \ - --eval-note $AGENT_VERSION" + --eval-note $OPENHANDS_VERSION" if [ -n "$EVAL_LIMIT" ]; then echo "EVAL_LIMIT: $EVAL_LIMIT" diff --git a/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh b/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh index 40c244d18b..5a93a65ca8 100755 --- a/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh +++ b/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh @@ -28,10 +28,10 @@ if [ -z "$DATASET" ]; then DATASET="ProofWriter" fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \ @@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \ --dataset $DATASET \ --max-iterations 10 \ --eval-num-workers $NUM_WORKERS \ - --eval-note 
$AGENT_VERSION" + --eval-note $OPENHANDS_VERSION" if [ -n "$EVAL_LIMIT" ]; then echo "EVAL_LIMIT: $EVAL_LIMIT" diff --git a/evaluation/benchmarks/miniwob/scripts/run_infer.sh b/evaluation/benchmarks/miniwob/scripts/run_infer.sh index 8f997e29c3..e261a12365 100755 --- a/evaluation/benchmarks/miniwob/scripts/run_infer.sh +++ b/evaluation/benchmarks/miniwob/scripts/run_infer.sh @@ -25,13 +25,13 @@ if [ -z "$AGENT" ]; then AGENT="BrowsingAgent" fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -EVAL_NOTE="${AGENT_VERSION}_${NOTE}" +EVAL_NOTE="${OPENHANDS_VERSION}_${NOTE}" COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \ --agent-cls $AGENT \ diff --git a/evaluation/benchmarks/mint/scripts/run_infer.sh b/evaluation/benchmarks/mint/scripts/run_infer.sh index b9ec6d7a7a..52ab0cb81b 100755 --- a/evaluation/benchmarks/mint/scripts/run_infer.sh +++ b/evaluation/benchmarks/mint/scripts/run_infer.sh @@ -18,10 +18,10 @@ checkout_eval_branch # Only 'CodeActAgent' is supported for MINT now AGENT="CodeActAgent" -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" export PYTHONPATH=$(pwd) diff --git a/evaluation/benchmarks/ml_bench/scripts/run_infer.sh b/evaluation/benchmarks/ml_bench/scripts/run_infer.sh index 97ff0003fc..e693285173 100755 --- a/evaluation/benchmarks/ml_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/ml_bench/scripts/run_infer.sh @@ -26,10 +26,10 @@ if [ -z "$AGENT" ]; then AGENT="CodeActAgent" fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \ 
@@ -37,7 +37,7 @@ COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \ --llm-config $MODEL_CONFIG \ --max-iterations 10 \ --eval-num-workers $NUM_WORKERS \ - --eval-note $AGENT_VERSION" + --eval-note $OPENHANDS_VERSION" if [ -n "$EVAL_LIMIT" ]; then echo "EVAL_LIMIT: $EVAL_LIMIT" diff --git a/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh b/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh index 970f10ed2f..e8abf58e03 100755 --- a/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh +++ b/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh @@ -26,10 +26,10 @@ if [ -z "$USE_KNOWLEDGE" ]; then USE_KNOWLEDGE=false fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \ @@ -38,7 +38,7 @@ COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py --use_knowledge $USE_KNOWLEDGE \ --max-iterations 30 \ --eval-num-workers $NUM_WORKERS \ - --eval-note $AGENT_VERSION" \ + --eval-note $OPENHANDS_VERSION" \ if [ -n "$EVAL_LIMIT" ]; then echo "EVAL_LIMIT: $EVAL_LIMIT" diff --git a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh index a27bd7cdbb..b1d375152d 100755 --- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh @@ -55,10 +55,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE" export RUN_WITH_BROWSING=$RUN_WITH_BROWSING echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING" -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "DATASET: $DATASET" echo "SPLIT: $SPLIT" @@ -68,7 +68,7 @@ if [ -z "$USE_HINT_TEXT" ]; then export 
USE_HINT_TEXT=false fi echo "USE_HINT_TEXT: $USE_HINT_TEXT" -EVAL_NOTE="$AGENT_VERSION" +EVAL_NOTE="$OPENHANDS_VERSION" # if not using Hint, add -no-hint to the eval note if [ "$USE_HINT_TEXT" = false ]; then EVAL_NOTE="$EVAL_NOTE-no-hint" diff --git a/evaluation/benchmarks/toolqa/scripts/run_infer.sh b/evaluation/benchmarks/toolqa/scripts/run_infer.sh index bfe3471f4f..4760613431 100755 --- a/evaluation/benchmarks/toolqa/scripts/run_infer.sh +++ b/evaluation/benchmarks/toolqa/scripts/run_infer.sh @@ -38,10 +38,10 @@ if [ -z "$WOLFRAM_APPID" ]; then echo "WOLFRAM_APPID not specified" fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" echo "DATASET: $DATASET" echo "HARDNESS: $HARDNESS" @@ -56,7 +56,7 @@ COMMAND="poetry run python evaluation/benchmarks/toolqa/run_infer.py \ --wolfram_alpha_appid $WOLFRAM_APPID\ --data-split validation \ --eval-num-workers $NUM_WORKERS \ - --eval-note ${AGENT_VERSION}_${LEVELS}" + --eval-note ${OPENHANDS_VERSION}_${DATASET}_${HARDNESS}" if [ -n "$EVAL_LIMIT" ]; then echo "EVAL_LIMIT: $EVAL_LIMIT" diff --git a/evaluation/benchmarks/webarena/scripts/run_infer.sh b/evaluation/benchmarks/webarena/scripts/run_infer.sh index 22372b82d7..e3e08dcd48 100755 --- a/evaluation/benchmarks/webarena/scripts/run_infer.sh +++ b/evaluation/benchmarks/webarena/scripts/run_infer.sh @@ -27,13 +27,13 @@ if [ -z "$AGENT" ]; then AGENT="BrowsingAgent" fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -EVAL_NOTE="$AGENT_VERSION" +EVAL_NOTE="$OPENHANDS_VERSION" COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \ --agent-cls $AGENT \ diff --git a/evaluation/integration_tests/scripts/run_infer.sh b/evaluation/integration_tests/scripts/run_infer.sh index de019bed3c..3ca1529359 100755 ---
a/evaluation/integration_tests/scripts/run_infer.sh +++ b/evaluation/integration_tests/scripts/run_infer.sh @@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then AGENT="CodeActAgent" fi -get_agent_version +get_openhands_version echo "AGENT: $AGENT" -echo "AGENT_VERSION: $AGENT_VERSION" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -EVAL_NOTE=$AGENT_VERSION +EVAL_NOTE=$OPENHANDS_VERSION # Default to NOT use unit tests. if [ -z "$USE_UNIT_TESTS" ]; then diff --git a/evaluation/utils/version_control.sh b/evaluation/utils/version_control.sh index 11b366d4b8..5fc58e43ab 100644 --- a/evaluation/utils/version_control.sh +++ b/evaluation/utils/version_control.sh @@ -39,8 +39,8 @@ checkout_original_branch() { git checkout $current_branch } -get_agent_version() { -  # IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands -  # We need to track the version of Agent in the evaluation to make sure results are comparable -  AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)") +get_openhands_version() { +  # Track the OpenHands version in the evaluation to make sure results are comparable +  # across runs of the rapidly evolving codebase. +  OPENHANDS_VERSION=v$(poetry run python -c "from openhands import get_version; print(get_version())") }