diff --git a/.github/workflows/eval-runner.yml b/.github/workflows/eval-runner.yml
new file mode 100644
index 0000000000..f788cf78d2
--- /dev/null
+++ b/.github/workflows/eval-runner.yml
@@ -0,0 +1,160 @@
+name: Run Evaluation
+
+on:
+  pull_request:
+    types: [labeled]
+  schedule:
+    - cron: "0 1 * * *"  # Run daily at 1 AM UTC
+  workflow_dispatch:
+    inputs:
+      reason:
+        description: "Reason for manual trigger"
+        required: true
+        default: ""
+
+env:
+  N_PROCESSES: 32  # Global configuration for number of parallel processes for evaluation
+
+jobs:
+  run-evaluation:
+    if: github.event.label.name == 'eval-this' || github.event_name != 'pull_request'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: "read"
+      id-token: "write"
+      pull-requests: "write"
+      issues: "write"
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install poetry via pipx
+        run: pipx install poetry
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "poetry"
+
+      - name: Comment on PR if 'eval-this' label is present
+        if: github.event_name == 'pull_request' && github.event.label.name == 'eval-this'
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          unique: false
+          comment: |
+            Hi! I started running the evaluation on your PR. You will receive a comment with the results shortly.
+
+      - name: Install Python dependencies using Poetry
+        run: poetry install
+
+      - name: Configure config.toml for evaluation
+        env:
+          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_LLM_API_KEY }}
+        run: |
+          echo "[llm.eval]" > config.toml
+          echo "model = \"deepseek/deepseek-chat\"" >> config.toml
+          echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml
+          echo "temperature = 0.0" >> config.toml
+
+      - name: Run integration test evaluation
+        env:
+          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
+          RUNTIME: remote
+          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
+          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
+
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES
+
+          # get evaluation report
+          REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE: $REPORT_FILE"
+          echo "INTEGRATION_TEST_REPORT<<EOF" >> $GITHUB_ENV
+          cat $REPORT_FILE >> $GITHUB_ENV
+          echo >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Run SWE-Bench evaluation
+        env:
+          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
+          RUNTIME: remote
+          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
+          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
+
+        run: |
+          poetry run ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test
+          OUTPUT_FOLDER=$(find evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Lite-test/CodeActAgent -name "deepseek-chat_maxiter_50_N_*-no-hint-run_1" -type d | head -n 1)
+          echo "OUTPUT_FOLDER for SWE-bench evaluation: $OUTPUT_FOLDER"
+          poetry run ./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FOLDER/output.jsonl $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test
+
+          poetry run ./evaluation/swe_bench/scripts/eval/summarize_outputs.py $OUTPUT_FOLDER/output.jsonl > summarize_outputs.log 2>&1
+          echo "SWEBENCH_REPORT<<EOF" >> $GITHUB_ENV
+          cat summarize_outputs.log >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Create tar.gz of evaluation outputs
+        run: |
+          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
+          tar -czvf evaluation_outputs_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs
+
+      - name: Upload evaluation results as artifact
+        uses: actions/upload-artifact@v4
+        id: upload_results_artifact
+        with:
+          name: evaluation-outputs
+          path: evaluation_outputs_*.tar.gz
+
+      - name: Get artifact URL
+        run: echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV
+
+      - name: Authenticate to Google Cloud
+        uses: 'google-github-actions/auth@v2'
+        with:
+          credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }}
+
+      - name: Set timestamp and trigger reason
+        run: |
+          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
+          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
+            echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV
+          elif [[ "${{ github.event_name }}" == "schedule" ]]; then
+            echo "TRIGGER_REASON=schedule" >> $GITHUB_ENV
+          else
+            echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV
+          fi
+
+      - name: Upload evaluation results to Google Cloud Storage
+        uses: 'google-github-actions/upload-cloud-storage@v2'
+        with:
+          path: 'evaluation/evaluation_outputs/outputs'
+          destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}'
+
+      - name: Comment with evaluation results and artifact link
+        id: create_comment
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}
+          unique: false
+          comment: |
+            Triggered by: ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}
+            Commit: ${{ github.sha }}
+            **SWE-Bench Evaluation Report**
+            ${{ env.SWEBENCH_REPORT }}
+            ---
+            **Integration Tests Evaluation Report**
+            ${{ env.INTEGRATION_TEST_REPORT }}
+            ---
+            You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}).
+
+      - name: Post to a Slack channel
+        id: slack
+        uses: slackapi/slack-github-action@v1.27.0
+        with:
+          channel-id: 'C07SVQSCR6F'
+          slack-message: "*Evaluation Trigger:* ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}\n\nLink to summary: [here](https://github.com/${{ github.repository }}/issues/${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}#issuecomment-${{ steps.create_comment.outputs.comment-id }})"
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.EVAL_NOTIF_SLACK_BOT_TOKEN }}
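Note on the `$GITHUB_ENV` lines above: multi-line values (the integration-test and SWE-Bench reports) are exported with the documented `NAME<<EOF ... EOF` delimiter syntax so they can be referenced later as `${{ env.SWEBENCH_REPORT }}` and `${{ env.INTEGRATION_TEST_REPORT }}`. A minimal Python sketch of the same mechanism, purely for illustration (the helper name is not part of this PR):

```python
import os


def export_multiline_env(name: str, value: str, delimiter: str = "EOF") -> None:
    """Append a multi-line variable to the GITHUB_ENV file using the
    NAME<<DELIMITER syntax understood by the GitHub Actions runner."""
    github_env = os.environ["GITHUB_ENV"]  # file path provided by the runner
    with open(github_env, "a") as f:
        f.write(f"{name}<<{delimiter}\n")
        f.write(value.rstrip("\n") + "\n")
        f.write(f"{delimiter}\n")


# e.g. export_multiline_env("SWEBENCH_REPORT", open("summarize_outputs.log").read())
```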
diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py
index 621a0fa91c..ddc044088b 100644
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -41,13 +41,15 @@ def get_config(
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
-        runtime='eventstream',
+        runtime=os.environ.get('RUNTIME', 'eventstream'),
         max_iterations=metadata.max_iterations,
         sandbox=SandboxConfig(
             # use default base_container_image
             enable_auto_lint=True,
             use_host_network=False,
             timeout=100,
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
         ),
         # do not mount workspace
         workspace_base=None,
@@ -219,3 +221,10 @@ if __name__ == '__main__':
         + df[['instance_id', 'success', 'reason']].to_string(index=False)
     )
     logger.info('-' * 100)
+
+    report_file = os.path.join(metadata.eval_output_dir, 'report.md')
+    with open(report_file, 'w') as f:
+        f.write(
+            f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})\n'
+        )
+        f.write(df[['instance_id', 'success', 'reason']].to_markdown(index=False))
diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py
index cf6d71d3b3..a214f4781e 100644
--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@@ -369,10 +369,12 @@ if __name__ == '__main__':
     def count_report_field(row, field):
         return row['test_result']['report'][field]
 
+    report = {}
     for field in fields:
         count = evaluated_predictions.apply(
            count_report_field, args=(field,), axis=1
        ).sum()
+        report[field] = count
         logger.info(
             f'# {field}: {count} / {len(evaluated_predictions)}. ({count / len(evaluated_predictions):.2%})'
         )
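The new `report.md` written by `run_infer.py` uses `pandas.DataFrame.to_markdown`, which in turn requires the `tabulate` package (added to `pyproject.toml` and `poetry.lock` further down in this diff). A hedged, standalone sketch of the resulting report format, with made-up sample rows rather than real evaluation output:

```python
import pandas as pd  # DataFrame.to_markdown needs the `tabulate` package installed

# Illustrative rows only; the real dataframe is built from the eval output files.
df = pd.DataFrame(
    [
        {"instance_id": "test_01", "success": True, "reason": None},
        {"instance_id": "test_02", "success": False, "reason": "timeout"},
    ]
)

with open("report.md", "w") as f:
    f.write(f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})\n')
    f.write(df[["instance_id", "success", "reason"]].to_markdown(index=False))
```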
diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py
index e47cceb993..5d5dbbf2a3 100755
--- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py
+++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py
@@ -90,30 +90,29 @@ if __name__ == '__main__':
                 break
 
     # print the error counter (with percentage)
-    print('-' * 100)
     print(
-        f'# of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
+        f'Number of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
     )
     print(
-        f'# of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
+        f'Number of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
     )
     print(
-        f'# of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
+        f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
     )
     print(
-        f'# of loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
+        f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
     )
     assert len(num_turns) == num_lines
     assert len(main_agent_cost) == num_lines
     assert len(editor_cost) == num_lines
+    print('## Statistics')
     print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}')
     print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD')
     print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD')
     print(
         f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD'
     )
-    print('-' * 100)
-    print('Detailed error breakdown:')
+
+    print('## Detailed error breakdown:')
     for error, count in error_counter.items():
         print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')
-    print('-' * 100)
diff --git a/openhands/agenthub/codeact_agent/action_parser.py b/openhands/agenthub/codeact_agent/action_parser.py
index 893effd825..75fab1156f 100644
--- a/openhands/agenthub/codeact_agent/action_parser.py
+++ b/openhands/agenthub/codeact_agent/action_parser.py
@@ -54,6 +54,11 @@ class CodeActResponseParser(ResponseParser):
             if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
                 action += f'</execute_{lang}>'
+
+        # special handling for DeepSeek: it has stop-word bug and returns </execute_ipython instead of </execute_ipython>
+        if '</execute_ipython' in action and '</execute_ipython>' not in action:
+            action = action.replace('</execute_ipython', '</execute_ipython>')
+
         if '<finish>' in action and '</finish>' not in action:
             action += '</finish>'
         return action
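For context on the DeepSeek workaround above: the model sometimes stops generating right before the final `>` of a closing tag, emitting `</execute_ipython` instead of `</execute_ipython>`, which then fails to parse as an action. A standalone, hedged sketch of the repair idea (not the repository's parser class, just the same logic in isolation):

```python
def close_truncated_tag(action: str, tag: str = "execute_ipython") -> str:
    """Repair a closing tag that lost its trailing '>' (DeepSeek stop-word quirk)."""
    truncated, complete = f"</{tag}", f"</{tag}>"
    if truncated in action and complete not in action:
        action = action.replace(truncated, complete)
    return action


# The truncated closing tag gets its '>' restored; well-formed input is left alone.
assert close_truncated_tag("<execute_ipython>print(1)</execute_ipython") == (
    "<execute_ipython>print(1)</execute_ipython>"
)
```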
diff --git a/poetry.lock b/poetry.lock
index 6ef4a43322..4ac715f968 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
 
 [[package]]
 name = "aenum"
@@ -8512,6 +8512,20 @@ files = [
 sigtools = "4.0.1"
 typing-extensions = ">=4.6"
 
+[[package]]
+name = "tabulate"
+version = "0.9.0"
+description = "Pretty-print tabular data"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"},
+    {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"},
+]
+
+[package.extras]
+widechars = ["wcwidth"]
+
 [[package]]
 name = "tenacity"
 version = "8.5.0"
@@ -10099,4 +10113,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "880b0251ec1ac83a7a8f6b1637b0860f75553d1f9c7c67313af9f3bb686c166a"
+content-hash = "709ae467d042d1c9fa3799711f50445324420de32dc9552a42284aba99903981"
diff --git a/pyproject.toml b/pyproject.toml
index a7daebd37d..978ff0ab05 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -132,6 +132,7 @@ sympy = "*"
 gdown = "*"
 matplotlib = "*"
 seaborn = "*"
+tabulate = "*"
 
 [tool.poetry-dynamic-versioning]
 enable = true