[eval] add evaluation workflow (#4489)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Xingyao Wang, 2024-10-29 08:52:25 -05:00, committed by GitHub
parent 30eeaa641c
commit 6d19c93d19
7 changed files with 201 additions and 11 deletions

.github/workflows/eval-runner.yml (new file)

@@ -0,0 +1,160 @@
name: Run Evaluation

on:
  pull_request:
    types: [labeled]
  schedule:
    - cron: "0 1 * * *" # Run daily at 1 AM UTC
  workflow_dispatch:
    inputs:
      reason:
        description: "Reason for manual trigger"
        required: true
        default: ""

env:
  N_PROCESSES: 32 # Global configuration for number of parallel processes for evaluation

jobs:
  run-evaluation:
    if: github.event.label.name == 'eval-this' || github.event_name != 'pull_request'
    runs-on: ubuntu-latest
    permissions:
      contents: "read"
      id-token: "write"
      pull-requests: "write"
      issues: "write"
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install poetry via pipx
        run: pipx install poetry

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"

      - name: Comment on PR if 'eval-this' label is present
        if: github.event_name == 'pull_request' && github.event.label.name == 'eval-this'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the evaluation on your PR. You will receive a comment with the results shortly.

      - name: Install Python dependencies using Poetry
        run: poetry install

      - name: Configure config.toml for evaluation
        env:
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_LLM_API_KEY }}
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"deepseek/deepseek-chat\"" >> config.toml
          echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Run integration test evaluation
        env:
          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
          RUNTIME: remote
          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES

          # get evaluation report
          REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE"
          echo "INTEGRATION_TEST_REPORT<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Run SWE-Bench evaluation
        env:
          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
          RUNTIME: remote
          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
        run: |
          poetry run ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test

          OUTPUT_FOLDER=$(find evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Lite-test/CodeActAgent -name "deepseek-chat_maxiter_50_N_*-no-hint-run_1" -type d | head -n 1)
          echo "OUTPUT_FOLDER for SWE-bench evaluation: $OUTPUT_FOLDER"
          poetry run ./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FOLDER/output.jsonl $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test

          poetry run ./evaluation/swe_bench/scripts/eval/summarize_outputs.py $OUTPUT_FOLDER/output.jsonl > summarize_outputs.log 2>&1
          echo "SWEBENCH_REPORT<<EOF" >> $GITHUB_ENV
          cat summarize_outputs.log >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Create tar.gz of evaluation outputs
        run: |
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          tar -czvf evaluation_outputs_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v4
        id: upload_results_artifact
        with:
          name: evaluation-outputs
          path: evaluation_outputs_*.tar.gz

      - name: Get artifact URL
        run: echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV

      - name: Authenticate to Google Cloud
        uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }}

      - name: Set timestamp and trigger reason
        run: |
          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
            echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV
          elif [[ "${{ github.event_name }}" == "schedule" ]]; then
            echo "TRIGGER_REASON=schedule" >> $GITHUB_ENV
          else
            echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV
          fi

      - name: Upload evaluation results to Google Cloud Storage
        uses: 'google-github-actions/upload-cloud-storage@v2'
        with:
          path: 'evaluation/evaluation_outputs/outputs'
          destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}'

      - name: Comment with evaluation results and artifact link
        id: create_comment
        uses: KeisukeYamashita/create-comment@v1
        with:
          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}
          unique: false
          comment: |
            Triggered by: ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}
            Commit: ${{ github.sha }}

            **SWE-Bench Evaluation Report**
            ${{ env.SWEBENCH_REPORT }}

            ---

            **Integration Tests Evaluation Report**
            ${{ env.INTEGRATION_TEST_REPORT }}

            ---

            You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}).

      - name: Post to a Slack channel
        id: slack
        uses: slackapi/slack-github-action@v1.27.0
        with:
          channel-id: 'C07SVQSCR6F'
          slack-message: "*Evaluation Trigger:* ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}\n\nLink to summary: [here](https://github.com/${{ github.repository }}/issues/${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}#issuecomment-${{ steps.create_comment.outputs.comment-id }})"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.EVAL_NOTIF_SLACK_BOT_TOKEN }}
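The two evaluation steps above pass multiline report text between steps through `$GITHUB_ENV` using the `NAME<<EOF ... EOF` heredoc syntax. As a minimal sketch of the same mechanism (assuming it runs inside a GitHub Actions step, where `GITHUB_ENV` points at the env file; the file name `report.md` matches the report the workflow extracts):

```python
import os

# Sketch: export a multiline value for later workflow steps via $GITHUB_ENV,
# mirroring the `INTEGRATION_TEST_REPORT<<EOF ... EOF` heredoc used above.
report = open("report.md").read()
with open(os.environ["GITHUB_ENV"], "a") as env_file:
    env_file.write("INTEGRATION_TEST_REPORT<<EOF\n")
    env_file.write(report.rstrip("\n") + "\n")
    env_file.write("EOF\n")  # the delimiter must not occur in the value itself
```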


@@ -41,13 +41,15 @@ def get_config(
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
-        runtime='eventstream',
+        runtime=os.environ.get('RUNTIME', 'eventstream'),
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
            # use default base_container_image
            enable_auto_lint=True,
            use_host_network=False,
            timeout=100,
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
        ),
        # do not mount workspace
        workspace_base=None,
@@ -219,3 +221,10 @@ if __name__ == '__main__':
        + df[['instance_id', 'success', 'reason']].to_string(index=False)
    )
    logger.info('-' * 100)
+
+    report_file = os.path.join(metadata.eval_output_dir, 'report.md')
+    with open(report_file, 'w') as f:
+        f.write(
+            f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})\n'
+        )
+        f.write(df[['instance_id', 'success', 'reason']].to_markdown(index=False))
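The new `report.md` writer relies on `DataFrame.to_markdown`, which delegates to the optional `tabulate` dependency added in `pyproject.toml` and `poetry.lock` further down. A toy illustration of the output shape (the rows are invented, not real eval results):

```python
import pandas as pd

# Invented rows in the same shape the report writer expects.
df = pd.DataFrame(
    {
        "instance_id": ["t01_fix_simple_dependency", "t02_add_bash_hello"],
        "success": [True, False],
        "reason": [None, "timeout"],
    }
)
print(f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})')
# to_markdown raises ImportError unless `tabulate` is installed
print(df[["instance_id", "success", "reason"]].to_markdown(index=False))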


@@ -369,10 +369,12 @@ if __name__ == '__main__':

    def count_report_field(row, field):
        return row['test_result']['report'][field]

    report = {}
    for field in fields:
        count = evaluated_predictions.apply(
            count_report_field, args=(field,), axis=1
        ).sum()
        report[field] = count
        logger.info(
            f'# {field}: {count} / {len(evaluated_predictions)}. ({count / len(evaluated_predictions):.2%})'
        )
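The aggregation above pulls per-instance flags out of a nested dict with `DataFrame.apply`. A self-contained toy version of the same pattern (the nested layout of `test_result` is inferred from the snippet; the flag names and counts are invented):

```python
import pandas as pd

# Assumed row layout: row['test_result']['report'][field] holds a 0/1 flag.
evaluated_predictions = pd.DataFrame(
    {
        "test_result": [
            {"report": {"resolved": 1, "empty_patch": 0}},
            {"report": {"resolved": 0, "empty_patch": 1}},
            {"report": {"resolved": 1, "empty_patch": 0}},
        ]
    }
)

for field in ("resolved", "empty_patch"):
    count = evaluated_predictions.apply(
        lambda row: row["test_result"]["report"][field], axis=1
    ).sum()
    n = len(evaluated_predictions)
    print(f"# {field}: {count} / {n} ({count / n:.2%})")
```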


@@ -90,30 +90,29 @@ if __name__ == '__main__':
            break

    # print the error counter (with percentage)
    print('-' * 100)
    print(
-        f'# of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
+        f'Number of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
    )
    print(
-        f'# of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
+        f'Number of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
    )
    print(
-        f'# of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
+        f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
    )
    print(
-        f'# of loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
+        f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
    )

    assert len(num_turns) == num_lines
    assert len(main_agent_cost) == num_lines
    assert len(editor_cost) == num_lines

    print('## Statistics')
    print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}')
    print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD')
    print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD')
    print(
        f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD'
    )
    print('-' * 100)

-    print('Detailed error breakdown:')
+    print('## Detailed error breakdown:')
    for error, count in error_counter.items():
        print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')
    print('-' * 100)
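To make the averages concrete, the same arithmetic on invented per-instance numbers (none of these values come from a real run):

```python
# Invented per-instance stats, only to show the arithmetic of the averages above.
num_turns = [12, 20, 16]
main_agent_cost = [0.08, 0.12, 0.10]  # USD
editor_cost = [0.01, 0.03, 0.02]      # USD
num_lines = len(num_turns)

print(f"Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}")
print(
    f"Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD"
)
```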


@@ -54,6 +54,11 @@ class CodeActResponseParser(ResponseParser):
            if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
                action += f'</execute_{lang}>'
+        # special handling for DeepSeek: it has a stop-word bug and returns </execute_ipython instead of </execute_ipython>
+        if '</file_edit' in action and '</file_edit>' not in action:
+            action = action.replace('</file_edit', '</file_edit>')
+        if '<file_edit' in action and '</file_edit>' not in action:
+            action += '</file_edit>'
        return action
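A quick self-check of the two repair branches, using hypothetical truncated model outputs (the helper name `fix_file_edit` and the sample strings are illustrative only):

```python
def fix_file_edit(action: str) -> str:
    # Same two branches as the parser change above.
    if '</file_edit' in action and '</file_edit>' not in action:
        action = action.replace('</file_edit', '</file_edit>')  # clipped '>'
    if '<file_edit' in action and '</file_edit>' not in action:
        action += '</file_edit>'  # closing tag missing entirely
    return action

# Hypothetical truncated outputs exercising each branch:
assert fix_file_edit("<file_edit path=a.py>new code</file_edit") == "<file_edit path=a.py>new code</file_edit>"
assert fix_file_edit("<file_edit path=a.py>new code") == "<file_edit path=a.py>new code</file_edit>"
```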

poetry.lock (generated)

@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
[[package]]
name = "aenum"
@@ -8512,6 +8512,20 @@ files = [
sigtools = "4.0.1"
typing-extensions = ">=4.6"

+[[package]]
+name = "tabulate"
+version = "0.9.0"
+description = "Pretty-print tabular data"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"},
+    {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"},
+]
+
+[package.extras]
+widechars = ["wcwidth"]
[[package]]
name = "tenacity"
version = "8.5.0"
@@ -10099,4 +10113,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]

[metadata]
lock-version = "2.0"
python-versions = "^3.12"
-content-hash = "880b0251ec1ac83a7a8f6b1637b0860f75553d1f9c7c67313af9f3bb686c166a"
+content-hash = "709ae467d042d1c9fa3799711f50445324420de32dc9552a42284aba99903981"


@@ -132,6 +132,7 @@ sympy = "*"
gdown = "*"
matplotlib = "*"
seaborn = "*"
+tabulate = "*"
[tool.poetry-dynamic-versioning]
enable = true
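The `tabulate` addition is what backs the `to_markdown` call in the new report writer. A quick standalone sanity check of the dependency (the row values are made up):

```python
from tabulate import tabulate

# tabulate is the formatter pandas' DataFrame.to_markdown delegates to.
rows = [["t01", True], ["t02", False]]  # made-up data
print(tabulate(rows, headers=["instance_id", "success"], tablefmt="github"))
```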