# Mirror of https://github.com/OpenHands/OpenHands.git
# Synced 2025-12-26 05:48:36 +08:00
# Co-authored-by: openhands <openhands@all-hands.dev>
# Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
# Co-authored-by: Robert Brennan <accounts@rbren.io>
# 140 lines, 6.1 KiB, YAML
# Workflow identity and triggers.
name: Run SWE-Bench Evaluation

on:
  # Fires whenever any label is added to a pull request; the job's own `if`
  # decides whether the label that was added is the one that requests a run.
  pull_request:
    types:
      - labeled
  # Manual runs from the Actions UI/API; the caller must supply a reason.
  workflow_dispatch:
    inputs:
      reason:
        description: 'Reason for manual trigger'
        required: true
        default: ''

# Workflow-wide environment, visible to every step of every job.
env:
  # Number of parallel processes handed to the SWE-Bench inference and
  # evaluation scripts.
  N_PROCESSES: 32

jobs:
  # Runs SWE-Bench Lite inference + evaluation, publishes the outputs as a
  # workflow artifact and to Google Cloud Storage, then reports the summary
  # as a GitHub comment and a Slack message.
  run-evaluation:
    # For pull_request events only the 'eval-this' label starts a run; any
    # other event (e.g. workflow_dispatch) always runs.
    if: github.event.label.name == 'eval-this' || github.event_name != 'pull_request'
    runs-on: ubuntu-latest
    permissions:
      contents: "read"
      id-token: "write"
      pull-requests: "write"
      issues: "write"
    strategy:
      matrix:
        # Quoted so the version is never parsed as a float.
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install tmux
        run: sudo apt-get update && sudo apt-get install -y tmux

      # Poetry is installed before setup-python so that `cache: "poetry"`
      # can locate it.
      - name: Install poetry via pipx
        run: pipx install poetry

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"

      # Early acknowledgement on the PR, posted before the long-running steps.
      - name: Comment on PR if 'eval-this' label is present
        if: github.event_name == 'pull_request' && github.event.label.name == 'eval-this'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the evaluation on your PR. You will receive a comment with the results shortly.

      - name: Install Python dependencies using Poetry
        run: poetry install

      # Writes the [llm.eval] section that is referenced by name ("llm.eval")
      # in the inference command below. The key is passed through `env` so it
      # is never interpolated into the script text.
      - name: Configure config.toml for evaluation
        env:
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_LLM_API_KEY }}
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"deepseek/deepseek-chat\"" >> config.toml
          echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Run SWE-Bench evaluation
        env:
          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
          RUNTIME: remote
          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
        run: |
          # 1) Inference.
          poetry run ./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 "$N_PROCESSES" "princeton-nlp/SWE-bench_Lite" test

          # 2) Locate the folder produced by the inference run.
          # NOTE(review): this glob used to hard-code "maxiter_50" while the
          # command above passes "300 30"; if one of those positional args is
          # the max-iteration count the old pattern could never match. The
          # wildcard still matches the old name. Confirm against run_infer.sh.
          OUTPUT_FOLDER=$(find evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Lite-test/CodeActAgent -name "deepseek-chat_maxiter_*_N_*-no-hint-run_1" -type d | head -n 1)
          if [[ -z "$OUTPUT_FOLDER" ]]; then
            # Fail here with a clear message instead of passing "/output.jsonl"
            # to the scripts below.
            echo "::error::No SWE-bench inference output folder was found" >&2
            exit 1
          fi
          echo "OUTPUT_FOLDER for SWE-bench evaluation: $OUTPUT_FOLDER"

          # 3) Evaluation of the generated output.
          poetry run ./evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh "$OUTPUT_FOLDER/output.jsonl" "$N_PROCESSES" "princeton-nlp/SWE-bench_Lite" test

          # 4) Summarize and export the report as the multiline env var
          #    SWEBENCH_REPORT for later steps. A random delimiter is used so
          #    that a report line equal to the delimiter cannot end the value
          #    early or inject further variables.
          poetry run ./evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py "$OUTPUT_FOLDER/output.jsonl" > summarize_outputs.log 2>&1
          DELIMITER="EOF_$(dd if=/dev/urandom bs=15 count=1 status=none | base64 | tr -d '+/=')"
          {
            echo "SWEBENCH_REPORT<<$DELIMITER"
            cat summarize_outputs.log
            echo "$DELIMITER"
          } >> "$GITHUB_ENV"

      - name: Create tar.gz of evaluation outputs
        run: |
          # Local to this step; unrelated to the TIMESTAMP env var set later.
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          tar -czvf "evaluation_outputs_${TIMESTAMP}.tar.gz" evaluation/evaluation_outputs/outputs

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v4
        id: upload_results_artifact
        with:
          name: evaluation-outputs
          path: evaluation_outputs_*.tar.gz

      # Exposes the artifact link to the comment step as env.ARTIFACT_URL.
      - name: Get artifact URL
        run: echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> "$GITHUB_ENV"

      - name: Authenticate to Google Cloud
        uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }}

      # Sets env.TIMESTAMP and env.TRIGGER_REASON, which form the Cloud
      # Storage destination below. Event data is passed through `env` rather
      # than interpolated into the script, so a crafted manual "reason" cannot
      # inject shell commands.
      - name: Set timestamp and trigger reason
        env:
          EVENT_NAME: ${{ github.event_name }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          MANUAL_REASON: ${{ github.event.inputs.reason }}
        run: |
          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> "$GITHUB_ENV"
          if [[ "$EVENT_NAME" == "pull_request" ]]; then
            echo "TRIGGER_REASON=pr-$PR_NUMBER" >> "$GITHUB_ENV"
          elif [[ "$EVENT_NAME" == "schedule" ]]; then
            # Not reachable with the triggers visible in this workflow.
            echo "TRIGGER_REASON=schedule" >> "$GITHUB_ENV"
          else
            # Strip CR/LF so the value stays on one line of the env file.
            echo "TRIGGER_REASON=manual-$(printf '%s' "$MANUAL_REASON" | tr -d '\r\n')" >> "$GITHUB_ENV"
          fi

      - name: Upload evaluation results to Google Cloud Storage
        uses: 'google-github-actions/upload-cloud-storage@v2'
        with:
          path: 'evaluation/evaluation_outputs/outputs'
          destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}'

      # Posts the report on the triggering PR; for non-PR runs it falls back
      # to issue/PR number 4504.
      - name: Comment with evaluation results and artifact link
        id: create_comment
        uses: KeisukeYamashita/create-comment@v1
        with:
          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}
          unique: false
          comment: |
            Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}
            Commit: ${{ github.sha }}
            **SWE-Bench Evaluation Report**
            ${{ env.SWEBENCH_REPORT }}
            ---
            You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}).

      # slack-github-action v2 takes `method`/`token`/`payload`; the v1 inputs
      # (`channel-id`, `slack-message`, SLACK_BOT_TOKEN env) are not part of
      # v2. The link uses Slack's <url|label> syntax because Slack does not
      # render Markdown-style [label](url) links.
      - name: Post to a Slack channel
        id: slack
        uses: slackapi/slack-github-action@v2.0.0
        with:
          method: chat.postMessage
          token: ${{ secrets.EVAL_NOTIF_SLACK_BOT_TOKEN }}
          payload: |
            channel: "C07SVQSCR6F"
            text: |-
              *Evaluation Trigger:* ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}

              Link to summary: <https://github.com/${{ github.repository }}/issues/${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}#issuecomment-${{ steps.create_comment.outputs.comment-id }}|here>