Mirror of https://github.com/OpenHands/OpenHands.git, synced 2025-12-26 05:48:36 +08:00

[eval] add evaluation workflow (#4489)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>

Parent: 30eeaa641c
Commit: 6d19c93d19

.github/workflows/eval-runner.yml (vendored, new file, 160 lines)
@@ -0,0 +1,160 @@
name: Run Evaluation

on:
  pull_request:
    types: [labeled]
  schedule:
    - cron: "0 1 * * *" # Run daily at 1 AM UTC
  workflow_dispatch:
    inputs:
      reason:
        description: "Reason for manual trigger"
        required: true
        default: ""

env:
  N_PROCESSES: 32 # Global configuration for number of parallel processes for evaluation

jobs:
  run-evaluation:
    if: github.event.label.name == 'eval-this' || github.event_name != 'pull_request'
    runs-on: ubuntu-latest
    permissions:
      contents: "read"
      id-token: "write"
      pull-requests: "write"
      issues: "write"
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install poetry via pipx
        run: pipx install poetry

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"

      - name: Comment on PR if 'eval-this' label is present
        if: github.event_name == 'pull_request' && github.event.label.name == 'eval-this'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the evaluation on your PR. You will receive a comment with the results shortly.

      - name: Install Python dependencies using Poetry
        run: poetry install

      - name: Configure config.toml for evaluation
        env:
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_LLM_API_KEY }}
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"deepseek/deepseek-chat\"" >> config.toml
          echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml
          echo "temperature = 0.0" >> config.toml
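The step above writes a minimal `config.toml` from which the evaluation scripts take their LLM settings. As a sketch (not part of the commit), the generated file could be sanity-checked like this, assuming Python 3.11+ for the stdlib `tomllib`:

```python
import tomllib  # stdlib TOML parser, Python 3.11+

with open("config.toml", "rb") as f:  # tomllib requires binary mode
    cfg = tomllib.load(f)

# The workflow writes a single [llm.eval] table.
eval_llm = cfg["llm"]["eval"]
print(eval_llm["model"])        # deepseek/deepseek-chat
print(eval_llm["temperature"])  # 0.0
```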
      - name: Run integration test evaluation
        env:
          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
          RUNTIME: remote
          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images

        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES

          # get evaluation report
          REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE"
          echo "INTEGRATION_TEST_REPORT<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
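The `INTEGRATION_TEST_REPORT<<EOF ... EOF` lines use GitHub Actions' heredoc syntax for `$GITHUB_ENV`, which is how a multiline value (here, a whole markdown report) is handed to later steps. A minimal sketch of the same mechanism from Python, assuming it runs inside a workflow step:

```python
import os

report_text = "..."  # placeholder for the report.md contents

# Appending NAME<<DELIMITER ... DELIMITER to the $GITHUB_ENV file exposes a
# multiline environment variable to all subsequent steps in the job.
with open(os.environ["GITHUB_ENV"], "a") as env_file:
    env_file.write("INTEGRATION_TEST_REPORT<<EOF\n")
    env_file.write(report_text + "\n")
    env_file.write("EOF\n")
```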
      - name: Run SWE-Bench evaluation
        env:
          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
          RUNTIME: remote
          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images

        run: |
          poetry run ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test
          OUTPUT_FOLDER=$(find evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Lite-test/CodeActAgent -name "deepseek-chat_maxiter_50_N_*-no-hint-run_1" -type d | head -n 1)
          echo "OUTPUT_FOLDER for SWE-bench evaluation: $OUTPUT_FOLDER"
          poetry run ./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FOLDER/output.jsonl $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test

          poetry run ./evaluation/swe_bench/scripts/eval/summarize_outputs.py $OUTPUT_FOLDER/output.jsonl > summarize_outputs.log 2>&1
          echo "SWEBENCH_REPORT<<EOF" >> $GITHUB_ENV
          cat summarize_outputs.log >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
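The shell `find ... | head -n 1` picks the first run directory matching the inference settings. For reference only, an equivalent lookup in Python might look like this (paths as in the workflow; the directory exists only after `run_infer.sh` has run):

```python
from pathlib import Path

root = Path(
    "evaluation/evaluation_outputs/outputs/"
    "princeton-nlp__SWE-bench_Lite-test/CodeActAgent"
)
# First directory matching the run's naming scheme, or None if absent.
output_folder = next(root.glob("deepseek-chat_maxiter_50_N_*-no-hint-run_1"), None)
print(output_folder)
```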
      - name: Create tar.gz of evaluation outputs
        run: |
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          tar -czvf evaluation_outputs_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v4
        id: upload_results_artifact
        with:
          name: evaluation-outputs
          path: evaluation_outputs_*.tar.gz

      - name: Get artifact URL
        run: echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV
      - name: Authenticate to Google Cloud
        uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }}

      - name: Set timestamp and trigger reason
        run: |
          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
            echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV
          elif [[ "${{ github.event_name }}" == "schedule" ]]; then
            echo "TRIGGER_REASON=schedule" >> $GITHUB_ENV
          else
            echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV
          fi

      - name: Upload evaluation results to Google Cloud Storage
        uses: 'google-github-actions/upload-cloud-storage@v2'
        with:
          path: 'evaluation/evaluation_outputs/outputs'
          destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}'
      - name: Comment with evaluation results and artifact link
        id: create_comment
        uses: KeisukeYamashita/create-comment@v1
        with:
          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}
          unique: false
          comment: |
            Triggered by: ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}
            Commit: ${{ github.sha }}
            **SWE-Bench Evaluation Report**
            ${{ env.SWEBENCH_REPORT }}
            ---
            **Integration Tests Evaluation Report**
            ${{ env.INTEGRATION_TEST_REPORT }}
            ---
            You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}).

      - name: Post to a Slack channel
        id: slack
        uses: slackapi/slack-github-action@v1.27.0
        with:
          channel-id: 'C07SVQSCR6F'
          slack-message: "*Evaluation Trigger:* ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}\n\nLink to summary: [here](https://github.com/${{ github.repository }}/issues/${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}#issuecomment-${{ steps.create_comment.outputs.comment-id }})"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.EVAL_NOTIF_SLACK_BOT_TOKEN }}
@@ -41,13 +41,15 @@ def get_config(
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
-        runtime='eventstream',
+        runtime=os.environ.get('RUNTIME', 'eventstream'),
         max_iterations=metadata.max_iterations,
         sandbox=SandboxConfig(
             # use default base_container_image
             enable_auto_lint=True,
             use_host_network=False,
             timeout=100,
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
         ),
         # do not mount workspace
         workspace_base=None,
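With this change the harness honors the `RUNTIME`, `ALLHANDS_API_KEY`, and `SANDBOX_REMOTE_RUNTIME_API_URL` variables that the workflow's `env:` blocks set, falling back to local defaults otherwise. A minimal, framework-free sketch of that precedence:

```python
import os

# In CI these come from the workflow's env: block; the fallbacks apply locally.
runtime = os.environ.get("RUNTIME", "eventstream")          # "remote" in CI
api_key = os.environ.get("ALLHANDS_API_KEY")                # None when unset
api_url = os.environ.get("SANDBOX_REMOTE_RUNTIME_API_URL")  # None when unset
print(runtime, api_key, api_url)
```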
@@ -219,3 +221,10 @@ if __name__ == '__main__':
         + df[['instance_id', 'success', 'reason']].to_string(index=False)
     )
     logger.info('-' * 100)
+
+    report_file = os.path.join(metadata.eval_output_dir, 'report.md')
+    with open(report_file, 'w') as f:
+        f.write(
+            f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})\n'
+        )
+        f.write(df[['instance_id', 'success', 'reason']].to_markdown(index=False))
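This is the block that produces the `report.md` the workflow picks up above. Note that `DataFrame.to_markdown` delegates to the `tabulate` package, which is why `tabulate` is added to `poetry.lock` and the dependency list further down. A self-contained sketch of what the block writes, with made-up sample rows:

```python
import pandas as pd  # requires tabulate installed for to_markdown

# Hypothetical rows standing in for real evaluation output.
df = pd.DataFrame(
    [
        {'instance_id': 'instance-1', 'success': True, 'reason': None},
        {'instance_id': 'instance-2', 'success': False, 'reason': 'timeout'},
    ]
)

with open('report.md', 'w') as f:
    f.write(
        f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})\n'
    )
    f.write(df[['instance_id', 'success', 'reason']].to_markdown(index=False))
```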
@@ -369,10 +369,12 @@ if __name__ == '__main__':
     def count_report_field(row, field):
         return row['test_result']['report'][field]

+    report = {}
     for field in fields:
         count = evaluated_predictions.apply(
             count_report_field, args=(field,), axis=1
         ).sum()
+        report[field] = count
         logger.info(
             f'# {field}: {count} / {len(evaluated_predictions)}. ({count / len(evaluated_predictions):.2%})'
         )
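The pattern here is a row-wise `apply` over a DataFrame whose rows carry nested report dicts, summing booleans to get per-field counts. A runnable sketch under that assumption (the data is invented; the real rows come from `output.jsonl`):

```python
import pandas as pd

# Hypothetical stand-in for evaluated_predictions: each row carries a
# nested test_result.report dict, as in the real evaluation output.
evaluated_predictions = pd.DataFrame(
    [
        {'test_result': {'report': {'resolved': True, 'empty_patch': False}}},
        {'test_result': {'report': {'resolved': False, 'empty_patch': True}}},
    ]
)

def count_report_field(row, field):
    return row['test_result']['report'][field]

report = {
    field: evaluated_predictions.apply(count_report_field, args=(field,), axis=1).sum()
    for field in ('resolved', 'empty_patch')
}
print(report)  # {'resolved': 1, 'empty_patch': 1}
```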
@@ -90,30 +90,29 @@ if __name__ == '__main__':
             break

     # print the error counter (with percentage)
     print('-' * 100)
     print(
-        f'# of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
+        f'Number of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
     )
     print(
-        f'# of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
+        f'Number of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
     )
     print(
-        f'# of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
+        f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
     )
     print(
-        f'# of loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
+        f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
     )
     assert len(num_turns) == num_lines
     assert len(main_agent_cost) == num_lines
     assert len(editor_cost) == num_lines
     print('## Statistics')
     print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}')
     print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD')
     print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD')
     print(
         f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD'
     )
     print('-' * 100)
-    print('Detailed error breakdown:')
+    print('## Detailed error breakdown:')
     for error, count in error_counter.items():
         print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')
     print('-' * 100)
@@ -54,6 +54,11 @@ class CodeActResponseParser(ResponseParser):

         if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
             action += f'</execute_{lang}>'

+        # special handling for DeepSeek: it has a stop-word bug and returns </execute_ipython instead of </execute_ipython>
+        if '</file_edit' in action and '</file_edit>' not in action:
+            action = action.replace('</file_edit', '</file_edit>')

+        if '<file_edit' in action and '</file_edit>' not in action:
+            action += '</file_edit>'
         return action
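The hunk above repairs model output whose closing tags were truncated by a stop word. A standalone reproduction of that repair logic, illustrative only (the real method lives on `CodeActResponseParser` and handles more than this):

```python
def close_unfinished_tags(action: str, lang: str = 'ipython') -> str:
    """Close tags that the model opened but never (fully) closed."""
    # Opened execute block with no closing tag at all: append one.
    if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
        action += f'</execute_{lang}>'
    # Stop-word truncation can eat the final '>' of the closing tag.
    if '</file_edit' in action and '</file_edit>' not in action:
        action = action.replace('</file_edit', '</file_edit>')
    # Opened file_edit block with no closing tag at all: append one.
    if '<file_edit' in action and '</file_edit>' not in action:
        action += '</file_edit>'
    return action

print(close_unfinished_tags('<execute_ipython>print(1)'))
# -> '<execute_ipython>print(1)</execute_ipython>'
print(close_unfinished_tags('<file_edit path="a.py">x = 1</file_edit'))
# -> '<file_edit path="a.py">x = 1</file_edit>'
```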
poetry.lock (generated, 18 lines changed)

@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.

 [[package]]
 name = "aenum"
@@ -8512,6 +8512,20 @@ files = [
 sigtools = "4.0.1"
 typing-extensions = ">=4.6"

+[[package]]
+name = "tabulate"
+version = "0.9.0"
+description = "Pretty-print tabular data"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"},
+    {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"},
+]
+
+[package.extras]
+widechars = ["wcwidth"]
+
 [[package]]
 name = "tenacity"
 version = "8.5.0"
@@ -10099,4 +10113,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "880b0251ec1ac83a7a8f6b1637b0860f75553d1f9c7c67313af9f3bb686c166a"
+content-hash = "709ae467d042d1c9fa3799711f50445324420de32dc9552a42284aba99903981"
pyproject.toml

@@ -132,6 +132,7 @@ sympy = "*"
 gdown = "*"
 matplotlib = "*"
 seaborn = "*"
+tabulate = "*"

 [tool.poetry-dynamic-versioning]
 enable = true