mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
More integration tests info (#5319)
This commit is contained in:
parent
8f750de752
commit
ea994b6209
6
.github/workflows/integration-runner.yml
vendored
6
.github/workflows/integration-runner.yml
vendored
@ -141,8 +141,8 @@ jobs:
|
||||
id: create_comment
|
||||
uses: KeisukeYamashita/create-comment@v1
|
||||
with:
|
||||
# if triggered by PR, use PR number, otherwise use 5077 as fallback issue number for manual triggers
|
||||
number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }}
|
||||
# if triggered by PR, use PR number, otherwise use 5318 as fallback issue number for manual triggers
|
||||
number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5318 }}
|
||||
unique: false
|
||||
comment: |
|
||||
Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
|
||||
@ -155,4 +155,4 @@ jobs:
|
||||
DeepSeek LLM Test Results:
|
||||
${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
|
||||
---
|
||||
Download evaluation outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
|
||||
Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
|
||||
|
||||
@ -218,6 +218,8 @@ if __name__ == '__main__':
|
||||
)
|
||||
|
||||
df = pd.read_json(output_file, lines=True, orient='records')
|
||||
|
||||
# record success and reason for failure for the final report
|
||||
df['success'] = df['test_result'].apply(lambda x: x['success'])
|
||||
df['reason'] = df['test_result'].apply(lambda x: x['reason'])
|
||||
logger.info('-' * 100)
|
||||
@ -231,9 +233,16 @@ if __name__ == '__main__':
|
||||
)
|
||||
logger.info('-' * 100)
|
||||
|
||||
# record cost for each instance, with 3 decimal places
|
||||
df['cost'] = df['metrics'].apply(lambda x: round(x['accumulated_cost'], 3))
|
||||
logger.info(f'Total cost: USD {df["cost"].sum():.2f}')
|
||||
|
||||
report_file = os.path.join(metadata.eval_output_dir, 'report.md')
|
||||
with open(report_file, 'w') as f:
|
||||
f.write(
|
||||
f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})\n'
|
||||
)
|
||||
f.write(df[['instance_id', 'success', 'reason']].to_markdown(index=False))
|
||||
f.write(f'\nTotal cost: USD {df["cost"].sum():.2f}\n')
|
||||
f.write(
|
||||
df[['instance_id', 'success', 'reason', 'cost']].to_markdown(index=False)
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user