# Mirror of https://github.com/OpenHands/OpenHands.git
# (synced 2025-12-26 05:48:36 +08:00; 200 lines, 8.6 KiB, YAML)
name: Run Integration Tests

# Triggers: a label being added to a pull request, a manual dispatch that
# requires a free-text reason, or the nightly schedule.
on:
  pull_request:
    types: [labeled]
  workflow_dispatch:
    inputs:
      reason:
        description: 'Reason for manual trigger'
        required: true
        default: ''
  schedule:
    - cron: '30 22 * * *'  # Runs at 10:30pm UTC every day

# Workflow-wide environment, visible to every step of every job.
env:
  N_PROCESSES: 10  # Global configuration for number of parallel processes for evaluation

jobs:
  # Runs the integration-test evaluation three times (CodeActAgent with Haiku,
  # CodeActAgent with DeepSeek, VisualBrowsingAgent with DeepSeek), archives
  # the outputs, uploads them as an artifact and posts a summary comment.
  run-integration-tests:
    # Only run for the 'integration-test' label, manual dispatch, or the schedule.
    if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
    runs-on: blacksmith-4vcpu-ubuntu-2204
    permissions:
      contents: "read"
      id-token: "write"
      pull-requests: "write"
      issues: "write"
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install poetry via pipx
        run: pipx install poetry

      - name: Set up Python
        uses: useblacksmith/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"

      - name: Setup Node.js
        uses: useblacksmith/setup-node@v5
        with:
          node-version: '22.x'

      - name: Comment on PR if 'integration-test' label is present
        if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.

      - name: Install Python dependencies using Poetry
        run: poetry install --with dev,test,runtime,evaluation

      # Each "Configure config.toml" step overwrites config.toml with a single
      # [llm.eval] section built from the step's LLM_* environment variables.
      # NOTE(review): MAX_ITERATIONS is exported to these steps but the scripts
      # below never reference it - confirm whether it is still needed.
      - name: Configure config.toml for testing with Haiku
        env:
          LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          MAX_ITERATIONS: 10
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"$LLM_MODEL\"" >> config.toml
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Build environment
        run: make build

      # Each "Run integration test evaluation" step runs run_infer.sh, locates
      # the first matching report.md, and exports its contents as a multi-line
      # environment variable (NAME<<EOF ... EOF) for the final comment step.
      - name: Run integration test evaluation for Haiku
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' 10 $N_PROCESSES '' 'haiku_run'

          # get integration tests report
          REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE_HAIKU"
          echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE_HAIKU >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Wait a little bit
        run: sleep 10

      - name: Configure config.toml for testing with DeepSeek
        env:
          LLM_MODEL: "litellm_proxy/deepseek-chat"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          MAX_ITERATIONS: 10
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"$LLM_MODEL\"" >> config.toml
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Run integration test evaluation for DeepSeek
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' 10 $N_PROCESSES '' 'deepseek_run'

          # get integration tests report
          REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"
          echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE_DEEPSEEK >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      # -------------------------------------------------------------
      # Run VisualBrowsingAgent tests for DeepSeek, limited to t05 and t06
      - name: Wait a little bit (again)
        run: sleep 5

      - name: Configure config.toml for testing VisualBrowsingAgent (DeepSeek)
        env:
          LLM_MODEL: "litellm_proxy/deepseek-chat"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          MAX_ITERATIONS: 15
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"$LLM_MODEL\"" >> config.toml
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      # NOTE(review): the test filter below names one test without a suffix
      # (t05_simple_browsing) and one with ".py" (t06_github_pr_browsing.py).
      # How run_infer.sh matches these is not visible here - confirm that both
      # spellings select the intended tests.
      - name: Run integration test evaluation for VisualBrowsingAgent (DeepSeek)
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD VisualBrowsingAgent '' 15 $N_PROCESSES "t05_simple_browsing,t06_github_pr_browsing.py" 'visualbrowsing_deepseek_run'

          # Find and export the visual browsing agent test results
          REPORT_FILE_VISUALBROWSING_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/VisualBrowsingAgent/deepseek*_maxiter_15_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE_VISUALBROWSING_DEEPSEEK: $REPORT_FILE_VISUALBROWSING_DEEPSEEK"
          echo "INTEGRATION_TEST_REPORT_VISUALBROWSING_DEEPSEEK<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE_VISUALBROWSING_DEEPSEEK >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Create archive of evaluation outputs
        run: |
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          cd evaluation/evaluation_outputs/outputs  # Change to the outputs directory
          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* integration_tests/VisualBrowsingAgent/*  # Only include the actual result directories

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v4
        id: upload_results_artifact
        with:
          name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }}
          path: integration_tests_*.tar.gz

      - name: Get artifact URLs
        run: |
          echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV

      # Context values are handed to the script through env instead of being
      # expanded into the script text: the dispatch reason is free text, and
      # inline ${{ }} expansion would let it be interpreted as shell code.
      - name: Set timestamp and trigger reason
        env:
          EVENT_NAME: ${{ github.event_name }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          DISPATCH_REASON: ${{ github.event.inputs.reason }}
        run: |
          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
          if [[ "$EVENT_NAME" == "pull_request" ]]; then
            echo "TRIGGER_REASON=pr-$PR_NUMBER" >> $GITHUB_ENV
          elif [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            # Flatten the reason to one line so it cannot add extra
            # variables to the GITHUB_ENV file.
            REASON_ONE_LINE=$(printf '%s' "$DISPATCH_REASON" | tr '\r\n' '  ')
            echo "TRIGGER_REASON=manual-$REASON_ONE_LINE" >> $GITHUB_ENV
          else
            echo "TRIGGER_REASON=nightly-scheduled" >> $GITHUB_ENV
          fi

      - name: Comment with results and artifact link
        id: create_comment
        uses: KeisukeYamashita/create-comment@v1
        with:
          # if triggered by PR, use PR number, otherwise use 9745 as fallback issue number for manual triggers
          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 9745 }}
          unique: false
          comment: |
            Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
            Commit: ${{ github.sha }}
            **Integration Tests Report (Haiku)**
            Haiku LLM Test Results:
            ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
            ---
            **Integration Tests Report (DeepSeek)**
            DeepSeek LLM Test Results:
            ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
            ---
            **Integration Tests Report VisualBrowsing (DeepSeek)**
            ${{ env.INTEGRATION_TEST_REPORT_VISUALBROWSING_DEEPSEEK }}
            ---
            Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})