Merge branch 'main' into rb/experimental-ui

This commit is contained in:
Robert Brennan
2024-08-09 13:54:03 -04:00
529 changed files with 31360 additions and 8552 deletions

68
.github/workflows/clean-up.yml vendored Normal file
View File

@@ -0,0 +1,68 @@
# Workflow that cleans up outdated and old workflow runs to prevent out-of-disk issues.
# It is started manually (workflow_dispatch); every form input below is forwarded
# to the Mattraks/delete-workflow-runs action in the single `del_runs` job.
name: Delete old workflow runs
on:
  workflow_dispatch:
    inputs:
      # Forwarded as `retain_days`.
      days:
        description: 'Days-worth of runs to keep for each workflow'
        required: true
        default: '30'
      # Forwarded as `keep_minimum_runs`.
      minimum_runs:
        description: 'Minimum runs to keep for each workflow'
        required: true
        default: '10'
      delete_workflow_pattern:
        description: 'Name or filename of the workflow (if not set, all workflows are targeted)'
        required: false
      delete_workflow_by_state_pattern:
        description: 'Filter workflows by state: active, deleted, disabled_fork, disabled_inactivity, disabled_manually'
        required: true
        default: 'ALL'
        type: choice
        # Keep this list in sync with the states named in the description above.
        options:
          - 'ALL'
          - active
          - deleted
          - disabled_fork
          - disabled_inactivity
          - disabled_manually
      delete_run_by_conclusion_pattern:
        description: 'Remove runs based on conclusion: action_required, cancelled, failure, skipped, success'
        required: true
        default: 'ALL'
        type: choice
        options:
          - 'ALL'
          # Convenience entry; the job maps it to the comma-separated list after the colon.
          - 'Unsuccessful: action_required,cancelled,failure,skipped'
          - action_required
          - cancelled
          - failure
          - skipped
          - success
      # No default is declared for this input.
      # NOTE(review): confirm how the action treats an empty value before adding one.
      dry_run:
        description: 'Logs simulated changes, no deletions are performed'
        required: false

jobs:
  del_runs:
    runs-on: ubuntu-latest
    # The job deletes workflow runs, so it asks for write access to Actions only.
    permissions:
      actions: write
      contents: read
    steps:
      - name: Delete workflow runs
        uses: Mattraks/delete-workflow-runs@v2
        with:
          token: ${{ github.token }}
          repository: ${{ github.repository }}
          retain_days: ${{ github.event.inputs.days }}
          keep_minimum_runs: ${{ github.event.inputs.minimum_runs }}
          delete_workflow_pattern: ${{ github.event.inputs.delete_workflow_pattern }}
          delete_workflow_by_state_pattern: ${{ github.event.inputs.delete_workflow_by_state_pattern }}
          # The 'Unsuccessful: ...' choice is a display label, so translate it into
          # the plain comma-separated list; any other choice is passed through as-is.
          delete_run_by_conclusion_pattern: >-
            ${{
              startsWith(github.event.inputs.delete_run_by_conclusion_pattern, 'Unsuccessful:')
              && 'action_required,cancelled,failure,skipped'
              || github.event.inputs.delete_run_by_conclusion_pattern
            }}
          dry_run: ${{ github.event.inputs.dry_run }}

View File

@@ -1,3 +1,4 @@
# Workflow that builds and deploys the documentation website
name: Deploy Docs to GitHub Pages
on:
@@ -5,10 +6,13 @@ on:
branches:
- main
pull_request:
paths:
- 'docs/**'
branches:
- main
jobs:
# Build the documentation website
build:
name: Build Docusaurus
runs-on: ubuntu-latest
@@ -25,23 +29,23 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
python-version: '3.11'
- name: Generate Python Docs
run: rm -rf docs/modules/python && pip install pydoc-markdown && pydoc-markdown
- name: Install dependencies
run: cd docs && npm ci
- name: Build website
run: cd docs && npm run build
- name: Upload Build Artifact
if: github.ref == 'refs/heads/main'
uses: actions/upload-pages-artifact@v3
with:
path: docs/build
# Deploy the documentation website
deploy:
name: Deploy to GitHub Pages
runs-on: ubuntu-latest
needs: build
if: github.ref == 'refs/heads/main' && github.repository == 'OpenDevin/OpenDevin'
# Grant GITHUB_TOKEN the permissions required to make a Pages deployment
@@ -52,7 +56,6 @@ jobs:
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
runs-on: ubuntu-latest
steps:
- name: Deploy to GitHub Pages
id: deployment

View File

@@ -1,3 +1,4 @@
# Workflow that uses the DummyAgent to run a simple task
name: Run E2E test with dummy agent
concurrency:
@@ -10,9 +11,6 @@ on:
- main
pull_request:
env:
PERSIST_SANDBOX : "false"
jobs:
test:
runs-on: ubuntu-latest

View File

@@ -1,3 +1,4 @@
# Workflow that builds, tests and then pushes the docker images to the ghcr.io repository
name: Build Publish and Test Runtime Image
concurrency:
@@ -19,25 +20,21 @@ on:
default: ''
jobs:
# Builds the OpenDevin Docker images
ghcr_build:
runs-on: ubuntu-latest
outputs:
tags: ${{ steps.capture-tags.outputs.tags }}
permissions:
contents: read
packages: write
strategy:
matrix:
image: ["sandbox", "opendevin"]
platform: ["amd64", "arm64"]
image: ['opendevin']
platform: ['amd64', 'arm64']
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
@@ -52,51 +49,43 @@ jobs:
large-packages: true
docker-images: false
swap-storage: true
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
- name: Build and export image
id: build
run: ./containers/build.sh ${{ matrix.image }} ${{ github.repository_owner }} ${{ matrix.platform }}
- name: Capture tags
id: capture-tags
run: |
tags=$(cat tags.txt)
echo "tags=$tags"
echo "tags=$tags" >> $GITHUB_OUTPUT
- name: Upload Docker image as artifact
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.image }}-docker-image-${{ matrix.platform }}
path: /tmp/${{ matrix.image }}_image_${{ matrix.platform }}.tar
retention-days: 14
# Builds the runtime Docker images
ghcr_build_runtime:
runs-on: ubuntu-latest
outputs:
tags: ${{ steps.capture-tags.outputs.tags }}
permissions:
contents: read
packages: write
strategy:
matrix:
image: ["od_runtime"]
base_image: ["ubuntu:22.04"]
platform: ["amd64", "arm64"]
image: ['od_runtime']
base_image: ['ubuntu:22.04']
platform: ['amd64', 'arm64']
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
@@ -111,67 +100,65 @@ jobs:
large-packages: true
docker-images: false
swap-storage: true
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
- name: Install poetry via pipx
run: pipx install poetry
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: "poetry"
python-version: '3.11'
cache: 'poetry'
- name: Install Python dependencies using Poetry
run: make install-python-dependencies
- name: Create source distribution and Dockerfile
run: poetry run python3 opendevin/runtime/utils/runtime_build.py --base_image ${{ matrix.base_image }} --build_folder containers/runtime
- name: Build and export image
id: build
run: ./containers/build.sh ${{ matrix.image }} ${{ github.repository_owner }} ${{ matrix.platform }}
run: |
if [ -f 'containers/runtime/Dockerfile' ]; then
echo 'Dockerfile detected, building runtime image...'
./containers/build.sh ${{ matrix.image }} ${{ github.repository_owner }} ${{ matrix.platform }}
else
echo 'No Dockerfile detected which means an exact image is already built. Pulling the image and saving it to a tar file...'
source containers/runtime/config.sh
echo "$DOCKER_IMAGE_TAG $DOCKER_IMAGE_HASH_TAG" >> tags.txt
echo "Pulling image $DOCKER_IMAGE/$DOCKER_IMAGE_HASH_TAG to /tmp/${{ matrix.image }}_image_${{ matrix.platform }}.tar"
docker pull $DOCKER_IMAGE:$DOCKER_IMAGE_HASH_TAG
docker save $DOCKER_IMAGE:$DOCKER_IMAGE_HASH_TAG -o /tmp/${{ matrix.image }}_image_${{ matrix.platform }}.tar
fi
- name: Capture tags
id: capture-tags
run: |
tags=$(cat tags.txt)
echo "tags=$tags"
echo "tags=$tags" >> $GITHUB_OUTPUT
- name: Upload Docker image as artifact
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.image }}-docker-image-${{ matrix.platform }}
path: /tmp/${{ matrix.image }}_image_${{ matrix.platform }}.tar
retention-days: 14
# Run unit tests with the EventStream and Server runtime Docker images
test_runtime:
name: Test Runtime
runs-on: ubuntu-latest
needs: [ghcr_build_runtime, ghcr_build]
env:
PERSIST_SANDBOX: "false"
strategy:
matrix:
runtime_type: ["eventstream", "server"]
runtime_type: ['eventstream']
steps:
- uses: actions/checkout@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# when set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
@@ -179,33 +166,27 @@ jobs:
haskell: true
large-packages: true
swap-storage: true
- name: Install poetry via pipx
run: pipx install poetry
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: "poetry"
python-version: '3.11'
cache: 'poetry'
- name: Install Python dependencies using Poetry
run: make install-python-dependencies
- name: Download Runtime Docker image
if: matrix.runtime_type == 'eventstream'
uses: actions/download-artifact@v4
with:
name: od_runtime-docker-image-amd64
path: /tmp/
- name: Download Sandbox Docker image
if: matrix.runtime_type == 'server'
uses: actions/download-artifact@v4
with:
name: sandbox-docker-image-amd64
path: /tmp/
- name: Load Runtime image and run runtime tests
run: |
# Load the Docker image and capture the output
@@ -222,50 +203,47 @@ jobs:
echo "Loaded Docker image: $image_name"
TEST_RUNTIME=${{ matrix.runtime_type }} SANDBOX_USER_ID=$(id -u) SANDBOX_CONTAINER_IMAGE=$image_name TEST_IN_CI=true poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml -s ./tests/unit/test_runtime.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
integration_tests_on_linux:
name: Integration Tests on Linux
# Run integration tests with the eventstream runtime Docker image
runtime_integration_tests_on_linux:
name: Runtime Integration Tests on Linux
runs-on: ubuntu-latest
needs: ghcr_build
env:
PERSIST_SANDBOX: "false"
needs: [ghcr_build_runtime]
strategy:
fail-fast: false
matrix:
python-version: ["3.11"]
sandbox: ["ssh", "local"]
python-version: ['3.11']
# server is tested in a separate workflow
runtime_type: ['eventstream']
steps:
- uses: actions/checkout@v4
- name: Install poetry via pipx
run: pipx install poetry
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: 'poetry'
- name: Install Python dependencies using Poetry
run: make install-python-dependencies
- name: Download sandbox Docker image
- name: Download Runtime Docker image
uses: actions/download-artifact@v4
with:
name: sandbox-docker-image-amd64
name: od_runtime-docker-image-amd64
path: /tmp/
- name: Load sandbox image and run integration tests
env:
SANDBOX_BOX_TYPE: ${{ matrix.sandbox }}
- name: Load runtime image and run integration tests
run: |
# Load the Docker image and capture the output
output=$(docker load -i /tmp/sandbox_image_amd64.tar)
if [ "${{ matrix.runtime_type }}" == "eventstream" ]; then
output=$(docker load -i /tmp/od_runtime_image_amd64.tar)
else
echo "No Runtime Docker image to load"
exit 1
fi
# Extract the first image name from the output
image_name=$(echo "$output" | grep -oP 'Loaded image: \K.*' | head -n 1)
@@ -273,49 +251,40 @@ jobs:
# Print the full name of the image
echo "Loaded Docker image: $image_name"
SANDBOX_CONTAINER_IMAGE=$image_name TEST_IN_CI=true TEST_ONLY=true ./tests/integration/regenerate.sh
TEST_RUNTIME=${{ matrix.runtime_type }} SANDBOX_USER_ID=$(id -u) SANDBOX_CONTAINER_IMAGE=$image_name TEST_IN_CI=true TEST_ONLY=true ./tests/integration/regenerate.sh
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
# Push the OpenDevin and sandbox Docker images to the ghcr.io repository
ghcr_push:
runs-on: ubuntu-latest
# don't push if integration tests or sandbox tests fail
needs: [ghcr_build, test_runtime, integration_tests_on_linux]
needs: [ghcr_build]
if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
env:
tags: ${{ needs.ghcr_build.outputs.tags }}
permissions:
contents: read
packages: write
strategy:
matrix:
image: ["sandbox", "opendevin"]
platform: ["amd64", "arm64"]
image: ['opendevin']
platform: ['amd64', 'arm64']
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Login to GHCR
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Download Docker images
uses: actions/download-artifact@v4
with:
name: ${{ matrix.image }}-docker-image-${{ matrix.platform }}
path: /tmp/${{ matrix.platform }}
- name: Load images and push to registry
run: |
mv /tmp/${{ matrix.platform }}/${{ matrix.image }}_image_${{ matrix.platform }}.tar .
@@ -330,28 +299,23 @@ jobs:
docker push $image_name:${tag}_${{ matrix.platform }}
done
# Push the runtime Docker images to the ghcr.io repository
ghcr_push_runtime:
runs-on: ubuntu-latest
# don't push if runtime tests fail
needs: [ghcr_build_runtime, test_runtime, integration_tests_on_linux]
needs: [ghcr_build_runtime, test_runtime, runtime_integration_tests_on_linux]
if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
env:
tags: ${{ needs.ghcr_build_runtime.outputs.tags }}
RUNTIME_TAGS: ${{ needs.ghcr_build_runtime.outputs.tags }}
permissions:
contents: read
packages: write
strategy:
matrix:
image: ["od_runtime"]
platform: ["amd64", "arm64"]
image: ['od_runtime']
platform: ['amd64', 'arm64']
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
@@ -362,25 +326,21 @@ jobs:
large-packages: true
docker-images: false
swap-storage: true
- name: Login to GHCR
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Download Docker images
uses: actions/download-artifact@v4
with:
name: ${{ matrix.image }}-docker-image-${{ matrix.platform }}
path: /tmp/${{ matrix.platform }}
- name: List downloaded files
run: |
ls -la /tmp/${{ matrix.platform }}
file /tmp/${{ matrix.platform }}/*
- name: Load images and push to registry
run: |
mv /tmp/${{ matrix.platform }}/${{ matrix.image }}_image_${{ matrix.platform }}.tar ./${{ matrix.image }}_image_${{ matrix.platform }}.tar
@@ -389,46 +349,40 @@ jobs:
exit 1
fi
echo "loaded image = $loaded_image"
tags=$(echo ${tags} | tr ' ' '\n')
image_name=$(echo "ghcr.io/${{ github.repository_owner }}/${{ matrix.image }}" | tr '[:upper:]' '[:lower:]')
echo "image name = $image_name"
for tag in $tags; do
echo "$RUNTIME_TAGS" | tr ' ' '\n' | while read -r tag; do
echo "tag = $tag"
if [ -n "$image_name" ]; then
if [ -n "$image_name" ] && [ -n "$tag" ]; then
docker tag $loaded_image $image_name:${tag}_${{ matrix.platform }}
docker push $image_name:${tag}_${{ matrix.platform }}
else
echo "Skipping tag and push due to empty image_name"
echo "Skipping tag and push due to empty image_name or tag"
fi
done
# Creates and pushes the OpenDevin and sandbox Docker image manifests
create_manifest:
runs-on: ubuntu-latest
needs: [ghcr_build, ghcr_push]
if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
env:
tags: ${{ needs.ghcr_build.outputs.tags }}
strategy:
matrix:
image: ["sandbox", "opendevin"]
image: ['opendevin']
permissions:
contents: read
packages: write
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Login to GHCR
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Create and push multi-platform manifest
run: |
image_name=$(echo "ghcr.io/${{ github.repository_owner }}/${{ matrix.image }}" | tr '[:upper:]' '[:lower:]')
@@ -441,33 +395,28 @@ jobs:
$image_name:${tag}_arm64
done
# Creates and pushes the runtime Docker image manifest
create_manifest_runtime:
runs-on: ubuntu-latest
needs: [ghcr_build_runtime, ghcr_push_runtime]
if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
env:
tags: ${{ needs.ghcr_build_runtime.outputs.tags }}
strategy:
matrix:
image: ["od_runtime"]
image: ['od_runtime']
permissions:
contents: read
packages: write
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Login to GHCR
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Create and push multi-platform manifest
run: |
image_name=$(echo "ghcr.io/${{ github.repository_owner }}/${{ matrix.image }}" | tr '[:upper:]' '[:lower:]')

View File

@@ -1,3 +1,4 @@
# Workflow that runs lint on the frontend and python code
name: Lint
concurrency:
@@ -11,27 +12,26 @@ on:
pull_request:
jobs:
# Run lint on the frontend code
lint-frontend:
name: Lint frontend
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install Node.js 20
uses: actions/setup-node@v4
with:
node-version: 20
- name: Install dependencies
run: |
cd frontend
npm install --frozen-lockfile
- name: Lint
run: |
cd frontend
npm run lint
# Run lint on the python code
lint-python:
name: Lint python
runs-on: ubuntu-latest

View File

@@ -1,3 +1,4 @@
# Workflow that uses OpenDevin to review a pull request. PR must be labeled 'review-this'
name: Use OpenDevin to Review Pull Request
on:
@@ -22,16 +23,13 @@ jobs:
run: |
sudo apt-get install -y git gh
git config --global --add safe.directory $PWD
- name: Checkout Repository
uses: actions/checkout@v4
with:
ref: ${{ github.event.pull_request.base.ref }} # check out the target branch
- name: Download Diff
run: |
curl -O "${{ github.event.pull_request.diff_url }}" -L
- name: Write Task File
run: |
echo "Your coworker wants to apply a pull request to this project." > task.txt
@@ -45,19 +43,16 @@ jobs:
echo "${{ github.event.pull_request.body }}" >> task.txt
echo "" >> task.txt
echo "Diff file is: ${{ github.event.pull_request.number }}.diff" >> task.txt
- name: Set up environment
run: |
curl -sSL https://install.python-poetry.org | python3 -
export PATH="/github/home/.local/bin:$PATH"
poetry install --without evaluation,llama-index
poetry run playwright install --with-deps chromium
- name: Run OpenDevin
env:
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_MODEL: ${{ vars.LLM_MODEL }}
SANDBOX_BOX_TYPE: ssh
run: |
# Append path to launch poetry
export PATH="/github/home/.local/bin:$PATH"
@@ -67,7 +62,6 @@ jobs:
export WORKSPACE_BASE=$GITHUB_WORKSPACE
echo -e "/exit\n" | poetry run python opendevin/core/main.py -i 50 -f task.txt
rm task.txt
- name: Check if review file is non-empty
id: check_file
run: |
@@ -76,7 +70,6 @@ jobs:
echo "non_empty=true" >> $GITHUB_OUTPUT
fi
shell: bash
- name: Create PR review if file is non-empty
env:
GH_TOKEN: ${{ github.token }}

View File

@@ -1,3 +1,4 @@
# Workflow that runs frontend and python unit tests
name: Run Unit Tests
concurrency:
@@ -15,63 +16,52 @@ on:
- 'evaluation/**'
pull_request:
env:
PERSIST_SANDBOX : "false"
jobs:
# Run frontend unit tests
fe-test:
runs-on: ubuntu-latest
strategy:
matrix:
node-version: [20]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set up Node.js
uses: actions/setup-node@v4
with:
node-version: ${{ matrix.node-version }}
- name: Install dependencies
working-directory: ./frontend
run: npm ci
- name: Run tests and collect coverage
working-directory: ./frontend
run: npm run test:coverage
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
# Run python unit tests on macOS
test-on-macos:
name: Test on macOS
runs-on: macos-12
env:
INSTALL_DOCKER: "1" # Set to '0' to skip Docker installation
INSTALL_DOCKER: '1' # Set to '0' to skip Docker installation
strategy:
matrix:
python-version: ["3.11"]
python-version: ['3.11']
steps:
- uses: actions/checkout@v4
- name: Install poetry via pipx
run: pipx install poetry
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: "poetry"
cache: 'poetry'
- name: Install Python dependencies using Poetry
run: poetry install --without evaluation,llama-index
- name: Install & Start Docker
if: env.INSTALL_DOCKER == '1'
run: |
@@ -120,47 +110,39 @@ jobs:
# For testcontainers to find the Colima socket
# https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
sudo ln -sf $HOME/.colima/default/docker.sock /var/run/docker.sock
- name: Build Environment
run: make build
- name: Run Tests
run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox.py and not test_runtime.py"
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
# Run python unit tests on Linux
test-on-linux:
name: Test on Linux
runs-on: ubuntu-latest
env:
INSTALL_DOCKER: "0" # Set to '0' to skip Docker installation
INSTALL_DOCKER: '0' # Set to '0' to skip Docker installation
strategy:
matrix:
python-version: ["3.11"]
python-version: ['3.11']
steps:
- uses: actions/checkout@v4
- name: Install poetry via pipx
run: pipx install poetry
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: "poetry"
cache: 'poetry'
- name: Install Python dependencies using Poetry
run: poetry install --without evaluation,llama-index
- name: Build Environment
run: make build
- name: Run Tests
run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox.py and not test_runtime.py"
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
env:

View File

@@ -1,3 +1,4 @@
# Workflow that uses OpenDevin to resolve a GitHub issue. Issue must be labeled 'solve-this'
name: Use OpenDevin to Resolve GitHub Issue
on:
@@ -17,14 +18,11 @@ jobs:
image: ghcr.io/opendevin/opendevin
volumes:
- /var/run/docker.sock:/var/run/docker.sock
steps:
- name: install git, github cli
run: apt-get install -y git gh
- name: Checkout Repository
uses: actions/checkout@v4
- name: Write Task File
env:
ISSUE_TITLE: ${{ github.event.issue.title }}
@@ -35,22 +33,18 @@ jobs:
echo "" >> task.txt
echo "BODY:" >> task.txt
echo "${ISSUE_BODY}" >> task.txt
- name: Set up environment
run: |
curl -sSL https://install.python-poetry.org | python3 -
export PATH="/github/home/.local/bin:$PATH"
poetry install --without evaluation,llama-index
poetry run playwright install --with-deps chromium
- name: Run OpenDevin
env:
ISSUE_TITLE: ${{ github.event.issue.title }}
ISSUE_BODY: ${{ github.event.issue.body }}
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SANDBOX_BOX_TYPE: ssh
run: |
# Append path to launch poetry
export PATH="/github/home/.local/bin:$PATH"
@@ -58,7 +52,6 @@ jobs:
export PYTHONPATH=$(pwd):$PYTHONPATH
WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE poetry run python ./opendevin/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
rm task.txt
- name: Setup Git, Create Branch, and Commit Changes
run: |
# Setup Git configuration
@@ -84,7 +77,6 @@ jobs:
# Push changes
git push --set-upstream origin $BRANCH_NAME
- name: Fetch Default Branch
env:
GH_TOKEN: ${{ github.token }}
@@ -93,7 +85,6 @@ jobs:
DEFAULT_BRANCH=$(gh repo view --json defaultBranchRef --jq .defaultBranchRef.name)
echo "Default branch is $DEFAULT_BRANCH"
echo "DEFAULT_BRANCH=$DEFAULT_BRANCH" >> $GITHUB_ENV
- name: Generate PR
env:
GH_TOKEN: ${{ github.token }}

View File

@@ -1,4 +1,6 @@
# Workflow that marks issues and PRs with no activity for 30 days with "Stale" and closes them after 7 more days of no activity
name: 'Close stale issues'
on:
schedule:
- cron: '30 1 * * *'
@@ -9,21 +11,9 @@ jobs:
steps:
- uses: actions/stale@v9
with:
# Aggressively close issues that have been explicitly labeled `age-out`
any-of-labels: age-out
stale-issue-message: 'This issue is stale because it has been open for 7 days with no activity. Remove stale label or comment or this will be closed in 1 day.'
close-issue-message: 'This issue was closed because it has been stalled for over 7 days with no activity.'
stale-pr-message: 'This PR is stale because it has been open for 7 days with no activity. Remove stale label or comment or this will be closed in 1 days.'
close-pr-message: 'This PR was closed because it has been stalled for over 7 days with no activity.'
days-before-stale: 7
days-before-close: 1
- uses: actions/stale@v9
with:
# Be more lenient with other issues
stale-issue-message: 'This issue is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
close-issue-message: 'This issue was closed because it has been stalled for over 30 days with no activity.'
stale-pr-message: 'This PR is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
close-pr-message: 'This PR was closed because it has been stalled for over 30 days with no activity.'
days-before-stale: 30
close-issue-message: 'This issue was closed because it has been stalled for over 30 days with no activity.'
close-pr-message: 'This PR was closed because it has been stalled for over 30 days with no activity.'
days-before-close: 7

View File

@@ -1,48 +0,0 @@
# Workflow that runs when a release is published: it writes the release tag into
# the `tool.poetry.version` field of pyproject.toml and commits the result to main.
name: Update pyproject.toml Version and Tags
on:
  release:
    types:
      - published
jobs:
  update-pyproject-and-tags:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Fetch all history for all branches and tags
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install toml
      # Strips the leading `refs/tags/` from GITHUB_REF and exports the remainder
      # as RELEASE_TAG for the following steps.
      - name: Get release tag
        id: get_release_tag
        run: echo "RELEASE_TAG=${GITHUB_REF#refs/tags/}" >> "$GITHUB_ENV"
      # The tag is read from the process environment instead of being pasted into
      # the Python source with `${{ }}`, so a tag name containing quotes cannot
      # alter the program text.
      - name: Update pyproject.toml with release tag
        run: |
          python -c "
          import os
          import toml
          with open('pyproject.toml', 'r') as f:
              data = toml.load(f)
          data['tool']['poetry']['version'] = os.environ['RELEASE_TAG']
          with open('pyproject.toml', 'w') as f:
              toml.dump(data, f)
          "
      - name: Commit and push pyproject.toml changes
        uses: stefanzweifel/git-auto-commit-action@v4
        with:
          commit_message: "Update pyproject.toml version to ${{ env.RELEASE_TAG }}"
          branch: main
          file_pattern: pyproject.toml

4
.gitignore vendored
View File

@@ -169,6 +169,10 @@ evaluation/outputs
evaluation/swe_bench/eval_workspace*
evaluation/SWE-bench/data
evaluation/webarena/scripts/webarena_env.sh
evaluation/bird/data
evaluation/gaia/data
evaluation/gorilla/data
evaluation/toolqa/data
# frontend

View File

@@ -23,9 +23,6 @@ RESET=$(shell tput -Txterm sgr0)
build:
@echo "$(GREEN)Building project...$(RESET)"
@$(MAKE) -s check-dependencies
ifeq ($(INSTALL_DOCKER),)
@$(MAKE) -s pull-docker-image
endif
@$(MAKE) -s install-python-dependencies
@$(MAKE) -s install-frontend-dependencies
@$(MAKE) -s install-pre-commit-hooks
@@ -124,11 +121,6 @@ check-poetry:
exit 1; \
fi
pull-docker-image:
@echo "$(YELLOW)Pulling Docker image...$(RESET)"
@docker pull $(DOCKER_IMAGE)
@echo "$(GREEN)Docker image pulled successfully.$(RESET)"
install-python-dependencies:
@echo "$(GREEN)Installing Python dependencies...$(RESET)"
@if [ -z "${TZ}" ]; then \
@@ -246,16 +238,6 @@ setup-config-prompts:
workspace_dir=$${workspace_dir:-$(DEFAULT_WORKSPACE_DIR)}; \
echo "workspace_base=\"$$workspace_dir\"" >> $(CONFIG_FILE).tmp
@read -p "Do you want to persist the sandbox container? [true/false] [default: false]: " persist_sandbox; \
persist_sandbox=$${persist_sandbox:-false}; \
if [ "$$persist_sandbox" = "true" ]; then \
read -p "Enter a password for the sandbox container: " ssh_password; \
echo "ssh_password=\"$$ssh_password\"" >> $(CONFIG_FILE).tmp; \
echo "persist_sandbox=$$persist_sandbox" >> $(CONFIG_FILE).tmp; \
else \
echo "persist_sandbox=$$persist_sandbox" >> $(CONFIG_FILE).tmp; \
fi
@echo "" >> $(CONFIG_FILE).tmp
@echo "[llm]" >> $(CONFIG_FILE).tmp
@@ -316,4 +298,4 @@ help:
@echo " $(GREEN)help$(RESET) - Display this help message, providing information on available targets."
# Phony targets
.PHONY: build check-dependencies check-python check-npm check-docker check-poetry pull-docker-image install-python-dependencies install-frontend-dependencies install-pre-commit-hooks lint start-backend start-frontend run run-wsl setup-config setup-config-prompts help
.PHONY: build check-dependencies check-python check-npm check-docker check-poetry install-python-dependencies install-frontend-dependencies install-pre-commit-hooks lint start-backend start-frontend run run-wsl setup-config setup-config-prompts help

View File

@@ -24,7 +24,7 @@
<a href="https://github.com/OpenDevin/OpenDevin/issues"><img src="https://img.shields.io/github/issues/opendevin/opendevin?style=for-the-badge&color=blue" alt="Issues"></a>
<a href="https://github.com/OpenDevin/OpenDevin/blob/main/LICENSE"><img src="https://img.shields.io/github/license/opendevin/opendevin?style=for-the-badge&color=blue" alt="MIT License"></a>
<br/>
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2i1iqdag6-bVmvamiPA9EZUu7oCO6KhA"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
<a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
<a href="https://codecov.io/github/opendevin/opendevin?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/opendevin/opendevin?style=for-the-badge"></a>
</div>
@@ -66,7 +66,7 @@ docker run -it \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name opendevin-app-$(date +%Y%m%d%H%M%S) \
ghcr.io/opendevin/opendevin
ghcr.io/opendevin/opendevin:0.8
```
> [!NOTE]
@@ -111,7 +111,7 @@ For details, please check [CONTRIBUTING.md](./CONTRIBUTING.md).
Whether you're a developer, a researcher, or simply enthusiastic about OpenDevin, we'd love to have you in our community.
Let's make software engineering better together!
- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) - Here we talk about research, architecture, and future development.
- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw) - Here we talk about research, architecture, and future development.
- [Discord server](https://discord.gg/ESHStjSjD4) - This is a community-run server for general discussion, questions, and feedback.
## 📈 Progress
@@ -141,12 +141,12 @@ Distributed under the MIT License. See [`LICENSE`](./LICENSE) for more informati
```
@misc{opendevin,
title={{OpenDevin: An Open Platform for AI Software Developers as Generalist Agents}},
title={{OpenDevin: An Open Platform for AI Software Developers as Generalist Agents}},
author={Xingyao Wang and Boxuan Li and Yufan Song and Frank F. Xu and Xiangru Tang and Mingchen Zhuge and Jiayi Pan and Yueqi Song and Bowen Li and Jaskirat Singh and Hoang H. Tran and Fuqiang Li and Ren Ma and Mingzhang Zheng and Bill Qian and Yanjun Shao and Niklas Muennighoff and Yizhe Zhang and Binyuan Hui and Junyang Lin and Robert Brennan and Hao Peng and Heng Ji and Graham Neubig},
year={2024},
eprint={2407.16741},
archivePrefix={arXiv},
primaryClass={cs.SE},
url={https://arxiv.org/abs/2407.16741},
url={https://arxiv.org/abs/2407.16741},
}
```

View File

@@ -7,6 +7,7 @@ from agenthub.browsing_agent.response_parser import BrowsingResponseParser
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.message import Message, TextContent
from opendevin.events.action import (
Action,
AgentFinishAction,
@@ -136,7 +137,7 @@ class BrowsingAgent(Agent):
- MessageAction(content) - Message action to run (e.g. ask for clarification)
- AgentFinishAction() - end the interaction
"""
messages = []
messages: list[Message] = []
prev_actions = []
cur_axtree_txt = ''
error_prefix = ''
@@ -191,20 +192,23 @@ class BrowsingAgent(Agent):
)
return MessageAction('Error encountered when browsing.')
if (goal := state.get_current_user_intent()) is None:
goal, _ = state.get_current_user_intent()
if goal is None:
goal = state.inputs['task']
system_msg = get_system_message(
goal,
self.action_space.describe(with_long_description=False, with_examples=True),
)
messages.append({'role': 'system', 'content': system_msg})
messages.append(Message(role='system', content=[TextContent(text=system_msg)]))
prompt = get_prompt(error_prefix, cur_axtree_txt, prev_action_str)
messages.append({'role': 'user', 'content': prompt})
messages.append(Message(role='user', content=[TextContent(text=prompt)]))
logger.debug(prompt)
response = self.llm.completion(
messages=messages,
messages=[message.model_dump() for message in messages],
temperature=0.0,
stop=[')```', ')\n```'],
)

View File

@@ -8,6 +8,7 @@ from agenthub.codeact_agent.prompt import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.message import ImageContent, Message, TextContent
from opendevin.events.action import (
Action,
AgentDelegateAction,
@@ -131,7 +132,7 @@ class CodeActAgent(Agent):
return action.thought
return ''
def get_action_message(self, action: Action) -> dict[str, str] | None:
def get_action_message(self, action: Action) -> Message | None:
if (
isinstance(action, AgentDelegateAction)
or isinstance(action, CmdRunAction)
@@ -139,39 +140,41 @@ class CodeActAgent(Agent):
or isinstance(action, MessageAction)
or (isinstance(action, AgentFinishAction) and action.source == 'agent')
):
return {
'role': 'user' if action.source == 'user' else 'assistant',
'content': self.action_to_str(action),
}
content = [TextContent(text=self.action_to_str(action))]
if isinstance(action, MessageAction) and action.images_urls:
content.append(ImageContent(image_urls=action.images_urls))
return Message(
role='user' if action.source == 'user' else 'assistant', content=content
)
return None
def get_observation_message(self, obs: Observation) -> dict[str, str] | None:
def get_observation_message(self, obs: Observation) -> Message | None:
max_message_chars = self.llm.config.max_message_chars
if isinstance(obs, CmdOutputObservation):
content = 'OBSERVATION:\n' + truncate_content(
obs.content, max_message_chars
)
content += (
text = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
text += (
f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
)
return {'role': 'user', 'content': content}
return Message(role='user', content=[TextContent(text=text)])
elif isinstance(obs, IPythonRunCellObservation):
content = 'OBSERVATION:\n' + obs.content
text = 'OBSERVATION:\n' + obs.content
# replace base64 images with a placeholder
splitted = content.split('\n')
splitted = text.split('\n')
for i, line in enumerate(splitted):
if '![image](data:image/png;base64,' in line:
splitted[i] = (
'![image](data:image/png;base64, ...) already displayed to user'
)
content = '\n'.join(splitted)
content = truncate_content(content, max_message_chars)
return {'role': 'user', 'content': content}
text = '\n'.join(splitted)
text = truncate_content(text, max_message_chars)
return Message(role='user', content=[TextContent(text=text)])
elif isinstance(obs, AgentDelegateObservation):
content = 'OBSERVATION:\n' + truncate_content(
text = 'OBSERVATION:\n' + truncate_content(
str(obs.outputs), max_message_chars
)
return {'role': 'user', 'content': content}
return Message(role='user', content=[TextContent(text=text)])
return None
def reset(self) -> None:
@@ -198,10 +201,10 @@ class CodeActAgent(Agent):
return AgentFinishAction()
# prepare what we want to send to the LLM
messages: list[dict[str, str]] = self._get_messages(state)
messages = self._get_messages(state)
response = self.llm.completion(
messages=messages,
messages=[message.model_dump() for message in messages],
stop=[
'</execute_ipython>',
'</execute_bash>',
@@ -211,11 +214,11 @@ class CodeActAgent(Agent):
)
return self.action_parser.parse(response)
def _get_messages(self, state: State) -> list[dict[str, str]]:
def _get_messages(self, state: State) -> list[Message]:
system_message: str = get_system_message(state.prompt_context)
messages = [
{'role': 'system', 'content': system_message},
{'role': 'user', 'content': self.in_context_example},
Message(role='system', content=[TextContent(text=system_message)]),
Message(role='user', content=[TextContent(text=self.in_context_example)]),
]
for event in state.history.get_events():
@@ -229,18 +232,44 @@ class CodeActAgent(Agent):
# add regular message
if message:
messages.append(message)
# handle error if the message is the SAME role as the previous message
# litellm.exceptions.BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
# there should not have two consecutive messages from the same role
if messages and messages[-1].role == message.role:
messages[-1].content.extend(message.content)
else:
messages.append(message)
# the latest user message is important:
# we want to remind the agent of the environment constraints
latest_user_message = next(
(m for m in reversed(messages) if m['role'] == 'user'), None
(
m
for m in reversed(messages)
if m.role == 'user'
and any(isinstance(c, TextContent) for c in m.content)
),
None,
)
# add a reminder to the prompt
# Get the last user text inside content
if latest_user_message:
latest_user_message['content'] += (
f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>'
latest_user_message_text = next(
(
t
for t in reversed(latest_user_message.content)
if isinstance(t, TextContent)
)
)
# add a reminder to the prompt
reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
if latest_user_message_text:
latest_user_message_text.text = (
latest_user_message_text.text + reminder_text
)
else:
latest_user_message_text = TextContent(text=reminder_text)
latest_user_message.content.append(latest_user_message_text)
return messages

View File

@@ -7,6 +7,7 @@ from agenthub.codeact_swe_agent.prompt import (
from agenthub.codeact_swe_agent.response_parser import CodeActSWEResponseParser
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.message import ImageContent, Message, TextContent
from opendevin.events.action import (
Action,
AgentFinishAction,
@@ -84,40 +85,43 @@ class CodeActSWEAgent(Agent):
return action.content
return ''
def get_action_message(self, action: Action) -> dict[str, str] | None:
def get_action_message(self, action: Action) -> Message | None:
if (
isinstance(action, CmdRunAction)
or isinstance(action, IPythonRunCellAction)
or isinstance(action, MessageAction)
):
return {
'role': 'user' if action.source == 'user' else 'assistant',
'content': self.action_to_str(action),
}
content = [TextContent(text=self.action_to_str(action))]
if isinstance(action, MessageAction) and action.images_urls:
content.append(ImageContent(image_urls=action.images_urls))
return Message(
role='user' if action.source == 'user' else 'assistant', content=content
)
return None
def get_observation_message(self, obs: Observation) -> dict[str, str] | None:
def get_observation_message(self, obs: Observation) -> Message | None:
max_message_chars = self.llm.config.max_message_chars
if isinstance(obs, CmdOutputObservation):
content = 'OBSERVATION:\n' + truncate_content(
obs.content, max_message_chars
)
content += (
text = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
text += (
f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
)
return {'role': 'user', 'content': content}
return Message(role='user', content=[TextContent(text=text)])
elif isinstance(obs, IPythonRunCellObservation):
content = 'OBSERVATION:\n' + obs.content
text = 'OBSERVATION:\n' + obs.content
# replace base64 images with a placeholder
splitted = content.split('\n')
splitted = text.split('\n')
for i, line in enumerate(splitted):
if '![image](data:image/png;base64,' in line:
splitted[i] = (
'![image](data:image/png;base64, ...) already displayed to user'
)
content = '\n'.join(splitted)
content = truncate_content(content, max_message_chars)
return {'role': 'user', 'content': content}
text = '\n'.join(splitted)
text = truncate_content(text, max_message_chars)
return Message(role='user', content=[TextContent(text=text)])
return None
def reset(self) -> None:
@@ -143,10 +147,10 @@ class CodeActSWEAgent(Agent):
return AgentFinishAction()
# prepare what we want to send to the LLM
messages: list[dict[str, str]] = self._get_messages(state)
messages: list[Message] = self._get_messages(state)
response = self.llm.completion(
messages=messages,
messages=[message.model_dump() for message in messages],
stop=[
'</execute_ipython>',
'</execute_bash>',
@@ -156,10 +160,10 @@ class CodeActSWEAgent(Agent):
return self.response_parser.parse(response)
def _get_messages(self, state: State) -> list[dict[str, str]]:
messages = [
{'role': 'system', 'content': self.system_message},
{'role': 'user', 'content': self.in_context_example},
def _get_messages(self, state: State) -> list[Message]:
messages: list[Message] = [
Message(role='system', content=[TextContent(text=self.system_message)]),
Message(role='user', content=[TextContent(text=self.in_context_example)]),
]
for event in state.history.get_events():
@@ -173,18 +177,38 @@ class CodeActSWEAgent(Agent):
# add regular message
if message:
messages.append(message)
# handle error if the message is the SAME role as the previous message
# litellm.exceptions.BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
# there should not have two consecutive messages from the same role
if messages and messages[-1].role == message.role:
messages[-1].content.extend(message.content)
else:
messages.append(message)
# the latest user message is important:
# we want to remind the agent of the environment constraints
latest_user_message = next(
(m for m in reversed(messages) if m['role'] == 'user'), None
(m for m in reversed(messages) if m.role == 'user'), None
)
# add a reminder to the prompt
# Get the last user text inside content
if latest_user_message:
latest_user_message['content'] += (
f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task.'
latest_user_message_text = next(
(
t
for t in reversed(latest_user_message.content)
if isinstance(t, TextContent)
)
)
# add a reminder to the prompt
reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
if latest_user_message_text:
latest_user_message_text.text = (
latest_user_message_text.text + reminder_text
)
else:
latest_user_message_text = TextContent(text=reminder_text)
latest_user_message.content.append(latest_user_message_text)
return messages

View File

@@ -34,7 +34,7 @@ class DelegatorAgent(Agent):
"""
if self.current_delegate == '':
self.current_delegate = 'study'
task = state.get_current_user_intent()
task, _ = state.get_current_user_intent()
return AgentDelegateAction(
agent='StudyRepoForTaskAgent', inputs={'task': task}
)
@@ -45,7 +45,7 @@ class DelegatorAgent(Agent):
if not isinstance(last_observation, AgentDelegateObservation):
raise Exception('Last observation is not an AgentDelegateObservation')
goal = state.get_current_user_intent()
goal, _ = state.get_current_user_intent()
if self.current_delegate == 'study':
self.current_delegate = 'coder'
return AgentDelegateAction(

View File

@@ -208,9 +208,3 @@ class DummyAgent(Agent):
f' Unable to perform interactive browsing: {action.browser_actions}'
)
return MessageAction(content=message)
async def get_working_directory(self, state: State) -> str:
# Implement this method to return the current working directory
# This might involve accessing state information or making an async call
# For now, we'll return a placeholder value
return './workspace'

View File

@@ -2,6 +2,7 @@ from jinja2 import BaseLoader, Environment
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.message import ImageContent, Message, TextContent
from opendevin.core.utils import json
from opendevin.events.action import Action
from opendevin.events.serialization.action import action_from_dict
@@ -62,16 +63,20 @@ class MicroAgent(Agent):
del self.delegates[self.agent_definition['name']]
def step(self, state: State) -> Action:
last_user_message, last_image_urls = state.get_current_user_intent()
prompt = self.prompt_template.render(
state=state,
instructions=instructions,
to_json=to_json,
history_to_json=self.history_to_json,
delegates=self.delegates,
latest_user_message=state.get_current_user_intent(),
latest_user_message=last_user_message,
)
messages = [{'content': prompt, 'role': 'user'}]
resp = self.llm.completion(messages=messages)
content = [TextContent(text=prompt)]
if last_image_urls:
content.append(ImageContent(image_urls=last_image_urls))
message = Message(role='user', content=content)
resp = self.llm.completion(messages=[message.model_dump()])
action_resp = resp['choices'][0]['message']['content']
action = parse_response(action_resp)
return action

View File

@@ -3,7 +3,7 @@
CommitWriterAgent can help write git commit message. Example:
```bash
WORKSPACE_MOUNT_PATH="`PWD`" SANDBOX_BOX_TYPE="ssh" \
WORKSPACE_MOUNT_PATH="`PWD`" \
poetry run python opendevin/core/main.py -t "dummy task" -c CommitWriterAgent -d ./
```

View File

@@ -1,11 +1,12 @@
from agenthub.planner_agent.response_parser import PlannerResponseParser
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.message import ImageContent, Message, TextContent
from opendevin.events.action import Action, AgentFinishAction
from opendevin.llm.llm import LLM
from opendevin.runtime.tools import RuntimeTool
from .prompt import get_prompt
from .prompt import get_prompt_and_images
class PlannerAgent(Agent):
@@ -42,7 +43,13 @@ class PlannerAgent(Agent):
'abandoned',
]:
return AgentFinishAction()
prompt = get_prompt(state, self.llm.config.max_message_chars)
messages = [{'content': prompt, 'role': 'user'}]
resp = self.llm.completion(messages=messages)
prompt, image_urls = get_prompt_and_images(
state, self.llm.config.max_message_chars
)
content = [TextContent(text=prompt)]
if image_urls:
content.append(ImageContent(image_urls=image_urls))
message = Message(role='user', content=content)
resp = self.llm.completion(messages=[message.model_dump()])
return self.response_parser.parse(resp)

View File

@@ -115,7 +115,9 @@ def get_hint(latest_action_id: str) -> str:
return hints.get(latest_action_id, '')
def get_prompt(state: State, max_message_chars: int) -> str:
def get_prompt_and_images(
state: State, max_message_chars: int
) -> tuple[str, list[str]]:
"""Gets the prompt for the planner agent.
Formatted with the most recent action-observation pairs, current task, and hint based on last action
@@ -161,16 +163,16 @@ def get_prompt(state: State, max_message_chars: int) -> str:
logger.info('HINT:\n' + hint, extra={'msg_type': 'DETAIL'})
# the last relevant user message (the task)
task = state.get_current_user_intent()
message, image_urls = state.get_current_user_intent()
# finally, fill in the prompt
return prompt % {
'task': task,
'task': message,
'plan': plan_str,
'history': history_str,
'hint': hint,
'plan_status': plan_status,
}
}, image_urls
def parse_response(response: str) -> Action:

View File

@@ -55,24 +55,11 @@ workspace_base = "./workspace"
# Path to rewrite the workspace mount path to
#workspace_mount_rewrite = ""
# Persist the sandbox
persist_sandbox = false
# Run as devin
#run_as_devin = true
# Runtime environment
#runtime = "server"
# SSH hostname for the sandbox
#ssh_hostname = "localhost"
# SSH password for the sandbox
#ssh_password = ""
# SSH port for the sandbox
#ssh_port = 63710
#runtime = "eventstream"
# Name of the default agent
#default_agent = "CodeActAgent"
@@ -183,9 +170,6 @@ llm_config = 'gpt3'
# Sandbox timeout in seconds
#timeout = 120
# Sandbox type (ssh, e2b, local)
#box_type = "ssh"
# Sandbox user ID
#user_id = 1000

View File

@@ -32,11 +32,13 @@ FROM python:3.12.3-slim AS runtime
WORKDIR /app
ARG OPEN_DEVIN_BUILD_VERSION #re-declare for this section
ENV RUN_AS_DEVIN=true
# A random number--we need this to be different from the user's UID on the host machine
ENV OPENDEVIN_USER_ID=42420
ENV SANDBOX_API_HOSTNAME=host.docker.internal
ENV USE_HOST_NETWORK=false
ENV SSH_HOSTNAME=host.docker.internal
ENV WORKSPACE_BASE=/opt/workspace_base
ENV OPEN_DEVIN_BUILD_VERSION=$OPEN_DEVIN_BUILD_VERSION
RUN mkdir -p $WORKSPACE_BASE
@@ -44,8 +46,10 @@ RUN mkdir -p $WORKSPACE_BASE
RUN apt-get update -y \
&& apt-get install -y curl ssh sudo
RUN sed -i 's/^UID_MIN.*/UID_MIN 499/' /etc/login.defs # Default is 1000, but OSX is often 501
RUN sed -i 's/^UID_MAX.*/UID_MAX 1000000/' /etc/login.defs # Default is 60000, but we've seen up to 200000
# Default is 1000, but OSX is often 501
RUN sed -i 's/^UID_MIN.*/UID_MIN 499/' /etc/login.defs
# Default is 60000, but we've seen up to 200000
RUN sed -i 's/^UID_MAX.*/UID_MAX 1000000/' /etc/login.defs
RUN groupadd app
RUN useradd -l -m -u $OPENDEVIN_USER_ID -s /bin/bash opendevin && \
@@ -66,6 +70,9 @@ RUN playwright install --with-deps chromium
COPY --chown=opendevin:app --chmod=770 ./opendevin ./opendevin
COPY --chown=opendevin:app --chmod=777 ./opendevin/runtime/plugins ./opendevin/runtime/plugins
COPY --chown=opendevin:app --chmod=770 ./agenthub ./agenthub
COPY --chown=opendevin:app --chmod=770 ./pyproject.toml ./pyproject.toml
COPY --chown=opendevin:app --chmod=770 ./poetry.lock ./poetry.lock
COPY --chown=opendevin:app --chmod=770 ./README.md ./README.md
RUN python opendevin/core/download.py # No-op to download assets
RUN chown -R opendevin:app /app/logs && chmod -R 770 /app/logs # This gets created by the download.py script

View File

@@ -53,6 +53,11 @@ fi
if [[ -n "$DOCKER_IMAGE_TAG" ]]; then
tags+=("$DOCKER_IMAGE_TAG")
fi
# If $DOCKER_IMAGE_HASH_TAG is set, add it to the tags
if [[ -n "$DOCKER_IMAGE_HASH_TAG" ]]; then
tags+=("$DOCKER_IMAGE_HASH_TAG")
fi
DOCKER_REPOSITORY="$DOCKER_REGISTRY/$DOCKER_ORG/$DOCKER_IMAGE"
DOCKER_REPOSITORY=${DOCKER_REPOSITORY,,} # lowercase

View File

@@ -4,5 +4,3 @@ DOCKER_BASE_DIR="./containers/runtime"
# These two variables will be appended by the runtime_build.py script
# DOCKER_IMAGE=
# DOCKER_IMAGE_TAG=
DOCKER_IMAGE=od_runtime
DOCKER_IMAGE_TAG=od_v0.8.1_image_ubuntu_tag_22.04

View File

@@ -4,7 +4,7 @@ import { themes as prismThemes } from "prism-react-renderer";
const config: Config = {
title: "OpenDevin",
tagline: "Code Less, Make More",
tagline: "An Open Platform for AI Software Developers as Generalist Agents",
favicon: "img/logo.png",
// Set the production url of your site here
@@ -32,6 +32,10 @@ const config: Config = {
},
},
markdown: {
mermaid: true,
},
themes: ['@docusaurus/theme-mermaid'],
presets: [
[
"classic",
@@ -77,7 +81,6 @@ const config: Config = {
position: "left",
label: "Codebase",
},
{ to: "/faq", label: "FAQ", position: "left" },
{
href: "https://github.com/OpenDevin/OpenDevin",
label: "GitHub",

View File

@@ -31,7 +31,7 @@ Pour plus de détails, veuillez consulter [ce document](https://github.com/OpenD
Nous avons maintenant à la fois un espace de travail Slack pour la collaboration sur la construction d'OpenDevin et un serveur Discord pour discuter de tout ce qui est lié, par exemple, à ce projet, aux LLM, aux agents, etc.
- [Espace de travail Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA)
- [Espace de travail Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw)
- [Serveur Discord](https://discord.gg/ESHStjSjD4)
Si vous souhaitez contribuer, n'hésitez pas à rejoindre notre communauté. Simplifions l'ingénierie logicielle ensemble !

View File

@@ -41,7 +41,6 @@ Créez un fichier ```config.toml``` dans le répertoire OpenDevin et entrez ces
```toml
[core]
workspace_base="./workspace"
persist_sandbox=false
run_as_devin=true
sandbox_container_image="image_personnalisée"
```
@@ -92,7 +91,6 @@ Si vous voyez cette erreur dans la sortie de la console, il s'agit du fait que O
```toml
[core]
workspace_base="./workspace"
persist_sandbox=false
run_as_devin=true
sandbox_container_image="image_personnalisée"
sandbox_user_id="1001"
@@ -104,4 +102,4 @@ Si vous voyez un message d'erreur indiquant que le port est utilisé ou indispon
## Discuter
Pour d'autres problèmes ou questions rejoignez le [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) ou le [Discord](https://discord.gg/ESHStjSjD4) et demandez!
Pour d'autres problèmes ou questions rejoignez le [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw) ou le [Discord](https://discord.gg/ESHStjSjD4) et demandez!

View File

@@ -42,7 +42,7 @@ Explorez le code source d'OpenDevin sur [GitHub](https://github.com/OpenDevin/Op
/>
</a>
<br></br>
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA">
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw">
<img
src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
alt="Join our Slack community"
@@ -72,8 +72,6 @@ WORKSPACE_BASE=$(pwd)/workspace
docker run -it \
--pull=always \
-e SANDBOX_USER_ID=$(id -u) \
-e PERSIST_SANDBOX="true" \
-e SSH_PASSWORD="make something up here" \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-v $WORKSPACE_BASE:/opt/workspace_base \
-v /var/run/docker.sock:/var/run/docker.sock \

View File

@@ -31,7 +31,7 @@ OpenDevin 是一个社区驱动的项目,我们欢迎每个人的贡献。无
我们现在有一个 Slack 工作区,用于合作建设 OpenDevin还设有一个 Discord 服务器用于讨论与该项目、LLM、代理等相关的任何事情。
- [Slack 工作区](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA)
- [Slack 工作区](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw)
- [Discord 服务器](https://discord.gg/ESHStjSjD4)
如果您愿意贡献,请随时加入我们的社区。让我们一起简化软件工程!

View File

@@ -40,7 +40,6 @@ docker build -t custom_image .
```
[core]
workspace_base="./workspace"
persist_sandbox=false
run_as_devin=true
sandbox_container_image="custom_image"
```
@@ -92,7 +91,6 @@ dockerfile_content = (
```
[core]
workspace_base="./workspace"
persist_sandbox=false
run_as_devin=true
sandbox_container_image="custom_image"
sandbox_user_id="1001"
@@ -104,4 +102,4 @@ sandbox_user_id="1001"
## 讨论
对于其他问题或疑问,请加入 [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) 或 [Discord](https://discord.gg/ESHStjSjD4),并提问!
对于其他问题或疑问,请加入 [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw) 或 [Discord](https://discord.gg/ESHStjSjD4),并提问!

View File

@@ -42,7 +42,7 @@ OpenDevin 是一个**自主 AI 软件工程师**,能够执行复杂的工程
/>
</a>
<br></br>
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA">
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw">
<img
src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
alt="Join our Slack community"
@@ -72,8 +72,6 @@ WORKSPACE_BASE=$(pwd)/workspace
docker run -it \
--pull=always \
-e SANDBOX_USER_ID=$(id -u) \
-e PERSIST_SANDBOX="true" \
-e SSH_PASSWORD="make something up here" \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-v $WORKSPACE_BASE:/opt/workspace_base \
-v /var/run/docker.sock:/var/run/docker.sock \

View File

@@ -31,7 +31,7 @@ For details, please check [this document](https://github.com/OpenDevin/OpenDevin
We have both a Slack workspace for collaborating on building OpenDevin and a Discord server for discussing anything related, e.g., this project, LLMs, agents, etc.
- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA)
- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw)
- [Discord server](https://discord.gg/ESHStjSjD4)
If you would love to contribute, feel free to join our community. Let's simplify software engineering together!

View File

@@ -70,7 +70,6 @@ Create a `config.toml` file in the OpenDevin directory and enter these contents:
```toml
[core]
workspace_base="./workspace"
persist_sandbox=false
run_as_devin=true
sandbox_container_image="custom_image"
```
@@ -129,7 +128,6 @@ If you see this error in the console output it is because OpenDevin is trying to
```toml
[core]
workspace_base="./workspace"
persist_sandbox=false
run_as_devin=true
sandbox_container_image="custom_image"
sandbox_user_id="1001"
@@ -141,4 +139,4 @@ If you see an error about a port being in use or unavailable, try deleting all r
## Discuss
For other issues or questions join the [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) or [Discord](https://discord.gg/ESHStjSjD4) and ask!
For other issues or questions join the [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw) or [Discord](https://discord.gg/ESHStjSjD4) and ask!

View File

@@ -0,0 +1,257 @@
---
sidebar_position: 6
---
# 📈 How to contribute to OpenDevin Evaluation Harness
This guide provides an overview of how to integrate your own evaluation benchmark into the OpenDevin framework.
## Before everything begins: Setup Environment and LLM Configuration
Please follow the instructions [here](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up your local development environment and LLM.
OpenDevin in development mode uses `config.toml` to keep track of most configurations.
Here's an example configuration file you can use to define and use multiple LLMs:
```toml
[llm]
# IMPORTANT: add your API key here, and set the model to the one you want to evaluate
model = "gpt-4o-2024-05-13"
api_key = "sk-XXX"
[llm.eval_gpt4_1106_preview_llm]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[llm.eval_some_openai_compatible_model_llm]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```
## How to use OpenDevin in the command line
OpenDevin can be run from the command line using the following format:
```bash
poetry run python ./opendevin/core/main.py \
-i <max_iterations> \
-t "<task_description>" \
-c <agent_class> \
-l <llm_config>
```
For example:
```bash
poetry run python ./opendevin/core/main.py \
-i 10 \
-t "Write me a bash script that prints hello world." \
-c CodeActAgent \
-l llm
```
This command runs OpenDevin with:
- A maximum of 10 iterations
- The specified task description
- Using the CodeActAgent
- With the LLM configuration defined in the `llm` section of your `config.toml` file
## How does OpenDevin work
The main entry point for OpenDevin is in `opendevin/core/main.py`. Here's a simplified flow of how it works:
1. Parse command-line arguments and load the configuration.
2. Create a runtime environment using `create_runtime()`.
3. Initialize the specified agent.
4. Run the controller using `run_controller()`, which:
   - Attaches the runtime to the agent
   - Executes the agent's task
   - Returns a final state when complete
The `run_controller()` function is the core of OpenDevin's execution. It manages the interaction between the agent, the runtime, and the task, handling things like user input simulation and event processing.
## Easiest way to get started: Exploring Existing Benchmarks
We encourage you to review the various evaluation benchmarks available in the [`evaluation/` directory](https://github.com/OpenDevin/OpenDevin/blob/main/evaluation) of our repository.
To integrate your own benchmark, we suggest starting with the one that most closely resembles your needs. This approach can significantly streamline your integration process, allowing you to build upon existing structures and adapt them to your specific requirements.
## How to create an evaluation workflow
To create an evaluation workflow for your benchmark, follow these steps:
1. Create a configuration:
```python
def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
    """Build the `AppConfig` used to evaluate one benchmark instance.

    Args:
        instance: The benchmark instance being evaluated. This template does
            not read it; it is available for per-instance settings.
        metadata: Evaluation metadata supplying the agent class, the
            iteration budget, and the LLM configuration.

    Returns:
        The configuration, with the LLM settings from `metadata` applied.
    """
    config = AppConfig(
        default_agent=metadata.agent_class,
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
            # Placeholder: replace with the image your benchmark runs in.
            container_image='your_container_image',
            enable_auto_lint=True,
            timeout=300,
        ),
    )
    config.set_llm_config(metadata.llm_config)
    return config
```
2. Initialize the runtime and set up the evaluation environment:
```python
async def initialize_runtime(runtime: Runtime, instance: pd.Series):
    """Prepare `runtime` for `instance` before the agent starts working.

    This is a template hook: replace the body with whatever setup your
    benchmark needs. As written it does nothing.
    """
    # Set up your evaluation environment here
    # For example, setting environment variables, preparing files, etc.
    pass
```
3. Create a function to process each instance:
```python
async def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
    """Run the agent on one benchmark instance and package the result.

    Args:
        instance: The benchmark instance; `instance.instance_id` is used as
            the runtime session id and as the id of the output record.
        metadata: Evaluation metadata, forwarded to `get_config` and
            `get_instruction` and stored in the output record.

    Returns:
        An `EvalOutput` holding the instruction, the evaluation result, the
        agent history, metrics (if any), and the last error (if any).

    Raises:
        ValueError: If `run_controller` returns no final state.
    """
    config = get_config(instance, metadata)
    runtime = await create_runtime(config, sid=instance.instance_id)
    await initialize_runtime(runtime, instance)

    instruction = get_instruction(instance, metadata)

    state = await run_controller(
        config=config,
        task_str=instruction,
        runtime=runtime,
        fake_user_response_fn=your_user_response_function,
    )
    # Fail with a clear message instead of an AttributeError below: the
    # history and metrics lookups both require a final state.
    if state is None:
        raise ValueError('State should not be None.')

    # Evaluate the agent's actions
    evaluation_result = await evaluate_agent_actions(runtime, instance)

    return EvalOutput(
        instance_id=instance.instance_id,
        instruction=instruction,
        test_result=evaluation_result,
        metadata=metadata,
        history=state.history.compatibility_for_eval_history_pairs(),
        metrics=state.metrics.get() if state.metrics else None,
        error=state.last_error if state.last_error else None,
    )
```
4. Run the evaluation:
```python
# Bundle the run settings into the metadata object handed to every instance.
metadata = make_metadata(llm_config, dataset_name, agent_class, max_iterations, eval_note, eval_output_dir)
# The output file lives inside the evaluation output directory.
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
# presumably selects the instances still to be evaluated, capped at eval_n_limit — confirm against prepare_dataset
instances = prepare_dataset(your_dataset, output_file, eval_n_limit)
# run_evaluation calls process_instance for the instances and handles
# parallelization (num_workers) and progress tracking.
await run_evaluation(
instances,
metadata,
output_file,
num_workers,
process_instance
)
```
This workflow sets up the configuration, initializes the runtime environment, processes each instance by running the agent and evaluating its actions, and then collects the results into an `EvalOutput` object. The `run_evaluation` function handles parallelization and progress tracking.
Remember to customize the `get_instruction`, `your_user_response_function`, and `evaluate_agent_actions` functions according to your specific benchmark requirements.
By following this structure, you can create a robust evaluation workflow for your benchmark within the OpenDevin framework.
The next section explains the `user_response_fn` and describes how it fits into the evaluation workflow.
## Understanding the `user_response_fn`
The `user_response_fn` is a crucial component in OpenDevin's evaluation workflow. It simulates user interaction with the agent, allowing for automated responses during the evaluation process. This function is particularly useful when you want to provide consistent, predefined responses to the agent's queries or actions.
### Workflow and Interaction
The workflow for handling actions and the `user_response_fn` is as follows:
1. Agent receives a task and starts processing
2. Agent emits an Action
3. If the Action is executable (e.g., CmdRunAction, IPythonRunCellAction):
   - The Runtime processes the Action
   - Runtime returns an Observation
4. If the Action is not executable (typically a MessageAction):
   - The `user_response_fn` is called
   - It returns a simulated user response
5. The agent receives either the Observation or the simulated response
6. Steps 2-5 repeat until the task is completed or max iterations are reached
Here's a visual representation of this workflow:
```
                 [Agent]
                    |
                    v
              [Emit Action]
                    |
                    v
          [Is Action Executable?]
           /                  \
         Yes                   No
          |                    |
          v                    v
      [Runtime]        [user_response_fn]
          |                    |
          v                    v
 [Return Observation]  [Simulated Response]
           \                  /
            \                /
             v              v
         [Agent receives feedback]
                    |
                    v
        [Continue or Complete Task]
```
In this workflow:
- Executable actions (like running commands or executing code) are handled directly by the Runtime.
- Non-executable actions (typically when the agent wants to communicate or ask for clarification) are handled by the `user_response_fn`.
- The agent then processes the feedback, whether it's an Observation from the Runtime or a simulated response from the `user_response_fn`.
This approach allows for automated handling of both concrete actions and simulated user interactions, making it suitable for evaluation scenarios where you want to test the agent's ability to complete tasks with minimal human intervention.
### Example Implementation
Here's an example of a `user_response_fn` used in the SWE-Bench evaluation:
```python
def codeact_user_response(state: State | None) -> str:
    """Return the canned reply used in place of a human user during evaluation.

    Args:
        state: The current agent state, or ``None``. When it is present and
            has a non-empty ``history``, the history's events are inspected.

    Returns:
        The standard "keep working" message. If the history already contains
        at least two ``MessageAction`` events whose ``source`` is ``'user'``,
        a hint telling the agent how to give up is appended.
    """
    msg = (
        'Please continue working on the task on whatever approach you think is suitable.\n'
        'If you think you have solved the task, please first send your answer to user through message and then <execute_bash> exit </execute_bash>.\n'
        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
    )
    if state and state.history:
        # Collect every message in the history that is attributed to the user.
        user_msgs = [
            event
            for event in state.history.get_events()
            if isinstance(event, MessageAction) and event.source == 'user'
        ]
        if len(user_msgs) >= 2:
            # Two or more user-sourced messages already exist, so also tell
            # the agent that it is allowed to give up.
            return (
                msg
                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
            )
    return msg
```
This function does the following:
1. Provides a standard message encouraging the agent to continue working.
2. Checks how many times the agent has attempted to communicate with the user.
3. If the agent has made multiple attempts, it provides an option to give up.
By using this function, you can ensure consistent behavior across multiple evaluation runs and prevent the agent from getting stuck waiting for human input.

View File

@@ -42,7 +42,7 @@ Explore the codebase of OpenDevin on [GitHub](https://github.com/OpenDevin/OpenD
/>
</a>
<br></br>
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA">
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw">
<img
src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
alt="Join our Slack community"

View File

@@ -158,8 +158,6 @@ spec:
env:
- name: SANDBOX_USER_ID
value: "1000"
- name: SANDBOX_BOX_TYPE
value: 'local'
- name: WORKSPACE_MOUNT_PATH
value: "/opt/workspace_base"
volumeMounts:
@@ -290,13 +288,13 @@ RUN mkdir -p /opt/workspace_base && chown -R 1000:1000 /opt/workspace_base
# Verify Git installation
RUN git --version
```
2. Mount a shared development directory "i.e. one hosted in EC2 instance" to the POD:
This can also be done by sharing the development directory with the worker node through file-sharing software (e.g., NFS), then creating a PV and PVC as described above to access that directory.
3. Not all Agents working! Just tested CoderAgent with an openai API key and produced results.
3. Not all Agents working! Just tested CoderAgent with an openai API key and produced results.
## Discuss
For other issues or questions join the [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) or [Discord](https://discord.gg/ESHStjSjD4) and ask!
For other issues or questions join the [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw) or [Discord](https://discord.gg/ESHStjSjD4) and ask!

View File

@@ -0,0 +1,181 @@
---
sidebar_position: 4
---
# 📦 EventStream Runtime
The OpenDevin EventStream Runtime is the core component that enables secure and flexible execution of AI agents' actions.
It creates a sandboxed environment using Docker, where arbitrary code can be run safely without risking the host system.
## Why do we need a sandboxed runtime?
OpenDevin needs to execute arbitrary code in a secure, isolated environment for several reasons:
1. Security: Executing untrusted code can pose significant risks to the host system. A sandboxed environment prevents malicious code from accessing or modifying the host system's resources.
2. Consistency: A sandboxed environment ensures that code execution is consistent across different machines and setups, eliminating "it works on my machine" issues.
3. Resource Control: Sandboxing allows for better control over resource allocation and usage, preventing runaway processes from affecting the host system.
4. Isolation: Different projects or users can work in isolated environments without interfering with each other or the host system.
5. Reproducibility: Sandboxed environments make it easier to reproduce bugs and issues, as the execution environment is consistent and controllable.
## How does our Runtime work?
The OpenDevin Runtime system uses a client-server architecture implemented with Docker containers. Here's an overview of how it works:
```mermaid
graph TD
A[User-provided Custom Docker Image] --> B[OpenDevin Backend]
B -->|Builds| C[OD Runtime Image]
C -->|Launches| D[Runtime Client]
D -->|Initializes| E[Browser]
D -->|Initializes| F[Bash Shell]
D -->|Initializes| G[Plugins]
G -->|Initializes| L[Jupyter Server]
B -->|Spawn| H[Agent]
B -->|Spawn| I[EventStream]
I <--->|Execute Action to
Get Observation
via REST API
| D
H -->|Generate Action| I
I -->|Obtain Observation| H
subgraph "Docker Container"
D
E
F
G
L
end
```
1. User Input: The user provides a custom base Docker image.
2. Image Building: OpenDevin builds a new Docker image (the "OD runtime image") based on the user-provided image. This new image includes OpenDevin-specific code, primarily the "runtime client."
3. Container Launch: When OpenDevin starts, it launches a Docker container using the OD runtime image.
4. Client Initialization: The runtime client initializes inside the container, setting up necessary components like a bash shell and loading any specified plugins.
5. Communication: The OpenDevin backend (`runtime.py`) communicates with the runtime client over RESTful API, sending actions and receiving observations.
6. Action Execution: The runtime client receives actions from the backend, executes them in the sandboxed environment, and sends back observations.
7. Observation Return: The client sends execution results back to the OpenDevin backend as observations.
The role of the client is crucial:
- It acts as an intermediary between the OpenDevin backend and the sandboxed environment.
- It executes various types of actions (shell commands, file operations, Python code, etc.) safely within the container.
- It manages the state of the sandboxed environment, including the current working directory and loaded plugins.
- It formats and returns observations to the backend, ensuring a consistent interface for processing results.
## Advanced: How OpenDevin builds and maintains OD Runtime images
OpenDevin uses a sophisticated approach to build and manage runtime images. This process ensures efficiency, consistency, and flexibility in creating and maintaining Docker images for both production and development environments.
Check out the [relevant code](https://github.com/OpenDevin/OpenDevin/blob/main/opendevin/runtime/utils/runtime_build.py) if you are interested in more details.
### Image Tagging System
OpenDevin uses a dual-tagging system for its runtime images to balance reproducibility with flexibility:
1. Hash-based tag: `{target_image_repo}:{target_image_hash_tag}`
Example: `od_runtime:abc123def456`
- This tag is based on the MD5 hash of the Docker build folder, which includes the source code (of runtime client and related dependencies) and Dockerfile.
- Identical hash tags guarantee that the images were built with exactly the same source code and Dockerfile.
- This ensures reproducibility: the same hash always means the same image contents.
2. Generic tag: `{target_image_repo}:{target_image_tag}`
Example: `od_runtime:od_v0.8.3_ubuntu_tag_22.04`
- This tag follows the format: `od_runtime:od_v{OD_VERSION}_{BASE_IMAGE_NAME}_tag_{BASE_IMAGE_TAG}`
- It represents the latest build for a particular base image and OpenDevin version combination.
- This tag is updated whenever a new image is built from the same base image, even if the source code changes.
The hash-based tag ensures exact reproducibility, while the generic tag provides a stable reference to the latest version of a particular configuration. This dual-tagging approach allows OpenDevin to efficiently manage both development and production environments.
### Build Process
1. Image Naming Convention:
- Hash-based tag: `{target_image_repo}:{target_image_hash_tag}`
Example: `od_runtime:abc123def456`
- Generic tag: `{target_image_repo}:{target_image_tag}`
Example: `od_runtime:od_v0.8.3_ubuntu_tag_22.04`
2. Build Process:
- a. Convert the base image name to an OD runtime image name.
Example: `ubuntu:22.04` -> `od_runtime:od_v0.8.3_ubuntu_tag_22.04`
- b. Generate a build context (Dockerfile and OpenDevin source code) and calculate its hash.
- c. Check for an existing image with the calculated hash.
- d. If not found, check for a recent compatible image to use as a base.
- e. If no compatible image exists, build from scratch using the original base image.
- f. Tag the new image with both hash-based and generic tags.
3. Image Reuse and Rebuilding Logic:
The system follows these steps to determine whether to build a new image or use an existing one from a user-provided (base) image (e.g., `ubuntu:22.04`):
a. If an image exists with the same hash (e.g., `od_runtime:abc123def456`), it will be reused as is.
b. If the exact hash is not found, the system will try to rebuild using the latest generic image (e.g., `od_runtime:od_v0.8.3_ubuntu_tag_22.04`) as a base. This saves time by leveraging existing dependencies.
c. If neither the hash-tagged nor the generic-tagged image is found, the system will build the image completely from scratch.
4. Caching and Efficiency:
- The system attempts to reuse existing images when possible to save build time.
- If an exact match (by hash) is found, it's used without rebuilding.
- If a compatible image is found, it's used as a base for rebuilding, saving time on dependency installation.
Here's a flowchart illustrating the build process:
```mermaid
flowchart TD
A[Start] --> B{Convert base image name}
B --> |ubuntu:22.04 -> od_runtime:od_v0.8.3_ubuntu_tag_22.04| C[Generate build context and hash]
C --> D{Check for existing image with hash}
D -->|Found od_runtime:abc123def456| E[Use existing image]
D -->|Not found| F{Check for od_runtime:od_v0.8.3_ubuntu_tag_22.04}
F -->|Found| G[Rebuild based on recent image]
F -->|Not found| H[Build from scratch]
G --> I[Tag with hash and generic tags]
H --> I
E --> J[End]
I --> J
```
This approach ensures that:
1. Identical source code and Dockerfile always produce the same image (via hash-based tags).
2. The system can quickly rebuild images when minor changes occur (by leveraging recent compatible images).
3. The generic tag (e.g., `od_runtime:od_v0.8.3_ubuntu_tag_22.04`) always points to the latest build for a particular base image and OpenDevin version combination.
By using this method, OpenDevin maintains an efficient and flexible system for building and managing runtime images, adapting to both development needs and production requirements.
## Advanced: Runtime Plugin System
The OpenDevin Runtime supports a plugin system that allows for extending functionality and customizing the runtime environment. Plugins are initialized when the runtime client starts up.
Check [an example of Jupyter plugin here](https://github.com/OpenDevin/OpenDevin/blob/9c44d94cef32e6426ebd8deeeb52963153b2348a/opendevin/runtime/plugins/jupyter/__init__.py#L30-L63) if you want to implement your own plugin.
*More details about the Plugin system are still under construction - contributions are welcomed!*
Key aspects of the plugin system:
1. Plugin Definition: Plugins are defined as Python classes that inherit from a base `Plugin` class.
2. Plugin Registration: Available plugins are registered in an `ALL_PLUGINS` dictionary.
3. Plugin Specification: Plugins are associated with `Agent.sandbox_plugins: list[PluginRequirement]`. Users can specify which plugins to load when initializing the runtime.
4. Initialization: Plugins are initialized asynchronously when the runtime client starts.
5. Usage: The runtime client can use initialized plugins to extend its capabilities (e.g., the JupyterPlugin for running IPython cells).

1121
docs/package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -18,6 +18,7 @@
"@docusaurus/core": "^3.4.0",
"@docusaurus/plugin-content-pages": "^3.4.0",
"@docusaurus/preset-classic": "^3.4.0",
"@docusaurus/theme-mermaid": "^3.4.0",
"@mdx-js/react": "^3.0.0",
"clsx": "^2.0.0",
"prism-react-renderer": "^2.3.0",

View File

@@ -17,11 +17,9 @@ function CustomFooter() {
</a>
</div>
</div>
<div className="footer-community">
<Translate id="footer.community">Community</Translate>
</div>
<div className="footer-icons">
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA" target="_blank" rel="noopener noreferrer">
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw" target="_blank" rel="noopener noreferrer">
<FaSlack />
</a>
<a href="https://discord.gg/ESHStjSjD4" target="_blank" rel="noopener noreferrer">

View File

@@ -6,7 +6,7 @@ export function Demo() {
return (
<div
style={{ paddingBottom: "30px", paddingTop: "20px", textAlign: "center" }}
style={{ paddingBottom: "10px", paddingTop: "10px", textAlign: "center" }}
>
<video
playsInline

View File

@@ -14,15 +14,28 @@ export function HomepageHeader() {
<Heading as="h1" className="header-title">
{siteConfig.title}
</Heading>
<p className="header-subtitle">{siteConfig.tagline}</p>
<div className="header-buttons">
<Link
className="button button--secondary button--lg"
to="/modules/usage/intro"
>
<Translate id="homepage.getStarted">Get Started</Translate>
</Link>
<div className="header-links">
<a href="https://github.com/OpenDevin/OpenDevin">
<img src="https://img.shields.io/badge/Code-Github-purple?logo=github&logoColor=white&style=for-the-badge" alt="Code" />
</a>
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2i1iqdag6-bVmvamiPA9EZUu7oCO6KhA">
<img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" />
</a>
<a href="https://discord.gg/ESHStjSjD4">
<img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community" />
</a>
<a href="https://arxiv.org/abs/2407.16741">
<img src="https://img.shields.io/badge/Paper-%20on%20Arxiv-red?logo=arxiv&style=for-the-badge" alt="Paper on Arxiv" />
</a>
<a href="https://huggingface.co/spaces/OpenDevin/evaluation">
<img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark" />
</a>
</div>
<Demo />
</div>
</div>

View File

@@ -1,20 +0,0 @@
import "../../css/welcome.css";
import Translate from '@docusaurus/Translate';
export function Welcome() {
return (
<div className="text-white">
<div className="welcome-container">
<img src="img/logo.png" className="welcome-logo" />
<p className="welcome-text">
<Translate id="welcome.message">
Welcome to OpenDevin, an open-source autonomous AI software engineer
that is capable of executing
complex engineering tasks and collaborating actively with users on
software development projects.
</Translate>
</p>
</div>
</div>
);
}

View File

@@ -1,66 +0,0 @@
/* faq.css */
/* Page wrapper: a vertical flex stack with 8px between children.
   `margin: auto` is set first; `margin-bottom` then overrides the bottom side. */
.faq-container {
margin: auto;
padding: 24px;
display: flex;
flex-direction: column;
gap: 8px;
margin-bottom: 24px;
}
/* Page title: centered on both axes, bold and uppercased. */
.faq-title {
display: flex;
align-items: center;
justify-content: center;
font-size: 2rem;
padding: 8px;
text-transform: uppercase;
font-weight: bold;
}
/* Enlarge the title on viewports at least 1024px wide. */
@media (min-width: 1024px) {
.faq-title {
font-size: 6rem;
}
}
/* One question/answer group: full-width vertical stack. */
.faq-section {
display: flex;
flex-direction: column;
gap: 8px;
width: 100%;
margin-bottom: 24px;
}
/* Heading of a section: bold, uppercased, with widened letter spacing. */
.faq-section-title {
text-transform: uppercase;
font-weight: bold;
font-size: 2rem;
letter-spacing: 0.1em;
}
/* Emphasized inline text; the color comes from the `--logo` custom property,
   which is not defined in this file. */
.highlight {
font-weight: 600;
color: var(--logo);
}
/* Indent ordered lists inside the steps block. */
.faq-steps ol {
padding-left: 24px;
}
/* Grey box for command/prompt text. Note that its contents are uppercased
   and the height is fixed relative to the viewport (6vh). */
.command-box {
display: flex;
flex-direction: column;
padding: 8px;
background-color: #e0e0e0;
border-radius: 0.375rem;
height: 6vh;
text-transform: uppercase;
color: #4a5568;
}
/* A command box that immediately follows another command box is taller. */
.command-box + .command-box {
height: 8vh;
}

View File

@@ -3,12 +3,12 @@
.custom-footer {
background-color: dark;
color: white;
height: 25vh;
height: 200px;
/* background: linear-gradient(to bottom, #1a1a1a, #1a1a1a); */
background: linear-gradient(to bottom, #1f2937, #000000);
}
.footer-content {
display: flex;
flex-direction: column;
@@ -17,56 +17,55 @@
padding: 8px;
height: 100%;
}
.footer-top {
display: flex;
gap: 8px;
align-items: center;
}
.footer-title {
font-weight: bold;
font-size: 1.125rem;
}
@media (min-width: 768px) {
.footer-title {
font-size: 1.875rem;
}
}
.footer-link a {
font-size: 0.875rem;
text-decoration: none;
color: gray;
transition: color 0.3s ease;
}
.footer-link a:hover {
color: white;
}
.footer-community {
text-transform: uppercase;
font-weight: 300;
}
.footer-icons {
display: flex;
gap: 24px;
font-size: 1.875rem;
}
.footer-icons a {
color:gray;
transition: color 0.3s ease;
}
.footer-icons a:hover {
color: white;
}
.footer-bottom {
text-transform: uppercase;
}

View File

@@ -1,36 +1,47 @@
/* homepageHeader.css */
.homepage-header {
height: 100vh;
color: white;
background: linear-gradient(to top, #64748b, #000000);
}
.header-content {
display: flex;
flex-direction: column;
gap: 8px;
align-items: center;
padding: 24px;
font-weight: 300;
width: 100%;
}
height: 800px;
color: white;
background: linear-gradient(to top, #64748b, #000000);
}
.header-content {
display: flex;
flex-direction: column;
align-items: center;
padding: 2rem;
font-weight: 300;
width: 100%;
}
.header-title {
font-size: 3rem;
}
@media (min-width: 768px) {
.header-title {
font-size: 3rem;
font-size: 4rem;
}
@media (min-width: 768px) {
.header-title {
font-size: 5rem;
}
}
.header-subtitle {
font-size: 1.25rem;
}
.header-buttons {
margin-top: 24px;
}
}
.header-subtitle {
font-size: 1.5rem;
}
.header-links {
display: flex;
flex-wrap: wrap;
justify-content: center;
gap: 10px;
max-width: 680px;
}
.header-links a {
display: inline-block;
transition: transform 0.2s ease-in-out;
}
.header-links a:hover {
transform: translateY(-2px);
}

View File

@@ -1,53 +0,0 @@
/* welcome.css */

/* Utility class: force white text. */
.text-white {
  color: white;
}

/* Banner wrapper: centered flex box whose children stack vertically by default. */
.welcome-container {
  display: flex;
  justify-content: center;
  align-items: center;
  flex-direction: column;
  background: linear-gradient(to bottom, #64748b, #1f2937);
}

/* From 768px up, lay the children out in a row. The gradient from the base
   rule above still applies, so it is not repeated here. */
@media (min-width: 768px) {
  .welcome-container {
    flex-direction: row;
  }
}

/* Logo sized relative to the viewport by default. */
.welcome-logo {
  height: 45vh;
  width: 45vw;
}

/* Up to 640px wide: square logo, 40% of the viewport width. */
@media (max-width: 640px) {
  .welcome-logo {
    height: 40vw;
    width: 40vw;
  }
}

/* From 768px up: fixed 350px width, height follows the image's aspect ratio. */
@media (min-width: 768px) {
  .welcome-logo {
    height: auto;
    width: 350px;
  }
}

/* Introduction paragraph. */
.welcome-text {
  padding: 24px;
  margin-bottom: 24px;
  font-weight: 300;
  font-size: 1.125rem;
}

/* From 768px up: tighter padding and larger text. */
@media (min-width: 768px) {
  .welcome-text {
    padding: 8px;
    font-size: 1.5rem;
  }
}

View File

@@ -1,129 +0,0 @@
import Layout from '@theme/Layout';
import '../css/faq.css';
import Translate, { translate } from '@docusaurus/Translate';
export default function FAQ() {
const githubLink = (
<a href="https://github.com/OpenDevin/OpenDevin/issues" target="_blank">GitHub</a>
);
const discordLink = (
<a href="https://discord.gg/mBuDGRzzES" target="_blank">Discord</a>
);
const slackLink = (
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA" target="_blank">Slack</a>
);
return (
<Layout
title={translate({ id: 'faq.title', message: 'FAQ' })}
description={translate({ id: 'faq.description', message: 'Frequently Asked Questions' })}
>
<div id="faq" className="faq-container">
<div className="faq-title">
<Translate id="faq.title" description="FAQ Title">Frequently Asked Questions</Translate>
</div>
<div className="faq-section">
<div className="faq-section-title">
<Translate id="faq.section.title.1" description="First Section Title">What is OpenDevin?</Translate>
</div>
<p>
<span className="highlight"><Translate id="faq.section.highlight" description="Highlight Text">OpenDevin</Translate></span>{" "}
<Translate id="faq.section.description.1" description="Description for OpenDevin">
is an autonomous software engineer that can solve software engineering
and web-browsing tasks end-to-end. It can perform data science queries, such
as "Find the number of pull requests to the OpenDevin repository in the last
month," and software engineering tasks, such as "Please add tests to this
file and verify that all the tests pass. If they don't fix the file."
</Translate>
</p>
<p>
<Translate id="faq.section.description.2" description="Further Description for OpenDevin">
At the same time, OpenDevin is a platform and community for agent developers
to test out and evaluate new agents.
</Translate>
</p>
</div>
<div className="faq-section">
<div className="faq-section-title">
<Translate id="faq.section.title.2" description="Support Section Title">Support</Translate>
</div>
<div>
<Translate
id="faq.section.support.answer"
description="Support Answer"
values={{
githubLink: githubLink,
discordLink: discordLink,
slackLink: slackLink,
}}
>
{`Please file a bug on {githubLink} if you notice a problem that likely affects others. If you're having trouble installing, or have general questions, reach out on {discordLink} or {slackLink}.`}
</Translate>
</div>
</div>
<div className="faq-section">
<div className="faq-section-title">
<Translate id="faq.section.title.3" description="GitHub Issue Section Title">How to fix a GitHub issue with OpenDevin?</Translate>
</div>
<div className="faq-steps">
<Translate id="faq.section.github.steps.intro" description="GitHub Steps Introduction">
To fix an issue on GitHub using OpenDevin, send a prompt to OpenDevin asking it to follow
steps like the following:
</Translate>
<ol>
<li><Translate id="faq.section.github.step1" description="GitHub Step 1">Read the issue https://github.com/OpenDevin/OpenDevin/issues/1611</Translate></li>
<li><Translate id="faq.section.github.step2" description="GitHub Step 2">Clone the repository and check out a new branch</Translate></li>
<li><Translate id="faq.section.github.step3" description="GitHub Step 3">Based on the instructions in the issue description, modify files to fix the issue</Translate></li>
<li><Translate id="faq.section.github.step4" description="GitHub Step 4">Push the resulting output to GitHub using the GITHUB_TOKEN environment variable</Translate></li>
<li><Translate id="faq.section.github.step5" description="GitHub Step 5">Tell me the link that I need to go to to send a pull request</Translate></li>
</ol>
<Translate id="faq.section.github.steps.preRun" description="GitHub Steps Pre-Run">
Before you run OpenDevin, you can do:
</Translate>
<div className="command-box">
export SANDBOX_ENV_GITHUB_TOKEN=XXX
</div>
<Translate id="faq.section.github.steps.tokenInfo" description="GitHub Steps Token Info">
where XXX is a GitHub token that you created that has permissions to push to the OpenDevin repo. If you dont have write permission to the OpenDevin repo, you might need to change that to:
</Translate>
<div className="command-box">
Push the resulting output to my fork at https://github.com/USERNAME/OpenDevin/ using the GITHUB_TOKEN environment variable
</div>
<Translate id="faq.section.github.steps.usernameInfo" description="GitHub Steps Username Info">
where USERNAME is your GitHub username.
</Translate>
</div>
</div>
<div className="faq-section">
<div className="faq-section-title">
<Translate id="faq.section.title.4" description="Devin Section Title">How is OpenDevin different from Devin?</Translate>
</div>
<p>
<a href="https://www.cognition.ai/blog/introducing-devin"><Translate id="faq.section.devin.linkText" description="Devin Link Text">Devin</Translate></a>&nbsp;
<Translate id="faq.section.devin.description" description="Devin Description">
is a commercial product by Cognition Inc., that served as the initial
inspiration for OpenDevin. They both aim to do a good job at solving software
engineering tasks, but OpenDevin you can download, use, and modify, while Devin
you can only use through the Cognition site. In addition, OpenDevin has evolved
beyond the initial inspiration, and now serves as a community-driven ecosystem for
agent development in general, and we'd love to have you join and
</Translate>
<a href="https://github.com/OpenDevin/OpenDevin/blob/main/CONTRIBUTING.md"><Translate id="faq.section.devin.contribute" description="Contribute Link">contribute</Translate></a>!
</p>
</div>
<div className="faq-section">
<div className="faq-section-title">
<Translate id="faq.section.title.5" description="ChatGPT Section Title">How is OpenDevin different from ChatGPT?</Translate>
</div>
<p>
<Translate id="faq.section.chatgpt.description" description="ChatGPT Description">
ChatGPT you can access online, it does not interface with local files, and
its ability to execute code is limited. So it can write code, but it is not
easy to test or execute it.
</Translate>
</p>
</div>
</div>
</Layout>
);
}

View File

@@ -4,12 +4,11 @@ import { HomepageHeader } from "../components/HomepageHeader/HomepageHeader";
import { Welcome } from "../components/Welcome/Welcome";
import { translate } from '@docusaurus/Translate';
export function Header({ title, summary, description }): JSX.Element {
export function Header({ title, summary }): JSX.Element {
return (
<div>
<h1>{title}</h1>
<h2 style={{ fontSize: "40px" }}>{summary}</h2>
<h3 className="headerDescription">{description}</h3>
<h2 style={{ fontSize: "3rem" }}>{summary}</h2>
</div>
);
}
@@ -17,22 +16,15 @@ export function Header({ title, summary, description }): JSX.Element {
export default function Home(): JSX.Element {
const { siteConfig } = useDocusaurusContext();
return (
<>
<Layout
title={`${siteConfig.title}`}
description={translate({
id: 'homepage.description',
message: 'AI-powered code generation for software engineering.',
message: 'An Open Platform for AI Software Developers as Generalist Agents',
description: 'The homepage description',
})}
>
<div>
<HomepageHeader />
<div>
<Welcome />
</div>
</div>
<HomepageHeader />
</Layout>
</>
);
}

Binary file not shown.

View File

@@ -2,9 +2,10 @@
This folder contains the evaluation harness for evaluating agents on the Entity-deduction-Arena Benchmark, from the paper [Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games](https://arxiv.org/abs/2310.01468), presented at the ACL 2024 main conference.
## Configure OpenDevin and your LLM
## Setup Environment and LLM Configuration
Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.
## Start the evaluation

View File

@@ -1,30 +1,27 @@
import asyncio
import logging
import os
import pandas as pd
# import huggingface_hub
from datasets import load_dataset
from evaluation.EDA.game import Q20Game, Q20GameCelebrity
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
)
from opendevin.controller.agent import Agent
# from evaluation.EDA.scorer import question_scorer
from opendevin.controller.state.state import State
from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
from opendevin.core.logger import get_console_handler
from opendevin.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM
config = load_app_config()
from opendevin.core.main import create_runtime, run_controller
game = None
@@ -56,39 +53,44 @@ AGENT_CLS_TO_INST_SUFFIX = {
}
def process_instance(
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
config = AppConfig(
default_agent=metadata.agent_class,
run_as_devin=False,
runtime='eventstream',
max_iterations=metadata.max_iterations,
sandbox=SandboxConfig(
container_image='ubuntu:22.04',
enable_auto_lint=False,
use_host_network=False,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
return config
async def process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
) -> EvalOutput:
config = get_config(metadata)
instance_id = instance['text'].strip()
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
eval_output_dir = metadata.eval_output_dir
if reset_logger:
# Set up logger
log_file = os.path.join(
eval_output_dir, 'logs', f'instance_{instance["text"].strip()}.log'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {instance["text"].strip()}.\nLOG: tail -f {log_file}'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {instance_id}.')
# Prepare instruction
_game_class = {'things': Q20Game, 'celebs': Q20GameCelebrity}
_game_class = {'eda-things': Q20Game, 'eda-celebs': Q20GameCelebrity}
guesser_kargs = {
'max_new_tokens': 64,
@@ -112,24 +114,16 @@ def process_instance(
instruction = f'{game.first_user_utterance}'
logger.info(f'Instruction: {instruction}')
# instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime = await create_runtime(config, sid=instance['text'].strip())
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
max_budget_per_task=config.max_budget_per_task,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
sid=instance['text'].strip(),
)
state: State | None = await run_controller(
config=config,
task_str=instruction,
runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class],
)
# ======= Attempt to evaluate the agent's edits =======
# If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
@@ -150,21 +144,20 @@ def process_instance(
histories = state.history.compatibility_for_eval_history_pairs()
# Save the output
output = {
'instance_id': instance['text'].strip(),
'instance': instance,
'instruction': instruction,
'metadata': metadata.model_dump(),
'history': histories,
'metrics': metrics,
'error': state.last_error if state and state.last_error else None,
'test_result': {
output = EvalOutput(
instance_id=instance_id,
instance=instance.to_dict(),
instruction=instruction,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result={
'success': test_result,
'final_message': final_message,
'ground_truth': instance['text'],
},
}
)
return output
@@ -191,12 +184,16 @@ if __name__ == '__main__':
)
args, _ = parser.parse_known_args()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
eda_dataset = load_dataset(
'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split
)
eda_dataset.rename(columns={'text': 'instance_id'}, inplace=True)
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
metadata = make_metadata(
llm_config,
@@ -214,16 +211,15 @@ if __name__ == '__main__':
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
prepared_dataset = prepare_dataset(
eda_dataset.to_pandas(), output_file, args.eval_n_limit, 'text'
eda_dataset.to_pandas(), output_file, args.eval_n_limit
)
agent = Agent.get_cls(args.agent_cls)(llm=LLM(config.llm))
run_evaluation(
prepared_dataset,
metadata,
output_file,
args.eval_num_workers,
process_instance,
'text',
asyncio.run(
run_evaluation(
prepared_dataset,
metadata,
output_file,
args.eval_num_workers,
process_instance,
)
)

0
evaluation/EDA/scripts/run_infer.sh Normal file → Executable file
View File

View File

@@ -12,15 +12,59 @@ all the preprocessing/evaluation/analysis scripts.
## Supported Benchmarks
To learn more about how to integrate your benchmark into OpenDevin, check out [tutorial here](https://docs.all-hands.dev/modules/usage/evaluation_harness).
### Software Engineering
- SWE-Bench: [`evaluation/swe_bench`](./swe_bench)
- ML-Bench: [`evaluation/ml_bench`](./ml_bench)
- HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix)
- GAIA: [`evaluation/gaia`](./gaia)
- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
- MINT: [`evaluation/mint`](./mint)
- AgentBench: [`evaluation/agent_bench`](./agent_bench)
- BIRD: [`evaluation/bird`](./bird)
- LogicReasoning: [`evaluation/logic_reasoning`](./logic_reasoning)
- BioCoder: [`evaluation/biocoder`](./biocoder)
- ML-Bench: [`evaluation/ml_bench`](./ml_bench)
- APIBench: [`evaluation/gorilla`](./gorilla/)
- ToolQA: [`evaluation/toolqa`](./toolqa/)
### Web Browsing
- WebArena: [`evaluation/webarena`](./webarena/)
- MiniWob++: [`evaluation/miniwob`](./miniwob/)
### Misc. Assistance
- GAIA: [`evaluation/gaia`](./gaia)
- GPQA: [`evaluation/gpqa`](./gpqa)
- AgentBench: [`evaluation/agent_bench`](./agent_bench)
- MINT: [`evaluation/mint`](./mint)
- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
- ProofWriter: [`evaluation/logic_reasoning`](./logic_reasoning)
## Before everything begins: Setup Environment and LLM Configuration
Please follow the instructions [here](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up your local development environment and LLM.
OpenDevin in development mode uses `config.toml` to keep track of most configurations.
Here's an example configuration file you can use to define and use multiple LLMs:
```toml
[llm]
# IMPORTANT: add your API key here, and set the model to the one you want to evaluate
model = "gpt-4o-2024-05-13"
api_key = "sk-XXX"
[llm.eval_gpt4_1106_preview_llm]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[llm.eval_some_openai_compatible_model_llm]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```
### Result Visualization

View File

@@ -1,186 +0,0 @@
# Tutorial: How to add a New Evaluation Benchmark to OpenDevin
This tutorial provides a general guide on how to integrate your own evaluation benchmark into the OpenDevin framework.
You can read this for details, and also learn by example by looking at our existing evaluations:
- [swe_bench](swe_bench/)
## A quick walk-through of OpenDevin architecture
### Before everything begins
Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
### Configuration file
OpenDevin uses `config.toml` to keep track of most configurations.
Here's an example configuration file you can use:
```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
# IMPORTANT: You should set these two paths to YOUR WORKSPACE directory,
# which will be mounted into Sandbox for agent to interact with!
# The OpenDevin agent will be able to read/write files whatever they like (even rm -rf)
# in this directory, so be careful!!
workspace_base = "/path/to/your/workspace"
workspace_mount_path = "/path/to/your/workspace"
# ==========================
ssh_hostname = "localhost"
run_as_devin = false
[sandbox]
# SWEBench eval specific - but you can tweak it to your needs
use_host_network = false
# linting python after editing helps LLM fix indentations
enable_auto_lint = true
box_type = "ssh"
timeout = 120
[llm]
# IMPORTANT: add your API key here, and set the model to the one you want to evaluate
model = "gpt-4o-2024-05-13"
api_key = "sk-XXX"
```
### How to use OpenDevin programmatically
In this section, for the purpose of building an evaluation task, we don't use the standard OpenDevin web-based GUI, but rather run OpenDevin backend from CLI.
For example, you can run the following, which performs the specified task `-t`, with a particular model config `-l` and agent `-c`, for a maximum number of iterations `-i`:
```bash
poetry run python ./opendevin/core/main.py \
-i 10 \
-t "Write me a bash script that print hello world." \
-c CodeActAgent \
-l llm
```
After running the script, you will observe the following:
![](./static/example_task_1.png)
You can see the agent uses bash to write a script, makes it executable, and then tests it by running it to make sure it is working.
At the end of the above screenshot, OpenDevin actually requests user input when it thinks it has finished the task. This will cause issues in evaluation, since most evaluations don't assume additional user input. To fix this, we introduce the functionality of `fake_user_response_fn` in the `main` function, which we describe in the next section.
## The `main` function
The signature of `main` (in file [`opendevin/core/main.py`](../opendevin/core/main.py)) is as follows:
```python
async def main(
task_str: str = '',
exit_on_message: bool = False,
fake_user_response_fn: Optional[Callable[[Optional[State]], str]] = None,
sandbox: Optional[Sandbox] = None,
) -> Optional[State]:
```
- `task_str`: The task instruction to run. In the above example, it is "Write me a bash script that print hello world."
- `exit_on_message`: whether to quit if the agent asks for a message from user
- `fake_user_response_fn`: An optional function that receives the current state (could be None) and returns a fake user response.
- `sandbox`: An optional sandbox to run the agent in.
### `fake_user_response_fn`
Here's an example of `fake_user_response_fn` in the implementation for SWE-Bench in [`evaluation/swe_bench/run_infer.py`](swe_bench/run_infer.py):
```python
def codeact_user_response(state: State) -> str:
    """Return a canned "user" reply that pushes the agent to keep working.

    Used as a `fake_user_response_fn` so that evaluation runs never block
    waiting for real human input.

    Args:
        state: The current controller state. Only `state.history` is read.

    Returns:
        The base "keep going" message. If the history already contains more
        than two user messages, a hint telling the agent how to give up is
        appended.
    """
    msg = (
        'Please continue working on the task on whatever approach you think is suitable.\n'
        'If you think you have modified the code in a way that fixes the issue, please run the following command: <execute_bash> exit </execute_bash>.\n'
        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
    )
    # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
    if state.history:
        # Filter on the loop variable itself: count only messages whose source is the user.
        user_msgs = [
            event
            for event in state.history.get_events()
            if isinstance(event, MessageAction) and event.source == 'user'
        ]
        if len(user_msgs) > 2:
            # let the agent know that it can give up when it has tried 3 times
            return (
                msg
                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
            )
    return msg
```
### Return value
The main function returns a `State`, which is defined in [`opendevin/controller/state/state.py`](../opendevin/controller/state/state.py). We are mainly using `state.history` here, which is the most important field of data. You can think of it as a more structured version of OpenAI's chat completion [messages](https://platform.openai.com/docs/guides/text-generation/chat-completions-api).
`history: list[tuple[Action, Observation]] = field(default_factory=list)` is a list of (action, observation) tuples. All the actions are defined at [`opendevin/events/action`](../opendevin/events/action) and observations are defined at [`opendevin/events/observation`](../opendevin/events/observation).
The agent can emit different actions like `CmdRunAction` (`opendevin/events/action/commands.py`) to execute bash commands and receive `CmdOutputObservation` (`opendevin/events/observation/commands.py`), `IPythonRunCellAction` to receive `IPythonRunCellObservation`, `BrowseInteractiveAction` (`opendevin/events/action/browse.py`) to browse the web and receive `BrowserOutputObservation` (`opendevin/events/observation/browse.py`).
The action we used in this example is `MessageAction` (`opendevin/events/action/message.py`), which actually denotes a message from either `agent` or `user`. In the [CodeAct agent example](https://github.com/OpenDevin/OpenDevin/blob/7ca560471bd262f22513f3863995d0a8e6121c07/agenthub/codeact_agent/codeact_agent.py#L239-L273), an agent is considered to emit a `MessageAction` when it does not trigger a `CmdRunAction`, `IPythonRunCellAction`, and/or `BrowseInteractiveAction`.
Typically, the agent returns `MessageAction` when it is confused about the task and wants to ask a human for follow-up clarification, which is a good thing in real-world tasks, but not necessarily in evaluation. So in this example, we provide a dummy prompt to tell the agent "Please continue working on the task on whatever approach you think is suitable[...]".
If you see something like this, you can consider adding this to your evaluation pipeline as well.
### `sandbox`
Sandbox is a fully functioning docker container where the agent can perform all sorts of tasks, e.g., using bash, calling Python, install packages, and more. You can leave `sandbox` to `None` if you don't need to do anything special to pre-configure the `Sandbox`.
In SWE-Bench, we need to copy the proper repository directory to the workspace and activate the right python virtual environment before the agent can start performing the task, so we actually defined a custom [`SWEBenchSSHBox`](https://github.com/OpenDevin/OpenDevin/blob/7ca560471bd262f22513f3863995d0a8e6121c07/evaluation/swe_bench/swe_env_box.py#L12-L118) that inherits from the default sandbox [`SSHBox`](https://github.com/OpenDevin/OpenDevin/blob/7ca560471bd262f22513f3863995d0a8e6121c07/opendevin/runtime/docker/ssh_box.py#L188) and handles all of this initial setup. If you need to configure the `sandbox` for your evaluation, check `SWEBenchSSHBox` for a reference implementation.
## How to put together an evaluation script?
Now we know how to start running the agent end-to-end, and how `fake_user_response_fn` and `sandbox` work. We will walk through a piece of dummy code (simplified version of SWE-Bench's [`run_infer.py`](https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/run_infer.py)) that outline the general workflow:
- Load the dataset and prepare the evaluation configuration.
- Filter out any instances that have already been processed.
- For each instance in the dataset:
- Set up the sandbox environment.
- Run the agent to generate a solution.
- Apply the solution to the instance and execute the test command.
- Collect the results and write them to the output file.
- Perform cleanup after the evaluation is complete.
You can see the [swe_bench/run_infer.py](swe_bench/run_infer.py) file for an example.
Once you fully understand `run_infer.py`, you are ready to start the evaluation!
## Run the evaluation!
You can write your `run_infer.sh` script mimicking SWE-Bench's [`run_infer.sh`](https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/scripts/run_infer.sh).
You can start the evaluation by running:
```bash
./run_infer.sh eval_gpt_4o_2024_05_13
```
Where `eval_gpt_4o_2024_05_13` is the model config you defined in `config.toml`.
Like this:
```toml
[core]
...
[llm]
model="gpt-4-32k"
...
[eval_gpt_4o_2024_05_13]
model="gpt-4o-2024-05-13"
api_key="sk-xxx"
```
If `[eval_gpt_4o_2024_05_13]` is not present, it will default to using the model configured in `[llm]`.

View File

@@ -1,44 +1,10 @@
# AgentBench Evaluation
This folder contains evaluation harness for evaluating agents on
the [AgentBench: Evaluating LLMs as Agents](https://arxiv.org/abs/2308.03688).
This folder contains evaluation harness for evaluating agents on the [AgentBench: Evaluating LLMs as Agents](https://arxiv.org/abs/2308.03688). We currently only support running on the `osbench` subset.
## Configure OpenDevin and your LLM
## Setup Environment and LLM Configuration
Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md)
for how to set this up.
Here is an example `config.toml` file:
```toml
[core]
max_iterations = 100
cache_dir = "/path/to/cache"
workspace_base = "/path/to/workspace"
workspace_mount_path = "/path/to/workspace"
ssh_hostname = "localhost"
# AgentBench specific
run_as_devin = true
[sandbox]
use_host_network = false
enable_auto_lint = true
box_type = "ssh"
timeout = 120
[llm.eval_gpt35_turbo]
model = "gpt-3.5-turbo"
api_key = "sk-123"
temperature = 0.0
[llm.eval_gpt4o]
model = "gpt-4o"
api_key = "sk-123"
temperature = 0.0
```
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
## Start the evaluation
@@ -46,7 +12,18 @@ temperature = 0.0
./evaluation/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
```
Following is the basic command to start the evaluation. Here we are only evaluating the `osbench` for now.
- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
LLM settings, as defined in your `config.toml`.
- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
like to evaluate. It could also be a release tag like `0.6.2`.
- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
to `CodeActAgent`.
- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By
default, the script evaluates all instances of the `osbench` subset. Note:
in order to use `eval_limit`, you must also set `agent`.
Following is the basic command to start the evaluation.
You can update the arguments in the script `evaluation/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on.
@@ -57,5 +34,5 @@ You can update the arguments in the script `evaluation/agent_bench/scripts/run_i
- `--eval-n-limit`: the number of examples to evaluate. For example, `100`.
```bash
./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo 0.6.2 CodeActAgent 1
./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1
```

View File

@@ -14,7 +14,7 @@ def try_parse_answer(act) -> str | None:
raw_ans = act.thought
else:
return None
agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans, re.DOTALL)
if not agent_answer:
return None
return agent_answer[0].strip()

View File

@@ -1,10 +1,9 @@
import asyncio
import logging
import os
import re
import shutil
import tempfile
from typing import Any
import docker
import pandas as pd
from datasets import load_dataset
@@ -16,64 +15,175 @@ from evaluation.agent_bench.helper import (
)
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.events.action import CmdRunAction, MessageAction
from opendevin.llm.llm import LLM
from opendevin.runtime.docker.ssh_box import DockerSSHBox
config = load_app_config()
from opendevin.core.main import create_runtime, run_controller
from opendevin.events.action import AgentFinishAction, CmdRunAction, MessageAction
from opendevin.events.observation import CmdOutputObservation
from opendevin.runtime.runtime import Runtime
def process_instance(
def get_config(
    metadata: EvalMetadata,
) -> AppConfig:
    """Build the application config used to evaluate one AgentBench instance.

    Args:
        metadata: Evaluation metadata; its `agent_class`, `max_iterations`
            and `llm_config` fields are copied into the returned config.

    Returns:
        An `AppConfig` using the 'eventstream' runtime with an
        `ubuntu:22.04` sandbox image and no workspace mounted.
    """
    sandbox_config = SandboxConfig(
        container_image='ubuntu:22.04',
        enable_auto_lint=True,
        use_host_network=False,
    )
    app_config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_devin=False,
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
    # The LLM settings are attached after construction.
    app_config.set_llm_config(metadata.llm_config)
    return app_config
async def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,  # read for its `init` command and `instance_id`
):
    """Prepare the sandbox before the agent starts working on an instance.

    Creates `/workspace` and changes into it. If the instance carries an
    `init` command, that command is written to a shell script on the host,
    copied into `/workspace`, and executed there.

    Raises:
        AssertionError: if any of the executed commands exits non-zero.
    """
    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
    obs: CmdOutputObservation

    # Create the working directory, then make it the shell's current directory.
    for setup_command in ('mkdir -p /workspace', 'cd /workspace'):
        action = CmdRunAction(command=setup_command)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = await runtime.run_action(action)
        assert obs.exit_code == 0

    init_cmd = instance.init
    if init_cmd is not None:
        script_name = f'{instance.instance_id}_init.sh'

        # The script only needs to exist on the host long enough to be copied.
        with tempfile.TemporaryDirectory() as staging_dir:
            staged_script = os.path.join(staging_dir, script_name)
            create_sh_file(staged_script, init_cmd)
            await runtime.copy_to(
                staged_script,
                '/workspace',
            )

        logger.info(f'Running init script: {script_name}')
        action = CmdRunAction(command=f'chmod +x ./{script_name} && ./{script_name}')
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = await runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
async def _run_workspace_script(
    runtime: Runtime,
    script_name: str,
    script_body: str,
    description: str,
) -> CmdOutputObservation:
    """Write `script_body` to a shell script, copy it into /workspace and run it.

    The exit code is NOT checked here; callers decide whether a failure matters.

    Args:
        runtime: Runtime used to copy the file and execute the command.
        script_name: File name of the script inside `/workspace`.
        script_body: Shell commands to place in the script.
        description: Short label used in the "Running ..." log line.

    Returns:
        The observation produced by running the script.
    """
    # The script only needs to exist on the host long enough to be copied.
    with tempfile.TemporaryDirectory() as tmpdir:
        host_script_path = os.path.join(tmpdir, script_name)
        create_sh_file(host_script_path, script_body)
        await runtime.copy_to(
            host_script_path,
            '/workspace',
        )

    logger.info(f'Running {description}: {script_name}')
    action = CmdRunAction(
        command=f'chmod +x ./{script_name} && ./{script_name}',
        keep_prompt=False,
    )
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = await runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    return obs


async def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,  # read for get_agent_result, ground_truth and get_ground_truth
) -> dict[str, Any]:
    """Collect the agent's answer and the ground truth from the sandbox.

    This function is called after the agent has finished running.

    If you need to do something in the sandbox to get the correctness metric after
    the agent has run, modify this function.

    Returns:
        A dict with two keys:
        - 'agent_answer': output of the instance's `get_agent_result` script,
          or None when the instance defines no such script.
        - 'final_ans': `instance.ground_truth` when set; otherwise the output
          of the `get_ground_truth` script, or None when neither is available.

    Raises:
        AssertionError: if the `get_agent_result` script exits non-zero.
    """
    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")

    agent_answer = None
    get_agent_result_cmd = instance.get_agent_result
    if get_agent_result_cmd is not None:
        obs = await _run_workspace_script(
            runtime,
            'get_agent_result.sh',
            get_agent_result_cmd,
            'get agent result cmd',
        )
        assert obs.exit_code == 0
        agent_answer = obs.content
    # If the agent answer is not found here (None), the caller retrieves it
    # from the event history instead.

    final_ans = None
    if instance.ground_truth is not None:
        final_ans = instance.ground_truth
    else:
        get_ground_truth_cmd = instance.get_ground_truth
        if get_ground_truth_cmd is not None:
            # The exit code is deliberately left unchecked, as in the
            # original code: whatever the script printed is the ground truth.
            obs = await _run_workspace_script(
                runtime,
                'get_ground_truth.sh',
                get_ground_truth_cmd,
                'get ground truth cmd',
            )
            final_ans = obs.content

    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
    return {
        'final_ans': final_ans,
        'agent_answer': agent_answer,
    }
async def process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
) -> EvalOutput:
config = get_config(metadata)
inst_id = instance.instance_id
question = instance.description
# create a directory for the instance's workspace
instance_workspace = str(os.path.join(config.workspace_base, inst_id))
container_inst_workspace = str(
os.path.join(config.workspace_mount_path_in_sandbox, inst_id)
)
if os.path.exists(instance_workspace):
shutil.rmtree(instance_workspace)
os.makedirs(instance_workspace, exist_ok=True)
# Set up the logger properly, so you can run multiprocessing to parallel the evaluation
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
# Set up logger
log_file = os.path.join(
metadata.eval_output_dir, 'logs', f'instance_{inst_id}.log'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {inst_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
# =============================================
# build instruction
@@ -86,104 +196,68 @@ def process_instance(
'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
'For example: The answer to the question is <solution> 42 </solution>.\n'
'# Problem \n'
f'{question}\n\n'
f'{instance.description}\n\n'
)
instruction += (
'IMPORTANT: You should ONLY interact with the environment provided '
'to you AND NEVER ASK FOR HUMAN HELP.\n'
)
# NOTE: You can actually set slightly different instruction for different agents
instruction += INST_SUFFIXES[agent.__class__.__name__]
instruction += INST_SUFFIXES[metadata.agent_class]
# =============================================
# create sandbox and run the agent
# =============================================
sandbox = DockerSSHBox(
config=config.sandbox,
persist_sandbox=False,
workspace_mount_path=config.workspace_mount_path,
sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
cache_dir=config.cache_dir,
run_as_devin=config.run_as_devin,
)
sandbox.execute(f'cd {inst_id}')
runtime: Runtime = await create_runtime(config, sid=instance.instance_id)
init_cmd = instance.init
if init_cmd is not None:
scpt_name = f'{instance.instance_id}_init.sh'
scpt_path = os.path.join(container_inst_workspace, scpt_name)
host_scpt_path = os.path.join(instance_workspace, scpt_name)
create_sh_file(host_scpt_path, init_cmd)
logger.info(f'Running init script: {scpt_path}')
_, init_res = sandbox.execute(scpt_path)
logger.info(f'Init script result: {init_res}')
await initialize_runtime(runtime, instance=instance)
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
max_budget_per_task=config.max_budget_per_task,
fake_user_response_fn=FAKE_RESPONSES[agent.__class__.__name__],
sandbox=sandbox,
sid=inst_id,
)
state: State | None = await run_controller(
config=config,
task_str=instruction,
runtime=runtime,
fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
)
if state is None:
raise ValueError('State should not be None.')
# get the ground truth
# OSBenchSSHBox.get_ground_truth(instance, state)
# =============================================
# result evaluation
# =============================================
agent_answer = ''
get_agent_result_cmd = instance.get_agent_result
if get_agent_result_cmd is not None:
scpt_name = f'{instance.instance_id}_get_agent_result.sh'
scpt_path = os.path.join(container_inst_workspace, scpt_name)
host_scpt_path = os.path.join(instance_workspace, scpt_name)
create_sh_file(host_scpt_path, get_agent_result_cmd)
logger.info(f'Running get agent result cmd: {scpt_path}')
_, agent_answer = sandbox.execute(scpt_path)
else:
return_val = await complete_runtime(runtime, instance)
agent_answer = return_val['agent_answer']
final_ans = return_val['final_ans']
# If the agent answer is not found, retrieve it from the history
if agent_answer is None:
agent_answer = ''
logger.info('Retrieving agent answer from history.')
raw_ans = ''
# retrieve the last agent message or thought
for event in state.history.get_events(reverse=True):
if isinstance(event, MessageAction) and event.source == 'agent':
raw_ans = event.content
elif isinstance(event, CmdRunAction) and event.source == 'agent':
raw_ans = event.thought
if event.source == 'agent':
if isinstance(event, AgentFinishAction):
raw_ans = event.thought
break
elif isinstance(event, MessageAction):
raw_ans = event.content
break
elif isinstance(event, CmdRunAction):
raw_ans = event.thought
break
# parse the answer for a solution tag
agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans, re.DOTALL)
if len(agent_answer) == 0:
logger.warning(f'Failed to parse model answer: {raw_ans}')
agent_answer = raw_ans
else:
agent_answer = agent_answer[0]
final_ans = ''
if instance.ground_truth is not None:
final_ans = instance.ground_truth
else:
get_ground_truth_cmd = instance.get_ground_truth
if get_ground_truth_cmd is not None:
scpt_name = f'{instance.instance_id}_get_ground_truth.sh'
scpt_path = os.path.join(container_inst_workspace, scpt_name)
host_scpt_path = os.path.join(instance_workspace, scpt_name)
create_sh_file(host_scpt_path, get_ground_truth_cmd)
logger.info(f'Running get ground truth cmd: {scpt_path}')
sandbox.execute(f'cd {container_inst_workspace}')
_, final_ans = sandbox.execute(scpt_path)
comparison_method = instance.comparison_method
logger.info(
f'Final message: {agent_answer} | Ground truth: {final_ans} | Comparison method: {comparison_method}'
@@ -198,58 +272,49 @@ def process_instance(
metrics = state.metrics.get() if state.metrics else None
# Save the output
output = {
'instance_id': inst_id,
'instance': instance.to_dict(),
'instruction': instruction,
'metadata': metadata.model_dump(),
'history': histories,
'metrics': metrics,
'error': state.last_error if state and state.last_error else None,
'test_result': {
output = EvalOutput(
instance_id=instance.instance_id,
instance=instance.to_dict(),
instruction=instruction,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result={
'agent_answer': agent_answer,
'final_answer': final_ans,
'check_method': comparison_method,
'result': test_result,
},
}
# clean up
if os.path.exists(instance_workspace):
shutil.rmtree(instance_workspace)
# Close the sandbox
try:
sandbox.close()
except docker.errors.NotFound as e:
logger.error(f'Failed to close sandbox: {e}')
)
return output
if __name__ == '__main__':
id_column = 'instance_id'
args = parse_arguments()
dataset = load_dataset('iFurySt/AgentBench')
agent_bench_tests = dataset['osbench'].to_pandas()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
metadata = make_metadata(
llm_config,
args.dataset_name,
'AgentBench-OS',
args.agent_cls,
args.max_iterations,
args.eval_note,
args.eval_output_dir,
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
instances = prepare_dataset(agent_bench_tests, output_file, args.eval_n_limit)
run_evaluation(
instances,
metadata,
output_file,
args.eval_num_workers,
process_instance,
id_column,
asyncio.run(
run_evaluation(
instances, metadata, output_file, args.eval_num_workers, process_instance
)
)

0
evaluation/agent_bench/scripts/run_infer.sh Normal file → Executable file
View File

View File

@@ -2,15 +2,12 @@
Implements evaluation of agents on BioCoder from the BioCoder benchmark introduced in [BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models](https://arxiv.org/abs/2308.16458). Please see [here](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py) for the reference implementation used in the paper.
## Setup Environment
## Setup Environment and LLM Configuration
Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to setup local develop environment for OpenDevin.
## Configure OpenDevin and your LLM
Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
## BioCoder Docker Image
In the opendevin branch of the Biocoder repository, we have slightly modified our original Docker image to work with the OpenDevin environment. In the Docker image are testing scripts (`/testing/start_test_opendevin.py` and aux files in `/testing_files/`) to assist with evaluation. Additionally, we have installed all dependencies, including OpenJDK, mamba (with Python 3.6), and many system libraries. Notably, we have **not** packaged all repositories into the image, so they are downloaded at runtime.
**Before first execution, pull our Docker image with the following command**
@@ -41,12 +38,12 @@ to `CodeActAgent`.
- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default it infers all instances.
Let's say you'd like to run 1 instance using `eval_gpt4o_2024_05_13` and CodeActAgent
with OpenDevin version 0.6.2, then your command would be:
with current OpenDevin version, then your command would be:
## Examples
```bash
./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent 1
./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 HEAD CodeActAgent 1
```
## Reference

View File

@@ -1,387 +0,0 @@
import json
import os
import re
import sys
from collections import defaultdict
from dataclasses import dataclass
from datasets import load_dataset
from opendevin.core.config import load_app_config
from opendevin.core.logger import opendevin_logger as logger
from opendevin.runtime.docker.ssh_box import DockerSSHBox
from opendevin.runtime.plugins import (
JupyterRequirement,
PluginRequirement,
SWEAgentCommandsRequirement,
)
config = load_app_config()
BIOCODER_BENCH_CONTAINER_IMAGE = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0'
@dataclass
class BiocoderData:
filePath: str
numLines: int
lineStart: int
lineEnd: int
signature: str
comment: str
content: str
repository: str
promptSummaryOnly: str
contextCode: str
goldenCode: str
test_case_id: str
language: str
def to_dict(self):
return {
'filePath': self.filePath,
'numLines': self.numLines,
'lineStart': self.lineStart,
'lineEnd': self.lineEnd,
'signature': self.signature,
'comment': self.comment,
'content': self.content,
'repository': self.repository,
'promptSummaryOnly': self.promptSummaryOnly,
'contextCode': self.contextCode,
'goldenCode': self.goldenCode,
'test_case_id': self.test_case_id,
'language': self.language,
}
def get_likely_indent_size(array_of_tabs) -> int:
    """Guess the indentation step from a sequence of per-line indent widths.

    Every positive increase between two consecutive entries is tallied and
    the most frequent increase is returned. When several increases are tied,
    the one encountered first wins. If there is no positive increase at all
    (including empty or single-element input) the result is 4.
    """
    step_counts = defaultdict(int)
    for previous_width, current_width in zip(array_of_tabs, array_of_tabs[1:]):
        step = current_width - previous_width
        if step > 0:
            step_counts[step] += 1
    if not step_counts:
        return 4
    return int(max(step_counts, key=step_counts.get))
class BiocoderSSHBox(DockerSSHBox):
    """Docker SSH sandbox set up for one BioCoder benchmark instance.

    On top of the inherited sandbox behaviour the box can blank out the
    target function in the checked-out repository (`remove_code`) and read
    back whatever now occupies that slot (`get_changed_code`). Use
    `get_box_for_instance` to obtain a fully prepared box.
    """

    def __init__(
        self,
        container_image: str,
        timeout: int = 120,
        sid: str | None = None,
        biocoder_instance_id: str | None = None,
        biocoder_instance: BiocoderData | None = None,
        skip_workspace_mount: bool = True,
        sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
        biocoder_cache_folder: str = 'biocoder_cache',
        workspace_dir_name: str | None = None,
    ):
        """Record the instance details, start the parent box and init the plugins.

        Raises:
            ValueError: if `biocoder_instance_id` is None.
            AssertionError: if `container_image` is None.
        """
        if biocoder_instance_id is None:
            raise ValueError('biocoder_instance_id must be provided')
        self.biocoder_instance_id = biocoder_instance_id
        self.biocoder_instance = biocoder_instance
        self.skip_workspace_mount = skip_workspace_mount
        self.biocoder_cache_folder = biocoder_cache_folder
        # Set by remove_code(); used by get_changed_code() as the stop marker.
        self.first_line_after_removed = None
        self.workspace_dir_name = workspace_dir_name
        # Snapshot of the global config at construction time; get_box_for_instance
        # overrides these values only while the box is being constructed.
        self.workspace_base = config.workspace_base
        self.workspace_mount_path = config.workspace_mount_path
        # Filled in by get_box_for_instance() after construction.
        self.context_path = None
        self.generated_path = None
        self.golden_path = None
        assert (
            container_image is not None
        ), 'container_image is required for BiocoderBenchSSHBox!'
        super().__init__(container_image, timeout, sid)
        self.init_plugins(sandbox_plugins)

    @property
    def volumes(self):
        """Parent volume mapping, minus the workspace bind when mounting is skipped."""
        parent_volumes = super().volumes
        if not self.skip_workspace_mount:
            return parent_volumes
        return {
            k: v
            for k, v in parent_volumes.items()
            if v['bind'] != self.sandbox_workspace_dir
        }

    def get_target_filepath(self):
        """Return the path of the file that holds the function under test.

        The path is `workspace_mount_path / <repo name> / filePath`, where the
        repo name is the part of `repository` after the first '/'.
        """
        return os.path.join(
            self.workspace_mount_path,
            self.biocoder_instance.repository.split('/')[1],
            self.biocoder_instance.filePath,
        )

    def get_changed_code(self, include_signature=False):
        """Return the text that currently occupies the removed function's slot.

        Lines are collected from index `lineStart` of the target file (one
        line earlier when `include_signature` is true) up to, but excluding,
        the first line whose stripped text equals the stripped marker stored
        by `remove_code`. If no marker was stored, or it never occurs, the
        text runs to the end of the file.
        """
        target_filepath = self.get_target_filepath()
        offset = 1 if include_signature else 0
        marker = self.first_line_after_removed
        if marker is None:
            # Previously this fell through to `None.strip()` and raised
            # AttributeError right after the warning; keep going instead.
            logger.warning('First line after removed is None')
        else:
            marker = marker.strip()
        with open(target_filepath, 'r') as f:
            lines = f.read().split('\n')
        selected_lines = []
        for i in range(self.biocoder_instance.lineStart - offset, len(lines)):
            if marker is not None and lines[i].strip() == marker:
                break
            selected_lines.append(lines[i])
        return '\n'.join(selected_lines)

    def copy_changed_code(self):
        """Write the changed code (with signature) to `generated_path` and
        copy the cache folder into `/testing_files` inside the sandbox."""
        changed_code = self.get_changed_code(include_signature=True)
        with open(self.generated_path, 'w') as f:
            f.write(changed_code)
        self.execute_and_check(
            f'cp -r /workspace/{self.biocoder_cache_folder}/* /testing_files',
            'Failed to copy the files',
        )

    def remove_code(self):
        """Replace the target function's body with a TODO placeholder.

        Everything after the first `lineStart` lines up to `lineEnd` is
        replaced by one placeholder comment (indented one guessed indent step
        deeper than the signature line) followed by two blank lines. The file
        is rewritten in place and the stop marker for `get_changed_code` is
        stored in `self.first_line_after_removed`.

        Raises:
            KeyError: if the instance language is neither python nor java.
        """
        comment_prefix = {'python': '#', 'java': '//'}
        target_filepath = self.get_target_filepath()
        line_start = self.biocoder_instance.lineStart
        line_end = self.biocoder_instance.lineEnd
        with open(target_filepath, 'r') as f:
            lines = f.read().split('\n')
        signature_line = lines[line_start - 1]

        def get_indent_size(s: str):
            # Width of the leading whitespace run.
            return len(re.match(r'\s*', s).group())

        indent_sizes = list(map(get_indent_size, lines))
        indent_size = get_likely_indent_size(indent_sizes)
        comment_indent_size = get_indent_size(signature_line) + indent_size
        lines = (
            lines[:line_start]
            + [
                f"{' '*comment_indent_size+comment_prefix[self.biocoder_instance.language.lower()]}TODO: replace with your code here"
            ]
            + ([''] * 2)
            + lines[line_end:]
        )
        # NOTE(review): index `line_start` is the placeholder comment inserted
        # above, which is never blank, so this scan stops immediately and the
        # marker is the placeholder itself. Confirm whether the scan was meant
        # to start after the placeholder.
        first_line_after_removed_index = line_start
        # Bounds check first: the original indexed the list before checking
        # the index, which would raise IndexError when running off the end.
        while (
            first_line_after_removed_index < len(lines)
            and len(lines[first_line_after_removed_index].strip()) == 0
        ):
            first_line_after_removed_index += 1
        if first_line_after_removed_index < len(lines):
            self.first_line_after_removed = lines[first_line_after_removed_index]
        else:
            # Nothing but blank lines followed; get_changed_code treats None
            # as "no marker".
            self.first_line_after_removed = None
        with open(target_filepath, 'w') as f:
            f.write('\n'.join(lines))

    def execute_and_check(self, cmd: str, error_msg: str) -> tuple[int, str]:
        """Run `cmd` in the sandbox; on a non-zero exit code log `error_msg`
        and terminate the process with status 1."""
        exit_code, output = self.execute(cmd)
        if exit_code != 0:
            logger.error(error_msg)
            sys.exit(1)
        return exit_code, output

    @classmethod
    def get_box_for_instance(
        cls,
        instance,
        workspace_dir_name=None,
        skip_workspace_mount: bool = False,
        workspace_mount_path: str | None = None,
        sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
    ) -> 'BiocoderSSHBox':
        """Create a box for `instance` and run its initialization commands.

        Steps: write the context/generated/golden source files and the test
        case JSON into a cache folder under the per-instance workspace, build
        the box with the global workspace config temporarily pointed at that
        workspace, then download and unzip the repository archive, copy the
        cache files to `/testing_files` and chmod `/workspace` inside the box.

        `workspace_mount_path` is accepted but not used by this method. Any
        failing sandbox command terminates the process (see
        `execute_and_check`).
        """
        if workspace_dir_name is None:
            workspace_dir_name = f'{instance.repository}__{instance.test_case_id[:10]}__{os.getpid()}'.replace(
                '/', '__'
            )

        workspace_base = str(os.path.join(config.workspace_base, workspace_dir_name))
        old_workspace_base = config.workspace_base
        old_workspace_mount_path = config.workspace_mount_path

        try:
            config.workspace_base = workspace_base
            config.workspace_mount_path = workspace_base

            # linting python after editing helps LLM fix indentations
            # (this setting is not restored afterwards)
            config.sandbox.enable_auto_lint = True

            # create folder for transferring files back/forth
            biocoder_cache_folder = 'biocoder_cache'
            cache_dir = os.path.join(workspace_base, biocoder_cache_folder)
            os.makedirs(cache_dir, exist_ok=True)

            file_ext = {
                'python': 'py',
                'java': 'java',
                'c': 'c',
                'cpp': 'cpp',
                'javascript': 'js',
                'typescript': 'ts',
            }[instance.language.lower()]

            context_path = os.path.join(cache_dir, 'context.' + file_ext)
            generated_path = os.path.join(cache_dir, 'generated.' + file_ext)
            golden_path = os.path.join(cache_dir, 'golden.' + file_ext)

            with open(context_path, 'w') as f:
                f.write(instance.contextCode)
            # The generated file starts out as a copy of the golden code;
            # copy_changed_code() overwrites it later.
            with open(generated_path, 'w') as f:
                f.write(instance.goldenCode)
            with open(golden_path, 'w') as f:
                f.write(instance.goldenCode)

            testcase_json = {
                'test_case_id': instance.test_case_id,
                'num_cases': 1000,
                'language': instance.language.lower(),
            }
            with open(os.path.join(cache_dir, 'testcase_biocoder.json'), 'w') as f:
                f.write(json.dumps(testcase_json, indent=4))

            sandbox = cls(
                container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
                biocoder_instance_id=instance.test_case_id,
                biocoder_instance=instance,
                skip_workspace_mount=skip_workspace_mount,
                sandbox_plugins=sandbox_plugins,
                biocoder_cache_folder=biocoder_cache_folder,
                workspace_dir_name=workspace_dir_name,
            )
        finally:
            # Always restore the global workspace settings, even on failure.
            config.workspace_base = old_workspace_base
            config.workspace_mount_path = old_workspace_mount_path

        sandbox.context_path = context_path
        sandbox.generated_path = generated_path
        sandbox.golden_path = golden_path

        logger.info(f'SSH box started for instance {instance.test_case_id}.')

        # cd to the workspace
        exit_code, output = sandbox.execute_and_check(
            'cd /workspace', 'Failed to cd to workspace'
        )
        logger.info(f'cd to workspace: {output}')

        # download repository archive
        repository_url = f"https://biocoder.lilbillbiscuit.com/repos/{instance.repository.split('/')[1]}.zip"
        exit_code, output = sandbox.execute_and_check(
            'wget -O repo.zip ' + repository_url, 'Failed to download the repository'
        )
        logger.info(f'Downloaded the repository: {output}')
        exit_code, output = sandbox.execute_and_check(
            'unzip -o -q repo.zip', 'Failed to unzip the repository'
        )
        logger.info(f'Unzipped the repository: {output}')

        # copy the context, generated and golden files to the /testing_files folder
        sandbox.execute_and_check(
            f'cp -r /workspace/{biocoder_cache_folder}/* /testing_files',
            'Failed to copy the files',
        )

        # chmod 777
        sandbox.execute_and_check(
            'chmod -R 777 /workspace',
            'Failed to chmod the files',
        )

        return sandbox
if __name__ == '__main__':
    # Manual smoke test: build a box for the first test-split instance, run the
    # in-image test script, print PASS/FAIL, then offer an interactive shell.
    biocoder_dataset = load_dataset('Lilbillbiscuit/biocoder_public')
    EXAMPLE_INSTANCE = biocoder_dataset['test'][0]
    # Convert the raw dataset row into the typed record.
    EXAMPLE_INSTANCE = BiocoderData(**EXAMPLE_INSTANCE)
    sandbox = BiocoderSSHBox.get_box_for_instance(
        instance=EXAMPLE_INSTANCE,
        workspace_mount_path='/home/ubuntu/OpenDevinBioCoder/workspace',
        skip_workspace_mount=False,
        sandbox_plugins=[JupyterRequirement(), SWEAgentCommandsRequirement()],
    )

    # PRE TEST
    exit_code, output = sandbox.execute_and_check(
        'cd /testing',
        'Failed to cd /testing',
    )
    logger.info(f'cd $REPO_PATH: {output}')

    exit_code, output = sandbox.execute_and_check(
        'whoami',
        'Failed to run whoami',
    )
    logger.info(f'whoami: {output}')

    # TEST
    # No code was removed from the repository here, so the test is expected to pass.
    exit_code, output = sandbox.execute(
        '/home/devin/mambaforge/bin/mamba run -n test python3 /testing/start_test_opendevin.py'
    )
    assert exit_code == 0, 'Expected exit code 0 (this should have passed)'
    logger.info(f'$TEST_CMD:\n{output}')

    # Read the result file and report its 'result' field.
    exit_code, output = sandbox.execute_and_check(
        'cat /testing_files/results_biocoder.json', 'Failed to read the result file'
    )
    print(output)
    json_obj = json.loads(output)
    if json_obj['result'] == 'pass':
        print('PASS')
    else:
        print('FAIL')

    sys.stdout.flush()
    # Interactive loop: forward each typed command to the sandbox until the
    # user types 'exit', sends EOF, or presses Ctrl-C.
    try:
        while True:
            try:
                user_input = input('>>> ')
            except EOFError:
                logger.info('Exiting...')
                break
            if user_input.lower() == 'exit':
                logger.info('Exiting...')
                break
            exit_code, output = sandbox.execute(user_input)
            logger.info('exit code: %d', exit_code)
            logger.info(output)
            sys.stdout.flush()
    except KeyboardInterrupt:
        logger.info('Exiting...')
    sandbox.close()

View File

@@ -1,33 +1,38 @@
import asyncio
import functools
import json
import logging
import os
import pathlib
from functools import partial
import tempfile
from typing import Any
import pandas as pd
from datasets import load_dataset
from evaluation.biocoder.biocoder_env_box import BiocoderData, BiocoderSSHBox
from evaluation.biocoder.utils import BiocoderData
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
codeact_user_response,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM
config = load_app_config()
from opendevin.core.main import create_runtime, run_controller
from opendevin.events.action import CmdRunAction
from opendevin.events.observation import CmdOutputObservation
from opendevin.runtime.runtime import Runtime
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': partial(
'CodeActAgent': functools.partial(
codeact_user_response, encapsulate_solution=True, try_parse=None
),
}
@@ -36,111 +41,218 @@ AGENT_CLS_TO_INST_SUFFIX = {
'CodeActAgent': 'When you think you have fixed the issue through code changes, please run the following command: <execute_bash> exit </execute_bash>.\n'
}
# Maps a lower-cased instance language to the file extension used when naming
# the context/golden/generated files under /testing_files.
FILE_EXT_MAP = {
    'python': 'py',
    'java': 'java',
    'c': 'c',
    'cpp': 'cpp',
    'javascript': 'js',
    'typescript': 'ts',
}
def get_config(
    metadata: EvalMetadata,
) -> AppConfig:
    """Build the application config for one BioCoder evaluation run.

    The agent class, iteration limit and LLM config come from `metadata`.
    The sandbox uses the BioCoder evaluation image with auto-lint enabled
    and host networking disabled, and no workspace is mounted.
    """
    container_image = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0'
    sandbox_config = SandboxConfig(
        container_image=container_image,
        enable_auto_lint=True,
        use_host_network=False,
    )
    app_config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_devin=False,
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
    app_config.set_llm_config(metadata.llm_config)
    return app_config
async def initialize_runtime(
    runtime: Runtime,
    instance: BiocoderData,  # benchmark record whose files and repository are staged
):
    """Initialize the runtime for the agent.

    This function is called before the runtime is used to run the agent.

    Steps, in order: create /workspace and /testing_files; copy the context
    code, golden code, test case JSON and the remove_code.py helper into
    /testing_files; download and unzip the instance repository in /workspace;
    chmod /workspace; run remove_code.py on the target file. A failing
    command raises AssertionError.
    """
    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
    obs: CmdOutputObservation

    # Raises KeyError for a language missing from FILE_EXT_MAP.
    file_ext = FILE_EXT_MAP[instance.language.lower()]

    action = CmdRunAction(command='mkdir -p /workspace && mkdir -p /testing_files')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = await runtime.run_action(action)
    assert obs.exit_code == 0

    # Files are written to a host temp dir and copied into the runtime while
    # that directory still exists.
    with tempfile.TemporaryDirectory() as tmpdir:
        context_path = os.path.join(tmpdir, 'context.' + file_ext)
        with open(context_path, 'w') as f:
            f.write(instance.contextCode)
        await runtime.copy_to(context_path, '/testing_files')

        golden_path = os.path.join(tmpdir, 'golden.' + file_ext)
        with open(golden_path, 'w') as f:
            f.write(instance.goldenCode)
        await runtime.copy_to(golden_path, '/testing_files')

        testcase_json = {
            'test_case_id': instance.test_case_id,
            'num_cases': 1000,
            'language': instance.language.lower(),
        }
        testcase_path = os.path.join(tmpdir, 'testcase_biocoder.json')
        with open(testcase_path, 'w') as f:
            f.write(json.dumps(testcase_json, indent=4))

        await runtime.copy_to(testcase_path, '/testing_files')

    # setup paths
    # The helper script lives next to this file under scripts/setup/.
    remove_code_script = os.path.join(
        os.path.dirname(__file__), 'scripts', 'setup', 'remove_code.py'
    )
    await runtime.copy_to(remove_code_script, '/testing_files')

    action = CmdRunAction(command='cd /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = await runtime.run_action(action)
    assert obs.exit_code == 0

    # download repository archive
    # The archive is named after the part of `repository` following the first '/'.
    repository_url = f"https://biocoder.lilbillbiscuit.com/repos/{instance.repository.split('/')[1]}.zip"
    action = CmdRunAction(command='wget -O repo.zip ' + repository_url)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = await runtime.run_action(action)
    assert obs.exit_code == 0, f'Failed to download the repository: {obs.content}'

    # unzip the repository
    action = CmdRunAction(command='unzip -o -q repo.zip && rm repo.zip')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = await runtime.run_action(action)
    assert obs.exit_code == 0, f'Failed to unzip the repository: {obs.content}'

    # chmod 777
    action = CmdRunAction(command='chmod -R 777 /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = await runtime.run_action(action)
    assert obs.exit_code == 0, f'Failed to chmod the files: {obs.content}'

    # remove code for evaluation instance
    target_filepath = os.path.join(
        '/workspace', instance.repository.split('/')[1], instance.filePath
    )
    line_start = instance.lineStart
    line_end = instance.lineEnd
    language = instance.language.lower()
    # NOTE(review): the arguments are interpolated into the shell command
    # unquoted; a file path containing spaces would break it — confirm.
    action = CmdRunAction(
        command=f'python3 /testing_files/remove_code.py --target_filepath {target_filepath} --line_start {line_start} --line_end {line_end} --language {language}'
    )
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = await runtime.run_action(action)
    assert obs.exit_code == 0, f'Failed to remove the code: {obs.content}'

    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
async def complete_runtime(
runtime: Runtime,
instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name
) -> dict[str, Any]:
"""Complete the runtime for the agent.
This function is called before the runtime is used to run the agent.
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
obs: CmdOutputObservation
def get_test_result(instance, sandbox, workspace_dir_name):
test_result = {'result': {}, 'metadata': {}}
try:
code = sandbox.get_changed_code(include_signature=True)
sandbox.copy_changed_code()
copy_changed_code_script = os.path.join(
os.path.dirname(__file__), 'scripts', 'setup', 'copy_changed_code.py'
)
await runtime.copy_to(copy_changed_code_script, '/testing_files')
file_ext = FILE_EXT_MAP[instance.language.lower()]
target_filepath = os.path.join(
'/workspace', instance.repository.split('/')[1], instance.filePath
)
generated_path = os.path.join('/testing_files', 'generated.' + file_ext)
action = CmdRunAction(
command=f'python3 /testing_files/copy_changed_code.py --target_filepath {target_filepath} --generated_code_filepath {generated_path} --line_start {instance.lineStart} --include_signature'
)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = await runtime.run_action(action)
if obs.exit_code == 0:
test_result['metadata']['1_copy_change_success'] = True
action = CmdRunAction(command=f'cat {generated_path}', keep_prompt=False)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = await runtime.run_action(action)
assert obs.exit_code == 0
code = obs.content
test_result['metadata']['1_copy_change_code'] = code
except Exception:
logger.error('Error fetching changed code for this instance')
else:
test_result['metadata']['1_copy_change_success'] = False
test_result['metadata']['1_copy_change_code'] = None
exit_code, output = sandbox.execute_and_check(
'cd /testing',
'Failed to cd /testing',
)
logger.info(f'cd $REPO_PATH: {output}')
action = CmdRunAction(command='cd /testing_files')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = await runtime.run_action(action)
assert obs.exit_code == 0
exit_code, output = sandbox.execute_and_check(
'whoami',
'Failed to run whoami',
action = CmdRunAction(
command='/home/devin/mambaforge/bin/mamba run -n test python3 /testing/start_test_opendevin.py'
)
logger.info(f'whoami: {output}')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = await runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert obs.exit_code == 0
exit_code, output = sandbox.execute(
'/home/devin/mambaforge/bin/mamba run -n test python3 /testing/start_test_opendevin.py'
action = CmdRunAction(
command='cat /testing_files/results_biocoder.json', keep_prompt=False
)
logger.info(f'$TEST_CMD:\n{output}')
exit_code, output = sandbox.execute_and_check(
'cat /testing_files/results_biocoder.json', 'Failed to read the result file'
)
if exit_code == 0:
logger.info(action, extra={'msg_type': 'ACTION'})
obs = await runtime.run_action(action)
if obs.exit_code == 0:
test_result['metadata']['2_run_test_success'] = True
test_result['metadata']['2_run_test_result'] = str(output)
test_result['metadata']['2_run_test_result'] = str(obs.content)
json_obj = json.loads(obs.content)
test_result['result'] = json_obj['result']
else:
test_result['metadata']['2_run_test_success'] = False
test_result['metadata']['2_run_test_result'] = str(output)
json_obj = json.loads(output)
test_result['result'] = json_obj['result']
test_result['metadata']['2_run_test_result'] = str(obs.content)
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
return test_result
def process_instance(
async def process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
) -> EvalOutput:
config = get_config(metadata)
instance = BiocoderData(**instance)
print(instance)
workspace_dir_name = (
f'{instance.repository}__{instance.test_case_id[:10]}__{os.getpid()}'.replace(
'/', '__'
)
)
workspace_mount_path = os.path.join(config.workspace_base, workspace_dir_name)
# create process-specific workspace dir
# if `not skip_workspace_mount` - we will create a workspace directory for EACH process
# so that different agent don't interfere with each other.
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
instance_id = f'{instance.repository}__{instance.instance_id[:10]}'
# Setup the logger properly, so you can run multi-processing to parallize the evaluation
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
# Set up logger
log_file = os.path.join(
metadata.eval_output_dir, 'logs', f'instance_{instance.test_case_id}.log'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {instance.test_case_id}.\nHint: run "tail -f {log_file}" to see live logs in a seperate shell'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
# NOTE: this is something special we do for SWE-Bench due to the reason described in the previous section
# You can omit this if you don't need to setup specialized sandbox
workspace_dir_name = f'{instance.repository}__{instance.test_case_id[:10]}'.replace(
'/', '__'
)
sandbox = BiocoderSSHBox.get_box_for_instance(
instance,
workspace_dir_name,
skip_workspace_mount=False,
workspace_mount_path=workspace_mount_path,
sandbox_plugins=agent.sandbox_plugins,
)
sandbox.remove_code()
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {instance_id}.')
# Prepare instruction
instruction = (
@@ -160,80 +272,76 @@ def process_instance(
'Make sure to include proper formatting in Java and Python, including correct braces and/or indentation.\n'
)
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
# use a session id for concurrent evaluation
sid = instance.test_case_id.replace('/', '__')
sid = instance.instance_id.replace('/', '__')
runtime = await create_runtime(config, sid=sid)
await initialize_runtime(runtime, instance)
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
max_budget_per_task=config.max_budget_per_task,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
sandbox=sandbox,
sid=sid,
)
state: State | None = await run_controller(
config=config,
task_str=instruction,
runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class],
)
test_result = get_test_result(instance, sandbox, workspace_dir_name)
if state is None:
raise ValueError('State should not be None.')
metrics = state.metrics.get() if state.metrics else None
test_result = await complete_runtime(runtime, instance)
metrics = state.metrics.get() if state.metrics else None
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
# Save the output
output = {
'test_case_id': instance.test_case_id,
'biocoder_instance': instance.to_dict(),
'instruction': instruction,
'generated': test_result['metadata']['1_copy_change_code'],
'metadata': metadata.model_dump(),
'history': histories,
'metrics': metrics,
'error': state.last_error if state and state.last_error else None,
'test_result': test_result,
}
test_result['generated'] = test_result['metadata']['1_copy_change_code']
# Close the sandbox
sandbox.close()
# Save the output
output = EvalOutput(
instance_id=instance.instance_id,
instance=instance.to_dict(),
instruction=instruction,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result=test_result,
)
return output
if __name__ == '__main__':
id_column = 'test_case_id'
args = parse_arguments()
dataset = load_dataset('lilbillbiscuit/biocoder_public')
biocoder_tests = dataset['test'].to_pandas()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
dataset = load_dataset('lilbillbiscuit/biocoder_public')
biocoder_tests = dataset['train'].to_pandas()
biocoder_tests['instance_id'] = biocoder_tests['test_case_id']
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
metadata = make_metadata(
llm_config,
args.dataset_name,
'biocoder',
args.agent_cls,
args.max_iterations,
args.eval_note,
args.eval_output_dir,
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
instances = prepare_dataset(biocoder_tests, output_file, args.eval_n_limit)
run_evaluation(
instances,
metadata,
output_file,
args.eval_num_workers,
process_instance,
id_column,
asyncio.run(
run_evaluation(
instances, metadata, output_file, args.eval_num_workers, process_instance
)
)

0
evaluation/biocoder/scripts/run_infer.sh Normal file → Executable file
View File

View File

@@ -0,0 +1,45 @@
import argparse
def get_changed_code(
    target_filepath,
    line_start,
    include_signature=False,
    marker_filepath='/testing_files/first_line_after_removed.txt',
):
    """Return the text that now occupies the removed function's slot.

    Args:
        target_filepath: file that contains the function under test.
        line_start: index of the first line to collect; one line earlier is
            used when `include_signature` is true.
        include_signature: also include the line just before `line_start`.
        marker_filepath: file holding the stop-marker line. The default is
            the path this function always read before the parameter existed.

    Returns:
        The collected lines joined with newlines. Collection stops just
        before the first line whose stripped text equals the stripped marker.
        If the marker is empty or never occurs, it runs to the end of file.

    Raises:
        OSError: if either file cannot be read.
    """
    offset = 1 if include_signature else 0

    with open(marker_filepath, 'r') as f:
        marker = f.read().strip()
    if not marker:
        # read() never returns None, so the old `is None` check was dead code.
        # An empty marker would match the first blank line and silently
        # truncate the result, so it is treated as "no marker" instead.
        print('First line after removed is empty')

    with open(target_filepath, 'r') as f:
        lines = f.read().split('\n')

    selected_lines = []
    for i in range(line_start - offset, len(lines)):
        if marker and lines[i].strip() == marker:
            break
        selected_lines.append(lines[i])
    return '\n'.join(selected_lines)
def copy_changed_code(
    target_filepath, generated_code_filepath, line_start, include_signature=False
):
    """Extract the changed code from `target_filepath` and write it to
    `generated_code_filepath`, replacing any existing content.

    The extraction runs before the output file is opened, so a failed
    extraction leaves the output file untouched.
    """
    snippet = get_changed_code(target_filepath, line_start, include_signature)
    with open(generated_code_filepath, 'w') as out_file:
        out_file.write(snippet)
if __name__ == '__main__':
    # Command-line entry point: parse the paths and options, then copy the
    # changed code into the generated-code file.
    cli = argparse.ArgumentParser()
    for flag, value_type in (
        ('--target_filepath', str),
        ('--generated_code_filepath', str),
        ('--line_start', int),
    ):
        cli.add_argument(flag, type=value_type, required=True)
    cli.add_argument('--include_signature', action='store_true')
    opts = cli.parse_args()
    copy_changed_code(
        opts.target_filepath,
        opts.generated_code_filepath,
        opts.line_start,
        opts.include_signature,
    )

View File

@@ -0,0 +1,74 @@
import argparse
import os
import re
from collections import defaultdict
def get_likely_indent_size(array_of_tabs) -> int:
    """Guess the indentation step from a sequence of per-line indent widths.

    Tallies each positive increase between neighbouring entries and returns
    the most common one, with ties going to the increase seen first. Returns
    4 when no entry is wider than its predecessor.
    """
    tally = defaultdict(int)
    neighbours = zip(array_of_tabs, array_of_tabs[1:])
    for narrower, wider in neighbours:
        increase = wider - narrower
        if increase > 0:
            tally[increase] += 1
    if tally:
        return int(max(tally, key=tally.get))
    return 4
def get_target_filepath(self):
    """Return `workspace_mount_path / <repo name> / filePath` for `self`.

    The repo name is the part of `biocoder_instance.repository` after the
    first '/'.
    """
    # NOTE(review): this takes `self` but is defined at module level and is
    # not called anywhere in this script. It looks like a leftover from the
    # former sandbox class; confirm before removing.
    instance = self.biocoder_instance
    repo_name = instance.repository.split('/')[1]
    return os.path.join(self.workspace_mount_path, repo_name, instance.filePath)
def remove_code(
    target_filepath: str,
    line_start: int,
    line_end: int,
    language: str,
    marker_filepath: str = '/testing_files/first_line_after_removed.txt',
):
    """Replace a function body in `target_filepath` with a TODO placeholder.

    Everything after the first `line_start` lines up to `line_end` is
    replaced by one placeholder comment (indented one guessed indent step
    deeper than the signature line) followed by two blank lines. The file is
    rewritten in place.

    Args:
        target_filepath: file to edit.
        line_start: number of leading lines to keep; line `line_start - 1`
            (0-based) is treated as the signature line.
        line_end: lines from this index onwards are kept after the placeholder.
        language: 'python' or 'java' (case-insensitive); selects the comment prefix.
        marker_filepath: where the stop-marker line is written. The default
            is the path this function always wrote before the parameter existed.

    Raises:
        KeyError: if `language` is neither python nor java.
        OSError: if a file cannot be read or written.
    """
    comment_prefix = {'python': '#', 'java': '//'}

    with open(target_filepath, 'r') as f:
        lines = f.read().split('\n')
    signature_line = lines[line_start - 1]

    def get_indent_size(s: str):
        # Width of the leading whitespace run.
        return len(re.match(r'\s*', s).group())

    indent_sizes = list(map(get_indent_size, lines))
    indent_size = get_likely_indent_size(indent_sizes)

    comment_indent_size = get_indent_size(signature_line) + indent_size
    lines = (
        lines[:line_start]
        + [
            f"{' '*comment_indent_size+comment_prefix[language.lower()]}TODO: replace with your code here"
        ]
        + ([''] * 2)
        + lines[line_end:]
    )
    # NOTE(review): index `line_start` is the placeholder comment inserted
    # above, which is never blank, so this scan stops immediately and the
    # marker is the placeholder itself. Confirm whether the scan was meant to
    # start after the placeholder.
    first_line_after_removed_index = line_start
    # Bounds check first: the original indexed the list before checking the
    # index, which would raise IndexError when running off the end.
    while (
        first_line_after_removed_index < len(lines)
        and len(lines[first_line_after_removed_index].strip()) == 0
    ):
        first_line_after_removed_index += 1
    if first_line_after_removed_index < len(lines):
        first_line_after_removed = lines[first_line_after_removed_index]
    else:
        # Only blank lines followed; record an empty marker.
        first_line_after_removed = ''
    print('FIRST LINE AFTER REMOVED: ', first_line_after_removed)
    with open(marker_filepath, 'w') as f:
        f.write(first_line_after_removed)

    with open(target_filepath, 'w') as f:
        f.write('\n'.join(lines))
if __name__ == '__main__':
    # Command-line entry point: parse the target file, line range and
    # language, then blank out that range in the file.
    cli = argparse.ArgumentParser()
    cli.add_argument('--target_filepath', type=str, required=True)
    for int_flag in ('--line_start', '--line_end'):
        cli.add_argument(int_flag, type=int, required=True)
    cli.add_argument('--language', type=str, required=True)
    opts = cli.parse_args()
    remove_code(
        opts.target_filepath,
        opts.line_start,
        opts.line_end,
        opts.language,
    )

View File

@@ -0,0 +1,36 @@
from dataclasses import dataclass
@dataclass
class BiocoderData:
    """One BioCoder benchmark record.

    Instances are built by unpacking a dataset row (``BiocoderData(**row)``),
    so the field names must match the row's keys exactly.
    """

    instance_id: str
    filePath: str
    numLines: int
    lineStart: int
    lineEnd: int
    signature: str
    comment: str
    content: str
    repository: str
    promptSummaryOnly: str
    contextCode: str
    goldenCode: str
    test_case_id: str
    language: str

    def to_dict(self):
        """Return every field of the record as a plain ``dict`` keyed by field name.

        `instance_id` is included so the dict covers all declared fields;
        it was previously left out when the field was added to the class.
        """
        return {
            'instance_id': self.instance_id,
            'filePath': self.filePath,
            'numLines': self.numLines,
            'lineStart': self.lineStart,
            'lineEnd': self.lineEnd,
            'signature': self.signature,
            'comment': self.comment,
            'content': self.content,
            'repository': self.repository,
            'promptSummaryOnly': self.promptSummaryOnly,
            'contextCode': self.contextCode,
            'goldenCode': self.goldenCode,
            'test_case_id': self.test_case_id,
            'language': self.language,
        }

View File

@@ -2,43 +2,14 @@
Implements evaluation of agents on BIRD introduced in [Can LLM Already Serve as A Database Interface? A BIg Bench for Large-Scale Database Grounded Text-to-SQLs](https://arxiv.org/abs/2305.03111). Please see [here](https://bird-bench.github.io/) for the reference implementation used in the paper.
## Setup Environment
## Setup Environment and LLM Configuration
Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to setup local develop environment for OpenDevin.
## Configure OpenDevin and your LLM
Create a `config.toml` file if it does not exist at the root of the workspace.
Add the following configurations:
```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
ssh_hostname = "localhost"
[sandbox]
enable_auto_lint = true
# TODO: Change these to the model you want to evaluate
[llm.eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[llm.eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```
Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
## Run Inference on Bird
```bash
./evaluation/bird/scripts/run_infer.sh eval_gpt4_1106_preview [model_config] [git-version]
./evaluation/bird/scripts/run_infer.sh [model_config] [git-version]
```
- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your

View File

@@ -1,12 +1,12 @@
import asyncio
import json
import logging
import os
import pathlib
import re
import shutil
import sqlite3
import subprocess
import zipfile
from typing import Any
import pandas as pd
from datasets import load_dataset
@@ -15,20 +15,24 @@ from tqdm import tqdm
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.events.action import MessageAction
from opendevin.llm.llm import LLM
config = load_app_config()
from opendevin.core.main import create_runtime, run_controller
from opendevin.events.action import CmdRunAction, MessageAction
from opendevin.events.observation import CmdOutputObservation
from opendevin.runtime.runtime import Runtime
def codeact_user_response(state: State) -> str:
@@ -62,6 +66,27 @@ AGENT_CLS_TO_INST_SUFFIX = {
}
def get_config(metadata: EvalMetadata) -> AppConfig:
    """Assemble the application config used to evaluate one BIRD instance.

    Args:
        metadata: Evaluation metadata; its ``agent_class``, ``max_iterations``
            and ``llm_config`` fields are copied into the returned config.

    Returns:
        An ``AppConfig`` using the ``eventstream`` runtime with an
        ``ubuntu:22.04`` sandbox image and no host workspace mounted.
    """
    sandbox_config = SandboxConfig(
        container_image='ubuntu:22.04',
        enable_auto_lint=True,
        use_host_network=False,
    )
    app_config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_devin=False,
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=sandbox_config,
        # Nothing from the host is mounted into the sandbox.
        workspace_base=None,
        workspace_mount_path=None,
    )
    app_config.set_llm_config(metadata.llm_config)
    return app_config
def execute_sql(db_path, gen_sql, gold_sql):
"""Execute the generated SQL and the ground truth SQL and compare the results."""
with sqlite3.connect(db_path) as conn:
@@ -76,12 +101,213 @@ def execute_sql(db_path, gen_sql, gold_sql):
return res
def get_test_result(instance, path, timeout=30):
LOCAL_DATASET_PATH = os.path.join(os.path.dirname(__file__), 'data')
def load_bird():
    """Download (if needed), preprocess, and load the BIRD dev set.

    The raw archive is unpacked under ``LOCAL_DATASET_PATH/dev`` and converted
    once into ``processed_dev.json``; later calls reuse whatever already exists.

    Returns:
        The result of ``load_dataset('json', ...)`` for the processed file,
        registered under the ``'test'`` split.
    """

    def _download_bird():
        """Download and unpack the BIRD dev archive.

        Returns:
            Path of the extracted ``dev`` directory.

        Raises:
            subprocess.CalledProcessError: If ``wget`` exits with a non-zero status.
        """
        devset_path = os.path.join(LOCAL_DATASET_PATH, 'dev')
        if os.path.exists(devset_path):
            logger.info(f'{LOCAL_DATASET_PATH} folder already exists.')
            return devset_path

        logger.info(
            f'{LOCAL_DATASET_PATH} folder does not exist, starting download and extraction...'
        )
        os.makedirs(LOCAL_DATASET_PATH, exist_ok=True)
        download_url = 'https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip'
        download_path = os.path.join(LOCAL_DATASET_PATH, 'dev.zip')
        if not os.path.exists(download_path):
            logger.info('Start Downloading...')
            try:
                subprocess.run(
                    ['wget', download_url, '-O', download_path], check=True
                )
            except subprocess.CalledProcessError:
                # A failed download may leave a partial/empty file behind; remove
                # it so the next run retries instead of trying to unzip it.
                if os.path.exists(download_path):
                    os.remove(download_path)
                raise
            logger.info('Download completed.')

        logger.info('Start Extracting...')
        os.makedirs(devset_path, exist_ok=True)
        with zipfile.ZipFile(download_path, 'r') as zip_ref:
            zip_ref.extractall(devset_path)
        # move everything in 'dev_20240627' to the root folder
        nested_root = os.path.join(devset_path, 'dev_20240627')
        for file in os.listdir(nested_root):
            os.rename(
                os.path.join(nested_root, file),
                os.path.join(devset_path, file),
            )
        os.rmdir(nested_root)
        logger.info('Extraction completed.')

        # extract databases
        database_path = os.path.join(devset_path, 'dev_databases.zip')
        assert os.path.exists(database_path)
        logger.info('Start Extracting...')
        with zipfile.ZipFile(database_path, 'r') as zip_ref:
            zip_ref.extractall(devset_path)
        logger.info('Extraction completed.')
        return devset_path

    def _extract_create_table_prompt(db_path, limit_value=0):
        """Render the schema (and optional sample rows) of a SQLite database.

        Args:
            db_path: Path to the SQLite database file.
            limit_value: Number of sample rows to include per table; ``0``
                (the default) emits only the ``CREATE TABLE`` statements.

        Returns:
            A string with one ``CREATE TABLE ...;`` statement per table, each
            optionally followed by a ``/* ... */`` comment with sample rows.
        """
        table_query = "SELECT * FROM sqlite_master WHERE type='table';"
        # One connection for all queries, closed explicitly: the sqlite3
        # connection context manager only manages transactions, not closing.
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            tables = cursor.execute(table_query).fetchall()
            prompt = ''
            for table in tables:
                table_name = table[1]
                create_table_statement = table[-1]
                table_info_query = f'PRAGMA table_info(`{table_name}`);'
                # Quote the table name here too, matching the PRAGMA above, so
                # names with spaces or reserved words do not break the query.
                top_k_row_query = (
                    f'SELECT * FROM `{table_name}` LIMIT {limit_value};'
                )
                try:
                    headers = [
                        x[1] for x in cursor.execute(table_info_query).fetchall()
                    ]
                except Exception:
                    logger.error(
                        f'Error Connection: {table_info_query}, {top_k_row_query}'
                    )
                    # Propagate the failure instead of exiting with status 0,
                    # which would report success to the calling shell.
                    raise
                prompt += create_table_statement + ';\n'
                if limit_value > 0:
                    top_k_rows = cursor.execute(top_k_row_query).fetchall()
                    prompt += (
                        f"/*\n3 example rows:\n{top_k_row_query}\n{' '.join(headers)}\n"
                    )
                    for row in top_k_rows:
                        # Map NULL to '' *before* stringifying; otherwise it
                        # is rendered as the literal text 'None'.
                        cells = ['' if x is None else str(x) for x in row]
                        prompt += ' '.join(cells) + '\n'
                    prompt += '*/\n'
                prompt += '\n'
            return prompt
        finally:
            conn.close()

    def _create_prompt(e, database_path):
        """Create the text-to-SQL prompt for one raw BIRD example.

        Args:
            e: Raw example; the keys ``db_id``, ``evidence`` and ``question``
                are read.
            database_path: Directory containing one sub-directory per ``db_id``.
        """
        db_id = e['db_id']
        db_path = pathlib.Path(database_path) / db_id / f'{db_id}.sqlite'
        # Extract the CREATE TABLE statements and sample data from the database
        prompt = _extract_create_table_prompt(db_path)
        prompt += f"-- External Knowledge: {e['evidence']}\n\n"
        prompt += '-- Using valid SQLite and understanding External Knowledge, answer the following questions for the tables provided above.\n\n'
        prompt += '-- Using valid SQLite, answer the following questions for the tables provided above.\n'
        prompt += f"Question: {e['question']}\n"
        return prompt

    def _process_bird(dataset_path):
        """Convert ``dev.json`` into ``processed_dev.json`` and load it.

        Args:
            dataset_path: The extracted ``dev`` directory returned by
                ``_download_bird`` (i.e. ``LOCAL_DATASET_PATH/dev``).
        """
        processed_path = os.path.join(dataset_path, 'processed_dev.json')
        if not os.path.exists(processed_path):
            logger.info(
                f'{processed_path} folder does not exist, starting processing...'
            )
            raw_data_path = os.path.join(dataset_path, 'dev.json')
            database_path = os.path.join(dataset_path, 'dev_databases')
            processed_data = []
            with pathlib.Path(raw_data_path).open('r') as f:
                data = json.load(f)
                for e in tqdm(data):
                    item = {
                        # Sequential id: position of the example in dev.json.
                        'instance_id': f'{len(processed_data)}',
                        'db_path': os.path.join(
                            database_path, e['db_id'], f"{e['db_id']}.sqlite"
                        ),
                        'db_id': e['db_id'],
                        'instruction': _create_prompt(e, database_path),
                        'SQL': e['SQL'],
                    }
                    processed_data.append(item)
            with pathlib.Path(processed_path).open('w') as f:
                json.dump(processed_data, f, indent=2)
                logger.info(f'Processed data saved to {processed_path}')
        else:
            logger.info(f'{processed_path} folder already exists.')
        bird_dataset = load_dataset('json', data_files={'test': processed_path})
        return bird_dataset

    raw_dataset_path = _download_bird()
    bird_dataset = _process_bird(raw_dataset_path)
    return bird_dataset
async def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,  # `db_id` is read to locate the database file
):
    """Copy the instance's SQLite database into the sandbox before the agent runs.

    The file ``<db_id>.sqlite`` is taken from the local ``dev/dev_databases``
    folder, copied to ``/workspace``, and its presence is verified with ``ls``.

    Raises:
        AssertionError: If the listing command fails or the database file does
            not appear in its output.
    """
    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")

    # Copy the database to the workspace
    sqlite_name = f'{instance.db_id}.sqlite'
    host_db_file = os.path.join(
        LOCAL_DATASET_PATH, 'dev', 'dev_databases', instance.db_id, sqlite_name
    )
    await runtime.copy_to(host_db_file, '/workspace')

    # Check the database is copied
    listing: CmdOutputObservation = await runtime.run_action(
        CmdRunAction(command='cd /workspace && ls -l', keep_prompt=False)
    )
    logger.info(listing, extra={'msg_type': 'OBSERVATION'})
    assert listing.exit_code == 0
    assert sqlite_name in listing.content

    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
async def complete_runtime(
runtime: Runtime,
instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name
) -> dict[str, Any]:
"""Complete the runtime for the agent.
This function is called before the runtime is used to run the agent.
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
obs: CmdOutputObservation
timeout = 30
test_result = {'result': {}, 'metadata': {}}
# Read the generated python file
with open(path, 'r') as f:
gen_file = f.read()
instance_id = instance.instance_id.replace('/', '__')
path = os.path.join('/workspace', f'{instance_id}.py')
action = CmdRunAction(
command=f'cat {path}',
keep_prompt=False,
)
obs = await runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
if obs.exit_code != 0:
test_result['result'] = {'passed': 0, 'status': 'error'}
return test_result
gen_file = obs.content.strip().replace('\r\n', '\n')
# Extract the SQL from the python file
gen_sql = ''
@@ -96,7 +322,13 @@ def get_test_result(instance, path, timeout=30):
# Execute the SQL
try:
res = func_timeout(
timeout, execute_sql, args=(instance.db_path, gen_sql, gold_sql)
timeout,
execute_sql,
args=(
instance.db_path,
gen_sql,
gold_sql,
),
)
status = 'success'
except FunctionTimedOut:
@@ -114,68 +346,28 @@ def get_test_result(instance, path, timeout=30):
'gen_sql': gen_sql,
'gold_sql': gold_sql,
}
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
return test_result
def process_instance(
async def process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
workspace_mount_path = os.path.join(
config.workspace_mount_path, 'bird_eval_workspace'
)
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
# reset workspace to config
config.workspace_mount_path = workspace_mount_path
# Copy the database to the workspace
db_root = os.path.join(
config.workspace_base, 'evaluation_bird/dev/dev_databases', instance.db_id
)
target_path = os.path.join(workspace_mount_path, f'{instance.db_id}')
if not os.path.exists(target_path):
logger.info(f'Copying database from {db_root} to {target_path}...')
shutil.copytree(db_root, target_path)
# Set up the database path
database_path = os.path.join(instance.db_id, f'{instance.db_id}.sqlite')
) -> EvalOutput:
config = get_config(metadata)
# use session id for concurrent evaluation
sid = instance.task_id.replace('/', '__')
instance_id = instance.instance_id.replace('/', '__')
# Set up the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
# Set up logger
log_file = os.path.join(
metadata.eval_output_dir,
'logs',
f'instance_{sid}.log',
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {instance.task_id}.\nLOG: tail -f {log_file}'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {instance_id}.')
# Create file with BIRD instance
database_path = os.path.join('/workspace', f'{instance.db_id}.sqlite')
statements = f"""
import sqlite3
def execute_sql(db_path, sql):
@@ -192,12 +384,12 @@ def process_instance(
result = execute_sql(db_path, sql)
print(result)
"""
path = os.path.join(config.workspace_mount_path, f'{sid}.py')
instruction = (
f'You are a SQL expert and need to complete the following text-to-SQL tasks.'
f'\n\n{instance.instruction}\n\n'
'Please write the SQL in one line without line breaks.'
f'And write a new python file named {sid}.py to call the SQL you wrote.'
f'And write a new python file named {instance_id}.py to call the SQL you wrote.'
'You need to follow the code template below:'
f'\n\n{statements}\n\n'
'Environment has been set up for you to start working.'
@@ -208,23 +400,21 @@ def process_instance(
'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
)
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
runtime = await create_runtime(config, sid=instance_id)
await initialize_runtime(runtime, instance)
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
max_budget_per_task=config.max_budget_per_task,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
sid=sid,
)
state: State | None = await run_controller(
config=config,
task_str=instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class],
runtime=runtime,
)
# ======= Attempt to evaluate the agent's edits =======
test_result = get_test_result(instance, path)
test_result = await complete_runtime(runtime, instance)
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
@@ -238,162 +428,43 @@ def process_instance(
histories = state.history.compatibility_for_eval_history_pairs()
# Save the output
output = {
'task_id': instance.task_id,
'instruction': instruction,
'metadata': metadata.model_dump(),
'history': histories,
'metrics': metrics,
'error': state.last_error if state and state.last_error else None,
'test_result': test_result,
}
output = EvalOutput(
instance_id=instance.instance_id,
instruction=instruction,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result=test_result,
)
return output
def load_bird():
    """Download the BIRD dev set if necessary, preprocess it, and return the loaded dataset."""
    # download_bird() yields the extracted dev directory that process_bird() consumes.
    return process_bird(download_bird())
def download_bird():
    """Fetch and unpack the BIRD dev archive below ``config.workspace_base``.

    Returns:
        Path of the ``dev`` directory inside the ``evaluation_bird`` folder.
        When the ``evaluation_bird`` folder already exists nothing is
        downloaded or extracted.
    """
    dataset_path = os.path.join(config.workspace_base, 'evaluation_bird')
    devset_path = os.path.join(dataset_path, 'dev')

    if os.path.exists(dataset_path):
        logger.info(f'{dataset_path} folder already exists.')
        return devset_path

    logger.info(
        f'{dataset_path} folder does not exist, starting download and extraction...'
    )
    os.makedirs(dataset_path, exist_ok=True)

    archive_path = os.path.join(dataset_path, 'dev.zip')
    logger.info('Start Downloading...')
    subprocess.run(
        [
            'wget',
            'https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip',
            '-O',
            archive_path,
        ]
    )
    logger.info('Download completed.')

    logger.info('Start Extracting...')
    subprocess.run(['unzip', archive_path, '-d', dataset_path])
    # The dev archive contains a second archive holding the databases.
    databases_archive = os.path.join(devset_path, 'dev_databases.zip')
    subprocess.run(['unzip', databases_archive, '-d', devset_path])
    logger.info('Extraction completed.')
    return devset_path
def process_bird(dataset_path):
    """Build ``processed_dev.json`` from ``dev.json`` (once) and load it.

    Args:
        dataset_path: Directory holding ``dev.json`` and ``dev_databases``.

    Returns:
        The result of ``load_dataset('json', ...)`` with the processed
        examples under the ``'test'`` split.
    """
    processed_path = os.path.join(dataset_path, 'processed_dev.json')

    if os.path.exists(processed_path):
        logger.info(f'{processed_path} folder already exists.')
    else:
        logger.info(f'{processed_path} folder does not exist, starting processing...')
        database_path = os.path.join(dataset_path, 'dev_databases')
        with pathlib.Path(os.path.join(dataset_path, 'dev.json')).open('r') as src:
            examples = json.load(src)
            processed_data = []
            # task_id is the zero-based position of the example in dev.json.
            for index, e in enumerate(tqdm(examples)):
                db_id = e['db_id']
                processed_data.append(
                    {
                        'task_id': str(index),
                        'db_path': os.path.join(
                            database_path, db_id, f'{db_id}.sqlite'
                        ),
                        'db_id': db_id,
                        'instruction': create_prompt(e, database_path),
                        'SQL': e['SQL'],
                    }
                )
        with pathlib.Path(processed_path).open('w') as dst:
            json.dump(processed_data, dst, indent=2)
            logger.info(f'Processed data saved to {processed_path}')

    return load_dataset('json', data_files={'test': processed_path})
def extract_create_table_prompt(db_path, limit_value=0):
    """Render the schema (and optional sample rows) of a SQLite database.

    Args:
        db_path: Path to the SQLite database file.
        limit_value: Number of sample rows to include per table; ``0`` (the
            default) emits only the ``CREATE TABLE`` statements.

    Returns:
        A string with one ``CREATE TABLE ...;`` statement per table, each
        optionally followed by a ``/* ... */`` comment containing sample rows
        (NULL values are rendered as empty strings).

    Raises:
        Exception: Whatever ``sqlite3`` raises when a table's column info
            cannot be read; the failing queries are logged first.
    """
    table_query = "SELECT * FROM sqlite_master WHERE type='table';"
    # One connection for all queries, closed explicitly: the sqlite3 connection
    # context manager only manages transactions, it does not close.
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        tables = cursor.execute(table_query).fetchall()
        prompt = ''
        for table in tables:
            table_name = table[1]
            create_table_statement = table[-1]
            table_info_query = f'PRAGMA table_info(`{table_name}`);'
            # Quote the table name here too, matching the PRAGMA above, so
            # names with spaces or reserved words do not break the query.
            top_k_row_query = f'SELECT * FROM `{table_name}` LIMIT {limit_value};'
            try:
                headers = [x[1] for x in cursor.execute(table_info_query).fetchall()]
            except Exception:
                logger.error(f'Error Connection: {table_info_query}, {top_k_row_query}')
                # Propagate the failure instead of exiting with status 0,
                # which would report success to the calling shell.
                raise
            prompt += create_table_statement + ';\n'
            if limit_value > 0:
                top_k_rows = cursor.execute(top_k_row_query).fetchall()
                prompt += (
                    f"/*\n3 example rows:\n{top_k_row_query}\n{' '.join(headers)}\n"
                )
                for row in top_k_rows:
                    # Map NULL to '' *before* stringifying; otherwise it is
                    # rendered as the literal text 'None'.
                    cells = ['' if x is None else str(x) for x in row]
                    prompt += ' '.join(cells) + '\n'
                prompt += '*/\n'
            prompt += '\n'
        return prompt
    finally:
        conn.close()
def create_prompt(e, database_path):
    """Build the text-to-SQL prompt for one raw BIRD example.

    Args:
        e: Raw example; the keys ``db_id``, ``evidence`` and ``question`` are read.
        database_path: Directory containing one sub-directory per ``db_id``.

    Returns:
        The schema dump of the example's database followed by the external
        knowledge, the two instruction comment lines, and the question.
    """
    db_id = e['db_id']
    sqlite_file = pathlib.Path(database_path) / db_id / f'{db_id}.sqlite'
    sections = [
        # Schema section: CREATE TABLE statements extracted from the database.
        extract_create_table_prompt(sqlite_file),
        f"-- External Knowledge: {e['evidence']}\n\n",
        '-- Using valid SQLite and understanding External Knowledge, answer the following questions for the tables provided above.\n\n',
        '-- Using valid SQLite, answer the following questions for the tables provided above.\n',
        f"Question: {e['question']}\n",
    ]
    return ''.join(sections)
if __name__ == '__main__':
id_column = 'task_id'
args = parse_arguments()
bird_dataset = load_bird()
dataset = bird_dataset['test'].to_pandas()
dataset.rename(columns={'task_id': 'instance_id'}, inplace=True)
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
metadata = make_metadata(
llm_config,
args.dataset_name,
'BIRD',
args.agent_cls,
args.max_iterations,
args.eval_note,
args.eval_output_dir,
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
instances = prepare_dataset(dataset, output_file, args.eval_n_limit)
run_evaluation(
instances,
metadata,
output_file,
args.eval_num_workers,
process_instance,
id_column,
asyncio.run(
run_evaluation(
instances, metadata, output_file, args.eval_num_workers, process_instance
)
)

0
evaluation/bird/scripts/run_infer.sh Normal file → Executable file
View File

View File

@@ -5,30 +5,9 @@ Some of OpenDevin's agent supports agent delegation action, for example, CodeAct
This evaluation tests whether CodeActAgent can correctly delegate the instruction from WebArena and MiniWob benchmark to the BrowsingAgent.
If so, the browsing performance upper-bound of CodeActAgent will be the performance of BrowsingAgent.
## Setup Environment and LLM Configuration
## Setup Environment
Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
## Configure OpenDevin and your LLM
Create a `config.toml` file if it does not exist at the root of the workspace.
Add the following configurations:
```toml
# TODO: Change these to the model you want to evaluate
[llm.eval_gpt4_1106_preview_llm]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[llm.eval_some_openai_compatible_model_llm]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```
Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
## Run Inference

View File

@@ -1,5 +1,4 @@
import asyncio
import logging
import os
import re
@@ -9,56 +8,61 @@ from datasets import load_dataset
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM
config = load_app_config()
from opendevin.core.main import create_runtime, run_controller
# Only CodeActAgent can delegate to BrowsingAgent
SUPPORTED_AGENT_CLS = {'CodeActAgent'}
def process_instance(
def get_config(metadata: EvalMetadata) -> AppConfig:
    """Assemble the application config for the browsing-delegation evaluation.

    Args:
        metadata: Evaluation metadata; its ``agent_class``, ``max_iterations``
            and ``llm_config`` fields are copied into the returned config.

    Returns:
        An ``AppConfig`` using the ``eventstream`` runtime with an
        ``ubuntu:22.04`` sandbox image, auto-lint disabled, and no host
        workspace mounted.

    Raises:
        AssertionError: If ``metadata.max_iterations`` is not exactly 1.
    """
    assert (
        metadata.max_iterations == 1
    ), 'max_iterations must be 1 for browsing delegation evaluation.'

    sandbox_config = SandboxConfig(
        container_image='ubuntu:22.04',
        enable_auto_lint=False,
        use_host_network=False,
    )
    app_config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_devin=False,
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=sandbox_config,
        workspace_base=None,
        workspace_mount_path=None,
    )
    app_config.set_llm_config(metadata.llm_config)
    return app_config
async def process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
env_id = instance.instance_id
) -> EvalOutput:
config = get_config(metadata)
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
# Set up logger
log_file = os.path.join(
metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {env_id}.')
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
instruction = (
f'You can delegate browsing tasks to a browser agent. '
@@ -67,21 +71,14 @@ def process_instance(
f'NOTE: You should copy the "query" as is into the <execute_browse> tag. DO NOT change ANYTHING in the query.'
)
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
max_budget_per_task=config.max_budget_per_task,
sid=env_id,
)
runtime = await create_runtime(config, sid=instance.instance_id)
state: State | None = await run_controller(
config=config,
task_str=instruction,
runtime=runtime,
)
# ======= Attempt to evaluate the agent's environment impact =======
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
@@ -115,20 +112,19 @@ def process_instance(
result['is_exact_match'] = is_exact_match
# Save the output
output = {
'instance_id': env_id,
'instruction': instruction,
'metadata': metadata.model_dump(),
'history': histories,
'metrics': metrics,
'error': state.last_error if state and state.last_error else None,
'test_result': {
output = EvalOutput(
instance_id=instance.instance_id,
instruction=instruction,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result={
'query': instance.instruction,
'action': last_delegate_action,
'result': result,
},
}
)
return output
@@ -138,9 +134,13 @@ if __name__ == '__main__':
dataset = load_dataset('OpenDevin/eval-browsing-instructions')
dataset = dataset['train'].to_pandas()
assert dataset.columns.tolist() == ['instance_id', 'instruction']
id_column = 'instance_id'
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
metadata = make_metadata(
llm_config,
@@ -150,18 +150,20 @@ if __name__ == '__main__':
args.eval_note,
args.eval_output_dir,
)
if metadata.agent_class not in SUPPORTED_AGENT_CLS:
raise ValueError(
f'Agent class {metadata.agent_class} not supported with AgentDelegation.'
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
run_evaluation(
instances,
metadata,
output_file,
args.eval_num_workers,
process_instance,
id_column,
instances = prepare_dataset(dataset, output_file, args.eval_n_limit)
asyncio.run(
run_evaluation(
instances,
metadata,
output_file,
args.eval_num_workers,
process_instance,
)
)

View File

@@ -2,9 +2,9 @@
This folder contains evaluation harness for evaluating agents on the [GAIA benchmark](https://arxiv.org/abs/2311.12983).
## Configure OpenDevin and your LLM
## Setup Environment and LLM Configuration
Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.
Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
## Run the evaluation
We are using the GAIA dataset hosted on [Hugging Face](https://huggingface.co/datasets/gaia-benchmark/GAIA).

View File

@@ -1,10 +1,7 @@
import asyncio
import logging
import functools
import os
import pathlib
import re
import shutil
from functools import partial
import huggingface_hub
import pandas as pd
@@ -13,28 +10,31 @@ from datasets import load_dataset
from evaluation.gaia.scorer import question_scorer
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
codeact_user_response,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
from opendevin.core.logger import get_console_handler
from opendevin.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.events.action import CmdRunAction, MessageAction
from opendevin.llm.llm import LLM
from opendevin.core.main import create_runtime, run_controller
from opendevin.events.action import AgentFinishAction, CmdRunAction, MessageAction
from opendevin.events.observation import CmdOutputObservation
from opendevin.runtime.runtime import Runtime
config = load_app_config()
DATASET_CACHE_DIR = '~/.cache/open-devin/evals/gaia'
DATASET_CACHE_DIR = os.path.expanduser(DATASET_CACHE_DIR)
DATASET_CACHE_DIR = os.path.join(os.path.dirname(__file__), 'data')
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': partial(codeact_user_response, encapsulate_solution=True),
'CodeActAgent': functools.partial(codeact_user_response, encapsulate_solution=True),
}
AGENT_CLS_TO_INST_SUFFIX = {
@@ -42,151 +42,174 @@ AGENT_CLS_TO_INST_SUFFIX = {
}
def process_instance(
def get_config(metadata: EvalMetadata) -> AppConfig:
    """Assemble the application config used to evaluate one GAIA instance.

    Args:
        metadata: Evaluation metadata; its ``agent_class``, ``max_iterations``
            and ``llm_config`` fields are copied into the returned config.

    Returns:
        An ``AppConfig`` using the ``eventstream`` runtime with an
        ``ubuntu:22.04`` sandbox image and no host workspace mounted.
    """
    sandbox_config = SandboxConfig(
        container_image='ubuntu:22.04',
        enable_auto_lint=True,
        use_host_network=False,
    )
    app_config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_devin=False,
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=sandbox_config,
        # Nothing from the host is mounted into the sandbox.
        workspace_base=None,
        workspace_mount_path=None,
    )
    app_config.set_llm_config(metadata.llm_config)
    return app_config
async def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,  # `file_name` is read to decide whether a file is staged
):
    """Prepare ``/workspace`` in the sandbox before the agent runs.

    Creates ``/workspace``, and when the instance has a non-empty
    ``file_name`` copies that attachment in and renames it to
    ``file.<extension>``. Finally changes into ``/workspace``.

    Raises:
        AssertionError: If any sandbox command exits non-zero, if the data
            split is unset, or if the attachment is missing locally.
    """

    async def _run_checked(command: str) -> None:
        # Log the action, execute it in the sandbox, and require exit code 0.
        action = CmdRunAction(command=command)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs: CmdOutputObservation = await runtime.run_action(action)
        assert obs.exit_code == 0

    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")

    await _run_checked('mkdir -p /workspace')

    file_name = instance['file_name']
    if file_name != '':
        # if this question comes with a file, we need to save it to the workspace
        # NOTE(review): `metadata` is not a parameter of this function; it is
        # looked up as a module-level name at call time — confirm it is defined
        # in every process that calls this.
        assert metadata.data_split is not None
        src_file = os.path.join(
            DATASET_CACHE_DIR, '2023', metadata.data_split, file_name
        )
        assert os.path.exists(src_file)
        dest_file = os.path.join('/workspace', file_name)
        await runtime.copy_to(src_file, dest_file)

        # rename to file.extension_name
        extension_name = file_name.split('.')[-1]
        await _run_checked(
            f'mv /workspace/{file_name} /workspace/file.{extension_name}'
        )

    await _run_checked('cd /workspace')

    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
async def process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
# create process-specific workspace dir
# we will create a workspace directory for EACH process
# so that different agent don't interfere with each other.
old_workspace_mount_path = config.workspace_mount_path
) -> EvalOutput:
config = get_config(metadata)
try:
workspace_mount_path = os.path.join(
config.workspace_mount_path, '_eval_workspace'
)
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
config.workspace_mount_path = workspace_mount_path
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance['instance_id'], log_dir)
else:
logger.info(f'Starting evaluation for instance {instance["instance_id"]}.')
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
eval_output_dir = metadata.eval_output_dir
if reset_logger:
# Set up logger
log_file = os.path.join(
eval_output_dir, 'logs', f'instance_{instance["task_id"]}.log'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {instance["task_id"]}.\nLOG: tail -f {log_file}'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
if instance['file_name'] != '':
extension_name = instance['file_name'].split('.')[-1]
dest_file = os.path.join('/workspace', f'file.{extension_name}')
else:
dest_file = None
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
if instance['file_name'] != '':
# if this question comes with a file, we need to save it to the workspace
assert metadata.data_split is not None
src_file = os.path.join(
DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
)
extension_name = instance['file_name'].split('.')[-1]
dest_file = os.path.join(workspace_mount_path, f'file.{extension_name}')
shutil.copyfile(src_file, dest_file)
logger.info(f'File copied to {dest_file}')
else:
dest_file = None
# Prepare instruction
instruction = f"{instance['Question']}\n"
logger.info(f'Instruction: {instruction}')
if dest_file:
instruction += f"\n\nThe mentioned file is provided in the workspace at: {dest_file.split('/')[-1]}"
# Prepare instruction
instruction = f"{instance['Question']}\n"
logger.info(f'Instruction: {instruction}')
if dest_file:
instruction += f"\n\nThe mentioned file is provided in the workspace at: {dest_file.split('/')[-1]}"
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
instruction += 'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
instruction += (
'For example: The answer to the question is <solution> 42 </solution>.\n'
)
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(metadata.agent_class, '')
logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
instruction += 'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
instruction += (
'For example: The answer to the question is <solution> 42 </solution>.\n'
)
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent.__class__.__name__, '')
logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
runtime = await create_runtime(config, sid=instance['instance_id'])
await initialize_runtime(runtime, instance)
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
max_budget_per_task=config.max_budget_per_task,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
sid=instance['task_id'],
)
)
# ======= Attempt to evaluate the agent's edits =======
# If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = await run_controller(
config=config,
task_str=instruction,
runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class],
)
# ======= Attempt to evaluate the agent's edits =======
# If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
if state is None:
raise ValueError('State should not be None.')
model_answer_raw = ''
# get the last message or thought from the agent
for event in state.history.get_events(reverse=True):
if isinstance(event, CmdRunAction) and event.source == 'agent':
model_answer_raw = ''
# get the last message or thought from the agent
for event in state.history.get_events(reverse=True):
if event.source == 'agent':
if isinstance(event, AgentFinishAction):
model_answer_raw = event.thought
elif isinstance(event, MessageAction) and event.source == 'agent':
break
elif isinstance(event, CmdRunAction):
model_answer_raw = event.thought
break
elif isinstance(event, MessageAction):
model_answer_raw = event.content
break
# attempt to parse model_answer
model_answer = re.findall(r'<solution>(.*?)</solution>', model_answer_raw)
if len(model_answer) == 0:
logger.warning(f'Failed to parse model answer: {model_answer_raw}')
model_answer = model_answer_raw
else:
model_answer = model_answer[0]
# attempt to parse model_answer
model_answer = re.findall(r'<solution>(.*?)</solution>', model_answer_raw)
if len(model_answer) == 0:
logger.warning(f'Failed to parse model answer: {model_answer_raw}')
model_answer = model_answer_raw
else:
model_answer = model_answer[0]
logger.info(
f'Final message: {model_answer} | Ground truth: {instance["Final answer"]}'
)
score = question_scorer(
model_answer=model_answer, ground_truth=instance['Final answer']
)
test_result = {
'score': score,
'model_answer_raw': model_answer_raw,
'model_answer': model_answer,
'ground_truth': instance['Final answer'],
}
metrics = state.metrics.get() if state.metrics else None
logger.info(
f'Final message: {model_answer} | Ground truth: {instance["Final answer"]}'
)
score = question_scorer(
model_answer=model_answer, ground_truth=instance['Final answer']
)
test_result = {
'score': score,
'model_answer_raw': model_answer_raw,
'model_answer': model_answer,
'ground_truth': instance['Final answer'],
}
metrics = state.metrics.get() if state.metrics else None
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
# Save the output
output = {
'instance_id': instance['task_id'],
'instance': instance,
'instruction': instance['Question'],
'metadata': metadata.model_dump(),
'history': histories,
'metrics': metrics,
'error': state.last_error if state and state.last_error else None,
'test_result': test_result,
}
except Exception:
logger.error('Process instance failed')
raise
finally:
config.workspace_mount_path = old_workspace_mount_path
# Save the output
output = EvalOutput(
instance_id=instance['instance_id'],
instance=instance.to_dict(),
instruction=instance['Question'],
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result=test_result,
)
return output
@@ -197,13 +220,19 @@ if __name__ == '__main__':
type=str,
help='gaia level to evaluate, eg. 2023_level1',
)
parser.add_argument(
'--data-split',
type=str,
help='data split to evaluate, eg. test',
default='validation',
)
args, _ = parser.parse_known_args()
if args.directory:
config.workspace_base = os.path.abspath(args.directory)
logger.info(f'Setting workspace base to {config.workspace_base}')
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
metadata = make_metadata(
llm_config=llm_config,
@@ -222,20 +251,18 @@ if __name__ == '__main__':
repo_type='dataset',
local_dir=DATASET_CACHE_DIR,
)
gaia_tests = dataset[metadata.data_split]
gaia_tests = dataset[metadata.data_split].to_pandas()
gaia_tests.rename(columns={'task_id': 'instance_id'}, inplace=True)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
prepared_dataset = prepare_dataset(
gaia_tests.to_pandas(), output_file, args.eval_n_limit, 'task_id'
)
prepared_dataset = prepare_dataset(gaia_tests, output_file, args.eval_n_limit)
agent = Agent.get_cls(args.agent_cls)(llm=LLM(config.llm))
run_evaluation(
dataset=prepared_dataset,
metadata=metadata,
output_file=output_file,
num_workers=args.eval_num_workers,
process_instance_func=process_instance,
id_column='task_id',
asyncio.run(
run_evaluation(
dataset=prepared_dataset,
metadata=metadata,
output_file=output_file,
num_workers=args.eval_num_workers,
process_instance_func=process_instance,
)
)

0
evaluation/gaia/scripts/run_infer.sh Normal file → Executable file
View File

View File

@@ -2,20 +2,16 @@
This folder contains the evaluation harness we built on top of the original [Gorilla APIBench](https://github.com/ShishirPatil/gorilla) ([paper](https://arxiv.org/pdf/2305.15334)).
## Setup Environment
## Setup Environment and LLM Configuration
Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to setup local development environment for OpenDevin.
## Configure OpenDevin and your LLM
Run `make setup-config` to set up the `config.toml` file if it does not exist at the root of the workspace.
Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
## Run Inference on APIBench Instances
Make sure your Docker daemon is running, then run this bash script:
```bash
bash evaluation/gorilla/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [hubs]
./evaluation/gorilla/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [hubs]
```
where `model_config` is mandatory, while all other arguments are optional.
@@ -39,5 +35,5 @@ Note: in order to use `eval_limit`, you must also set `agent`; in order to use `
For example,
```bash
bash evaluation/gorilla/scripts/run_infer.sh llm 0.6.2 CodeActAgent 10 th
./evaluation/gorilla/scripts/run_infer.sh llm 0.6.2 CodeActAgent 10 th
```

View File

@@ -1,59 +1,28 @@
import asyncio
import json
import logging
import multiprocessing as mp
import os
import pathlib
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
import pandas as pd
from opendevin.controller.agent import Agent
from evaluation.gorilla.utils import encode_question, get_data_for_hub
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
codeact_user_response,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
)
from opendevin.controller.state.state import State
from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
from opendevin.core.logger import get_console_handler
from opendevin.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.events.action import MessageAction
from opendevin.llm.llm import LLM
from .utils import encode_question, get_data
config = load_app_config()
def cleanup():
print('Cleaning up child processes...')
for process in mp.active_children():
print(f'Terminating child process: {process.name}')
process.terminate()
process.join()
def codeact_user_response(state: State) -> str:
    """Produce the canned "user" reply sent back to the agent.

    The reply always tells the agent to run `exit`. Once more than two
    user-sourced `MessageAction` events are present in `state.history`, a
    sentence telling the agent it may give up is appended.
    """
    base_reply = (
        #'Please continue working on the task on whatever approach you think is suitable.\n'
        'Please run the following command: <execute_bash> exit </execute_bash>.\n'
        #'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
    )
    if not state.history:
        return base_reply

    # count how many times the (fake) user has already answered the agent
    user_turns = 0
    for event in state.history.get_events():
        if isinstance(event, MessageAction) and event.source == 'user':
            user_turns += 1

    if user_turns > 2:
        # the agent has tried 3 times: let it know that it can give up
        give_up_hint = (
            'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
        )
        return base_reply + give_up_hint
    return base_reply
from opendevin.core.main import create_runtime, run_controller
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
@@ -64,105 +33,95 @@ AGENT_CLS_TO_INST_SUFFIX = {
}
def process_instance(agent, question_id, question, metadata, reset_logger: bool = True):
# create process-specific workspace dir
# we will create a workspace directory for EACH process
# so that different agent don't interfere with each other.
old_workspace_mount_path = config.workspace_mount_path
try:
workspace_mount_path = os.path.join(
config.workspace_mount_path, '_eval_workspace'
)
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
config.workspace_mount_path = workspace_mount_path
def get_config(
    metadata: EvalMetadata,
) -> AppConfig:
    """Build the application config used for one evaluation run.

    Args:
        metadata: Evaluation metadata; its `agent_class`, `max_iterations`
            and `llm_config` are copied into the returned config.

    Returns:
        An `AppConfig` using the 'eventstream' runtime and an `ubuntu:22.04`
        sandbox, with no workspace mounted.
    """
    sandbox_config = SandboxConfig(
        container_image='ubuntu:22.04',
        enable_auto_lint=True,
        use_host_network=False,
    )
    app_config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_devin=False,
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=sandbox_config,
        # the workspace is deliberately left unmounted
        workspace_base=None,
        workspace_mount_path=None,
    )
    app_config.set_llm_config(metadata.llm_config)
    return app_config
# Setup the logger properly, so you can run multi-processing to parallize the evaluation
eval_output_dir = metadata['eval_output_dir']
if reset_logger:
# Set up logger
log_file = os.path.join(
eval_output_dir, 'logs', f'instance_{question_id}.log'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {question_id}.\nLOG: tail -f {log_file}'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
# Prepare instruction
instruction = encode_question(question, metadata['hub'])
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
# logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
async def process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
) -> EvalOutput:
config = get_config(metadata)
instance_id = instance['question_id']
question = instance['question']
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
max_budget_per_task=config.max_budget_per_task,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent.__class__.__name__
),
sid=question_id,
)
)
# ======= Attempt to evaluate the agent's edits =======
# If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {instance_id}.')
if state is None:
raise ValueError('State should not be None.')
# Prepare instruction
instruction = encode_question(question, instance['hub'])
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
# logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
# retrieve the last message from the agent
model_answer_raw = state.history.get_last_agent_message()
# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime = await create_runtime(config, sid=instance_id)
state: State | None = await run_controller(
config=config,
task_str=instruction,
runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
metadata.agent_class
),
)
# ======= Attempt to evaluate the agent's edits =======
# If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
# attempt to parse model_answer
_, _, ast_eval = get_data(metadata['hub'])
correct, hallucination = ast_eval(question_id, model_answer_raw)
metrics = state.metrics.get() if state.metrics else None
logger.info(
f'Final message: {model_answer_raw} | Correctness: {correct} | Hallucination: {hallucination}'
)
if state is None:
raise ValueError('State should not be None.')
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
# retrieve the last message from the agent
model_answer_raw = state.history.get_last_agent_message()
# Save the output
output = {
'question_id': question_id,
# attempt to parse model_answer
ast_eval_fn = instance['ast_eval']
correct, hallucination = ast_eval_fn(instance_id, model_answer_raw)
metrics = state.metrics.get() if state.metrics else None
logger.info(
f'Final message: {model_answer_raw} | Correctness: {correct} | Hallucination: {hallucination}'
)
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
output = EvalOutput(
instance_id=instance_id,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result={
'text': model_answer_raw,
'correct': correct,
'hallucination': hallucination,
'answer_id': 'None',
'model_id': metadata['model_name'],
'metadata': metadata.model_dump(),
'history': histories,
'metrics': metrics,
'error': state.last_error if state and state.last_error else None,
}
except Exception:
logger.error('Process instance failed')
raise
finally:
config.workspace_mount_path = old_workspace_mount_path
},
)
return output
@@ -175,188 +134,62 @@ if __name__ == '__main__':
default='hf,torch,tf',
)
args, _ = parser.parse_known_args()
if args.directory:
config.workspace_base = os.path.abspath(args.directory)
print(f'Setting workspace base to {config.workspace_base}')
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
# for details of how to set `llm_config`
llm_config = None
if args.llm_config:
specified_llm_config = get_llm_config_arg(args.llm_config)
if specified_llm_config:
config.llm = specified_llm_config
logger.info(f'Config for evaluation: {config}')
agent_class = args.agent_cls
assert (
agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
), f'Unsupported agent class: {agent_class}'
model_name = config.llm.model.split('/')[-1]
max_iterations = args.max_iterations
eval_note = ''
if args.eval_note is not None:
eval_note += '_N_' + args.eval_note
eval_output_dir = os.path.join(
args.eval_output_dir,
'gorilla',
agent_class,
model_name + '_maxiter_' + str(max_iterations) + eval_note,
)
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
parents=True, exist_ok=True
)
logger.info(f'Using evaluation output directory: {eval_output_dir}')
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
hubs = []
if 'hf' in args.hubs:
hubs.append('hf')
if 'torch' in args.hubs or 'th' in args.hubs:
hubs.append('torch')
if 'tf' in args.hubs:
hubs.append('tf')
if hubs == []:
hubs = args.hubs.split(',')
if len(hubs) == 0:
raise ValueError('Please choose at least one from hf, torch, and tf for hubs.')
dfs = []
for hub in hubs:
logger.info(f'Evaluating APIBench {hub} test')
questions, question_ids, ast_eval = get_data(hub)
df = get_data_for_hub(hub)
dfs.append(df)
dataset_df = pd.concat(dfs)
dataset_df.rename(columns={'question_id': 'instance_id'}, inplace=True)
# TEST METADATA
metadata = {
'hub': hub,
'agent_class': agent_class,
'model_name': model_name,
'max_iterations': max_iterations,
'eval_output_dir': eval_output_dir,
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
# get the commit id of current repo for reproduciblity
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
.decode('utf-8')
.strip(),
}
logger.info(f'Metadata: {metadata}')
with open(os.path.join(eval_output_dir, f'metadata_{hub}.json'), 'w') as f:
json.dump(metadata, f)
metadata = make_metadata(
llm_config=llm_config,
dataset_name=f'gorilla-{hub}',
agent_class=args.agent_cls,
max_iterations=args.max_iterations,
eval_note=args.eval_note,
eval_output_dir=args.eval_output_dir,
data_split=args.data_split,
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
# LIMIT EVALUATION
eval_n_limit = args.eval_n_limit
if eval_n_limit:
questions = questions[: (eval_n_limit // len(hubs))]
question_ids = question_ids[: (eval_n_limit // len(hubs))]
logger.info(
f'Limiting evaluation to a total of first {eval_n_limit} instances -> first {eval_n_limit//len(hubs)} instances per hub.'
)
output_file = os.path.join(eval_output_dir, f'output_{model_name}_{hub}.jsonl')
logger.info(f'Writing evaluation output to {output_file}')
finished_task_ids = set()
if os.path.exists(output_file):
with open(output_file, 'r') as f:
for line in f:
data = json.loads(line)
for i in range(len(question_ids)):
if question_ids[i] == int(data['question_id']):
finished_task_ids.add(data['question_id'])
logger.warning(
f'Output file {output_file} already exists. Loaded {len(finished_task_ids)} finished instances.'
)
output_fp = open(output_file, 'a')
logger.info(
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
dataset = prepare_dataset(
dataset_df, output_file=output_file, eval_n_limit=args.eval_n_limit
)
asyncio.run(
run_evaluation(
dataset=dataset,
metadata=metadata,
output_file=output_file,
num_workers=args.eval_num_workers,
process_instance_func=process_instance,
)
# =============================================
# filter out finished instances
new_questions = []
new_question_ids = []
for i in range(len(question_ids)):
if question_ids[i] in finished_task_ids:
logger.info(
f'Skipping instance {question_ids[i]} as it is already finished.'
)
continue
new_questions.append(questions[i])
new_question_ids.append(question_ids[i])
)
finished_task_number = len(finished_task_ids)
questions = new_questions
question_ids = new_question_ids
logger.info(
f'Finished instances: {finished_task_number}, Remaining instances: {len(question_ids)}'
)
# =============================================
pbar = tqdm(total=len(question_ids))
# This function tracks the progress AND write the output to a JSONL file
def update_progress(future, pbar, output_fp, finished_task_ids):
pbar.update(1)
output = future.result()
pbar.set_description(f'Instance {output["question_id"]}')
pbar.set_postfix_str(f'Test Result: {output["correct"]}')
logger.info(
f'Finished evaluation for instance {output["question_id"]}: {output["correct"]}'
)
output_fp.write(json.dumps(output) + '\n')
output_fp.flush()
finished_task_ids.add(output['question_id'])
# Create the agent
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
# This sets the multi-processing
num_workers = args.eval_num_workers
logger.info(f'Using {num_workers} workers for evaluation.')
try:
with ProcessPoolExecutor(num_workers) as executor:
futures = []
# This is how we perform multi-processing
for i in range(len(question_ids)):
try:
question_id = question_ids[i]
question = questions[i]
future = executor.submit(
process_instance,
agent,
question_id,
question,
metadata,
reset_logger=bool(num_workers > 1),
)
future.add_done_callback(
update_progress, pbar, output_fp, finished_task_ids
)
futures.append(future)
except Exception:
continue
# Wait for all futures to complete
for future in futures:
try:
future.result()
except Exception:
continue
except KeyboardInterrupt:
logger.info('KeyboardInterrupt received. Cleaning up...')
cleanup()
output_fp.close()
total_correct = 0
total_hallucination = 0
output = []
with open(output_file, 'r') as f:
for line in f:
data = json.loads(line)
output.append(data)
if int(data['question_id']) in finished_task_ids:
if str(data['correct']).lower() == 'true':
total_correct += 1
if str(data['hallucination']).lower() == 'true':
total_hallucination += 1
# sort all output by question_id
output = sorted(output, key=lambda x: x['question_id'])
with open(output_file, 'w') as f:
for dat in output:
f.write(json.dumps(dat) + '\n')
f.flush()
logger.info(
f'Evaluation finished for {hub}. Total: {len(question_ids)+finished_task_number}; Correct: {total_correct}; Hallucination: {total_hallucination}. Accuracy: {total_correct / (len(question_ids)+finished_task_number)}'
)
# Read the output file and calculate the accuracy
total_correct = 0
total_hallucination = 0
output = []
with open(output_file, 'r') as f:
for line in f:
data = json.loads(line)
if data['test_result']['correct']:
total_correct += 1
if data['test_result']['hallucination']:
total_hallucination += 1
output.append(data)
logger.info(
f'Evaluation finished for {hub}. Total: {len(output)}; Correct: {total_correct}; Hallucination: {total_hallucination}. Accuracy: {total_correct / len(output)}'
)

0
evaluation/gorilla/scripts/run_infer.sh Normal file → Executable file
View File

View File

@@ -1,6 +1,8 @@
import json
import os
from functools import partial
import pandas as pd
import requests
from ast_eval_hf import ast_eval_hf, ast_parse
from ast_eval_tf import ast_eval_tf
@@ -48,48 +50,59 @@ def encode_question(question, api_name):
return prompts
def get_data(hub):
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
os.makedirs(DATA_DIR, exist_ok=True)
def fetch_data(url, filename, timeout=30):
    """Return the text found at `url`, caching it on disk under `DATA_DIR`.

    The cache file name is `filename` prefixed with a short digest of `url`.
    Keying the cache on the URL matters because callers pass the same
    `filename` for different URLs (one per hub); without the digest the
    first download would be returned for every later URL.

    Args:
        url: Address to download when no cached copy exists.
        filename: Base name of the cache file inside `DATA_DIR`.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the evaluation forever.

    Returns:
        The cached or freshly downloaded body as a string.

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    import hashlib  # local import: only used to derive the cache key

    url_digest = hashlib.sha256(url.encode('utf-8')).hexdigest()[:12]
    sub_dir, base_name = os.path.split(filename)
    cache_path = os.path.join(DATA_DIR, sub_dir, f'{url_digest}_{base_name}')

    if os.path.exists(cache_path):
        # explicit encoding: the platform default may not be UTF-8
        with open(cache_path, 'r', encoding='utf-8') as f:
            return f.read()

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f'Failed to fetch data from {url}')

    with open(cache_path, 'w', encoding='utf-8') as f:
        f.write(response.text)
    return response.text
def get_data_for_hub(hub: str):
if hub == 'hf':
question_data = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/eval/eval-data/questions/huggingface/questions_huggingface_0_shot.jsonl'
api_dataset = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/api/huggingface_api.jsonl'
apibench = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/apibench/huggingface_eval.json'
ast_eval = ast_eval_hf
if hub == 'torch':
elif hub == 'torch':
question_data = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/eval/eval-data/questions/torchhub/questions_torchhub_0_shot.jsonl'
api_dataset = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/api/torchhub_api.jsonl'
apibench = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/apibench/torchhub_eval.json'
ast_eval = ast_eval_th
if hub == 'tf':
elif hub == 'tf':
question_data = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/eval/eval-data/questions/tensorflowhub/questions_tensorflowhub_0_shot.jsonl'
api_dataset = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/api/tensorflowhub_api.jsonl'
apibench = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/apibench/tensorflow_eval.json'
ast_eval = ast_eval_tf
# get questions and question_ids
question_data = fetch_data(question_data, 'question_data.jsonl')
api_dataset = fetch_data(api_dataset, 'api_dataset.jsonl')
apibench = fetch_data(apibench, 'apibench.json')
# Parse question data
questions = []
question_ids = []
question_data = requests.get(question_data)
if question_data.status_code == 200:
lines = question_data.text.splitlines()
for line in lines:
questions.append(json.loads(line)['text'])
question_ids.append(json.loads(line)['question_id'])
for line in question_data.splitlines():
data = json.loads(line)
questions.append(data['text'])
question_ids.append(data['question_id'])
# get the api datasest
api_database = []
api_dataset = requests.get(api_dataset)
if api_dataset.status_code == 200:
lines = api_dataset.text.splitlines()
for line in lines:
api_database.append(json.loads(line))
# Parse API dataset
api_database = [json.loads(line) for line in api_dataset.splitlines()]
# get the question answer pair datasest
qa_pairs = []
apibench = requests.get(apibench)
if apibench.status_code == 200:
lines = apibench.text.splitlines()
for line in lines:
qa_pairs.append(json.loads(line)['api_data'])
# Parse question-answer pairs
qa_pairs = [json.loads(line)['api_data'] for line in apibench.splitlines()]
# Parse all apis to ast trees
ast_database = []
@@ -97,4 +110,15 @@ def get_data(hub):
ast_tree = ast_parse(data['api_call'])
ast_database.append(ast_tree)
ast_eval = partial(ast_eval, api_database, qa_pairs, ast_database)
return questions, question_ids, ast_eval
return pd.DataFrame(
{
'question_id': question_ids,
'question': questions,
'api_database': [api_database] * len(questions),
'qa_pairs': [qa_pairs] * len(questions),
'ast_database': [ast_database] * len(questions),
'ast_eval': [ast_eval] * len(questions),
'hub': [hub] * len(questions),
}
)

View File

@@ -15,31 +15,9 @@ Further references:
- https://paperswithcode.com/dataset/gpqa
- https://github.com/idavidrein/gpqa
## Setup Environment and LLM Configuration
## Setup Environment
Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to setup local develop environment for OpenDevin.
## Configure OpenDevin and your LLM
Create a `config.toml` file (you can copy from `config.template.toml`) if it does not exist at the root of the workspace.
Add the following configurations:
```toml
# TODO: Change these to the model you want to evaluate
[llm.eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[llm.eval_azure_openai_compatible_model]
model = "AZURE_OPENAI_EXACT_DEPLOYMENT_MODEL_NAME"
base_url = "AZURE_OPENAI_ENDPOINT"
api_key = "AZURE_ENDPOINT_API_KEY"
temperature = 0.0
```
Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
## Run Inference on GPQA Benchmark
'gpqa_main', 'gpqa_diamond', 'gpqa_experts', 'gpqa_extended' -- data split options
@@ -55,8 +33,3 @@ like to evaluate. It could also be a release tag like `0.6.2`.
- `num_samples_eval`: Number of samples to evaluate (useful for testing and debugging).
- `data_split`: The data split to evaluate on. Must be one of `gpqa_main`, `gpqa_diamond`, `gpqa_experts`, `gpqa_extended`. Defaults to `gpqa_diamond` as done in the paper.
- `AgentClass`: The agent class to use for evaluation. Currently only `CodeActAgent` is supported.
## Benchmark Evaluation Results
- [] TODO: Finish the evaluation run across the entire benchmark and compile results

View File

@@ -17,9 +17,7 @@ TODOs:
"""
import asyncio
import logging
import os
import pathlib
import random
import re
from typing import Callable
@@ -29,22 +27,27 @@ from datasets import load_dataset
from evaluation.utils.shared import (
EvalMetadata,
codeact_user_response,
EvalOutput,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
from opendevin.core.logger import get_console_handler
from opendevin.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.events.action import Action, AgentFinishAction, MessageAction
from opendevin.core.main import create_runtime, run_controller
from opendevin.events.action import (
Action,
AgentFinishAction,
MessageAction,
)
from opendevin.events.observation import Observation
from opendevin.llm.llm import LLM
config = load_app_config()
ACTION_FORMAT = """
<<FINAL_ANSWER||
@@ -53,6 +56,27 @@ ACTION_FORMAT = """
""".strip()
def get_config(metadata: EvalMetadata) -> AppConfig:
    """Build the application config used to evaluate one GPQA instance.

    Args:
        metadata: Evaluation metadata; its ``agent_class``, ``max_iterations``
            and ``llm_config`` fields are copied into the returned config.

    Returns:
        An ``AppConfig`` that selects the ``eventstream`` runtime, an
        ``ubuntu:22.04`` sandbox image, and leaves both workspace paths unset.
    """
    sandbox_settings = SandboxConfig(
        container_image='ubuntu:22.04',
        enable_auto_lint=True,
        use_host_network=False,
    )
    app_config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_devin=False,
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=sandbox_settings,
        # Both workspace paths stay None so that no workspace is mounted.
        workspace_base=None,
        workspace_mount_path=None,
    )
    app_config.set_llm_config(metadata.llm_config)
    return app_config
def gpqa_codeact_user_response(
state: State,
encapsulate_solution: bool = False,
@@ -68,11 +92,10 @@ def gpqa_codeact_user_response(
'<execute_bash> exit </execute_bash>\n'
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP TO SOLVE THIS TASK.\n'
)
return msg
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {'CodeActAgent': codeact_user_response}
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {'CodeActAgent': gpqa_codeact_user_response}
AGENT_CLS_TO_INST_SUFFIX = {
'CodeActAgent': '\n\n SUPER IMPORTANT: When you think you have solved the question, first report it back to the user in the requested format. Only once that is done, in the next turn, please run the following command: <execute_bash> exit </execute_bash>.\n'
@@ -146,57 +169,23 @@ def convert_instance_dict(instance):
return out_instance_dict
def process_instance(
async def process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
old_workspace_mount_path = config.workspace_mount_path
old_workspace_base = config.workspace_base
try:
workspace_mount_path = os.path.join(
config.workspace_mount_path, '_eval_workspace'
)
# create process-specific workspace dir
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
config = get_config(metadata)
# reset workspace to config
config.workspace_base = workspace_mount_path
config.workspace_mount_path = workspace_mount_path
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance['instance_id'], log_dir)
else:
logger.info(f'Starting evaluation for instance {instance["instance_id"]}.')
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
# Set up logger
log_file = os.path.join(
metadata.eval_output_dir, 'logs', f'instance_{instance.instance_id}.log'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {instance.instance_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
else:
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
# ======= Run the agent on the instance =======
# Prepare instruction for the agent using suggested format in gpqa codebase
instruction = f"""
# ======= Run the agent on the instance =======
# Prepare instruction for the agent using suggested format in gpqa codebase
instruction = f"""
What is the correct answer to this question:\n
{instance['question']}\n
@@ -225,109 +214,98 @@ Again do not quit without reporting the answer first.
Ok now its time to start solving the question. Good luck!
"""
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
max_budget_per_task=config.max_budget_per_task,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent.__class__.__name__
),
sid=f'gptq_{str(instance.instance_id)}',
)
)
assert state is not None, 'State should not be None.'
runtime = await create_runtime(config, sid=f'gptq_{str(instance.instance_id)}')
# ======= Attempt to evaluate the agent's edits =======
state: State | None = await run_controller(
config=config,
task_str=instruction,
runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
metadata.agent_class
),
)
assert state is not None, 'State should not be None.'
question_choices = {
'A': instance['choices'][0],
'B': instance['choices'][1],
'C': instance['choices'][2],
'D': instance['choices'][3],
}
# get the final message from the state history (default to empty if not found)
found_answers = {
'A': False,
'B': False,
'C': False,
'D': False,
}
for event in state.history.get_events(reverse=True):
if (
isinstance(event, AgentFinishAction)
and event.source != 'user'
and '<<FINAL_ANSWER||' in event.thought
):
final_message = event.thought
break
elif (
isinstance(event, MessageAction)
and event.source != 'user'
and '<<FINAL_ANSWER||' in event.content
):
final_message = event.content
break
elif isinstance(event, Observation):
for option, option_text in question_choices.items():
if option_text in event.content:
found_answers[option] = True
else:
final_message = None
# ======= Attempt to evaluate the agent's edits =======
found_options = [option for option, found in found_answers.items() if found]
question_choices = {
'A': instance['choices'][0],
'B': instance['choices'][1],
'C': instance['choices'][2],
'D': instance['choices'][3],
}
# get the final message from the state history (default to empty if not found)
found_answers = {
'A': False,
'B': False,
'C': False,
'D': False,
}
for event in state.history.get_events(reverse=True):
if (
isinstance(event, AgentFinishAction)
and event.source != 'user'
and '<<FINAL_ANSWER||' in event.thought
):
final_message = event.thought
break
elif (
isinstance(event, MessageAction)
and event.source != 'user'
and '<<FINAL_ANSWER||' in event.content
):
final_message = event.content
break
elif isinstance(event, Observation):
for option, option_text in question_choices.items():
if option_text in event.content:
found_answers[option] = True
else:
final_message = None
found_options = [option for option, found in found_answers.items() if found]
logger.info('#############################################')
logger.info(f'Final message generated by the agent: {final_message}')
logger.info('#############################################')
# check if the model output matches the ground truth
test_result = compare_answers(final_message, instance.correct_solution)
if final_message is None and len(found_options) > 0:
_selected = random.choice(found_options)
# if the final message is None, then the agent did not report the answer in the correct format
# so we randomly select one of the found options and compare it with the correct solution
test_result = _selected == instance.correct_solution
logger.info('#############################################')
logger.info(f'Final message generated by the agent: {final_message}')
logger.info('Agent did not report the answer in the correct format.')
logger.info(f'Found options: {found_options}')
logger.info(f'Selected option: {_selected}')
logger.info('#############################################')
# check if the model output matches the ground truth
test_result = compare_answers(final_message, instance.correct_solution)
if final_message is None and len(found_options) > 0:
_selected = random.choice(found_options)
# if the final message is None, then the agent did not report the answer in the correct format
# so we randomly select one of the found options and compare it with the correct solution
test_result = _selected == instance.correct_solution
logger.info('#############################################')
logger.info('Agent did not report the answer in the correct format.')
logger.info(f'Found options: {found_options}')
logger.info(f'Selected option: {_selected}')
logger.info('#############################################')
logger.info('#############################################')
logger.info(f'Test result: {test_result}')
logger.info('#############################################')
logger.info('#############################################')
logger.info(f'Test result: {test_result}')
logger.info('#############################################')
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
metrics = state.metrics.get() if state.metrics else None
metrics = state.metrics.get() if state.metrics else None
# Save the output
output = {
'task_id': instance.task_id,
'instance_id': instance.instance_id,
'instruction': instruction,
'metadata': metadata.model_dump(),
'history': state.history.compatibility_for_eval_history_pairs(),
'metrics': metrics,
'error': state.last_error if state and state.last_error else None,
'test_result': {
'result': test_result,
'found_answers': found_answers,
'last_message': final_message,
},
}
except Exception:
logger.error('Process instance failed')
raise
finally:
config.workspace_mount_path = old_workspace_mount_path
config.workspace_base = old_workspace_base
# Save the output
output = EvalOutput(
instance_id=str(instance.instance_id),
instruction=instruction,
metadata=metadata,
history=state.history.compatibility_for_eval_history_pairs(),
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result={
'result': test_result,
'found_answers': found_answers,
'last_message': final_message,
},
)
return output
@@ -343,8 +321,11 @@ if __name__ == '__main__':
)
args, _ = parser.parse_known_args()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
# so we don't need to manage file uploading to OpenDevin's repo
@@ -355,8 +336,6 @@ if __name__ == '__main__':
gpqa_dataset = gpqa_dataset.to_pandas()
# Add a new column 'instance_id' with the index
gpqa_dataset['instance_id'] = gpqa_dataset.index
gpqa_dataset['task_id'] = gpqa_dataset.index
# gpqa_dataset = dataset['train'].to_pandas().sort_values(by='id').reset_index(drop=True)
if args.agent_cls != 'CodeActAgent':
raise ValueError(
@@ -374,15 +353,14 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
prepared_dataset = prepare_dataset(
gpqa_dataset, output_file, args.eval_n_limit, 'task_id'
)
prepared_dataset = prepare_dataset(gpqa_dataset, output_file, args.eval_n_limit)
run_evaluation(
dataset=prepared_dataset,
metadata=metadata,
output_file=output_file,
num_workers=args.eval_num_workers,
process_instance_func=process_instance,
id_column='task_id',
asyncio.run(
run_evaluation(
dataset=prepared_dataset,
metadata=metadata,
output_file=output_file,
num_workers=args.eval_num_workers,
process_instance_func=process_instance,
)
)

View File

@@ -1,39 +1,10 @@
# HumanEvalFix Evaluation with OpenDevin
Implements evaluation of agents on HumanEvalFix from the HumanEvalPack benchmark introduced in [OctoPack: Instruction Tuning Code Large Language Models](https://arxiv.org/abs/2308.07124). Please see [here](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py) for the reference implementation used in the paper.
Implements evaluation of agents on HumanEvalFix from the HumanEvalPack benchmark introduced in [OctoPack: Instruction Tuning Code Large Language Models](https://arxiv.org/abs/2308.07124). Please see [here](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py) for the reference implementation used in the paper. Currently only `python` evaluation is supported.
## Setup Environment
## Setup Environment and LLM Configuration
Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to setup local develop environment for OpenDevin.
## Configure OpenDevin and your LLM
Create a `config.toml` file if it does not exist at the root of the workspace.
Add the following configurations:
```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
ssh_hostname = "localhost"
[sandbox]
enable_auto_lint = true
# TODO: Change these to the model you want to evaluate
[llm.eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[llm.eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```
Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
## Run Inference on HumanEvalFix

View File

@@ -9,9 +9,9 @@ TODOs:
"""
import asyncio
import logging
import os
import pathlib
import tempfile
from typing import Any
import pandas as pd
from datasets import load_dataset
@@ -19,20 +19,25 @@ from evaluate import load
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
codeact_user_response,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM
config = load_app_config()
from opendevin.core.main import create_runtime, run_controller
from opendevin.events.action import CmdRunAction
from opendevin.events.observation import CmdOutputObservation
from opendevin.runtime.runtime import Runtime
IMPORT_HELPER = {
'python': [
@@ -72,19 +77,105 @@ AGENT_CLS_TO_INST_SUFFIX = {
}
def get_test_result(instance, path, language='python', timeout=10):
# Evaluation reference: https://github.com/bigcode-project/bigcode-evaluation-harness/blob/84b96da31b7f840b55c5733325346176140cdb6b/bigcode_eval/tasks/humanevalpack.py#L347
def get_config(metadata: EvalMetadata) -> AppConfig:
    """Build the application config used to evaluate one HumanEvalFix instance.

    Args:
        metadata: Evaluation metadata; its ``agent_class``, ``max_iterations``
            and ``llm_config`` fields are copied into the returned config.

    Returns:
        An ``AppConfig`` that selects the ``eventstream`` runtime, an
        ``ubuntu:22.04`` sandbox image, and leaves both workspace paths unset.
    """
    sandbox_settings = SandboxConfig(
        container_image='ubuntu:22.04',
        enable_auto_lint=True,
        use_host_network=False,
    )
    app_config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_devin=False,
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=sandbox_settings,
        # Both workspace paths stay None so that no workspace is mounted.
        workspace_base=None,
        workspace_mount_path=None,
    )
    app_config.set_llm_config(metadata.llm_config)
    return app_config
def _get_instance_id(instance: pd.Series) -> str:
return instance.task_id.replace('/', '__')
async def initialize_runtime(
runtime: Runtime,
instance: pd.Series, # read below: declaration, buggy_solution, test, and the id used for the file name
):
"""Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent.
It creates /workspace in the sandbox, writes the instance's declaration,
buggy solution and tests to a temporary file on the host, copies that file
into /workspace, and asserts that the mkdir, cd and ls commands each exit
with code 0.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
# Annotation only; obs is assigned by each run_action call below.
obs: CmdOutputObservation
# Make sure the working directory exists inside the sandbox.
action = CmdRunAction(command='mkdir -p /workspace')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = await runtime.run_action(action)
assert obs.exit_code == 0
# Change into /workspace (presumably so later commands run there; confirm that
# the runtime keeps shell state between actions).
action = CmdRunAction(command='cd /workspace')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = await runtime.run_action(action)
assert obs.exit_code == 0
# File contents: function declaration, then the buggy body, then the tests.
problem_statement = (
instance.declaration + instance.buggy_solution + '\n' + instance.test
)
filename = f'{_get_instance_id(instance)}.py'
# Stage the file in a host temp directory and copy it into the sandbox.
# NOTE(review): indentation is lost in this view; copy_to presumably runs while
# the temporary directory still exists -- confirm against the real file.
with tempfile.TemporaryDirectory() as tmpdir:
host_script_path = os.path.join(tmpdir, filename)
with open(host_script_path, 'w') as f:
f.write(problem_statement)
await runtime.copy_to(
host_script_path,
'/workspace',
)
# check file exists
action = CmdRunAction(command=f'ls /workspace/{_get_instance_id(instance)}.py')
obs = await runtime.run_action(action)
assert obs.exit_code == 0
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
async def complete_runtime(
runtime: Runtime,
instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name
) -> dict[str, Any]:
"""Complete the runtime for the agent.
This function is called before the runtime is used to run the agent.
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
obs: CmdOutputObservation
# default value
language = 'python'
timeout = 10
test_result = {'result': {}, 'metadata': {}}
code_metric = load('Muennighoff/code_eval_octopack')
timeout = LANGUAGE_TO_TIMEOUT[language]
num_workers = LANGUAGE_TO_NUM_WORKERS[language]
python_imports = '\n'.join(IMPORT_HELPER[language])
# Load function from path
with open(path, 'r') as f:
function = f.read()
action = CmdRunAction(
command=f'cat /workspace/{_get_instance_id(instance)}.py', keep_prompt=False
)
obs = await runtime.run_action(action)
assert obs.exit_code == 0
function = [[python_imports + '\n' + function.strip()]]
function = obs.content.replace('\r\n', '\n')
logger.info(f'Function: {function}')
function = [[python_imports + '\n' + function]]
results, logs = code_metric.compute(
references=[instance.test],
@@ -99,129 +190,79 @@ def get_test_result(instance, path, language='python', timeout=10):
'timeout': timeout,
'num_workers': num_workers,
}
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
return test_result
def process_instance(
async def process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
old_workspace_mount_path = config.workspace_mount_path
old_workspace_base = config.workspace_base
) -> EvalOutput:
config = get_config(metadata)
# use a session id for concurrent evaluation
sid = _get_instance_id(instance)
try:
workspace_mount_path = os.path.join(
config.workspace_mount_path, '_eval_workspace'
)
# create process-specific workspace dir
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance.task_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {instance.task_id}.')
# reset workspace to config
config.workspace_base = workspace_mount_path
config.workspace_mount_path = workspace_mount_path
# Create file with HumanEvalFix problem
# Prompt reference: https://github.com/bigcode-project/bigcode-evaluation-harness/blob/84b96da31b7f840b55c5733325346176140cdb6b/bigcode_eval/tasks/humanevalpack.py#L509
problem_statement = (
instance.declaration + instance.buggy_solution + '\n' + instance.test
)
# use a session id for concurrent evaluation
sid = instance.task_id.replace('/', '__')
# Prepare instruction
instruction = (
f'Please fix the function in {sid}.py such that all test cases pass.\n'
'Environment has been set up for you to start working. You may assume all necessary tools are installed.\n\n'
'# Problem Statement\n'
f'{problem_statement}\n\n'
)
instruction += (
'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
'You should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\n'
'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
)
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
# Set up logger
log_file = os.path.join(
metadata.eval_output_dir,
'logs',
f'instance_{sid}.log',
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {instance.task_id}.\nLOG: tail -f {log_file}'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime = await create_runtime(config, sid=sid)
await initialize_runtime(runtime, instance)
state: State | None = await run_controller(
config=config,
task_str=instruction,
runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
metadata.agent_class
),
)
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
if state is None:
raise ValueError('State should not be None.')
metrics = state.metrics.get() if state.metrics else None
test_result = await complete_runtime(runtime, instance)
# Create file with HumanEvalFix problem
# Prompt reference: https://github.com/bigcode-project/bigcode-evaluation-harness/blob/84b96da31b7f840b55c5733325346176140cdb6b/bigcode_eval/tasks/humanevalpack.py#L509
problem_statement = (
instance.declaration + instance.buggy_solution + '\n' + instance.test
)
path = os.path.join(workspace_mount_path, f'{sid}.py')
with open(path, 'w') as f:
f.write(problem_statement)
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
# Prepare instruction
instruction = (
f'Please fix the function in {instance.task_id.replace("/", "__")}.py such that all test cases pass.\n'
'Environment has been set up for you to start working. You may assume all necessary tools are installed.\n\n'
'# Problem Statement\n'
f'{problem_statement}\n\n'
)
instruction += (
'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
'You should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\n'
'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
)
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
max_budget_per_task=config.max_budget_per_task,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent.__class__.__name__
),
sid=sid,
)
)
# ======= Attempt to evaluate the agent's edits =======
test_result = get_test_result(instance, path)
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
metrics = state.metrics.get() if state.metrics else None
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
# Save the output
output = {
'task_id': instance.task_id,
'instruction': instruction,
'metadata': metadata.model_dump(),
'history': histories,
'metrics': metrics,
'error': state.last_error if state and state.last_error else None,
'test_result': test_result,
}
except Exception:
logger.error('Process instance failed')
raise
finally:
config.workspace_mount_path = old_workspace_mount_path
config.workspace_base = old_workspace_base
# Save the output
output = EvalOutput(
instance_id=instance.task_id,
instruction=instruction,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result=test_result,
)
return output
@@ -234,28 +275,31 @@ if __name__ == '__main__':
'bigcode/humanevalpack', 'python'
) # TODO: Support other languages
hefix_tests = dataset['test'].to_pandas()
hefix_tests.rename(columns={'task_id': 'instance_id'}, inplace=True)
id_column = 'task_id'
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
metadata = make_metadata(
llm_config,
args.dataset_name,
'humanevalfix-python',
args.agent_cls,
args.max_iterations,
args.eval_note,
args.eval_output_dir,
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
instances = prepare_dataset(hefix_tests, output_file, args.eval_n_limit)
run_evaluation(
instances,
metadata,
output_file,
args.eval_num_workers,
process_instance,
id_column,
asyncio.run(
run_evaluation(
instances,
metadata,
output_file,
args.eval_num_workers,
process_instance,
)
)

0
evaluation/humanevalfix/scripts/run_infer.sh Normal file → Executable file
View File

View File

@@ -0,0 +1,7 @@
FROM ubuntu:22.04
RUN apt-get update && apt-get install -y python3 python3-pip
RUN pip install scitools-pyke
# docker build -t xingyaoww/od_logic_reasoning .

View File

@@ -2,38 +2,13 @@
This folder contains evaluation harness for evaluating agents on the logic reasoning benchmark [ProntoQA](https://github.com/asaparov/prontoqa) and [ProofWriter](https://allenai.org/data/proofwriter).
## Configure OpenDevin and your LLM
## Setup Environment and LLM Configuration
Create a `config.toml` file if it does not exist at the root of the workspace.
Add the following configurations:
```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
ssh_hostname = "localhost"
[sandbox]
enable_auto_lint = true
# TODO: Change these to the model you want to evaluate
[llm.eval_gpt4_1106_preview_llm]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[llm.eval_some_openai_compatible_model_llm]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```
Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
## Run Inference on logic_reasoning
The following code will run inference on the first example of the ProntoQA dataset,
using OpenDevin 0.6.2 version.
The following code will run inference on the first example of the ProofWriter dataset,
```bash
./evaluation/logic_reasoning/scripts/run_infer.sh ProntoQA eval_gpt4_1106_preview_llm 0.6.2 1
./evaluation/logic_reasoning/scripts/run_infer.sh eval_gpt4_1106_preview_llm ProofWriter
```

View File

@@ -3,12 +3,12 @@ you can interact with an interactive Python (Jupyter Notebook) environment and r
In this task, you need to use the code in [[logic_inference_path.py]] to help you. Specifically, you first need to instantiate a **LogicInferenceEngine** class and use the **safe_execute_program** method to prove the **logic programs**. You should receive *answer*, *flag*, *error_message* from the output.
An example would be look like this:
<execute_ipython>
import sys
sys.path.append(workspace_mount_path)
engine = LogicInferenceEngine(dataset_name, workspace_mount_path)
answer, flag, error_message = engine.safe_execute_program(logic_programs)
</execute_ipython>
<execute_ipython>
import sys
sys.path.append('/workspace')
engine = LogicInferenceEngine()
answer, flag, error_message = engine.safe_execute_program(logic_programs)
</execute_ipython>
Please send the *answer* variable through message.

View File

@@ -191,9 +191,9 @@ class PykeProgram:
class LogicInferenceEngine:
def __init__(self, dataset_name, workspace_mount_path):
self.dataset_name = dataset_name
self.workspace_mount_path = workspace_mount_path
def __init__(self):
self.dataset_name = os.environ.get('DATASET_NAME', 'ProofWriter')
self.workspace_mount_path = '/workspace'
def random_backup(self):
if self.dataset_name == 'ProntoQA':

View File

@@ -1,29 +1,35 @@
import asyncio
import logging
import os
import pathlib
import shutil
import pandas as pd
from datasets import load_dataset
from evaluation.swe_bench.swe_env_box import DockerSSHBox
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
codeact_user_response,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
from opendevin.core.logger import get_console_handler
from opendevin.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM
config = load_app_config()
from opendevin.core.main import create_runtime, run_controller
from opendevin.events.action import (
AgentFinishAction,
CmdRunAction,
IPythonRunCellAction,
MessageAction,
)
from opendevin.events.observation import CmdOutputObservation
from opendevin.runtime.runtime import Runtime
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
@@ -34,6 +40,28 @@ AGENT_CLS_TO_INST_SUFFIX = {
}
# Build the AppConfig for a logic_reasoning evaluation run.
#
# The agent class, iteration limit and LLM config are taken from `metadata`;
# everything else (runtime kind, sandbox image, lint/network flags) is fixed here.
# Returns the populated AppConfig.
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
config = AppConfig(
default_agent=metadata.agent_class,
run_as_devin=False,
runtime='eventstream',
max_iterations=metadata.max_iterations,
sandbox=SandboxConfig(
container_image='xingyaoww/od-eval-logic-reasoning:v1.0',
enable_auto_lint=True,
use_host_network=False,
# Extra dependency command for the runtime; installs scitools-pyke with the
# interpreter referenced by $OD_INTERPRETER_PATH.
# NOTE(review): presumably executed while the runtime is prepared - confirm.
od_runtime_extra_deps='$OD_INTERPRETER_PATH -m pip install scitools-pyke',
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
# Attach the LLM settings selected for this evaluation.
config.set_llm_config(metadata.llm_config)
return config
def get_choice(answer_str):
choices = [
'A',
@@ -83,7 +111,7 @@ def get_test_result(
'the correct answer is',
'The correct answer is',
'The correct option is',
'Thus, the answer is',
'the answer is',
]
if prediction is None:
for indicator in indicators:
@@ -97,162 +125,143 @@ def get_test_result(
return test_result
def process_instance(
CUR_EVAL_DIR = os.path.dirname(__file__)
# Prepare the sandbox for one logic_reasoning instance: create /workspace, copy
# logic_inference.py into it, export DATASET_NAME, create the .cache_program
# directory and install scitools-pyke through an IPython cell.
# Every shell step asserts a zero exit code, so a failing step raises AssertionError.
async def initialize_runtime(
runtime: Runtime,
instance: pd.Series, # this argument is not required
):
"""Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
obs: CmdOutputObservation
# Make sure the working directory exists, then switch into it
action = CmdRunAction(command='mkdir -p /workspace')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = await runtime.run_action(action)
assert obs.exit_code == 0
action = CmdRunAction(command='cd /workspace')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = await runtime.run_action(action)
assert obs.exit_code == 0
# copy logic_inference.py to /workspace
await runtime.copy_to(
os.path.join(CUR_EVAL_DIR, 'logic_inference.py'), '/workspace'
)
# check if the file exists
obs = await runtime.run_action(CmdRunAction(command='ls /workspace'))
assert obs.exit_code == 0
assert 'logic_inference.py' in obs.content
# Expose the dataset name to code running inside the sandbox.
# NOTE(review): `metadata` is neither a parameter nor a local of this function, so
# this line resolves it as a module-level name - confirm it is defined before this
# coroutine runs (including in worker processes).
await runtime.add_env_vars({'DATASET_NAME': metadata.dataset})
# Create the cache directory under /workspace
action = CmdRunAction(command='mkdir -p /workspace/.cache_program')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = await runtime.run_action(action)
assert obs.exit_code == 0
# Install scitools-pyke from an IPython cell; the result is logged but not checked.
# NOTE(review): get_config's od_runtime_extra_deps also pip-installs scitools-pyke -
# confirm whether both installs are needed.
action = IPythonRunCellAction(code='%pip install scitools-pyke')
logger.info(action, extra={'msg_type': 'ACTION'})
ipynb_obs = await runtime.run_action(action)
logger.info(ipynb_obs, extra={'msg_type': 'OBSERVATION'})
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
# Prepare instruction
with open(os.path.join(CUR_EVAL_DIR, 'instruction.txt'), 'r') as f:
INSTRUCTION_TEMPLATE = f.read()
async def process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
old_workspace_mount_path = config.workspace_mount_path
old_workspace_base = config.workspace_base
config = get_config(metadata)
try:
workspace_mount_path = os.path.join(
config.workspace_mount_path, '_eval_workspace'
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance['instance_id'], log_dir)
else:
logger.info(f'Starting evaluation for instance {instance["instance_id"]}.')
instance_logic_programs = instance['raw_logic_programs'][0].strip()
instruction = (
INSTRUCTION_TEMPLATE.replace('[[dataset_name]]', dataset_name)
.replace('[[logic_programs]]', instance_logic_programs)
.replace('[[logic_inference_path.py]]', '/workspace/logic_inference.py')
)
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
# use a session id for concurrent evaluation
sid = instance['instance_id']
runtime = await create_runtime(config, sid=sid)
await initialize_runtime(runtime, instance)
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_controller(
config=config,
task_str=instruction,
runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
metadata.agent_class
),
)
# create process-specific workspace dir
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
)
# ======= Attempt to evaluate the agent's edits =======
# If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
# reset workspace to config
config.workspace_base = workspace_mount_path
config.workspace_mount_path = workspace_mount_path
if state is None:
raise ValueError('State should not be None.')
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
# Set up logger
log_file = os.path.join(
metadata.eval_output_dir, 'logs', f'instance_{instance["id"]}.log'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {instance["id"]}.\nLOG: tail -f {log_file}'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
final_message = ''
for event in state.history.get_events(reverse=True):
if isinstance(event, AgentFinishAction):
final_message = event.thought
break
elif isinstance(event, MessageAction):
final_message = event.content
break
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
final_message = final_message.strip("'")
logger.info(
f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}'
)
# sandbox = DockerSSHBox()
logic_inference_path = os.path.join(workspace_mount_path, 'logic_inference.py')
if not os.path.exists(logic_inference_path):
shutil.copyfile(
'./evaluation/logic_reasoning/logic_inference.py', logic_inference_path
)
logger.info(f'logic_inference.py copied to {workspace_mount_path}')
test_result = get_test_result(
model_answer=final_message, ground_truth=instance['answer']
)
test_result['final_message'] = final_message
cache_dir = os.path.join(workspace_mount_path, '.cache_program')
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
# Prepare instruction
with open('./evaluation/logic_reasoning/instruction.txt', 'r') as f:
instruction = f.read()
instance_logic_programs = instance['raw_logic_programs'][0].strip()
instruction = instruction.replace('[[dataset_name]]', dataset_name)
instruction = instruction.replace('[[logic_programs]]', instance_logic_programs)
instruction = instruction.replace(
'[[logic_inference_path.py]]', logic_inference_path
)
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
# use a session id for concurrent evaluation
sid = instance['id'] + '_' + str(os.getpid())
sandbox = DockerSSHBox(
config=config.sandbox,
persist_sandbox=False,
workspace_mount_path=config.workspace_mount_path,
sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
cache_dir=config.cache_dir,
run_as_devin=config.run_as_devin,
sid=sid,
)
exit_code, command_output = sandbox.execute('pip install scitools-pyke')
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
max_budget_per_task=config.max_budget_per_task,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent.__class__.__name__
),
sandbox=sandbox,
sid=sid,
)
)
# ======= Attempt to evaluate the agent's edits =======
# If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
final_message = ''
messages = []
for event in state.history.get_events(reverse=True):
# will this be a MessageAction?
# TODO we can filter for types of events if we know what to expect
messages.append(event.content)
if str(event.content) in ["'A'", "'B'", "'C'"]:
final_message = event.content
break
final_message = final_message.strip("'")
logger.info(
f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}'
)
test_result = get_test_result(
model_answer=final_message, ground_truth=instance['answer']
)
metrics = state.metrics.get() if state.metrics else None
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
# Save the output
output = {
'id': instance['id'],
'instance': instance,
'instruction': instruction,
# 'metadata': metadata.model_dump(),
'history': histories,
'metrics': metrics,
'final_message': final_message,
'messages': messages,
'error': state.last_error if state and state.last_error else None,
'test_result': test_result,
}
except Exception:
logger.error('Process instance failed')
raise
finally:
config.workspace_mount_path = old_workspace_mount_path
config.workspace_base = old_workspace_base
# Close the sandbox
sandbox.close()
metrics = state.metrics.get() if state.metrics else None
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
# Save the output
output = EvalOutput(
instance_id=instance['instance_id'],
instruction=instruction,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result=test_result,
)
return output
@@ -262,7 +271,7 @@ if __name__ == '__main__':
'--dataset',
type=str,
help='the logic reasoning dataset to evaluate on {ProntoQA, ProofWriter}',
default='ProntoQA',
default='ProofWriter',
)
parser.add_argument(
'--data_split',
@@ -270,36 +279,32 @@ if __name__ == '__main__':
help='data split to evaluate on {validation}', # right now we only support validation split
default='validation',
)
args, _ = parser.parse_known_args()
if args.directory:
config.workspace_base = os.path.abspath(args.directory)
print(f'Setting workspace base to {config.workspace_base}')
dataset_name = args.dataset
data_split = args.data_split
dataset = load_dataset(f'renma/{dataset_name}')
logic_reasoning_tests = dataset[data_split]
dataset_df = dataset[data_split].to_pandas()
dataset_df.rename(columns={'id': 'instance_id'}, inplace=True)
id_column = 'id'
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
metadata = make_metadata(
llm_config,
args.dataset_name,
dataset_name,
args.agent_cls,
args.max_iterations,
args.eval_note,
args.eval_output_dir,
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
run_evaluation(
instances,
metadata,
output_file,
args.eval_num_workers,
process_instance,
id_column,
instances = prepare_dataset(dataset_df, output_file, args.eval_n_limit)
asyncio.run(
run_evaluation(
instances, metadata, output_file, args.eval_num_workers, process_instance
)
)

9
evaluation/logic_reasoning/scripts/run_infer.sh Normal file → Executable file
View File

@@ -3,8 +3,8 @@ set -eo pipefail
source "evaluation/utils/version_control.sh"
DATASET=$1
MODEL_CONFIG=$2
MODEL_CONFIG=$1
DATASET=$2
COMMIT_HASH=$3
EVAL_LIMIT=$4
AGENT=$5
@@ -23,6 +23,11 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
if [ -z "$DATASET" ]; then
echo "Dataset not specified, use default ProofWriter"
DATASET="ProofWriter"
fi
get_agent_version
echo "AGENT: $AGENT"

View File

@@ -0,0 +1,10 @@
# Evaluation image for the MiniWoB++ benchmark: Ubuntu 22.04 with Python, pip and git,
# plus a checkout of miniwob-plusplus served from the local filesystem.
FROM ubuntu:22.04
RUN apt-get update && apt-get install -y python3 python3-pip git
# Clone miniwob-plusplus and pin it to a fixed commit.
RUN git clone https://github.com/Farama-Foundation/miniwob-plusplus.git /miniwob-plusplus && \
git -C "/miniwob-plusplus" reset --hard 7fd85d71a4b60325c6585396ec4f48377d049838
# file:// URL pointing at the MiniWoB HTML pages inside the checkout above.
ENV MINIWOB_URL="file:///miniwob-plusplus/miniwob/html/miniwob/"
# Build with (tag matches the container_image used by evaluation/miniwob/run_infer.py):
# docker build -t xingyaoww/od-eval-miniwob:v1.0 .

View File

@@ -2,52 +2,9 @@
This folder contains evaluation for [MiniWoB++](https://miniwob.farama.org/) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym) for easy evaluation of how well an agent capable of browsing can perform on synthetic web browsing tasks.
## Setup OpenDevin Environment
## Setup Environment and LLM Configuration
Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to setup local develop environment for OpenDevin.
## Configure OpenDevin and your LLM
Create a `config.toml` file if it does not exist at the root of the workspace.
Add the following configurations:
```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
ssh_hostname = "localhost"
[sandbox]
box_type = "ssh"
timeout = 120
# TODO: Change these to the model you want to evaluate
[llm.eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[llm.eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```
## Setup MiniWoB++ Environment and Environment Variables of MiniWoB++
MiniWoB++ requires you to set up websites containing a static website that is accessible via URL to the machine running the OpenDevin agents.
- Clone miniwob (use a specific frozen commit for reproducibility)
```sh
git clone git@github.com:Farama-Foundation/miniwob-plusplus.git
git -C "./miniwob-plusplus" reset --hard 7fd85d71a4b60325c6585396ec4f48377d049838
```
- Setup Miniwob URL (change `PATH_TO_MINIWOB_CLONED_REPO` here to the absolute path to your `miniwob-plusplus` folder) in `evaluation/miniwob/scripts/run_infer.sh`
```sh
export MINIWOB_URL="file://<PATH_TO_MINIWOB_CLONED_REPO>/miniwob/html/miniwob/"
```
Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
## Test if your environment works
@@ -56,7 +13,7 @@ Access with browser the above MiniWoB URLs and see if they load correctly.
## Run Evaluation
```sh
bash evaluation/miniwob/scripts/run_infer.sh
./evaluation/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval
```
Results will be in `evaluation/evaluation_outputs/outputs/miniwob/`

View File

@@ -1,7 +1,7 @@
import asyncio
import json
import logging
import os
from typing import Any
import browsergym.miniwob # noqa F401 register miniwob tasks as gym environments
import gymnasium as gym
@@ -9,91 +9,131 @@ import pandas as pd
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM
from opendevin.runtime.docker.ssh_box import DockerSSHBox
from opendevin.runtime.tools import RuntimeTool
config = load_app_config()
from opendevin.core.main import create_runtime, run_controller
from opendevin.events.action import (
BrowseInteractiveAction,
CmdRunAction,
MessageAction,
)
from opendevin.events.observation import CmdOutputObservation
from opendevin.runtime.browser.browser_env import (
BROWSER_EVAL_GET_GOAL_ACTION,
BROWSER_EVAL_GET_REWARDS_ACTION,
)
from opendevin.runtime.runtime import Runtime
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
docker_ssh_box: DockerSSHBox | None = None
# Build the AppConfig for evaluating a single MiniWoB++ environment.
#
# metadata: supplies the agent class, iteration limit and LLM config.
# env_id:   passed to the sandbox as `browsergym_eval_env`.
# Returns the populated AppConfig.
def get_config(
metadata: EvalMetadata,
env_id: str,
) -> AppConfig:
config = AppConfig(
default_agent=metadata.agent_class,
run_as_devin=False,
runtime='eventstream',
max_iterations=metadata.max_iterations,
sandbox=SandboxConfig(
container_image='xingyaoww/od-eval-miniwob:v1.0',
enable_auto_lint=True,
use_host_network=False,
# Environment id the sandbox's browser evaluation should use.
browsergym_eval_env=env_id,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
# Attach the LLM settings selected for this evaluation.
config.set_llm_config(metadata.llm_config)
return config
def get_sandbox():
global docker_ssh_box
if docker_ssh_box is None:
docker_ssh_box = DockerSSHBox(
config=config.sandbox,
persist_sandbox=False,
workspace_mount_path=config.workspace_mount_path,
sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
cache_dir=config.cache_dir,
run_as_devin=config.run_as_devin,
)
return docker_ssh_box
async def initialize_runtime(
runtime: Runtime,
) -> str:
"""Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent.
Creates /workspace, then runs BROWSER_EVAL_GET_GOAL_ACTION in the browser and
returns the content of the resulting observation (the task goal).
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
obs: CmdOutputObservation
# Make sure the working directory exists
action = CmdRunAction(command='mkdir -p /workspace')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = await runtime.run_action(action)
assert obs.exit_code == 0
# Ask the browser environment for the goal of the current task.
# NOTE(review): `obs` is annotated as CmdOutputObservation above but is reused here
# for the result of a browser action - confirm the actual observation type.
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = await runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
goal = obs.content
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
return goal
def process_instance(
async def complete_runtime(
runtime: Runtime,
) -> dict[str, Any]:
"""Complete the runtime for the agent.
This function is called after the agent has finished running.
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
Runs BROWSER_EVAL_GET_REWARDS_ACTION in the browser and returns a dict whose
'rewards' entry is the JSON-decoded content of the resulting observation.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
obs: CmdOutputObservation
# Ask the browser environment for the rewards collected during the run.
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = await runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
# obs.content is parsed as JSON; malformed content raises from json.loads.
return {
'rewards': json.loads(obs.content),
}
async def process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
):
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
) -> EvalOutput:
env_id = instance.id
config = get_config(metadata, env_id)
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
# Set up logger
log_file = os.path.join(
metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, env_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {env_id}.')
# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime_tools_config = {
RuntimeTool.BROWSER: {
'browsergym_eval': env_id,
'browsergym_eval_save_dir': metadata.eval_output_dir,
}
}
runtime = await create_runtime(config, sid=env_id)
task_str = await initialize_runtime(runtime)
state: State | None = asyncio.run(
run_agent_controller(
agent,
'PLACEHOLDER_GOAL',
max_iterations=metadata.max_iterations,
max_budget_per_task=config.max_budget_per_task,
runtime_tools_config=runtime_tools_config,
sandbox=get_sandbox(),
sid=env_id,
run_controller(
config=config,
task_str=task_str, # take output from initialize_runtime
runtime=runtime,
)
)
@@ -106,18 +146,17 @@ def process_instance(
raise ValueError('State should not be None.')
metrics = state.metrics.get() if state.metrics else None
browsergym_eval_dir = os.path.join(metadata.eval_output_dir, env_id.split('/')[1])
# read goal
with open(
os.path.join(browsergym_eval_dir, 'goal.txt'), 'r', encoding='utf-8'
) as f:
instruction = f.read()
# read reward
with open(
os.path.join(browsergym_eval_dir, 'rewards.json'), 'r', encoding='utf-8'
) as f:
rewards = json.load(f)
reward = max(rewards)
# Instruction is the first message from the USER
instruction = ''
for event in state.history.get_events():
if isinstance(event, MessageAction):
instruction = event.content
break
return_val = await complete_runtime(runtime)
logger.info(f'Return value from complete_runtime: {return_val}')
reward = max(return_val['rewards'])
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
@@ -125,16 +164,17 @@ def process_instance(
histories = state.history.compatibility_for_eval_history_pairs()
# Save the output
output = {
'instance_id': env_id,
'instruction': instruction,
'metadata': metadata.model_dump(),
'history': histories,
'metrics': metrics,
'error': state.last_error if state and state.last_error else None,
'test_result': reward,
}
output = EvalOutput(
instance_id=env_id,
instruction=instruction,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result={
'reward': reward,
},
)
return output
@@ -143,7 +183,7 @@ if __name__ == '__main__':
dataset = pd.DataFrame(
{
'id': [
'instance_id': [
id
for id in gym.envs.registry.keys()
if id.startswith('browsergym/miniwob')
@@ -151,26 +191,25 @@ if __name__ == '__main__':
}
)
id_column = 'id'
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
metadata = make_metadata(
llm_config,
args.dataset_name,
'miniwob',
args.agent_cls,
args.max_iterations,
args.eval_note,
args.eval_output_dir,
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
_ = get_sandbox() # Initialize the sandbox
run_evaluation(
instances,
metadata,
output_file,
args.eval_num_workers,
process_instance,
id_column,
instances = prepare_dataset(dataset, output_file, args.eval_n_limit)
asyncio.run(
run_evaluation(
instances, metadata, output_file, args.eval_num_workers, process_instance
)
)

6
evaluation/miniwob/scripts/run_infer.sh Normal file → Executable file
View File

@@ -3,14 +3,10 @@ set -eo pipefail
source "evaluation/utils/version_control.sh"
# configure miniwob website, change URL to yours
export MINIWOB_URL="file:///home/fangzhex/miniwob-plusplus/miniwob/html/miniwob/"
# configure browsing agent
export USE_NAV="false"
export USE_CONCISE_ANSWER="true"
MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT=$3
@@ -42,7 +38,7 @@ COMMAND="poetry run python evaluation/miniwob/run_infer.py \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-num-workers $NUM_WORKERS"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"

View File

@@ -0,0 +1,10 @@
# Evaluation image for the MINT benchmark: Ubuntu 22.04 with Python, pip, git and gcc,
# with the benchmark's Python requirements pre-installed.
FROM ubuntu:22.04
RUN apt-get update && apt-get install -y python3 python3-pip git gcc
WORKDIR /root
# requirements.txt is copied from the build context into /root (the WORKDIR).
COPY requirements.txt .
RUN pip install -r requirements.txt
# Build with:
# docker build -t xingyaoww/od-eval-mint:v1.0 .

View File

@@ -2,9 +2,11 @@
This folder contains the evaluation harness for the [MINT benchmark](https://arxiv.org/abs/2309.10691) on LLMs' ability to solve tasks with multi-turn interactions.
## Configure OpenDevin and LM
We support evaluation of the [Eurus subset, which focuses on math and code reasoning](https://arxiv.org/abs/2404.02078), including MATH, MMLU, TheoremQA, HumanEval, and MBPP.
Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.
## Setup Environment and LLM Configuration
Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
## Start the evaluation

View File

@@ -1,33 +1,36 @@
import asyncio
import functools
import logging
import os
import pathlib
from typing import Any, Dict
import pandas as pd
from datasets import load_dataset
from evaluation.swe_bench.swe_env_box import DockerSSHBox
from evaluation.mint.datatypes import TaskState
from evaluation.mint.env import SimplifiedEnv
from evaluation.mint.prompts import ToolPromptTemplate
from evaluation.mint.tasks import Task
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
from opendevin.core.logger import get_console_handler
from opendevin.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM
from .datatypes import TaskState
from .env import SimplifiedEnv
from .prompts import ToolPromptTemplate
from .tasks import Task
config = load_app_config()
from opendevin.core.main import create_runtime, run_controller
from opendevin.events.action import (
CmdRunAction,
)
from opendevin.events.observation import CmdOutputObservation
from opendevin.runtime.runtime import Runtime
def codeact_user_response_mint(state: State, task: Task, task_config: Dict[str, int]):
@@ -42,7 +45,7 @@ def codeact_user_response_mint(state: State, task: Task, task_config: Dict[str,
last_action = state.history.get_last_action()
result_state: TaskState = env.step(last_action.message or '')
state.task_state = result_state
state.extra_data['task_state'] = result_state
if not result_state.latest_output:
# Task is finished
@@ -62,85 +65,107 @@ AGENT_CLS_TO_INST_SUFFIX = {
'CodeActAgent': '\nIMPORTANT: When your answer is confirmed by the user to be correct, you can exit using the following command: <execute_bash> exit </execute_bash>.\n'
}
# Read requirements.txt (located next to this module) once at import time.
# Each line of the file becomes one entry of MINT_DEPENDENCIES.
with open(os.path.join(os.path.dirname(__file__), 'requirements.txt'), 'r') as f:
MINT_DEPENDENCIES = f.read().splitlines()
def process_instance(
# Return the text of the in-context example file for `task_name`.
#
# task_name: one of the keys of the mapping below; any other value raises KeyError.
# with_tool: must be truthy; otherwise the assert below fails (AssertionError).
# The file read is tasks/in_context_examples/<subset>/with_tool.txt, relative to
# this module's directory.
def load_incontext_example(task_name: str, with_tool: bool = True):
assert with_tool, 'NOT with_tool is not supported yet'
# Map the task name to the directory holding its example; several tasks share
# the 'reasoning' example.
subset = {
'gsm8k': 'reasoning',
'math': 'reasoning',
'mmlu': 'reasoning',
'theoremqa': 'reasoning',
'mbpp': 'mbpp',
'humaneval': 'humaneval',
}[task_name]
with open(
os.path.join(
os.path.dirname(__file__),
'tasks',
'in_context_examples',
subset,
'with_tool.txt',
),
'r',
) as f:
return f.read()
# Build the AppConfig for a MINT evaluation run.
#
# The agent class, iteration limit and LLM config are taken from `metadata`;
# the runtime kind, sandbox image and lint/network flags are fixed here.
# Returns the populated AppConfig.
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
config = AppConfig(
default_agent=metadata.agent_class,
run_as_devin=False,
runtime='eventstream',
max_iterations=metadata.max_iterations,
sandbox=SandboxConfig(
container_image='xingyaoww/od-eval-mint:v1.0',
enable_auto_lint=True,
use_host_network=False,
# All entries of MINT_DEPENDENCIES are joined with spaces into one pip command.
# NOTE(review): blank or comment lines in requirements.txt would be passed to pip
# verbatim - confirm the file contains only requirement specifiers.
od_runtime_extra_deps=f'$OD_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
# Attach the LLM settings selected for this evaluation.
config.set_llm_config(metadata.llm_config)
return config
async def initialize_runtime(runtime: Runtime):
"""Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent.
Creates /workspace and changes into it; each command must exit with code 0,
otherwise the corresponding assert raises AssertionError.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
obs: CmdOutputObservation
# Make sure the working directory exists, then switch into it
action = CmdRunAction(command='mkdir -p /workspace')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = await runtime.run_action(action)
assert obs.exit_code == 0
action = CmdRunAction(command='cd /workspace')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = await runtime.run_action(action)
assert obs.exit_code == 0
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
async def process_instance(
instance: Any,
metadata: EvalMetadata,
reset_logger: bool = True,
):
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(metadata.llm_config))
workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
# create process-specific workspace dir
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
config = get_config(metadata)
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
# Set up logger
log_file = os.path.join(
metadata.eval_output_dir, 'logs', f'instance_{instance.task_id}.log'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {instance.task_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
# use a session id for concurrent processing
sid = instance.task_id + '_' + str(os.getpid())
sandbox = DockerSSHBox(
config=config.sandbox,
persist_sandbox=False,
workspace_mount_path=config.workspace_mount_path,
sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
cache_dir=config.cache_dir,
run_as_devin=config.run_as_devin,
sid=sid,
)
requirements_host_src = 'evaluation/mint/requirements.txt'
requirements_sandbox_dest = '/opendevin/plugins/mint/requirements.txt'
sandbox.copy_to(
host_src=requirements_host_src,
sandbox_dest=requirements_sandbox_dest,
recursive=False,
)
logger.info(
f'Copied files from [{requirements_host_src}] to [{requirements_sandbox_dest}] inside sandbox.'
)
exit_code, output = sandbox.execute(f'pip install -r {requirements_sandbox_dest}')
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
# Prepare instruction
assert metadata.details is not None
instruction = ToolPromptTemplate(use_tool=True)(
max_total_steps=metadata.max_iterations,
max_propose_solution=metadata.details['max_propose_solution'],
in_context_example=instance.in_context_example(
use_tool=True, with_feedback=False
),
in_context_example=instance.in_context_example,
task_prompt='Task:\n' + instance.prompt,
)
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you or provide the concise RESULT inside <solution> tag AND NEVER ASK FOR HUMAN HELP.\n'
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
# Here's how you can run the agent (similar to the `main` function) and get the final task state
fake_user_response_fn = functools.partial(
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[agent.__class__.__name__],
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class],
task=instance,
task_config={
'max_iterations': metadata.max_iterations,
@@ -148,24 +173,22 @@ def process_instance(
},
)
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
max_budget_per_task=config.max_budget_per_task,
fake_user_response_fn=fake_user_response_fn,
sandbox=sandbox,
sid=sid,
)
runtime = await create_runtime(config, sid=instance.instance_id)
await initialize_runtime(runtime)
state: State | None = await run_controller(
config=config,
task_str=instruction,
runtime=runtime,
fake_user_response_fn=fake_user_response_fn,
)
if state is None:
raise ValueError('State should not be None.')
task_state = None
if hasattr(state, 'task_state'):
task_state = state.task_state
if 'task_state' in state.extra_data:
task_state = state.extra_data['task_state']
logger.info('Task state: ' + str(task_state.to_dict()))
metrics = state.metrics.get() if state.metrics else None
@@ -176,30 +199,37 @@ def process_instance(
histories = state.history.compatibility_for_eval_history_pairs()
# Save the output
output = {
'id': instance.task_id,
'instance': instance.to_dict(),
'instruction': instruction,
'metadata': metadata.model_dump(),
'history': histories,
'metrics': metrics,
'error': state.last_error if state and state.last_error else None,
'test_result': task_state.success if task_state else False,
}
# Close the sandbox
sandbox.close()
output = EvalOutput(
instance_id=instance.instance_id,
instance=instance.to_dict(),
instruction=instruction,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result={
'success': task_state.success if task_state else False,
},
)
return output
if __name__ == '__main__':
parser = get_parser()
SUBSETS = [
# Eurus subset: https://arxiv.org/abs/2404.02078
'math',
# 'gsm8k',
'mmlu',
'theoremqa',
'mbpp',
'humaneval',
]
parser.add_argument(
'--subset',
default='math',
choices=['math', 'gsm8k', 'mmlu', 'theoremqa', 'mbpp', 'humaneval'],
default='all',
choices=SUBSETS + ['all'],
type=str,
help='subset of the dataset to be used',
)
@@ -214,19 +244,36 @@ if __name__ == '__main__':
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
# so we don't need to manage file uploading to OpenDevin's repo
mint_dataset = load_dataset(
'ryanhoangt/xingyaoww-mint-bench', name=args.subset, split='test'
)
logger.info(f'Evaluating MINT - {args.subset} subset')
mint_tests = mint_dataset.to_pandas()
if args.subset == 'all':
subsets = SUBSETS
else:
subsets = [args.subset]
id_column = 'id'
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
dataset_dfs = []
for subset in subsets:
in_context_example = load_incontext_example(subset)
_cur_dataset = load_dataset(
'ryanhoangt/xingyaoww-mint-bench', name=subset, split='test'
)
logger.info(f'Loaded MINT - {subset} subset')
_df = _cur_dataset.to_pandas().rename(columns={'id': 'instance_id'})
_df['instance_id'] = _df['instance_id'].apply(lambda x: f'{subset}/{x}') # noqa
_df['in_context_example'] = in_context_example
dataset_dfs.append(_df)
logger.info(f'Loaded {len(_df)} instances for subset: {subset}')
dataset_df = pd.concat(dataset_dfs)
logger.info(f'Loaded {len(dataset_df)} instances for subset: {subsets}')
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
metadata = make_metadata(
llm_config,
args.dataset_name,
f'MINT-{args.subset}',
args.agent_cls,
args.max_iterations,
args.eval_note,
@@ -234,12 +281,7 @@ if __name__ == '__main__':
details={'max_propose_solution': args.max_propose_solution},
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(mint_dataset, output_file, args.eval_n_limit, id_column)
instances = prepare_dataset(dataset_df, output_file, args.eval_n_limit)
run_evaluation(
instances,
metadata,
output_file,
args.eval_num_workers,
process_instance,
id_column,
instances, metadata, output_file, args.eval_num_workers, process_instance
)

7
evaluation/mint/scripts/run_infer.sh Normal file → Executable file
View File

@@ -29,15 +29,16 @@ COMMAND="poetry run python ./evaluation/mint/run_infer.py \
--llm-config $MODEL_CONFIG \
--max-iterations 5 \
--max-propose-solution 2 \
--eval-num-workers $NUM_WORKERS \
--eval-num-workers $NUM_WORKERS
"
if [ -n "$SUBSET" ]; then
echo "SUBSET: $SUBSET"
COMMAND="$COMMAND --subset $SUBSET"
# otherwise default to use all subsets
else
echo "SUBSET: math"
COMMAND="$COMMAND --subset math"
echo "SUBSET: all"
COMMAND="$COMMAND --subset all"
fi
if [ -n "$EVAL_LIMIT" ]; then

View File

@@ -10,40 +10,9 @@ The task introduces new challenges for LLMs, such as comprehending long and lang
For more details on the ML-Bench task and dataset, please refer to the paper: [ML-Bench: Evaluating Large Language Models for Code Generation in Repository-Level Machine Learning Tasks](https://arxiv.org/abs/2311.09835).
## Setup Environment
## Setup Environment and LLM Configuration
Please follow the [OpenDevin setup guide](https://github.com/OpenDevin/OpenDevin/blob/main/docs/setup.md) to set up the local development environment for OpenDevin.
## Configure OpenDevin and your LLM
Create a `config.toml` file if it does not exist at the root of the workspace.
Add the following configurations:
```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
ssh_hostname = "localhost"
run_as_devin = false
sandbox_container_image = "public.ecr.aws/i5g0m1f6/ml-bench" # Use the latest image from the ML-Bench repository
[sandbox]
enable_auto_lint = true
# TODO: Change these to the model you want to evaluate
[llm.eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[llm.eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```
Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
## Run Inference on ML-Bench

Some files were not shown because too many files have changed in this diff Show More