Merge branch 'main' into rb/experimental-ui

2026-03-22 05:37:20 +08:00 · 2024-08-09 13:54:03 -04:00
parent b029883d33 00bc68642f
commit 381143b8fc
529 changed files with 31360 additions and 8552 deletions
--- a/.github/workflows/clean-up.yml
+++ b/.github/workflows/clean-up.yml
@@ -0,0 +1,68 @@
+# Workflow (manually triggered) that deletes old workflow runs to prevent out-of-disk issues
+name: Delete old workflow runs
+
+on:
+  workflow_dispatch:
+    inputs:
+      days:
+        description: 'Days-worth of runs to keep for each workflow'
+        required: true
+        default: '30'
+      minimum_runs:
+        description: 'Minimum runs to keep for each workflow'
+        required: true
+        default: '10'
+      delete_workflow_pattern:
+        description: 'Name or filename of the workflow (if not set, all workflows are targeted)'
+        required: false
+      delete_workflow_by_state_pattern:
+        description: 'Filter workflows by state: active, deleted, disabled_inactivity, disabled_manually'
+        required: true
+        default: 'ALL'
+        type: choice
+        options:
+          - 'ALL'
+          - active
+          - deleted
+          - disabled_inactivity
+          - disabled_manually
+      delete_run_by_conclusion_pattern:
+        description: 'Remove runs based on conclusion: action_required, cancelled, failure, skipped, success'
+        required: true
+        default: 'ALL'
+        type: choice
+        options:
+          - 'ALL'
+          - 'Unsuccessful: action_required,cancelled,failure,skipped'
+          - action_required
+          - cancelled
+          - failure
+          - skipped
+          - success
+      dry_run:
+        description: 'Logs simulated changes, no deletions are performed'
+        required: false
+
+jobs:
+  del_runs:
+    runs-on: ubuntu-latest
+    permissions:
+      actions: write  # needed to delete workflow runs
+      contents: read
+    steps:
+      - name: Delete workflow runs
+        uses: Mattraks/delete-workflow-runs@v2
+        with:
+          token: ${{ github.token }}
+          repository: ${{ github.repository }}
+          retain_days: ${{ github.event.inputs.days }}
+          keep_minimum_runs: ${{ github.event.inputs.minimum_runs }}
+          delete_workflow_pattern: ${{ github.event.inputs.delete_workflow_pattern }}
+          delete_workflow_by_state_pattern: ${{ github.event.inputs.delete_workflow_by_state_pattern }}
+          delete_run_by_conclusion_pattern: >-
+            ${{
+              startsWith(github.event.inputs.delete_run_by_conclusion_pattern, 'Unsuccessful:')
+              && 'action_required,cancelled,failure,skipped'
+              || github.event.inputs.delete_run_by_conclusion_pattern
+            }}
+          dry_run: ${{ github.event.inputs.dry_run }}
--- a/.github/workflows/deploy-docs.yml
+++ b/.github/workflows/deploy-docs.yml
@@ -1,3 +1,4 @@
+# Workflow that builds and deploys the documentation website
 name: Deploy Docs to GitHub Pages

 on:
@@ -5,10 +6,13 @@ on:
    branches:
      - main
  pull_request:
+    paths:
+      - 'docs/**'
    branches:
      - main

 jobs:
+  # Build the documentation website
  build:
    name: Build Docusaurus
    runs-on: ubuntu-latest
@@ -25,23 +29,23 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.11"
-
+          python-version: '3.11'
      - name: Generate Python Docs
        run: rm -rf docs/modules/python && pip install pydoc-markdown && pydoc-markdown
      - name: Install dependencies
        run: cd docs && npm ci
      - name: Build website
        run: cd docs && npm run build
-
      - name: Upload Build Artifact
        if: github.ref == 'refs/heads/main'
        uses: actions/upload-pages-artifact@v3
        with:
          path: docs/build

+  # Deploy the documentation website
  deploy:
    name: Deploy to GitHub Pages
+    runs-on: ubuntu-latest
    needs: build
    if: github.ref == 'refs/heads/main' && github.repository == 'OpenDevin/OpenDevin'
    # Grant GITHUB_TOKEN the permissions required to make a Pages deployment
@@ -52,7 +56,6 @@ jobs:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
-    runs-on: ubuntu-latest
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
--- a/.github/workflows/dummy-agent-test.yml
+++ b/.github/workflows/dummy-agent-test.yml
@@ -1,3 +1,4 @@
+# Workflow that uses the DummyAgent to run a simple task
 name: Run E2E test with dummy agent

 concurrency:
@@ -10,9 +11,6 @@ on:
    - main
  pull_request:

-env:
-  PERSIST_SANDBOX : "false"
-
 jobs:
  test:
    runs-on: ubuntu-latest
--- a/.github/workflows/ghcr.yml
+++ b/.github/workflows/ghcr.yml
@@ -1,3 +1,4 @@
+# Workflow that builds, tests and then pushes the docker images to the ghcr.io repository
 name: Build Publish and Test Runtime Image

 concurrency:
@@ -19,25 +20,21 @@ on:
        default: ''

 jobs:
+  # Builds the OpenDevin Docker images
  ghcr_build:
    runs-on: ubuntu-latest
-
    outputs:
      tags: ${{ steps.capture-tags.outputs.tags }}
-
    permissions:
      contents: read
      packages: write
-
    strategy:
      matrix:
-        image: ["sandbox", "opendevin"]
-        platform: ["amd64", "arm64"]
-
+        image: ['opendevin']
+        platform: ['amd64', 'arm64']
    steps:
      - name: Checkout
        uses: actions/checkout@v4
-
      - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/free-disk-space@main
        with:
@@ -52,51 +49,43 @@ jobs:
          large-packages: true
          docker-images: false
          swap-storage: true
-
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
-
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3
-
      - name: Build and export image
        id: build
        run: ./containers/build.sh ${{ matrix.image }} ${{ github.repository_owner }} ${{ matrix.platform }}
-
      - name: Capture tags
        id: capture-tags
        run: |
          tags=$(cat tags.txt)
          echo "tags=$tags"
          echo "tags=$tags" >> $GITHUB_OUTPUT
-
      - name: Upload Docker image as artifact
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.image }}-docker-image-${{ matrix.platform }}
          path: /tmp/${{ matrix.image }}_image_${{ matrix.platform }}.tar
+          retention-days: 14

+  # Builds the runtime Docker images
  ghcr_build_runtime:
    runs-on: ubuntu-latest
-
    outputs:
      tags: ${{ steps.capture-tags.outputs.tags }}
-
    permissions:
      contents: read
      packages: write
-
    strategy:
      matrix:
-        image: ["od_runtime"]
-        base_image: ["ubuntu:22.04"]
-        platform: ["amd64", "arm64"]
-
+        image: ['od_runtime']
+        base_image: ['ubuntu:22.04']
+        platform: ['amd64', 'arm64']
    steps:
      - name: Checkout
        uses: actions/checkout@v4
-
      - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/free-disk-space@main
        with:
@@ -111,67 +100,65 @@ jobs:
          large-packages: true
          docker-images: false
          swap-storage: true
-
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
-
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3
-
      - name: Install poetry via pipx
        run: pipx install poetry
-
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.11"
-          cache: "poetry"
-
+          python-version: '3.11'
+          cache: 'poetry'
      - name: Install Python dependencies using Poetry
        run: make install-python-dependencies
-
      - name: Create source distribution and Dockerfile
        run: poetry run python3 opendevin/runtime/utils/runtime_build.py --base_image ${{ matrix.base_image }} --build_folder containers/runtime
-
      - name: Build and export image
        id: build
-        run: ./containers/build.sh ${{ matrix.image }} ${{ github.repository_owner }} ${{ matrix.platform }}
-
+        run: |
+          if [ -f 'containers/runtime/Dockerfile' ]; then
+            echo 'Dockerfile detected, building runtime image...'
+            ./containers/build.sh ${{ matrix.image }} ${{ github.repository_owner }} ${{ matrix.platform }}
+          else
+            echo 'No Dockerfile detected which means an exact image is already built. Pulling the image and saving it to a tar file...'
+            source containers/runtime/config.sh  # presumably provides the DOCKER_IMAGE* variables used below
+            echo "$DOCKER_IMAGE_TAG $DOCKER_IMAGE_HASH_TAG" >> tags.txt
+            echo "Pulling image $DOCKER_IMAGE:$DOCKER_IMAGE_HASH_TAG to /tmp/${{ matrix.image }}_image_${{ matrix.platform }}.tar"
+            docker pull "$DOCKER_IMAGE:$DOCKER_IMAGE_HASH_TAG"
+            docker save "$DOCKER_IMAGE:$DOCKER_IMAGE_HASH_TAG" -o /tmp/${{ matrix.image }}_image_${{ matrix.platform }}.tar
+          fi
      - name: Capture tags
        id: capture-tags
        run: |
          tags=$(cat tags.txt)
          echo "tags=$tags"
          echo "tags=$tags" >> $GITHUB_OUTPUT
-
      - name: Upload Docker image as artifact
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.image }}-docker-image-${{ matrix.platform }}
          path: /tmp/${{ matrix.image }}_image_${{ matrix.platform }}.tar
+          retention-days: 14

+  # Run unit tests with the EventStream runtime Docker image
  test_runtime:
    name: Test Runtime
    runs-on: ubuntu-latest
    needs: [ghcr_build_runtime, ghcr_build]
-    env:
-      PERSIST_SANDBOX: "false"
-
    strategy:
      matrix:
-        runtime_type: ["eventstream", "server"]
-
+        runtime_type: ['eventstream']
    steps:
      - uses: actions/checkout@v4
-
      - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/free-disk-space@main
        with:
          # this might remove tools that are actually needed,
          # when set to "true" but frees about 6 GB
          tool-cache: true
-
          # all of these default to true, but feel free to set to
          # "false" if necessary for your workflow
          android: true
@@ -179,33 +166,27 @@ jobs:
          haskell: true
          large-packages: true
          swap-storage: true
-
      - name: Install poetry via pipx
        run: pipx install poetry
-
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.11"
-          cache: "poetry"
-
+          python-version: '3.11'
+          cache: 'poetry'
      - name: Install Python dependencies using Poetry
        run: make install-python-dependencies
-
      - name: Download Runtime Docker image
        if: matrix.runtime_type == 'eventstream'
        uses: actions/download-artifact@v4
        with:
          name: od_runtime-docker-image-amd64
          path: /tmp/
-
      - name: Download Sandbox Docker image
        if: matrix.runtime_type == 'server'
        uses: actions/download-artifact@v4
        with:
          name: sandbox-docker-image-amd64
          path: /tmp/
-
      - name: Load Runtime image and run runtime tests
        run: |
          # Load the Docker image and capture the output
@@ -222,50 +203,47 @@ jobs:
          echo "Loaded Docker image: $image_name"

          TEST_RUNTIME=${{ matrix.runtime_type }} SANDBOX_USER_ID=$(id -u) SANDBOX_CONTAINER_IMAGE=$image_name TEST_IN_CI=true poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml -s ./tests/unit/test_runtime.py
-
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

-  integration_tests_on_linux:
-    name: Integration Tests on Linux
+  # Run integration tests with the eventstream runtime Docker image
+  runtime_integration_tests_on_linux:
+    name: Runtime Integration Tests on Linux
    runs-on: ubuntu-latest
-    needs: ghcr_build
-    env:
-      PERSIST_SANDBOX: "false"
+    needs: [ghcr_build_runtime]
    strategy:
      fail-fast: false
      matrix:
-        python-version: ["3.11"]
-        sandbox: ["ssh", "local"]
+        python-version: ['3.11']
+        # server is tested in a separate workflow
+        runtime_type: ['eventstream']
    steps:
      - uses: actions/checkout@v4
-
      - name: Install poetry via pipx
        run: pipx install poetry
-
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'poetry'
-
      - name: Install Python dependencies using Poetry
        run: make install-python-dependencies
-
-      - name: Download sandbox Docker image
+      - name: Download Runtime Docker image
        uses: actions/download-artifact@v4
        with:
-          name: sandbox-docker-image-amd64
+          name: od_runtime-docker-image-amd64
          path: /tmp/
-
-      - name: Load sandbox image and run integration tests
-        env:
-          SANDBOX_BOX_TYPE: ${{ matrix.sandbox }}
+      - name: Load runtime image and run integration tests
        run: |
          # Load the Docker image and capture the output
-          output=$(docker load -i /tmp/sandbox_image_amd64.tar)
+          if [ "${{ matrix.runtime_type }}" == "eventstream" ]; then
+            output=$(docker load -i /tmp/od_runtime_image_amd64.tar)
+          else
+            echo "No Runtime Docker image to load"
+            exit 1
+          fi

          # Extract the first image name from the output
          image_name=$(echo "$output" | grep -oP 'Loaded image: \K.*' | head -n 1)
@@ -273,49 +251,40 @@ jobs:
          # Print the full name of the image
          echo "Loaded Docker image: $image_name"

-          SANDBOX_CONTAINER_IMAGE=$image_name TEST_IN_CI=true TEST_ONLY=true ./tests/integration/regenerate.sh
-
+          TEST_RUNTIME=${{ matrix.runtime_type }} SANDBOX_USER_ID=$(id -u) SANDBOX_CONTAINER_IMAGE=$image_name TEST_IN_CI=true TEST_ONLY=true ./tests/integration/regenerate.sh
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

-
+  # Push the OpenDevin Docker images to the ghcr.io repository
  ghcr_push:
    runs-on: ubuntu-latest
-    # don't push if integration tests or sandbox tests fail
-    needs: [ghcr_build, test_runtime, integration_tests_on_linux]
+    needs: [ghcr_build]
    if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
-
    env:
      tags: ${{ needs.ghcr_build.outputs.tags }}
-
    permissions:
      contents: read
      packages: write
-
    strategy:
      matrix:
-        image: ["sandbox", "opendevin"]
-        platform: ["amd64", "arm64"]
-
+        image: ['opendevin']
+        platform: ['amd64', 'arm64']
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
-
      - name: Login to GHCR
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
-
      - name: Download Docker images
        uses: actions/download-artifact@v4
        with:
          name: ${{ matrix.image }}-docker-image-${{ matrix.platform }}
          path: /tmp/${{ matrix.platform }}
-
      - name: Load images and push to registry
        run: |
          mv /tmp/${{ matrix.platform }}/${{ matrix.image }}_image_${{ matrix.platform }}.tar .
@@ -330,28 +299,23 @@ jobs:
            docker push $image_name:${tag}_${{ matrix.platform }}
          done

+  # Push the runtime Docker images to the ghcr.io repository
  ghcr_push_runtime:
    runs-on: ubuntu-latest
-    # don't push if runtime tests fail
-    needs: [ghcr_build_runtime, test_runtime, integration_tests_on_linux]
+    needs: [ghcr_build_runtime, test_runtime, runtime_integration_tests_on_linux]
    if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
-
    env:
-      tags: ${{ needs.ghcr_build_runtime.outputs.tags }}
-
+      RUNTIME_TAGS: ${{ needs.ghcr_build_runtime.outputs.tags }}
    permissions:
      contents: read
      packages: write
-
    strategy:
      matrix:
-        image: ["od_runtime"]
-        platform: ["amd64", "arm64"]
-
+        image: ['od_runtime']
+        platform: ['amd64', 'arm64']
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
-
      - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/free-disk-space@main
        with:
@@ -362,25 +326,21 @@ jobs:
          large-packages: true
          docker-images: false
          swap-storage: true
-
      - name: Login to GHCR
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
-
      - name: Download Docker images
        uses: actions/download-artifact@v4
        with:
          name: ${{ matrix.image }}-docker-image-${{ matrix.platform }}
          path: /tmp/${{ matrix.platform }}
-
      - name: List downloaded files
        run: |
          ls -la /tmp/${{ matrix.platform }}
          file /tmp/${{ matrix.platform }}/*
-
      - name: Load images and push to registry
        run: |
          mv /tmp/${{ matrix.platform }}/${{ matrix.image }}_image_${{ matrix.platform }}.tar ./${{ matrix.image }}_image_${{ matrix.platform }}.tar
@@ -389,46 +349,40 @@ jobs:
            exit 1
          fi
          echo "loaded image = $loaded_image"
-          tags=$(echo ${tags} | tr ' ' '\n')
          image_name=$(echo "ghcr.io/${{ github.repository_owner }}/${{ matrix.image }}" | tr '[:upper:]' '[:lower:]')
          echo "image name = $image_name"
-          for tag in $tags; do
+          echo "$RUNTIME_TAGS" | tr ' ' '\n' | while read -r tag; do
            echo "tag = $tag"
-            if [ -n "$image_name" ]; then
+            if [ -n "$image_name" ] && [ -n "$tag" ]; then
              docker tag $loaded_image $image_name:${tag}_${{ matrix.platform }}
              docker push $image_name:${tag}_${{ matrix.platform }}
            else
-              echo "Skipping tag and push due to empty image_name"
+              echo "Skipping tag and push due to empty image_name or tag"
            fi
          done

+  # Creates and pushes the OpenDevin Docker image manifests
  create_manifest:
    runs-on: ubuntu-latest
    needs: [ghcr_build, ghcr_push]
    if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
-
    env:
      tags: ${{ needs.ghcr_build.outputs.tags }}
-
    strategy:
      matrix:
-        image: ["sandbox", "opendevin"]
-
+        image: ['opendevin']
    permissions:
      contents: read
      packages: write
-
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
-
      - name: Login to GHCR
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
-
      - name: Create and push multi-platform manifest
        run: |
          image_name=$(echo "ghcr.io/${{ github.repository_owner }}/${{ matrix.image }}" | tr '[:upper:]' '[:lower:]')
@@ -441,33 +395,28 @@ jobs:
              $image_name:${tag}_arm64
          done

+  # Creates and pushes the runtime Docker image manifest
  create_manifest_runtime:
    runs-on: ubuntu-latest
    needs: [ghcr_build_runtime, ghcr_push_runtime]
    if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
-
    env:
      tags: ${{ needs.ghcr_build_runtime.outputs.tags }}
-
    strategy:
      matrix:
-        image: ["od_runtime"]
-
+        image: ['od_runtime']
    permissions:
      contents: read
      packages: write
-
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
-
      - name: Login to GHCR
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
-
      - name: Create and push multi-platform manifest
        run: |
          image_name=$(echo "ghcr.io/${{ github.repository_owner }}/${{ matrix.image }}" | tr '[:upper:]' '[:lower:]')
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -1,3 +1,4 @@
+# Workflow that runs lint on the frontend and python code
 name: Lint

 concurrency:
@@ -11,27 +12,26 @@ on:
  pull_request:

 jobs:
+  # Run lint on the frontend code
  lint-frontend:
    name: Lint frontend
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
-
      - name: Install Node.js 20
        uses: actions/setup-node@v4
        with:
          node-version: 20
-
      - name: Install dependencies
        run: |
          cd frontend
          npm install --frozen-lockfile
-
      - name: Lint
        run: |
          cd frontend
          npm run lint

+  # Run lint on the python code
  lint-python:
    name: Lint python
    runs-on: ubuntu-latest
--- a/.github/workflows/review-pr.yml
+++ b/.github/workflows/review-pr.yml
@@ -1,3 +1,4 @@
+# Workflow that uses OpenDevin to review a pull request. PR must be labeled 'review-this'
 name: Use OpenDevin to Review Pull Request

 on:
@@ -22,16 +23,13 @@ jobs:
      run: |
        sudo apt-get install -y git gh
        git config --global --add safe.directory $PWD
-
    - name: Checkout Repository
      uses: actions/checkout@v4
      with:
        ref: ${{ github.event.pull_request.base.ref }} # check out the target branch
-
    - name: Download Diff
      run: |
        curl -O "${{ github.event.pull_request.diff_url }}" -L
-
    - name: Write Task File
      run: |
        echo "Your coworker wants to apply a pull request to this project." > task.txt
@@ -45,19 +43,16 @@ jobs:
        echo "${{ github.event.pull_request.body }}" >> task.txt
        echo "" >> task.txt
        echo "Diff file is: ${{ github.event.pull_request.number }}.diff" >> task.txt
-
    - name: Set up environment
      run: |
        curl -sSL https://install.python-poetry.org | python3 -
        export PATH="/github/home/.local/bin:$PATH"
        poetry install --without evaluation,llama-index
        poetry run playwright install --with-deps chromium
-
    - name: Run OpenDevin
      env:
        LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
        LLM_MODEL: ${{ vars.LLM_MODEL }}
-        SANDBOX_BOX_TYPE: ssh
      run: |
        # Append path to launch poetry
        export PATH="/github/home/.local/bin:$PATH"
@@ -67,7 +62,6 @@ jobs:
        export WORKSPACE_BASE=$GITHUB_WORKSPACE
        echo -e "/exit\n" | poetry run python opendevin/core/main.py -i 50 -f task.txt
        rm task.txt
-
    - name: Check if review file is non-empty
      id: check_file
      run: |
@@ -76,7 +70,6 @@ jobs:
          echo "non_empty=true" >> $GITHUB_OUTPUT
        fi
      shell: bash
-
    - name: Create PR review if file is non-empty
      env:
        GH_TOKEN: ${{ github.token }}
--- a/.github/workflows/run-unit-tests.yml
+++ b/.github/workflows/run-unit-tests.yml
@@ -1,3 +1,4 @@
+# Workflow that runs frontend and python unit tests
 name: Run Unit Tests

 concurrency:
@@ -15,63 +16,52 @@ on:
      - 'evaluation/**'
  pull_request:

-env:
-  PERSIST_SANDBOX : "false"

 jobs:
+  # Run frontend unit tests
  fe-test:
    runs-on: ubuntu-latest
-
    strategy:
      matrix:
        node-version: [20]
-
    steps:
      - name: Checkout
        uses: actions/checkout@v4
-
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ matrix.node-version }}
-
      - name: Install dependencies
        working-directory: ./frontend
        run: npm ci
-
      - name: Run tests and collect coverage
        working-directory: ./frontend
        run: npm run test:coverage
-
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

+  # Run python unit tests on macOS
  test-on-macos:
    name: Test on macOS
    runs-on: macos-12
    env:
-      INSTALL_DOCKER: "1" # Set to '0' to skip Docker installation
+      INSTALL_DOCKER: '1' # Set to '0' to skip Docker installation
    strategy:
      matrix:
-        python-version: ["3.11"]
-
+        python-version: ['3.11']
    steps:
      - uses: actions/checkout@v4
-
      - name: Install poetry via pipx
        run: pipx install poetry
-
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
-          cache: "poetry"
-
+          cache: 'poetry'
      - name: Install Python dependencies using Poetry
        run: poetry install --without evaluation,llama-index
-
      - name: Install & Start Docker
        if: env.INSTALL_DOCKER == '1'
        run: |
@@ -120,47 +110,39 @@ jobs:
          # For testcontainers to find the Colima socket
          # https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
          sudo ln -sf $HOME/.colima/default/docker.sock /var/run/docker.sock
-
      - name: Build Environment
        run: make build
-
      - name: Run Tests
        run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox.py and not test_runtime.py"
-
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+
+  # Run python unit tests on Linux
  test-on-linux:
    name: Test on Linux
    runs-on: ubuntu-latest
    env:
-      INSTALL_DOCKER: "0" # Set to '0' to skip Docker installation
+      INSTALL_DOCKER: '0' # Set to '0' to skip Docker installation
    strategy:
      matrix:
-        python-version: ["3.11"]
-
+        python-version: ['3.11']
    steps:
      - uses: actions/checkout@v4
-
      - name: Install poetry via pipx
        run: pipx install poetry
-
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
-          cache: "poetry"
-
+          cache: 'poetry'
      - name: Install Python dependencies using Poetry
        run: poetry install --without evaluation,llama-index
-
      - name: Build Environment
        run: make build
-
      - name: Run Tests
        run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox.py and not test_runtime.py"
-
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
--- a/.github/workflows/solve-issue.yml
+++ b/.github/workflows/solve-issue.yml
@@ -1,3 +1,4 @@
+# Workflow that uses OpenDevin to resolve a GitHub issue. Issue must be labeled 'solve-this'
 name: Use OpenDevin to Resolve GitHub Issue

 on:
@@ -17,14 +18,11 @@ jobs:
      image: ghcr.io/opendevin/opendevin
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
-
    steps:
    - name: install git, github cli
      run: apt-get install -y git gh
-
    - name: Checkout Repository
      uses: actions/checkout@v4
-
    - name: Write Task File
      env:
        ISSUE_TITLE: ${{ github.event.issue.title }}
@@ -35,22 +33,18 @@ jobs:
        echo "" >> task.txt
        echo "BODY:" >> task.txt
        echo "${ISSUE_BODY}" >> task.txt
-
    - name: Set up environment
      run: |
        curl -sSL https://install.python-poetry.org | python3 -
        export PATH="/github/home/.local/bin:$PATH"
        poetry install --without evaluation,llama-index
        poetry run playwright install --with-deps chromium
-
-
    - name: Run OpenDevin
      env:
        ISSUE_TITLE: ${{ github.event.issue.title }}
        ISSUE_BODY: ${{ github.event.issue.body }}
        LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        SANDBOX_BOX_TYPE: ssh
      run: |
        # Append path to launch poetry
        export PATH="/github/home/.local/bin:$PATH"
@@ -58,7 +52,6 @@ jobs:
        export PYTHONPATH=$(pwd):$PYTHONPATH
        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE poetry run python ./opendevin/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
        rm task.txt
-
    - name: Setup Git, Create Branch, and Commit Changes
      run: |
        # Setup Git configuration
@@ -84,7 +77,6 @@ jobs:

        # Push changes
        git push --set-upstream origin $BRANCH_NAME
-
    - name: Fetch Default Branch
      env:
        GH_TOKEN: ${{ github.token }}
@@ -93,7 +85,6 @@ jobs:
        DEFAULT_BRANCH=$(gh repo view --json defaultBranchRef --jq .defaultBranchRef.name)
        echo "Default branch is $DEFAULT_BRANCH"
        echo "DEFAULT_BRANCH=$DEFAULT_BRANCH" >> $GITHUB_ENV
-
    - name: Generate PR
      env:
        GH_TOKEN: ${{ github.token }}
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -1,4 +1,6 @@
+# Workflow that marks issues and PRs with no activity for 30 days with "Stale" and closes them after 7 more days of no activity
 name: 'Close stale issues'
+
 on:
  schedule:
    - cron: '30 1 * * *'
@@ -9,21 +11,9 @@ jobs:
    steps:
      - uses: actions/stale@v9
        with:
-          # Aggressively close issues that have been explicitly labeled `age-out`
-          any-of-labels: age-out
-          stale-issue-message: 'This issue is stale because it has been open for 7 days with no activity. Remove stale label or comment or this will be closed in 1 day.'
-          close-issue-message: 'This issue was closed because it has been stalled for over 7 days with no activity.'
-          stale-pr-message: 'This PR is stale because it has been open for 7 days with no activity. Remove stale label or comment or this will be closed in 1 days.'
-          close-pr-message: 'This PR was closed because it has been stalled for over 7 days with no activity.'
-          days-before-stale: 7
-          days-before-close: 1
-
-      - uses: actions/stale@v9
-        with:
-          # Be more lenient with other issues
          stale-issue-message: 'This issue is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
-          close-issue-message: 'This issue was closed because it has been stalled for over 30 days with no activity.'
          stale-pr-message: 'This PR is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
-          close-pr-message: 'This PR was closed because it has been stalled for over 30 days with no activity.'
          days-before-stale: 30
+          close-issue-message: 'This issue was closed because it has been stalled for over 30 days with no activity.'
+          close-pr-message: 'This PR was closed because it has been stalled for over 30 days with no activity.'
          days-before-close: 7
--- a/.github/workflows/update-pyproject-version.yml
+++ b/.github/workflows/update-pyproject-version.yml
@@ -1,48 +0,0 @@
-name: Update pyproject.toml Version and Tags
-
-on:
-  release:
-    types:
-      - published
-
-jobs:
-  update-pyproject-and-tags:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0  # Fetch all history for all branches and tags
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install toml
-
-      - name: Get release tag
-        id: get_release_tag
-        run: echo "RELEASE_TAG=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV
-
-      - name: Update pyproject.toml with release tag
-        run: |
-          python -c "
-          import toml
-          with open('pyproject.toml', 'r') as f:
-              data = toml.load(f)
-          data['tool']['poetry']['version'] = '${{ env.RELEASE_TAG }}'
-          with open('pyproject.toml', 'w') as f:
-              toml.dump(data, f)
-          "
-
-      - name: Commit and push pyproject.toml changes
-        uses: stefanzweifel/git-auto-commit-action@v4
-        with:
-          commit_message: "Update pyproject.toml version to ${{ env.RELEASE_TAG }}"
-          branch: main
-          file_pattern: pyproject.toml
--- a/.gitignore
+++ b/.gitignore
@@ -169,6 +169,10 @@ evaluation/outputs
 evaluation/swe_bench/eval_workspace*
 evaluation/SWE-bench/data
 evaluation/webarena/scripts/webarena_env.sh
+evaluation/bird/data
+evaluation/gaia/data
+evaluation/gorilla/data
+evaluation/toolqa/data

 # frontend

--- a/Makefile
+++ b/Makefile
@@ -23,9 +23,6 @@ RESET=$(shell tput -Txterm sgr0)
 build:
 	@echo "$(GREEN)Building project...$(RESET)"
 	@$(MAKE) -s check-dependencies
-ifeq ($(INSTALL_DOCKER),)
-	@$(MAKE) -s pull-docker-image
-endif
 	@$(MAKE) -s install-python-dependencies
 	@$(MAKE) -s install-frontend-dependencies
 	@$(MAKE) -s install-pre-commit-hooks
@@ -124,11 +121,6 @@ check-poetry:
 		exit 1; \
 	fi

-pull-docker-image:
-	@echo "$(YELLOW)Pulling Docker image...$(RESET)"
-	@docker pull $(DOCKER_IMAGE)
-	@echo "$(GREEN)Docker image pulled successfully.$(RESET)"
-
 install-python-dependencies:
 	@echo "$(GREEN)Installing Python dependencies...$(RESET)"
 	@if [ -z "${TZ}" ]; then \
@@ -246,16 +238,6 @@ setup-config-prompts:
 	 workspace_dir=$${workspace_dir:-$(DEFAULT_WORKSPACE_DIR)}; \
 	 echo "workspace_base=\"$$workspace_dir\"" >> $(CONFIG_FILE).tmp

-	@read -p "Do you want to persist the sandbox container? [true/false] [default: false]: " persist_sandbox; \
-	 persist_sandbox=$${persist_sandbox:-false}; \
-	 if [ "$$persist_sandbox" = "true" ]; then \
-		 read -p "Enter a password for the sandbox container: " ssh_password; \
-		 echo "ssh_password=\"$$ssh_password\"" >> $(CONFIG_FILE).tmp; \
-		 echo "persist_sandbox=$$persist_sandbox" >> $(CONFIG_FILE).tmp; \
-	 else \
-		echo "persist_sandbox=$$persist_sandbox" >> $(CONFIG_FILE).tmp; \
-	 fi
-
 	@echo "" >> $(CONFIG_FILE).tmp

 	@echo "[llm]" >> $(CONFIG_FILE).tmp
@@ -316,4 +298,4 @@ help:
 	@echo "  $(GREEN)help$(RESET)                - Display this help message, providing information on available targets."

 # Phony targets
-.PHONY: build check-dependencies check-python check-npm check-docker check-poetry pull-docker-image install-python-dependencies install-frontend-dependencies install-pre-commit-hooks lint start-backend start-frontend run run-wsl setup-config setup-config-prompts help
+.PHONY: build check-dependencies check-python check-npm check-docker check-poetry install-python-dependencies install-frontend-dependencies install-pre-commit-hooks lint start-backend start-frontend run run-wsl setup-config setup-config-prompts help
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@
  <a href="https://github.com/OpenDevin/OpenDevin/issues"><img src="https://img.shields.io/github/issues/opendevin/opendevin?style=for-the-badge&color=blue" alt="Issues"></a>
  <a href="https://github.com/OpenDevin/OpenDevin/blob/main/LICENSE"><img src="https://img.shields.io/github/license/opendevin/opendevin?style=for-the-badge&color=blue" alt="MIT License"></a>
  <br/>
-  <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2i1iqdag6-bVmvamiPA9EZUu7oCO6KhA"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
+  <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
  <a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
  <a href="https://codecov.io/github/opendevin/opendevin?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/opendevin/opendevin?style=for-the-badge"></a>
 </div>
@@ -66,7 +66,7 @@ docker run -it \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name opendevin-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/opendevin/opendevin
+    ghcr.io/opendevin/opendevin:0.8
 ```

 > [!NOTE]
@@ -111,7 +111,7 @@ For details, please check [CONTRIBUTING.md](./CONTRIBUTING.md).
 Whether you're a developer, a researcher, or simply enthusiastic about OpenDevin, we'd love to have you in our community.
 Let's make software engineering better together!

- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) - Here we talk about research, architecture, and future development.
+- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw) - Here we talk about research, architecture, and future development.
 - [Discord server](https://discord.gg/ESHStjSjD4) - This is a community-run server for general discussion, questions, and feedback.

 ## 📈 Progress
@@ -141,12 +141,12 @@ Distributed under the MIT License. See [`LICENSE`](./LICENSE) for more informati

 ```
@misc{opendevin,
-      title={{OpenDevin: An Open Platform for AI Software Developers as Generalist Agents}}, 
+      title={{OpenDevin: An Open Platform for AI Software Developers as Generalist Agents}},
      author={Xingyao Wang and Boxuan Li and Yufan Song and Frank F. Xu and Xiangru Tang and Mingchen Zhuge and Jiayi Pan and Yueqi Song and Bowen Li and Jaskirat Singh and Hoang H. Tran and Fuqiang Li and Ren Ma and Mingzhang Zheng and Bill Qian and Yanjun Shao and Niklas Muennighoff and Yizhe Zhang and Binyuan Hui and Junyang Lin and Robert Brennan and Hao Peng and Heng Ji and Graham Neubig},
      year={2024},
      eprint={2407.16741},
      archivePrefix={arXiv},
      primaryClass={cs.SE},
-      url={https://arxiv.org/abs/2407.16741}, 
+      url={https://arxiv.org/abs/2407.16741},
 }
 ```
--- a/agenthub/browsing_agent/browsing_agent.py
+++ b/agenthub/browsing_agent/browsing_agent.py
@@ -7,6 +7,7 @@ from agenthub.browsing_agent.response_parser import BrowsingResponseParser
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.message import Message, TextContent
 from opendevin.events.action import (
    Action,
    AgentFinishAction,
@@ -136,7 +137,7 @@ class BrowsingAgent(Agent):
        - MessageAction(content) - Message action to run (e.g. ask for clarification)
        - AgentFinishAction() - end the interaction
        """
-        messages = []
+        messages: list[Message] = []
        prev_actions = []
        cur_axtree_txt = ''
        error_prefix = ''
@@ -191,20 +192,23 @@ class BrowsingAgent(Agent):
                )
                return MessageAction('Error encountered when browsing.')

-        if (goal := state.get_current_user_intent()) is None:
+        goal, _ = state.get_current_user_intent()
+
+        if goal is None:
            goal = state.inputs['task']
+
        system_msg = get_system_message(
            goal,
            self.action_space.describe(with_long_description=False, with_examples=True),
        )

-        messages.append({'role': 'system', 'content': system_msg})
+        messages.append(Message(role='system', content=[TextContent(text=system_msg)]))

        prompt = get_prompt(error_prefix, cur_axtree_txt, prev_action_str)
-        messages.append({'role': 'user', 'content': prompt})
+        messages.append(Message(role='user', content=[TextContent(text=prompt)]))
        logger.debug(prompt)
        response = self.llm.completion(
-            messages=messages,
+            messages=[message.model_dump() for message in messages],
            temperature=0.0,
            stop=[')```', ')\n```'],
        )
--- a/agenthub/codeact_agent/codeact_agent.py
+++ b/agenthub/codeact_agent/codeact_agent.py
@@ -8,6 +8,7 @@ from agenthub.codeact_agent.prompt import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
+from opendevin.core.message import ImageContent, Message, TextContent
 from opendevin.events.action import (
    Action,
    AgentDelegateAction,
@@ -131,7 +132,7 @@ class CodeActAgent(Agent):
            return action.thought
        return ''

-    def get_action_message(self, action: Action) -> dict[str, str] | None:
+    def get_action_message(self, action: Action) -> Message | None:
        if (
            isinstance(action, AgentDelegateAction)
            or isinstance(action, CmdRunAction)
@@ -139,39 +140,41 @@ class CodeActAgent(Agent):
            or isinstance(action, MessageAction)
            or (isinstance(action, AgentFinishAction) and action.source == 'agent')
        ):
-            return {
-                'role': 'user' if action.source == 'user' else 'assistant',
-                'content': self.action_to_str(action),
-            }
+            content = [TextContent(text=self.action_to_str(action))]
+
+            if isinstance(action, MessageAction) and action.images_urls:
+                content.append(ImageContent(image_urls=action.images_urls))
+
+            return Message(
+                role='user' if action.source == 'user' else 'assistant', content=content
+            )
        return None

-    def get_observation_message(self, obs: Observation) -> dict[str, str] | None:
+    def get_observation_message(self, obs: Observation) -> Message | None:
        max_message_chars = self.llm.config.max_message_chars
        if isinstance(obs, CmdOutputObservation):
-            content = 'OBSERVATION:\n' + truncate_content(
-                obs.content, max_message_chars
-            )
-            content += (
+            text = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
+            text += (
                f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
            )
-            return {'role': 'user', 'content': content}
+            return Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, IPythonRunCellObservation):
-            content = 'OBSERVATION:\n' + obs.content
+            text = 'OBSERVATION:\n' + obs.content
            # replace base64 images with a placeholder
-            splitted = content.split('\n')
+            splitted = text.split('\n')
            for i, line in enumerate(splitted):
                if '![image](data:image/png;base64,' in line:
                    splitted[i] = (
                        '![image](data:image/png;base64, ...) already displayed to user'
                    )
-            content = '\n'.join(splitted)
-            content = truncate_content(content, max_message_chars)
-            return {'role': 'user', 'content': content}
+            text = '\n'.join(splitted)
+            text = truncate_content(text, max_message_chars)
+            return Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, AgentDelegateObservation):
-            content = 'OBSERVATION:\n' + truncate_content(
+            text = 'OBSERVATION:\n' + truncate_content(
                str(obs.outputs), max_message_chars
            )
-            return {'role': 'user', 'content': content}
+            return Message(role='user', content=[TextContent(text=text)])
        return None

    def reset(self) -> None:
@@ -198,10 +201,10 @@ class CodeActAgent(Agent):
            return AgentFinishAction()

        # prepare what we want to send to the LLM
-        messages: list[dict[str, str]] = self._get_messages(state)
+        messages = self._get_messages(state)

        response = self.llm.completion(
-            messages=messages,
+            messages=[message.model_dump() for message in messages],
            stop=[
                '</execute_ipython>',
                '</execute_bash>',
@@ -211,11 +214,11 @@ class CodeActAgent(Agent):
        )
        return self.action_parser.parse(response)

-    def _get_messages(self, state: State) -> list[dict[str, str]]:
+    def _get_messages(self, state: State) -> list[Message]:
        system_message: str = get_system_message(state.prompt_context)
        messages = [
-            {'role': 'system', 'content': system_message},
-            {'role': 'user', 'content': self.in_context_example},
+            Message(role='system', content=[TextContent(text=system_message)]),
+            Message(role='user', content=[TextContent(text=self.in_context_example)]),
        ]

        for event in state.history.get_events():
@@ -229,18 +232,44 @@ class CodeActAgent(Agent):

            # add regular message
            if message:
-                messages.append(message)
+                # Merge into the previous message when it has the SAME role; otherwise some providers reject the request, e.g.
+                # litellm.exceptions.BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
+                # There must not be two consecutive messages from the same role.
+                if messages and messages[-1].role == message.role:
+                    messages[-1].content.extend(message.content)
+                else:
+                    messages.append(message)

        # the latest user message is important:
        # we want to remind the agent of the environment constraints
        latest_user_message = next(
-            (m for m in reversed(messages) if m['role'] == 'user'), None
+            (
+                m
+                for m in reversed(messages)
+                if m.role == 'user'
+                and any(isinstance(c, TextContent) for c in m.content)
+            ),
+            None,
        )

-        # add a reminder to the prompt
+        # Get the last user text inside content
        if latest_user_message:
-            latest_user_message['content'] += (
-                f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>'
+            latest_user_message_text = next(
+                (
+                    t
+                    for t in reversed(latest_user_message.content)
+                    if isinstance(t, TextContent)
+                )
            )
+            # add a reminder to the prompt
+            reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
+
+            if latest_user_message_text:
+                latest_user_message_text.text = (
+                    latest_user_message_text.text + reminder_text
+                )
+            else:
+                latest_user_message_text = TextContent(text=reminder_text)
+                latest_user_message.content.append(latest_user_message_text)

        return messages
--- a/agenthub/codeact_swe_agent/codeact_swe_agent.py
+++ b/agenthub/codeact_swe_agent/codeact_swe_agent.py
@@ -7,6 +7,7 @@ from agenthub.codeact_swe_agent.prompt import (
 from agenthub.codeact_swe_agent.response_parser import CodeActSWEResponseParser
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
+from opendevin.core.message import ImageContent, Message, TextContent
 from opendevin.events.action import (
    Action,
    AgentFinishAction,
@@ -84,40 +85,43 @@ class CodeActSWEAgent(Agent):
            return action.content
        return ''

-    def get_action_message(self, action: Action) -> dict[str, str] | None:
+    def get_action_message(self, action: Action) -> Message | None:
        if (
            isinstance(action, CmdRunAction)
            or isinstance(action, IPythonRunCellAction)
            or isinstance(action, MessageAction)
        ):
-            return {
-                'role': 'user' if action.source == 'user' else 'assistant',
-                'content': self.action_to_str(action),
-            }
+            content = [TextContent(text=self.action_to_str(action))]
+
+            if isinstance(action, MessageAction) and action.images_urls:
+                content.append(ImageContent(image_urls=action.images_urls))
+
+            return Message(
+                role='user' if action.source == 'user' else 'assistant', content=content
+            )
+
        return None

-    def get_observation_message(self, obs: Observation) -> dict[str, str] | None:
+    def get_observation_message(self, obs: Observation) -> Message | None:
        max_message_chars = self.llm.config.max_message_chars
        if isinstance(obs, CmdOutputObservation):
-            content = 'OBSERVATION:\n' + truncate_content(
-                obs.content, max_message_chars
-            )
-            content += (
+            text = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
+            text += (
                f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
            )
-            return {'role': 'user', 'content': content}
+            return Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, IPythonRunCellObservation):
-            content = 'OBSERVATION:\n' + obs.content
+            text = 'OBSERVATION:\n' + obs.content
            # replace base64 images with a placeholder
-            splitted = content.split('\n')
+            splitted = text.split('\n')
            for i, line in enumerate(splitted):
                if '![image](data:image/png;base64,' in line:
                    splitted[i] = (
                        '![image](data:image/png;base64, ...) already displayed to user'
                    )
-            content = '\n'.join(splitted)
-            content = truncate_content(content, max_message_chars)
-            return {'role': 'user', 'content': content}
+            text = '\n'.join(splitted)
+            text = truncate_content(text, max_message_chars)
+            return Message(role='user', content=[TextContent(text=text)])
        return None

    def reset(self) -> None:
@@ -143,10 +147,10 @@ class CodeActSWEAgent(Agent):
            return AgentFinishAction()

        # prepare what we want to send to the LLM
-        messages: list[dict[str, str]] = self._get_messages(state)
+        messages: list[Message] = self._get_messages(state)

        response = self.llm.completion(
-            messages=messages,
+            messages=[message.model_dump() for message in messages],
            stop=[
                '</execute_ipython>',
                '</execute_bash>',
@@ -156,10 +160,10 @@ class CodeActSWEAgent(Agent):

        return self.response_parser.parse(response)

-    def _get_messages(self, state: State) -> list[dict[str, str]]:
-        messages = [
-            {'role': 'system', 'content': self.system_message},
-            {'role': 'user', 'content': self.in_context_example},
+    def _get_messages(self, state: State) -> list[Message]:
+        messages: list[Message] = [
+            Message(role='system', content=[TextContent(text=self.system_message)]),
+            Message(role='user', content=[TextContent(text=self.in_context_example)]),
        ]

        for event in state.history.get_events():
@@ -173,18 +177,38 @@ class CodeActSWEAgent(Agent):

            # add regular message
            if message:
-                messages.append(message)
+                # Merge into the previous message when it has the SAME role; otherwise some providers reject the request, e.g.
+                # litellm.exceptions.BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
+                # There must not be two consecutive messages from the same role.
+                if messages and messages[-1].role == message.role:
+                    messages[-1].content.extend(message.content)
+                else:
+                    messages.append(message)

        # the latest user message is important:
        # we want to remind the agent of the environment constraints
        latest_user_message = next(
-            (m for m in reversed(messages) if m['role'] == 'user'), None
+            (m for m in reversed(messages) if m.role == 'user'), None
        )

-        # add a reminder to the prompt
+        # Get the last user text inside content
        if latest_user_message:
-            latest_user_message['content'] += (
-                f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task.'
+            latest_user_message_text = next(
+                filter(
+                    lambda c: isinstance(c, TextContent),
+                    reversed(latest_user_message.content),
+                ),
+                None,  # no text item: fall through to the branch that appends the reminder
+            )
+            # add a reminder to the prompt
+            reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
+
+            if latest_user_message_text:
+                latest_user_message_text.text = (
+                    latest_user_message_text.text + reminder_text
+                )
+            else:
+                latest_user_message_text = TextContent(text=reminder_text)
+                latest_user_message.content.append(latest_user_message_text)

        return messages
--- a/agenthub/delegator_agent/agent.py
+++ b/agenthub/delegator_agent/agent.py
@@ -34,7 +34,7 @@ class DelegatorAgent(Agent):
        """
        if self.current_delegate == '':
            self.current_delegate = 'study'
-            task = state.get_current_user_intent()
+            task, _ = state.get_current_user_intent()
            return AgentDelegateAction(
                agent='StudyRepoForTaskAgent', inputs={'task': task}
            )
@@ -45,7 +45,7 @@ class DelegatorAgent(Agent):
        if not isinstance(last_observation, AgentDelegateObservation):
            raise Exception('Last observation is not an AgentDelegateObservation')

-        goal = state.get_current_user_intent()
+        goal, _ = state.get_current_user_intent()
        if self.current_delegate == 'study':
            self.current_delegate = 'coder'
            return AgentDelegateAction(
--- a/agenthub/dummy_agent/agent.py
+++ b/agenthub/dummy_agent/agent.py
@@ -208,9 +208,3 @@ class DummyAgent(Agent):
                f' Unable to perform interactive browsing: {action.browser_actions}'
            )
        return MessageAction(content=message)
-
-    async def get_working_directory(self, state: State) -> str:
-        # Implement this method to return the current working directory
-        # This might involve accessing state information or making an async call
-        # For now, we'll return a placeholder value
-        return './workspace'
--- a/agenthub/micro/agent.py
+++ b/agenthub/micro/agent.py
@@ -2,6 +2,7 @@ from jinja2 import BaseLoader, Environment

 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
+from opendevin.core.message import ImageContent, Message, TextContent
 from opendevin.core.utils import json
 from opendevin.events.action import Action
 from opendevin.events.serialization.action import action_from_dict
@@ -62,16 +63,20 @@ class MicroAgent(Agent):
        del self.delegates[self.agent_definition['name']]

    def step(self, state: State) -> Action:
+        last_user_message, last_image_urls = state.get_current_user_intent()
        prompt = self.prompt_template.render(
            state=state,
            instructions=instructions,
            to_json=to_json,
            history_to_json=self.history_to_json,
            delegates=self.delegates,
-            latest_user_message=state.get_current_user_intent(),
+            latest_user_message=last_user_message,
        )
-        messages = [{'content': prompt, 'role': 'user'}]
-        resp = self.llm.completion(messages=messages)
+        content = [TextContent(text=prompt)]
+        if last_image_urls:
+            content.append(ImageContent(image_urls=last_image_urls))
+        message = Message(role='user', content=content)
+        resp = self.llm.completion(messages=[message.model_dump()])
        action_resp = resp['choices'][0]['message']['content']
        action = parse_response(action_resp)
        return action
--- a/agenthub/micro/commit_writer/README.md
+++ b/agenthub/micro/commit_writer/README.md
@@ -3,7 +3,7 @@
 CommitWriterAgent can help write git commit message. Example:

 ```bash
-WORKSPACE_MOUNT_PATH="`PWD`" SANDBOX_BOX_TYPE="ssh" \
+WORKSPACE_MOUNT_PATH="$(pwd)" \
  poetry run python opendevin/core/main.py -t "dummy task" -c CommitWriterAgent -d ./
 ```

--- a/agenthub/planner_agent/agent.py
+++ b/agenthub/planner_agent/agent.py
@@ -1,11 +1,12 @@
 from agenthub.planner_agent.response_parser import PlannerResponseParser
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
+from opendevin.core.message import ImageContent, Message, TextContent
 from opendevin.events.action import Action, AgentFinishAction
 from opendevin.llm.llm import LLM
 from opendevin.runtime.tools import RuntimeTool

-from .prompt import get_prompt
+from .prompt import get_prompt_and_images


 class PlannerAgent(Agent):
@@ -42,7 +43,13 @@ class PlannerAgent(Agent):
            'abandoned',
        ]:
            return AgentFinishAction()
-        prompt = get_prompt(state, self.llm.config.max_message_chars)
-        messages = [{'content': prompt, 'role': 'user'}]
-        resp = self.llm.completion(messages=messages)
+
+        prompt, image_urls = get_prompt_and_images(
+            state, self.llm.config.max_message_chars
+        )
+        content = [TextContent(text=prompt)]
+        if image_urls:
+            content.append(ImageContent(image_urls=image_urls))
+        message = Message(role='user', content=content)
+        resp = self.llm.completion(messages=[message.model_dump()])
        return self.response_parser.parse(resp)
--- a/agenthub/planner_agent/prompt.py
+++ b/agenthub/planner_agent/prompt.py
@@ -115,7 +115,9 @@ def get_hint(latest_action_id: str) -> str:
    return hints.get(latest_action_id, '')


-def get_prompt(state: State, max_message_chars: int) -> str:
+def get_prompt_and_images(
+    state: State, max_message_chars: int
+) -> tuple[str, list[str]]:
    """Gets the prompt for the planner agent.

    Formatted with the most recent action-observation pairs, current task, and hint based on last action
@@ -161,16 +163,16 @@ def get_prompt(state: State, max_message_chars: int) -> str:
    logger.info('HINT:\n' + hint, extra={'msg_type': 'DETAIL'})

    # the last relevant user message (the task)
-    task = state.get_current_user_intent()
+    message, image_urls = state.get_current_user_intent()

    # finally, fill in the prompt
    return prompt % {
-        'task': task,
+        'task': message,
        'plan': plan_str,
        'history': history_str,
        'hint': hint,
        'plan_status': plan_status,
-    }
+    }, image_urls


 def parse_response(response: str) -> Action:
--- a/config.template.toml
+++ b/config.template.toml
@@ -55,24 +55,11 @@ workspace_base = "./workspace"
 # Path to rewrite the workspace mount path to
 #workspace_mount_rewrite = ""

-# Persist the sandbox
-persist_sandbox = false
-
 # Run as devin
 #run_as_devin = true

 # Runtime environment
-#runtime = "server"
-
-# SSH hostname for the sandbox
-#ssh_hostname = "localhost"
-
-# SSH password for the sandbox
-#ssh_password = ""
-
-# SSH port for the sandbox
-#ssh_port = 63710
-
+#runtime = "eventstream"

 # Name of the default agent
 #default_agent = "CodeActAgent"
@@ -183,9 +170,6 @@ llm_config = 'gpt3'
 # Sandbox timeout in seconds
 #timeout = 120

-# Sandbox type (ssh, e2b, local)
-#box_type = "ssh"
-
 # Sandbox user ID
 #user_id = 1000

--- a/containers/app/Dockerfile
+++ b/containers/app/Dockerfile
@@ -32,11 +32,13 @@ FROM python:3.12.3-slim AS runtime

 WORKDIR /app

+# Re-declare the build arg for this stage (a trailing "#" on an ARG line is parsed as arguments, not as a comment)
+ARG OPEN_DEVIN_BUILD_VERSION
 ENV RUN_AS_DEVIN=true
 # A random number--we need this to be different from the user's UID on the host machine
 ENV OPENDEVIN_USER_ID=42420
+ENV SANDBOX_API_HOSTNAME=host.docker.internal
 ENV USE_HOST_NETWORK=false
-ENV SSH_HOSTNAME=host.docker.internal
 ENV WORKSPACE_BASE=/opt/workspace_base
 ENV OPEN_DEVIN_BUILD_VERSION=$OPEN_DEVIN_BUILD_VERSION
 RUN mkdir -p $WORKSPACE_BASE
@@ -44,8 +46,10 @@ RUN mkdir -p $WORKSPACE_BASE
 RUN apt-get update -y \
    && apt-get install -y curl ssh sudo

-RUN sed -i 's/^UID_MIN.*/UID_MIN 499/' /etc/login.defs # Default is 1000, but OSX is often 501
-RUN sed -i 's/^UID_MAX.*/UID_MAX 1000000/' /etc/login.defs # Default is 60000, but we've seen up to 200000
+# Default is 1000, but OSX is often 501
+RUN sed -i 's/^UID_MIN.*/UID_MIN 499/' /etc/login.defs
+# Default is 60000, but we've seen up to 200000
+RUN sed -i 's/^UID_MAX.*/UID_MAX 1000000/' /etc/login.defs

 RUN groupadd app
 RUN useradd -l -m -u $OPENDEVIN_USER_ID -s /bin/bash opendevin && \
@@ -66,6 +70,9 @@ RUN playwright install --with-deps chromium
 COPY --chown=opendevin:app --chmod=770 ./opendevin ./opendevin
 COPY --chown=opendevin:app --chmod=777 ./opendevin/runtime/plugins ./opendevin/runtime/plugins
 COPY --chown=opendevin:app --chmod=770 ./agenthub ./agenthub
+COPY --chown=opendevin:app --chmod=770 ./pyproject.toml ./pyproject.toml
+COPY --chown=opendevin:app --chmod=770 ./poetry.lock ./poetry.lock
+COPY --chown=opendevin:app --chmod=770 ./README.md ./README.md

 RUN python opendevin/core/download.py # No-op to download assets
 RUN chown -R opendevin:app /app/logs && chmod -R 770 /app/logs # This gets created by the download.py script
--- a/containers/build.sh
+++ b/containers/build.sh
@@ -53,6 +53,11 @@ fi
 if [[ -n "$DOCKER_IMAGE_TAG" ]]; then
  tags+=("$DOCKER_IMAGE_TAG")
 fi
+# If $DOCKER_IMAGE_HASH_TAG is set, add it to the tags
+if [[ -n "$DOCKER_IMAGE_HASH_TAG" ]]; then
+  tags+=("$DOCKER_IMAGE_HASH_TAG")
+fi
+

 DOCKER_REPOSITORY="$DOCKER_REGISTRY/$DOCKER_ORG/$DOCKER_IMAGE"
 DOCKER_REPOSITORY=${DOCKER_REPOSITORY,,} # lowercase
--- a/containers/runtime/config.sh
+++ b/containers/runtime/config.sh
@@ -4,5 +4,3 @@ DOCKER_BASE_DIR="./containers/runtime"
 # These two variables will be appended by the runtime_build.py script
 # DOCKER_IMAGE=
 # DOCKER_IMAGE_TAG=
-DOCKER_IMAGE=od_runtime
-DOCKER_IMAGE_TAG=od_v0.8.1_image_ubuntu_tag_22.04
--- a/docs/docusaurus.config.ts
+++ b/docs/docusaurus.config.ts
@@ -4,7 +4,7 @@ import { themes as prismThemes } from "prism-react-renderer";

 const config: Config = {
  title: "OpenDevin",
-  tagline: "Code Less, Make More",
+  tagline: "An Open Platform for AI Software Developers as Generalist Agents",
  favicon: "img/logo.png",

  // Set the production url of your site here
@@ -32,6 +32,10 @@ const config: Config = {
    },
  },

+  markdown: {
+    mermaid: true,
+  },
+  themes: ['@docusaurus/theme-mermaid'],
  presets: [
    [
      "classic",
@@ -77,7 +81,6 @@ const config: Config = {
          position: "left",
          label: "Codebase",
        },
-        { to: "/faq", label: "FAQ", position: "left" },
        {
          href: "https://github.com/OpenDevin/OpenDevin",
          label: "GitHub",
--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/about.md
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/about.md
@@ -31,7 +31,7 @@ Pour plus de détails, veuillez consulter [ce document](https://github.com/OpenD

 Nous avons maintenant à la fois un espace de travail Slack pour la collaboration sur la construction d'OpenDevin et un serveur Discord pour discuter de tout ce qui est lié, par exemple, à ce projet, aux LLM, aux agents, etc.

- [Espace de travail Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA)
+- [Espace de travail Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw)
 - [Serveur Discord](https://discord.gg/ESHStjSjD4)

 Si vous souhaitez contribuer, n'hésitez pas à rejoindre notre communauté. Simplifions l'ingénierie logicielle ensemble !
--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/custom_sandbox_guide.md
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/custom_sandbox_guide.md
@@ -41,7 +41,6 @@ Créez un fichier ```config.toml``` dans le répertoire OpenDevin et entrez ces
 ```toml
 [core]
 workspace_base="./workspace"
-persist_sandbox=false
 run_as_devin=true
 sandbox_container_image="image_personnalisée"
 ```
@@ -92,7 +91,6 @@ Si vous voyez cette erreur dans la sortie de la console, il s'agit du fait que O
 ```toml
 [core]
 workspace_base="./workspace"
-persist_sandbox=false
 run_as_devin=true
 sandbox_container_image="image_personnalisée"
 sandbox_user_id="1001"
@@ -104,4 +102,4 @@ Si vous voyez un message d'erreur indiquant que le port est utilisé ou indispon

 ## Discuter

-Pour d'autres problèmes ou questions rejoignez le [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) ou le [Discord](https://discord.gg/ESHStjSjD4) et demandez!
+Pour d'autres problèmes ou questions rejoignez le [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw) ou le [Discord](https://discord.gg/ESHStjSjD4) et demandez!
--- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/intro.mdx
+++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/intro.mdx
@@ -42,7 +42,7 @@ Explorez le code source d'OpenDevin sur [GitHub](https://github.com/OpenDevin/Op
  />
 </a>
 <br></br>
-<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA">
+<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw">
  <img
    src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
    alt="Join our Slack community"
@@ -72,8 +72,6 @@ WORKSPACE_BASE=$(pwd)/workspace
 docker run -it \
    --pull=always \
    -e SANDBOX_USER_ID=$(id -u) \
-    -e PERSIST_SANDBOX="true" \
-    -e SSH_PASSWORD="make something up here" \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
    -v /var/run/docker.sock:/var/run/docker.sock \
--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/about.md
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/about.md
@@ -31,7 +31,7 @@ OpenDevin 是一个社区驱动的项目，我们欢迎每个人的贡献。无

 我们现在有一个 Slack 工作区，用于合作建设 OpenDevin，还设有一个 Discord 服务器，用于讨论与该项目、LLM、代理等相关的任何事情。

- [Slack 工作区](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA)
+- [Slack 工作区](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw)
 - [Discord 服务器](https://discord.gg/ESHStjSjD4)

 如果您愿意贡献，请随时加入我们的社区。让我们一起简化软件工程！
--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/custom_sandbox_guide.md
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/custom_sandbox_guide.md
@@ -40,7 +40,6 @@ docker build -t custom_image .
 ```
 [core]
 workspace_base="./workspace"
-persist_sandbox=false
 run_as_devin=true
 sandbox_container_image="custom_image"
 ```
@@ -92,7 +91,6 @@ dockerfile_content = (
 ```
 [core]
 workspace_base="./workspace"
-persist_sandbox=false
 run_as_devin=true
 sandbox_container_image="custom_image"
 sandbox_user_id="1001"
@@ -104,4 +102,4 @@ sandbox_user_id="1001"

 ## 讨论

-对于其他问题或疑问，请加入 [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) 或 [Discord](https://discord.gg/ESHStjSjD4)，并提问！
+对于其他问题或疑问，请加入 [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw) 或 [Discord](https://discord.gg/ESHStjSjD4)，并提问！
--- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/intro.mdx
+++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/intro.mdx
@@ -42,7 +42,7 @@ OpenDevin 是一个**自主 AI 软件工程师**，能够执行复杂的工程
  />
 </a>
 <br></br>
-<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA">
+<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw">
  <img
    src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
    alt="Join our Slack community"
@@ -72,8 +72,6 @@ WORKSPACE_BASE=$(pwd)/workspace
 docker run -it \
    --pull=always \
    -e SANDBOX_USER_ID=$(id -u) \
-    -e PERSIST_SANDBOX="true" \
-    -e SSH_PASSWORD="make something up here" \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
    -v /var/run/docker.sock:/var/run/docker.sock \
--- a/docs/modules/usage/about.md
+++ b/docs/modules/usage/about.md
@@ -31,7 +31,7 @@ For details, please check [this document](https://github.com/OpenDevin/OpenDevin

 We have both Slack workspace for the collaboration on building OpenDevin and Discord server for discussion about anything related, e.g., this project, LLM, agent, etc.

- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA)
+- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw)
 - [Discord server](https://discord.gg/ESHStjSjD4)

 If you would love to contribute, feel free to join our community. Let's simplify software engineering together!
--- a/docs/modules/usage/custom_sandbox_guide.md
+++ b/docs/modules/usage/custom_sandbox_guide.md
@@ -70,7 +70,6 @@ Create a `config.toml` file in the OpenDevin directory and enter these contents:
 ```toml
 [core]
 workspace_base="./workspace"
-persist_sandbox=false
 run_as_devin=true
 sandbox_container_image="custom_image"
 ```
@@ -129,7 +128,6 @@ If you see this error in the console output it is because OpenDevin is trying to
 ```toml
 [core]
 workspace_base="./workspace"
-persist_sandbox=false
 run_as_devin=true
 sandbox_container_image="custom_image"
 sandbox_user_id="1001"
@@ -141,4 +139,4 @@ If you see an error about a port being in use or unavailable, try deleting all r

 ## Discuss

-For other issues or questions join the [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) or [Discord](https://discord.gg/ESHStjSjD4) and ask!
+For other issues or questions join the [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw) or [Discord](https://discord.gg/ESHStjSjD4) and ask!
--- a/docs/modules/usage/evaluation_harness.md
+++ b/docs/modules/usage/evaluation_harness.md
@@ -0,0 +1,257 @@
+---
+sidebar_position: 6
+---
+
+# 📈 How to contribute to OpenDevin Evaluation Harness
+
+This guide provides an overview of how to integrate your own evaluation benchmark into the OpenDevin framework.
+
+## Before everything begins: Setup Environment and LLM Configuration
+
+Please follow the instructions [here](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up your local development environment and LLM.
+
+OpenDevin in development mode uses `config.toml` to keep track of most configurations.
+
+Here's an example configuration file you can use to define and use multiple LLMs:
+
+```toml
+[llm]
+# IMPORTANT: add your API key here, and set the model to the one you want to evaluate
+model = "gpt-4o-2024-05-13"
+api_key = "sk-XXX"
+
+[llm.eval_gpt4_1106_preview_llm]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[llm.eval_some_openai_compatible_model_llm]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```
+
+
+## How to use OpenDevin in the command line
+
+OpenDevin can be run from the command line using the following format:
+
+```bash
+poetry run python ./opendevin/core/main.py \
+        -i <max_iterations> \
+        -t "<task_description>" \
+        -c <agent_class> \
+        -l <llm_config>
+```
+
+For example:
+
+```bash
+poetry run python ./opendevin/core/main.py \
+        -i 10 \
+        -t "Write me a bash script that prints hello world." \
+        -c CodeActAgent \
+        -l llm
+```
+
+This command runs OpenDevin with:
+- A maximum of 10 iterations
+- The specified task description
+- Using the CodeActAgent
+- With the LLM configuration defined in the `llm` section of your `config.toml` file
+
+## How does OpenDevin work
+
+The main entry point for OpenDevin is in `opendevin/core/main.py`. Here's a simplified flow of how it works:
+
+1. Parse command-line arguments and load the configuration.
+2. Create a runtime environment using `create_runtime()`.
+3. Initialize the specified agent.
+4. Run the controller using `run_controller()`, which:
+   - Attaches the runtime to the agent
+   - Executes the agent's task
+   - Returns a final state when complete
+
+The `run_controller()` function is the core of OpenDevin's execution. It manages the interaction between the agent, the runtime, and the task, handling things like user input simulation and event processing.
+
+
+## Easiest way to get started: Exploring Existing Benchmarks
+
+We encourage you to review the various evaluation benchmarks available in the [`evaluation/` directory](https://github.com/OpenDevin/OpenDevin/blob/main/evaluation) of our repository.
+
+To integrate your own benchmark, we suggest starting with the one that most closely resembles your needs. This approach can significantly streamline your integration process, allowing you to build upon existing structures and adapt them to your specific requirements.
+
+## How to create an evaluation workflow
+
+To create an evaluation workflow for your benchmark, follow these steps:
+
+1. Create a configuration:
+   ```python
+   def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
+       config = AppConfig(
+           default_agent=metadata.agent_class,
+           runtime='eventstream',
+           max_iterations=metadata.max_iterations,
+           sandbox=SandboxConfig(
+               container_image='your_container_image',
+               enable_auto_lint=True,
+               timeout=300,
+           ),
+       )
+       config.set_llm_config(metadata.llm_config)
+       return config
+   ```
+
+2. Initialize the runtime and set up the evaluation environment:
+   ```python
+   async def initialize_runtime(runtime: Runtime, instance: pd.Series):
+       # Set up your evaluation environment here
+       # For example, setting environment variables, preparing files, etc.
+       pass
+   ```
+
+3. Create a function to process each instance:
+   ```python
+   async def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
+       config = get_config(instance, metadata)
+       runtime = await create_runtime(config, sid=instance.instance_id)
+       await initialize_runtime(runtime, instance)
+
+       instruction = get_instruction(instance, metadata)
+
+       state = await run_controller(
+           config=config,
+           task_str=instruction,
+           runtime=runtime,
+           fake_user_response_fn=your_user_response_function,
+       )
+
+       # Evaluate the agent's actions
+       evaluation_result = await evaluate_agent_actions(runtime, instance)
+
+       return EvalOutput(
+           instance_id=instance.instance_id,
+           instruction=instruction,
+           test_result=evaluation_result,
+           metadata=metadata,
+           history=state.history.compatibility_for_eval_history_pairs(),
+           metrics=state.metrics.get() if state.metrics else None,
+           error=state.last_error if state and state.last_error else None,
+       )
+   ```
+
+4. Run the evaluation:
+   ```python
+   metadata = make_metadata(llm_config, dataset_name, agent_class, max_iterations, eval_note, eval_output_dir)
+   output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+   instances = prepare_dataset(your_dataset, output_file, eval_n_limit)
+
+   await run_evaluation(
+       instances,
+       metadata,
+       output_file,
+       num_workers,
+       process_instance
+   )
+   ```
+
+This workflow sets up the configuration, initializes the runtime environment, processes each instance by running the agent and evaluating its actions, and then collects the results into an `EvalOutput` object. The `run_evaluation` function handles parallelization and progress tracking.
+
+Remember to customize the `get_instruction`, `your_user_response_function`, and `evaluate_agent_actions` functions according to your specific benchmark requirements.
+
+By following this structure, you can create a robust evaluation workflow for your benchmark within the OpenDevin framework.
+
+The next section explains the `user_response_fn` and describes the workflow and interaction between the agent, the runtime, and the simulated user.
+
+
+## Understanding the `user_response_fn`
+
+The `user_response_fn` is a crucial component in OpenDevin's evaluation workflow. It simulates user interaction with the agent, allowing for automated responses during the evaluation process. This function is particularly useful when you want to provide consistent, predefined responses to the agent's queries or actions.
+
+
+### Workflow and Interaction
+
+The correct workflow for handling actions and the `user_response_fn` is as follows:
+
+1. Agent receives a task and starts processing
+2. Agent emits an Action
+3. If the Action is executable (e.g., CmdRunAction, IPythonRunCellAction):
+   - The Runtime processes the Action
+   - Runtime returns an Observation
+4. If the Action is not executable (typically a MessageAction):
+   - The `user_response_fn` is called
+   - It returns a simulated user response
+5. The agent receives either the Observation or the simulated response
+6. Steps 2-5 repeat until the task is completed or max iterations are reached
+
+Here's a visual representation of this workflow:
+
+```
+                 [Agent]
+                    |
+                    v
+               [Emit Action]
+                    |
+                    v
+            [Is Action Executable?]
+           /                       \
+         Yes                        No
+          |                          |
+          v                          v
+     [Runtime]               [user_response_fn]
+          |                          |
+          v                          v
+  [Return Observation]    [Simulated Response]
+           \                        /
+            \                      /
+             v                    v
+           [Agent receives feedback]
+                    |
+                    v
+         [Continue or Complete Task]
+```
+
+In this workflow:
+
+- Executable actions (like running commands or executing code) are handled directly by the Runtime.
+- Non-executable actions (typically when the agent wants to communicate or ask for clarification) are handled by the `user_response_fn`.
+- The agent then processes the feedback, whether it's an Observation from the Runtime or a simulated response from the `user_response_fn`.
+
+This approach allows for automated handling of both concrete actions and simulated user interactions, making it suitable for evaluation scenarios where you want to test the agent's ability to complete tasks with minimal human intervention.
+
+### Example Implementation
+
+Here's an example of a `user_response_fn` used in the SWE-Bench evaluation:
+
+```python
+def codeact_user_response(state: State | None) -> str:
+    msg = (
+        'Please continue working on the task on whatever approach you think is suitable.\n'
+        'If you think you have solved the task, please first send your answer to user through message and then <execute_bash> exit </execute_bash>.\n'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
+    )
+
+    if state and state.history:
+        # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
+        user_msgs = [
+            event
+            for event in state.history.get_events()
+            if isinstance(event, MessageAction) and event.source == 'user'
+        ]
+        if len(user_msgs) >= 2:
+            # let the agent know that it can give up when it has tried 3 times
+            return (
+                msg
+                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
+            )
+    return msg
+```
+
+This function does the following:
+
+1. Provides a standard message encouraging the agent to continue working.
+2. Checks how many times the agent has attempted to communicate with the user.
+3. If the agent has made multiple attempts, it provides an option to give up.
+
+By using this function, you can ensure consistent behavior across multiple evaluation runs and prevent the agent from getting stuck waiting for human input.
--- a/docs/modules/usage/intro.mdx
+++ b/docs/modules/usage/intro.mdx
@@ -42,7 +42,7 @@ Explore the codebase of OpenDevin on [GitHub](https://github.com/OpenDevin/OpenD
  />
 </a>
 <br></br>
-<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA">
+<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw">
  <img
    src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
    alt="Join our Slack community"
--- a/docs/modules/usage/openshift-example.md
+++ b/docs/modules/usage/openshift-example.md
@@ -158,8 +158,6 @@ spec:
    env:
    - name: SANDBOX_USER_ID
      value: "1000"
-    - name: SANDBOX_BOX_TYPE
-      value: 'local'
    - name: WORKSPACE_MOUNT_PATH
      value: "/opt/workspace_base"
    volumeMounts:
@@ -290,13 +288,13 @@ RUN mkdir -p /opt/workspace_base && chown -R 1000:1000 /opt/workspace_base
 # Verify Git installation
 RUN git --version
 ```
-   
+
 2. Mount a shared development directory "i.e. one hosted in EC2 instance" to the POD:
   This can be also done by sharing the developement directory to the worker node through a sharing software (NFS), then creating a pv and pvc as described above to access that directory.

-3. Not all Agents working! Just tested CoderAgent with an openai API key and produced results. 
-   
+3. Not all Agents working! Just tested CoderAgent with an openai API key and produced results.
+

 ## Discuss

-For other issues or questions join the [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA) or [Discord](https://discord.gg/ESHStjSjD4) and ask!
+For other issues or questions join the [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw) or [Discord](https://discord.gg/ESHStjSjD4) and ask!
--- a/docs/modules/usage/runtime.md
+++ b/docs/modules/usage/runtime.md
@@ -0,0 +1,181 @@
+---
+sidebar_position: 4
+---
+
+# 📦 EventStream Runtime
+
+The OpenDevin EventStream Runtime is the core component that enables secure and flexible execution of an AI agent's actions.
+It creates a sandboxed environment using Docker, where arbitrary code can be run safely without risking the host system.
+
+
+## Why do we need a sandboxed runtime?
+
+OpenDevin needs to execute arbitrary code in a secure, isolated environment for several reasons:
+
+1. Security: Executing untrusted code can pose significant risks to the host system. A sandboxed environment prevents malicious code from accessing or modifying the host system's resources.
+
+2. Consistency: A sandboxed environment ensures that code execution is consistent across different machines and setups, eliminating "it works on my machine" issues.
+
+3. Resource Control: Sandboxing allows for better control over resource allocation and usage, preventing runaway processes from affecting the host system.
+
+4. Isolation: Different projects or users can work in isolated environments without interfering with each other or the host system.
+
+5. Reproducibility: Sandboxed environments make it easier to reproduce bugs and issues, as the execution environment is consistent and controllable.
+
+## How does our Runtime work?
+
+The OpenDevin Runtime system uses a client-server architecture implemented with Docker containers. Here's an overview of how it works:
+
+```mermaid
+graph TD
+    A[User-provided Custom Docker Image] --> B[OpenDevin Backend]
+    B -->|Builds| C[OD Runtime Image]
+    C -->|Launches| D[Runtime Client]
+    D -->|Initializes| E[Browser]
+    D -->|Initializes| F[Bash Shell]
+    D -->|Initializes| G[Plugins]
+    G -->|Initializes| L[Jupyter Server]
+
+    B -->|Spawn| H[Agent]
+    B -->|Spawn| I[EventStream]
+    I <--->|Execute Action to
+    Get Observation
+    via REST API
+    | D
+
+    H -->|Generate Action| I
+    I -->|Obtain Observation| H
+
+    subgraph "Docker Container"
+    D
+    E
+    F
+    G
+    L
+    end
+```
+
+1. User Input: The user provides a custom base Docker image.
+
+2. Image Building: OpenDevin builds a new Docker image (the "OD runtime image") based on the user-provided image. This new image includes OpenDevin-specific code, primarily the "runtime client."
+
+3. Container Launch: When OpenDevin starts, it launches a Docker container using the OD runtime image.
+
+4. Client Initialization: The runtime client initializes inside the container, setting up necessary components like a bash shell and loading any specified plugins.
+
+5. Communication: The OpenDevin backend (`runtime.py`) communicates with the runtime client over a RESTful API, sending actions and receiving observations.
+
+6. Action Execution: The runtime client receives actions from the backend, executes them in the sandboxed environment, and sends back observations.
+
+7. Observation Return: The client sends execution results back to the OpenDevin backend as observations.
+
+
+The role of the client is crucial:
+- It acts as an intermediary between the OpenDevin backend and the sandboxed environment.
+- It executes various types of actions (shell commands, file operations, Python code, etc.) safely within the container.
+- It manages the state of the sandboxed environment, including the current working directory and loaded plugins.
+- It formats and returns observations to the backend, ensuring a consistent interface for processing results.
+
+
+## Advanced: How OpenDevin builds and maintains OD Runtime images
+
+OpenDevin uses a sophisticated approach to build and manage runtime images. This process ensures efficiency, consistency, and flexibility in creating and maintaining Docker images for both production and development environments.
+
+Check out the [relevant code](https://github.com/OpenDevin/OpenDevin/blob/main/opendevin/runtime/utils/runtime_build.py) if you are interested in more details.
+
+### Image Tagging System
+
+OpenDevin uses a dual-tagging system for its runtime images to balance reproducibility with flexibility:
+
+1. Hash-based tag: `{target_image_repo}:{target_image_hash_tag}`
+   Example: `od_runtime:abc123def456`
+
+   - This tag is based on the MD5 hash of the Docker build folder, which includes the source code (of runtime client and related dependencies) and Dockerfile.
+   - Identical hash tags guarantee that the images were built with exactly the same source code and Dockerfile.
+   - This ensures reproducibility: the same hash always means the same image contents.
+
+2. Generic tag: `{target_image_repo}:{target_image_tag}`
+   Example: `od_runtime:od_v0.8.3_ubuntu_tag_22.04`
+
+   - This tag follows the format: `od_runtime:od_v{OD_VERSION}_{BASE_IMAGE_NAME}_tag_{BASE_IMAGE_TAG}`
+   - It represents the latest build for a particular base image and OpenDevin version combination.
+   - This tag is updated whenever a new image is built from the same base image, even if the source code changes.
+
+The hash-based tag ensures exact reproducibility, while the generic tag provides a stable reference to the latest version of a particular configuration. This dual-tagging approach allows OpenDevin to efficiently manage both development and production environments.
+
+### Build Process
+
+1. Image Naming Convention:
+   - Hash-based tag: `{target_image_repo}:{target_image_hash_tag}`
+     Example: `od_runtime:abc123def456`
+   - Generic tag: `{target_image_repo}:{target_image_tag}`
+     Example: `od_runtime:od_v0.8.3_ubuntu_tag_22.04`
+
+2. Build Process:
+   - a. Convert the base image name to an OD runtime image name.
+      Example: `ubuntu:22.04` -> `od_runtime:od_v0.8.3_ubuntu_tag_22.04`
+   - b. Generate a build context (Dockerfile and OpenDevin source code) and calculate its hash.
+   - c. Check for an existing image with the calculated hash.
+   - d. If not found, check for a recent compatible image to use as a base.
+   - e. If no compatible image exists, build from scratch using the original base image.
+   - f. Tag the new image with both hash-based and generic tags.
+
+3. Image Reuse and Rebuilding Logic:
+   The system follows these steps to determine whether to build a new image or use an existing one from a user-provided (base) image (e.g., `ubuntu:22.04`):
+
+   a. If an image exists with the same hash (e.g., `od_runtime:abc123def456`), it will be reused as is.
+
+   b. If the exact hash is not found, the system will try to rebuild using the latest generic image (e.g., `od_runtime:od_v0.8.3_ubuntu_tag_22.04`) as a base. This saves time by leveraging existing dependencies.
+
+   c. If neither the hash-tagged nor the generic-tagged image is found, the system will build the image completely from scratch.
+
+4. Caching and Efficiency:
+   - The system attempts to reuse existing images when possible to save build time.
+   - If an exact match (by hash) is found, it's used without rebuilding.
+   - If a compatible image is found, it's used as a base for rebuilding, saving time on dependency installation.
+
+Here's a flowchart illustrating the build process:
+
+```mermaid
+flowchart TD
+    A[Start] --> B{Convert base image name}
+    B --> |ubuntu:22.04 -> od_runtime:od_v0.8.3_ubuntu_tag_22.04| C[Generate build context and hash]
+    C --> D{Check for existing image with hash}
+    D -->|Found od_runtime:abc123def456| E[Use existing image]
+    D -->|Not found| F{Check for od_runtime:od_v0.8.3_ubuntu_tag_22.04}
+    F -->|Found| G[Rebuild based on recent image]
+    F -->|Not found| H[Build from scratch]
+    G --> I[Tag with hash and generic tags]
+    H --> I
+    E --> J[End]
+    I --> J
+```
+
+This approach ensures that:
+
+1. Identical source code and Dockerfile always produce the same image (via hash-based tags).
+2. The system can quickly rebuild images when minor changes occur (by leveraging recent compatible images).
+3. The generic tag (e.g., `od_runtime:od_v0.8.3_ubuntu_tag_22.04`) always points to the latest build for a particular base image and OpenDevin version combination.
+
+By using this method, OpenDevin maintains an efficient and flexible system for building and managing runtime images, adapting to both development needs and production requirements.
+
+
+## Advanced: Runtime Plugin System
+
+The OpenDevin Runtime supports a plugin system that allows for extending functionality and customizing the runtime environment. Plugins are initialized when the runtime client starts up.
+
+Check [an example of Jupyter plugin here](https://github.com/OpenDevin/OpenDevin/blob/9c44d94cef32e6426ebd8deeeb52963153b2348a/opendevin/runtime/plugins/jupyter/__init__.py#L30-L63) if you want to implement your own plugin.
+
+*More details about the Plugin system are still under construction - contributions are welcomed!*
+
+Key aspects of the plugin system:
+
+1. Plugin Definition: Plugins are defined as Python classes that inherit from a base `Plugin` class.
+
+2. Plugin Registration: Available plugins are registered in an `ALL_PLUGINS` dictionary.
+
+3. Plugin Specification: Plugins are associated with `Agent.sandbox_plugins: list[PluginRequirement]`. Users can specify which plugins to load when initializing the runtime.
+
+4. Initialization: Plugins are initialized asynchronously when the runtime client starts.
+
+5. Usage: The runtime client can use initialized plugins to extend its capabilities (e.g., the JupyterPlugin for running IPython cells).
--- a/docs/package-lock.json
+++ b/docs/package-lock.json
--- a/docs/package.json
+++ b/docs/package.json
@@ -18,6 +18,7 @@
    "@docusaurus/core": "^3.4.0",
    "@docusaurus/plugin-content-pages": "^3.4.0",
    "@docusaurus/preset-classic": "^3.4.0",
+    "@docusaurus/theme-mermaid": "^3.4.0",
    "@mdx-js/react": "^3.0.0",
    "clsx": "^2.0.0",
    "prism-react-renderer": "^2.3.0",
--- a/docs/src/components/CustomFooter.tsx
+++ b/docs/src/components/CustomFooter.tsx
@@ -17,11 +17,9 @@ function CustomFooter() {
            </a>
          </div>
        </div>
-        <div className="footer-community">
-          <Translate id="footer.community">Community</Translate>
-        </div>
+
        <div className="footer-icons">
-          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA" target="_blank" rel="noopener noreferrer">
+          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2ngejmfw6-9gW4APWOC9XUp1n~SiQ6iw" target="_blank" rel="noopener noreferrer">
            <FaSlack />
          </a>
          <a href="https://discord.gg/ESHStjSjD4" target="_blank" rel="noopener noreferrer">
--- a/docs/src/components/Demo/Demo.tsx
+++ b/docs/src/components/Demo/Demo.tsx
@@ -6,7 +6,7 @@ export function Demo() {

  return (
    <div
-      style={{ paddingBottom: "30px", paddingTop: "20px", textAlign: "center" }}
+      style={{ paddingBottom: "10px", paddingTop: "10px", textAlign: "center" }}
    >
      <video
        playsInline
--- a/docs/src/components/HomepageHeader/HomepageHeader.tsx
+++ b/docs/src/components/HomepageHeader/HomepageHeader.tsx
@@ -14,15 +14,28 @@ export function HomepageHeader() {
        <Heading as="h1" className="header-title">
          {siteConfig.title}
        </Heading>
+
        <p className="header-subtitle">{siteConfig.tagline}</p>
-        <div className="header-buttons">
-          <Link
-            className="button button--secondary button--lg"
-            to="/modules/usage/intro"
-          >
-            <Translate id="homepage.getStarted">Get Started</Translate>
-          </Link>
+
+        <div className="header-links">
+          <a href="https://github.com/OpenDevin/OpenDevin">
+            <img src="https://img.shields.io/badge/Code-Github-purple?logo=github&logoColor=white&style=for-the-badge" alt="Code" />
+          </a>
+          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2i1iqdag6-bVmvamiPA9EZUu7oCO6KhA">
+            <img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" />
+          </a>
+          <a href="https://discord.gg/ESHStjSjD4">
+            <img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community" />
+          </a>
+
+          <a href="https://arxiv.org/abs/2407.16741">
+            <img src="https://img.shields.io/badge/Paper-%20on%20Arxiv-red?logo=arxiv&style=for-the-badge" alt="Paper on Arxiv" />
+          </a>
+          <a href="https://huggingface.co/spaces/OpenDevin/evaluation">
+            <img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark" />
+          </a>
        </div>
+
        <Demo />
      </div>
    </div>
--- a/docs/src/components/Welcome/Welcome.tsx
+++ b/docs/src/components/Welcome/Welcome.tsx
@@ -1,20 +0,0 @@
-import "../../css/welcome.css";
-import Translate from '@docusaurus/Translate';
-
-export function Welcome() {
-  return (
-    <div className="text-white">
-      <div className="welcome-container">
-        <img src="img/logo.png" className="welcome-logo" />
-        <p className="welcome-text">
-          <Translate id="welcome.message">
-          Welcome to OpenDevin, an open-source autonomous AI software engineer
-          that is capable of executing
-          complex engineering tasks and collaborating actively with users on
-          software development projects.
-          </Translate>
-        </p>
-      </div>
-    </div>
-  );
-}
--- a/docs/src/css/faq.css
+++ b/docs/src/css/faq.css
@@ -1,66 +0,0 @@
-/* faq.css */
-
-.faq-container {
-    margin: auto;
-    padding: 24px;
-    display: flex;
-    flex-direction: column;
-    gap: 8px;
-    margin-bottom: 24px;
-  }
-  
-  .faq-title {
-    display: flex;
-    align-items: center;
-    justify-content: center;
-    font-size: 2rem;
-    padding: 8px;
-    text-transform: uppercase;
-    font-weight: bold;
-  }
-  
-  @media (min-width: 1024px) {
-    .faq-title {
-      font-size: 6rem;
-    }
-  }
-  
-  .faq-section {
-    display: flex;
-    flex-direction: column;
-    gap: 8px;
-    width: 100%;
-    margin-bottom: 24px;
-  }
-  
-  .faq-section-title {
-    text-transform: uppercase;
-    font-weight: bold;
-    font-size: 2rem;
-    letter-spacing: 0.1em;
-  }
-  
-  .highlight {
-    font-weight: 600;
-    color: var(--logo);
-  }
-  
-  .faq-steps ol {
-    padding-left: 24px;
-  }
-  
-  .command-box {
-    display: flex;
-    flex-direction: column;
-    padding: 8px;
-    background-color: #e0e0e0;
-    border-radius: 0.375rem;
-    height: 6vh;
-    text-transform: uppercase;
-    color: #4a5568;
-  }
-  
-  .command-box + .command-box {
-    height: 8vh;
-  }
-  
--- a/docs/src/css/footer.css
+++ b/docs/src/css/footer.css
@@ -3,12 +3,12 @@
 .custom-footer {
    background-color: dark;
    color: white;
-    height: 25vh;
+    height: 200px;
    /* background: linear-gradient(to bottom, #1a1a1a, #1a1a1a); */
    background: linear-gradient(to bottom, #1f2937, #000000);

  }
-  
+
  .footer-content {
    display: flex;
    flex-direction: column;
@@ -17,56 +17,55 @@
    padding: 8px;
    height: 100%;
  }
-  
+
  .footer-top {
    display: flex;
    gap: 8px;
    align-items: center;
  }
-  
+
  .footer-title {
    font-weight: bold;
    font-size: 1.125rem;
  }
-  
+
  @media (min-width: 768px) {
    .footer-title {
      font-size: 1.875rem;
    }
  }
-  
+
  .footer-link a {
    font-size: 0.875rem;
    text-decoration: none;
    color: gray;
    transition: color 0.3s ease;
  }
-  
+
  .footer-link a:hover {
    color: white;
  }
-  
+
  .footer-community {
    text-transform: uppercase;
    font-weight: 300;
  }
-  
+
  .footer-icons {
    display: flex;
    gap: 24px;
    font-size: 1.875rem;
  }
-  
+
  .footer-icons a {
    color:gray;
    transition: color 0.3s ease;
  }
-  
+
  .footer-icons a:hover {
    color: white;
  }
-  
+
  .footer-bottom {
    text-transform: uppercase;
  }
-  
--- a/docs/src/css/homepageHeader.css
+++ b/docs/src/css/homepageHeader.css
@@ -1,36 +1,47 @@
 /* homepageHeader.css */

 .homepage-header {
-    height: 100vh;
-    color: white;
-    background: linear-gradient(to top, #64748b, #000000);
-  }
-  
-  .header-content {
-    display: flex;
-    flex-direction: column;
-    gap: 8px;
-    align-items: center;
-    padding: 24px;
-    font-weight: 300;
-    width: 100%;
-  }
-  
+  height: 800px;
+  color: white;
+  background: linear-gradient(to top, #64748b, #000000);
+}
+
+.header-content {
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  padding: 2rem;
+  font-weight: 300;
+  width: 100%;
+}
+
+.header-title {
+  font-size: 3rem;
+}
+
+@media (min-width: 768px) {
  .header-title {
-    font-size: 3rem;
+    font-size: 4rem;
  }
-  
-  @media (min-width: 768px) {
-    .header-title {
-      font-size: 5rem;
-    }
-  }
-  
-  .header-subtitle {
-    font-size: 1.25rem;
-  }
-  
-  .header-buttons {
-    margin-top: 24px;
-  }
-  
+}
+
+.header-subtitle {
+  font-size: 1.5rem;
+}
+
+.header-links {
+  display: flex;
+  flex-wrap: wrap;
+  justify-content: center;
+  gap: 10px;
+  max-width: 680px;
+}
+
+.header-links a {
+  display: inline-block;
+  transition: transform 0.2s ease-in-out;
+}
+
+.header-links a:hover {
+  transform: translateY(-2px);
+}
--- a/docs/src/css/welcome.css
+++ b/docs/src/css/welcome.css
@@ -1,53 +0,0 @@
-/* welcome.css */
-
-.text-white {
-    color: white;
-  }
-
-  .welcome-container {
-    display: flex;
-    justify-content: center;
-    align-items: center;
-    flex-direction: column;
-    background: linear-gradient(to bottom, #64748b, #1f2937);
-  }
-
-  @media (min-width: 768px) {
-    .welcome-container {
-      flex-direction: row;
-      background: linear-gradient(to bottom, #64748b, #1f2937);
-    }
-  }
-
-  .welcome-logo {
-    height: 45vh;
-    width: 45vw;
-  }
-
-  @media (max-width: 640px) {
-    .welcome-logo {
-      height: 40vw;
-      width: 40vw;
-    }
-  }
-
-  @media (min-width: 768px) {
-    .welcome-logo {
-      height: auto;
-      width: 350px;
-    }
-  }
-
-  .welcome-text {
-    padding: 24px;
-    margin-bottom: 24px;
-    font-weight: 300;
-    font-size: 1.125rem;
-  }
-
-  @media (min-width: 768px) {
-    .welcome-text {
-      padding: 8px;
-      font-size: 1.5rem;
-    }
-  }
--- a/docs/src/pages/faq.tsx
+++ b/docs/src/pages/faq.tsx
@@ -1,129 +0,0 @@
-import Layout from '@theme/Layout';
-import '../css/faq.css';
-import Translate, { translate } from '@docusaurus/Translate';
-
-export default function FAQ() {
-  const githubLink = (
-    <a href="https://github.com/OpenDevin/OpenDevin/issues" target="_blank">GitHub</a>
-  );
-  const discordLink = (
-    <a href="https://discord.gg/mBuDGRzzES" target="_blank">Discord</a>
-  );
-  const slackLink = (
-    <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2jsrl32uf-fTeeFjNyNYxqSZt5NPY3fA" target="_blank">Slack</a>
-  );
-
-  return (
-    <Layout
-      title={translate({ id: 'faq.title', message: 'FAQ' })}
-      description={translate({ id: 'faq.description', message: 'Frequently Asked Questions' })}
-    >
-      <div id="faq" className="faq-container">
-        <div className="faq-title">
-          <Translate id="faq.title" description="FAQ Title">Frequently Asked Questions</Translate>
-        </div>
-        <div className="faq-section">
-          <div className="faq-section-title">
-            <Translate id="faq.section.title.1" description="First Section Title">What is OpenDevin?</Translate>
-          </div>
-          <p>
-            <span className="highlight"><Translate id="faq.section.highlight" description="Highlight Text">OpenDevin</Translate></span>{" "}
-            <Translate id="faq.section.description.1" description="Description for OpenDevin">
-              is an autonomous software engineer that can solve software engineering
-              and web-browsing tasks end-to-end. It can perform data science queries, such
-              as "Find the number of pull requests to the OpenDevin repository in the last
-              month," and software engineering tasks, such as "Please add tests to this
-              file and verify that all the tests pass. If they don't fix the file."
-            </Translate>
-          </p>
-          <p>
-            <Translate id="faq.section.description.2" description="Further Description for OpenDevin">
-              At the same time, OpenDevin is a platform and community for agent developers
-              to test out and evaluate new agents.
-            </Translate>
-          </p>
-        </div>
-        <div className="faq-section">
-          <div className="faq-section-title">
-            <Translate id="faq.section.title.2" description="Support Section Title">Support</Translate>
-          </div>
-          <div>
-            <Translate
-              id="faq.section.support.answer"
-              description="Support Answer"
-              values={{
-                githubLink: githubLink,
-                discordLink: discordLink,
-                slackLink: slackLink,
-              }}
-            >
-              {`Please file a bug on {githubLink} if you notice a problem that likely affects others. If you're having trouble installing, or have general questions, reach out on {discordLink} or {slackLink}.`}
-            </Translate>
-          </div>
-        </div>
-        <div className="faq-section">
-          <div className="faq-section-title">
-            <Translate id="faq.section.title.3" description="GitHub Issue Section Title">How to fix a GitHub issue with OpenDevin?</Translate>
-          </div>
-          <div className="faq-steps">
-            <Translate id="faq.section.github.steps.intro" description="GitHub Steps Introduction">
-              To fix an issue on GitHub using OpenDevin, send a prompt to OpenDevin asking it to follow
-              steps like the following:
-            </Translate>
-            <ol>
-              <li><Translate id="faq.section.github.step1" description="GitHub Step 1">Read the issue https://github.com/OpenDevin/OpenDevin/issues/1611</Translate></li>
-              <li><Translate id="faq.section.github.step2" description="GitHub Step 2">Clone the repository and check out a new branch</Translate></li>
-              <li><Translate id="faq.section.github.step3" description="GitHub Step 3">Based on the instructions in the issue description, modify files to fix the issue</Translate></li>
-              <li><Translate id="faq.section.github.step4" description="GitHub Step 4">Push the resulting output to GitHub using the GITHUB_TOKEN environment variable</Translate></li>
-              <li><Translate id="faq.section.github.step5" description="GitHub Step 5">Tell me the link that I need to go to to send a pull request</Translate></li>
-            </ol>
-            <Translate id="faq.section.github.steps.preRun" description="GitHub Steps Pre-Run">
-              Before you run OpenDevin, you can do:
-            </Translate>
-            <div className="command-box">
-              export SANDBOX_ENV_GITHUB_TOKEN=XXX
-            </div>
-            <Translate id="faq.section.github.steps.tokenInfo" description="GitHub Steps Token Info">
-              where XXX is a GitHub token that you created that has permissions to push to the OpenDevin repo. If you don’t have write permission to the OpenDevin repo, you might need to change that to:
-            </Translate>
-            <div className="command-box">
-              Push the resulting output to my fork at https://github.com/USERNAME/OpenDevin/ using the GITHUB_TOKEN environment variable
-            </div>
-            <Translate id="faq.section.github.steps.usernameInfo" description="GitHub Steps Username Info">
-              where USERNAME is your GitHub username.
-            </Translate>
-          </div>
-        </div>
-        <div className="faq-section">
-          <div className="faq-section-title">
-            <Translate id="faq.section.title.4" description="Devin Section Title">How is OpenDevin different from Devin?</Translate>
-          </div>
-          <p>
-            <a href="https://www.cognition.ai/blog/introducing-devin"><Translate id="faq.section.devin.linkText" description="Devin Link Text">Devin</Translate></a>&nbsp;
-            <Translate id="faq.section.devin.description" description="Devin Description">
-              is a commercial product by Cognition Inc., that served as the initial
-              inspiration for OpenDevin. They both aim to do a good job at solving software
-              engineering tasks, but OpenDevin you can download, use, and modify, while Devin
-              you can only use through the Cognition site. In addition, OpenDevin has evolved
-              beyond the initial inspiration, and now serves as a community-driven ecosystem for
-              agent development in general, and we'd love to have you join and
-            </Translate>
-            <a href="https://github.com/OpenDevin/OpenDevin/blob/main/CONTRIBUTING.md"><Translate id="faq.section.devin.contribute" description="Contribute Link">contribute</Translate></a>!
-          </p>
-        </div>
-        <div className="faq-section">
-          <div className="faq-section-title">
-            <Translate id="faq.section.title.5" description="ChatGPT Section Title">How is OpenDevin different from ChatGPT?</Translate>
-          </div>
-          <p>
-            <Translate id="faq.section.chatgpt.description" description="ChatGPT Description">
-              ChatGPT you can access online, it does not interface with local files, and
-              its ability to execute code is limited. So it can write code, but it is not
-              easy to test or execute it.
-            </Translate>
-          </p>
-        </div>
-      </div>
-    </Layout>
-  );
-}
--- a/docs/src/pages/index.tsx
+++ b/docs/src/pages/index.tsx
@@ -4,12 +4,11 @@ import { HomepageHeader } from "../components/HomepageHeader/HomepageHeader";
 import { Welcome } from "../components/Welcome/Welcome";
 import { translate } from '@docusaurus/Translate';

-export function Header({ title, summary, description }): JSX.Element {
+export function Header({ title, summary }): JSX.Element {
  return (
    <div>
      <h1>{title}</h1>
-      <h2 style={{ fontSize: "40px" }}>{summary}</h2>
-      <h3 className="headerDescription">{description}</h3>
+      <h2 style={{ fontSize: "3rem" }}>{summary}</h2>
    </div>
  );
 }
@@ -17,22 +16,15 @@ export function Header({ title, summary, description }): JSX.Element {
 export default function Home(): JSX.Element {
  const { siteConfig } = useDocusaurusContext();
  return (
-    <>
    <Layout
      title={`${siteConfig.title}`}
      description={translate({
        id: 'homepage.description',
-        message: 'AI-powered code generation for software engineering.',
+        message: 'An Open Platform for AI Software Developers as Generalist Agents',
        description: 'The homepage description',
      })}
    >
-      <div>
-        <HomepageHeader />
-        <div>
-          <Welcome />
-        </div>
-      </div>
+    <HomepageHeader />
    </Layout>
-    </>
  );
 }
--- a/docs/static/img/teaser.mp4
+++ b/docs/static/img/teaser.mp4
--- a/evaluation/EDA/README.md
+++ b/evaluation/EDA/README.md
@@ -2,9 +2,10 @@

 This folder contains evaluation harness for evaluating agents on the Entity-deduction-Arena Benchmark, from the paper [Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games](https://arxiv.org/abs/2310.01468), presented in ACL 2024 main conference.

-## Configure OpenDevin and your LLM
+## Setup Environment and LLM Configuration
+
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

-Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.

 ## Start the evaluation

--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -1,30 +1,27 @@
 import asyncio
-import logging
 import os

 import pandas as pd
-
-# import huggingface_hub
 from datasets import load_dataset

 from evaluation.EDA.game import Q20Game, Q20GameCelebrity
 from evaluation.utils.shared import (
    EvalMetadata,
+    EvalOutput,
    make_metadata,
    prepare_dataset,
+    reset_logger_for_multiprocessing,
    run_evaluation,
 )
-from opendevin.controller.agent import Agent
-
-# from evaluation.EDA.scorer import question_scorer
 from opendevin.controller.state.state import State
-from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
-from opendevin.core.logger import get_console_handler
+from opendevin.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    get_parser,
+)
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import run_agent_controller
-from opendevin.llm.llm import LLM
-
-config = load_app_config()
+from opendevin.core.main import create_runtime, run_controller

 game = None

@@ -56,39 +53,44 @@ AGENT_CLS_TO_INST_SUFFIX = {
 }


-def process_instance(
+def get_config(
+    metadata: EvalMetadata,
+) -> AppConfig:
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_devin=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            container_image='ubuntu:22.04',
+            enable_auto_lint=False,
+            use_host_network=False,
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    config.set_llm_config(metadata.llm_config)
+    return config
+
+
+async def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
-):
-    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
+) -> EvalOutput:
+    config = get_config(metadata)
+    instance_id = instance['text'].strip()
+
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
-    eval_output_dir = metadata.eval_output_dir
    if reset_logger:
-        # Set up logger
-        log_file = os.path.join(
-            eval_output_dir, 'logs', f'instance_{instance["text"].strip()}.log'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        # add back the console handler to print ONE line
-        logger.addHandler(get_console_handler())
-        logger.info(
-            f'Starting evaluation for instance {instance["text"].strip()}.\nLOG:   tail -f {log_file}'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setFormatter(
-            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-        )
-        logger.addHandler(file_handler)
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance_id}.')

    # Prepare instruction
-    _game_class = {'things': Q20Game, 'celebs': Q20GameCelebrity}
+    _game_class = {'eda-things': Q20Game, 'eda-celebs': Q20GameCelebrity}

    guesser_kargs = {
        'max_new_tokens': 64,
@@ -112,24 +114,16 @@ def process_instance(

    instruction = f'{game.first_user_utterance}'
    logger.info(f'Instruction: {instruction}')
-
-    # instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
-    # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
+    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    runtime = await create_runtime(config, sid=instance['text'].strip())

-    state: State | None = asyncio.run(
-        run_agent_controller(
-            agent,
-            instruction,
-            max_iterations=metadata.max_iterations,
-            max_budget_per_task=config.max_budget_per_task,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
-                agent.__class__.__name__
-            ],
-            sid=instance['text'].strip(),
-        )
+    state: State | None = await run_controller(
+        config=config,
+        task_str=instruction,
+        runtime=runtime,
+        fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class],
    )
    # ======= Attempt to evaluate the agent's edits =======
    # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
@@ -150,21 +144,20 @@ def process_instance(
    histories = state.history.compatibility_for_eval_history_pairs()

    # Save the output
-    output = {
-        'instance_id': instance['text'].strip(),
-        'instance': instance,
-        'instruction': instruction,
-        'metadata': metadata.model_dump(),
-        'history': histories,
-        'metrics': metrics,
-        'error': state.last_error if state and state.last_error else None,
-        'test_result': {
+    output = EvalOutput(
+        instance_id=instance_id,
+        instance=instance.to_dict(),
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result={
            'success': test_result,
            'final_message': final_message,
            'ground_truth': instance['text'],
        },
-    }
-
+    )
    return output


@@ -191,12 +184,16 @@ if __name__ == '__main__':
    )
    args, _ = parser.parse_known_args()

-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
-    logger.info(f'Config for evaluation: {config}')
-
    eda_dataset = load_dataset(
        'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split
    )
+    eda_dataset.rename(columns={'text': 'instance_id'}, inplace=True)
+
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config,
@@ -214,16 +211,15 @@ if __name__ == '__main__':

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    prepared_dataset = prepare_dataset(
-        eda_dataset.to_pandas(), output_file, args.eval_n_limit, 'text'
+        eda_dataset.to_pandas(), output_file, args.eval_n_limit
    )

-    agent = Agent.get_cls(args.agent_cls)(llm=LLM(config.llm))
-
-    run_evaluation(
-        prepared_dataset,
-        metadata,
-        output_file,
-        args.eval_num_workers,
-        process_instance,
-        'text',
+    asyncio.run(
+        run_evaluation(
+            prepared_dataset,
+            metadata,
+            output_file,
+            args.eval_num_workers,
+            process_instance,
+        )
    )
--- a/evaluation/EDA/scripts/run_infer.sh
+++ b/evaluation/EDA/scripts/run_infer.sh
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -12,15 +12,59 @@ all the preprocessing/evaluation/analysis scripts.

 ## Supported Benchmarks

+To learn more about how to integrate your benchmark into OpenDevin, check out the [tutorial here](https://docs.all-hands.dev/modules/usage/evaluation_harness).
+
+### Software Engineering
+
 - SWE-Bench: [`evaluation/swe_bench`](./swe_bench)
- ML-Bench: [`evaluation/ml_bench`](./ml_bench)
 - HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix)
- GAIA: [`evaluation/gaia`](./gaia)
- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
- MINT: [`evaluation/mint`](./mint)
- AgentBench: [`evaluation/agent_bench`](./agent_bench)
 - BIRD: [`evaluation/bird`](./bird)
- LogicReasoning: [`evaluation/logic_reasoning`](./logic_reasoning)
+- BioCoder: [`evaluation/biocoder`](./biocoder)
+- ML-Bench: [`evaluation/ml_bench`](./ml_bench)
+- APIBench: [`evaluation/gorilla`](./gorilla/)
+- ToolQA: [`evaluation/toolqa`](./toolqa/)
+
+### Web Browsing
+
+- WebArena: [`evaluation/webarena`](./webarena/)
+- MiniWob++: [`evaluation/miniwob`](./miniwob/)
+
+### Misc. Assistance
+
+- GAIA: [`evaluation/gaia`](./gaia)
+- GPQA: [`evaluation/gpqa`](./gpqa)
+- AgentBench: [`evaluation/agent_bench`](./agent_bench)
+- MINT: [`evaluation/mint`](./mint)
+- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
+- ProofWriter: [`evaluation/logic_reasoning`](./logic_reasoning)
+
+
+## Before everything begins: Setup Environment and LLM Configuration
+
+Please follow the instructions [here](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up your local development environment and LLM.
+
+OpenDevin in development mode uses `config.toml` to keep track of most configurations.
+
+Here's an example configuration file you can use to define and use multiple LLMs:
+
+```toml
+[llm]
+# IMPORTANT: add your API key here, and set the model to the one you want to evaluate
+model = "gpt-4o-2024-05-13"
+api_key = "sk-XXX"
+
+[llm.eval_gpt4_1106_preview_llm]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[llm.eval_some_openai_compatible_model_llm]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```
+

 ### Result Visualization

--- a/evaluation/TUTORIAL.md
+++ b/evaluation/TUTORIAL.md
@@ -1,186 +0,0 @@
-# Tutorial: How to add a New Evaluation Benchmark to OpenDevin
-
-This tutorial provides a general guide on how to integrate your own evaluation benchmark into the OpenDevin framework.
-
-You can read this for details, and also learn by example by looking at our existing evaluations:
- [swe_bench](swe_bench/)
-
-
-## A quick walk-through of OpenDevin architecture
-
-### Before everything begins
-
-Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to setup local develop environment for OpenDevin.
-
-### Configuration file
-
-OpenDevin uses `config.toml` to keep track of most configurations.
-
-Here's an example configuration file you can use:
-
-```toml
-[core]
-max_iterations = 100
-cache_dir = "/tmp/cache"
-
-# IMPORTANT: You should set these two paths to YOUR WORKSPACE directory,
-# which will be mounted into Sandbox for agent to interact with!
-# The OpenDevin agent will be able to read/write files whatever they like (even rm -rf)
-# in this directory, so be careful!!
-workspace_base = "/path/to/your/workspace"
-workspace_mount_path = "/path/to/your/workspace"
-# ==========================
-
-ssh_hostname = "localhost"
-
-run_as_devin = false
-
-[sandbox]
-# SWEBench eval specific - but you can tweak it to your needs
-use_host_network = false
-# linting python after editing helps LLM fix indentations
-enable_auto_lint = true
-
-
-box_type = "ssh"
-timeout = 120
-
-[llm]
-# IMPORTANT: add your API key here, and set the model to the one you want to evaluate
-model = "gpt-4o-2024-05-13"
-api_key = "sk-XXX"
-```
-
-### How to use OpenDevin programmatically
-
-In this section, for the purpose of building an evaluation task, we don't use the standard OpenDevin web-based GUI, but rather run OpenDevin backend from CLI.
-
-For example, you can run the following, which performs the specified task `-t`, with a particular model config `-l` and agent `-c`, for a maximum number of iterations `-i`:
-
-```bash
-poetry run python ./opendevin/core/main.py \
-        -i 10 \
-        -t "Write me a bash script that print hello world." \
-        -c CodeActAgent \
-        -l llm
-```
-
-After running the script, you will observe the following:
-
-![](./static/example_task_1.png)
-
-You can see the agent uses bash to write a script, makes it executable, and then tests it by running it to make sure it is working.
-
-At the end of the above screenshot, OpenDevin actually requests user inputs when it think it finishes the task. This will cause issues in evaluation, since most evaluation don't assume additional user input. To fix this, we introduce the functionality of `fake_user_response_fn` in the `main` function, which we describe in the next section.
-
-## The `main` function
-
-The signature of `main` (in file [[`opendevin/core/main.py`](../opendevin/core/main.py)]) is as follows:
-
-```python
-async def main(
-    task_str: str = '',
-    exit_on_message: bool = False,
-    fake_user_response_fn: Optional[Callable[[Optional[State]], str]] = None,
-    sandbox: Optional[Sandbox] = None,
-) -> Optional[State]:
-```
-
- `task_str`: The task instruction to run. In the above example, it is "Write me a bash script that print hello world."
- `exit_on_message`: whether to quit if the agent asks for a message from user
- `fake_user_response_fn`: An optional function that receives the current state (could be None) and returns a fake user response.
- `sandbox`: An optional sandbox to run the agent in.
-
-### `fake_user_response_fn`
-
-Here's an example of `fake_user_response_fn` in the implementation for SWE-Bench in [`evaluation/swe_bench/run_infer.py`](swe_bench/run_infer.py):
-
-```python
-def codeact_user_response(state: State) -> str:
-    msg = (
-        'Please continue working on the task on whatever approach you think is suitable.\n'
-        'If you think you have modified the code in a way that fixes the issue, please run the following command: <execute_bash> exit </execute_bash>.\n'
-        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
-    )
-    # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
-    if state.history:
-        user_msgs = [
-            event
-            for event in state.history.get_events()
-            if isinstance(action, MessageAction) and action.source == 'user'
-        ]
-        if len(user_msgs) > 2:
-            # let the agent know that it can give up when it has tried 3 times
-            return (
-                msg
-                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
-            )
-    return msg
-```
-
-
-### Return value
-
-The main function returns a `State`, which is defined in [`opendevin/controller/state/state.py`](../opendevin/controller/state/state.py). We are mainly using `state.history` here, which is the most important field of data. You can imagine it is being a more structured version of OpenAI's chat completion [messages](https://platform.openai.com/docs/guides/text-generation/chat-completions-api).
-
-`history: list[tuple[Action, Observation]] = field(default_factory=list)` is a list of (action, observation) tuple. All the actions are defined at [`opendevin/events/action`](../opendevin/events/action) and observations are defined at [`opendevin/events/observation`](../opendevin/events/action).
-
-The agent can emit different actions like `CmdRunAction`  (`opendevin/events/action/commands.py`) to execute bash commands and receive `CmdOutputObservation` (`opendevin/events/observation/commands.py`), `IPythonRunCellAction` to receive `IPythonRunCellObservation`, `BrowseInteractiveAction` (`opendevin/events/action/browse.py`) to browse the web and receive `BrowserOutputObservation` (`opendevin/events/observation/browse.py`).
-
-The action we used in this example is `MessageAction` (`opendevin/events/action/message.py`), which actually denotes a message from either `agent` or `user`. In the [CodeAct agent example](https://github.com/OpenDevin/OpenDevin/blob/7ca560471bd262f22513f3863995d0a8e6121c07/agenthub/codeact_agent/codeact_agent.py#L239-L273), an agent is considered to emit a `MessageAction` when it does not trigger a `CmdRunAction`, `IPythonRunCellAction`, and/or `BrowseInteractiveAction`.
-
-Typically, the agent returns `MessageAction` when it is confused about the task, and want to ask human for follow-up clarification, which is a good thing in real-world task, but not necessarily in evaluation. So in this example, we provide a dummy prompt to tell the agent "Please continue working on the task on whatever approach you think is suitable[...]".
-
-If you see something like this, you can consider adding this to your evaluation pipeline as well.
-
-### `sandbox`
-
-Sandbox is a fully functioning docker container where the agent can perform all sorts of tasks, e.g., using bash, calling Python, install packages, and more. You can leave `sandbox` to `None` if you don't need to do anything special to pre-configure the `Sandbox`.
-
-In SWE-Bench, we need to copy the proper repository directory to the workspace and activate the right python virtual environment before the agent can start performing the task, so we actually defined a custom [`SWEBenchSSHBox`](https://github.com/OpenDevin/OpenDevin/blob/7ca560471bd262f22513f3863995d0a8e6121c07/evaluation/swe_bench/swe_env_box.py#L12-L118) that inherit from the default sandbox [`SSHBox`](https://github.com/OpenDevin/OpenDevin/blob/7ca560471bd262f22513f3863995d0a8e6121c07/opendevin/runtime/docker/ssh_box.py#L188) and handles all these initial setup. If you need to configure the `sandbox` for your evaluation, check `SWEBenchSSHBox` for a reference of implementation.
-
-## How to put together an evaluation script?
-
-Now we know how to start running the agent end-to-end, and how `fake_user_response_fn` and `sandbox` work. We will walk through a piece of dummy code (simplified version of SWE-Bench's [`run_infer.py`](https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/run_infer.py)) that outline the general workflow:
-
- Load the dataset and prepare the evaluation configuration.
- Filter out any instances that have already been processed.
- For each instance in the dataset:
-  - Set up the sandbox environment.
-  - Run the agent to generate a solution.
-  - Apply the solution to the instance and execute the test command.
-  - Collect the results and write them to the output file.
- Perform cleanup after the evaluation is complete.
-
-You can see the [swe_bench/run_infer.py](swe_bench/run_infer.py) file for an example.
-
-When you fully understand the `run_infer.py`, you can be ready to actually starting the evaluation!
-
-
-## Run the evaluation!
-
-You can write your `run_infer.sh` script mimicking SWE-Bench's [`run_infer.sh`](https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/scripts/run_infer.sh).
-
-
-You can start the evaluation by running:
-
-```bash
-./run_infer.sh eval_gpt_4o_2024_05_13
-```
-Where `eval_gpt_4o_2024_05_13` is the model config you defined on the config.toml.
-Like this:
-
-```toml
-[core]
-...
-
-[llm]
-model="gpt-4-32k"
-...
-
-[eval_gpt_4o_2024_05_13]
-model="gpt-4o-2024-05-13"
-api_key="sk-xxx"
-```
-
-If `[eval_gpt_4o_2024_05_13]` is not present, it will default to using the model configured in `[llm]`.
--- a/evaluation/agent_bench/README.md
+++ b/evaluation/agent_bench/README.md
@@ -1,44 +1,10 @@
 # AgentBench Evaluation

-This folder contains evaluation harness for evaluating agents on
-the [AgentBench: Evaluating LLMs as Agents](https://arxiv.org/abs/2308.03688).
+This folder contains the evaluation harness for evaluating agents on [AgentBench: Evaluating LLMs as Agents](https://arxiv.org/abs/2308.03688). We currently only support running on the `osbench` subset.

-## Configure OpenDevin and your LLM
+## Setup Environment and LLM Configuration

-Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md)
-for how to set this up.
-
-Here is an example `config.toml` file:
-
-```toml
-[core]
-max_iterations = 100
-cache_dir = "/path/to/cache"
-
-workspace_base = "/path/to/workspace"
-workspace_mount_path = "/path/to/workspace"
-
-ssh_hostname = "localhost"
-
-# AgentBench specific
-run_as_devin = true
-
-[sandbox]
-use_host_network = false
-enable_auto_lint = true
-box_type = "ssh"
-timeout = 120
-
-[llm.eval_gpt35_turbo]
-model = "gpt-3.5-turbo"
-api_key = "sk-123"
-temperature = 0.0
-
-[llm.eval_gpt4o]
-model = "gpt-4o"
-api_key = "sk-123"
-temperature = 0.0
-```
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

 ## Start the evaluation

@@ -46,7 +12,18 @@ temperature = 0.0
 ./evaluation/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
 ```

-Following is the basic command to start the evaluation. Here we are only evaluating the `osbench` for now.
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
+to `CodeActAgent`.
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By
+default, the script evaluates all instances of the AgentBench `osbench` subset. Note:
+in order to use `eval_limit`, you must also set `agent`.
+
+
+Following is the basic command to start the evaluation.

 You can update the arguments in the script `evaluation/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on.

@@ -57,5 +34,5 @@ You can update the arguments in the script `evaluation/agent_bench/scripts/run_i
 - `--eval-n-limit`: the number of examples to evaluate. For example, `100`.

 ```bash
-./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo 0.6.2 CodeActAgent 1
+./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1
 ```
--- a/evaluation/agent_bench/helper.py
+++ b/evaluation/agent_bench/helper.py
@@ -14,7 +14,7 @@ def try_parse_answer(act) -> str | None:
        raw_ans = act.thought
    else:
        return None
-    agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
+    agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans, re.DOTALL)
    if not agent_answer:
        return None
    return agent_answer[0].strip()
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -1,10 +1,9 @@
 import asyncio
-import logging
 import os
 import re
-import shutil
+import tempfile
+from typing import Any

-import docker
 import pandas as pd
 from datasets import load_dataset

@@ -16,64 +15,175 @@ from evaluation.agent_bench.helper import (
 )
 from evaluation.utils.shared import (
    EvalMetadata,
+    EvalOutput,
    make_metadata,
    prepare_dataset,
+    reset_logger_for_multiprocessing,
    run_evaluation,
 )
-from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
-from opendevin.core.logger import get_console_handler
+from opendevin.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    parse_arguments,
+)
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import run_agent_controller
-from opendevin.events.action import CmdRunAction, MessageAction
-from opendevin.llm.llm import LLM
-from opendevin.runtime.docker.ssh_box import DockerSSHBox
-
-config = load_app_config()
+from opendevin.core.main import create_runtime, run_controller
+from opendevin.events.action import AgentFinishAction, CmdRunAction, MessageAction
+from opendevin.events.observation import CmdOutputObservation
+from opendevin.runtime.runtime import Runtime


-def process_instance(
+def get_config(
+    metadata: EvalMetadata,
+) -> AppConfig:
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_devin=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            container_image='ubuntu:22.04',
+            enable_auto_lint=True,
+            use_host_network=False,
+        ),
+        # no workspace is mounted: both the base and the mount path are left unset
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    config.set_llm_config(metadata.llm_config)
+    return config
+
+
+async def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # supplies the `init` command and `instance_id` read below
+):
+    """Initialize the runtime for the agent.
+
+    This function is called before the runtime is used to run the agent.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
+    obs: CmdOutputObservation
+
+    # Create the /workspace directory, then cd into it
+    action = CmdRunAction(command='mkdir -p /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    action = CmdRunAction(command='cd /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    init_cmd = instance.init
+    if init_cmd is not None:
+        script_name = f'{instance.instance_id}_init.sh'
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            host_script_path = os.path.join(tmpdir, script_name)
+            create_sh_file(host_script_path, init_cmd)
+            await runtime.copy_to(
+                host_script_path,
+                '/workspace',
+            )
+
+        logger.info(f'Running init script: {script_name}')
+        action = CmdRunAction(command=f'chmod +x ./{script_name} && ./{script_name}')
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = await runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+
+    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
+
+
+async def complete_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # supplies the result/ground-truth commands and `ground_truth`
+) -> dict[str, Any]:
+    """Complete the runtime for the agent.
+
+    This function is called after the agent has finished running in the runtime.
+    If you need to do something in the sandbox to get the correctness metric after
+    the agent has run, modify this function.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
+    obs: CmdOutputObservation
+
+    agent_answer = None
+    get_agent_result_cmd = instance.get_agent_result
+    if get_agent_result_cmd is not None:
+        script_name = 'get_agent_result.sh'
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            host_script_path = os.path.join(tmpdir, script_name)
+            create_sh_file(host_script_path, get_agent_result_cmd)
+            await runtime.copy_to(
+                host_script_path,
+                '/workspace',
+            )
+            logger.info(f'Running get agent result cmd: {script_name}')
+
+        action = CmdRunAction(
+            command=f'chmod +x ./{script_name} && ./{script_name}',
+            keep_prompt=False,
+        )
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = await runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+        agent_answer = obs.content
+    # If there is no result command, agent_answer stays None and the caller
+    # (process_instance) recovers the answer from the event history instead.
+
+    final_ans = None
+    if instance.ground_truth is not None:
+        final_ans = instance.ground_truth
+    else:
+        get_ground_truth_cmd = instance.get_ground_truth
+        if get_ground_truth_cmd is not None:
+            script_name = 'get_ground_truth.sh'
+            with tempfile.TemporaryDirectory() as tmpdir:
+                host_script_path = os.path.join(tmpdir, script_name)
+                create_sh_file(host_script_path, get_ground_truth_cmd)
+                await runtime.copy_to(
+                    host_script_path,
+                    '/workspace',
+                )
+            logger.info(f'Running get ground truth cmd: {script_name}')
+
+            action = CmdRunAction(
+                command=f'chmod +x ./{script_name} && ./{script_name}',
+                keep_prompt=False,
+            )
+            logger.info(action, extra={'msg_type': 'ACTION'})
+            obs = await runtime.run_action(action)
+            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+            final_ans = obs.content
+
+    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
+    return {
+        'final_ans': final_ans,
+        'agent_answer': agent_answer,
+    }
+
+
+async def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
-):
-    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
+) -> EvalOutput:
+    config = get_config(metadata)

-    inst_id = instance.instance_id
-    question = instance.description
-    # create a directory for the instance's workspace
-    instance_workspace = str(os.path.join(config.workspace_base, inst_id))
-    container_inst_workspace = str(
-        os.path.join(config.workspace_mount_path_in_sandbox, inst_id)
-    )
-    if os.path.exists(instance_workspace):
-        shutil.rmtree(instance_workspace)
-    os.makedirs(instance_workspace, exist_ok=True)
-
-    # Set up the logger properly, so you can run multiprocessing to parallel the evaluation
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
-        # Set up logger
-        log_file = os.path.join(
-            metadata.eval_output_dir, 'logs', f'instance_{inst_id}.log'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        # add back the console handler to print ONE line
-        logger.addHandler(get_console_handler())
-        logger.info(
-            f'Starting evaluation for instance {inst_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setFormatter(
-            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-        )
-        logger.addHandler(file_handler)
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    # =============================================
    # build instruction
@@ -86,104 +196,68 @@ def process_instance(
        'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
        'For example: The answer to the question is <solution> 42 </solution>.\n'
        '# Problem \n'
-        f'{question}\n\n'
+        f'{instance.description}\n\n'
    )
    instruction += (
        'IMPORTANT: You should ONLY interact with the environment provided '
        'to you AND NEVER ASK FOR HUMAN HELP.\n'
    )
    # NOTE: You can actually set slightly different instruction for different agents
-    instruction += INST_SUFFIXES[agent.__class__.__name__]
+    instruction += INST_SUFFIXES[metadata.agent_class]

    # =============================================
    # create sandbox and run the agent
    # =============================================

-    sandbox = DockerSSHBox(
-        config=config.sandbox,
-        persist_sandbox=False,
-        workspace_mount_path=config.workspace_mount_path,
-        sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
-        cache_dir=config.cache_dir,
-        run_as_devin=config.run_as_devin,
-    )
-    sandbox.execute(f'cd {inst_id}')
+    runtime: Runtime = await create_runtime(config, sid=instance.instance_id)

-    init_cmd = instance.init
-    if init_cmd is not None:
-        scpt_name = f'{instance.instance_id}_init.sh'
-        scpt_path = os.path.join(container_inst_workspace, scpt_name)
-        host_scpt_path = os.path.join(instance_workspace, scpt_name)
-        create_sh_file(host_scpt_path, init_cmd)
-        logger.info(f'Running init script: {scpt_path}')
-        _, init_res = sandbox.execute(scpt_path)
-        logger.info(f'Init script result: {init_res}')
+    await initialize_runtime(runtime, instance=instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State | None = asyncio.run(
-        run_agent_controller(
-            agent,
-            instruction,
-            max_iterations=metadata.max_iterations,
-            max_budget_per_task=config.max_budget_per_task,
-            fake_user_response_fn=FAKE_RESPONSES[agent.__class__.__name__],
-            sandbox=sandbox,
-            sid=inst_id,
-        )
+    state: State | None = await run_controller(
+        config=config,
+        task_str=instruction,
+        runtime=runtime,
+        fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
    )
-
    if state is None:
        raise ValueError('State should not be None.')

-    # get the ground truth
-    # OSBenchSSHBox.get_ground_truth(instance, state)
-
    # =============================================
    # result evaluation
    # =============================================

-    agent_answer = ''
-    get_agent_result_cmd = instance.get_agent_result
-    if get_agent_result_cmd is not None:
-        scpt_name = f'{instance.instance_id}_get_agent_result.sh'
-        scpt_path = os.path.join(container_inst_workspace, scpt_name)
-        host_scpt_path = os.path.join(instance_workspace, scpt_name)
-        create_sh_file(host_scpt_path, get_agent_result_cmd)
-        logger.info(f'Running get agent result cmd: {scpt_path}')
-        _, agent_answer = sandbox.execute(scpt_path)
-    else:
+    return_val = await complete_runtime(runtime, instance)
+    agent_answer = return_val['agent_answer']
+    final_ans = return_val['final_ans']
+
+    # If the agent answer is not found, retrieve it from the history
+    if agent_answer is None:
+        agent_answer = ''
        logger.info('Retrieving agent answer from history.')
        raw_ans = ''

        # retrieve the last agent message or thought
        for event in state.history.get_events(reverse=True):
-            if isinstance(event, MessageAction) and event.source == 'agent':
-                raw_ans = event.content
-            elif isinstance(event, CmdRunAction) and event.source == 'agent':
-                raw_ans = event.thought
+            if event.source == 'agent':
+                if isinstance(event, AgentFinishAction):
+                    raw_ans = event.thought
+                    break
+                elif isinstance(event, MessageAction):
+                    raw_ans = event.content
+                    break
+                elif isinstance(event, CmdRunAction):
+                    raw_ans = event.thought
+                    break

        # parse the answer for a solution tag
-        agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
+        agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans, re.DOTALL)
        if len(agent_answer) == 0:
            logger.warning(f'Failed to parse model answer: {raw_ans}')
            agent_answer = raw_ans
        else:
            agent_answer = agent_answer[0]

-    final_ans = ''
-    if instance.ground_truth is not None:
-        final_ans = instance.ground_truth
-    else:
-        get_ground_truth_cmd = instance.get_ground_truth
-        if get_ground_truth_cmd is not None:
-            scpt_name = f'{instance.instance_id}_get_ground_truth.sh'
-            scpt_path = os.path.join(container_inst_workspace, scpt_name)
-            host_scpt_path = os.path.join(instance_workspace, scpt_name)
-            create_sh_file(host_scpt_path, get_ground_truth_cmd)
-            logger.info(f'Running get ground truth cmd: {scpt_path}')
-            sandbox.execute(f'cd {container_inst_workspace}')
-            _, final_ans = sandbox.execute(scpt_path)
-
    comparison_method = instance.comparison_method
    logger.info(
        f'Final message: {agent_answer} | Ground truth: {final_ans} | Comparison method: {comparison_method}'
@@ -198,58 +272,49 @@ def process_instance(
    metrics = state.metrics.get() if state.metrics else None

    # Save the output
-    output = {
-        'instance_id': inst_id,
-        'instance': instance.to_dict(),
-        'instruction': instruction,
-        'metadata': metadata.model_dump(),
-        'history': histories,
-        'metrics': metrics,
-        'error': state.last_error if state and state.last_error else None,
-        'test_result': {
+    output = EvalOutput(
+        instance_id=instance.instance_id,
+        instance=instance.to_dict(),
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result={
            'agent_answer': agent_answer,
            'final_answer': final_ans,
            'check_method': comparison_method,
            'result': test_result,
        },
-    }
-
-    # clean up
-    if os.path.exists(instance_workspace):
-        shutil.rmtree(instance_workspace)
-    # Close the sandbox
-    try:
-        sandbox.close()
-    except docker.errors.NotFound as e:
-        logger.error(f'Failed to close sandbox: {e}')
+    )
    return output


 if __name__ == '__main__':
-    id_column = 'instance_id'
    args = parse_arguments()
    dataset = load_dataset('iFurySt/AgentBench')
    agent_bench_tests = dataset['osbench'].to_pandas()

-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
-    logger.info(f'Config for evaluation: {config}')
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config,
-        args.dataset_name,
+        'AgentBench-OS',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
+    instances = prepare_dataset(agent_bench_tests, output_file, args.eval_n_limit)

-    run_evaluation(
-        instances,
-        metadata,
-        output_file,
-        args.eval_num_workers,
-        process_instance,
-        id_column,
+    asyncio.run(
+        run_evaluation(
+            instances, metadata, output_file, args.eval_num_workers, process_instance
+        )
    )
--- a/evaluation/agent_bench/scripts/run_infer.sh
+++ b/evaluation/agent_bench/scripts/run_infer.sh
--- a/evaluation/biocoder/README.md
+++ b/evaluation/biocoder/README.md
@@ -2,15 +2,12 @@

 Implements evaluation of agents on BioCoder from the BioCoder benchmark introduced in [BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models](https://arxiv.org/abs/2308.16458). Please see [here](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py) for the reference implementation used in the paper.

-## Setup Environment
+## Setup Environment and LLM Configuration

-Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to setup local develop environment for OpenDevin.
-
-
-## Configure OpenDevin and your LLM
-Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

 ## BioCoder Docker Image
+
 In the opendevin branch of the Biocoder repository, we have slightly modified our original Docker image to work with the OpenDevin environment. In the Docker image are testing scripts (`/testing/start_test_opendevin.py` and aux files in `/testing_files/`) to assist with evaluation. Additionally, we have installed all dependencies, including OpenJDK, mamba (with Python 3.6), and many system libraries. Notably, we have **not** packaged all repositories into the image, so they are downloaded at runtime.

 **Before first execution, pull our Docker image with the following command**
@@ -41,12 +38,12 @@ to `CodeActAgent`.
 - `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default it infers all instances.

 Let's say you'd like to run 1 instance using `eval_gpt4_1106_eval_gpt4o_2024_05_13preview` and CodeActAgent
-with OpenDevin version 0.6.2, then your command would be:
+with the current OpenDevin version, then your command would be:

 ## Examples

 ```bash
-./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent 1
+./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 HEAD CodeActAgent 1
 ```

 ## Reference
--- a/evaluation/biocoder/biocoder_env_box.py
+++ b/evaluation/biocoder/biocoder_env_box.py
@@ -1,387 +0,0 @@
-import json
-import os
-import re
-import sys
-from collections import defaultdict
-from dataclasses import dataclass
-
-from datasets import load_dataset
-
-from opendevin.core.config import load_app_config
-from opendevin.core.logger import opendevin_logger as logger
-from opendevin.runtime.docker.ssh_box import DockerSSHBox
-from opendevin.runtime.plugins import (
-    JupyterRequirement,
-    PluginRequirement,
-    SWEAgentCommandsRequirement,
-)
-
-config = load_app_config()
-
-BIOCODER_BENCH_CONTAINER_IMAGE = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0'
-
-
-@dataclass
-class BiocoderData:
-    filePath: str
-    numLines: int
-    lineStart: int
-    lineEnd: int
-    signature: str
-    comment: str
-    content: str
-    repository: str
-    promptSummaryOnly: str
-    contextCode: str
-    goldenCode: str
-    test_case_id: str
-    language: str
-
-    def to_dict(self):
-        return {
-            'filePath': self.filePath,
-            'numLines': self.numLines,
-            'lineStart': self.lineStart,
-            'lineEnd': self.lineEnd,
-            'signature': self.signature,
-            'comment': self.comment,
-            'content': self.content,
-            'repository': self.repository,
-            'promptSummaryOnly': self.promptSummaryOnly,
-            'contextCode': self.contextCode,
-            'goldenCode': self.goldenCode,
-            'test_case_id': self.test_case_id,
-            'language': self.language,
-        }
-
-
-def get_likely_indent_size(array_of_tabs) -> int:
-    sizes = defaultdict(int)
-
-    for i in range(len(array_of_tabs) - 1):
-        diff = array_of_tabs[i + 1] - array_of_tabs[i]
-        if diff > 0:
-            sizes[diff] += 1
-    if len(sizes) == 0:
-        return 4
-    return int(max(sizes, key=sizes.get))
-
-
-class BiocoderSSHBox(DockerSSHBox):
-    def __init__(
-        self,
-        container_image: str,
-        timeout: int = 120,
-        sid: str | None = None,
-        biocoder_instance_id: str | None = None,
-        biocoder_instance: BiocoderData | None = None,
-        skip_workspace_mount: bool = True,
-        sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
-        biocoder_cache_folder: str = 'biocoder_cache',
-        workspace_dir_name: str | None = None,
-    ):
-        if biocoder_instance_id is None:
-            raise ValueError('biocoder_instance_id must be provided')
-        self.biocoder_instance_id = biocoder_instance_id
-        self.biocoder_instance = biocoder_instance
-        self.skip_workspace_mount = skip_workspace_mount
-        self.biocoder_cache_folder = biocoder_cache_folder
-        self.first_line_after_removed = None
-        self.workspace_dir_name = workspace_dir_name
-        self.workspace_base = config.workspace_base
-        self.workspace_mount_path = config.workspace_mount_path
-        # self.workspace_dir_name_host = os.path.join(config.workspace_base, workspace_dir_name)
-
-        self.context_path = None
-        self.generated_path = None
-        self.golden_path = None
-
-        assert (
-            container_image is not None
-        ), 'container_image is required for BiocoderBenchSSHBox!'
-        super().__init__(container_image, timeout, sid)
-        self.init_plugins(sandbox_plugins)
-
-    @property
-    def volumes(self):
-        if self.skip_workspace_mount:
-            return {
-                k: v
-                for k, v in super().volumes.items()
-                if not v['bind'] == self.sandbox_workspace_dir
-            }
-        return super().volumes
-
-    def get_target_filepath(self):
-        target_filepath = os.path.join(
-            self.workspace_mount_path,
-            self.biocoder_instance.repository.split('/')[1],
-            self.biocoder_instance.filePath,
-        )
-        return target_filepath
-
-    def get_changed_code(self, include_signature=False):
-        # copies changed code into /testing_files/
-        # Note that this does NOT copy the function signature
-        target_filepath = self.get_target_filepath()
-        selected_lines = []
-        offset = 1 if include_signature else 0
-        if self.first_line_after_removed is None:
-            logger.warning('First line after removed is None')
-        with open(target_filepath, 'r') as f:
-            lines = f.read().split('\n')
-            for i in range(self.biocoder_instance.lineStart - offset, len(lines)):
-                if lines[i].strip() == self.first_line_after_removed.strip():
-                    break
-                selected_lines.append(lines[i])
-        text = '\n'.join(selected_lines)
-        return text
-
-    def copy_changed_code(self):
-        changed_code = self.get_changed_code(include_signature=True)
-        with open(self.generated_path, 'w') as f:
-            f.write(changed_code)
-        exit_code, output = self.execute_and_check(
-            f'cp -r /workspace/{self.biocoder_cache_folder}/* /testing_files',
-            'Failed to copy the files',
-        )
-
-    def remove_code(self):
-        comment_prefix = {'python': '#', 'java': '//'}
-
-        target_filepath = self.get_target_filepath()
-        line_start = self.biocoder_instance.lineStart
-        line_end = self.biocoder_instance.lineEnd
-        with open(target_filepath, 'r') as f:
-            lines = f.read().split('\n')
-            # print("="*10+"ORIGINAL"+"="*10)
-            # print("\n".join(lines))
-            signature_line = lines[line_start - 1]
-
-            # get the number of tabs
-            def get_indent_size(s: str):
-                return len(re.match(r'\s*', s).group())
-
-            indent_sizes = list(map(get_indent_size, lines))
-            indent_size = get_likely_indent_size(indent_sizes)
-            comment_indent_size = get_indent_size(signature_line) + indent_size
-            lines = (
-                lines[:line_start]
-                + [
-                    f"{' '*comment_indent_size+comment_prefix[self.biocoder_instance.language.lower()]}TODO: replace with your code here"
-                ]
-                + ([''] * 2)
-                + lines[line_end:]
-            )
-        first_line_after_removed_index = line_start
-        while len(
-            lines[first_line_after_removed_index].strip()
-        ) == 0 and first_line_after_removed_index < len(lines):
-            first_line_after_removed_index += 1
-        self.first_line_after_removed = lines[first_line_after_removed_index]
-        # print("FIRST LINE AFTER REMOVED: ", self.first_line_after_removed)
-
-        with open(target_filepath, 'w') as f:
-            f.write('\n'.join(lines))
-
-        # with open(target_filepath, 'r') as f:
-        #     print("="*10+"MODIFIED"+"="*10)
-        #     print(f.read())
-
-    def execute_and_check(self, cmd: str, error_msg: str) -> tuple[int, str]:
-        exit_code, output = self.execute(cmd)
-        if exit_code != 0:
-            logger.error(error_msg)
-            sys.exit(1)
-        return exit_code, output
-
-    @classmethod
-    def get_box_for_instance(
-        cls,
-        instance,
-        workspace_dir_name=None,
-        skip_workspace_mount: bool = False,
-        workspace_mount_path: str | None = None,
-        sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
-    ) -> 'BiocoderSSHBox':
-        """This method initializes a container image, then runs some initialization commands"""
-        if workspace_dir_name is None:
-            workspace_dir_name = f'{instance.repository}__{instance.test_case_id[:10]}__{os.getpid()}'.replace(
-                '/', '__'
-            )
-
-        workspace_base = str(os.path.join(config.workspace_base, workspace_dir_name))
-        old_workspace_base = config.workspace_base
-        old_workspace_mount_path = config.workspace_mount_path
-
-        try:
-            config.workspace_base = workspace_base
-            config.workspace_mount_path = workspace_base
-
-            # linting python after editing helps LLM fix indentations
-            config.sandbox.enable_auto_lint = True
-
-            # create folder for transferring files back/forth
-            biocoder_cache_folder = 'biocoder_cache'
-            if not os.path.exists(os.path.join(workspace_base, biocoder_cache_folder)):
-                os.makedirs(
-                    os.path.join(workspace_base, biocoder_cache_folder), exist_ok=True
-                )
-
-            file_ext = {
-                'python': 'py',
-                'java': 'java',
-                'c': 'c',
-                'cpp': 'cpp',
-                'javascript': 'js',
-                'typescript': 'ts',
-            }[instance.language.lower()]
-
-            context_path = os.path.join(
-                workspace_base, biocoder_cache_folder, 'context.' + file_ext
-            )
-            generated_path = os.path.join(
-                workspace_base, biocoder_cache_folder, 'generated.' + file_ext
-            )
-            golden_path = os.path.join(
-                workspace_base, biocoder_cache_folder, 'golden.' + file_ext
-            )
-
-            # print(instance.contextCode)
-            with open(context_path, 'w') as f:
-                f.write(instance.contextCode)
-            with open(generated_path, 'w') as f:
-                f.write(instance.goldenCode)
-            with open(golden_path, 'w') as f:
-                f.write(instance.goldenCode)
-
-            testcase_json = {
-                'test_case_id': instance.test_case_id,
-                'num_cases': 1000,
-                'language': instance.language.lower(),
-            }
-
-            with open(
-                os.path.join(
-                    workspace_base, biocoder_cache_folder, 'testcase_biocoder.json'
-                ),
-                'w',
-            ) as f:
-                f.write(json.dumps(testcase_json, indent=4))
-
-            # linting python after editing helps LLM fix indentations
-            config.sandbox.enable_auto_lint = True
-
-            sandbox = cls(
-                container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
-                biocoder_instance_id=instance.test_case_id,
-                biocoder_instance=instance,
-                skip_workspace_mount=skip_workspace_mount,
-                sandbox_plugins=sandbox_plugins,
-                biocoder_cache_folder=biocoder_cache_folder,
-                workspace_dir_name=workspace_dir_name,
-            )
-        except Exception:
-            raise
-        finally:
-            config.workspace_base = old_workspace_base
-            config.workspace_mount_path = old_workspace_mount_path
-
-        sandbox.context_path = context_path
-        sandbox.generated_path = generated_path
-        sandbox.golden_path = golden_path
-
-        logger.info(f'SSH box started for instance {instance.test_case_id}.')
-        # cd to the workspace
-        exit_code, output = sandbox.execute_and_check(
-            'cd /workspace', 'Failed to cd to workspace'
-        )
-        logger.info(f'cd to workspace: {output}')
-
-        # download repository archive
-        repository_url = f"https://biocoder.lilbillbiscuit.com/repos/{instance.repository.split('/')[1]}.zip"
-        exit_code, output = sandbox.execute_and_check(
-            'wget -O repo.zip ' + repository_url, 'Failed to download the repository'
-        )
-        logger.info(f'Downloaded the repository: {output}')
-        exit_code, output = sandbox.execute_and_check(
-            'unzip -o -q repo.zip', 'Failed to unzip the repository'
-        )
-        logger.info(f'Unzipped the repository: {output}')
-
-        # copy the context, generated and golden files to the /testing_files folder
-        exit_code, output = sandbox.execute_and_check(
-            f'cp -r /workspace/{biocoder_cache_folder}/* /testing_files',
-            'Failed to copy the files',
-        )
-
-        # chmod 777
-        exit_code, output = sandbox.execute_and_check(
-            'chmod -R 777 /workspace',
-            'Failed to chmod the files',
-        )
-
-        return sandbox
-
-
-if __name__ == '__main__':
-    biocoder_dataset = load_dataset('Lilbillbiscuit/biocoder_public')
-    EXAMPLE_INSTANCE = biocoder_dataset['test'][0]
-    EXAMPLE_INSTANCE = BiocoderData(**EXAMPLE_INSTANCE)
-
-    sandbox = BiocoderSSHBox.get_box_for_instance(
-        instance=EXAMPLE_INSTANCE,
-        workspace_mount_path='/home/ubuntu/OpenDevinBioCoder/workspace',
-        skip_workspace_mount=False,
-        sandbox_plugins=[JupyterRequirement(), SWEAgentCommandsRequirement()],
-    )
-
-    # PRE TEST
-    exit_code, output = sandbox.execute_and_check(
-        'cd /testing',
-        'Failed to cd /testing',
-    )
-    logger.info(f'cd $REPO_PATH: {output}')
-
-    exit_code, output = sandbox.execute_and_check(
-        'whoami',
-        'Failed to run whoami',
-    )
-    logger.info(f'whoami: {output}')
-
-    # TEST
-    exit_code, output = sandbox.execute(
-        '/home/devin/mambaforge/bin/mamba run -n test python3 /testing/start_test_opendevin.py'
-    )
-    assert exit_code == 0, 'Expected exit code 0 (this should have passed)'
-    logger.info(f'$TEST_CMD:\n{output}')
-
-    exit_code, output = sandbox.execute_and_check(
-        'cat /testing_files/results_biocoder.json', 'Failed to read the result file'
-    )
-
-    print(output)
-    json_obj = json.loads(output)
-    if json_obj['result'] == 'pass':
-        print('PASS')
-    else:
-        print('FAIL')
-
-    sys.stdout.flush()
-    try:
-        while True:
-            try:
-                user_input = input('>>> ')
-            except EOFError:
-                logger.info('Exiting...')
-                break
-            if user_input.lower() == 'exit':
-                logger.info('Exiting...')
-                break
-            exit_code, output = sandbox.execute(user_input)
-            logger.info('exit code: %d', exit_code)
-            logger.info(output)
-            sys.stdout.flush()
-    except KeyboardInterrupt:
-        logger.info('Exiting...')
-    sandbox.close()
--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -1,33 +1,38 @@
 import asyncio
+import functools
 import json
-import logging
 import os
-import pathlib
-from functools import partial
+import tempfile
+from typing import Any

 import pandas as pd
 from datasets import load_dataset

-from evaluation.biocoder.biocoder_env_box import BiocoderData, BiocoderSSHBox
+from evaluation.biocoder.utils import BiocoderData
 from evaluation.utils.shared import (
    EvalMetadata,
+    EvalOutput,
    codeact_user_response,
    make_metadata,
    prepare_dataset,
+    reset_logger_for_multiprocessing,
    run_evaluation,
 )
-from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
-from opendevin.core.logger import get_console_handler
+from opendevin.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    parse_arguments,
+)
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import run_agent_controller
-from opendevin.llm.llm import LLM
-
-config = load_app_config()
+from opendevin.core.main import create_runtime, run_controller
+from opendevin.events.action import CmdRunAction
+from opendevin.events.observation import CmdOutputObservation
+from opendevin.runtime.runtime import Runtime

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
-    'CodeActAgent': partial(
+    'CodeActAgent': functools.partial(
        codeact_user_response, encapsulate_solution=True, try_parse=None
    ),
 }
@@ -36,111 +41,218 @@ AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': 'When you think you have fixed the issue through code changes, please run the following command: <execute_bash> exit </execute_bash>.\n'
 }

+FILE_EXT_MAP = {
+    'python': 'py',
+    'java': 'java',
+    'c': 'c',
+    'cpp': 'cpp',
+    'javascript': 'js',
+    'typescript': 'ts',
+}
+
+
+def get_config(
+    metadata: EvalMetadata,
+) -> AppConfig:
+    BIOCODER_BENCH_CONTAINER_IMAGE = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0'
+
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_devin=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
+            enable_auto_lint=True,
+            use_host_network=False,
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    config.set_llm_config(metadata.llm_config)
+    return config
+
+
+async def initialize_runtime(
+    runtime: Runtime,
+    instance: BiocoderData,  # provides the language, code, repository and line range used below
+):
+    """Prepare the sandbox for one BioCoder instance.
+
+    This function is called before the runtime is used to run the agent.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
+    obs: CmdOutputObservation
+
+    file_ext = FILE_EXT_MAP[instance.language.lower()]
+
+    action = CmdRunAction(command='mkdir -p /workspace && mkdir -p /testing_files')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+    # write each instance file to a local temp dir, then copy it into /testing_files
+    with tempfile.TemporaryDirectory() as tmpdir:
+        context_path = os.path.join(tmpdir, 'context.' + file_ext)
+        with open(context_path, 'w') as f:
+            f.write(instance.contextCode)
+        await runtime.copy_to(context_path, '/testing_files')
+
+        golden_path = os.path.join(tmpdir, 'golden.' + file_ext)
+        with open(golden_path, 'w') as f:
+            f.write(instance.goldenCode)
+        await runtime.copy_to(golden_path, '/testing_files')
+
+        testcase_json = {
+            'test_case_id': instance.test_case_id,
+            'num_cases': 1000,
+            'language': instance.language.lower(),
+        }
+        testcase_path = os.path.join(tmpdir, 'testcase_biocoder.json')
+        with open(testcase_path, 'w') as f:
+            f.write(json.dumps(testcase_json, indent=4))
+
+        await runtime.copy_to(testcase_path, '/testing_files')
+
+    # copy the remove_code.py helper that is executed at the end of this function
+    remove_code_script = os.path.join(
+        os.path.dirname(__file__), 'scripts', 'setup', 'remove_code.py'
+    )
+    await runtime.copy_to(remove_code_script, '/testing_files')
+
+    action = CmdRunAction(command='cd /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    # download repository archive
+    repository_url = f"https://biocoder.lilbillbiscuit.com/repos/{instance.repository.split('/')[1]}.zip"
+    action = CmdRunAction(command='wget -O repo.zip ' + repository_url)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0, f'Failed to download the repository: {obs.content}'
+
+    # unzip the repository, then delete the archive
+    action = CmdRunAction(command='unzip -o -q repo.zip && rm repo.zip')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0, f'Failed to unzip the repository: {obs.content}'
+
+    # give every user full permissions on the workspace tree
+    action = CmdRunAction(command='chmod -R 777 /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0, f'Failed to chmod the files: {obs.content}'
+
+    # run remove_code.py on the target file over the instance's line range
+    target_filepath = os.path.join(
+        '/workspace', instance.repository.split('/')[1], instance.filePath
+    )
+    line_start = instance.lineStart
+    line_end = instance.lineEnd
+    language = instance.language.lower()
+    action = CmdRunAction(
+        command=f'python3 /testing_files/remove_code.py --target_filepath {target_filepath} --line_start {line_start} --line_end {line_end} --language {language}'
+    )
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0, f'Failed to remove the code: {obs.content}'
+
+    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
+
+
+async def complete_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
+) -> dict[str, Any]:
+    """Complete the runtime for the agent.
+
+    This function is called after the agent has finished running in the runtime.
+    If you need to do something in the sandbox to get the correctness metric after
+    the agent has run, modify this function.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
+    obs: CmdOutputObservation

-def get_test_result(instance, sandbox, workspace_dir_name):
    test_result = {'result': {}, 'metadata': {}}
-    try:
-        code = sandbox.get_changed_code(include_signature=True)
-        sandbox.copy_changed_code()
+
+    copy_changed_code_script = os.path.join(
+        os.path.dirname(__file__), 'scripts', 'setup', 'copy_changed_code.py'
+    )
+    await runtime.copy_to(copy_changed_code_script, '/testing_files')
+
+    file_ext = FILE_EXT_MAP[instance.language.lower()]
+    target_filepath = os.path.join(
+        '/workspace', instance.repository.split('/')[1], instance.filePath
+    )
+    generated_path = os.path.join('/testing_files', 'generated.' + file_ext)
+
+    action = CmdRunAction(
+        command=f'python3 /testing_files/copy_changed_code.py --target_filepath {target_filepath} --generated_code_filepath {generated_path} --line_start {instance.lineStart} --include_signature'
+    )
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    if obs.exit_code == 0:
        test_result['metadata']['1_copy_change_success'] = True
+
+        action = CmdRunAction(command=f'cat {generated_path}', keep_prompt=False)
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = await runtime.run_action(action)
+        assert obs.exit_code == 0
+
+        code = obs.content
        test_result['metadata']['1_copy_change_code'] = code
-    except Exception:
-        logger.error('Error fetching changed code for this instance')
+    else:
        test_result['metadata']['1_copy_change_success'] = False
        test_result['metadata']['1_copy_change_code'] = None

-    exit_code, output = sandbox.execute_and_check(
-        'cd /testing',
-        'Failed to cd /testing',
-    )
-    logger.info(f'cd $REPO_PATH: {output}')
+    action = CmdRunAction(command='cd /testing_files')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0

-    exit_code, output = sandbox.execute_and_check(
-        'whoami',
-        'Failed to run whoami',
+    action = CmdRunAction(
+        command='/home/devin/mambaforge/bin/mamba run -n test python3 /testing/start_test_opendevin.py'
    )
-    logger.info(f'whoami: {output}')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert obs.exit_code == 0

-    exit_code, output = sandbox.execute(
-        '/home/devin/mambaforge/bin/mamba run -n test python3 /testing/start_test_opendevin.py'
+    action = CmdRunAction(
+        command='cat /testing_files/results_biocoder.json', keep_prompt=False
    )
-    logger.info(f'$TEST_CMD:\n{output}')
-
-    exit_code, output = sandbox.execute_and_check(
-        'cat /testing_files/results_biocoder.json', 'Failed to read the result file'
-    )
-    if exit_code == 0:
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    if obs.exit_code == 0:
        test_result['metadata']['2_run_test_success'] = True
-        test_result['metadata']['2_run_test_result'] = str(output)
+        test_result['metadata']['2_run_test_result'] = str(obs.content)
+        json_obj = json.loads(obs.content)
+        test_result['result'] = json_obj['result']
    else:
        test_result['metadata']['2_run_test_success'] = False
-        test_result['metadata']['2_run_test_result'] = str(output)
-    json_obj = json.loads(output)
-    test_result['result'] = json_obj['result']
+        test_result['metadata']['2_run_test_result'] = str(obs.content)

+    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
    return test_result


-def process_instance(
+async def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
-):
-    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
+) -> EvalOutput:
+    config = get_config(metadata)
    instance = BiocoderData(**instance)
    print(instance)
-    workspace_dir_name = (
-        f'{instance.repository}__{instance.test_case_id[:10]}__{os.getpid()}'.replace(
-            '/', '__'
-        )
-    )
-    workspace_mount_path = os.path.join(config.workspace_base, workspace_dir_name)
-    # create process-specific workspace dir
-    # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
-    # so that different agent don't interfere with each other.
-    workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
-    pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+    instance_id = f'{instance.repository}__{instance.instance_id[:10]}'

-    # Setup the logger properly, so you can run multi-processing to parallize the evaluation
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
-        # Set up logger
-        log_file = os.path.join(
-            metadata.eval_output_dir, 'logs', f'instance_{instance.test_case_id}.log'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        # add back the console handler to print ONE line
-        logger.addHandler(get_console_handler())
-        logger.info(
-            f'Starting evaluation for instance {instance.test_case_id}.\nHint: run "tail -f {log_file}" to see live logs in a seperate shell'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setFormatter(
-            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-        )
-        logger.addHandler(file_handler)
-
-    logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
-
-    # NOTE: this is something special we do for SWE-Bench due to the reason described in the previous section
-    # You can omit this if you don't need to setup specialized sandbox
-    workspace_dir_name = f'{instance.repository}__{instance.test_case_id[:10]}'.replace(
-        '/', '__'
-    )
-    sandbox = BiocoderSSHBox.get_box_for_instance(
-        instance,
-        workspace_dir_name,
-        skip_workspace_mount=False,
-        workspace_mount_path=workspace_mount_path,
-        sandbox_plugins=agent.sandbox_plugins,
-    )
-
-    sandbox.remove_code()
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance_id}.')

    # Prepare instruction
    instruction = (
@@ -160,80 +272,76 @@ def process_instance(
        'Make sure to include proper formatting in Java and Python, including correct braces and/or indentation.\n'
    )
    # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
+    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    # use a session id for concurrent evaluation
-    sid = instance.test_case_id.replace('/', '__')
+    sid = instance.instance_id.replace('/', '__')
+
+    runtime = await create_runtime(config, sid=sid)
+
+    await initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State | None = asyncio.run(
-        run_agent_controller(
-            agent,
-            instruction,
-            max_iterations=metadata.max_iterations,
-            max_budget_per_task=config.max_budget_per_task,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
-                agent.__class__.__name__
-            ],
-            sandbox=sandbox,
-            sid=sid,
-        )
+    state: State | None = await run_controller(
+        config=config,
+        task_str=instruction,
+        runtime=runtime,
+        fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class],
    )

-    test_result = get_test_result(instance, sandbox, workspace_dir_name)
-
    if state is None:
        raise ValueError('State should not be None.')
-    metrics = state.metrics.get() if state.metrics else None

+    test_result = await complete_runtime(runtime, instance)
+    metrics = state.metrics.get() if state.metrics else None
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
    histories = state.history.compatibility_for_eval_history_pairs()

-    # Save the output
-    output = {
-        'test_case_id': instance.test_case_id,
-        'biocoder_instance': instance.to_dict(),
-        'instruction': instruction,
-        'generated': test_result['metadata']['1_copy_change_code'],
-        'metadata': metadata.model_dump(),
-        'history': histories,
-        'metrics': metrics,
-        'error': state.last_error if state and state.last_error else None,
-        'test_result': test_result,
-    }
+    test_result['generated'] = test_result['metadata']['1_copy_change_code']

-    # Close the sandbox
-    sandbox.close()
+    # Save the output
+    output = EvalOutput(
+        instance_id=instance.instance_id,
+        instance=instance.to_dict(),
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result=test_result,
+    )
    return output


 if __name__ == '__main__':
-    id_column = 'test_case_id'
    args = parse_arguments()
-    dataset = load_dataset('lilbillbiscuit/biocoder_public')
-    biocoder_tests = dataset['test'].to_pandas()

-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
-    logger.info(f'Config for evaluation: {config}')
+    dataset = load_dataset('lilbillbiscuit/biocoder_public')
+    biocoder_tests = dataset['train'].to_pandas()
+    biocoder_tests['instance_id'] = biocoder_tests['test_case_id']
+
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config,
-        args.dataset_name,
+        'biocoder',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
+    instances = prepare_dataset(biocoder_tests, output_file, args.eval_n_limit)

-    run_evaluation(
-        instances,
-        metadata,
-        output_file,
-        args.eval_num_workers,
-        process_instance,
-        id_column,
+    asyncio.run(
+        run_evaluation(
+            instances, metadata, output_file, args.eval_num_workers, process_instance
+        )
    )
--- a/evaluation/biocoder/scripts/run_infer.sh
+++ b/evaluation/biocoder/scripts/run_infer.sh
--- a/evaluation/biocoder/scripts/setup/copy_changed_code.py
+++ b/evaluation/biocoder/scripts/setup/copy_changed_code.py
@@ -0,0 +1,61 @@
+import argparse
+
+
+def get_changed_code(target_filepath, line_start, include_signature=False):
+    """Return the region of ``target_filepath`` between the start index and the marker.
+
+    Collection starts at index ``line_start`` of the file's lines (one line
+    earlier when ``include_signature`` is set) and stops just before the first
+    line whose stripped text equals the stripped marker stored in
+    /testing_files/first_line_after_removed.txt. When no line matches,
+    everything up to the end of the file is returned.
+
+    Args:
+        target_filepath: Path of the source file to read.
+        line_start: Used as a 0-based index into the file's lines.
+        include_signature: Start one line before ``line_start``.
+
+    Returns:
+        The collected lines joined with newlines.
+    """
+    selected_lines = []
+    offset = 1 if include_signature else 0
+
+    with open('/testing_files/first_line_after_removed.txt', 'r') as f:
+        marker = f.read().strip()
+    # read() returns '' (never None), so emptiness is the condition to report:
+    # an empty marker equals any blank line and cuts the result short there.
+    if not marker:
+        print('First line after removed is empty')
+
+    with open(target_filepath, 'r') as f:
+        lines = f.read().split('\n')
+    for i in range(line_start - offset, len(lines)):
+        if lines[i].strip() == marker:
+            break
+        selected_lines.append(lines[i])
+    return '\n'.join(selected_lines)
+
+
+def copy_changed_code(
+    target_filepath, generated_code_filepath, line_start, include_signature=False
+):
+    """Write the text from ``get_changed_code`` to ``generated_code_filepath``."""
+    changed_code = get_changed_code(target_filepath, line_start, include_signature)
+    with open(generated_code_filepath, 'w') as f:
+        f.write(changed_code)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--target_filepath', type=str, required=True)
+    parser.add_argument('--generated_code_filepath', type=str, required=True)
+    parser.add_argument('--line_start', type=int, required=True)
+    parser.add_argument('--include_signature', action='store_true')
+    args = parser.parse_args()
+    copy_changed_code(
+        args.target_filepath,
+        args.generated_code_filepath,
+        args.line_start,
+        args.include_signature,
+    )
--- a/evaluation/biocoder/scripts/setup/remove_code.py
+++ b/evaluation/biocoder/scripts/setup/remove_code.py
@@ -0,0 +1,116 @@
+import argparse
+import os
+import re
+from collections import defaultdict
+
+
+def get_likely_indent_size(array_of_tabs) -> int:
+    """Guess the indentation step from a list of per-line indent widths.
+
+    Every positive increase in width between consecutive lines is tallied and
+    the most frequent increase is returned; 4 is returned when there is none.
+    """
+    sizes = defaultdict(int)
+
+    for i in range(len(array_of_tabs) - 1):
+        diff = array_of_tabs[i + 1] - array_of_tabs[i]
+        if diff > 0:
+            sizes[diff] += 1
+    if len(sizes) == 0:
+        return 4
+    return int(max(sizes, key=sizes.get))
+
+
+def get_target_filepath(self):
+    # NOTE(review): takes ``self`` at module level and is not called anywhere in
+    # this script; it looks like a leftover from a class method. Confirm.
+    target_filepath = os.path.join(
+        self.workspace_mount_path,
+        self.biocoder_instance.repository.split('/')[1],
+        self.biocoder_instance.filePath,
+    )
+    return target_filepath
+
+
+def remove_code(target_filepath: str, line_start: int, line_end: int, language: str):
+    """Replace a span of ``target_filepath`` with a TODO placeholder, in place.
+
+    The lines up to index ``line_start - 1`` (read as the signature line) are
+    kept, followed by a TODO comment indented one step deeper than that line,
+    two empty lines, and the original lines from index ``line_end`` onwards.
+    The first non-blank line at or after index ``line_start`` of the result is
+    saved to /testing_files/first_line_after_removed.txt.
+
+    Args:
+        target_filepath: File to rewrite.
+        line_start: Number of leading lines to keep.
+        line_end: Index of the first original line kept after the placeholder.
+        language: Language name, matched case-insensitively to pick the
+            comment prefix.
+
+    Raises:
+        KeyError: If ``language`` has no comment prefix registered.
+    """
+    # C, C++, JavaScript and TypeScript all accept ``//`` line comments.
+    comment_prefix = {
+        'python': '#',
+        'java': '//',
+        'c': '//',
+        'cpp': '//',
+        'javascript': '//',
+        'typescript': '//',
+    }
+
+    with open(target_filepath, 'r') as f:
+        lines = f.read().split('\n')
+        # print("="*10+"ORIGINAL"+"="*10)
+        # print("\n".join(lines))
+        signature_line = lines[line_start - 1]
+
+        # width of the leading whitespace, in characters
+        def get_indent_size(s: str):
+            return len(re.match(r'\s*', s).group())
+
+        indent_sizes = list(map(get_indent_size, lines))
+        indent_size = get_likely_indent_size(indent_sizes)
+        comment_indent_size = get_indent_size(signature_line) + indent_size
+        lines = (
+            lines[:line_start]
+            + [
+                f"{' '*comment_indent_size+comment_prefix[language.lower()]}TODO: replace with your code here"
+            ]
+            + ([''] * 2)
+            + lines[line_end:]
+        )
+    # NOTE(review): index ``line_start`` is the TODO placeholder inserted above,
+    # so this scan appears to stop there at once — confirm that is intended.
+    # Check the bound before indexing so the scan cannot run past the end.
+    first_line_after_removed_index = line_start
+    while (
+        first_line_after_removed_index < len(lines)
+        and len(lines[first_line_after_removed_index].strip()) == 0
+    ):
+        first_line_after_removed_index += 1
+
+    # Fall back to an empty marker when only blank lines remain.
+    first_line_after_removed = (
+        lines[first_line_after_removed_index]
+        if first_line_after_removed_index < len(lines)
+        else ''
+    )
+    print('FIRST LINE AFTER REMOVED: ', first_line_after_removed)
+    with open('/testing_files/first_line_after_removed.txt', 'w') as f:
+        f.write(first_line_after_removed)
+
+    with open(target_filepath, 'w') as f:
+        f.write('\n'.join(lines))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--target_filepath', type=str, required=True)
+    parser.add_argument('--line_start', type=int, required=True)
+    parser.add_argument('--line_end', type=int, required=True)
+    parser.add_argument('--language', type=str, required=True)
+    args = parser.parse_args()
+    remove_code(args.target_filepath, args.line_start, args.line_end, args.language)
--- a/evaluation/biocoder/utils.py
+++ b/evaluation/biocoder/utils.py
@@ -0,0 +1,40 @@
+from dataclasses import dataclass
+
+
+# Record for a single BioCoder benchmark instance.
+@dataclass
+class BiocoderData:
+    instance_id: str
+    filePath: str
+    numLines: int
+    lineStart: int
+    lineEnd: int
+    signature: str
+    comment: str
+    content: str
+    repository: str
+    promptSummaryOnly: str
+    contextCode: str
+    goldenCode: str
+    test_case_id: str
+    language: str
+
+    def to_dict(self):
+        """Return the record as a plain dict, leaving out ``instance_id``."""
+        # Listed explicitly to pin both the key set and the key order.
+        exported = (
+            'filePath',
+            'numLines',
+            'lineStart',
+            'lineEnd',
+            'signature',
+            'comment',
+            'content',
+            'repository',
+            'promptSummaryOnly',
+            'contextCode',
+            'goldenCode',
+            'test_case_id',
+            'language',
+        )
+        return {name: getattr(self, name) for name in exported}
--- a/evaluation/bird/README.md
+++ b/evaluation/bird/README.md
@@ -2,43 +2,14 @@

 Implements evaluation of agents on BIRD introduced in [Can LLM Already Serve as A Database Interface? A BIg Bench for Large-Scale Database Grounded Text-to-SQLs](https://arxiv.org/abs/2305.03111). Please see [here](https://bird-bench.github.io/) for the reference implementation used in the paper.

-## Setup Environment
+## Setup Environment and LLM Configuration

-Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to setup local develop environment for OpenDevin.
-
-
-## Configure OpenDevin and your LLM
-
-Create a `config.toml` file if it does not exist at the root of the workspace.
-
-Add the following configurations:
-
-```toml
-[core]
-max_iterations = 100
-cache_dir = "/tmp/cache"
-ssh_hostname = "localhost"
-
-[sandbox]
-enable_auto_lint = true
-
-# TODO: Change these to the model you want to evaluate
-[llm.eval_gpt4_1106_preview]
-model = "gpt-4-1106-preview"
-api_key = "XXX"
-temperature = 0.0
-
-[llm.eval_some_openai_compatible_model]
-model = "openai/MODEL_NAME"
-base_url = "https://OPENAI_COMPATIBLE_URL/v1"
-api_key = "XXX"
-temperature = 0.0
-```
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

 ## Run Inference on Bird

 ```bash
-./evaluation/bird/scripts/run_infer.sh eval_gpt4_1106_preview [model_config] [git-version]
+./evaluation/bird/scripts/run_infer.sh [model_config] [git-version]
 ```

 - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -1,12 +1,12 @@
 import asyncio
 import json
-import logging
 import os
 import pathlib
 import re
-import shutil
 import sqlite3
 import subprocess
+import zipfile
+from typing import Any

 import pandas as pd
 from datasets import load_dataset
@@ -15,20 +15,24 @@ from tqdm import tqdm

 from evaluation.utils.shared import (
    EvalMetadata,
+    EvalOutput,
    make_metadata,
    prepare_dataset,
+    reset_logger_for_multiprocessing,
    run_evaluation,
 )
-from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
-from opendevin.core.logger import get_console_handler
+from opendevin.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    parse_arguments,
+)
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import run_agent_controller
-from opendevin.events.action import MessageAction
-from opendevin.llm.llm import LLM
-
-config = load_app_config()
+from opendevin.core.main import create_runtime, run_controller
+from opendevin.events.action import CmdRunAction, MessageAction
+from opendevin.events.observation import CmdOutputObservation
+from opendevin.runtime.runtime import Runtime


 def codeact_user_response(state: State) -> str:
@@ -62,6 +66,27 @@ AGENT_CLS_TO_INST_SUFFIX = {
 }


+def get_config(
+    metadata: EvalMetadata,
+) -> AppConfig:
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_devin=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            container_image='ubuntu:22.04',
+            enable_auto_lint=True,
+            use_host_network=False,
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    config.set_llm_config(metadata.llm_config)
+    return config
+
+
 def execute_sql(db_path, gen_sql, gold_sql):
    """Execute the generated SQL and the ground truth SQL and compare the results."""
    with sqlite3.connect(db_path) as conn:
@@ -76,12 +101,213 @@ def execute_sql(db_path, gen_sql, gold_sql):
    return res


-def get_test_result(instance, path, timeout=30):
+LOCAL_DATASET_PATH = os.path.join(os.path.dirname(__file__), 'data')
+
+
+def load_bird():
+    """Main function to handle the flow of downloading, processing, and loading the bird dataset."""
+
+    def _download_bird():
+        """Download and extract the BIRD dev set under LOCAL_DATASET_PATH; return its 'dev' dir.
+
+        Nothing is done when the 'dev' dir already exists. A failing wget raises CalledProcessError.
+        """
+        devset_path = os.path.join(LOCAL_DATASET_PATH, 'dev')
+        if not os.path.exists(devset_path):
+            logger.info(
+                f'{devset_path} folder does not exist, starting download and extraction...'
+            )
+            os.makedirs(LOCAL_DATASET_PATH, exist_ok=True)
+            download_url = 'https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip'
+            download_path = os.path.join(LOCAL_DATASET_PATH, 'dev.zip')
+            if not os.path.exists(download_path):
+                logger.info('Start Downloading...')
+                partial_path = download_path + '.part'  # promoted only on success
+                subprocess.run(['wget', download_url, '-O', partial_path], check=True)
+                os.replace(partial_path, download_path)
+                logger.info('Download completed.')
+
+            logger.info('Start Extracting...')
+            os.makedirs(devset_path, exist_ok=True)
+            with zipfile.ZipFile(download_path, 'r') as zip_ref:
+                zip_ref.extractall(devset_path)
+            # move everything in 'dev_20240627' to the root folder
+            nested = os.path.join(devset_path, 'dev_20240627')
+            for file in os.listdir(nested):
+                os.rename(os.path.join(nested, file), os.path.join(devset_path, file))
+            os.rmdir(nested)
+            logger.info('Extraction completed.')
+
+            # extract databases
+            database_path = os.path.join(devset_path, 'dev_databases.zip')
+            assert os.path.exists(database_path)
+            logger.info('Start Extracting...')
+            with zipfile.ZipFile(database_path, 'r') as zip_ref:
+                zip_ref.extractall(devset_path)
+            logger.info('Extraction completed.')
+        else:
+            logger.info(f'{devset_path} folder already exists.')
+        return devset_path
+
+    def _extract_create_table_prompt(db_path, limit_value=0):
+        """Build a schema prompt: every table's CREATE TABLE statement plus optional sample rows.
+
+        Args:
+            db_path: path of the SQLite database file to inspect.
+            limit_value: example rows appended per table; 0 (default) emits the schema only.
+
+        Returns:
+            The assembled prompt string.
+        """
+        table_query = "SELECT * FROM sqlite_master WHERE type='table';"
+        # One connection for the whole scan, closed even if a query fails.
+        conn = sqlite3.connect(db_path)
+        try:
+            cursor = conn.cursor()
+            prompt = ''
+            for table in cursor.execute(table_query).fetchall():
+                # sqlite_master columns: type, name, tbl_name, rootpage, sql
+                table_name, create_table_statement = table[1], table[-1]
+                table_info_query = f'PRAGMA table_info(`{table_name}`);'
+                top_k_row_query = f'SELECT * FROM `{table_name}` LIMIT {limit_value};'
+                try:
+                    headers = [x[1] for x in cursor.execute(table_info_query).fetchall()]
+                except Exception:
+                    logger.error(f'Error Connection: {table_info_query}, {top_k_row_query}')
+                    raise  # not exit(0): that would report success
+
+                prompt += create_table_statement + ';\n'
+                if limit_value > 0:
+                    top_k_rows = cursor.execute(top_k_row_query).fetchall()
+                    prompt += f'/*\n{limit_value} example rows:\n{top_k_row_query}\n'
+                    prompt += '    '.join(headers) + '\n'
+                    for row in top_k_rows:
+                        # map NULL before str(), otherwise it renders as 'None'
+                        cells = ['' if x is None else str(x) for x in row]
+                        prompt += '    '.join(cells) + '\n'
+                    prompt += '*/\n'
+                prompt += '\n'
+            return prompt
+        finally:
+            conn.close()
+
+    def _create_prompt(e, database_path):
+        """Assemble the text-to-SQL prompt (schema, evidence, question) for one example."""
+        sqlite_file = pathlib.Path(database_path) / e['db_id'] / f"{e['db_id']}.sqlite"
+        sections = [
+            # CREATE TABLE statements (and any sample data) read from the database
+            _extract_create_table_prompt(sqlite_file),
+            f"-- External Knowledge: {e['evidence']}\n\n",
+            '-- Using valid SQLite and understanding External Knowledge, answer the following questions for the tables provided above.\n\n',
+            '-- Using valid SQLite, answer the following questions for the tables provided above.\n',
+            f"Question: {e['question']}\n",
+        ]
+
+        return ''.join(sections)
+
+    def _process_bird(dataset_path):
+        """Convert the raw dev set in `dataset_path` into processed_dev.json and load it.
+
+        An existing processed file is reused; returns `load_dataset`'s result for it ('test' split).
+        """
+        processed_path = os.path.join(dataset_path, 'processed_dev.json')
+        if not os.path.exists(processed_path):
+            logger.info(
+                f'{processed_path} folder does not exist, starting processing...'
+            )
+            database_path = os.path.join(dataset_path, 'dev_databases')
+            # explicit UTF-8 so parsing does not depend on the platform's locale encoding
+            with open(os.path.join(dataset_path, 'dev.json'), encoding='utf-8') as f:
+                data = json.load(f)
+            processed_data = []
+            for idx, e in enumerate(tqdm(data)):
+                db_file = os.path.join(database_path, e['db_id'], f"{e['db_id']}.sqlite")
+                item = {
+                    'instance_id': f'{idx}',
+                    'db_path': db_file,
+                    'db_id': e['db_id'],
+                    'instruction': _create_prompt(e, database_path),
+                    'SQL': e['SQL'],
+                }
+                processed_data.append(item)
+            with open(processed_path, 'w', encoding='utf-8') as f:
+                json.dump(processed_data, f, indent=2)
+            logger.info(f'Processed data saved to {processed_path}')
+        else:
+            logger.info(f'{processed_path} folder already exists.')
+        return load_dataset('json', data_files={'test': processed_path})
+
+    raw_dataset_path = _download_bird()
+    bird_dataset = _process_bird(raw_dataset_path)
+    return bird_dataset
+
+
+async def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # read for `db_id`, which selects the database to copy
+):
+    """Stage the instance's SQLite database in the runtime's `/workspace`.
+
+    Copies `<db_id>.sqlite` from the local dev set through `runtime.copy_to`, then
+    lists `/workspace` and asserts that the file is present.
+
+    Args:
+        runtime: runtime that receives the file and executes the check command.
+        instance: dataset row; only its `db_id` is read here.
+
+    Raises:
+        AssertionError: if the listing fails or does not mention the file.
+    """
+    banner = '-' * 50
+    logger.info(f'{banner} BEGIN Runtime Initialization Fn {banner}')
+
+    # Push the database file from the host dataset folder into the workspace.
+    sqlite_name = f'{instance.db_id}.sqlite'
+    dev_databases = os.path.join(LOCAL_DATASET_PATH, 'dev', 'dev_databases')
+    host_db_file = os.path.join(dev_databases, instance.db_id, sqlite_name)
+    await runtime.copy_to(host_db_file, '/workspace')
+
+    # Confirm the copy landed by listing the workspace.
+    check_cmd = CmdRunAction(command='cd /workspace && ls -l', keep_prompt=False)
+    listing: CmdOutputObservation = await runtime.run_action(check_cmd)
+    logger.info(listing, extra={'msg_type': 'OBSERVATION'})
+    assert listing.exit_code == 0
+    assert sqlite_name in listing.content
+    logger.info(f'{banner} END Runtime Initialization Fn {banner}')
+
+
+async def complete_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
+) -> dict[str, Any]:
+    """Complete the runtime for the agent.
+
+    This function is called after the agent has finished running in the runtime.
+    If you need to do something in the sandbox to get the correctness metric after
+    the agent has run, modify this function.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
+    obs: CmdOutputObservation
+    timeout = 30
+
    test_result = {'result': {}, 'metadata': {}}

    # Read the generated python file
-    with open(path, 'r') as f:
-        gen_file = f.read()
+    instance_id = instance.instance_id.replace('/', '__')
+    path = os.path.join('/workspace', f'{instance_id}.py')
+
+    action = CmdRunAction(
+        command=f'cat {path}',
+        keep_prompt=False,
+    )
+    obs = await runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    if obs.exit_code != 0:
+        test_result['result'] = {'passed': 0, 'status': 'error'}
+        return test_result
+
+    gen_file = obs.content.strip().replace('\r\n', '\n')

    # Extract the SQL from the python file
    gen_sql = ''
@@ -96,7 +322,13 @@ def get_test_result(instance, path, timeout=30):
    # Execute the SQL
    try:
        res = func_timeout(
-            timeout, execute_sql, args=(instance.db_path, gen_sql, gold_sql)
+            timeout,
+            execute_sql,
+            args=(
+                instance.db_path,
+                gen_sql,
+                gold_sql,
+            ),
        )
        status = 'success'
    except FunctionTimedOut:
@@ -114,68 +346,28 @@ def get_test_result(instance, path, timeout=30):
        'gen_sql': gen_sql,
        'gold_sql': gold_sql,
    }
+    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
    return test_result


-def process_instance(
+async def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
-):
-    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
-    workspace_mount_path = os.path.join(
-        config.workspace_mount_path, 'bird_eval_workspace'
-    )
-    workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
-    pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
-
-    # reset workspace to config
-    config.workspace_mount_path = workspace_mount_path
-
-    # Copy the database to the workspace
-    db_root = os.path.join(
-        config.workspace_base, 'evaluation_bird/dev/dev_databases', instance.db_id
-    )
-    target_path = os.path.join(workspace_mount_path, f'{instance.db_id}')
-    if not os.path.exists(target_path):
-        logger.info(f'Copying database from {db_root} to {target_path}...')
-        shutil.copytree(db_root, target_path)
-
-    # Set up the database path
-    database_path = os.path.join(instance.db_id, f'{instance.db_id}.sqlite')
-
+) -> EvalOutput:
+    config = get_config(metadata)
    # use session id for concurrent evaluation
-    sid = instance.task_id.replace('/', '__')
+    instance_id = instance.instance_id.replace('/', '__')

    # Set up the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
-        # Set up logger
-        log_file = os.path.join(
-            metadata.eval_output_dir,
-            'logs',
-            f'instance_{sid}.log',
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        # add back the console handler to print ONE line
-        logger.addHandler(get_console_handler())
-        logger.info(
-            f'Starting evaluation for instance {instance.task_id}.\nLOG:   tail -f {log_file}'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setFormatter(
-            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-        )
-        logger.addHandler(file_handler)
-
-    logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance_id}.')

    # Create file with BIRD instance
+    database_path = os.path.join('/workspace', f'{instance.db_id}.sqlite')
    statements = f"""
    import sqlite3
    def execute_sql(db_path, sql):
@@ -192,12 +384,12 @@ def process_instance(
        result = execute_sql(db_path, sql)
        print(result)
    """
-    path = os.path.join(config.workspace_mount_path, f'{sid}.py')
+
    instruction = (
        f'You are a SQL expert and need to complete the following text-to-SQL tasks.'
        f'\n\n{instance.instruction}\n\n'
        'Please write the SQL in one line without line breaks.'
-        f'And write a new python file named {sid}.py to call the SQL you wrote.'
+        f'And write a new python file named {instance_id}.py to call the SQL you wrote.'
        'You need to follow the code template below:'
        f'\n\n{statements}\n\n'
        'Environment has been set up for you to start working.'
@@ -208,23 +400,21 @@ def process_instance(
        'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
    )
    # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
+    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
+
+    runtime = await create_runtime(config, sid=instance_id)
+    await initialize_runtime(runtime, instance)
+
    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State | None = asyncio.run(
-        run_agent_controller(
-            agent,
-            instruction,
-            max_iterations=metadata.max_iterations,
-            max_budget_per_task=config.max_budget_per_task,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
-                agent.__class__.__name__
-            ],
-            sid=sid,
-        )
+    state: State | None = await run_controller(
+        config=config,
+        task_str=instruction,
+        fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class],
+        runtime=runtime,
    )

    # ======= Attempt to evaluate the agent's edits =======
-    test_result = get_test_result(instance, path)
+    test_result = await complete_runtime(runtime, instance)

    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
@@ -238,162 +428,43 @@ def process_instance(
    histories = state.history.compatibility_for_eval_history_pairs()

    # Save the output
-    output = {
-        'task_id': instance.task_id,
-        'instruction': instruction,
-        'metadata': metadata.model_dump(),
-        'history': histories,
-        'metrics': metrics,
-        'error': state.last_error if state and state.last_error else None,
-        'test_result': test_result,
-    }
+    output = EvalOutput(
+        instance_id=instance.instance_id,
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result=test_result,
+    )
    return output


-def load_bird():
-    """Main function to handle the flow of downloading, processing, and loading the bird dataset."""
-    raw_dataset_path = download_bird()
-    bird_dataset = process_bird(raw_dataset_path)
-    return bird_dataset
-
-
-def download_bird():
-    """Downloads and extracts the bird dataset from a specified URL into a local directory."""
-    dataset_path = os.path.join(config.workspace_base, 'evaluation_bird')
-    devset_path = os.path.join(dataset_path, 'dev')
-    if not os.path.exists(dataset_path):
-        logger.info(
-            f'{dataset_path} folder does not exist, starting download and extraction...'
-        )
-        os.makedirs(dataset_path, exist_ok=True)
-        download_url = 'https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip'
-        download_path = os.path.join(dataset_path, 'dev.zip')
-        logger.info('Start Downloading...')
-        subprocess.run(['wget', download_url, '-O', download_path])
-        logger.info('Download completed.')
-        logger.info('Start Extracting...')
-        subprocess.run(['unzip', download_path, '-d', dataset_path])
-        # extract databases
-        devset_path = os.path.join(dataset_path, 'dev')
-        database_path = os.path.join(devset_path, 'dev_databases.zip')
-        subprocess.run(['unzip', database_path, '-d', devset_path])
-        logger.info('Extraction completed.')
-    else:
-        logger.info(f'{dataset_path} folder already exists.')
-    return devset_path
-
-
-def process_bird(dataset_path):
-    """Processes the raw bird dataset into a structured format and saves it as JSON."""
-    processed_path = os.path.join(dataset_path, 'processed_dev.json')
-    if not os.path.exists(processed_path):
-        logger.info(f'{processed_path} folder does not exist, starting processing...')
-        raw_data_path = os.path.join(dataset_path, 'dev.json')
-        database_path = os.path.join(dataset_path, 'dev_databases')
-        processed_data = []
-        with pathlib.Path(raw_data_path).open('r') as f:
-            data = json.load(f)
-            for e in tqdm(data):
-                item = {
-                    'task_id': f'{len(processed_data)}',
-                    'db_path': os.path.join(
-                        database_path, e['db_id'], f"{e['db_id']}.sqlite"
-                    ),
-                    'db_id': e['db_id'],
-                    'instruction': create_prompt(e, database_path),
-                    'SQL': e['SQL'],
-                }
-                processed_data.append(item)
-
-        with pathlib.Path(processed_path).open('w') as f:
-            json.dump(processed_data, f, indent=2)
-            logger.info(f'Processed data saved to {processed_path}')
-    else:
-        logger.info(f'{processed_path} folder already exists.')
-    bird_dataset = load_dataset('json', data_files={'test': processed_path})
-    return bird_dataset
-
-
-def extract_create_table_prompt(db_path, limit_value=0):
-    """Generates a SQL prompt with CREATE TABLE statements and sample data from the database."""
-    table_query = "SELECT * FROM sqlite_master WHERE type='table';"
-    tables = sqlite3.connect(db_path).cursor().execute(table_query).fetchall()
-    prompt = ''
-    for table in tables:
-        table_name = table[1]
-        create_table_statement = table[-1]
-
-        table_info_query = f'PRAGMA table_info(`{table_name}`);'
-        top_k_row_query = f'SELECT * FROM {table_name} LIMIT {limit_value};'
-        try:
-            headers = [
-                x[1]
-                for x in sqlite3.connect(db_path)
-                .cursor()
-                .execute(table_info_query)
-                .fetchall()
-            ]
-        except Exception:
-            logger.error(f'Error Connection: {table_info_query}, {top_k_row_query}')
-            exit(0)
-
-        prompt += create_table_statement + ';\n'
-        if limit_value > 0:
-            top_k_rows = (
-                sqlite3.connect(db_path).cursor().execute(top_k_row_query).fetchall()
-            )
-            prompt += (
-                f"/*\n3 example rows:\n{top_k_row_query}\n{'    '.join(headers)}\n"
-            )
-            for row in top_k_rows:
-                row = [str(x) for x in row]
-                row = [x if x is not None else '' for x in row]
-                prompt += '    '.join(row) + '\n'
-            prompt += '*/\n'
-        prompt += '\n'
-    return prompt
-
-
-def create_prompt(e, database_path):
-    """Create a prompt for the given example"""
-    db_id = e['db_id']
-    db_path = pathlib.Path(database_path) / db_id / f'{db_id}.sqlite'
-
-    # Extract the CREATE TABLE statements and sample data from the database
-    prompt = extract_create_table_prompt(db_path)
-    prompt += f"-- External Knowledge: {e['evidence']}\n\n"
-    prompt += '-- Using valid SQLite and understanding External Knowledge, answer the following questions for the tables provided above.\n\n'
-    prompt += '-- Using valid SQLite, answer the following questions for the tables provided above.\n'
-    prompt += f"Question: {e['question']}\n"
-
-    return prompt
-
-
 if __name__ == '__main__':
-    id_column = 'task_id'
    args = parse_arguments()
    bird_dataset = load_bird()
    dataset = bird_dataset['test'].to_pandas()
+    dataset.rename(columns={'task_id': 'instance_id'}, inplace=True)

-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
-    logger.info(f'Config for evaluation: {config}')
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config,
-        args.dataset_name,
+        'BIRD',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
+    instances = prepare_dataset(dataset, output_file, args.eval_n_limit)

-    run_evaluation(
-        instances,
-        metadata,
-        output_file,
-        args.eval_num_workers,
-        process_instance,
-        id_column,
+    asyncio.run(
+        run_evaluation(
+            instances, metadata, output_file, args.eval_num_workers, process_instance
+        )
    )
--- a/evaluation/bird/scripts/run_infer.sh
+++ b/evaluation/bird/scripts/run_infer.sh
--- a/evaluation/browsing_delegation/README.md
+++ b/evaluation/browsing_delegation/README.md
@@ -5,30 +5,9 @@ Some of OpenDevin's agent supports agent delegation action, for example, CodeAct
 This evaluation tests whether CodeActAgent can correctly delegate the instruction from WebArena and MiniWob benchmark to the BrowsingAgent.
 If so, the browsing performance upper-bound of CodeActAgent will be the performance of BrowsingAgent.

+## Setup Environment and LLM Configuration

-## Setup Environment
-
-Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
-
-## Configure OpenDevin and your LLM
-
-Create a `config.toml` file if it does not exist at the root of the workspace.
-
-Add the following configurations:
-
-```toml
-# TODO: Change these to the model you want to evaluate
-[llm.eval_gpt4_1106_preview_llm]
-model = "gpt-4-1106-preview"
-api_key = "XXX"
-temperature = 0.0
-
-[llm.eval_some_openai_compatible_model_llm]
-model = "openai/MODEL_NAME"
-base_url = "https://OPENAI_COMPATIBLE_URL/v1"
-api_key = "XXX"
-temperature = 0.0
-```
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

 ## Run Inference

--- a/evaluation/browsing_delegation/run_infer.py
+++ b/evaluation/browsing_delegation/run_infer.py
@@ -1,5 +1,4 @@
 import asyncio
-import logging
 import os
 import re

@@ -9,56 +8,61 @@ from datasets import load_dataset

 from evaluation.utils.shared import (
    EvalMetadata,
+    EvalOutput,
    make_metadata,
    prepare_dataset,
+    reset_logger_for_multiprocessing,
    run_evaluation,
 )
-from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
-from opendevin.core.logger import get_console_handler
+from opendevin.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    parse_arguments,
+)
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import run_agent_controller
-from opendevin.llm.llm import LLM
-
-config = load_app_config()
+from opendevin.core.main import create_runtime, run_controller

 # Only CodeActAgent can delegate to BrowsingAgent
 SUPPORTED_AGENT_CLS = {'CodeActAgent'}


-def process_instance(
+def get_config(metadata: EvalMetadata) -> AppConfig:
+    """Build the AppConfig for one browsing-delegation run (needs max_iterations == 1)."""
+    assert (
+        metadata.max_iterations == 1
+    ), 'max_iterations must be 1 for browsing delegation evaluation.'
+    sandbox_settings = SandboxConfig(
+        container_image='ubuntu:22.04',
+        enable_auto_lint=False,
+        use_host_network=False,
+    )
+    app_config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_devin=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=sandbox_settings,
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    app_config.set_llm_config(metadata.llm_config)
+    return app_config
+
+
+async def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
-):
-    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
-    env_id = instance.instance_id
+) -> EvalOutput:
+    config = get_config(metadata)
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
-        # Set up logger
-        log_file = os.path.join(
-            metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        # add back the console handler to print ONE line
-        logger.addHandler(get_console_handler())
-        logger.info(
-            f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setFormatter(
-            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-        )
-        logger.addHandler(file_handler)
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
-        logger.info(f'Starting evaluation for instance {env_id}.')
+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    instruction = (
        f'You can delegate browsing tasks to a browser agent. '
@@ -67,21 +71,14 @@ def process_instance(
        f'NOTE: You should copy the "query" as is into the <execute_browse> tag. DO NOT change ANYTHING in the query.'
    )

-    state: State | None = asyncio.run(
-        run_agent_controller(
-            agent,
-            instruction,
-            max_iterations=metadata.max_iterations,
-            max_budget_per_task=config.max_budget_per_task,
-            sid=env_id,
-        )
+    runtime = await create_runtime(config, sid=instance.instance_id)
+
+    state: State | None = await run_controller(
+        config=config,
+        task_str=instruction,
+        runtime=runtime,
    )

-    # ======= Attempt to evaluate the agent's environment impact =======
-
-    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
-    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
-
    if state is None:
        raise ValueError('State should not be None.')

@@ -115,20 +112,19 @@ def process_instance(
            result['is_exact_match'] = is_exact_match

    # Save the output
-    output = {
-        'instance_id': env_id,
-        'instruction': instruction,
-        'metadata': metadata.model_dump(),
-        'history': histories,
-        'metrics': metrics,
-        'error': state.last_error if state and state.last_error else None,
-        'test_result': {
+    output = EvalOutput(
+        instance_id=instance.instance_id,
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result={
            'query': instance.instruction,
            'action': last_delegate_action,
            'result': result,
        },
-    }
-
+    )
    return output


@@ -138,9 +134,13 @@ if __name__ == '__main__':
    dataset = load_dataset('OpenDevin/eval-browsing-instructions')
    dataset = dataset['train'].to_pandas()
    assert dataset.columns.tolist() == ['instance_id', 'instruction']
-    id_column = 'instance_id'
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
-    logger.info(f'Config for evaluation: {config}')
+
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config,
@@ -150,18 +150,20 @@ if __name__ == '__main__':
        args.eval_note,
        args.eval_output_dir,
    )
+
    if metadata.agent_class not in SUPPORTED_AGENT_CLS:
        raise ValueError(
            f'Agent class {metadata.agent_class} not supported with AgentDelegation.'
        )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
-    run_evaluation(
-        instances,
-        metadata,
-        output_file,
-        args.eval_num_workers,
-        process_instance,
-        id_column,
+    instances = prepare_dataset(dataset, output_file, args.eval_n_limit)
+    asyncio.run(
+        run_evaluation(
+            instances,
+            metadata,
+            output_file,
+            args.eval_num_workers,
+            process_instance,
+        )
    )
--- a/evaluation/gaia/README.md
+++ b/evaluation/gaia/README.md
@@ -2,9 +2,9 @@

 This folder contains evaluation harness for evaluating agents on the [GAIA benchmark](https://arxiv.org/abs/2311.12983).

-## Configure OpenDevin and your LLM
+## Setup Environment and LLM Configuration

-Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

 ## Run the evaluation
 We are using the GAIA dataset hosted on [Hugging Face](https://huggingface.co/datasets/gaia-benchmark/GAIA).
--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -1,10 +1,7 @@
 import asyncio
-import logging
+import functools
 import os
-import pathlib
 import re
-import shutil
-from functools import partial

 import huggingface_hub
 import pandas as pd
@@ -13,28 +10,31 @@ from datasets import load_dataset
 from evaluation.gaia.scorer import question_scorer
 from evaluation.utils.shared import (
    EvalMetadata,
+    EvalOutput,
    codeact_user_response,
    make_metadata,
    prepare_dataset,
+    reset_logger_for_multiprocessing,
    run_evaluation,
 )
-from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
-from opendevin.core.logger import get_console_handler
+from opendevin.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    get_parser,
+)
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import run_agent_controller
-from opendevin.events.action import CmdRunAction, MessageAction
-from opendevin.llm.llm import LLM
+from opendevin.core.main import create_runtime, run_controller
+from opendevin.events.action import AgentFinishAction, CmdRunAction, MessageAction
+from opendevin.events.observation import CmdOutputObservation
+from opendevin.runtime.runtime import Runtime

-config = load_app_config()
-
-DATASET_CACHE_DIR = '~/.cache/open-devin/evals/gaia'
-DATASET_CACHE_DIR = os.path.expanduser(DATASET_CACHE_DIR)
+DATASET_CACHE_DIR = os.path.join(os.path.dirname(__file__), 'data')


 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
-    'CodeActAgent': partial(codeact_user_response, encapsulate_solution=True),
+    'CodeActAgent': functools.partial(codeact_user_response, encapsulate_solution=True),
 }

 AGENT_CLS_TO_INST_SUFFIX = {
@@ -42,151 +42,174 @@ AGENT_CLS_TO_INST_SUFFIX = {
 }


-def process_instance(
+def get_config(
+    metadata: EvalMetadata,
+) -> AppConfig:  # builds the AppConfig for one run from the eval metadata (agent class, max iterations, LLM config)
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_devin=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            container_image='ubuntu:22.04',
+            enable_auto_lint=True,
+            use_host_network=False,
+        ),
+        # do not mount a workspace: both workspace path settings are explicitly left as None
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    config.set_llm_config(metadata.llm_config)  # LLM settings come from the metadata, applied after construction
+    return config
+
+
+async def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # dataset row; its 'file_name' field is read below
+):
+    """Prepare the sandbox for an instance: create /workspace, copy in its file (if any) as `file.<ext>`, cd there.
+
+    NOTE(review): `metadata` is not a parameter here; it is resolved as a module-level name — confirm it is always set.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
+    obs: CmdOutputObservation
+
+    action = CmdRunAction(command='mkdir -p /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    if instance['file_name'] != '':
+        # the question comes with a file: copy it from the local dataset cache into the runtime's /workspace
+        assert metadata.data_split is not None
+        src_file = os.path.join(
+            DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
+        )
+        assert os.path.exists(src_file)
+        dest_file = os.path.join('/workspace', instance['file_name'])
+        await runtime.copy_to(src_file, dest_file)
+
+        # rename to the fixed name file.<ext>, where <ext> is the text after the last '.' in the file name
+        extension_name = instance['file_name'].split('.')[-1]
+        action = CmdRunAction(
+            command=f'mv /workspace/{instance["file_name"]} /workspace/file.{extension_name}'
+        )
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = await runtime.run_action(action)
+        assert obs.exit_code == 0
+
+    action = CmdRunAction(command='cd /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
+
+
+async def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
-):
-    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
-    # create process-specific workspace dir
-    # we will create a workspace directory for EACH process
-    # so that different agent don't interfere with each other.
-    old_workspace_mount_path = config.workspace_mount_path
+) -> EvalOutput:
+    config = get_config(metadata)

-    try:
-        workspace_mount_path = os.path.join(
-            config.workspace_mount_path, '_eval_workspace'
-        )
-        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
-        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
-        config.workspace_mount_path = workspace_mount_path
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance['instance_id'], log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance["instance_id"]}.')

-        # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
-        eval_output_dir = metadata.eval_output_dir
-        if reset_logger:
-            # Set up logger
-            log_file = os.path.join(
-                eval_output_dir, 'logs', f'instance_{instance["task_id"]}.log'
-            )
-            # Remove all existing handlers from logger
-            for handler in logger.handlers[:]:
-                logger.removeHandler(handler)
-            # add back the console handler to print ONE line
-            logger.addHandler(get_console_handler())
-            logger.info(
-                f'Starting evaluation for instance {instance["task_id"]}.\nLOG:   tail -f {log_file}'
-            )
-            # Remove all existing handlers from logger
-            for handler in logger.handlers[:]:
-                logger.removeHandler(handler)
-            file_handler = logging.FileHandler(log_file)
-            file_handler.setFormatter(
-                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-            )
-            logger.addHandler(file_handler)
+    if instance['file_name'] != '':
+        extension_name = instance['file_name'].split('.')[-1]
+        dest_file = os.path.join('/workspace', f'file.{extension_name}')
+    else:
+        dest_file = None

-        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
-        if instance['file_name'] != '':
-            # if this question comes with a file, we need to save it to the workspace
-            assert metadata.data_split is not None
-            src_file = os.path.join(
-                DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
-            )
-            extension_name = instance['file_name'].split('.')[-1]
-            dest_file = os.path.join(workspace_mount_path, f'file.{extension_name}')
-            shutil.copyfile(src_file, dest_file)
-            logger.info(f'File copied to {dest_file}')
-        else:
-            dest_file = None
+    # Prepare instruction
+    instruction = f"{instance['Question']}\n"
+    logger.info(f'Instruction: {instruction}')
+    if dest_file:
+        instruction += f"\n\nThe mentioned file is provided in the workspace at: {dest_file.split('/')[-1]}"

-        # Prepare instruction
-        instruction = f"{instance['Question']}\n"
-        logger.info(f'Instruction: {instruction}')
-        if dest_file:
-            instruction += f"\n\nThe mentioned file is provided in the workspace at: {dest_file.split('/')[-1]}"
+    instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+    instruction += 'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
+    instruction += (
+        'For example: The answer to the question is <solution> 42 </solution>.\n'
+    )
+    # NOTE: You can actually set slightly different instruction for different agents
+    instruction += AGENT_CLS_TO_INST_SUFFIX.get(metadata.agent_class, '')
+    logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

-        instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
-        instruction += 'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
-        instruction += (
-            'For example: The answer to the question is <solution> 42 </solution>.\n'
-        )
-        # NOTE: You can actually set slightly different instruction for different agents
-        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent.__class__.__name__, '')
-        logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
+    runtime = await create_runtime(config, sid=instance['instance_id'])
+    await initialize_runtime(runtime, instance)

-        # Here's how you can run the agent (similar to the `main` function) and get the final task state
-        state: State | None = asyncio.run(
-            run_agent_controller(
-                agent,
-                instruction,
-                max_iterations=metadata.max_iterations,
-                max_budget_per_task=config.max_budget_per_task,
-                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
-                    agent.__class__.__name__
-                ],
-                sid=instance['task_id'],
-            )
-        )
-        # ======= Attempt to evaluate the agent's edits =======
-        # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
-        # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    state: State | None = await run_controller(
+        config=config,
+        task_str=instruction,
+        runtime=runtime,
+        fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class],
+    )
+    # ======= Attempt to evaluate the agent's edits =======
+    # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

-        if state is None:
-            raise ValueError('State should not be None.')
+    if state is None:
+        raise ValueError('State should not be None.')

-        model_answer_raw = ''
-
-        # get the last message or thought from the agent
-        for event in state.history.get_events(reverse=True):
-            if isinstance(event, CmdRunAction) and event.source == 'agent':
+    model_answer_raw = ''
+    # get the last message or thought from the agent
+    for event in state.history.get_events(reverse=True):
+        if event.source == 'agent':
+            if isinstance(event, AgentFinishAction):
                model_answer_raw = event.thought
-            elif isinstance(event, MessageAction) and event.source == 'agent':
+                break
+            elif isinstance(event, CmdRunAction):
+                model_answer_raw = event.thought
+                break
+            elif isinstance(event, MessageAction):
                model_answer_raw = event.content
+                break

-        # attempt to parse model_answer
-        model_answer = re.findall(r'<solution>(.*?)</solution>', model_answer_raw)
-        if len(model_answer) == 0:
-            logger.warning(f'Failed to parse model answer: {model_answer_raw}')
-            model_answer = model_answer_raw
-        else:
-            model_answer = model_answer[0]
+    # attempt to parse model_answer
+    model_answer = re.findall(r'<solution>(.*?)</solution>', model_answer_raw)
+    if len(model_answer) == 0:
+        logger.warning(f'Failed to parse model answer: {model_answer_raw}')
+        model_answer = model_answer_raw
+    else:
+        model_answer = model_answer[0]

-        logger.info(
-            f'Final message: {model_answer} | Ground truth: {instance["Final answer"]}'
-        )
-        score = question_scorer(
-            model_answer=model_answer, ground_truth=instance['Final answer']
-        )
-        test_result = {
-            'score': score,
-            'model_answer_raw': model_answer_raw,
-            'model_answer': model_answer,
-            'ground_truth': instance['Final answer'],
-        }
-        metrics = state.metrics.get() if state.metrics else None
+    logger.info(
+        f'Final message: {model_answer} | Ground truth: {instance["Final answer"]}'
+    )
+    score = question_scorer(
+        model_answer=model_answer, ground_truth=instance['Final answer']
+    )
+    test_result = {
+        'score': score,
+        'model_answer_raw': model_answer_raw,
+        'model_answer': model_answer,
+        'ground_truth': instance['Final answer'],
+    }
+    metrics = state.metrics.get() if state.metrics else None

-        # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
-        # for compatibility with the existing output format, we can remake the pairs here
-        # remove when it becomes unnecessary
-        histories = state.history.compatibility_for_eval_history_pairs()
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()

-        # Save the output
-        output = {
-            'instance_id': instance['task_id'],
-            'instance': instance,
-            'instruction': instance['Question'],
-            'metadata': metadata.model_dump(),
-            'history': histories,
-            'metrics': metrics,
-            'error': state.last_error if state and state.last_error else None,
-            'test_result': test_result,
-        }
-    except Exception:
-        logger.error('Process instance failed')
-        raise
-    finally:
-        config.workspace_mount_path = old_workspace_mount_path
+    # Save the output
+    output = EvalOutput(
+        instance_id=instance['instance_id'],
+        instance=instance.to_dict(),
+        instruction=instance['Question'],
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result=test_result,
+    )
    return output


@@ -197,13 +220,19 @@ if __name__ == '__main__':
        type=str,
        help='gaia level to evaluate, eg. 2023_level1',
    )
+    parser.add_argument(
+        '--data-split',
+        type=str,
+        help='data split to evaluate, eg. test',
+        default='validation',
+    )
    args, _ = parser.parse_known_args()
-    if args.directory:
-        config.workspace_base = os.path.abspath(args.directory)
-        logger.info(f'Setting workspace base to {config.workspace_base}')

-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
-    logger.info(f'Config for evaluation: {config}')
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config=llm_config,
@@ -222,20 +251,18 @@ if __name__ == '__main__':
        repo_type='dataset',
        local_dir=DATASET_CACHE_DIR,
    )
-    gaia_tests = dataset[metadata.data_split]
+    gaia_tests = dataset[metadata.data_split].to_pandas()
+    gaia_tests.rename(columns={'task_id': 'instance_id'}, inplace=True)

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    prepared_dataset = prepare_dataset(
-        gaia_tests.to_pandas(), output_file, args.eval_n_limit, 'task_id'
-    )
+    prepared_dataset = prepare_dataset(gaia_tests, output_file, args.eval_n_limit)

-    agent = Agent.get_cls(args.agent_cls)(llm=LLM(config.llm))
-
-    run_evaluation(
-        dataset=prepared_dataset,
-        metadata=metadata,
-        output_file=output_file,
-        num_workers=args.eval_num_workers,
-        process_instance_func=process_instance,
-        id_column='task_id',
+    asyncio.run(
+        run_evaluation(
+            dataset=prepared_dataset,
+            metadata=metadata,
+            output_file=output_file,
+            num_workers=args.eval_num_workers,
+            process_instance_func=process_instance,
+        )
    )
--- a/evaluation/gaia/scripts/run_infer.sh
+++ b/evaluation/gaia/scripts/run_infer.sh
--- a/evaluation/gorilla/README.md
+++ b/evaluation/gorilla/README.md
@@ -2,20 +2,16 @@

 This folder contains evaluation harness we built on top of the original [Gorilla APIBench](https://github.com/ShishirPatil/gorilla) ([paper](https://arxiv.org/pdf/2305.15334)).

-## Setup Environment
+## Setup Environment and LLM Configuration

-Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to setup local development environment for OpenDevin.
-
-## Configure OpenDevin and your LLM
-
-Run `make setup-config` to set up the `config.toml` file if it does not exist at the root of the workspace.
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

 ## Run Inference on APIBench Instances

 Make sure your Docker daemon is running, then run this bash script:

 ```bash
-bash evaluation/gorilla/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [hubs]
+./evaluation/gorilla/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [hubs]
 ```

 where `model_config` is mandatory, while all other arguments are optional.
@@ -39,5 +35,5 @@ Note: in order to use `eval_limit`, you must also set `agent`; in order to use `
 For example,

 ```bash
-bash evaluation/gorilla/scripts/run_infer.sh llm 0.6.2 CodeActAgent 10 th
+./evaluation/gorilla/scripts/run_infer.sh llm 0.6.2 CodeActAgent 10 th
 ```
--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -1,59 +1,28 @@
 import asyncio
 import json
-import logging
-import multiprocessing as mp
 import os
-import pathlib
-import subprocess
-import time
-from concurrent.futures import ProcessPoolExecutor

-from tqdm import tqdm
+import pandas as pd

-from opendevin.controller.agent import Agent
+from evaluation.gorilla.utils import encode_question, get_data_for_hub
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    codeact_user_response,
+    make_metadata,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+)
 from opendevin.controller.state.state import State
-from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
-from opendevin.core.logger import get_console_handler
+from opendevin.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    get_parser,
+)
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import run_agent_controller
-from opendevin.events.action import MessageAction
-from opendevin.llm.llm import LLM
-
-from .utils import encode_question, get_data
-
-config = load_app_config()
-
-
-def cleanup():
-    print('Cleaning up child processes...')
-    for process in mp.active_children():
-        print(f'Terminating child process: {process.name}')
-        process.terminate()
-        process.join()
-
-
-def codeact_user_response(state: State) -> str:
-    msg = (
-        #'Please continue working on the task on whatever approach you think is suitable.\n'
-        'Please run the following command: <execute_bash> exit </execute_bash>.\n'
-        #'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
-    )
-
-    # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
-    if state.history:
-        user_msgs = [
-            event
-            for event in state.history.get_events()
-            if isinstance(event, MessageAction) and event.source == 'user'
-        ]
-        if len(user_msgs) > 2:
-            # let the agent know that it can give up when it has tried 3 times
-            return (
-                msg
-                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
-            )
-    return msg
-
+from opendevin.core.main import create_runtime, run_controller

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
@@ -64,105 +33,95 @@ AGENT_CLS_TO_INST_SUFFIX = {
 }


-def process_instance(agent, question_id, question, metadata, reset_logger: bool = True):
-    # create process-specific workspace dir
-    # we will create a workspace directory for EACH process
-    # so that different agent don't interfere with each other.
-    old_workspace_mount_path = config.workspace_mount_path
-    try:
-        workspace_mount_path = os.path.join(
-            config.workspace_mount_path, '_eval_workspace'
-        )
-        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
-        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
-        config.workspace_mount_path = workspace_mount_path
+def get_config(
+    metadata: EvalMetadata,
+) -> AppConfig:  # builds the AppConfig for one run from the eval metadata (agent class, max iterations, LLM config)
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_devin=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            container_image='ubuntu:22.04',
+            enable_auto_lint=True,
+            use_host_network=False,
+        ),
+        # do not mount a workspace: both workspace path settings are explicitly left as None
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    config.set_llm_config(metadata.llm_config)  # LLM settings come from the metadata, applied after construction
+    return config

-        # Setup the logger properly, so you can run multi-processing to parallize the evaluation
-        eval_output_dir = metadata['eval_output_dir']
-        if reset_logger:
-            # Set up logger
-            log_file = os.path.join(
-                eval_output_dir, 'logs', f'instance_{question_id}.log'
-            )
-            # Remove all existing handlers from logger
-            for handler in logger.handlers[:]:
-                logger.removeHandler(handler)
-            # add back the console handler to print ONE line
-            logger.addHandler(get_console_handler())
-            logger.info(
-                f'Starting evaluation for instance {question_id}.\nLOG:   tail -f {log_file}'
-            )
-            # Remove all existing handlers from logger
-            for handler in logger.handlers[:]:
-                logger.removeHandler(handler)
-            file_handler = logging.FileHandler(log_file)
-            file_handler.setFormatter(
-                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-            )
-            logger.addHandler(file_handler)
-        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')

-        # Prepare instruction
-        instruction = encode_question(question, metadata['hub'])
-        instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
-        # NOTE: You can actually set slightly different instruction for different agents
-        instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
-        # logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
+async def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+    reset_logger: bool = True,
+) -> EvalOutput:
+    config = get_config(metadata)
+    instance_id = instance['question_id']
+    question = instance['question']

-        # Here's how you can run the agent (similar to the `main` function) and get the final task state
-        state: State | None = asyncio.run(
-            run_agent_controller(
-                agent,
-                instruction,
-                max_iterations=metadata.max_iterations,
-                max_budget_per_task=config.max_budget_per_task,
-                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
-                    agent.__class__.__name__
-                ),
-                sid=question_id,
-            )
-        )
-        # ======= Attempt to evaluate the agent's edits =======
-        # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
-        # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance_id}.')

-        if state is None:
-            raise ValueError('State should not be None.')
+    # Prepare instruction
+    instruction = encode_question(question, instance['hub'])
+    instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+    # NOTE: You can actually set slightly different instruction for different agents
+    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
+    # logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

-        # retrieve the last message from the agent
-        model_answer_raw = state.history.get_last_agent_message()
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    runtime = await create_runtime(config, sid=instance_id)
+    state: State | None = await run_controller(
+        config=config,
+        task_str=instruction,
+        runtime=runtime,
+        fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+            metadata.agent_class
+        ),
+    )
+    # ======= Attempt to evaluate the agent's edits =======
+    # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

-        # attempt to parse model_answer
-        _, _, ast_eval = get_data(metadata['hub'])
-        correct, hallucination = ast_eval(question_id, model_answer_raw)
-        metrics = state.metrics.get() if state.metrics else None
-        logger.info(
-            f'Final message: {model_answer_raw} | Correctness: {correct} | Hallucination: {hallucination}'
-        )
+    if state is None:
+        raise ValueError('State should not be None.')

-        # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
-        # for compatibility with the existing output format, we can remake the pairs here
-        # remove when it becomes unnecessary
-        histories = state.history.compatibility_for_eval_history_pairs()
+    # retrieve the last message from the agent
+    model_answer_raw = state.history.get_last_agent_message()

-        # Save the output
-        output = {
-            'question_id': question_id,
+    # attempt to parse model_answer
+    ast_eval_fn = instance['ast_eval']
+    correct, hallucination = ast_eval_fn(instance_id, model_answer_raw)
+    metrics = state.metrics.get() if state.metrics else None
+    logger.info(
+        f'Final message: {model_answer_raw} | Correctness: {correct} | Hallucination: {hallucination}'
+    )
+
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()
+
+    output = EvalOutput(
+        instance_id=instance_id,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result={
            'text': model_answer_raw,
            'correct': correct,
            'hallucination': hallucination,
-            'answer_id': 'None',
-            'model_id': metadata['model_name'],
-            'metadata': metadata.model_dump(),
-            'history': histories,
-            'metrics': metrics,
-            'error': state.last_error if state and state.last_error else None,
-        }
-    except Exception:
-        logger.error('Process instance failed')
-        raise
-    finally:
-        config.workspace_mount_path = old_workspace_mount_path
+        },
+    )
    return output


@@ -175,188 +134,62 @@ if __name__ == '__main__':
        default='hf,torch,tf',
    )
    args, _ = parser.parse_known_args()
-    if args.directory:
-        config.workspace_base = os.path.abspath(args.directory)
-        print(f'Setting workspace base to {config.workspace_base}')

-    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
-    # for details of how to set `llm_config`
+    llm_config = None
    if args.llm_config:
-        specified_llm_config = get_llm_config_arg(args.llm_config)
-        if specified_llm_config:
-            config.llm = specified_llm_config
-    logger.info(f'Config for evaluation: {config}')
-    agent_class = args.agent_cls
-    assert (
-        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
-    ), f'Unsupported agent class: {agent_class}'
-    model_name = config.llm.model.split('/')[-1]
-    max_iterations = args.max_iterations
-    eval_note = ''
-    if args.eval_note is not None:
-        eval_note += '_N_' + args.eval_note
-    eval_output_dir = os.path.join(
-        args.eval_output_dir,
-        'gorilla',
-        agent_class,
-        model_name + '_maxiter_' + str(max_iterations) + eval_note,
-    )
-    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
-    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
-        parents=True, exist_ok=True
-    )
-    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+        llm_config = get_llm_config_arg(args.llm_config)
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

-    hubs = []
-    if 'hf' in args.hubs:
-        hubs.append('hf')
-    if 'torch' in args.hubs or 'th' in args.hubs:
-        hubs.append('torch')
-    if 'tf' in args.hubs:
-        hubs.append('tf')
-    if hubs == []:
+    hubs = args.hubs.split(',')
+    if len(hubs) == 0:
        raise ValueError('Please choose at least one from hf, torch, and tf for hubs.')

+    dfs = []
    for hub in hubs:
        logger.info(f'Evaluating APIBench {hub} test')
-        questions, question_ids, ast_eval = get_data(hub)
+        df = get_data_for_hub(hub)
+        dfs.append(df)
+    dataset_df = pd.concat(dfs)
+    dataset_df.rename(columns={'question_id': 'instance_id'}, inplace=True)

-        # TEST METADATA
-        metadata = {
-            'hub': hub,
-            'agent_class': agent_class,
-            'model_name': model_name,
-            'max_iterations': max_iterations,
-            'eval_output_dir': eval_output_dir,
-            'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
-            # get the commit id of current repo for reproduciblity
-            'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
-            .decode('utf-8')
-            .strip(),
-        }
-        logger.info(f'Metadata: {metadata}')
-        with open(os.path.join(eval_output_dir, f'metadata_{hub}.json'), 'w') as f:
-            json.dump(metadata, f)
+    metadata = make_metadata(
+        llm_config=llm_config,
+        dataset_name=f'gorilla-{hub}',
+        agent_class=args.agent_cls,
+        max_iterations=args.max_iterations,
+        eval_note=args.eval_note,
+        eval_output_dir=args.eval_output_dir,
+        data_split=args.data_split,
+    )
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')

-        # LIMIT EVALUATION
-        eval_n_limit = args.eval_n_limit
-        if eval_n_limit:
-            questions = questions[: (eval_n_limit // len(hubs))]
-            question_ids = question_ids[: (eval_n_limit // len(hubs))]
-            logger.info(
-                f'Limiting evaluation to a total of first {eval_n_limit} instances -> first {eval_n_limit//len(hubs)} instances per hub.'
-            )
-        output_file = os.path.join(eval_output_dir, f'output_{model_name}_{hub}.jsonl')
-        logger.info(f'Writing evaluation output to {output_file}')
-        finished_task_ids = set()
-        if os.path.exists(output_file):
-            with open(output_file, 'r') as f:
-                for line in f:
-                    data = json.loads(line)
-                    for i in range(len(question_ids)):
-                        if question_ids[i] == int(data['question_id']):
-                            finished_task_ids.add(data['question_id'])
-            logger.warning(
-                f'Output file {output_file} already exists. Loaded {len(finished_task_ids)} finished instances.'
-            )
-        output_fp = open(output_file, 'a')
-        logger.info(
-            f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+    dataset = prepare_dataset(
+        dataset_df, output_file=output_file, eval_n_limit=args.eval_n_limit
+    )
+
+    asyncio.run(
+        run_evaluation(
+            dataset=dataset,
+            metadata=metadata,
+            output_file=output_file,
+            num_workers=args.eval_num_workers,
+            process_instance_func=process_instance,
        )
-        # =============================================
-        # filter out finished instances
-        new_questions = []
-        new_question_ids = []
-        for i in range(len(question_ids)):
-            if question_ids[i] in finished_task_ids:
-                logger.info(
-                    f'Skipping instance {question_ids[i]} as it is already finished.'
-                )
-                continue
-            new_questions.append(questions[i])
-            new_question_ids.append(question_ids[i])
+    )

-        finished_task_number = len(finished_task_ids)
-        questions = new_questions
-        question_ids = new_question_ids
-        logger.info(
-            f'Finished instances: {finished_task_number}, Remaining instances: {len(question_ids)}'
-        )
-        # =============================================
-        pbar = tqdm(total=len(question_ids))
-
-        # This function tracks the progress AND write the output to a JSONL file
-        def update_progress(future, pbar, output_fp, finished_task_ids):
-            pbar.update(1)
-            output = future.result()
-            pbar.set_description(f'Instance {output["question_id"]}')
-            pbar.set_postfix_str(f'Test Result: {output["correct"]}')
-            logger.info(
-                f'Finished evaluation for instance {output["question_id"]}: {output["correct"]}'
-            )
-            output_fp.write(json.dumps(output) + '\n')
-            output_fp.flush()
-            finished_task_ids.add(output['question_id'])
-
-        # Create the agent
-        agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
-
-        # This sets the multi-processing
-        num_workers = args.eval_num_workers
-        logger.info(f'Using {num_workers} workers for evaluation.')
-        try:
-            with ProcessPoolExecutor(num_workers) as executor:
-                futures = []
-                # This is how we perform multi-processing
-                for i in range(len(question_ids)):
-                    try:
-                        question_id = question_ids[i]
-                        question = questions[i]
-                        future = executor.submit(
-                            process_instance,
-                            agent,
-                            question_id,
-                            question,
-                            metadata,
-                            reset_logger=bool(num_workers > 1),
-                        )
-                        future.add_done_callback(
-                            update_progress, pbar, output_fp, finished_task_ids
-                        )
-                        futures.append(future)
-                    except Exception:
-                        continue
-
-                # Wait for all futures to complete
-                for future in futures:
-                    try:
-                        future.result()
-                    except Exception:
-                        continue
-        except KeyboardInterrupt:
-            logger.info('KeyboardInterrupt received. Cleaning up...')
-            cleanup()
-
-        output_fp.close()
-        total_correct = 0
-        total_hallucination = 0
-        output = []
-        with open(output_file, 'r') as f:
-            for line in f:
-                data = json.loads(line)
-                output.append(data)
-                if int(data['question_id']) in finished_task_ids:
-                    if str(data['correct']).lower() == 'true':
-                        total_correct += 1
-                    if str(data['hallucination']).lower() == 'true':
-                        total_hallucination += 1
-        # sort all output by question_id
-        output = sorted(output, key=lambda x: x['question_id'])
-        with open(output_file, 'w') as f:
-            for dat in output:
-                f.write(json.dumps(dat) + '\n')
-                f.flush()
-
-        logger.info(
-            f'Evaluation finished for {hub}. Total: {len(question_ids)+finished_task_number}; Correct: {total_correct}; Hallucination: {total_hallucination}. Accuracy: {total_correct / (len(question_ids)+finished_task_number)}'
-        )
+    # Read the output file and calculate the accuracy
+    total_correct = 0
+    total_hallucination = 0
+    output = []
+    with open(output_file, 'r') as f:
+        for line in f:
+            data = json.loads(line)
+            if data['test_result']['correct']:
+                total_correct += 1
+            if data['test_result']['hallucination']:
+                total_hallucination += 1
+            output.append(data)
+    logger.info(
+        f'Evaluation finished for {hub}. Total: {len(output)}; Correct: {total_correct}; Hallucination: {total_hallucination}. Accuracy: {total_correct / len(output)}'
+    )
--- a/evaluation/gorilla/scripts/run_infer.sh
+++ b/evaluation/gorilla/scripts/run_infer.sh
--- a/evaluation/gorilla/utils.py
+++ b/evaluation/gorilla/utils.py
@@ -1,6 +1,8 @@
 import json
+import os
 from functools import partial

+import pandas as pd
 import requests
 from ast_eval_hf import ast_eval_hf, ast_parse
 from ast_eval_tf import ast_eval_tf
@@ -48,48 +50,59 @@ def encode_question(question, api_name):
    return prompts


-def get_data(hub):
+DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
+os.makedirs(DATA_DIR, exist_ok=True)
+
+
+def fetch_data(url, filename):
+    """Return the text at `url`, cached as `filename` under DATA_DIR (UTF-8)."""
+    cache_path = os.path.join(DATA_DIR, filename)
+    if os.path.exists(cache_path):
+        with open(cache_path, 'r', encoding='utf-8') as f:
+            return f.read()
+    # Cache miss: download; the timeout keeps a stalled connection from hanging.
+    response = requests.get(url, timeout=60)
+    if response.status_code != 200:
+        raise Exception(f'Failed to fetch data from {url}')
+    with open(cache_path, 'w', encoding='utf-8') as f:
+        f.write(response.text)
+    return response.text
+
+
+def get_data_for_hub(hub: str):
    if hub == 'hf':
        question_data = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/eval/eval-data/questions/huggingface/questions_huggingface_0_shot.jsonl'
        api_dataset = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/api/huggingface_api.jsonl'
        apibench = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/apibench/huggingface_eval.json'
        ast_eval = ast_eval_hf
-    if hub == 'torch':
+    elif hub == 'torch':
        question_data = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/eval/eval-data/questions/torchhub/questions_torchhub_0_shot.jsonl'
        api_dataset = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/api/torchhub_api.jsonl'
        apibench = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/apibench/torchhub_eval.json'
        ast_eval = ast_eval_th
-    if hub == 'tf':
+    elif hub == 'tf':
        question_data = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/eval/eval-data/questions/tensorflowhub/questions_tensorflowhub_0_shot.jsonl'
        api_dataset = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/api/tensorflowhub_api.jsonl'
        apibench = 'https://raw.githubusercontent.com/ShishirPatil/gorilla/main/data/apibench/tensorflow_eval.json'
        ast_eval = ast_eval_tf

-    # get questions and question_ids
+    # Cache each hub's files under hub-specific names so hubs never share a cache entry
+    question_data = fetch_data(question_data, f'question_data_{hub}.jsonl')
+    api_dataset = fetch_data(api_dataset, f'api_dataset_{hub}.jsonl')
+    apibench = fetch_data(apibench, f'apibench_{hub}.json')
+    # Parse question data
    questions = []
    question_ids = []
-    question_data = requests.get(question_data)
-    if question_data.status_code == 200:
-        lines = question_data.text.splitlines()
-        for line in lines:
-            questions.append(json.loads(line)['text'])
-            question_ids.append(json.loads(line)['question_id'])
+    for line in question_data.splitlines():
+        data = json.loads(line)
+        questions.append(data['text'])
+        question_ids.append(data['question_id'])

-    # get the api datasest
-    api_database = []
-    api_dataset = requests.get(api_dataset)
-    if api_dataset.status_code == 200:
-        lines = api_dataset.text.splitlines()
-        for line in lines:
-            api_database.append(json.loads(line))
+    # Parse API dataset
+    api_database = [json.loads(line) for line in api_dataset.splitlines()]

-    # get the question answer pair datasest
-    qa_pairs = []
-    apibench = requests.get(apibench)
-    if apibench.status_code == 200:
-        lines = apibench.text.splitlines()
-        for line in lines:
-            qa_pairs.append(json.loads(line)['api_data'])
+    # Parse question-answer pairs
+    qa_pairs = [json.loads(line)['api_data'] for line in apibench.splitlines()]

    # Parse all apis to ast trees
    ast_database = []
@@ -97,4 +110,15 @@ def get_data(hub):
        ast_tree = ast_parse(data['api_call'])
        ast_database.append(ast_tree)
    ast_eval = partial(ast_eval, api_database, qa_pairs, ast_database)
-    return questions, question_ids, ast_eval
+
+    return pd.DataFrame(
+        {
+            'question_id': question_ids,
+            'question': questions,
+            'api_database': [api_database] * len(questions),
+            'qa_pairs': [qa_pairs] * len(questions),
+            'ast_database': [ast_database] * len(questions),
+            'ast_eval': [ast_eval] * len(questions),
+            'hub': [hub] * len(questions),
+        }
+    )
--- a/evaluation/gpqa/README.md
+++ b/evaluation/gpqa/README.md
@@ -15,31 +15,9 @@ Further references:
 - https://paperswithcode.com/dataset/gpqa
 - https://github.com/idavidrein/gpqa

+## Setup Environment and LLM Configuration

-## Setup Environment
-
-Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to setup local develop environment for OpenDevin.
-
-
-## Configure OpenDevin and your LLM
-
-Create a `config.toml` file (you can copy from `config.template.toml`) if it does not exist at the root of the workspace.
-
-Add the following configurations:
-
-```toml
-# TODO: Change these to the model you want to evaluate
-[llm.eval_gpt4_1106_preview]
-model = "gpt-4-1106-preview"
-api_key = "XXX"
-temperature = 0.0
-
-[llm.eval_azure_openai_compatible_model]
-model = "AZURE_OPENAI_EXACT_DEPLOYMENT_MODEL_NAME"
-base_url = "AZURE_OPENAI_ENDPOINT"
-api_key = "AZURE_ENDPOINT_API_KEY"
-temperature = 0.0
-```
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

 ## Run Inference on GPQA Benchmark
 'gpqa_main', 'gqpa_diamond', 'gpqa_experts', 'gpqa_extended' -- data split options
@@ -55,8 +33,3 @@ like to evaluate. It could also be a release tag like `0.6.2`.
 - `num_samples_eval`: Number of samples to evaluate (useful for testing and debugging).
 - `data_split`: The data split to evaluate on. Must be one of `gpqa_main`, `gqpa_diamond`, `gpqa_experts`, `gpqa_extended`. Defaults to `gpqa_diamond` as done in the paper.
 - `AgentClass`: The agent class to use for evaluation. Currently only supports `CodeActAgent` for CodeActAgent.
-
-
-## Benchmark Evaluation Results
-
- [] TODO: Finish the evaluation run across the entire benchmark and compile results
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -17,9 +17,7 @@ TODOs:
 """

 import asyncio
-import logging
 import os
-import pathlib
 import random
 import re
 from typing import Callable
@@ -29,22 +27,27 @@ from datasets import load_dataset

 from evaluation.utils.shared import (
    EvalMetadata,
-    codeact_user_response,
+    EvalOutput,
    make_metadata,
    prepare_dataset,
+    reset_logger_for_multiprocessing,
    run_evaluation,
 )
-from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
-from opendevin.core.logger import get_console_handler
+from opendevin.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    get_parser,
+)
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import run_agent_controller
-from opendevin.events.action import Action, AgentFinishAction, MessageAction
+from opendevin.core.main import create_runtime, run_controller
+from opendevin.events.action import (
+    Action,
+    AgentFinishAction,
+    MessageAction,
+)
 from opendevin.events.observation import Observation
-from opendevin.llm.llm import LLM
-
-config = load_app_config()

 ACTION_FORMAT = """
 <<FINAL_ANSWER||
@@ -53,6 +56,27 @@ ACTION_FORMAT = """
 """.strip()


+def get_config(metadata: EvalMetadata) -> AppConfig:
+    """Build the per-instance AppConfig for a GPQA run from `metadata`."""
+    sandbox_config = SandboxConfig(
+        container_image='ubuntu:22.04',
+        enable_auto_lint=True,
+        use_host_network=False,
+    )
+    app_config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_devin=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=sandbox_config,
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    app_config.set_llm_config(metadata.llm_config)
+    return app_config
+
+
 def gpqa_codeact_user_response(
    state: State,
    encapsulate_solution: bool = False,
@@ -68,11 +92,10 @@ def gpqa_codeact_user_response(
        '<execute_bash> exit </execute_bash>\n'
        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP TO SOLVE THIS TASK.\n'
    )
-
    return msg


-AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {'CodeActAgent': codeact_user_response}
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {'CodeActAgent': gpqa_codeact_user_response}

 AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': '\n\n SUPER IMPORTANT: When you think you have solved the question, first report it back to the user in the requested format. Only once that is done, in the next turn, please run the following command: <execute_bash> exit </execute_bash>.\n'
@@ -146,57 +169,23 @@ def convert_instance_dict(instance):
    return out_instance_dict


-def process_instance(
+async def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
 ):
-    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
-    old_workspace_mount_path = config.workspace_mount_path
-    old_workspace_base = config.workspace_base
-    try:
-        workspace_mount_path = os.path.join(
-            config.workspace_mount_path, '_eval_workspace'
-        )
-        # create process-specific workspace dir
-        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
-        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+    config = get_config(metadata)

-        # reset workspace to config
-        config.workspace_base = workspace_mount_path
-        config.workspace_mount_path = workspace_mount_path
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance['instance_id'], log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance["instance_id"]}.')

-        # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
-        if reset_logger:
-            # Set up logger
-            log_file = os.path.join(
-                metadata.eval_output_dir, 'logs', f'instance_{instance.instance_id}.log'
-            )
-            # Remove all existing handlers from logger
-            for handler in logger.handlers[:]:
-                logger.removeHandler(handler)
-            # add back the console handler to print ONE line
-            logger.addHandler(get_console_handler())
-            logger.info(
-                f'Starting evaluation for instance {instance.instance_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
-            )
-            # Remove all existing handlers from logger
-            for handler in logger.handlers[:]:
-                logger.removeHandler(handler)
-            file_handler = logging.FileHandler(log_file)
-            file_handler.setFormatter(
-                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-            )
-            logger.addHandler(file_handler)
-        else:
-            logger.info(f'Starting evaluation for instance {instance.instance_id}.')
-
-        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
-
-        # ======= Run the agent on the instance =======
-        # Prepare instruction for the agent using suggested format in gpqa codebase
-        instruction = f"""
+    # ======= Run the agent on the instance =======
+    # Prepare instruction for the agent using suggested format in gpqa codebase
+    instruction = f"""
 What is the correct answer to this question:\n
 {instance['question']}\n

@@ -225,109 +214,98 @@ Again do not quit without reporting the answer first.
 Ok now its time to start solving the question. Good luck!
 """

-        # Here's how you can run the agent (similar to the `main` function) and get the final task state
-        state: State | None = asyncio.run(
-            run_agent_controller(
-                agent,
-                instruction,
-                max_iterations=metadata.max_iterations,
-                max_budget_per_task=config.max_budget_per_task,
-                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
-                    agent.__class__.__name__
-                ),
-                sid=f'gptq_{str(instance.instance_id)}',
-            )
-        )
-        assert state is not None, 'State should not be None.'
+    runtime = await create_runtime(config, sid=f'gptq_{str(instance.instance_id)}')

-        # ======= Attempt to evaluate the agent's edits =======
+    state: State | None = await run_controller(
+        config=config,
+        task_str=instruction,
+        runtime=runtime,
+        fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+            metadata.agent_class
+        ),
+    )
+    assert state is not None, 'State should not be None.'

-        question_choices = {
-            'A': instance['choices'][0],
-            'B': instance['choices'][1],
-            'C': instance['choices'][2],
-            'D': instance['choices'][3],
-        }
-        # get the final message from the state history (default to empty if not found)
-        found_answers = {
-            'A': False,
-            'B': False,
-            'C': False,
-            'D': False,
-        }
-        for event in state.history.get_events(reverse=True):
-            if (
-                isinstance(event, AgentFinishAction)
-                and event.source != 'user'
-                and '<<FINAL_ANSWER||' in event.thought
-            ):
-                final_message = event.thought
-                break
-            elif (
-                isinstance(event, MessageAction)
-                and event.source != 'user'
-                and '<<FINAL_ANSWER||' in event.content
-            ):
-                final_message = event.content
-                break
-            elif isinstance(event, Observation):
-                for option, option_text in question_choices.items():
-                    if option_text in event.content:
-                        found_answers[option] = True
-            else:
-                final_message = None
+    # ======= Attempt to evaluate the agent's edits =======

-        found_options = [option for option, found in found_answers.items() if found]
+    question_choices = {
+        'A': instance['choices'][0],
+        'B': instance['choices'][1],
+        'C': instance['choices'][2],
+        'D': instance['choices'][3],
+    }
+    # get the final message from the state history (default to empty if not found)
+    found_answers = {
+        'A': False,
+        'B': False,
+        'C': False,
+        'D': False,
+    }
+    for event in state.history.get_events(reverse=True):
+        if (
+            isinstance(event, AgentFinishAction)
+            and event.source != 'user'
+            and '<<FINAL_ANSWER||' in event.thought
+        ):
+            final_message = event.thought
+            break
+        elif (
+            isinstance(event, MessageAction)
+            and event.source != 'user'
+            and '<<FINAL_ANSWER||' in event.content
+        ):
+            final_message = event.content
+            break
+        elif isinstance(event, Observation):
+            for option, option_text in question_choices.items():
+                if option_text in event.content:
+                    found_answers[option] = True
+        else:
+            final_message = None
+
+    found_options = [option for option, found in found_answers.items() if found]
+    logger.info('#############################################')
+    logger.info(f'Final message generated by the agent: {final_message}')
+    logger.info('#############################################')
+
+    # check if the model output matches the ground truth
+    test_result = compare_answers(final_message, instance.correct_solution)
+    if final_message is None and len(found_options) > 0:
+        _selected = random.choice(found_options)
+        # if the final message is None, then the agent did not report the answer in the correct format
+        # so we randomly select one of the found options and compare it with the correct solution
+        test_result = _selected == instance.correct_solution
        logger.info('#############################################')
-        logger.info(f'Final message generated by the agent: {final_message}')
+        logger.info('Agent did not report the answer in the correct format.')
+        logger.info(f'Found options: {found_options}')
+        logger.info(f'Selected option: {_selected}')
        logger.info('#############################################')

-        # check if the model output matches the ground truth
-        test_result = compare_answers(final_message, instance.correct_solution)
-        if final_message is None and len(found_options) > 0:
-            _selected = random.choice(found_options)
-            # if the final message is None, then the agent did not report the answer in the correct format
-            # so we randomly select one of the found options and compare it with the correct solution
-            test_result = _selected == instance.correct_solution
-            logger.info('#############################################')
-            logger.info('Agent did not report the answer in the correct format.')
-            logger.info(f'Found options: {found_options}')
-            logger.info(f'Selected option: {_selected}')
-            logger.info('#############################################')
+    logger.info('#############################################')
+    logger.info(f'Test result: {test_result}')
+    logger.info('#############################################')

-        logger.info('#############################################')
-        logger.info(f'Test result: {test_result}')
-        logger.info('#############################################')
+    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+    if state is None:
+        raise ValueError('State should not be None.')

-        # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
-        # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
-        if state is None:
-            raise ValueError('State should not be None.')
+    metrics = state.metrics.get() if state.metrics else None

-        metrics = state.metrics.get() if state.metrics else None
-
-        # Save the output
-        output = {
-            'task_id': instance.task_id,
-            'instance_id': instance.instance_id,
-            'instruction': instruction,
-            'metadata': metadata.model_dump(),
-            'history': state.history.compatibility_for_eval_history_pairs(),
-            'metrics': metrics,
-            'error': state.last_error if state and state.last_error else None,
-            'test_result': {
-                'result': test_result,
-                'found_answers': found_answers,
-                'last_message': final_message,
-            },
-        }
-
-    except Exception:
-        logger.error('Process instance failed')
-        raise
-    finally:
-        config.workspace_mount_path = old_workspace_mount_path
-        config.workspace_base = old_workspace_base
+    # Save the output
+    output = EvalOutput(
+        instance_id=str(instance.instance_id),
+        instruction=instruction,
+        metadata=metadata,
+        history=state.history.compatibility_for_eval_history_pairs(),
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result={
+            'result': test_result,
+            'found_answers': found_answers,
+            'last_message': final_message,
+        },
+    )
    return output


@@ -343,8 +321,11 @@ if __name__ == '__main__':
    )
    args, _ = parser.parse_known_args()

-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
-    logger.info(f'Config for evaluation: {config}')
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenDevin's repo
@@ -355,8 +336,6 @@ if __name__ == '__main__':
    gpqa_dataset = gpqa_dataset.to_pandas()
    # Add a new column 'instance_id' with the index
    gpqa_dataset['instance_id'] = gpqa_dataset.index
-    gpqa_dataset['task_id'] = gpqa_dataset.index
-    # gpqa_dataset = dataset['train'].to_pandas().sort_values(by='id').reset_index(drop=True)

    if args.agent_cls != 'CodeActAgent':
        raise ValueError(
@@ -374,15 +353,14 @@ if __name__ == '__main__':
    )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    prepared_dataset = prepare_dataset(
-        gpqa_dataset, output_file, args.eval_n_limit, 'task_id'
-    )
+    prepared_dataset = prepare_dataset(gpqa_dataset, output_file, args.eval_n_limit)

-    run_evaluation(
-        dataset=prepared_dataset,
-        metadata=metadata,
-        output_file=output_file,
-        num_workers=args.eval_num_workers,
-        process_instance_func=process_instance,
-        id_column='task_id',
+    asyncio.run(
+        run_evaluation(
+            dataset=prepared_dataset,
+            metadata=metadata,
+            output_file=output_file,
+            num_workers=args.eval_num_workers,
+            process_instance_func=process_instance,
+        )
    )
--- a/evaluation/humanevalfix/README.md
+++ b/evaluation/humanevalfix/README.md
@@ -1,39 +1,10 @@
 # HumanEvalFix Evaluation with OpenDevin

-Implements evaluation of agents on HumanEvalFix from the HumanEvalPack benchmark introduced in [OctoPack: Instruction Tuning Code Large Language Models](https://arxiv.org/abs/2308.07124). Please see [here](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py) for the reference implementation used in the paper.
+Implements evaluation of agents on HumanEvalFix from the HumanEvalPack benchmark introduced in [OctoPack: Instruction Tuning Code Large Language Models](https://arxiv.org/abs/2308.07124). Please see [here](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py) for the reference implementation used in the paper. Currently only `python` evaluation is supported.

-## Setup Environment
+## Setup Environment and LLM Configuration

-Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to setup local develop environment for OpenDevin.
-
-
-## Configure OpenDevin and your LLM
-
-Create a `config.toml` file if it does not exist at the root of the workspace.
-
-Add the following configurations:
-
-```toml
-[core]
-max_iterations = 100
-cache_dir = "/tmp/cache"
-ssh_hostname = "localhost"
-
-[sandbox]
-enable_auto_lint = true
-
-# TODO: Change these to the model you want to evaluate
-[llm.eval_gpt4_1106_preview]
-model = "gpt-4-1106-preview"
-api_key = "XXX"
-temperature = 0.0
-
-[llm.eval_some_openai_compatible_model]
-model = "openai/MODEL_NAME"
-base_url = "https://OPENAI_COMPATIBLE_URL/v1"
-api_key = "XXX"
-temperature = 0.0
-```
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

 ## Run Inference on HumanEvalFix

--- a/evaluation/humanevalfix/run_infer.py
+++ b/evaluation/humanevalfix/run_infer.py
@@ -9,9 +9,9 @@ TODOs:
 """

 import asyncio
-import logging
 import os
-import pathlib
+import tempfile
+from typing import Any

 import pandas as pd
 from datasets import load_dataset
@@ -19,20 +19,25 @@ from evaluate import load

 from evaluation.utils.shared import (
    EvalMetadata,
+    EvalOutput,
    codeact_user_response,
    make_metadata,
    prepare_dataset,
+    reset_logger_for_multiprocessing,
    run_evaluation,
 )
-from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
-from opendevin.core.logger import get_console_handler
+from opendevin.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    parse_arguments,
+)
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import run_agent_controller
-from opendevin.llm.llm import LLM
-
-config = load_app_config()
+from opendevin.core.main import create_runtime, run_controller
+from opendevin.events.action import CmdRunAction
+from opendevin.events.observation import CmdOutputObservation
+from opendevin.runtime.runtime import Runtime

 IMPORT_HELPER = {
    'python': [
@@ -72,19 +77,105 @@ AGENT_CLS_TO_INST_SUFFIX = {
 }


-def get_test_result(instance, path, language='python', timeout=10):
-    # Evaluation reference: https://github.com/bigcode-project/bigcode-evaluation-harness/blob/84b96da31b7f840b55c5733325346176140cdb6b/bigcode_eval/tasks/humanevalpack.py#L347
+def get_config(
+    metadata: EvalMetadata,
+) -> AppConfig:
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_devin=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            container_image='ubuntu:22.04',
+            enable_auto_lint=True,
+            use_host_network=False,
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    config.set_llm_config(metadata.llm_config)
+    return config
+
+
+def _get_instance_id(instance: pd.Series) -> str:
+    return instance.task_id.replace('/', '__')
+
+
+async def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # supplies the buggy program and the name of the script
+):
+    """Initialize the runtime for the agent.
+
+    Runs before the agent: writes the instance's buggy solution and tests to /workspace/<id>.py.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
+    obs: CmdOutputObservation
+
+    action = CmdRunAction(command='mkdir -p /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    action = CmdRunAction(command='cd /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    problem_statement = (
+        instance.declaration + instance.buggy_solution + '\n' + instance.test
+    )
+    filename = f'{_get_instance_id(instance)}.py'
+    with tempfile.TemporaryDirectory() as tmpdir:
+        host_script_path = os.path.join(tmpdir, filename)
+        with open(host_script_path, 'w') as f:
+            f.write(problem_statement)
+        await runtime.copy_to(
+            host_script_path,
+            '/workspace',
+        )
+
+    # fail fast if the script did not make it into the sandbox
+    action = CmdRunAction(command=f'ls /workspace/{_get_instance_id(instance)}.py')
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
+
+
+async def complete_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # supplies the script name and the reference tests
+) -> dict[str, Any]:
+    """Complete the runtime for the agent.
+
+    This function is called after the agent has finished running.
+    If you need to do something in the sandbox to get the correctness metric after
+    the agent has run, modify this function.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
+    obs: CmdOutputObservation
+
+    # default value
+    language = 'python'
+    timeout = 10
+
    test_result = {'result': {}, 'metadata': {}}
    code_metric = load('Muennighoff/code_eval_octopack')
    timeout = LANGUAGE_TO_TIMEOUT[language]
    num_workers = LANGUAGE_TO_NUM_WORKERS[language]
    python_imports = '\n'.join(IMPORT_HELPER[language])

-    # Load function from path
-    with open(path, 'r') as f:
-        function = f.read()
+    action = CmdRunAction(
+        command=f'cat /workspace/{_get_instance_id(instance)}.py', keep_prompt=False
+    )
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0

-    function = [[python_imports + '\n' + function.strip()]]
+    function = obs.content.replace('\r\n', '\n')
+    logger.info(f'Function: {function}')
+    function = [[python_imports + '\n' + function]]

    results, logs = code_metric.compute(
        references=[instance.test],
@@ -99,129 +190,79 @@ def get_test_result(instance, path, language='python', timeout=10):
        'timeout': timeout,
        'num_workers': num_workers,
    }
+    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
    return test_result


-def process_instance(
+async def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
-):
-    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
-    old_workspace_mount_path = config.workspace_mount_path
-    old_workspace_base = config.workspace_base
+) -> EvalOutput:
+    config = get_config(metadata)
+    # use a session id for concurrent evaluation
+    sid = _get_instance_id(instance)

-    try:
-        workspace_mount_path = os.path.join(
-            config.workspace_mount_path, '_eval_workspace'
-        )
-        # create process-specific workspace dir
-        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
-        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance.task_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance.task_id}.')

-        # reset workspace to config
-        config.workspace_base = workspace_mount_path
-        config.workspace_mount_path = workspace_mount_path
+    # Create file with HumanEvalFix problem
+    # Prompt reference: https://github.com/bigcode-project/bigcode-evaluation-harness/blob/84b96da31b7f840b55c5733325346176140cdb6b/bigcode_eval/tasks/humanevalpack.py#L509
+    problem_statement = (
+        instance.declaration + instance.buggy_solution + '\n' + instance.test
+    )

-        # use a session id for concurrent evaluation
-        sid = instance.task_id.replace('/', '__')
+    # Prepare instruction
+    instruction = (
+        f'Please fix the function in {sid}.py such that all test cases pass.\n'
+        'Environment has been set up for you to start working. You may assume all necessary tools are installed.\n\n'
+        '# Problem Statement\n'
+        f'{problem_statement}\n\n'
+    )
+    instruction += (
+        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+        'You should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\n'
+        'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
+    )
+    # NOTE: You can actually set slightly different instruction for different agents
+    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

-        # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
-        if reset_logger:
-            # Set up logger
-            log_file = os.path.join(
-                metadata.eval_output_dir,
-                'logs',
-                f'instance_{sid}.log',
-            )
-            # Remove all existing handlers from logger
-            for handler in logger.handlers[:]:
-                logger.removeHandler(handler)
-            # add back the console handler to print ONE line
-            logger.addHandler(get_console_handler())
-            logger.info(
-                f'Starting evaluation for instance {instance.task_id}.\nLOG:   tail -f {log_file}'
-            )
-            # Remove all existing handlers from logger
-            for handler in logger.handlers[:]:
-                logger.removeHandler(handler)
-            file_handler = logging.FileHandler(log_file)
-            file_handler.setFormatter(
-                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-            )
-            logger.addHandler(file_handler)
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    runtime = await create_runtime(config, sid=sid)
+    await initialize_runtime(runtime, instance)
+    state: State | None = await run_controller(
+        config=config,
+        task_str=instruction,
+        runtime=runtime,
+        fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+            metadata.agent_class
+        ),
+    )

-        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+    if state is None:
+        raise ValueError('State should not be None.')
+    metrics = state.metrics.get() if state.metrics else None
+    test_result = await complete_runtime(runtime, instance)

-        # Create file with HumanEvalFix problem
-        # Prompt reference: https://github.com/bigcode-project/bigcode-evaluation-harness/blob/84b96da31b7f840b55c5733325346176140cdb6b/bigcode_eval/tasks/humanevalpack.py#L509
-        problem_statement = (
-            instance.declaration + instance.buggy_solution + '\n' + instance.test
-        )
-        path = os.path.join(workspace_mount_path, f'{sid}.py')
-        with open(path, 'w') as f:
-            f.write(problem_statement)
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()

-        # Prepare instruction
-        instruction = (
-            f'Please fix the function in {instance.task_id.replace("/", "__")}.py such that all test cases pass.\n'
-            'Environment has been set up for you to start working. You may assume all necessary tools are installed.\n\n'
-            '# Problem Statement\n'
-            f'{problem_statement}\n\n'
-        )
-        instruction += (
-            'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
-            'You should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\n'
-            'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
-        )
-        # NOTE: You can actually set slightly different instruction for different agents
-        instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
-
-        # Here's how you can run the agent (similar to the `main` function) and get the final task state
-        state: State | None = asyncio.run(
-            run_agent_controller(
-                agent,
-                instruction,
-                max_iterations=metadata.max_iterations,
-                max_budget_per_task=config.max_budget_per_task,
-                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
-                    agent.__class__.__name__
-                ),
-                sid=sid,
-            )
-        )
-
-        # ======= Attempt to evaluate the agent's edits =======
-        test_result = get_test_result(instance, path)
-
-        # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
-        # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
-        if state is None:
-            raise ValueError('State should not be None.')
-        metrics = state.metrics.get() if state.metrics else None
-
-        # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
-        # for compatibility with the existing output format, we can remake the pairs here
-        # remove when it becomes unnecessary
-        histories = state.history.compatibility_for_eval_history_pairs()
-
-        # Save the output
-        output = {
-            'task_id': instance.task_id,
-            'instruction': instruction,
-            'metadata': metadata.model_dump(),
-            'history': histories,
-            'metrics': metrics,
-            'error': state.last_error if state and state.last_error else None,
-            'test_result': test_result,
-        }
-    except Exception:
-        logger.error('Process instance failed')
-        raise
-    finally:
-        config.workspace_mount_path = old_workspace_mount_path
-        config.workspace_base = old_workspace_base
+    # Save the output
+    output = EvalOutput(
+        instance_id=instance.task_id,
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result=test_result,
+    )
    return output


@@ -234,28 +275,31 @@ if __name__ == '__main__':
        'bigcode/humanevalpack', 'python'
    )  # TODO: Support other languages
    hefix_tests = dataset['test'].to_pandas()
+    hefix_tests['instance_id'] = hefix_tests['task_id']  # keep task_id: process_instance and _get_instance_id still read it

-    id_column = 'task_id'
-
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
-    logger.info(f'Config for evaluation: {config}')
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config,
-        args.dataset_name,
+        'humanevalfix-python',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
+    instances = prepare_dataset(hefix_tests, output_file, args.eval_n_limit)

-    run_evaluation(
-        instances,
-        metadata,
-        output_file,
-        args.eval_num_workers,
-        process_instance,
-        id_column,
+    asyncio.run(
+        run_evaluation(
+            instances,
+            metadata,
+            output_file,
+            args.eval_num_workers,
+            process_instance,
+        )
    )
--- a/evaluation/humanevalfix/scripts/run_infer.sh
+++ b/evaluation/humanevalfix/scripts/run_infer.sh
--- a/evaluation/logic_reasoning/Dockerfile
+++ b/evaluation/logic_reasoning/Dockerfile
@@ -0,0 +1,7 @@
+FROM ubuntu:22.04
+
+RUN apt-get update && apt-get install -y python3 python3-pip
+
+RUN pip install scitools-pyke
+
+# docker build -t xingyaoww/od_logic_reasoning .
--- a/evaluation/logic_reasoning/README.md
+++ b/evaluation/logic_reasoning/README.md
@@ -2,38 +2,13 @@

 This folder contains evaluation harness for evaluating agents on the logic reasoning benchmark [ProntoQA](https://github.com/asaparov/prontoqa) and [ProofWriter](https://allenai.org/data/proofwriter).

-## Configure OpenDevin and your LLM
+## Setup Environment and LLM Configuration

-Create a `config.toml` file if it does not exist at the root of the workspace.
-
-Add the following configurations:
-
-```toml
-[core]
-max_iterations = 100
-cache_dir = "/tmp/cache"
-ssh_hostname = "localhost"
-
-[sandbox]
-enable_auto_lint = true
-
-# TODO: Change these to the model you want to evaluate
-[llm.eval_gpt4_1106_preview_llm]
-model = "gpt-4-1106-preview"
-api_key = "XXX"
-temperature = 0.0
-
-[llm.eval_some_openai_compatible_model_llm]
-model = "openai/MODEL_NAME"
-base_url = "https://OPENAI_COMPATIBLE_URL/v1"
-api_key = "XXX"
-temperature = 0.0
-```
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

 ## Run Inference on logic_reasoning
-The following code will run inference on the first example of the ProntoQA dataset,
-using OpenDevin 0.6.2 version.
+The following code will run inference on the first example of the ProofWriter dataset:

 ```bash
-./evaluation/logic_reasoning/scripts/run_infer.sh ProntoQA eval_gpt4_1106_preview_llm 0.6.2 1
+./evaluation/logic_reasoning/scripts/run_infer.sh eval_gpt4_1106_preview_llm ProofWriter
 ```
--- a/evaluation/logic_reasoning/instruction.txt
+++ b/evaluation/logic_reasoning/instruction.txt
@@ -3,12 +3,12 @@ you can interact with an interactive Python (Jupyter Notebook) environment and r
 In this task, you need to use the code in [[logic_inference_path.py]] to help you. Specifically, you first need to instantiate a **LogicInferenceEngine** class and use the **safe_execute_program** method to prove the **logic programs**. You should receive *answer*, *flag*, *error_message* from the output.

 An example would be look like this:
-    <execute_ipython>
-    import sys
-    sys.path.append(workspace_mount_path)
-    engine = LogicInferenceEngine(dataset_name, workspace_mount_path)
-    answer, flag, error_message = engine.safe_execute_program(logic_programs)
-    </execute_ipython>
+<execute_ipython>
+import sys
+sys.path.append('/workspace')
+engine = LogicInferenceEngine()
+answer, flag, error_message = engine.safe_execute_program(logic_programs)
+</execute_ipython>

 Please send the *answer* variable through message.

--- a/evaluation/logic_reasoning/logic_inference.py
+++ b/evaluation/logic_reasoning/logic_inference.py
@@ -191,9 +191,9 @@ class PykeProgram:


 class LogicInferenceEngine:
-    def __init__(self, dataset_name, workspace_mount_path):
-        self.dataset_name = dataset_name
-        self.workspace_mount_path = workspace_mount_path
+    def __init__(self):
+        self.dataset_name = os.environ.get('DATASET_NAME', 'ProofWriter')
+        self.workspace_mount_path = '/workspace'

    def random_backup(self):
        if self.dataset_name == 'ProntoQA':
--- a/evaluation/logic_reasoning/run_infer.py
+++ b/evaluation/logic_reasoning/run_infer.py
@@ -1,29 +1,35 @@
 import asyncio
-import logging
 import os
-import pathlib
-import shutil

 import pandas as pd
 from datasets import load_dataset

-from evaluation.swe_bench.swe_env_box import DockerSSHBox
 from evaluation.utils.shared import (
    EvalMetadata,
+    EvalOutput,
    codeact_user_response,
    make_metadata,
    prepare_dataset,
+    reset_logger_for_multiprocessing,
    run_evaluation,
 )
-from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
-from opendevin.core.logger import get_console_handler
+from opendevin.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    get_parser,
+)
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import run_agent_controller
-from opendevin.llm.llm import LLM
-
-config = load_app_config()
+from opendevin.core.main import create_runtime, run_controller
+from opendevin.events.action import (
+    AgentFinishAction,
+    CmdRunAction,
+    IPythonRunCellAction,
+    MessageAction,
+)
+from opendevin.events.observation import CmdOutputObservation
+from opendevin.runtime.runtime import Runtime

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
@@ -34,6 +40,28 @@ AGENT_CLS_TO_INST_SUFFIX = {
 }


+def get_config(
+    metadata: EvalMetadata,
+) -> AppConfig:
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_devin=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            container_image='xingyaoww/od-eval-logic-reasoning:v1.0',
+            enable_auto_lint=True,
+            use_host_network=False,
+            od_runtime_extra_deps='$OD_INTERPRETER_PATH -m pip install scitools-pyke',
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    config.set_llm_config(metadata.llm_config)
+    return config
+
+
 def get_choice(answer_str):
    choices = [
        'A',
@@ -83,7 +111,7 @@ def get_test_result(
        'the correct answer is',
        'The correct answer is',
        'The correct option is',
-        'Thus, the answer is',
+        'the answer is',
    ]
    if prediction is None:
        for indicator in indicators:
@@ -97,162 +125,143 @@ def get_test_result(
    return test_result


-def process_instance(
+CUR_EVAL_DIR = os.path.dirname(__file__)
+
+
+async def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # this argument is not required
+):
+    """Initialize the runtime for the agent.
+
+    Runs before the agent: copies logic_inference.py into /workspace and sets DATASET_NAME in the sandbox.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
+    obs: CmdOutputObservation
+
+    # Create the working directory and switch into it
+    action = CmdRunAction(command='mkdir -p /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    action = CmdRunAction(command='cd /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    # copy logic_inference.py to /workspace
+    await runtime.copy_to(
+        os.path.join(CUR_EVAL_DIR, 'logic_inference.py'), '/workspace'
+    )
+    # check if the file exists
+    obs = await runtime.run_action(CmdRunAction(command='ls /workspace'))
+    assert obs.exit_code == 0
+    assert 'logic_inference.py' in obs.content
+    # NOTE(review): `metadata` is not a parameter here; it resolves to the module-level name assigned under __main__ — confirm
+    await runtime.add_env_vars({'DATASET_NAME': metadata.dataset})
+
+    action = CmdRunAction(command='mkdir -p /workspace/.cache_program')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    action = IPythonRunCellAction(code='%pip install scitools-pyke')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    ipynb_obs = await runtime.run_action(action)
+    logger.info(ipynb_obs, extra={'msg_type': 'OBSERVATION'})
+
+    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
+
+
+# Prepare instruction
+with open(os.path.join(CUR_EVAL_DIR, 'instruction.txt'), 'r') as f:
+    INSTRUCTION_TEMPLATE = f.read()
+
+
+async def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
 ):
-    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
-    old_workspace_mount_path = config.workspace_mount_path
-    old_workspace_base = config.workspace_base
+    config = get_config(metadata)

-    try:
-        workspace_mount_path = os.path.join(
-            config.workspace_mount_path, '_eval_workspace'
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance['instance_id'], log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance["instance_id"]}.')
+
+    instance_logic_programs = instance['raw_logic_programs'][0].strip()
+    instruction = (
+        INSTRUCTION_TEMPLATE.replace('[[dataset_name]]', dataset_name)
+        .replace('[[logic_programs]]', instance_logic_programs)
+        .replace('[[logic_inference_path.py]]', '/workspace/logic_inference.py')
+    )
+
+    # NOTE: You can actually set slightly different instruction for different agents
+    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
+
+    # use a session id for concurrent evaluation
+    sid = instance['instance_id']
+
+    runtime = await create_runtime(config, sid=sid)
+    await initialize_runtime(runtime, instance)
+
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    state: State | None = await (  # already inside a running event loop; asyncio.run() here would raise RuntimeError
+        run_controller(
+            config=config,
+            task_str=instruction,
+            runtime=runtime,
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+                metadata.agent_class
+            ),
        )
-        # create process-specific workspace dir
-        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
-        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+    )
+    # ======= Attempt to evaluate the agent's edits =======
+    # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

-        # reset workspace to config
-        config.workspace_base = workspace_mount_path
-        config.workspace_mount_path = workspace_mount_path
+    if state is None:
+        raise ValueError('State should not be None.')

-        # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
-        if reset_logger:
-            # Set up logger
-            log_file = os.path.join(
-                metadata.eval_output_dir, 'logs', f'instance_{instance["id"]}.log'
-            )
-            # Remove all existing handlers from logger
-            for handler in logger.handlers[:]:
-                logger.removeHandler(handler)
-            # add back the console handler to print ONE line
-            logger.addHandler(get_console_handler())
-            logger.info(
-                f'Starting evaluation for instance {instance["id"]}.\nLOG:   tail -f {log_file}'
-            )
-            # Remove all existing handlers from logger
-            for handler in logger.handlers[:]:
-                logger.removeHandler(handler)
-            file_handler = logging.FileHandler(log_file)
-            file_handler.setFormatter(
-                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-            )
-            logger.addHandler(file_handler)
+    final_message = ''
+    for event in state.history.get_events(reverse=True):
+        if isinstance(event, AgentFinishAction):
+            final_message = event.thought
+            break
+        elif isinstance(event, MessageAction):
+            final_message = event.content
+            break

-        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+    final_message = final_message.strip("'")
+    logger.info(
+        f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}'
+    )

-        # sandbox = DockerSSHBox()
-        logic_inference_path = os.path.join(workspace_mount_path, 'logic_inference.py')
-        if not os.path.exists(logic_inference_path):
-            shutil.copyfile(
-                './evaluation/logic_reasoning/logic_inference.py', logic_inference_path
-            )
-        logger.info(f'logic_inference.py copied to {workspace_mount_path}')
+    test_result = get_test_result(
+        model_answer=final_message, ground_truth=instance['answer']
+    )
+    test_result['final_message'] = final_message

-        cache_dir = os.path.join(workspace_mount_path, '.cache_program')
-        if not os.path.exists(cache_dir):
-            os.makedirs(cache_dir)
-
-        # Prepare instruction
-
-        with open('./evaluation/logic_reasoning/instruction.txt', 'r') as f:
-            instruction = f.read()
-
-        instance_logic_programs = instance['raw_logic_programs'][0].strip()
-        instruction = instruction.replace('[[dataset_name]]', dataset_name)
-        instruction = instruction.replace('[[logic_programs]]', instance_logic_programs)
-        instruction = instruction.replace(
-            '[[logic_inference_path.py]]', logic_inference_path
-        )
-
-        # NOTE: You can actually set slightly different instruction for different agents
-        instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
-
-        # use a session id for concurrent evaluation
-        sid = instance['id'] + '_' + str(os.getpid())
-        sandbox = DockerSSHBox(
-            config=config.sandbox,
-            persist_sandbox=False,
-            workspace_mount_path=config.workspace_mount_path,
-            sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
-            cache_dir=config.cache_dir,
-            run_as_devin=config.run_as_devin,
-            sid=sid,
-        )
-        exit_code, command_output = sandbox.execute('pip install scitools-pyke')
-
-        # Here's how you can run the agent (similar to the `main` function) and get the final task state
-        state: State | None = asyncio.run(
-            run_agent_controller(
-                agent,
-                instruction,
-                max_iterations=metadata.max_iterations,
-                max_budget_per_task=config.max_budget_per_task,
-                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
-                    agent.__class__.__name__
-                ),
-                sandbox=sandbox,
-                sid=sid,
-            )
-        )
-        # ======= Attempt to evaluate the agent's edits =======
-        # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
-        # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
-
-        if state is None:
-            raise ValueError('State should not be None.')
-
-        final_message = ''
-        messages = []
-        for event in state.history.get_events(reverse=True):
-            # will this be a MessageAction?
-            # TODO we can filter for types of events if we know what to expect
-            messages.append(event.content)
-            if str(event.content) in ["'A'", "'B'", "'C'"]:
-                final_message = event.content
-                break
-
-        final_message = final_message.strip("'")
-        logger.info(
-            f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}'
-        )
-
-        test_result = get_test_result(
-            model_answer=final_message, ground_truth=instance['answer']
-        )
-        metrics = state.metrics.get() if state.metrics else None
-
-        # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
-        # for compatibility with the existing output format, we can remake the pairs here
-        # remove when it becomes unnecessary
-        histories = state.history.compatibility_for_eval_history_pairs()
-
-        # Save the output
-        output = {
-            'id': instance['id'],
-            'instance': instance,
-            'instruction': instruction,
-            # 'metadata': metadata.model_dump(),
-            'history': histories,
-            'metrics': metrics,
-            'final_message': final_message,
-            'messages': messages,
-            'error': state.last_error if state and state.last_error else None,
-            'test_result': test_result,
-        }
-    except Exception:
-        logger.error('Process instance failed')
-        raise
-    finally:
-        config.workspace_mount_path = old_workspace_mount_path
-        config.workspace_base = old_workspace_base
-
-    # Close the sandbox
-    sandbox.close()
+    metrics = state.metrics.get() if state.metrics else None
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()

+    # Save the output
+    output = EvalOutput(
+        instance_id=instance['instance_id'],
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result=test_result,
+    )
    return output


@@ -262,7 +271,7 @@ if __name__ == '__main__':
        '--dataset',
        type=str,
        help='the logic reasoning dataset to evaluate on {ProntoQA, ProofWriter}',
-        default='ProntoQA',
+        default='ProofWriter',
    )
    parser.add_argument(
        '--data_split',
@@ -270,36 +279,32 @@ if __name__ == '__main__':
        help='data split to evaluate on {validation}',  # right now we only support validation split
        default='validation',
    )
-
    args, _ = parser.parse_known_args()
-    if args.directory:
-        config.workspace_base = os.path.abspath(args.directory)
-        print(f'Setting workspace base to {config.workspace_base}')

    dataset_name = args.dataset
    data_split = args.data_split
    dataset = load_dataset(f'renma/{dataset_name}')
-    logic_reasoning_tests = dataset[data_split]
+    dataset_df = dataset[data_split].to_pandas()
+    dataset_df.rename(columns={'id': 'instance_id'}, inplace=True)

-    id_column = 'id'
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
-    logger.info(f'Config for evaluation: {config}')
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config,
-        args.dataset_name,
+        dataset_name,
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
-    run_evaluation(
-        instances,
-        metadata,
-        output_file,
-        args.eval_num_workers,
-        process_instance,
-        id_column,
+    instances = prepare_dataset(dataset_df, output_file, args.eval_n_limit)
+    asyncio.run(
+        run_evaluation(
+            instances, metadata, output_file, args.eval_num_workers, process_instance
+        )
    )
--- a/evaluation/logic_reasoning/scripts/run_infer.sh
+++ b/evaluation/logic_reasoning/scripts/run_infer.sh
@@ -3,8 +3,8 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

-DATASET=$1
-MODEL_CONFIG=$2
+MODEL_CONFIG=$1
+DATASET=$2
 COMMIT_HASH=$3
 EVAL_LIMIT=$4
 AGENT=$5
@@ -23,6 +23,11 @@ if [ -z "$AGENT" ]; then
  AGENT="CodeActAgent"
 fi

+if [ -z "$DATASET" ]; then
+  echo "Dataset not specified, use default ProofWriter"
+  DATASET="ProofWriter"
+fi
+
 get_agent_version

 echo "AGENT: $AGENT"
--- a/evaluation/miniwob/Dockerfile
+++ b/evaluation/miniwob/Dockerfile
@@ -0,0 +1,10 @@
+FROM ubuntu:22.04
+
+RUN apt-get update && apt-get install -y python3 python3-pip git
+
+RUN git clone https://github.com/Farama-Foundation/miniwob-plusplus.git /miniwob-plusplus && \
+    git -C "/miniwob-plusplus" reset --hard 7fd85d71a4b60325c6585396ec4f48377d049838
+
+ENV MINIWOB_URL="file:///miniwob-plusplus/miniwob/html/miniwob/"
+
+# docker build -t xingyaoww/od-eval-miniwob:v1.0 .
--- a/evaluation/miniwob/README.md
+++ b/evaluation/miniwob/README.md
@@ -2,52 +2,9 @@

 This folder contains evaluation for [MiniWoB++](https://miniwob.farama.org/) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym) for easy evaluation of how well an agent capable of browsing can perform on synthetic web browsing tasks.

-## Setup OpenDevin Environment
+## Setup Environment and LLM Configuration

-Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to setup local develop environment for OpenDevin.
-
-## Configure OpenDevin and your LLM
-
-Create a `config.toml` file if it does not exist at the root of the workspace.
-
-Add the following configurations:
-
-```toml
-[core]
-max_iterations = 100
-cache_dir = "/tmp/cache"
-ssh_hostname = "localhost"
-
-[sandbox]
-box_type = "ssh"
-timeout = 120
-
-# TODO: Change these to the model you want to evaluate
-[llm.eval_gpt4_1106_preview]
-model = "gpt-4-1106-preview"
-api_key = "XXX"
-temperature = 0.0
-
-[llm.eval_some_openai_compatible_model]
-model = "openai/MODEL_NAME"
-base_url = "https://OPENAI_COMPATIBLE_URL/v1"
-api_key = "XXX"
-temperature = 0.0
-```
-
-## Setup MiniWoB++ Environment and Environment Variables of MiniWoB++
-MiniWoB++ requires you to set up websites containing a static website that is accessible via URL to the machine running the OpenDevin agents.
-
- Clone miniwob (use a specific frozen commit for reproducibility)
-```sh
-git clone git@github.com:Farama-Foundation/miniwob-plusplus.git
-git -C "./miniwob-plusplus" reset --hard 7fd85d71a4b60325c6585396ec4f48377d049838
-```
-
- Setup Miniwob URL (change `PATH_TO_MINIWOB_CLONED_REPO` here to the absolute path to your `miniwob-plusplus` folder) in `evaluation/miniwob/scripts/run_infer.sh`
-```sh
-export MINIWOB_URL="file://<PATH_TO_MINIWOB_CLONED_REPO>/miniwob/html/miniwob/"
-```
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

 ## Test if your environment works

@@ -56,7 +13,7 @@ Access with browser the above MiniWoB URLs and see if they load correctly.
 ## Run Evaluation

 ```sh
-bash evaluation/miniwob/scripts/run_infer.sh
+./evaluation/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval
 ```

 Results will be in `evaluation/evaluation_outputs/outputs/miniwob/`
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -1,7 +1,7 @@
 import asyncio
 import json
-import logging
 import os
+from typing import Any

 import browsergym.miniwob  # noqa F401 register miniwob tasks as gym environments
 import gymnasium as gym
@@ -9,91 +9,131 @@ import pandas as pd

 from evaluation.utils.shared import (
    EvalMetadata,
+    EvalOutput,
    make_metadata,
    prepare_dataset,
+    reset_logger_for_multiprocessing,
    run_evaluation,
 )
-from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
-from opendevin.core.logger import get_console_handler
+from opendevin.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    parse_arguments,
+)
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import run_agent_controller
-from opendevin.llm.llm import LLM
-from opendevin.runtime.docker.ssh_box import DockerSSHBox
-from opendevin.runtime.tools import RuntimeTool
-
-config = load_app_config()
+from opendevin.core.main import create_runtime, run_controller
+from opendevin.events.action import (
+    BrowseInteractiveAction,
+    CmdRunAction,
+    MessageAction,
+)
+from opendevin.events.observation import CmdOutputObservation
+from opendevin.runtime.browser.browser_env import (
+    BROWSER_EVAL_GET_GOAL_ACTION,
+    BROWSER_EVAL_GET_REWARDS_ACTION,
+)
+from opendevin.runtime.runtime import Runtime

 SUPPORTED_AGENT_CLS = {'BrowsingAgent'}

-docker_ssh_box: DockerSSHBox | None = None
+
+def get_config(
+    metadata: EvalMetadata,
+    env_id: str,
+) -> AppConfig:
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_devin=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            container_image='xingyaoww/od-eval-miniwob:v1.0',
+            enable_auto_lint=True,
+            use_host_network=False,
+            browsergym_eval_env=env_id,
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    config.set_llm_config(metadata.llm_config)
+    return config


-def get_sandbox():
-    global docker_ssh_box
-    if docker_ssh_box is None:
-        docker_ssh_box = DockerSSHBox(
-            config=config.sandbox,
-            persist_sandbox=False,
-            workspace_mount_path=config.workspace_mount_path,
-            sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
-            cache_dir=config.cache_dir,
-            run_as_devin=config.run_as_devin,
-        )
-    return docker_ssh_box
+async def initialize_runtime(
+    runtime: Runtime,
+) -> str:
+    """Initialize the runtime for the agent.
+
+    This function is called before the runtime is used to run the agent.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
+    obs: CmdOutputObservation
+
+    # Make sure the /workspace directory exists
+    action = CmdRunAction(command='mkdir -p /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    goal = obs.content
+
+    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
+    return goal


-def process_instance(
+async def complete_runtime(
+    runtime: Runtime,
+) -> dict[str, Any]:
+    """Complete the runtime for the agent.
+
+    This function is called after the agent has finished running in the runtime.
+    If you need to do something in the sandbox to get the correctness metric after
+    the agent has run, modify this function.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
+    obs: CmdOutputObservation
+
+    action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
+    return {
+        'rewards': json.loads(obs.content),
+    }
+
+
+async def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
-):
-    # Create the agent
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
+) -> EvalOutput:
    env_id = instance.id
+    config = get_config(metadata, env_id)
+
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
-        # Set up logger
-        log_file = os.path.join(
-            metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        # add back the console handler to print ONE line
-        logger.addHandler(get_console_handler())
-        logger.info(
-            f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setFormatter(
-            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-        )
-        logger.addHandler(file_handler)
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, env_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {env_id}.')

-    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    runtime_tools_config = {
-        RuntimeTool.BROWSER: {
-            'browsergym_eval': env_id,
-            'browsergym_eval_save_dir': metadata.eval_output_dir,
-        }
-    }
+    runtime = await create_runtime(config, sid=env_id)
+    task_str = await initialize_runtime(runtime)

    state: State | None = asyncio.run(
-        run_agent_controller(
-            agent,
-            'PLACEHOLDER_GOAL',
-            max_iterations=metadata.max_iterations,
-            max_budget_per_task=config.max_budget_per_task,
-            runtime_tools_config=runtime_tools_config,
-            sandbox=get_sandbox(),
-            sid=env_id,
+        run_controller(
+            config=config,
+            task_str=task_str,  # take output from initialize_runtime
+            runtime=runtime,
        )
    )

@@ -106,18 +146,17 @@ def process_instance(
        raise ValueError('State should not be None.')

    metrics = state.metrics.get() if state.metrics else None
-    browsergym_eval_dir = os.path.join(metadata.eval_output_dir, env_id.split('/')[1])
-    # read goal
-    with open(
-        os.path.join(browsergym_eval_dir, 'goal.txt'), 'r', encoding='utf-8'
-    ) as f:
-        instruction = f.read()
-    # read reward
-    with open(
-        os.path.join(browsergym_eval_dir, 'rewards.json'), 'r', encoding='utf-8'
-    ) as f:
-        rewards = json.load(f)
-        reward = max(rewards)
+
+    # Instruction is the first message from the USER
+    instruction = ''
+    for event in state.history.get_events():
+        if isinstance(event, MessageAction):
+            instruction = event.content
+            break
+
+    return_val = await complete_runtime(runtime)
+    logger.info(f'Return value from complete_runtime: {return_val}')
+    reward = max(return_val['rewards'])

    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
@@ -125,16 +164,17 @@ def process_instance(
    histories = state.history.compatibility_for_eval_history_pairs()

    # Save the output
-    output = {
-        'instance_id': env_id,
-        'instruction': instruction,
-        'metadata': metadata.model_dump(),
-        'history': histories,
-        'metrics': metrics,
-        'error': state.last_error if state and state.last_error else None,
-        'test_result': reward,
-    }
-
+    output = EvalOutput(
+        instance_id=env_id,
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result={
+            'reward': reward,
+        },
+    )
    return output


@@ -143,7 +183,7 @@ if __name__ == '__main__':

    dataset = pd.DataFrame(
        {
-            'id': [
+            'instance_id': [
                id
                for id in gym.envs.registry.keys()
                if id.startswith('browsergym/miniwob')
@@ -151,26 +191,25 @@ if __name__ == '__main__':
        }
    )

-    id_column = 'id'
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
-    logger.info(f'Config for evaluation: {config}')
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config,
-        args.dataset_name,
+        'miniwob',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
-    _ = get_sandbox()  # Initialize the sandbox
-    run_evaluation(
-        instances,
-        metadata,
-        output_file,
-        args.eval_num_workers,
-        process_instance,
-        id_column,
+    instances = prepare_dataset(dataset, output_file, args.eval_n_limit)
+
+    asyncio.run(
+        run_evaluation(
+            instances, metadata, output_file, args.eval_num_workers, process_instance
+        )
    )
--- a/evaluation/miniwob/scripts/run_infer.sh
+++ b/evaluation/miniwob/scripts/run_infer.sh
@@ -3,14 +3,10 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

-# configure miniwob website, change URL to yours
-export MINIWOB_URL="file:///home/fangzhex/miniwob-plusplus/miniwob/html/miniwob/"
-
 # configure browsing agent
 export USE_NAV="false"
 export USE_CONCISE_ANSWER="true"

-
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -42,7 +38,7 @@ COMMAND="poetry run python evaluation/miniwob/run_infer.py \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
  --max-chars 10000000 \
-  --eval-num-workers $NUM_WORKERS \
+  --eval-num-workers $NUM_WORKERS"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/mint/Dockerfile
+++ b/evaluation/mint/Dockerfile
@@ -0,0 +1,10 @@
+FROM ubuntu:22.04
+
+RUN apt-get update && apt-get install -y python3 python3-pip git gcc
+
+WORKDIR /root
+
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+# docker build -t xingyaoww/od-eval-mint:v1.0 .
--- a/evaluation/mint/README.md
+++ b/evaluation/mint/README.md
@@ -2,9 +2,11 @@

 This folder contains the evaluation harness for the [MINT benchmark](https://arxiv.org/abs/2309.10691) on LLMs' ability to solve tasks with multi-turn interactions.

-## Configure OpenDevin and LM
+We support evaluation of the [Eurus subset, which focuses on math and code reasoning](https://arxiv.org/abs/2404.02078), including MATH, MMLU, TheoremQA, HumanEval, and MBPP.

-Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.
+## Setup Environment and LLM Configuration
+
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

 ## Start the evaluation

--- a/evaluation/mint/run_infer.py
+++ b/evaluation/mint/run_infer.py
@@ -1,33 +1,36 @@
-import asyncio
 import functools
-import logging
 import os
-import pathlib
 from typing import Any, Dict

+import pandas as pd
 from datasets import load_dataset

-from evaluation.swe_bench.swe_env_box import DockerSSHBox
+from evaluation.mint.datatypes import TaskState
+from evaluation.mint.env import SimplifiedEnv
+from evaluation.mint.prompts import ToolPromptTemplate
+from evaluation.mint.tasks import Task
 from evaluation.utils.shared import (
    EvalMetadata,
+    EvalOutput,
    make_metadata,
    prepare_dataset,
+    reset_logger_for_multiprocessing,
    run_evaluation,
 )
-from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
-from opendevin.core.logger import get_console_handler
+from opendevin.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    get_parser,
+)
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import run_agent_controller
-from opendevin.llm.llm import LLM
-
-from .datatypes import TaskState
-from .env import SimplifiedEnv
-from .prompts import ToolPromptTemplate
-from .tasks import Task
-
-config = load_app_config()
+from opendevin.core.main import create_runtime, run_controller
+from opendevin.events.action import (
+    CmdRunAction,
+)
+from opendevin.events.observation import CmdOutputObservation
+from opendevin.runtime.runtime import Runtime


 def codeact_user_response_mint(state: State, task: Task, task_config: Dict[str, int]):
@@ -42,7 +45,7 @@ def codeact_user_response_mint(state: State, task: Task, task_config: Dict[str,
    last_action = state.history.get_last_action()
    result_state: TaskState = env.step(last_action.message or '')

-    state.task_state = result_state
+    state.extra_data['task_state'] = result_state

    if not result_state.latest_output:
        # Task is finished
@@ -62,85 +65,107 @@ AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': '\nIMPORTANT: When your answer is confirmed by the user to be correct, you can exit using the following command: <execute_bash> exit </execute_bash>.\n'
 }

+with open(os.path.join(os.path.dirname(__file__), 'requirements.txt'), 'r') as f:
+    MINT_DEPENDENCIES = f.read().splitlines()

-def process_instance(
+
+def load_incontext_example(task_name: str, with_tool: bool = True):
+    assert with_tool, 'NOT with_tool is not supported yet'
+    subset = {
+        'gsm8k': 'reasoning',
+        'math': 'reasoning',
+        'mmlu': 'reasoning',
+        'theoremqa': 'reasoning',
+        'mbpp': 'mbpp',
+        'humaneval': 'humaneval',
+    }[task_name]
+    with open(
+        os.path.join(
+            os.path.dirname(__file__),
+            'tasks',
+            'in_context_examples',
+            subset,
+            'with_tool.txt',
+        ),
+        'r',
+    ) as f:
+        return f.read()
+
+
+def get_config(
+    metadata: EvalMetadata,
+) -> AppConfig:
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_devin=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            container_image='xingyaoww/od-eval-mint:v1.0',
+            enable_auto_lint=True,
+            use_host_network=False,
+            od_runtime_extra_deps=f'$OD_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    config.set_llm_config(metadata.llm_config)
+    return config
+
+
+async def initialize_runtime(runtime: Runtime):
+    """Initialize the runtime for the agent.
+
+    This function is called before the runtime is used to run the agent.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
+    obs: CmdOutputObservation
+
+    # Create /workspace and switch into it
+    action = CmdRunAction(command='mkdir -p /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    action = CmdRunAction(command='cd /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = await runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
+
+
+async def process_instance(
    instance: Any,
    metadata: EvalMetadata,
    reset_logger: bool = True,
 ):
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(metadata.llm_config))
-    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
-    # create process-specific workspace dir
-    workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
-    pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+    config = get_config(metadata)

    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
-        # Set up logger
-        log_file = os.path.join(
-            metadata.eval_output_dir, 'logs', f'instance_{instance.task_id}.log'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        # add back the console handler to print ONE line
-        logger.addHandler(get_console_handler())
-        logger.info(
-            f'Starting evaluation for instance {instance.task_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
-        )
-        # Remove all existing handlers from logger
-        for handler in logger.handlers[:]:
-            logger.removeHandler(handler)
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setFormatter(
-            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-        )
-        logger.addHandler(file_handler)
-
-    logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
-
-    # use a session id for concurrent processing
-    sid = instance.task_id + '_' + str(os.getpid())
-    sandbox = DockerSSHBox(
-        config=config.sandbox,
-        persist_sandbox=False,
-        workspace_mount_path=config.workspace_mount_path,
-        sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
-        cache_dir=config.cache_dir,
-        run_as_devin=config.run_as_devin,
-        sid=sid,
-    )
-
-    requirements_host_src = 'evaluation/mint/requirements.txt'
-    requirements_sandbox_dest = '/opendevin/plugins/mint/requirements.txt'
-    sandbox.copy_to(
-        host_src=requirements_host_src,
-        sandbox_dest=requirements_sandbox_dest,
-        recursive=False,
-    )
-    logger.info(
-        f'Copied files from [{requirements_host_src}] to [{requirements_sandbox_dest}] inside sandbox.'
-    )
-    exit_code, output = sandbox.execute(f'pip install -r {requirements_sandbox_dest}')
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    # Prepare instruction
    assert metadata.details is not None
    instruction = ToolPromptTemplate(use_tool=True)(
        max_total_steps=metadata.max_iterations,
        max_propose_solution=metadata.details['max_propose_solution'],
-        in_context_example=instance.in_context_example(
-            use_tool=True, with_feedback=False
-        ),
+        in_context_example=instance.in_context_example,
        task_prompt='Task:\n' + instance.prompt,
    )
    instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you or provide the concise RESULT inside <solution> tag AND NEVER ASK FOR HUMAN HELP.\n'

    # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
+    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    fake_user_response_fn = functools.partial(
-        AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[agent.__class__.__name__],
+        AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class],
        task=instance,
        task_config={
            'max_iterations': metadata.max_iterations,
@@ -148,24 +173,22 @@ def process_instance(
        },
    )

-    state: State | None = asyncio.run(
-        run_agent_controller(
-            agent,
-            instruction,
-            max_iterations=metadata.max_iterations,
-            max_budget_per_task=config.max_budget_per_task,
-            fake_user_response_fn=fake_user_response_fn,
-            sandbox=sandbox,
-            sid=sid,
-        )
+    runtime = await create_runtime(config, sid=instance.instance_id)
+    await initialize_runtime(runtime)
+
+    state: State | None = await run_controller(
+        config=config,
+        task_str=instruction,
+        runtime=runtime,
+        fake_user_response_fn=fake_user_response_fn,
    )

    if state is None:
        raise ValueError('State should not be None.')

    task_state = None
-    if hasattr(state, 'task_state'):
-        task_state = state.task_state
+    if 'task_state' in state.extra_data:
+        task_state = state.extra_data['task_state']
        logger.info('Task state: ' + str(task_state.to_dict()))

    metrics = state.metrics.get() if state.metrics else None
@@ -176,30 +199,37 @@ def process_instance(
    histories = state.history.compatibility_for_eval_history_pairs()

    # Save the output
-    output = {
-        'id': instance.task_id,
-        'instance': instance.to_dict(),
-        'instruction': instruction,
-        'metadata': metadata.model_dump(),
-        'history': histories,
-        'metrics': metrics,
-        'error': state.last_error if state and state.last_error else None,
-        'test_result': task_state.success if task_state else False,
-    }
-
-    # Close the sandbox
-    sandbox.close()
-
+    output = EvalOutput(
+        instance_id=instance.instance_id,
+        instance=instance.to_dict(),
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result={
+            'success': task_state.success if task_state else False,
+        },
+    )
    return output


 if __name__ == '__main__':
    parser = get_parser()

+    SUBSETS = [
+        # Eurus subset: https://arxiv.org/abs/2404.02078
+        'math',
+        # 'gsm8k',
+        'mmlu',
+        'theoremqa',
+        'mbpp',
+        'humaneval',
+    ]
    parser.add_argument(
        '--subset',
-        default='math',
-        choices=['math', 'gsm8k', 'mmlu', 'theoremqa', 'mbpp', 'humaneval'],
+        default='all',
+        choices=SUBSETS + ['all'],
        type=str,
        help='subset of the dataset to be used',
    )
@@ -214,19 +244,36 @@ if __name__ == '__main__':

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenDevin's repo
-    mint_dataset = load_dataset(
-        'ryanhoangt/xingyaoww-mint-bench', name=args.subset, split='test'
-    )
-    logger.info(f'Evaluating MINT - {args.subset} subset')
-    mint_tests = mint_dataset.to_pandas()
+    if args.subset == 'all':
+        subsets = SUBSETS
+    else:
+        subsets = [args.subset]

-    id_column = 'id'
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
-    logger.info(f'Config for evaluation: {config}')
+    dataset_dfs = []
+    for subset in subsets:
+        in_context_example = load_incontext_example(subset)
+        _cur_dataset = load_dataset(
+            'ryanhoangt/xingyaoww-mint-bench', name=subset, split='test'
+        )
+        logger.info(f'Loaded MINT - {subset} subset')
+        _df = _cur_dataset.to_pandas().rename(columns={'id': 'instance_id'})
+        _df['instance_id'] = _df['instance_id'].apply(lambda x: f'{subset}/{x}')  # noqa
+        _df['in_context_example'] = in_context_example
+        dataset_dfs.append(_df)
+        logger.info(f'Loaded {len(_df)} instances for subset: {subset}')
+
+    dataset_df = pd.concat(dataset_dfs)
+    logger.info(f'Loaded {len(dataset_df)} instances for subset: {subsets}')
+
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config,
-        args.dataset_name,
+        f'MINT-{args.subset}',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
@@ -234,12 +281,7 @@ if __name__ == '__main__':
        details={'max_propose_solution': args.max_propose_solution},
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    instances = prepare_dataset(mint_dataset, output_file, args.eval_n_limit, id_column)
+    instances = prepare_dataset(dataset_df, output_file, args.eval_n_limit)
    run_evaluation(
-        instances,
-        metadata,
-        output_file,
-        args.eval_num_workers,
-        process_instance,
-        id_column,
+        instances, metadata, output_file, args.eval_num_workers, process_instance
    )
--- a/evaluation/mint/scripts/run_infer.sh
+++ b/evaluation/mint/scripts/run_infer.sh
@@ -29,15 +29,16 @@ COMMAND="poetry run python ./evaluation/mint/run_infer.py \
    --llm-config $MODEL_CONFIG \
    --max-iterations 5 \
    --max-propose-solution 2 \
-    --eval-num-workers $NUM_WORKERS \
+    --eval-num-workers $NUM_WORKERS
+"

 if [ -n "$SUBSET" ]; then
  echo "SUBSET: $SUBSET"
  COMMAND="$COMMAND --subset $SUBSET"
 # otherwise default to use the math subset
 else
-  echo "SUBSET: math"
-  COMMAND="$COMMAND --subset math"
+  echo "SUBSET: all"
+  COMMAND="$COMMAND --subset all"
 fi

 if [ -n "$EVAL_LIMIT" ]; then
--- a/evaluation/ml_bench/README.md
+++ b/evaluation/ml_bench/README.md
@@ -10,40 +10,9 @@ The task introduces new challenges for LLMs, such as comprehending long and lang

 For more details on the ML-Bench task and dataset, please refer to the paper: [ML-Bench: Evaluating Large Language Models for Code Generation in Repository-Level Machine Learning Tasks](https://arxiv.org/abs/2311.09835).

-## Setup Environment
+## Setup Environment and LLM Configuration

-Please follow the [OpenDevin setup guide](https://github.com/OpenDevin/OpenDevin/blob/main/docs/setup.md) to set up the local development environment for OpenDevin.
-
-## Configure OpenDevin and your LLM
-
-Create a `config.toml` file if it does not exist at the root of the workspace.
-
-Add the following configurations:
-
-```toml
-[core]
-max_iterations = 100
-cache_dir = "/tmp/cache"
-ssh_hostname = "localhost"
-run_as_devin = false
-sandbox_container_image = "public.ecr.aws/i5g0m1f6/ml-bench" # Use the latest image from the ML-Bench repository
-
-[sandbox]
-enable_auto_lint = true
-
-
-# TODO: Change these to the model you want to evaluate
-[llm.eval_gpt4_1106_preview]
-model = "gpt-4-1106-preview"
-api_key = "XXX"
-temperature = 0.0
-
-[llm.eval_some_openai_compatible_model]
-model = "openai/MODEL_NAME"
-base_url = "https://OPENAI_COMPATIBLE_URL/v1"
-api_key = "XXX"
-temperature = 0.0
-```
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

 ## Run Inference on ML-Bench

--- a/Show More
+++ b/Show More