diff --git a/evaluation/regression/.gitignore b/evaluation/regression/.gitignore
deleted file mode 100644
index b00b59b516..0000000000
--- a/evaluation/regression/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-node_modules
-outputs
diff --git a/evaluation/regression/README.md b/evaluation/regression/README.md
deleted file mode 100644
index a7f532c0fc..0000000000
--- a/evaluation/regression/README.md
+++ /dev/null
@@ -1,70 +0,0 @@
-# OpenHands - Regression Test Framework
-
-OpenHands project is an open-source software engineering AI that can solve various software engineering tasks. This repository contains the regression test framework for OpenHands project.
-
-## Running the Tests
-
-To run the tests for OpenHands project, you can use the provided test runner script. Follow these steps:
-
-1. Ensure you have Python 3.6 or higher installed on your system.
-2. Install the required dependencies by running the following command in your terminal:
-   ```
-   pip install -r requirements.txt
-   ```
-3. Navigate to the root directory of the project.
-4. Run the test suite using the test runner script with the required arguments:
-   ```
-   python evaluation/regression/run_tests.py --OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxx --model=gpt-4o
-   ```
-   Replace `sk-xxxxxxxxxxxxxxxxxxxxxx` with your actual OpenAI API key. The default model is `gpt-4o`, but you can specify a different model if needed.
-
-The test runner will discover and execute all the test cases in the `cases/` directory, and display the results of the test suite, including the status of each individual test case and the overall summary.
-
-## Test Case Structure
-
-The test cases for OpenHands project are organized in the `cases/` directory. Each test case has the following structure:
-
-```
-cases/
-├── hello-world/
-│   ├── task.txt
-│   ├── outputs/
-│   │   └── codeact_agent/
-│   │       └── workspace/
-│   │           ├── hello_world.sh
-│   └── test_hello_world.py
-├── create_web_app/
-│   ├── task.txt
-│   ├── outputs/
-│   │   └── codeact_agent/
-│   │       └── workspace/
-│   │           ├── app.py
-│   │           ├── requirements.txt
-│   │           ├── static/
-│   │           └── templates/
-│   └── test_create_web_app.py
-└── ...
-```
-
-- `task.txt`: This file contains the task description provided by the user.
-- `outputs/`: This directory contains the output generated by OpenHands for each agent.
-- `outputs/*/workspace/`: This directory contains the actual output files generated by OpenHands.
-- `test_*.py`: These are the test scripts that validate the output of OpenHands.
-
-## Adding New Test Cases
-
-To add a new test case to the regression test framework, follow the same steps as described in the previous sections.
-
-## Customizing the Test Cases
-
-The test cases can be customized by modifying the fixtures defined in the `conftest.py` file. The available fixtures are:
-
-- `test_cases_dir`: The directory containing the test cases.
-- `task_file`: The path to the `task.txt` file for the current test case.
-- `workspace_dir`: The path to the `workspace/` directory for the current test case.
-- `model`: The model selected start the generation.
-- `run_test_case`: A fixture that runs OpenHands and generates the workspace for the current test case.
-
-You can modify these fixtures to change the behavior of the test cases or add new ones as needed.
-
-If you have any questions or need further assistance, feel free to reach out to the project maintainers.
diff --git a/evaluation/regression/cases/client-server/task.txt b/evaluation/regression/cases/client-server/task.txt
deleted file mode 100644
index 0249d4cd46..0000000000
--- a/evaluation/regression/cases/client-server/task.txt
+++ /dev/null
@@ -1 +0,0 @@
-Write an API server in node express which responds with a random number, and a frontend in React that displays the next number from the API
diff --git a/evaluation/regression/cases/express/task.txt b/evaluation/regression/cases/express/task.txt
deleted file mode 100644
index 0fcdf10f03..0000000000
--- a/evaluation/regression/cases/express/task.txt
+++ /dev/null
@@ -1 +0,0 @@
-Write a simple hello world server in node Express
diff --git a/evaluation/regression/cases/hello-name/start/hello_world.sh b/evaluation/regression/cases/hello-name/start/hello_world.sh
deleted file mode 100644
index 1a2a680f62..0000000000
--- a/evaluation/regression/cases/hello-name/start/hello_world.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/usr/bin/env bash
-echo "hello world"
diff --git a/evaluation/regression/cases/hello-name/task.txt b/evaluation/regression/cases/hello-name/task.txt
deleted file mode 100644
index cdafb00866..0000000000
--- a/evaluation/regression/cases/hello-name/task.txt
+++ /dev/null
@@ -1 +0,0 @@
-Rewrite the script so that it prints the user's name, using the first argument. If there's no name, default to "world"
diff --git a/evaluation/regression/cases/hello-world/task.txt b/evaluation/regression/cases/hello-world/task.txt
deleted file mode 100644
index b03a5abac1..0000000000
--- a/evaluation/regression/cases/hello-world/task.txt
+++ /dev/null
@@ -1 +0,0 @@
-Write a bash script named "hello_world.sh" that prints "Hello, World!"
diff --git a/evaluation/regression/cases/hello-world/test_hello_world.py b/evaluation/regression/cases/hello-world/test_hello_world.py
deleted file mode 100644
index 6b4b808c4e..0000000000
--- a/evaluation/regression/cases/hello-world/test_hello_world.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import os
-
-import pytest
-from conftest import agents
-
-
-@pytest.mark.parametrize('agent', agents())
-def test_hello_world(task_file, run_test_case, agent):
-    """Test case for the "Hello, World!" Bash script using different agents."""
-    # Run the test case for the specified agent
-    workspace_dir = run_test_case(agent, 'hello-world')
-
-    # Validate the generated workspace
-    assert os.path.exists(workspace_dir)
-    assert os.path.isfile(os.path.join(workspace_dir, 'hello_world.sh'))
-
-    # Execute the hello_world.sh script
-    os.chdir(workspace_dir)
-    output = os.popen('bash hello_world.sh').read()
-    assert output == 'Hello, World!\n'
diff --git a/evaluation/regression/cases/node-cli-rewrite/start/commands/length.py b/evaluation/regression/cases/node-cli-rewrite/start/commands/length.py
deleted file mode 100644
index 6389d4b30a..0000000000
--- a/evaluation/regression/cases/node-cli-rewrite/start/commands/length.py
+++ /dev/null
@@ -1,2 +0,0 @@
-def string_length(s):
-    return len(s)
diff --git a/evaluation/regression/cases/node-cli-rewrite/start/commands/lowercase.py b/evaluation/regression/cases/node-cli-rewrite/start/commands/lowercase.py
deleted file mode 100644
index 4f5d951bff..0000000000
--- a/evaluation/regression/cases/node-cli-rewrite/start/commands/lowercase.py
+++ /dev/null
@@ -1,2 +0,0 @@
-def to_lowercase(s):
-    return s.lower()
diff --git a/evaluation/regression/cases/node-cli-rewrite/start/commands/reverse.py b/evaluation/regression/cases/node-cli-rewrite/start/commands/reverse.py
deleted file mode 100644
index a5c4132200..0000000000
--- a/evaluation/regression/cases/node-cli-rewrite/start/commands/reverse.py
+++ /dev/null
@@ -1,2 +0,0 @@
-def reverse_string(s):
-    return s[::-1]
diff --git a/evaluation/regression/cases/node-cli-rewrite/start/commands/scramble.py b/evaluation/regression/cases/node-cli-rewrite/start/commands/scramble.py
deleted file mode 100644
index 7470813dac..0000000000
--- a/evaluation/regression/cases/node-cli-rewrite/start/commands/scramble.py
+++ /dev/null
@@ -1,7 +0,0 @@
-import random
-
-
-def scramble_string(s):
-    s_list = list(s)
-    random.shuffle(s_list)
-    return ''.join(s_list)
diff --git a/evaluation/regression/cases/node-cli-rewrite/start/commands/spongebob.py b/evaluation/regression/cases/node-cli-rewrite/start/commands/spongebob.py
deleted file mode 100644
index 782af450e1..0000000000
--- a/evaluation/regression/cases/node-cli-rewrite/start/commands/spongebob.py
+++ /dev/null
@@ -1,8 +0,0 @@
-def spongebob_case(s):
-    result = ''
-    for i, char in enumerate(s):
-        if i % 2 == 0:
-            result += char.lower()
-        else:
-            result += char.upper()
-    return result
diff --git a/evaluation/regression/cases/node-cli-rewrite/start/commands/uppercase.py b/evaluation/regression/cases/node-cli-rewrite/start/commands/uppercase.py
deleted file mode 100644
index 615ed1aae1..0000000000
--- a/evaluation/regression/cases/node-cli-rewrite/start/commands/uppercase.py
+++ /dev/null
@@ -1,2 +0,0 @@
-def to_uppercase(s):
-    return s.upper()
diff --git a/evaluation/regression/cases/node-cli-rewrite/start/string_cli.py b/evaluation/regression/cases/node-cli-rewrite/start/string_cli.py
deleted file mode 100644
index 6784551305..0000000000
--- a/evaluation/regression/cases/node-cli-rewrite/start/string_cli.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import sys
-
-
-def print_help():
-    help_text = """
-Usage: python string_cli.py <command> <string>
-
-Commands:
-    reverse - Reverses the input string.
-    uppercase - Converts the input string to uppercase.
-    lowercase - Converts the input string to lowercase.
-    spongebob - Converts the input string to spongebob case.
-    length - Returns the length of the input string.
-    scramble - Randomly scrambles the characters in the input string.
-"""
-    print(help_text)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) == 2 and sys.argv[1] == '--help':
-        print_help()
-        sys.exit(0)
-    elif len(sys.argv) < 3:
-        print('Usage: python string_cli.py <command> <string>')
-        sys.exit(1)
-
-    command = sys.argv[1]
-    input_string = sys.argv[2]
-
-    if command == 'reverse':
-        from commands.reverse import reverse_string
-
-        print(reverse_string(input_string))
-    elif command == 'uppercase':
-        from commands.uppercase import to_uppercase
-
-        print(to_uppercase(input_string))
-    elif command == 'lowercase':
-        from commands.lowercase import to_lowercase
-
-        print(to_lowercase(input_string))
-    elif command == 'spongebob':
-        from commands.spongebob import spongebob_case
-
-        print(spongebob_case(input_string))
-    elif command == 'length':
-        from commands.length import string_length
-
-        print(string_length(input_string))
-    elif command == 'scramble':
-        from commands.scramble import scramble_string
-
-        print(scramble_string(input_string))
-    else:
-        print('Invalid command!')
diff --git a/evaluation/regression/cases/node-cli-rewrite/task.txt b/evaluation/regression/cases/node-cli-rewrite/task.txt
deleted file mode 100644
index fce5870324..0000000000
--- a/evaluation/regression/cases/node-cli-rewrite/task.txt
+++ /dev/null
@@ -1 +0,0 @@
-Please rewrite the entire CLI in node.js
diff --git a/evaluation/regression/cases/python-cli-help/start/commands/length.py b/evaluation/regression/cases/python-cli-help/start/commands/length.py
deleted file mode 100644
index 6389d4b30a..0000000000
--- a/evaluation/regression/cases/python-cli-help/start/commands/length.py
+++ /dev/null
@@ -1,2 +0,0 @@
-def string_length(s):
-    return len(s)
diff --git a/evaluation/regression/cases/python-cli-help/start/commands/lowercase.py b/evaluation/regression/cases/python-cli-help/start/commands/lowercase.py
deleted file mode 100644
index 4f5d951bff..0000000000
--- a/evaluation/regression/cases/python-cli-help/start/commands/lowercase.py
+++ /dev/null
@@ -1,2 +0,0 @@
-def to_lowercase(s):
-    return s.lower()
diff --git a/evaluation/regression/cases/python-cli-help/start/commands/reverse.py b/evaluation/regression/cases/python-cli-help/start/commands/reverse.py
deleted file mode 100644
index a5c4132200..0000000000
--- a/evaluation/regression/cases/python-cli-help/start/commands/reverse.py
+++ /dev/null
@@ -1,2 +0,0 @@
-def reverse_string(s):
-    return s[::-1]
diff --git a/evaluation/regression/cases/python-cli-help/start/commands/scramble.py b/evaluation/regression/cases/python-cli-help/start/commands/scramble.py
deleted file mode 100644
index 7470813dac..0000000000
--- a/evaluation/regression/cases/python-cli-help/start/commands/scramble.py
+++ /dev/null
@@ -1,7 +0,0 @@
-import random
-
-
-def scramble_string(s):
-    s_list = list(s)
-    random.shuffle(s_list)
-    return ''.join(s_list)
diff --git a/evaluation/regression/cases/python-cli-help/start/commands/spongebob.py b/evaluation/regression/cases/python-cli-help/start/commands/spongebob.py
deleted file mode 100644
index 782af450e1..0000000000
--- a/evaluation/regression/cases/python-cli-help/start/commands/spongebob.py
+++ /dev/null
@@ -1,8 +0,0 @@
-def spongebob_case(s):
-    result = ''
-    for i, char in enumerate(s):
-        if i % 2 == 0:
-            result += char.lower()
-        else:
-            result += char.upper()
-    return result
diff --git a/evaluation/regression/cases/python-cli-help/start/commands/uppercase.py b/evaluation/regression/cases/python-cli-help/start/commands/uppercase.py
deleted file mode 100644
index 615ed1aae1..0000000000
--- a/evaluation/regression/cases/python-cli-help/start/commands/uppercase.py
+++ /dev/null
@@ -1,2 +0,0 @@
-def to_uppercase(s):
-    return s.upper()
diff --git a/evaluation/regression/cases/python-cli-help/start/string_cli.py b/evaluation/regression/cases/python-cli-help/start/string_cli.py
deleted file mode 100644
index 2deb02b0a6..0000000000
--- a/evaluation/regression/cases/python-cli-help/start/string_cli.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import sys
-
-if __name__ == '__main__':
-    if len(sys.argv) < 3:
-        print('Usage: python string_cli.py <command> <string>')
-        sys.exit(1)
-
-    command = sys.argv[1]
-    input_string = sys.argv[2]
-
-    if command == 'reverse':
-        from commands.reverse import reverse_string
-
-        print(reverse_string(input_string))
-    elif command == 'uppercase':
-        from commands.uppercase import to_uppercase
-
-        print(to_uppercase(input_string))
-    elif command == 'lowercase':
-        from commands.lowercase import to_lowercase
-
-        print(to_lowercase(input_string))
-    elif command == 'spongebob':
-        from commands.spongebob import spongebob_case
-
-        print(spongebob_case(input_string))
-    elif command == 'length':
-        from commands.length import string_length
-
-        print(string_length(input_string))
-    elif command == 'scramble':
-        from commands.scramble import scramble_string
-
-        print(scramble_string(input_string))
-    else:
-        print('Invalid command!')
diff --git a/evaluation/regression/cases/python-cli-help/task.txt b/evaluation/regression/cases/python-cli-help/task.txt
deleted file mode 100644
index 7b35f6cc6e..0000000000
--- a/evaluation/regression/cases/python-cli-help/task.txt
+++ /dev/null
@@ -1 +0,0 @@
-Please add a --help option to the CLI, with a detailed description of each command
diff --git a/evaluation/regression/cases/python-cli/task.txt b/evaluation/regression/cases/python-cli/task.txt
deleted file mode 100644
index cbd8e54ef0..0000000000
--- a/evaluation/regression/cases/python-cli/task.txt
+++ /dev/null
@@ -1 +0,0 @@
-Write a python CLI for string manipulation. The CLI should accept a command, and a string. The commands should include `reverse`, `uppercase`, `lowercase`, `spongebob`, `length`, and `scramble`. The logic for each command should live in its own file.
diff --git a/evaluation/regression/cases/react-todo/task.txt b/evaluation/regression/cases/react-todo/task.txt
deleted file mode 100644
index ac3ab48992..0000000000
--- a/evaluation/regression/cases/react-todo/task.txt
+++ /dev/null
@@ -1 +0,0 @@
-Write a simple TODO list application in React
diff --git a/evaluation/regression/cases/server-test/start/server.py b/evaluation/regression/cases/server-test/start/server.py
deleted file mode 100644
index 71a8d84c94..0000000000
--- a/evaluation/regression/cases/server-test/start/server.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from http.server import BaseHTTPRequestHandler, HTTPServer
-
-
-class HelloWorldHandler(BaseHTTPRequestHandler):
-    def do_GET(self):
-        self.send_response(200)
-        self.send_header('Content-type', 'text/plain')
-        self.end_headers()
-        self.wfile.write(b'Hello World\n')
-
-
-def run(server_class=HTTPServer, handler_class=HelloWorldHandler, port=8000):
-    server_address = ('', port)
-    httpd = server_class(server_address, handler_class)
-    print(f'Starting httpd on port {port}...')
-    httpd.serve_forever()
-
-
-if __name__ == '__main__':
-    print('starting server...')
-    run()
diff --git a/evaluation/regression/cases/server-test/task.txt b/evaluation/regression/cases/server-test/task.txt
deleted file mode 100644
index 0cabc8734a..0000000000
--- a/evaluation/regression/cases/server-test/task.txt
+++ /dev/null
@@ -1 +0,0 @@
-Make sure the server works and responds appropriately
diff --git a/evaluation/regression/conftest.py b/evaluation/regression/conftest.py
deleted file mode 100644
index a313e6f0ee..0000000000
--- a/evaluation/regression/conftest.py
+++ /dev/null
@@ -1,171 +0,0 @@
-import datetime
-import logging
-import os
-import shutil
-import subprocess
-
-import pytest
-
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-CASES_DIR = os.path.join(SCRIPT_DIR, 'cases')
-AGENTHUB_DIR = os.path.join(SCRIPT_DIR, '../', 'agenthub')
-
-
-def agents():
-    """Retrieves a list of available agents.
-
-    Returns:
-        A list of agent names.
-    """
-    agents = []
-    for agent in os.listdir(AGENTHUB_DIR):
-        if os.path.isdir(os.path.join(AGENTHUB_DIR, agent)) and agent.endswith(
-            '_agent'
-        ):
-            agents.append(agent)
-    return agents
-
-
-@pytest.fixture(scope='session')
-def test_cases_dir():
-    """Fixture that provides the directory path for test cases.
-
-    Returns:
-        The directory path for test cases.
-    """
-    return CASES_DIR
-
-
-@pytest.fixture
-def task_file(test_cases_dir, request):
-    """Fixture that provides the path to the task file for a test case.
-
-    Args:
-        test_cases_dir: The directory path for test cases.
-        request: The pytest request object.
-
-    Returns:
-        The path to the task file for the test case.
-    """
-    test_case_dir = os.path.dirname(request.module.__file__)
-    task_file_path = os.path.join(test_case_dir, 'task.txt')
-    return task_file_path
-
-
-@pytest.fixture
-def workspace_dir(test_cases_dir, request):
-    """Fixture that provides the workspace directory for a test case.
-
-    Args:
-        test_cases_dir: The directory path for test cases.
-        request: The pytest request object.
-
-    Returns:
-        The workspace directory for the test case.
-    """
-    test_case_dir = os.path.dirname(request.module.__file__)
-    workspace_dir = os.path.join(test_case_dir, 'workspace')
-    return workspace_dir
-
-
-@pytest.fixture
-def model(request):
-    """Fixture that provides the model name.
-
-    Args:
-        request: The pytest request object.
-
-    Returns:
-        The model name, defaulting to "gpt-3.5-turbo".
-    """
-    return request.config.getoption('model', default='gpt-3.5-turbo')
-
-
-@pytest.fixture
-def run_test_case(test_cases_dir, workspace_dir, request):
-    """Fixture that provides a function to run a test case.
-
-    Args:
-        test_cases_dir: The directory path for test cases.
-        workspace_dir: The workspace directory for the test case.
-        request: The pytest request object.
-
-    Returns:
-        A function that runs a test case for a given agent and case.
-    """
-
-    def _run_test_case(agent, case):
-        """Runs a test case for a given agent.
-
-        Args:
-            agent: The name of the agent to run the test case for.
-            case: The name of the test case to run.
-
-        Returns:
-            The path to the workspace directory for the agent and test case.
-
-        Raises:
-            AssertionError: If the test case execution fails (non-zero return code).
-
-        Steps:
-        """
-        case_dir = os.path.join(test_cases_dir, case)
-        task = open(os.path.join(case_dir, 'task.txt'), 'r').read().strip()
-        outputs_dir = os.path.join(case_dir, 'outputs')
-        agent_dir = os.path.join(outputs_dir, agent)
-
-        if not os.path.exists(agent_dir):
-            os.makedirs(agent_dir)
-
-        shutil.rmtree(os.path.join(agent_dir, 'workspace'), ignore_errors=True)
-        if os.path.isdir(os.path.join(case_dir, 'start')):
-            os.copytree(
-                os.path.join(case_dir, 'start'), os.path.join(agent_dir, 'workspace')
-            )
-        else:
-            os.makedirs(os.path.join(agent_dir, 'workspace'))
-        agents_ref = {
-            'codeact_agent': 'CodeActAgent',
-        }
-        process = subprocess.Popen(
-            [
-                'python3',
-                f'{SCRIPT_DIR}/../../openhands/main.py',
-                '-d',
-                f'{os.path.join(agent_dir, "workspace")}',
-                '-c',
-                f'{agents_ref[agent]}',
-                '-t',
-                f'{task}',
-                '-m',
-                'gpt-3.5-turbo',
-            ],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            universal_newlines=True,
-        )
-        stdout, stderr = process.communicate()
-        logging.info(f'Stdout: {stdout}')
-        logging.error(f'Stderr: {stderr}')
-
-        assert process.returncode == 0
-        return os.path.join(agent_dir, 'workspace')
-
-    return _run_test_case
-
-
-def pytest_configure(config):
-    """Configuration hook for pytest.
-
-    Args:
-        config: The pytest configuration object.
-    """
-    now = datetime.datetime.now()
-    logging.basicConfig(
-        level=logging.INFO,
-        format='%(asctime)s [%(levelname)s] %(message)s',
-        handlers=[
-            logging.FileHandler(f'test_results_{now.strftime("%Y%m%d_%H%M%S")}.log'),
-            logging.StreamHandler(),
-        ],
-    )
diff --git a/evaluation/regression/run_tests.py b/evaluation/regression/run_tests.py
deleted file mode 100644
index cf06c79fca..0000000000
--- a/evaluation/regression/run_tests.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import argparse
-
-import pytest
-
-from openhands.config import load_openhands_config
-
-config = load_openhands_config()
-
-if __name__ == '__main__':
-    """Main entry point of the script.
-
-    This script runs pytest with specific arguments and configuration.
-
-    Usage:
-        python script_name.py [--OPENAI_API_KEY=<api_key>] [--model=<model_name>]
-
-    """
-    parser = argparse.ArgumentParser(
-        description='This script runs pytest with specific arguments and configuration.'
-    )
-    parser.add_argument(
-        '--OPENAI_API_KEY', type=str, required=True, help='Your OpenAI API key'
-    )
-    parser.add_argument(
-        '--model', type=str, required=True, help='The model name to use'
-    )
-
-    parser_args = parser.parse_args()
-    config.config['OPENAI_API_KEY'] = parser_args.OPENAI_API_KEY
-    args = ['-v', 'evaluation/regression/cases', f'-o model={parser_args.model}']
-
-    pytest.main(args)