diff --git a/evaluation/regression/.gitignore b/evaluation/regression/.gitignore deleted file mode 100644 index b00b59b516..0000000000 --- a/evaluation/regression/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -node_modules -outputs diff --git a/evaluation/regression/README.md b/evaluation/regression/README.md deleted file mode 100644 index a7f532c0fc..0000000000 --- a/evaluation/regression/README.md +++ /dev/null @@ -1,70 +0,0 @@ -# OpenHands - Regression Test Framework - -OpenHands project is an open-source software engineering AI that can solve various software engineering tasks. This repository contains the regression test framework for OpenHands project. - -## Running the Tests - -To run the tests for OpenHands project, you can use the provided test runner script. Follow these steps: - -1. Ensure you have Python 3.6 or higher installed on your system. -2. Install the required dependencies by running the following command in your terminal: - ``` - pip install -r requirements.txt - ``` -3. Navigate to the root directory of the project. -4. Run the test suite using the test runner script with the required arguments: - ``` - python evaluation/regression/run_tests.py --OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxx --model=gpt-4o - ``` - Replace `sk-xxxxxxxxxxxxxxxxxxxxxx` with your actual OpenAI API key. The default model is `gpt-4o`, but you can specify a different model if needed. - -The test runner will discover and execute all the test cases in the `cases/` directory, and display the results of the test suite, including the status of each individual test case and the overall summary. - -## Test Case Structure - -The test cases for OpenHands project are organized in the `cases/` directory. 
Each test case has the following structure: - -``` -cases/ -├── hello-world/ -│ ├── task.txt -│ ├── outputs/ -│ │ └── codeact_agent/ -│ │ └── workspace/ -│ │ ├── hello_world.sh -│ └── test_hello_world.py -├── create_web_app/ -│ ├── task.txt -│ ├── outputs/ -│ │ └── codeact_agent/ -│ │ └── workspace/ -│ │ ├── app.py -│ │ ├── requirements.txt -│ │ ├── static/ -│ │ └── templates/ -│ └── test_create_web_app.py -└── ... -``` - -- `task.txt`: This file contains the task description provided by the user. -- `outputs/`: This directory contains the output generated by OpenHands for each agent. -- `outputs/*/workspace/`: This directory contains the actual output files generated by OpenHands. -- `test_*.py`: These are the test scripts that validate the output of OpenHands. - -## Adding New Test Cases - -To add a new test case to the regression test framework, create a new directory under `cases/` containing a `task.txt` file and a `test_*.py` script, following the structure shown above. - -## Customizing the Test Cases - -The test cases can be customized by modifying the fixtures defined in the `conftest.py` file. The available fixtures are: - -- `test_cases_dir`: The directory containing the test cases. -- `task_file`: The path to the `task.txt` file for the current test case. -- `workspace_dir`: The path to the `workspace/` directory for the current test case. -- `model`: The name of the model used for the generation. -- `run_test_case`: A fixture that runs OpenHands and generates the workspace for the current test case. - -You can modify these fixtures to change the behavior of the test cases or add new ones as needed. - -If you have any questions or need further assistance, feel free to reach out to the project maintainers. 
diff --git a/evaluation/regression/cases/client-server/task.txt b/evaluation/regression/cases/client-server/task.txt deleted file mode 100644 index 0249d4cd46..0000000000 --- a/evaluation/regression/cases/client-server/task.txt +++ /dev/null @@ -1 +0,0 @@ -Write an API server in node express which responds with a random number, and a frontend in React that displays the next number from the API diff --git a/evaluation/regression/cases/express/task.txt b/evaluation/regression/cases/express/task.txt deleted file mode 100644 index 0fcdf10f03..0000000000 --- a/evaluation/regression/cases/express/task.txt +++ /dev/null @@ -1 +0,0 @@ -Write a simple hello world server in node Express diff --git a/evaluation/regression/cases/hello-name/start/hello_world.sh b/evaluation/regression/cases/hello-name/start/hello_world.sh deleted file mode 100644 index 1a2a680f62..0000000000 --- a/evaluation/regression/cases/hello-name/start/hello_world.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -echo "hello world" diff --git a/evaluation/regression/cases/hello-name/task.txt b/evaluation/regression/cases/hello-name/task.txt deleted file mode 100644 index cdafb00866..0000000000 --- a/evaluation/regression/cases/hello-name/task.txt +++ /dev/null @@ -1 +0,0 @@ -Rewrite the script so that it prints the user's name, using the first argument. If there's no name, default to "world" diff --git a/evaluation/regression/cases/hello-world/task.txt b/evaluation/regression/cases/hello-world/task.txt deleted file mode 100644 index b03a5abac1..0000000000 --- a/evaluation/regression/cases/hello-world/task.txt +++ /dev/null @@ -1 +0,0 @@ -Write a bash script named "hello_world.sh" that prints "Hello, World!" 
@pytest.mark.parametrize('agent', agents())
def test_hello_world(task_file, run_test_case, agent):
    """Check that an agent produces a working ``hello_world.sh`` script.

    Args:
        task_file: Path to the case's ``task.txt`` (requested as a fixture;
            not read directly here).
        run_test_case: Fixture returning a callable that runs the agent on a
            case and returns the generated workspace directory.
        agent: Name of the agent under test (one entry of ``agents()``).
    """
    # Local import keeps this block self-contained; the module only imports
    # ``os`` and ``pytest`` at the top.
    import subprocess

    # Run the test case for the specified agent
    workspace_dir = run_test_case(agent, 'hello-world')

    # Validate the generated workspace
    assert os.path.exists(workspace_dir)
    assert os.path.isfile(os.path.join(workspace_dir, 'hello_world.sh'))

    # Execute the script with the workspace as its working directory.
    # Passing ``cwd`` instead of calling os.chdir() keeps the test process's
    # own working directory untouched, so later tests are not affected.
    result = subprocess.run(
        ['bash', 'hello_world.sh'],
        cwd=workspace_dir,
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    assert result.stdout == 'Hello, World!\n'
def spongebob_case(s):
    """Return ``s`` in alternating "spongebob" case.

    Characters at even indices (0, 2, 4, ...) are lower-cased and characters
    at odd indices are upper-cased. Every character counts towards the
    index, including spaces and punctuation.

    Args:
        s: The input string.

    Returns:
        The converted string; an empty input yields an empty string.
    """
    # Build the result in one pass with join() rather than repeated ``+=``.
    return ''.join(
        char.upper() if index % 2 else char.lower()
        for index, char in enumerate(s)
    )
if __name__ == '__main__':
    # Command-line entry point. A lone ``--help`` prints the command
    # reference; otherwise a command name and an input string are required.
    cli_args = sys.argv[1:]

    if cli_args == ['--help']:
        print_help()
        sys.exit(0)
    if len(cli_args) < 2:
        print('Usage: python string_cli.py ')
        sys.exit(1)

    command, input_string = cli_args[:2]

    # Import only the implementation that was requested.
    if command == 'reverse':
        from commands.reverse import reverse_string as handler
    elif command == 'uppercase':
        from commands.uppercase import to_uppercase as handler
    elif command == 'lowercase':
        from commands.lowercase import to_lowercase as handler
    elif command == 'spongebob':
        from commands.spongebob import spongebob_case as handler
    elif command == 'length':
        from commands.length import string_length as handler
    elif command == 'scramble':
        from commands.scramble import scramble_string as handler
    else:
        handler = None

    if handler is None:
        print('Invalid command!')
    else:
        print(handler(input_string))
def scramble_string(s):
    """Return the characters of ``s`` rearranged in a random order.

    The shuffle uses the module-level ``random`` generator, so the result
    is reproducible after ``random.seed(...)``.
    """
    shuffled_chars = [*s]
    random.shuffle(shuffled_chars)
    return ''.join(shuffled_chars)
if __name__ == '__main__':
    # Command-line entry point: expects a command name followed by the
    # string to transform.
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print('Usage: python string_cli.py ')
        sys.exit(1)

    command, input_string = cli_args[:2]

    # Import only the implementation that was requested.
    if command == 'reverse':
        from commands.reverse import reverse_string as handler
    elif command == 'uppercase':
        from commands.uppercase import to_uppercase as handler
    elif command == 'lowercase':
        from commands.lowercase import to_lowercase as handler
    elif command == 'spongebob':
        from commands.spongebob import spongebob_case as handler
    elif command == 'length':
        from commands.length import string_length as handler
    elif command == 'scramble':
        from commands.scramble import scramble_string as handler
    else:
        handler = None

    if handler is None:
        print('Invalid command!')
    else:
        print(handler(input_string))
class HelloWorldHandler(BaseHTTPRequestHandler):
    """Request handler that answers GET requests with a plain-text greeting."""

    def do_GET(self):
        """Reply with HTTP 200 and the body ``Hello World``."""
        self.send_response(200)
        self.send_header('Content-type', 'text/plain')
        self.end_headers()
        self.wfile.write(b'Hello World\n')


def run(server_class=HTTPServer, handler_class=HelloWorldHandler, port=8000):
    """Start an HTTP server and serve requests until it is stopped.

    Args:
        server_class: Server type to instantiate; called as
            ``server_class((host, port), handler_class)``.
        handler_class: Request handler class passed to the server.
        port: TCP port to listen on (all interfaces).

    Any exception raised by ``serve_forever()`` (including
    ``KeyboardInterrupt``) propagates to the caller after the server has
    been closed.
    """
    server_address = ('', port)
    httpd = server_class(server_address, handler_class)
    print(f'Starting httpd on port {port}...')
    try:
        httpd.serve_forever()
    finally:
        # Release the listening socket even when serving is interrupted, so
        # the port can be re-bound immediately.
        httpd.server_close()


if __name__ == '__main__':
    print('starting server...')
    run()
@pytest.fixture
def run_test_case(test_cases_dir, workspace_dir, model, request):
    """Fixture that provides a function to run a test case.

    Args:
        test_cases_dir: The directory path for test cases.
        workspace_dir: The workspace directory for the test case (unused
            here; kept so the fixture's dependencies stay unchanged).
        model: The model name forwarded to the OpenHands process.
        request: The pytest request object.

    Returns:
        A function that runs a test case for a given agent and case.
    """

    def _run_test_case(agent, case):
        """Runs a test case for a given agent.

        Recreates ``<case>/outputs/<agent>/workspace`` (seeded from the
        case's ``start`` directory when one exists), runs
        ``openhands/main.py`` on the case's task with that workspace, and
        logs the captured stdout and stderr.

        Args:
            agent: The name of the agent to run the test case for.
            case: The name of the test case to run.

        Returns:
            The path to the workspace directory for the agent and test case.

        Raises:
            AssertionError: If the test case execution fails (non-zero
                return code).
            KeyError: If ``agent`` has no entry in the agent-name mapping.
        """
        case_dir = os.path.join(test_cases_dir, case)
        # Read the task with a context manager so the handle is closed.
        with open(os.path.join(case_dir, 'task.txt'), 'r') as task_fh:
            task = task_fh.read().strip()

        agent_dir = os.path.join(case_dir, 'outputs', agent)
        agent_workspace = os.path.join(agent_dir, 'workspace')
        start_dir = os.path.join(case_dir, 'start')

        os.makedirs(agent_dir, exist_ok=True)

        # Always start from a clean workspace.
        shutil.rmtree(agent_workspace, ignore_errors=True)
        if os.path.isdir(start_dir):
            # copytree lives in shutil; ``os`` has no such function.
            shutil.copytree(start_dir, agent_workspace)
        else:
            os.makedirs(agent_workspace)

        # Maps agent directory names to the class names passed via ``-c``.
        agents_ref = {
            'codeact_agent': 'CodeActAgent',
        }
        process = subprocess.run(
            [
                'python3',
                f'{SCRIPT_DIR}/../../openhands/main.py',
                '-d',
                agent_workspace,
                '-c',
                agents_ref[agent],
                '-t',
                task,
                '-m',
                model,
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        logging.info(f'Stdout: {process.stdout}')
        logging.error(f'Stderr: {process.stderr}')

        assert process.returncode == 0, (
            f'{agent} failed on case {case!r} '
            f'(exit code {process.returncode})'
        )
        return agent_workspace

    return _run_test_case
if __name__ == '__main__':
    # Main entry point of the script: parse the command-line arguments,
    # store the API key in the loaded configuration, run pytest on the
    # regression cases, and exit with pytest's status code.
    #
    # Usage:
    #     python evaluation/regression/run_tests.py --OPENAI_API_KEY=<key> [--model=<name>]
    import sys

    parser = argparse.ArgumentParser(
        description='This script runs pytest with specific arguments and configuration.'
    )
    parser.add_argument(
        '--OPENAI_API_KEY', type=str, required=True, help='Your OpenAI API key'
    )
    # Optional, with the default model documented in the README.
    parser.add_argument(
        '--model',
        type=str,
        default='gpt-4o',
        help='The model name to use (default: gpt-4o)',
    )

    parser_args = parser.parse_args()
    config.config['OPENAI_API_KEY'] = parser_args.OPENAI_API_KEY
    # ``-o`` and its value are separate argv items so the override key is
    # exactly ``model`` rather than `` model`` with a leading space.
    args = ['-v', 'evaluation/regression/cases', '-o', f'model={parser_args.model}']

    # Propagate pytest's exit status so failures are visible to callers.
    sys.exit(pytest.main(args))