mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
chore(eval): remove old, unused regression test framework under evaluation/regression (#10419)
This commit is contained in:
parent
ab004478f6
commit
f7f4fcf98f
2
evaluation/regression/.gitignore
vendored
2
evaluation/regression/.gitignore
vendored
@ -1,2 +0,0 @@
|
||||
node_modules
|
||||
outputs
|
||||
@ -1,70 +0,0 @@
|
||||
# OpenHands - Regression Test Framework
|
||||
|
||||
OpenHands project is an open-source software engineering AI that can solve various software engineering tasks. This repository contains the regression test framework for OpenHands project.
|
||||
|
||||
## Running the Tests
|
||||
|
||||
To run the tests for OpenHands project, you can use the provided test runner script. Follow these steps:
|
||||
|
||||
1. Ensure you have Python 3.6 or higher installed on your system.
|
||||
2. Install the required dependencies by running the following command in your terminal:
|
||||
```
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
3. Navigate to the root directory of the project.
|
||||
4. Run the test suite using the test runner script with the required arguments:
|
||||
```
|
||||
python evaluation/regression/run_tests.py --OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxx --model=gpt-4o
|
||||
```
|
||||
Replace `sk-xxxxxxxxxxxxxxxxxxxxxx` with your actual OpenAI API key. The default model is `gpt-4o`, but you can specify a different model if needed.
|
||||
|
||||
The test runner will discover and execute all the test cases in the `cases/` directory, and display the results of the test suite, including the status of each individual test case and the overall summary.
|
||||
|
||||
## Test Case Structure
|
||||
|
||||
The test cases for OpenHands project are organized in the `cases/` directory. Each test case has the following structure:
|
||||
|
||||
```
|
||||
cases/
|
||||
├── hello-world/
|
||||
│ ├── task.txt
|
||||
│ ├── outputs/
|
||||
│ │ └── codeact_agent/
|
||||
│ │ └── workspace/
|
||||
│ │ ├── hello_world.sh
|
||||
│ └── test_hello_world.py
|
||||
├── create_web_app/
|
||||
│ ├── task.txt
|
||||
│ ├── outputs/
|
||||
│ │ └── codeact_agent/
|
||||
│ │ └── workspace/
|
||||
│ │ ├── app.py
|
||||
│ │ ├── requirements.txt
|
||||
│ │ ├── static/
|
||||
│ │ └── templates/
|
||||
│ └── test_create_web_app.py
|
||||
└── ...
|
||||
```
|
||||
|
||||
- `task.txt`: This file contains the task description provided by the user.
|
||||
- `outputs/`: This directory contains the output generated by OpenHands for each agent.
|
||||
- `outputs/*/workspace/`: This directory contains the actual output files generated by OpenHands.
|
||||
- `test_*.py`: These are the test scripts that validate the output of OpenHands.
|
||||
|
||||
## Adding New Test Cases
|
||||
|
||||
To add a new test case to the regression test framework, create a new directory under `cases/` containing a `task.txt` with the task description and a `test_*.py` script that validates the output, following the structure shown above.
|
||||
|
||||
## Customizing the Test Cases
|
||||
|
||||
The test cases can be customized by modifying the fixtures defined in the `conftest.py` file. The available fixtures are:
|
||||
|
||||
- `test_cases_dir`: The directory containing the test cases.
|
||||
- `task_file`: The path to the `task.txt` file for the current test case.
|
||||
- `workspace_dir`: The path to the `workspace/` directory for the current test case.
|
||||
- `model`: The model selected to run the generation.
|
||||
- `run_test_case`: A fixture that runs OpenHands and generates the workspace for the current test case.
|
||||
|
||||
You can modify these fixtures to change the behavior of the test cases or add new ones as needed.
|
||||
|
||||
If you have any questions or need further assistance, feel free to reach out to the project maintainers.
|
||||
@ -1 +0,0 @@
|
||||
Write an API server in node express which responds with a random number, and a frontend in React that displays the next number from the API
|
||||
@ -1 +0,0 @@
|
||||
Write a simple hello world server in node Express
|
||||
@ -1,2 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
echo "hello world"
|
||||
@ -1 +0,0 @@
|
||||
Rewrite the script so that it prints the user's name, using the first argument. If there's no name, default to "world"
|
||||
@ -1 +0,0 @@
|
||||
Write a bash script named "hello_world.sh" that prints "Hello, World!"
|
||||
@ -1,20 +0,0 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from conftest import agents
|
||||
|
||||
|
||||
@pytest.mark.parametrize('agent', agents())
def test_hello_world(task_file, run_test_case, agent):
    """Test case for the "Hello, World!" Bash script using different agents.

    Args:
        task_file: Path to the task description file (fixture).
        run_test_case: Fixture returning a runner that executes OpenHands
            and yields the generated workspace directory.
        agent: Agent name, parametrized from conftest.agents().
    """
    import subprocess

    # Run the test case for the specified agent
    workspace_dir = run_test_case(agent, 'hello-world')

    # Validate the generated workspace
    assert os.path.exists(workspace_dir)
    script_path = os.path.join(workspace_dir, 'hello_world.sh')
    assert os.path.isfile(script_path)

    # Execute the generated script. subprocess.run is preferred over the
    # dated os.popen: it surfaces the exit code, and cwd= avoids leaking a
    # process-wide os.chdir into later tests.
    result = subprocess.run(
        ['bash', 'hello_world.sh'],
        capture_output=True,
        text=True,
        cwd=workspace_dir,
    )
    assert result.returncode == 0
    assert result.stdout == 'Hello, World!\n'
|
||||
@ -1,2 +0,0 @@
|
||||
def string_length(s):
    """Return the number of characters in *s*."""
    length = len(s)
    return length
|
||||
@ -1,2 +0,0 @@
|
||||
def to_lowercase(s):
    """Return a copy of *s* converted to lowercase."""
    lowered = s.lower()
    return lowered
|
||||
@ -1,2 +0,0 @@
|
||||
def reverse_string(s):
    """Return *s* with its characters in reverse order."""
    return ''.join(reversed(s))
|
||||
@ -1,7 +0,0 @@
|
||||
import random


def scramble_string(s):
    """Return a random permutation of the characters of *s*."""
    chars = list(s)
    random.shuffle(chars)
    return ''.join(chars)
|
||||
@ -1,8 +0,0 @@
|
||||
def spongebob_case(s):
    """Return *s* with even-indexed characters lowercased and odd-indexed uppercased.

    Args:
        s: The input string (may be empty).

    Returns:
        The "spongebob-cased" string.
    """
    # str.join over a generator avoids the quadratic cost of repeated
    # string concatenation (`result += char`) in the original loop.
    return ''.join(
        char.lower() if i % 2 == 0 else char.upper()
        for i, char in enumerate(s)
    )
|
||||
@ -1,2 +0,0 @@
|
||||
def to_uppercase(s):
    """Return a copy of *s* converted to uppercase."""
    upper = s.upper()
    return upper
|
||||
@ -1,55 +0,0 @@
|
||||
import sys
|
||||
|
||||
|
||||
def print_help():
    """Print usage information for the string-manipulation CLI."""
    usage = """
Usage: python string_cli.py <command> <string>

Commands:
reverse - Reverses the input string.
uppercase - Converts the input string to uppercase.
lowercase - Converts the input string to lowercase.
spongebob - Converts the input string to spongebob case.
length - Returns the length of the input string.
scramble - Randomly scrambles the characters in the input string.
"""
    print(usage)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # CLI entry point: dispatch `python string_cli.py <command> <string>`
    # to the matching implementation in the commands/ package.
    if len(sys.argv) == 2 and sys.argv[1] == '--help':
        print_help()
        sys.exit(0)
    elif len(sys.argv) < 3:
        # Not enough arguments: print usage and exit non-zero.
        print('Usage: python string_cli.py <command> <string>')
        sys.exit(1)

    command = sys.argv[1]
    input_string = sys.argv[2]

    # Imports are deliberately deferred so only the selected command's
    # module is loaded per invocation.
    if command == 'reverse':
        from commands.reverse import reverse_string

        print(reverse_string(input_string))
    elif command == 'uppercase':
        from commands.uppercase import to_uppercase

        print(to_uppercase(input_string))
    elif command == 'lowercase':
        from commands.lowercase import to_lowercase

        print(to_lowercase(input_string))
    elif command == 'spongebob':
        from commands.spongebob import spongebob_case

        print(spongebob_case(input_string))
    elif command == 'length':
        from commands.length import string_length

        print(string_length(input_string))
    elif command == 'scramble':
        from commands.scramble import scramble_string

        print(scramble_string(input_string))
    else:
        # Unknown command: report it but exit 0 (historic behavior).
        print('Invalid command!')
|
||||
@ -1 +0,0 @@
|
||||
Please rewrite the entire CLI in node.js
|
||||
@ -1,2 +0,0 @@
|
||||
def string_length(s):
    """Return the length of the string *s*."""
    count = len(s)
    return count
|
||||
@ -1,2 +0,0 @@
|
||||
def to_lowercase(s):
    """Return *s* with all cased characters lowercased."""
    result = s.lower()
    return result
|
||||
@ -1,2 +0,0 @@
|
||||
def reverse_string(s):
    """Return the reverse of *s*."""
    return ''.join(reversed(s))
|
||||
@ -1,7 +0,0 @@
|
||||
import random


def scramble_string(s):
    """Return the characters of *s* in a random order."""
    shuffled = list(s)
    random.shuffle(shuffled)
    return ''.join(shuffled)
|
||||
@ -1,8 +0,0 @@
|
||||
def spongebob_case(s):
    """Return *s* alternating lowercase (even indices) and uppercase (odd indices).

    Args:
        s: The input string (may be empty).

    Returns:
        The "spongebob-cased" string.
    """
    # Build via join instead of repeated `result += char`, which is
    # quadratic in the worst case.
    return ''.join(
        c.lower() if i % 2 == 0 else c.upper() for i, c in enumerate(s)
    )
|
||||
@ -1,2 +0,0 @@
|
||||
def to_uppercase(s):
    """Return *s* with all cased characters uppercased."""
    result = s.upper()
    return result
|
||||
@ -1,36 +0,0 @@
|
||||
import sys
|
||||
|
||||
if __name__ == '__main__':
    # CLI entry point (pre --help version): dispatch
    # `python string_cli.py <command> <string>` to the commands/ package.
    if len(sys.argv) < 3:
        # Not enough arguments: print usage and exit non-zero.
        print('Usage: python string_cli.py <command> <string>')
        sys.exit(1)

    command = sys.argv[1]
    input_string = sys.argv[2]

    # Imports are deferred so only the selected command's module is loaded.
    if command == 'reverse':
        from commands.reverse import reverse_string

        print(reverse_string(input_string))
    elif command == 'uppercase':
        from commands.uppercase import to_uppercase

        print(to_uppercase(input_string))
    elif command == 'lowercase':
        from commands.lowercase import to_lowercase

        print(to_lowercase(input_string))
    elif command == 'spongebob':
        from commands.spongebob import spongebob_case

        print(spongebob_case(input_string))
    elif command == 'length':
        from commands.length import string_length

        print(string_length(input_string))
    elif command == 'scramble':
        from commands.scramble import scramble_string

        print(scramble_string(input_string))
    else:
        # Unknown command: report it but exit 0 (historic behavior).
        print('Invalid command!')
|
||||
@ -1 +0,0 @@
|
||||
Please add a --help option to the CLI, with a detailed description of each command
|
||||
@ -1 +0,0 @@
|
||||
Write a python CLI for string manipulation. The CLI should accept a command, and a string. The commands should include `reverse`, `uppercase`, `lowercase`, `spongebob`, `length`, and `scramble`. The logic for each command should live in its own file.
|
||||
@ -1 +0,0 @@
|
||||
Write a simple TODO list application in React
|
||||
@ -1,21 +0,0 @@
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
|
||||
|
||||
class HelloWorldHandler(BaseHTTPRequestHandler):
    """HTTP handler that answers every GET with plain-text 'Hello World'."""

    def do_GET(self):
        # Respond 200 with a plain-text body regardless of the request path.
        # Order matters: status line, then headers, then end_headers, then body.
        self.send_response(200)
        self.send_header('Content-type', 'text/plain')
        self.end_headers()
        self.wfile.write(b'Hello World\n')
|
||||
|
||||
|
||||
def run(server_class=HTTPServer, handler_class=HelloWorldHandler, port=8000):
    """Create an HTTP server on *port* and serve requests forever (blocking).

    Args:
        server_class: Server implementation to instantiate.
        handler_class: Request handler class passed to the server.
        port: TCP port to bind on all interfaces.
    """
    server = server_class(('', port), handler_class)
    print(f'Starting httpd on port {port}...')
    server.serve_forever()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: start the server on the default port (8000).
    # run() blocks in serve_forever until interrupted.
    print('starting server...')
    run()
|
||||
@ -1 +0,0 @@
|
||||
Make sure the server works and responds appropriately
|
||||
@ -1,171 +0,0 @@
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
# Absolute directory containing this conftest.py.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Directory holding the regression test cases.
CASES_DIR = os.path.join(SCRIPT_DIR, 'cases')
# Location of the agenthub package, one directory level up.
AGENTHUB_DIR = os.path.join(SCRIPT_DIR, '../', 'agenthub')
|
||||
|
||||
|
||||
def agents():
    """Retrieves a list of available agents.

    An agent is any subdirectory of AGENTHUB_DIR whose name ends in '_agent'.

    Returns:
        A list of agent names.
    """
    # Comprehension replaces the manual append loop and stops shadowing the
    # function name `agents` with a local variable.
    return [
        entry
        for entry in os.listdir(AGENTHUB_DIR)
        if os.path.isdir(os.path.join(AGENTHUB_DIR, entry))
        and entry.endswith('_agent')
    ]
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
def test_cases_dir():
    """Fixture that provides the directory path for test cases.

    Session-scoped: the cases directory is fixed for the whole run.

    Returns:
        The directory path for test cases (CASES_DIR).
    """
    return CASES_DIR
|
||||
|
||||
|
||||
@pytest.fixture
def task_file(test_cases_dir, request):
    """Return the path to the `task.txt` file for the current test case.

    The task file is located next to the test module that requested
    this fixture.

    Args:
        test_cases_dir: The directory path for test cases (kept for
            fixture-dependency ordering).
        request: The pytest request object.

    Returns:
        The path to the task file for the test case.
    """
    module_dir = os.path.dirname(request.module.__file__)
    return os.path.join(module_dir, 'task.txt')
|
||||
|
||||
|
||||
@pytest.fixture
def workspace_dir(test_cases_dir, request):
    """Return the `workspace/` directory for the current test case.

    Resolved relative to the test module that requested this fixture.

    Args:
        test_cases_dir: The directory path for test cases (kept for
            fixture-dependency ordering).
        request: The pytest request object.

    Returns:
        The workspace directory for the test case.
    """
    module_dir = os.path.dirname(request.module.__file__)
    return os.path.join(module_dir, 'workspace')
|
||||
|
||||
|
||||
@pytest.fixture
def model(request):
    """Fixture that provides the model name.

    Args:
        request: The pytest request object.

    Returns:
        The model name, defaulting to "gpt-3.5-turbo".
    """
    # NOTE(review): run_tests.py passes the model via `-o model=...`, which
    # sets an ini option; confirm getoption actually receives that value
    # (ini values are normally read via getini).
    return request.config.getoption('model', default='gpt-3.5-turbo')
|
||||
|
||||
|
||||
@pytest.fixture
def run_test_case(test_cases_dir, workspace_dir, request):
    """Fixture that provides a function to run a test case.

    Args:
        test_cases_dir: The directory path for test cases.
        workspace_dir: The workspace directory for the test case.
        request: The pytest request object.

    Returns:
        A function that runs a test case for a given agent and case.
    """

    def _run_test_case(agent, case):
        """Runs a test case for a given agent.

        Args:
            agent: The name of the agent to run the test case for.
            case: The name of the test case to run.

        Returns:
            The path to the workspace directory for the agent and test case.

        Raises:
            AssertionError: If the test case execution fails (non-zero return code).
        """
        case_dir = os.path.join(test_cases_dir, case)
        # Context manager closes the task file deterministically (the
        # original leaked the handle from a bare open().read()).
        with open(os.path.join(case_dir, 'task.txt'), 'r') as task_fp:
            task = task_fp.read().strip()
        outputs_dir = os.path.join(case_dir, 'outputs')
        agent_dir = os.path.join(outputs_dir, agent)

        if not os.path.exists(agent_dir):
            os.makedirs(agent_dir)

        # Start every run from a clean workspace.
        shutil.rmtree(os.path.join(agent_dir, 'workspace'), ignore_errors=True)
        if os.path.isdir(os.path.join(case_dir, 'start')):
            # BUG FIX: copytree lives in shutil, not os. The original
            # `os.copytree(...)` raised AttributeError whenever a case
            # shipped a 'start' directory.
            shutil.copytree(
                os.path.join(case_dir, 'start'), os.path.join(agent_dir, 'workspace')
            )
        else:
            os.makedirs(os.path.join(agent_dir, 'workspace'))
        # Maps agent directory names to the class name expected by main.py.
        agents_ref = {
            'codeact_agent': 'CodeActAgent',
        }
        # NOTE(review): the model is hard-coded to gpt-3.5-turbo even though
        # a `model` fixture exists -- confirm whether it should be used here.
        process = subprocess.Popen(
            [
                'python3',
                f'{SCRIPT_DIR}/../../openhands/main.py',
                '-d',
                f'{os.path.join(agent_dir, "workspace")}',
                '-c',
                f'{agents_ref[agent]}',
                '-t',
                f'{task}',
                '-m',
                'gpt-3.5-turbo',
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        stdout, stderr = process.communicate()
        logging.info(f'Stdout: {stdout}')
        logging.error(f'Stderr: {stderr}')

        assert process.returncode == 0
        return os.path.join(agent_dir, 'workspace')

    return _run_test_case
|
||||
|
||||
|
||||
def pytest_configure(config):
    """Configuration hook for pytest.

    Sets up logging to both a timestamped file and the console before the
    test session starts.

    Args:
        config: The pytest configuration object.
    """
    now = datetime.datetime.now()
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        handlers=[
            # One log file per session, named by the session start time.
            logging.FileHandler(f'test_results_{now.strftime("%Y%m%d_%H%M%S")}.log'),
            logging.StreamHandler(),
        ],
    )
|
||||
@ -1,32 +0,0 @@
|
||||
import argparse
|
||||
|
||||
import pytest
|
||||
|
||||
from openhands.config import load_openhands_config
|
||||
|
||||
config = load_openhands_config()
|
||||
|
||||
if __name__ == '__main__':
    """Main entry point of the script.

    This script runs pytest with specific arguments and configuration.

    Usage:
        python script_name.py [--OPENAI_API_KEY=<api_key>] [--model=<model_name>]
    """
    parser = argparse.ArgumentParser(
        description='This script runs pytest with specific arguments and configuration.'
    )
    parser.add_argument(
        '--OPENAI_API_KEY', type=str, required=True, help='Your OpenAI API key'
    )
    # NOTE(review): --model is required here, but the README documents a
    # gpt-4o default -- confirm intended behavior.
    parser.add_argument(
        '--model', type=str, required=True, help='The model name to use'
    )

    parser_args = parser.parse_args()
    # Expose the API key to OpenHands via its global config object.
    config.config['OPENAI_API_KEY'] = parser_args.OPENAI_API_KEY
    # NOTE(review): `-o model=...` sets a pytest ini option; the conftest
    # `model` fixture reads it via getoption -- verify the value is picked up.
    args = ['-v', 'evaluation/regression/cases', f'-o model={parser_args.model}']

    pytest.main(args)
|
||||
Loading…
x
Reference in New Issue
Block a user