mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
[chore] Just linting on swe-bench files (#7918)
This commit is contained in:
parent
6171395ef9
commit
9b9b1291fc
@ -214,7 +214,7 @@ In order to run evaluation of the obtained inference results in the SWT-Bench ha
|
||||
```bash
|
||||
python3 evaluation/benchmarks/swe_bench/scripts/swtbench/convert.py --prediction_file [output.jsonl] > [output_swt.jsonl]
|
||||
|
||||
# Example
|
||||
# Example
|
||||
python3 evaluation/benchmarks/swe_bench/scripts/swtbench/convert.py --prediction_file "evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Verified-test/CodeActAgent/gpt-4o-2024-11-20_maxiter_100_N_v0.31.0-no-hint-swt-run_1/output.jsonl" > OpenHands-gpt-4o-2024-11-20.jsonl
|
||||
```
|
||||
|
||||
@ -238,4 +238,4 @@ The results of the evaluation can be obtained by running the reporting script of
|
||||
```bash
|
||||
# Example
|
||||
python -m src.report run_instance_swt_logs/OpenHands-CodeAct-gpt-4o-2024-11-20/OpenHands__CodeActAgent__gpt-4o-2024-11-20 --dataset verified
|
||||
```
|
||||
```
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -10,11 +10,6 @@ import toml
|
||||
from datasets import load_dataset
|
||||
|
||||
import openhands.agenthub
|
||||
from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import (
|
||||
MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE,
|
||||
MAP_REPO_TO_INSTALL,
|
||||
MAP_VERSION_TO_INSTALL
|
||||
)
|
||||
from evaluation.benchmarks.swe_bench.binary_patch_utils import (
|
||||
remove_binary_diffs,
|
||||
remove_binary_files_from_git,
|
||||
@ -22,6 +17,11 @@ from evaluation.benchmarks.swe_bench.binary_patch_utils import (
|
||||
from evaluation.benchmarks.swe_bench.resource.mapping import (
|
||||
get_instance_resource_factor,
|
||||
)
|
||||
from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import (
|
||||
MAP_REPO_TO_INSTALL,
|
||||
MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE,
|
||||
MAP_VERSION_TO_INSTALL,
|
||||
)
|
||||
from evaluation.utils.shared import (
|
||||
EvalException,
|
||||
EvalMetadata,
|
||||
@ -60,7 +60,7 @@ from openhands.utils.shutdown_listener import sleep_if_should_continue
|
||||
|
||||
USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
|
||||
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
|
||||
BenchMode = Literal["swe", "swt", "swt-ci"]
|
||||
BenchMode = Literal['swe', 'swt', 'swt-ci']
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
@ -74,9 +74,13 @@ def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
|
||||
|
||||
def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageAction:
|
||||
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
|
||||
mode = metadata.details["mode"]
|
||||
mode = metadata.details['mode']
|
||||
if mode.startswith('swt'):
|
||||
test_instructions = f"The following command can be used to run the tests: `{list(MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE[instance.repo].values())[0]}`. Make sure they fail in the expected way.\n" if mode.endswith("ci") else ""
|
||||
test_instructions = (
|
||||
f'The following command can be used to run the tests: `{list(MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE[instance.repo].values())[0]}`. Make sure they fail in the expected way.\n'
|
||||
if mode.endswith('ci')
|
||||
else ''
|
||||
)
|
||||
instruction = f"""\
|
||||
<uploaded_files>
|
||||
/workspace/{workspace_dir_name}
|
||||
@ -387,20 +391,22 @@ def initialize_runtime(
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')
|
||||
|
||||
if metadata.details["mode"] == "swt-ci":
|
||||
if metadata.details['mode'] == 'swt-ci':
|
||||
# set up repo
|
||||
setup_commands = []
|
||||
if instance["repo"] in MAP_REPO_TO_INSTALL:
|
||||
setup_commands.append(MAP_REPO_TO_INSTALL[instance["repo"]])
|
||||
if instance['repo'] in MAP_REPO_TO_INSTALL:
|
||||
setup_commands.append(MAP_REPO_TO_INSTALL[instance['repo']])
|
||||
|
||||
# Run pre-install set up if provided
|
||||
install = MAP_VERSION_TO_INSTALL.get(instance['repo'], {}).get(instance['version'], [])
|
||||
if "pre_install" in install:
|
||||
for pre_install in install["pre_install"]:
|
||||
install = MAP_VERSION_TO_INSTALL.get(instance['repo'], {}).get(
|
||||
instance['version'], []
|
||||
)
|
||||
if 'pre_install' in install:
|
||||
for pre_install in install['pre_install']:
|
||||
setup_commands.append(pre_install)
|
||||
|
||||
if "install" in install:
|
||||
setup_commands.append(install["install"])
|
||||
if 'install' in install:
|
||||
setup_commands.append(install['install'])
|
||||
|
||||
for command in setup_commands:
|
||||
action = CmdRunAction(command=command)
|
||||
@ -409,7 +415,6 @@ def initialize_runtime(
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
|
||||
if 'multimodal' not in metadata.dataset.lower():
|
||||
# Only for non-multimodal datasets, we need to activate the testbed environment for Python
|
||||
# SWE-Bench multimodal datasets are not using the testbed environment
|
||||
@ -775,7 +780,7 @@ if __name__ == '__main__':
|
||||
if llm_config is None:
|
||||
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
|
||||
|
||||
details = {"mode": args.mode}
|
||||
details = {'mode': args.mode}
|
||||
_agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
|
||||
|
||||
dataset_descrption = (
|
||||
|
||||
@ -1,35 +1,47 @@
|
||||
import json
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
|
||||
|
||||
import unidiff
|
||||
|
||||
from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import MAP_VERSION_TO_INSTALL
|
||||
from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import (
|
||||
MAP_VERSION_TO_INSTALL,
|
||||
)
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def remove_setup_files(model_patch: str, instance: dict, delete_setup_changes: bool):
|
||||
""" Discard all changes that a patch applies to files changes by the pre_install script and that are reproduction scripts (top-level script)"""
|
||||
setup_files = ["setup.py", "tox.ini", "pyproject.toml"]
|
||||
pre_install = MAP_VERSION_TO_INSTALL.get(instance["repo"], {}).get(instance["version"], {}).get("pre_install", [])
|
||||
relevant_files = [
|
||||
file
|
||||
for file in setup_files
|
||||
if any(file in install and "sed" in install for install in pre_install)
|
||||
] if delete_setup_changes else []
|
||||
"""Discard all changes that a patch applies to files changes by the pre_install script and that are reproduction scripts (top-level script)"""
|
||||
setup_files = ['setup.py', 'tox.ini', 'pyproject.toml']
|
||||
pre_install = (
|
||||
MAP_VERSION_TO_INSTALL.get(instance['repo'], {})
|
||||
.get(instance['version'], {})
|
||||
.get('pre_install', [])
|
||||
)
|
||||
relevant_files = (
|
||||
[
|
||||
file
|
||||
for file in setup_files
|
||||
if any(file in install and 'sed' in install for install in pre_install)
|
||||
]
|
||||
if delete_setup_changes
|
||||
else []
|
||||
)
|
||||
for i in range(10):
|
||||
try:
|
||||
# Apparently outputs.jsonl has .strip() applied, so we try to reconstruct the original patch by adding auxiliary whitespace
|
||||
patch = unidiff.PatchSet(model_patch + i*"\n")
|
||||
patch = unidiff.PatchSet(model_patch + i * '\n')
|
||||
break
|
||||
except unidiff.UnidiffParseError as e:
|
||||
except unidiff.UnidiffParseError:
|
||||
pass
|
||||
|
||||
to_delete = []
|
||||
for i, file in enumerate(patch):
|
||||
if any(f in file.source_file for f in relevant_files) or file.target_file.count("/") == 1:
|
||||
if (
|
||||
any(f in file.source_file for f in relevant_files)
|
||||
or file.target_file.count('/') == 1
|
||||
):
|
||||
to_delete.append(i)
|
||||
for i in reversed(to_delete):
|
||||
del patch[i]
|
||||
@ -37,36 +49,46 @@ def remove_setup_files(model_patch: str, instance: dict, delete_setup_changes: b
|
||||
|
||||
|
||||
def main(
|
||||
prediction_file: str,
|
||||
prediction_file: str,
|
||||
):
|
||||
"""Main function to extract the model patches from the OpenHands prediction file and turn them into the expected SWT-Bench format."""
|
||||
with open(prediction_file) as f:
|
||||
for line in f:
|
||||
pred = json.loads(line)
|
||||
try:
|
||||
git_diff = pred["test_result"]["git_patch"]
|
||||
git_diff = pred['test_result']['git_patch']
|
||||
except KeyError:
|
||||
_LOGGER.warning("Warning: No git diff found for instance %s", pred["instance_id"])
|
||||
_LOGGER.warning(
|
||||
'Warning: No git diff found for instance %s', pred['instance_id']
|
||||
)
|
||||
continue
|
||||
ci_mode = pred["metadata"]["details"].get("mode", "") == "swt-ci"
|
||||
ci_mode = pred['metadata']['details'].get('mode', '') == 'swt-ci'
|
||||
try:
|
||||
git_diff = remove_setup_files(git_diff, pred["instance"], ci_mode)
|
||||
git_diff = remove_setup_files(git_diff, pred['instance'], ci_mode)
|
||||
except:
|
||||
_LOGGER.warning("Warning: Invalid git diff found for instance %s", pred["instance_id"])
|
||||
print(json.dumps({
|
||||
"instance_id": pred["instance_id"],
|
||||
"model_name_or_path": f'{pred["metadata"]["llm_config"]["openrouter_app_name"]}__{pred["metadata"]["agent_class"]}__{pred["metadata"]["llm_config"]["model"]}',
|
||||
"model_patch": git_diff,
|
||||
"full_output": json.dumps(pred),
|
||||
}))
|
||||
_LOGGER.warning(
|
||||
'Warning: Invalid git diff found for instance %s',
|
||||
pred['instance_id'],
|
||||
)
|
||||
print(
|
||||
json.dumps(
|
||||
{
|
||||
'instance_id': pred['instance_id'],
|
||||
'model_name_or_path': f'{pred["metadata"]["llm_config"]["openrouter_app_name"]}__{pred["metadata"]["agent_class"]}__{pred["metadata"]["llm_config"]["model"]}',
|
||||
'model_patch': git_diff,
|
||||
'full_output': json.dumps(pred),
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--prediction_file",
|
||||
'--prediction_file',
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the prediction file (.../outputs.jsonl)",
|
||||
help='Path to the prediction file (.../outputs.jsonl)',
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
@ -475,4 +475,3 @@ class RemoteRuntime(ActionExecutionClient):
|
||||
|
||||
def _stop_if_closed(self, retry_state: tenacity.RetryCallState) -> bool:
|
||||
return self._runtime_closed
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user