[chore] Just linting on swe-bench files (#7918)

This commit is contained in:
Engel Nyst 2025-04-18 16:12:01 +02:00 committed by GitHub
parent 6171395ef9
commit 9b9b1291fc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 652 additions and 618 deletions

View File

@@ -214,7 +214,7 @@ In order to run evaluation of the obtained inference results in the SWT-Bench ha
```bash
python3 evaluation/benchmarks/swe_bench/scripts/swtbench/convert.py --prediction_file [output.jsonl] > [output_swt.jsonl]
# Example
# Example
python3 evaluation/benchmarks/swe_bench/scripts/swtbench/convert.py --prediction_file "evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Verified-test/CodeActAgent/gpt-4o-2024-11-20_maxiter_100_N_v0.31.0-no-hint-swt-run_1/output.jsonl" > OpenHands-gpt-4o-2024-11-20.jsonl
```
@@ -238,4 +238,4 @@ The results of the evaluation can be obtained by running the reporting script of
```bash
# Example
python -m src.report run_instance_swt_logs/OpenHands-CodeAct-gpt-4o-2024-11-20/OpenHands__CodeActAgent__gpt-4o-2024-11-20 --dataset verified
```
```

File diff suppressed because it is too large. [Load Diff]

View File

@@ -10,11 +10,6 @@ import toml
from datasets import load_dataset
import openhands.agenthub
from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import (
MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE,
MAP_REPO_TO_INSTALL,
MAP_VERSION_TO_INSTALL
)
from evaluation.benchmarks.swe_bench.binary_patch_utils import (
remove_binary_diffs,
remove_binary_files_from_git,
@@ -22,6 +17,11 @@ from evaluation.benchmarks.swe_bench.binary_patch_utils import (
from evaluation.benchmarks.swe_bench.resource.mapping import (
get_instance_resource_factor,
)
from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import (
MAP_REPO_TO_INSTALL,
MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE,
MAP_VERSION_TO_INSTALL,
)
from evaluation.utils.shared import (
EvalException,
EvalMetadata,
@@ -60,7 +60,7 @@ from openhands.utils.shutdown_listener import sleep_if_should_continue
USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
BenchMode = Literal["swe", "swt", "swt-ci"]
BenchMode = Literal['swe', 'swt', 'swt-ci']
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
@@ -74,9 +74,13 @@ def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageAction:
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
mode = metadata.details["mode"]
mode = metadata.details['mode']
if mode.startswith('swt'):
test_instructions = f"The following command can be used to run the tests: `{list(MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE[instance.repo].values())[0]}`. Make sure they fail in the expected way.\n" if mode.endswith("ci") else ""
test_instructions = (
f'The following command can be used to run the tests: `{list(MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE[instance.repo].values())[0]}`. Make sure they fail in the expected way.\n'
if mode.endswith('ci')
else ''
)
instruction = f"""\
<uploaded_files>
/workspace/{workspace_dir_name}
@@ -387,20 +391,22 @@ def initialize_runtime(
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')
if metadata.details["mode"] == "swt-ci":
if metadata.details['mode'] == 'swt-ci':
# set up repo
setup_commands = []
if instance["repo"] in MAP_REPO_TO_INSTALL:
setup_commands.append(MAP_REPO_TO_INSTALL[instance["repo"]])
if instance['repo'] in MAP_REPO_TO_INSTALL:
setup_commands.append(MAP_REPO_TO_INSTALL[instance['repo']])
# Run pre-install set up if provided
install = MAP_VERSION_TO_INSTALL.get(instance['repo'], {}).get(instance['version'], [])
if "pre_install" in install:
for pre_install in install["pre_install"]:
install = MAP_VERSION_TO_INSTALL.get(instance['repo'], {}).get(
instance['version'], []
)
if 'pre_install' in install:
for pre_install in install['pre_install']:
setup_commands.append(pre_install)
if "install" in install:
setup_commands.append(install["install"])
if 'install' in install:
setup_commands.append(install['install'])
for command in setup_commands:
action = CmdRunAction(command=command)
@@ -409,7 +415,6 @@ def initialize_runtime(
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
if 'multimodal' not in metadata.dataset.lower():
# Only for non-multimodal datasets, we need to activate the testbed environment for Python
# SWE-Bench multimodal datasets are not using the testbed environment
@@ -775,7 +780,7 @@ if __name__ == '__main__':
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
details = {"mode": args.mode}
details = {'mode': args.mode}
_agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
dataset_descrption = (

View File

@@ -1,35 +1,47 @@
import json
import argparse
import json
import logging
import unidiff
from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import MAP_VERSION_TO_INSTALL
from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import (
MAP_VERSION_TO_INSTALL,
)
_LOGGER = logging.getLogger(__name__)
def remove_setup_files(model_patch: str, instance: dict, delete_setup_changes: bool):
""" Discard all changes that a patch applies to files changes by the pre_install script and that are reproduction scripts (top-level script)"""
setup_files = ["setup.py", "tox.ini", "pyproject.toml"]
pre_install = MAP_VERSION_TO_INSTALL.get(instance["repo"], {}).get(instance["version"], {}).get("pre_install", [])
relevant_files = [
file
for file in setup_files
if any(file in install and "sed" in install for install in pre_install)
] if delete_setup_changes else []
"""Discard all changes that a patch applies to files changes by the pre_install script and that are reproduction scripts (top-level script)"""
setup_files = ['setup.py', 'tox.ini', 'pyproject.toml']
pre_install = (
MAP_VERSION_TO_INSTALL.get(instance['repo'], {})
.get(instance['version'], {})
.get('pre_install', [])
)
relevant_files = (
[
file
for file in setup_files
if any(file in install and 'sed' in install for install in pre_install)
]
if delete_setup_changes
else []
)
for i in range(10):
try:
# Appearently outputs.jsonl has .strip() applied, so we try to reconstruct the original patch by adding auxiliary whitespace
patch = unidiff.PatchSet(model_patch + i*"\n")
patch = unidiff.PatchSet(model_patch + i * '\n')
break
except unidiff.UnidiffParseError as e:
except unidiff.UnidiffParseError:
pass
to_delete = []
for i, file in enumerate(patch):
if any(f in file.source_file for f in relevant_files) or file.target_file.count("/") == 1:
if (
any(f in file.source_file for f in relevant_files)
or file.target_file.count('/') == 1
):
to_delete.append(i)
for i in reversed(to_delete):
del patch[i]
@@ -37,36 +49,46 @@ def remove_setup_files(model_patch: str, instance: dict, delete_setup_changes: b
def main(
prediction_file: str,
prediction_file: str,
):
"""Main function to extract the model patches from the OpenHands prediction file and turn them into the expected SWT-Bench format."""
with open(prediction_file) as f:
for line in f:
pred = json.loads(line)
try:
git_diff = pred["test_result"]["git_patch"]
git_diff = pred['test_result']['git_patch']
except KeyError:
_LOGGER.warning("Warning: No git diff found for instance %s", pred["instance_id"])
_LOGGER.warning(
'Warning: No git diff found for instance %s', pred['instance_id']
)
continue
ci_mode = pred["metadata"]["details"].get("mode", "") == "swt-ci"
ci_mode = pred['metadata']['details'].get('mode', '') == 'swt-ci'
try:
git_diff = remove_setup_files(git_diff, pred["instance"], ci_mode)
git_diff = remove_setup_files(git_diff, pred['instance'], ci_mode)
except:
_LOGGER.warning("Warning: Invalid git diff found for instance %s", pred["instance_id"])
print(json.dumps({
"instance_id": pred["instance_id"],
"model_name_or_path": f'{pred["metadata"]["llm_config"]["openrouter_app_name"]}__{pred["metadata"]["agent_class"]}__{pred["metadata"]["llm_config"]["model"]}',
"model_patch": git_diff,
"full_output": json.dumps(pred),
}))
_LOGGER.warning(
'Warning: Invalid git diff found for instance %s',
pred['instance_id'],
)
print(
json.dumps(
{
'instance_id': pred['instance_id'],
'model_name_or_path': f'{pred["metadata"]["llm_config"]["openrouter_app_name"]}__{pred["metadata"]["agent_class"]}__{pred["metadata"]["llm_config"]["model"]}',
'model_patch': git_diff,
'full_output': json.dumps(pred),
}
)
)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
"--prediction_file",
'--prediction_file',
type=str,
required=True,
help="Path to the prediction file (.../outputs.jsonl)",
help='Path to the prediction file (.../outputs.jsonl)',
)
args = parser.parse_args()

View File

@@ -475,4 +475,3 @@ class RemoteRuntime(ActionExecutionClient):
def _stop_if_closed(self, retry_state: tenacity.RetryCallState) -> bool:
return self._runtime_closed