[chore] Just linting on swe-bench files (#7918)

2025-12-26 05:48:36 +08:00 · 2025-04-18 16:12:01 +02:00 · 2025-04-18 16:12:01 +02:00 · 9b9b1291fc
commit 9b9b1291fc
parent 6171395ef9
5 changed files with 652 additions and 618 deletions
--- a/evaluation/benchmarks/swe_bench/README.md
+++ b/evaluation/benchmarks/swe_bench/README.md
@@ -214,7 +214,7 @@ In order to run evaluation of the obtained inference results in the SWT-Bench ha
 ```bash
 python3 evaluation/benchmarks/swe_bench/scripts/swtbench/convert.py --prediction_file [output.jsonl] > [output_swt.jsonl]

-# Example  
+# Example
 python3 evaluation/benchmarks/swe_bench/scripts/swtbench/convert.py --prediction_file "evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Verified-test/CodeActAgent/gpt-4o-2024-11-20_maxiter_100_N_v0.31.0-no-hint-swt-run_1/output.jsonl" > OpenHands-gpt-4o-2024-11-20.jsonl
 ```

@@ -238,4 +238,4 @@ The results of the evaluation can be obtained by running the reporting script of
 ```bash
 # Example
 python -m src.report run_instance_swt_logs/OpenHands-CodeAct-gpt-4o-2024-11-20/OpenHands__CodeActAgent__gpt-4o-2024-11-20 --dataset verified
-```
+```
--- a/evaluation/benchmarks/swe_bench/resource/swt_bench_constants.py
+++ b/evaluation/benchmarks/swe_bench/resource/swt_bench_constants.py
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -10,11 +10,6 @@ import toml
 from datasets import load_dataset

 import openhands.agenthub
-from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import (
-    MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE,
-    MAP_REPO_TO_INSTALL,
-    MAP_VERSION_TO_INSTALL
-)
 from evaluation.benchmarks.swe_bench.binary_patch_utils import (
    remove_binary_diffs,
    remove_binary_files_from_git,
@@ -22,6 +17,11 @@ from evaluation.benchmarks.swe_bench.binary_patch_utils import (
 from evaluation.benchmarks.swe_bench.resource.mapping import (
    get_instance_resource_factor,
 )
+from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import (
+    MAP_REPO_TO_INSTALL,
+    MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE,
+    MAP_VERSION_TO_INSTALL,
+)
 from evaluation.utils.shared import (
    EvalException,
    EvalMetadata,
@@ -60,7 +60,7 @@ from openhands.utils.shutdown_listener import sleep_if_should_continue

 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
 RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
-BenchMode = Literal["swe", "swt", "swt-ci"]
+BenchMode = Literal['swe', 'swt', 'swt-ci']


 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
@@ -74,9 +74,13 @@ def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:

 def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageAction:
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
-    mode = metadata.details["mode"]
+    mode = metadata.details['mode']
    if mode.startswith('swt'):
-        test_instructions = f"The following command can be used to run the tests: `{list(MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE[instance.repo].values())[0]}`. Make sure they fail in the expected way.\n" if mode.endswith("ci") else ""
+        test_instructions = (
+            f'The following command can be used to run the tests: `{list(MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE[instance.repo].values())[0]}`. Make sure they fail in the expected way.\n'
+            if mode.endswith('ci')
+            else ''
+        )
        instruction = f"""\
 <uploaded_files>
 /workspace/{workspace_dir_name}
@@ -387,20 +391,22 @@ def initialize_runtime(
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')

-    if metadata.details["mode"] == "swt-ci":
+    if metadata.details['mode'] == 'swt-ci':
        # set up repo
        setup_commands = []
-        if instance["repo"] in MAP_REPO_TO_INSTALL:
-            setup_commands.append(MAP_REPO_TO_INSTALL[instance["repo"]])
+        if instance['repo'] in MAP_REPO_TO_INSTALL:
+            setup_commands.append(MAP_REPO_TO_INSTALL[instance['repo']])

        # Run pre-install set up if provided
-        install = MAP_VERSION_TO_INSTALL.get(instance['repo'], {}).get(instance['version'], [])
-        if "pre_install" in install:
-            for pre_install in install["pre_install"]:
+        install = MAP_VERSION_TO_INSTALL.get(instance['repo'], {}).get(
+            instance['version'], []
+        )
+        if 'pre_install' in install:
+            for pre_install in install['pre_install']:
                setup_commands.append(pre_install)

-        if "install" in install:
-            setup_commands.append(install["install"])
+        if 'install' in install:
+            setup_commands.append(install['install'])

        for command in setup_commands:
            action = CmdRunAction(command=command)
@@ -409,7 +415,6 @@ def initialize_runtime(
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})

-
    if 'multimodal' not in metadata.dataset.lower():
        # Only for non-multimodal datasets, we need to activate the testbed environment for Python
        # SWE-Bench multimodal datasets are not using the testbed environment
@@ -775,7 +780,7 @@ if __name__ == '__main__':
    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

-    details = {"mode": args.mode}
+    details = {'mode': args.mode}
    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)

    dataset_descrption = (
--- a/evaluation/benchmarks/swe_bench/scripts/swtbench/convert.py
+++ b/evaluation/benchmarks/swe_bench/scripts/swtbench/convert.py
@@ -1,35 +1,47 @@
-import json
 import argparse
+import json
 import logging

-
 import unidiff

-from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import MAP_VERSION_TO_INSTALL
+from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import (
+    MAP_VERSION_TO_INSTALL,
+)

 _LOGGER = logging.getLogger(__name__)


 def remove_setup_files(model_patch: str, instance: dict, delete_setup_changes: bool):
-    """ Discard all changes that a patch applies to files changes by the pre_install script and that are reproduction scripts (top-level script)"""
-    setup_files = ["setup.py", "tox.ini", "pyproject.toml"]
-    pre_install = MAP_VERSION_TO_INSTALL.get(instance["repo"], {}).get(instance["version"], {}).get("pre_install", [])
-    relevant_files = [
-        file
-        for file in setup_files
-        if any(file in install and "sed" in install for install in pre_install)
-    ] if delete_setup_changes else []
+    """Discard all changes that a patch applies to files changes by the pre_install script and that are reproduction scripts (top-level script)"""
+    setup_files = ['setup.py', 'tox.ini', 'pyproject.toml']
+    pre_install = (
+        MAP_VERSION_TO_INSTALL.get(instance['repo'], {})
+        .get(instance['version'], {})
+        .get('pre_install', [])
+    )
+    relevant_files = (
+        [
+            file
+            for file in setup_files
+            if any(file in install and 'sed' in install for install in pre_install)
+        ]
+        if delete_setup_changes
+        else []
+    )
    for i in range(10):
        try:
            # Appearently outputs.jsonl has .strip() applied, so we try to reconstruct the original patch by adding auxiliary whitespace
-            patch = unidiff.PatchSet(model_patch + i*"\n")
+            patch = unidiff.PatchSet(model_patch + i * '\n')
            break
-        except unidiff.UnidiffParseError as e:
+        except unidiff.UnidiffParseError:
            pass

    to_delete = []
    for i, file in enumerate(patch):
-        if any(f in file.source_file for f in relevant_files) or file.target_file.count("/") == 1:
+        if (
+            any(f in file.source_file for f in relevant_files)
+            or file.target_file.count('/') == 1
+        ):
            to_delete.append(i)
    for i in reversed(to_delete):
        del patch[i]
@@ -37,36 +49,46 @@ def remove_setup_files(model_patch: str, instance: dict, delete_setup_changes: b


 def main(
-        prediction_file: str,
+    prediction_file: str,
 ):
    """Main function to extract the model patches from the OpenHands prediction file and turn them into the expected SWT-Bench format."""
    with open(prediction_file) as f:
        for line in f:
            pred = json.loads(line)
            try:
-                git_diff = pred["test_result"]["git_patch"]
+                git_diff = pred['test_result']['git_patch']
            except KeyError:
-                _LOGGER.warning("Warning: No git diff found for instance %s", pred["instance_id"])
+                _LOGGER.warning(
+                    'Warning: No git diff found for instance %s', pred['instance_id']
+                )
                continue
-            ci_mode = pred["metadata"]["details"].get("mode", "") == "swt-ci"
+            ci_mode = pred['metadata']['details'].get('mode', '') == 'swt-ci'
            try:
-                git_diff = remove_setup_files(git_diff, pred["instance"], ci_mode)
+                git_diff = remove_setup_files(git_diff, pred['instance'], ci_mode)
            except:
-                _LOGGER.warning("Warning: Invalid git diff found for instance %s", pred["instance_id"])
-            print(json.dumps({
-                "instance_id": pred["instance_id"],
-                "model_name_or_path": f'{pred["metadata"]["llm_config"]["openrouter_app_name"]}__{pred["metadata"]["agent_class"]}__{pred["metadata"]["llm_config"]["model"]}',
-                "model_patch": git_diff,
-                "full_output": json.dumps(pred),
-            }))
+                _LOGGER.warning(
+                    'Warning: Invalid git diff found for instance %s',
+                    pred['instance_id'],
+                )
+            print(
+                json.dumps(
+                    {
+                        'instance_id': pred['instance_id'],
+                        'model_name_or_path': f'{pred["metadata"]["llm_config"]["openrouter_app_name"]}__{pred["metadata"]["agent_class"]}__{pred["metadata"]["llm_config"]["model"]}',
+                        'model_patch': git_diff,
+                        'full_output': json.dumps(pred),
+                    }
+                )
+            )
+

 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
-        "--prediction_file",
+        '--prediction_file',
        type=str,
        required=True,
-        help="Path to the prediction file (.../outputs.jsonl)",
+        help='Path to the prediction file (.../outputs.jsonl)',
    )
    args = parser.parse_args()

--- a/openhands/runtime/impl/remote/remote_runtime.py
+++ b/openhands/runtime/impl/remote/remote_runtime.py
@@ -475,4 +475,3 @@ class RemoteRuntime(ActionExecutionClient):

    def _stop_if_closed(self, retry_state: tenacity.RetryCallState) -> bool:
        return self._runtime_closed
-