diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py
index 8372c30ca0..de2e118cb3 100644
--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@@ -3,6 +3,7 @@ import tempfile
 import time
 
 import pandas as pd
+from pydantic import BaseModel
 from swebench.harness.grading import get_eval_report
 from swebench.harness.run_evaluation import (
     APPLY_PATCH_FAIL,
@@ -34,36 +35,6 @@ DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xing
 logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
 
 
-def process_git_patch(patch):
-    if not isinstance(patch, str):
-        return ''
-
-    if not patch.strip():
-        # skip empty patches
-        return ''
-
-    patch = patch.replace('\r\n', '\n')
-    # There might be some weird characters at the beginning of the patch
-    # due to some OpenHands inference command outputs
-
-    # FOR EXAMPLE:
-    # git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90
-    # 0
-    # diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py
-    # new file mode 100644
-    # index 0000000000..fc13db5948
-
-    # We "find" the first line that starts with "diff" and then we remove lines before it
-    lines = patch.split('\n')
-    for i, line in enumerate(lines):
-        if line.startswith('diff --git'):
-            patch = '\n'.join(lines[i:])
-            break
-
-    patch = patch.rstrip() + '\n'  # Make sure the last line ends with a newline
-    return patch
-
-
 def get_config(instance: pd.Series) -> AppConfig:
     # We use a different instance image for the each instance of swe-bench eval
     base_container_image = get_instance_docker_image(instance['instance_id'])
@@ -89,6 +60,13 @@ def get_config(instance: pd.Series) -> AppConfig:
     return config
 
 
+class SWEBenchEvalResult(BaseModel):
+    instance_id: str
+    apply_patch_output: str
+    test_output: str
+    resolved: bool
+
+
 def process_instance(
     instance: pd.Series,
     metadata: EvalMetadata | None = None,
@@ -116,7 +94,6 @@ def process_instance(
         'resolved': False,
         'failed_apply_patch': False,
         'error_eval': False,
-        'test_timeout': False,
     }
 
     if model_patch == '':
@@ -193,14 +170,13 @@ def process_instance(
 
                 # Poll for completion
                 start_time = time.time()
-                timeout = 1800  # 30 minutes
+                timeout = 900  # 15 minutes
                 while True:
                     seconds_elapsed = time.time() - start_time
                     if seconds_elapsed > timeout:
                         logger.info(
                             f'[{instance_id}] Evaluation timed out after {timeout} seconds'
                         )
-                        instance['test_result']['report']['test_timeout'] = True
                         break
                     check_action = CmdRunAction(
                         command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False
@@ -339,9 +315,6 @@ if __name__ == '__main__':
         set(predictions.columns)
     ), 'Input file must contain instance_id and model_patch columns.'
 
-    # Process model_patch
-    predictions['model_patch'] = predictions['model_patch'].apply(process_git_patch)
-
     # Merge predictions with dataset
     predictions['instance'] = predictions['instance_id'].apply(
         lambda x: instance_id_to_instance[x]
diff --git a/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py b/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py
index 5006d3dde3..5132eb355a 100644
--- a/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py
+++ b/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py
@@ -3,8 +3,6 @@ import os
 
 import pandas as pd
 
-from evaluation.swe_bench.eval_infer import process_git_patch
-
 parser = argparse.ArgumentParser()
 parser.add_argument('oh_output_file', type=str)
 args = parser.parse_args()
@@ -16,6 +14,36 @@ oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
 model_name = os.path.basename(os.path.dirname(args.oh_output_file))
 
 
+def process_git_patch(patch):
+    if not isinstance(patch, str):
+        return ''
+
+    if not patch.strip():
+        # skip empty patches
+        return ''
+
+    patch = patch.replace('\r\n', '\n')
+    # There might be some weird characters at the beginning of the patch
+    # due to some OpenHands inference command outputs
+
+    # FOR EXAMPLE:
+    # git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90
+    # 0
+    # diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py
+    # new file mode 100644
+    # index 0000000000..fc13db5948
+
+    # We "find" the first line that starts with "diff" and then we remove lines before it
+    lines = patch.split('\n')
+    for i, line in enumerate(lines):
+        if line.startswith('diff --git'):
+            patch = '\n'.join(lines[i:])
+            break
+
+    patch = patch.rstrip() + '\n'  # Make sure the last line ends with a newline
+    return patch
+
+
 def convert_row_to_swebench_format(row):
     if 'git_patch' in row:
         model_patch = row['git_patch']