fix(swebench): handle error in eval_infer and run_infer (#5017)

2025-12-26 05:48:36 +08:00 · 2024-11-15 09:04:56 -06:00 · 2024-11-15 09:04:56 -06:00 · bdc4513937
commit bdc4513937
parent ffc4d32440
2 changed files with 30 additions and 19 deletions
--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@ -263,23 +263,29 @@ def process_instance(
                        test_output_path = os.path.join(log_dir, 'test_output.txt')
                        with open(test_output_path, 'w') as f:
                            f.write(test_output)
-
-                        _report = get_eval_report(
-                            test_spec=test_spec,
-                            prediction={
-                                'model_patch': model_patch,
-                                'instance_id': instance_id,
-                            },
-                            log_path=test_output_path,
-                            include_tests_status=True,
-                        )
-                        report = _report[instance_id]
-                        logger.info(
-                            f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
-                        )
-                        instance['test_result']['report']['resolved'] = report[
-                            'resolved'
-                        ]
+                        try:
+                            _report = get_eval_report(
+                                test_spec=test_spec,
+                                prediction={
+                                    'model_patch': model_patch,
+                                    'instance_id': instance_id,
+                                },
+                                log_path=test_output_path,
+                                include_tests_status=True,
+                            )
+                            report = _report[instance_id]
+                            logger.info(
+                                f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
+                            )
+                            instance['test_result']['report']['resolved'] = report[
+                                'resolved'
+                            ]
+                        except Exception as e:
+                            logger.error(
+                                f'[{instance_id}] Error when getting eval report: {e}'
+                            )
+                            instance['test_result']['report']['resolved'] = False
+                            instance['test_result']['report']['error_eval'] = True
            else:
                logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}')
                instance['test_result']['report']['error_eval'] = True
@ -355,7 +361,7 @@ if __name__ == '__main__':

    if 'model_patch' not in predictions.columns:
        predictions['model_patch'] = predictions['test_result'].apply(
-            lambda x: x['git_patch']
+            lambda x: x.get('git_patch', '')
        )
    assert {'instance_id', 'model_patch'}.issubset(
        set(predictions.columns)
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@ -534,5 +534,10 @@ if __name__ == '__main__':
            instances[col] = instances[col].apply(lambda x: str(x))

    run_evaluation(
-        instances, metadata, output_file, args.eval_num_workers, process_instance
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+        timeout_seconds=120 * 60,  # 2 hour PER instance should be more than enough
    )