diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py
index 8372c30ca0..de2e118cb3 100644
--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@@ -3,6 +3,7 @@ import tempfile
 import time
 
 import pandas as pd
+from pydantic import BaseModel
 from swebench.harness.grading import get_eval_report
 from swebench.harness.run_evaluation import (
     APPLY_PATCH_FAIL,
@@ -34,36 +35,6 @@ DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xing
 logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
 
 
-def process_git_patch(patch):
-    if not isinstance(patch, str):
-        return ''
-
-    if not patch.strip():
-        # skip empty patches
-        return ''
-
-    patch = patch.replace('\r\n', '\n')
-    # There might be some weird characters at the beginning of the patch
-    # due to some OpenHands inference command outputs
-
-    # FOR EXAMPLE:
-    # git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90
-    # [A[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[K0
-    # diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py
-    # new file mode 100644
-    # index 0000000000..fc13db5948
-
-    # We "find" the first line that starts with "diff" and then we remove lines before it
-    lines = patch.split('\n')
-    for i, line in enumerate(lines):
-        if line.startswith('diff --git'):
-            patch = '\n'.join(lines[i:])
-            break
-
-    patch = patch.rstrip() + '\n'  # Make sure the last line ends with a newline
-    return patch
-
-
 def get_config(instance: pd.Series) -> AppConfig:
     # We use a different instance image for the each instance of swe-bench eval
     base_container_image = get_instance_docker_image(instance['instance_id'])
@@ -89,6 +60,13 @@ def get_config(instance: pd.Series) -> AppConfig:
     return config
 
 
+class SWEBenchEvalResult(BaseModel):  # pydantic model; every field below is required (no defaults)
+    instance_id: str  # presumably the SWE-bench instance identifier -- TODO confirm with caller
+    apply_patch_output: str  # presumably raw output of the patch-apply step -- TODO confirm
+    test_output: str  # presumably raw output of the test run -- TODO confirm
+    resolved: bool  # presumably mirrors the report's 'resolved' flag -- TODO confirm
+
+
 def process_instance(
     instance: pd.Series,
     metadata: EvalMetadata | None = None,
@@ -116,7 +94,6 @@ def process_instance(
         'resolved': False,
         'failed_apply_patch': False,
         'error_eval': False,
-        'test_timeout': False,
     }
 
     if model_patch == '':
@@ -193,14 +170,13 @@ def process_instance(
 
                 # Poll for completion
                 start_time = time.time()
-                timeout = 1800  # 30 minutes
+                timeout = 900  # 15 minutes
                 while True:
                     seconds_elapsed = time.time() - start_time
                     if seconds_elapsed > timeout:
                         logger.info(
                             f'[{instance_id}] Evaluation timed out after {timeout} seconds'
                         )
-                        instance['test_result']['report']['test_timeout'] = True
                         break
                     check_action = CmdRunAction(
                         command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False
@@ -339,9 +315,6 @@ if __name__ == '__main__':
         set(predictions.columns)
     ), 'Input file must contain instance_id and model_patch columns.'
 
-    # Process model_patch
-    predictions['model_patch'] = predictions['model_patch'].apply(process_git_patch)
-
     # Merge predictions with dataset
     predictions['instance'] = predictions['instance_id'].apply(
         lambda x: instance_id_to_instance[x]
diff --git a/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py b/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py
index 5006d3dde3..5132eb355a 100644
--- a/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py
+++ b/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py
@@ -3,8 +3,6 @@ import os
 
 import pandas as pd
 
-from evaluation.swe_bench.eval_infer import process_git_patch
-
 parser = argparse.ArgumentParser()
 parser.add_argument('oh_output_file', type=str)
 args = parser.parse_args()
@@ -16,6 +14,36 @@ oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
 model_name = os.path.basename(os.path.dirname(args.oh_output_file))
 
 
+def process_git_patch(patch):
+    """Normalize a patch: CRLF to LF, drop text before 'diff --git', end in newline."""
+    if not isinstance(patch, str):
+        return ''
+
+    if not patch.strip():
+        # skip empty patches
+        return ''
+
+    patch = patch.replace('\r\n', '\n')
+    # There might be some weird characters at the beginning of the patch
+    # due to some OpenHands inference command outputs
+    # FOR EXAMPLE:
+    # git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90
+    # [A[C[C[C[C ... [C[C[K0   (a long run of such sequences, abbreviated here)
+    # diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py
+    # new file mode 100644
+    # index 0000000000..fc13db5948
+
+    # Drop everything before the first "diff --git" line (unchanged if none is found)
+    lines = patch.split('\n')
+    for i, line in enumerate(lines):
+        if line.startswith('diff --git'):
+            patch = '\n'.join(lines[i:])
+            break
+
+    patch = patch.rstrip() + '\n'  # Make sure the last line ends with a newline
+    return patch
+
+
 def convert_row_to_swebench_format(row):
     if 'git_patch' in row:
         model_patch = row['git_patch']