mirror of https://github.com/OpenHands/OpenHands.git
Support Logic Reasoning Benchmark (#1973)
parent 01ef90205d
commit a9823491e6
@ -0,0 +1,10 @@
{
    "Dataset": "ProntoQA",
    "Data split": "validation",
    "Number of Samples": 6,
    "Agent class": "CodeActAgent",
    "Model name": "gpt-4o-2024-05-13",
    "Start_time": "2024-05-29 17:51:09",
    "End_time": "2024-05-29 17:52:24",
    "Final Accuracy": "0.83"
}
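(This JSON matches the metadata that `run_infer.py`, included below, writes to `metadata.json` at the end of a run; `Final Accuracy` there is the fraction of instances whose parsed answer matched the ground truth.)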
File diff suppressed because one or more lines are too long
12 evaluation/logic_reasoning/.cache_program/facts.kfb Normal file
@ -0,0 +1,12 @@
Cold(Bob, True)
Quiet(Bob, True)
Red(Bob, True)
Smart(Bob, True)
Kind(Charlie, True)
Quiet(Charlie, True)
Red(Charlie, True)
Rough(Charlie, True)
Cold(Dave, True)
Kind(Dave, True)
Smart(Dave, True)
Quiet(Fiona, True)
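These cached facts are written by `PykeProgram.create_fact_file` in `logic_inference.py` below. As a minimal sketch of how pyke consumes them (assuming `scitools-pyke` is installed and the `.cache_program` directory above is used as the knowledge base, mirroring `check_specific_predicate`):

```python
# Minimal sketch: query the cached fact/rule base with pyke,
# mirroring check_specific_predicate in logic_inference.py.
from pyke import knowledge_engine

engine = knowledge_engine.engine('.cache_program')
engine.reset()
engine.activate('rules')  # compiles and activates rules.krb
with engine.prove_goal('facts.Smart(Bob, $label)') as gen:
    for vars, plan in gen:
        print(vars['label'])  # -> True
```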
52 evaluation/logic_reasoning/.cache_program/rules.krb Normal file
@ -0,0 +1,52 @@
fact1
    foreach
        facts.Quiet($x, True)
        facts.Cold($x, True)
    assert
        facts.Smart($x, True)

fact2
    foreach
        facts.Red($x, True)
        facts.Cold($x, True)
    assert
        facts.Round($x, True)

fact3
    foreach
        facts.Kind($x, True)
        facts.Rough($x, True)
    assert
        facts.Red($x, True)

fact4
    foreach
        facts.Quiet($x, True)
    assert
        facts.Rough($x, True)

fact5
    foreach
        facts.Cold($x, True)
        facts.Smart($x, True)
    assert
        facts.Red($x, True)

fact6
    foreach
        facts.Rough($x, True)
    assert
        facts.Cold($x, True)

fact7
    foreach
        facts.Red($x, True)
    assert
        facts.Rough($x, True)

fact8
    foreach
        facts.Smart(Dave, True)
        facts.Kind(Dave, True)
    assert
        facts.Quiet(Dave, True)
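Each rule above is the output of `PykeProgram.parse_forward_rule` in `logic_inference.py` below, which rewrites one `premise >>> conclusion` line of a logic program into pyke's forward-chaining syntax. A minimal sketch of that rewriting, using a hypothetical rule string in the same format:

```python
# Minimal sketch of the rewriting done by PykeProgram.parse_forward_rule.
rule = 'Quiet($x, True) && Cold($x, True) >>> Smart($x, True)'  # hypothetical input

premise, conclusion = rule.split('>>>')
premise_list = [p.strip() for p in premise.strip().split('&&')]
conclusion_list = [c.strip() for c in conclusion.strip().split('&&')]

pyke_rule = 'fact1\n\tforeach'
for p in premise_list:
    pyke_rule += f'\n\t\tfacts.{p}'
pyke_rule += '\n\tassert'
for c in conclusion_list:
    pyke_rule += f'\n\t\tfacts.{c}'
print(pyke_rule)  # reproduces the fact1 rule in rules.krb above
```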
43 evaluation/logic_reasoning/README.md Normal file
@ -0,0 +1,43 @@
# Logic Reasoning Evaluation

This folder contains the evaluation harness for evaluating agents on the logic reasoning benchmarks [ProntoQA](https://github.com/asaparov/prontoqa) and [ProofWriter](https://allenai.org/data/proofwriter).

## Configure OpenDevin and your LLM

Create a `config.toml` file at the root of the workspace if it does not already exist.

Add the following configuration:

```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
ssh_hostname = "localhost"
enable_auto_lint = true

# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0

[eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```

## Run Inference on logic_reasoning

The following command runs inference on the first example of the ProntoQA dataset, using the `gpt-4o` LLM config from your `config.toml`:

```bash
./evaluation/logic_reasoning/scripts/run_infer.sh ProntoQA gpt-4o 1
```

## Examples

See example output in
`./evaluation_outputs/outputs/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/output.jsonl`
and the final evaluation performance in
`./evaluation_outputs/outputs/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/metadata.json`
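As a quick sanity check, the accuracy reported in `metadata.json` can be recomputed from `output.jsonl` (a minimal sketch, assuming a finished run at the path above):

```python
# Minimal sketch: recompute the final accuracy from output.jsonl.
import json

output_file = (
    './evaluation_outputs/outputs/logic_reasoning/CodeActAgent/ProntoQA/'
    'gpt-4o-2024-05-13_maxiter_10_N_v1.5/output.jsonl'
)
with open(output_file) as f:
    results = [json.loads(line)['test_result']['result'] for line in f]
print(f'Accuracy: {sum(results) / len(results):.2f}')  # e.g. 0.83
```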
0 evaluation/logic_reasoning/__init__.py Normal file
20 evaluation/logic_reasoning/instruction.txt Normal file
@ -0,0 +1,20 @@
You are a helpful assistant assigned a logic reasoning task. You need to determine the correctness of a query given some facts and rules.
You can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed in the "<execute_ipython>" tag.
In this task, you need to use the code in [[logic_inference_path.py]] to help you. Specifically, you first need to instantiate a **LogicInferenceEngine** class and use the **safe_execute_program** method to prove the **logic programs**. You should receive *answer*, *flag*, and *error_message* from the output.

An example would look like this:
<execute_ipython>
import sys
sys.path.append(workspace_mount_path)
from logic_inference import LogicInferenceEngine
engine = LogicInferenceEngine(dataset_name, workspace_mount_path)
answer, flag, error_message = engine.safe_execute_program(logic_programs)
</execute_ipython>

Please send the *answer* variable through a message.

dataset_name:
[[dataset_name]]

logic_programs:
[[logic_programs]]
220 evaluation/logic_reasoning/logic_inference.py Normal file
@ -0,0 +1,220 @@
import os
import random
import re
import shutil

from pyke import knowledge_engine


class PykeProgram:
    def __init__(
        self, logic_program: str, dataset_name='ProntoQA', workspace_mount_path='./'
    ) -> None:
        self.logic_program = logic_program
        self.flag = self.parse_logic_program()
        self.dataset_name = dataset_name
        self.cache_dir = os.path.join(workspace_mount_path, '.cache_program')

        # prepare the files for facts and rules
        try:
            self.create_fact_file(self.Facts)
            self.create_rule_file(self.Rules)
            self.flag = True
        except Exception:
            self.flag = False

        self.answer_map = {
            'ProntoQA': self.answer_map_prontoqa,
            'ProofWriter': self.answer_map_proofwriter,
        }

    def parse_logic_program(self):
        keywords = ['Query:', 'Rules:', 'Facts:', 'Predicates:']
        program_str = self.logic_program
        for keyword in keywords:
            try:
                program_str, segment_list = self._parse_segment(program_str, keyword)
                setattr(self, keyword[:-1], segment_list)
            except Exception:
                setattr(self, keyword[:-1], None)

        return self.validate_program()

    def _parse_segment(self, program_str, key_phrase):
        remain_program_str, segment = program_str.split(key_phrase)
        segment_list = segment.strip().split('\n')
        for i in range(len(segment_list)):
            segment_list[i] = segment_list[i].split(':::')[0].strip()
        return remain_program_str, segment_list

    # check if the program is valid; if not, try to fix it
    def validate_program(self):
        if self.Rules is not None and self.Facts is not None:
            if not self.Rules[0] == '' and not self.Facts[0] == '':
                return True
        # try to fix the program
        tmp_rules = []
        tmp_facts = []
        statements = self.Facts if self.Facts is not None else self.Rules
        if statements is None:
            return False

        for fact in statements:
            if fact.find('>>>') >= 0:  # this is a rule
                tmp_rules.append(fact)
            else:
                tmp_facts.append(fact)
        self.Rules = tmp_rules
        self.Facts = tmp_facts
        return False

    def create_fact_file(self, facts):
        with open(os.path.join(self.cache_dir, 'facts.kfb'), 'w') as f:
            for fact in facts:
                # skip invalid facts that still contain a variable
                if '$x' not in fact:
                    f.write(fact + '\n')

    def create_rule_file(self, rules):
        pyke_rules = []
        for idx, rule in enumerate(rules):
            pyke_rules.append(self.parse_forward_rule(idx + 1, rule))

        with open(os.path.join(self.cache_dir, 'rules.krb'), 'w') as f:
            f.write('\n\n'.join(pyke_rules))

    # example rule: Furry($x, True) && Quite($x, True) >>> White($x, True)
    def parse_forward_rule(self, f_index, rule):
        premise, conclusion = rule.split('>>>')
        premise = premise.strip()
        # split the premise into multiple facts if needed
        premise = premise.split('&&')
        premise_list = [p.strip() for p in premise]

        conclusion = conclusion.strip()
        # split the conclusion into multiple facts if needed
        conclusion = conclusion.split('&&')
        conclusion_list = [c.strip() for c in conclusion]

        # create the Pyke rule
        pyke_rule = f'fact{f_index}\n\tforeach'
        for p in premise_list:
            pyke_rule += f'\n\t\tfacts.{p}'
        pyke_rule += '\n\tassert'
        for c in conclusion_list:
            pyke_rule += f'\n\t\tfacts.{c}'
        return pyke_rule

    def check_specific_predicate(self, subject_name, predicate_name, engine):
        """
        for example: Is Marvin from Mars?
        Query: FromMars(Marvin, $label)
        """
        results = []
        with engine.prove_goal(
            f'facts.{predicate_name}({subject_name}, $label)'
        ) as gen:
            for vars, plan in gen:
                results.append(vars['label'])

        with engine.prove_goal(
            f'rules.{predicate_name}({subject_name}, $label)'
        ) as gen:
            for vars, plan in gen:
                results.append(vars['label'])

        if len(results) == 1:
            return results[0]
        elif len(results) == 2:
            return results[0] and results[1]
        elif len(results) == 0:
            return None

    def parse_query(self, query):
        """
        Input Example: Metallic(Wren, False)
        """
        pattern = r'(\w+)\(([^,]+),\s*([^)]+)\)'
        match = re.match(pattern, query)
        if match:
            function_name = match.group(1)
            arg1 = match.group(2)
            arg2 = match.group(3)
            arg2 = arg2 == 'True'
            return function_name, arg1, arg2
        else:
            raise ValueError(f'Invalid query: {query}')

    def execute_program(self):
        # delete the compiled_krb dir left over from a previous run
        compiled_krb_dir = './models/compiled_krb'
        if os.path.exists(compiled_krb_dir):
            print('removing compiled_krb')
            shutil.rmtree(compiled_krb_dir)

        try:
            engine = knowledge_engine.engine(self.cache_dir)
            engine.reset()
            engine.activate('rules')
            engine.get_kb('facts')

            # parse the logic query into a pyke query
            predicate, subject, value_to_check = self.parse_query(self.Query[0])
            result = self.check_specific_predicate(subject, predicate, engine)
            answer = self.answer_map[self.dataset_name](result, value_to_check)
        except Exception as err:
            return None, err

        return answer, ''

    def answer_mapping(self, answer):
        return answer

    def answer_map_prontoqa(self, result, value_to_check):
        if result == value_to_check:
            return 'A'
        else:
            return 'B'

    def answer_map_proofwriter(self, result, value_to_check):
        if result is None:
            return 'C'
        elif result == value_to_check:
            return 'A'
        else:
            return 'B'


class LogicInferenceEngine:
    def __init__(self, dataset_name, workspace_mount_path):
        self.dataset_name = dataset_name
        self.workspace_mount_path = workspace_mount_path

    def random_backup(self):
        if self.dataset_name == 'ProntoQA':
            return random.choice(['A', 'B'])
        elif self.dataset_name == 'ProofWriter':
            return random.choice(['A', 'B', 'C'])

    def safe_execute_program(self, logic_program):
        program = PykeProgram(
            logic_program, self.dataset_name, self.workspace_mount_path
        )
        # cannot parse the program
        if not program.flag:
            answer = self.random_backup()
            return answer, 'parsing error', ''
        # execute the program
        answer, error_message = program.execute_program()
        # not executable
        if answer is None:
            answer = self.random_backup()
            return answer, 'execution error', error_message
        # successfully executed
        answer = program.answer_mapping(answer)
        return answer, 'success', ''
436 evaluation/logic_reasoning/run_infer.py Normal file
@ -0,0 +1,436 @@
import asyncio
import json
import logging
import multiprocessing as mp
import os
import pathlib
import shutil
import time
from concurrent.futures import ProcessPoolExecutor

from datasets import load_dataset
from tqdm import tqdm

from evaluation.swe_bench.swe_env_box import DockerSSHBox
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict


def cleanup():
    logger.info('Cleaning up child processes...')
    for process in mp.active_children():
        logger.info(f'Terminating child process: {process.name}')
        process.terminate()
        process.join()


def codeact_user_response(state: State) -> str:
    msg = (
        'Please continue working on the task on whatever approach you think is suitable.\n'
        'If you think you have solved the task, please run the following command: <execute_bash> exit </execute_bash>.\n'
        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
    )
    if state.history:
        user_msgs = [
            action
            for action, _ in state.history
            if isinstance(action, MessageAction) and action.source == 'user'
        ]
        if len(user_msgs) >= 2:
            # let the agent know that it can give up when it has tried 3 times
            return (
                msg
                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
            )
    return msg


def monologue_user_response(state: State) -> str:
    raise NotImplementedError('MonologueAgent should never ask for user responses.')


AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
    'MonologueAgent': monologue_user_response,
}

AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': 'When you think you have solved the question, please first send your answer to user through message and then exit.\n'
}


def get_choice(answer_str):
    choices = [
        'A',
        'B',
        'C',
        'D',
        'E',
        'F',
        'G',
        'H',
        'A)',
        'B)',
        'C)',
        'D)',
        'E)',
        'F)',
        'G)',
        'H)',
        'A.',
        'B.',
        'C.',
        'D.',
        'E.',
        'F.',
        'G.',
        'H.',
    ]
    for c in choices:
        if answer_str.startswith(c):
            return c.replace(')', '')

    if answer_str.startswith(':'):
        return answer_str.replace(':', '').replace('.', '').strip()
    return None


def get_test_result(
    model_answer: str,
    ground_truth: str,
) -> dict:
    gold_answer = ground_truth.replace('(', '').replace(')', '').strip()
    answer_str = model_answer if model_answer is not None else ''
    prediction = get_choice(answer_str)

    indicators = [
        'the correct option is',
        'the correct answer is',
        'The correct answer is',
        'The correct option is',
        'Thus, the answer is',
    ]
    if prediction is None:
        for indicator in indicators:
            if answer_str.find(indicator) >= 0:
                answer_str = answer_str.split(indicator)[1].strip()
                prediction = get_choice(answer_str)
                break

    is_correct = prediction == gold_answer
    test_result = {'result': is_correct}
    return test_result


def process_instance(
    instance,
    agent_class,
    # metadata,
    dataset_name,
    skip_workspace_mount,
    eval_output_dir,
    reset_logger: bool = True,
):
    old_workspace_mount_path = config.workspace_mount_path
    old_workspace_base = config.workspace_base
    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
    # create process-specific workspace dir
    # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
    # so that different agents don't interfere with each other.
    if not skip_workspace_mount:
        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)

    # reset workspace to config
    config.workspace_base = workspace_mount_path
    config.workspace_mount_path = workspace_mount_path

    # Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
    if reset_logger:
        # Set up logger
        log_file = os.path.join(
            eval_output_dir, 'logs', f'instance_{instance["id"]}.log'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {instance["id"]}.\nLOG: tail -f {log_file}'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)

    if not skip_workspace_mount:
        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')

    logic_inference_path = os.path.join(workspace_mount_path, 'logic_inference.py')
    if not os.path.exists(logic_inference_path):
        shutil.copyfile(
            './evaluation/logic_reasoning/logic_inference.py', logic_inference_path
        )
    logger.info(f'logic_inference.py copied to {workspace_mount_path}')

    cache_dir = os.path.join(workspace_mount_path, '.cache_program')
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    # Prepare instruction
    with open('./evaluation/logic_reasoning/instruction.txt', 'r') as f:
        instruction = f.read()

    instance_logic_programs = instance['raw_logic_programs'][0].strip()
    instruction = instruction.replace('[[dataset_name]]', dataset_name)
    instruction = instruction.replace('[[logic_programs]]', instance_logic_programs)
    instruction = instruction.replace(
        '[[logic_inference_path.py]]', logic_inference_path
    )

    # NOTE: You can actually set slightly different instructions for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')

    sandbox = DockerSSHBox()
    exit_code, command_output = sandbox.execute('pip install scitools-pyke')

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    state: State = asyncio.run(
        main(
            instruction,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
            sandbox=sandbox,
        )
    )
    # ======= Attempt to evaluate the agent's edits =======
    # If you are working on a simpler benchmark that only evaluates the final model output (e.g., in a MessageAction),
    # you can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

    if state is None:
        raise ValueError('State should not be None.')

    final_message = ''
    messages = []
    for action, obs in reversed(state.history):
        messages.append(obs.content)
        if str(obs.content) in ["'A'", "'B'", "'C'"]:
            final_message = obs.content
            break

    final_message = final_message.strip("'")
    logger.info(f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}')

    test_result = get_test_result(
        model_answer=final_message, ground_truth=instance['answer']
    )

    # Save the output
    output = {
        'id': instance['id'],
        'instance': instance,
        'instruction': instruction,
        # 'metadata': metadata,
        'history': [
            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
        ],
        'final_message': final_message,
        'messages': messages,
        'error': state.error if state and state.error else None,
        'test_result': test_result,
    }
    config.workspace_mount_path = old_workspace_mount_path
    config.workspace_base = old_workspace_base

    # Close the sandbox
    sandbox.close()

    return output


if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--dataset',
        type=str,
        help='the logic reasoning dataset to evaluate on {ProntoQA, ProofWriter}',
        default='ProntoQA',
    )
    parser.add_argument(
        '--data_split',
        type=str,
        help='data split to evaluate on {validation}',  # right now we only support the validation split
        default='validation',
    )

    args, _ = parser.parse_known_args()
    if args.directory:
        config.workspace_base = os.path.abspath(args.directory)
        print(f'Setting workspace base to {config.workspace_base}')
    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenDevin's repo

    dataset_name = args.dataset
    data_split = args.data_split
    dataset = load_dataset(f'renma/{dataset_name}')
    logic_reasoning_tests = dataset[data_split]
    logger.info(f'Evaluating logic reasoning dataset {dataset_name} {data_split} split')

    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
    # for details of how to set `llm_config`
    if args.llm_config:
        specified_llm_config = get_llm_config_arg(args.llm_config)
        if specified_llm_config:
            config.llm = specified_llm_config
    logger.info(f'Config for evaluation: {config}')

    # TEST METADATA
    agent_class = args.agent_cls
    assert (
        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
    ), f'Unsupported agent class: {agent_class}'
    model_name = config.llm.model.split('/')[-1]
    max_iterations = args.max_iterations
    eval_note = ''
    if args.eval_note is not None:
        eval_note += '_N_' + args.eval_note

    eval_output_dir = os.path.join(
        args.eval_output_dir,
        'logic_reasoning',
        agent_class,
        dataset_name,
        model_name + '_maxiter_' + str(max_iterations) + eval_note,
    )

    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
        parents=True, exist_ok=True
    )
    logger.info(f'Using evaluation output directory: {eval_output_dir}')

    # LIMIT EVALUATION
    eval_n_limit = args.eval_n_limit
    if eval_n_limit:
        logic_reasoning_tests = logic_reasoning_tests.select(list(range(eval_n_limit)))
        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')

    start_time = time.strftime('%Y-%m-%d %H:%M:%S')

    # OUTPUT FILE
    output_file = os.path.join(eval_output_dir, 'output.jsonl')
    logger.info(f'Writing evaluation output to {output_file}')
    finished_task_ids = set()
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            for line in f:
                data = json.loads(line)
                finished_task_ids.add(data['id'])
        logger.warning(
            f'Output file {output_file} already exists. Loaded {len(finished_task_ids)} finished instances.'
        )
    output_fp = open(output_file, 'a')

    logger.info(
        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
    )

    # =============================================
    # filter out finished instances
    new_logic_reasoning_tests = []
    for instance in logic_reasoning_tests:
        if instance['id'] in finished_task_ids:
            logger.info(
                f'Skipping instance {instance["id"]} as it is already finished.'
            )
            continue
        new_logic_reasoning_tests.append(instance)

    logic_reasoning_tests = new_logic_reasoning_tests
    logger.info(
        f'Finished instances: {len(finished_task_ids)}, Remaining instances: {len(logic_reasoning_tests)}'
    )
    # =============================================

    pbar = tqdm(total=len(logic_reasoning_tests))

    # This function tracks the progress AND writes the output to a JSONL file
    def update_progress(future):
        pbar.update(1)
        output = future.result()
        pbar.set_description(f'Instance {output["id"]}')
        pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
        logger.info(
            f'Finished evaluation for instance {output["id"]}: {output["test_result"]["result"]}'
        )
        output_fp.write(json.dumps(output) + '\n')
        output_fp.flush()

    # This sets up the multiprocessing
    num_workers = args.eval_num_workers
    logger.info(f'Using {num_workers} workers for evaluation.')

    # This is SWE-Bench specific - CodeActAgent doesn't require a mounted workspace to work
    skip_workspace_mount = False
    logger.info(f'Skipping workspace mount: {skip_workspace_mount}')

    try:
        with ProcessPoolExecutor(num_workers) as executor:
            futures = []
            # This is how we perform multiprocessing
            for instance in logic_reasoning_tests:
                future = executor.submit(
                    process_instance,
                    instance,
                    agent_class,
                    dataset_name,
                    skip_workspace_mount,
                    eval_output_dir,
                    reset_logger=bool(num_workers > 1),
                )
                future.add_done_callback(update_progress)
                futures.append(future)

            # Wait for all futures to complete
            for future in futures:
                future.result()
    except KeyboardInterrupt:
        print('KeyboardInterrupt received. Cleaning up...')
        cleanup()

    output_fp.close()

    with open(output_file, 'r') as f:
        test_result = [json.loads(line)['test_result']['result'] for line in f]

    metadata = {
        'Dataset': dataset_name,
        'Data split': data_split,
        'Number of Samples': len(test_result),
        'Agent class': agent_class,
        'Model name': model_name,
        'Start_time': start_time,
        'End_time': time.strftime('%Y-%m-%d %H:%M:%S'),
        'Final Accuracy': f'{sum(test_result)/len(test_result):.2f}',
    }

    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
        json.dump(metadata, f, indent=4)

    logger.info(f'Metadata: {json.dumps(metadata, indent=4)}')
    logger.info(f'Evaluation finished. Metadata saved to {eval_output_dir}/metadata.json')
37 evaluation/logic_reasoning/scripts/run_infer.sh Executable file
@ -0,0 +1,37 @@
#!/bin/bash
DATASET=$1
MODEL_CONFIG=$2
EVAL_LIMIT=$3
AGENT=$4

# ################################################################################

if [ -z "$AGENT" ]; then
    echo "Agent not specified, using default CodeActAgent"
    AGENT="CodeActAgent"
fi

# IMPORTANT: Because the Agent's prompt changes fairly often in the rapidly evolving OpenDevin codebase,
# we need to track the version of the Agent in the evaluation to make sure results are comparable.
AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/logic_reasoning/run_infer.py \
    --agent-cls $AGENT \
    --llm-config $MODEL_CONFIG \
    --dataset $DATASET \
    --max-iterations 10 \
    --max-chars 10000000 \
    --eval-num-workers 1 \
    --eval-note $AGENT_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
    echo "EVAL_LIMIT: $EVAL_LIMIT"
    COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
eval $COMMAND