mirror of https://github.com/OpenHands/OpenHands.git
Support Logic Reasoning Benchmark (#1973)
parent 01ef90205d
commit a9823491e6
@ -0,0 +1,10 @@
{
    "Dataset": "ProntoQA",
    "Data split": "validation",
    "Number of Samples": 6,
    "Agent class": "CodeActAgent",
    "Model name": "gpt-4o-2024-05-13",
    "Start_time": "2024-05-29 17:51:09",
    "End_time": "2024-05-29 17:52:24",
    "Final Accuracy": "0.83"
}
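(This JSON matches the metadata that `run_infer.py`, included below, writes to `metadata.json` at the end of a run; `Final Accuracy` there is the fraction of instances whose parsed answer matched the ground truth.)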
File diff suppressed because one or more lines are too long
12 evaluation/logic_reasoning/.cache_program/facts.kfb Normal file
@ -0,0 +1,12 @@
Cold(Bob, True)
Quiet(Bob, True)
Red(Bob, True)
Smart(Bob, True)
Kind(Charlie, True)
Quiet(Charlie, True)
Red(Charlie, True)
Rough(Charlie, True)
Cold(Dave, True)
Kind(Dave, True)
Smart(Dave, True)
Quiet(Fiona, True)
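These cached facts are written by `PykeProgram.create_fact_file` in `logic_inference.py` below. As a minimal sketch of how pyke consumes them (assuming `scitools-pyke` is installed and the `.cache_program` directory above is used as the knowledge base, mirroring `check_specific_predicate`):

```python
# Minimal sketch: query the cached fact/rule base with pyke,
# mirroring check_specific_predicate in logic_inference.py.
from pyke import knowledge_engine

engine = knowledge_engine.engine('.cache_program')
engine.reset()
engine.activate('rules')  # compiles and activates rules.krb
with engine.prove_goal('facts.Smart(Bob, $label)') as gen:
    for vars, plan in gen:
        print(vars['label'])  # -> True
```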
52 evaluation/logic_reasoning/.cache_program/rules.krb Normal file
@ -0,0 +1,52 @@
fact1
    foreach
        facts.Quiet($x, True)
        facts.Cold($x, True)
    assert
        facts.Smart($x, True)

fact2
    foreach
        facts.Red($x, True)
        facts.Cold($x, True)
    assert
        facts.Round($x, True)

fact3
    foreach
        facts.Kind($x, True)
        facts.Rough($x, True)
    assert
        facts.Red($x, True)

fact4
    foreach
        facts.Quiet($x, True)
    assert
        facts.Rough($x, True)

fact5
    foreach
        facts.Cold($x, True)
        facts.Smart($x, True)
    assert
        facts.Red($x, True)

fact6
    foreach
        facts.Rough($x, True)
    assert
        facts.Cold($x, True)

fact7
    foreach
        facts.Red($x, True)
    assert
        facts.Rough($x, True)

fact8
    foreach
        facts.Smart(Dave, True)
        facts.Kind(Dave, True)
    assert
        facts.Quiet(Dave, True)
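Each rule above is the output of `PykeProgram.parse_forward_rule` in `logic_inference.py` below, which rewrites one `premise >>> conclusion` line of a logic program into pyke's forward-chaining syntax. A minimal sketch of that rewriting, using a hypothetical rule string in the same format:

```python
# Minimal sketch of the rewriting done by PykeProgram.parse_forward_rule.
rule = 'Quiet($x, True) && Cold($x, True) >>> Smart($x, True)'  # hypothetical input

premise, conclusion = rule.split('>>>')
premise_list = [p.strip() for p in premise.strip().split('&&')]
conclusion_list = [c.strip() for c in conclusion.strip().split('&&')]

pyke_rule = 'fact1\n\tforeach'
for p in premise_list:
    pyke_rule += f'\n\t\tfacts.{p}'
pyke_rule += '\n\tassert'
for c in conclusion_list:
    pyke_rule += f'\n\t\tfacts.{c}'
print(pyke_rule)  # reproduces the fact1 rule in rules.krb above
```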
43 evaluation/logic_reasoning/README.md Normal file
@ -0,0 +1,43 @@
# Logic Reasoning Evaluation

This folder contains the evaluation harness for evaluating agents on the logic reasoning benchmarks [ProntoQA](https://github.com/asaparov/prontoqa) and [ProofWriter](https://allenai.org/data/proofwriter).

## Configure OpenDevin and your LLM

Create a `config.toml` file at the root of the workspace if it does not already exist.

Add the following configuration:

```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
ssh_hostname = "localhost"
enable_auto_lint = true

# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0

[eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```

## Run Inference on logic_reasoning

The following command runs inference on the first example of the ProntoQA dataset, using the `gpt-4o` LLM config from your `config.toml`:

```bash
./evaluation/logic_reasoning/scripts/run_infer.sh ProntoQA gpt-4o 1
```

## Examples

See example output in
`./evaluation_outputs/outputs/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/output.jsonl`
and the final evaluation performance in
`./evaluation_outputs/outputs/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/metadata.json`
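As a quick sanity check, the accuracy reported in `metadata.json` can be recomputed from `output.jsonl` (a minimal sketch, assuming a finished run at the path above):

```python
# Minimal sketch: recompute the final accuracy from output.jsonl.
import json

output_file = (
    './evaluation_outputs/outputs/logic_reasoning/CodeActAgent/ProntoQA/'
    'gpt-4o-2024-05-13_maxiter_10_N_v1.5/output.jsonl'
)
with open(output_file) as f:
    results = [json.loads(line)['test_result']['result'] for line in f]
print(f'Accuracy: {sum(results) / len(results):.2f}')  # e.g. 0.83
```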
0 evaluation/logic_reasoning/__init__.py Normal file
20 evaluation/logic_reasoning/instruction.txt Normal file
@ -0,0 +1,20 @@
You are a helpful assistant assigned a logic reasoning task. You need to determine the correctness of a query given some facts and rules.
You can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed in the "<execute_ipython>" tag.
In this task, you need to use the code in [[logic_inference_path.py]] to help you. Specifically, you first need to instantiate a **LogicInferenceEngine** class and use the **safe_execute_program** method to prove the **logic programs**. You should receive *answer*, *flag*, and *error_message* from the output.

An example would look like this:
<execute_ipython>
import sys
sys.path.append(workspace_mount_path)
from logic_inference import LogicInferenceEngine
engine = LogicInferenceEngine(dataset_name, workspace_mount_path)
answer, flag, error_message = engine.safe_execute_program(logic_programs)
</execute_ipython>

Please send the *answer* variable through a message.

dataset_name:
[[dataset_name]]

logic_programs:
[[logic_programs]]
220 evaluation/logic_reasoning/logic_inference.py Normal file
@ -0,0 +1,220 @@
import os
import random
import re
import shutil

from pyke import knowledge_engine


class PykeProgram:
    def __init__(
        self, logic_program: str, dataset_name='ProntoQA', workspace_mount_path='./'
    ) -> None:
        self.logic_program = logic_program
        self.flag = self.parse_logic_program()
        self.dataset_name = dataset_name
        self.cache_dir = os.path.join(workspace_mount_path, '.cache_program')

        # prepare the files for facts and rules
        try:
            self.create_fact_file(self.Facts)
            self.create_rule_file(self.Rules)
            self.flag = True
        except Exception:
            self.flag = False

        self.answer_map = {
            'ProntoQA': self.answer_map_prontoqa,
            'ProofWriter': self.answer_map_proofwriter,
        }

    def parse_logic_program(self):
        keywords = ['Query:', 'Rules:', 'Facts:', 'Predicates:']
        program_str = self.logic_program
        for keyword in keywords:
            try:
                program_str, segment_list = self._parse_segment(program_str, keyword)
                setattr(self, keyword[:-1], segment_list)
            except Exception:
                setattr(self, keyword[:-1], None)

        return self.validate_program()

    def _parse_segment(self, program_str, key_phrase):
        remain_program_str, segment = program_str.split(key_phrase)
        segment_list = segment.strip().split('\n')
        for i in range(len(segment_list)):
            segment_list[i] = segment_list[i].split(':::')[0].strip()
        return remain_program_str, segment_list

    # check if the program is valid; if not, try to fix it
    def validate_program(self):
        if self.Rules is not None and self.Facts is not None:
            if not self.Rules[0] == '' and not self.Facts[0] == '':
                return True
        # try to fix the program
        tmp_rules = []
        tmp_facts = []
        statements = self.Facts if self.Facts is not None else self.Rules
        if statements is None:
            return False

        for fact in statements:
            if fact.find('>>>') >= 0:  # this is a rule
                tmp_rules.append(fact)
            else:
                tmp_facts.append(fact)
        self.Rules = tmp_rules
        self.Facts = tmp_facts
        return False

    def create_fact_file(self, facts):
        with open(os.path.join(self.cache_dir, 'facts.kfb'), 'w') as f:
            for fact in facts:
                # skip invalid facts that still contain a variable
                if '$x' not in fact:
                    f.write(fact + '\n')

    def create_rule_file(self, rules):
        pyke_rules = []
        for idx, rule in enumerate(rules):
            pyke_rules.append(self.parse_forward_rule(idx + 1, rule))

        with open(os.path.join(self.cache_dir, 'rules.krb'), 'w') as f:
            f.write('\n\n'.join(pyke_rules))

    # example rule: Furry($x, True) && Quite($x, True) >>> White($x, True)
    def parse_forward_rule(self, f_index, rule):
        premise, conclusion = rule.split('>>>')
        premise = premise.strip()
        # split the premise into multiple facts if needed
        premise = premise.split('&&')
        premise_list = [p.strip() for p in premise]

        conclusion = conclusion.strip()
        # split the conclusion into multiple facts if needed
        conclusion = conclusion.split('&&')
        conclusion_list = [c.strip() for c in conclusion]

        # create the Pyke rule
        pyke_rule = f'fact{f_index}\n\tforeach'
        for p in premise_list:
            pyke_rule += f'\n\t\tfacts.{p}'
        pyke_rule += '\n\tassert'
        for c in conclusion_list:
            pyke_rule += f'\n\t\tfacts.{c}'
        return pyke_rule

    def check_specific_predicate(self, subject_name, predicate_name, engine):
        """
        for example: Is Marvin from Mars?
        Query: FromMars(Marvin, $label)
        """
        results = []
        with engine.prove_goal(
            f'facts.{predicate_name}({subject_name}, $label)'
        ) as gen:
            for vars, plan in gen:
                results.append(vars['label'])

        with engine.prove_goal(
            f'rules.{predicate_name}({subject_name}, $label)'
        ) as gen:
            for vars, plan in gen:
                results.append(vars['label'])

        if len(results) == 1:
            return results[0]
        elif len(results) == 2:
            return results[0] and results[1]
        elif len(results) == 0:
            return None

    def parse_query(self, query):
        """
        Input Example: Metallic(Wren, False)
        """
        pattern = r'(\w+)\(([^,]+),\s*([^)]+)\)'
        match = re.match(pattern, query)
        if match:
            function_name = match.group(1)
            arg1 = match.group(2)
            arg2 = match.group(3)
            arg2 = arg2 == 'True'
            return function_name, arg1, arg2
        else:
            raise ValueError(f'Invalid query: {query}')

    def execute_program(self):
        # delete the compiled_krb dir left over from a previous run
        compiled_krb_dir = './models/compiled_krb'
        if os.path.exists(compiled_krb_dir):
            print('removing compiled_krb')
            shutil.rmtree(compiled_krb_dir)

        try:
            engine = knowledge_engine.engine(self.cache_dir)
            engine.reset()
            engine.activate('rules')
            engine.get_kb('facts')

            # parse the logic query into a pyke query
            predicate, subject, value_to_check = self.parse_query(self.Query[0])
            result = self.check_specific_predicate(subject, predicate, engine)
            answer = self.answer_map[self.dataset_name](result, value_to_check)
        except Exception as err:
            return None, err

        return answer, ''

    def answer_mapping(self, answer):
        return answer

    def answer_map_prontoqa(self, result, value_to_check):
        if result == value_to_check:
            return 'A'
        else:
            return 'B'

    def answer_map_proofwriter(self, result, value_to_check):
        if result is None:
            return 'C'
        elif result == value_to_check:
            return 'A'
        else:
            return 'B'


class LogicInferenceEngine:
    def __init__(self, dataset_name, workspace_mount_path):
        self.dataset_name = dataset_name
        self.workspace_mount_path = workspace_mount_path

    def random_backup(self):
        if self.dataset_name == 'ProntoQA':
            return random.choice(['A', 'B'])
        elif self.dataset_name == 'ProofWriter':
            return random.choice(['A', 'B', 'C'])

    def safe_execute_program(self, logic_program):
        program = PykeProgram(
            logic_program, self.dataset_name, self.workspace_mount_path
        )
        # cannot parse the program
        if not program.flag:
            answer = self.random_backup()
            return answer, 'parsing error', ''
        # execute the program
        answer, error_message = program.execute_program()
        # not executable
        if answer is None:
            answer = self.random_backup()
            return answer, 'execution error', error_message
        # successfully executed
        answer = program.answer_mapping(answer)
        return answer, 'success', ''
436 evaluation/logic_reasoning/run_infer.py Normal file
@ -0,0 +1,436 @@
import asyncio
import json
import logging
import multiprocessing as mp
import os
import pathlib
import shutil
import time
from concurrent.futures import ProcessPoolExecutor

from datasets import load_dataset
from tqdm import tqdm

from evaluation.swe_bench.swe_env_box import DockerSSHBox
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict


def cleanup():
    logger.info('Cleaning up child processes...')
    for process in mp.active_children():
        logger.info(f'Terminating child process: {process.name}')
        process.terminate()
        process.join()


def codeact_user_response(state: State) -> str:
    msg = (
        'Please continue working on the task on whatever approach you think is suitable.\n'
        'If you think you have solved the task, please run the following command: <execute_bash> exit </execute_bash>.\n'
        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
    )
    if state.history:
        user_msgs = [
            action
            for action, _ in state.history
            if isinstance(action, MessageAction) and action.source == 'user'
        ]
        if len(user_msgs) >= 2:
            # let the agent know that it can give up when it has tried 3 times
            return (
                msg
                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
            )
    return msg


def monologue_user_response(state: State) -> str:
    raise NotImplementedError('MonologueAgent should never ask for user responses.')


AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
    'MonologueAgent': monologue_user_response,
}

AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': 'When you think you have solved the question, please first send your answer to user through message and then exit.\n'
}


def get_choice(answer_str):
    choices = [
        'A',
        'B',
        'C',
        'D',
        'E',
        'F',
        'G',
        'H',
        'A)',
        'B)',
        'C)',
        'D)',
        'E)',
        'F)',
        'G)',
        'H)',
        'A.',
        'B.',
        'C.',
        'D.',
        'E.',
        'F.',
        'G.',
        'H.',
    ]
    for c in choices:
        if answer_str.startswith(c):
            return c.replace(')', '')

    if answer_str.startswith(':'):
        return answer_str.replace(':', '').replace('.', '').strip()
    return None


def get_test_result(
    model_answer: str,
    ground_truth: str,
) -> dict:
    gold_answer = ground_truth.replace('(', '').replace(')', '').strip()
    answer_str = model_answer if model_answer is not None else ''
    prediction = get_choice(answer_str)

    indicators = [
        'the correct option is',
        'the correct answer is',
        'The correct answer is',
        'The correct option is',
        'Thus, the answer is',
    ]
    if prediction is None:
        for indicator in indicators:
            if answer_str.find(indicator) >= 0:
                answer_str = answer_str.split(indicator)[1].strip()
                prediction = get_choice(answer_str)
                break

    is_correct = prediction == gold_answer
    test_result = {'result': is_correct}
    return test_result


def process_instance(
    instance,
    agent_class,
    # metadata,
    dataset_name,
    skip_workspace_mount,
    eval_output_dir,
    reset_logger: bool = True,
):
    old_workspace_mount_path = config.workspace_mount_path
    old_workspace_base = config.workspace_base
    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
    # create process-specific workspace dir
    # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
    # so that different agents don't interfere with each other.
    if not skip_workspace_mount:
        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)

    # reset workspace to config
    config.workspace_base = workspace_mount_path
    config.workspace_mount_path = workspace_mount_path

    # Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
    if reset_logger:
        # Set up logger
        log_file = os.path.join(
            eval_output_dir, 'logs', f'instance_{instance["id"]}.log'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {instance["id"]}.\nLOG: tail -f {log_file}'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)

    if not skip_workspace_mount:
        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')

    logic_inference_path = os.path.join(workspace_mount_path, 'logic_inference.py')
    if not os.path.exists(logic_inference_path):
        shutil.copyfile(
            './evaluation/logic_reasoning/logic_inference.py', logic_inference_path
        )
    logger.info(f'logic_inference.py copied to {workspace_mount_path}')

    cache_dir = os.path.join(workspace_mount_path, '.cache_program')
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    # Prepare instruction
    with open('./evaluation/logic_reasoning/instruction.txt', 'r') as f:
        instruction = f.read()

    instance_logic_programs = instance['raw_logic_programs'][0].strip()
    instruction = instruction.replace('[[dataset_name]]', dataset_name)
    instruction = instruction.replace('[[logic_programs]]', instance_logic_programs)
    instruction = instruction.replace(
        '[[logic_inference_path.py]]', logic_inference_path
    )

    # NOTE: You can actually set slightly different instructions for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')

    sandbox = DockerSSHBox()
    exit_code, command_output = sandbox.execute('pip install scitools-pyke')

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    state: State = asyncio.run(
        main(
            instruction,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
            sandbox=sandbox,
        )
    )
    # ======= Attempt to evaluate the agent's edits =======
    # If you are working on a simpler benchmark that only evaluates the final model output (e.g., in a MessageAction),
    # you can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

    if state is None:
        raise ValueError('State should not be None.')

    final_message = ''
    messages = []
    for action, obs in reversed(state.history):
        messages.append(obs.content)
        if str(obs.content) in ["'A'", "'B'", "'C'"]:
            final_message = obs.content
            break

    final_message = final_message.strip("'")
    logger.info(f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}')

    test_result = get_test_result(
        model_answer=final_message, ground_truth=instance['answer']
    )

    # Save the output
    output = {
        'id': instance['id'],
        'instance': instance,
        'instruction': instruction,
        # 'metadata': metadata,
        'history': [
            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
        ],
        'final_message': final_message,
        'messages': messages,
        'error': state.error if state and state.error else None,
        'test_result': test_result,
    }
    config.workspace_mount_path = old_workspace_mount_path
    config.workspace_base = old_workspace_base

    # Close the sandbox
    sandbox.close()

    return output


if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--dataset',
        type=str,
        help='the logic reasoning dataset to evaluate on {ProntoQA, ProofWriter}',
        default='ProntoQA',
    )
    parser.add_argument(
        '--data_split',
        type=str,
        help='data split to evaluate on {validation}',  # right now we only support the validation split
        default='validation',
    )

    args, _ = parser.parse_known_args()
    if args.directory:
        config.workspace_base = os.path.abspath(args.directory)
        print(f'Setting workspace base to {config.workspace_base}')
    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenDevin's repo

    dataset_name = args.dataset
    data_split = args.data_split
    dataset = load_dataset(f'renma/{dataset_name}')
    logic_reasoning_tests = dataset[data_split]
    logger.info(f'Evaluating logic reasoning dataset {dataset_name} {data_split} split')

    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
    # for details of how to set `llm_config`
    if args.llm_config:
        specified_llm_config = get_llm_config_arg(args.llm_config)
        if specified_llm_config:
            config.llm = specified_llm_config
    logger.info(f'Config for evaluation: {config}')

    # TEST METADATA
    agent_class = args.agent_cls
    assert (
        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
    ), f'Unsupported agent class: {agent_class}'
    model_name = config.llm.model.split('/')[-1]
    max_iterations = args.max_iterations
    eval_note = ''
    if args.eval_note is not None:
        eval_note += '_N_' + args.eval_note

    eval_output_dir = os.path.join(
        args.eval_output_dir,
        'logic_reasoning',
        agent_class,
        dataset_name,
        model_name + '_maxiter_' + str(max_iterations) + eval_note,
    )

    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
        parents=True, exist_ok=True
    )
    logger.info(f'Using evaluation output directory: {eval_output_dir}')

    # LIMIT EVALUATION
    eval_n_limit = args.eval_n_limit
    if eval_n_limit:
        logic_reasoning_tests = logic_reasoning_tests.select(list(range(eval_n_limit)))
        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')

    start_time = time.strftime('%Y-%m-%d %H:%M:%S')

    # OUTPUT FILE
    output_file = os.path.join(eval_output_dir, 'output.jsonl')
    logger.info(f'Writing evaluation output to {output_file}')
    finished_task_ids = set()
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            for line in f:
                data = json.loads(line)
                finished_task_ids.add(data['id'])
        logger.warning(
            f'Output file {output_file} already exists. Loaded {len(finished_task_ids)} finished instances.'
        )
    output_fp = open(output_file, 'a')

    logger.info(
        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
    )

    # =============================================
    # filter out finished instances
    new_logic_reasoning_tests = []
    for instance in logic_reasoning_tests:
        if instance['id'] in finished_task_ids:
            logger.info(
                f'Skipping instance {instance["id"]} as it is already finished.'
            )
            continue
        new_logic_reasoning_tests.append(instance)

    logic_reasoning_tests = new_logic_reasoning_tests
    logger.info(
        f'Finished instances: {len(finished_task_ids)}, Remaining instances: {len(logic_reasoning_tests)}'
    )
    # =============================================

    pbar = tqdm(total=len(logic_reasoning_tests))

    # This function tracks the progress AND writes the output to a JSONL file
    def update_progress(future):
        pbar.update(1)
        output = future.result()
        pbar.set_description(f'Instance {output["id"]}')
        pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
        logger.info(
            f'Finished evaluation for instance {output["id"]}: {output["test_result"]["result"]}'
        )
        output_fp.write(json.dumps(output) + '\n')
        output_fp.flush()

    # This sets up the multiprocessing
    num_workers = args.eval_num_workers
    logger.info(f'Using {num_workers} workers for evaluation.')

    # This is SWE-Bench specific - CodeActAgent doesn't require a mounted workspace to work
    skip_workspace_mount = False
    logger.info(f'Skipping workspace mount: {skip_workspace_mount}')

    try:
        with ProcessPoolExecutor(num_workers) as executor:
            futures = []
            # This is how we perform multiprocessing
            for instance in logic_reasoning_tests:
                future = executor.submit(
                    process_instance,
                    instance,
                    agent_class,
                    dataset_name,
                    skip_workspace_mount,
                    eval_output_dir,
                    reset_logger=bool(num_workers > 1),
                )
                future.add_done_callback(update_progress)
                futures.append(future)

            # Wait for all futures to complete
            for future in futures:
                future.result()
    except KeyboardInterrupt:
        print('KeyboardInterrupt received. Cleaning up...')
        cleanup()

    output_fp.close()

    with open(output_file, 'r') as f:
        test_result = [json.loads(line)['test_result']['result'] for line in f]

    metadata = {
        'Dataset': dataset_name,
        'Data split': data_split,
        'Number of Samples': len(test_result),
        'Agent class': agent_class,
        'Model name': model_name,
        'Start_time': start_time,
        'End_time': time.strftime('%Y-%m-%d %H:%M:%S'),
        'Final Accuracy': f'{sum(test_result)/len(test_result):.2f}',
    }

    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
        json.dump(metadata, f, indent=4)

    logger.info(f'Metadata: {json.dumps(metadata, indent=4)}')
    logger.info(f'Evaluation finished. Metadata saved to {eval_output_dir}/metadata.json')
37 evaluation/logic_reasoning/scripts/run_infer.sh Executable file
@ -0,0 +1,37 @@
#!/bin/bash
DATASET=$1
MODEL_CONFIG=$2
EVAL_LIMIT=$3
AGENT=$4

# ################################################################################

if [ -z "$AGENT" ]; then
    echo "Agent not specified, using default CodeActAgent"
    AGENT="CodeActAgent"
fi

# IMPORTANT: Because the Agent's prompt changes fairly often in the rapidly evolving OpenDevin codebase,
# we need to track the version of the Agent in the evaluation to make sure results are comparable.
AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/logic_reasoning/run_infer.py \
    --agent-cls $AGENT \
    --llm-config $MODEL_CONFIG \
    --dataset $DATASET \
    --max-iterations 10 \
    --max-chars 10000000 \
    --eval-num-workers 1 \
    --eval-note $AGENT_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
    echo "EVAL_LIMIT: $EVAL_LIMIT"
    COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
eval $COMMAND