mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 13:52:43 +08:00
* move multi-line bash tests to test_runtime; support multi-line bash for esruntime; * add testcase to handle PS2 prompt * use bashlex for bash parsing to handle multi-line commands; add testcases for multi-line commands * revert ghcr runtime change * Apply stash * fix run as other user; make test async; * fix test runtime for run as od * add run-as-devin to all the runtime tests * handle the case when username is root * move all run-as-devin tests from sandbox; only tests a few cases on different user to save time; * move over multi-line echo related tests to test_runtime * fix user-specific jupyter by fixing the pypoetry virtualenv folder * make plugin's init async; chdir at initialization of jupyter plugin; move ipy simple testcase to test runtime; * support agentskills import in move tests for jupyter pwd tests; overload `add_env_vars` for EventStreamRuntime to update env var also in Jupyter; make agentskills read env var lazily, in case env var is updated; * fix ServerRuntime agentskills issue * move agnostic image test to test_runtime * merge runtime tests in CI * fix enable auto lint as env var * update warning message * update warning message * test for different container images * change parsing output as debug * add exception handling for update_pwd_decorator * fix unit test indentation * add plugins as default input to Runtime class; remove init_sandbox_plugins; implement add_env_var (include jupyter) in the base class; * fix server runtime auto lint * Revert "add exception handling for update_pwd_decorator" This reverts commit 2b668b1506e02145cb8f87e321aad62febca3d50. * tries to print debugging info for agentskills * explictly setting uid (try fix permission issue) * Revert "tries to print debugging info for agentskills" This reverts commit 8be4c86756f0e3fc62957b327ba2ac4999c419de. 
* set sandbox user id during testing to hopefully fix the permission issue * add browser tools for server runtime * try to debug for old pwd * update debug cmd * only test agnostic runtime when TEST_RUNTIME is Server * fix temp dir mkdir * load TEST_RUNTIME at the beginning * remove ipython tests * only log to file when DEBUG * default logging to project root * temporarily remove log to file * fix LLM logger dir * fix logger * make set pwd an optional aux action * fix prev pwd * fix infinity recursion * simplify * do not import the whole od library to avoid logger folder by jupyter * fix browsing * increase timeout * attempt to fix agentskills yet again * clean up in testcases, since CI maybe run as non-root * add _cause attribute for event.id * remove parent * add a bunch of debugging statement again for CI :( * fix temp_dir fixture * change all temp dir to follow pytest's tmp_path_factory * remove extra bracket * clean up error printing a bit * jupyter chdir to self.config.workspace_mount_path_in_sandbox on initialization * jupyter chdir to self.config.workspace_mount_path_in_sandbox on initialization * add typing for tmp dir fixture * clear the directory before running the test to avoid weird CI temp dir * remove agnostic test case for server runtime * Revert "remove agnostic test case for server runtime" This reverts commit 30e2181c3fc1410e69596c2dcd06be01f1d016b3. 
* disable agnostic tests in CI * fix test * make sure plugin arg is not passed when no plugin is specified; remove redundant on_event function; * move mock prompt * rename runtime * remove extra logging * refactor run_controller's interface; support multiple runtime for integration test; filter out hostname for prompt * uncomment other tests * pass the right runtime to controller * log runtime when start * uncomment tests * improve symbol filters * add intergration test prompts that seemd ok * add integration test workflow * add python3 to default ubuntu image * symlink python and fix permission to jupyter pip * add retry for jupyter execute server * fix jupyter pip install; add post-process for jupyter pip install; simplify init by add agent_skills path to PYTHONPATH; add testcase to tests jupyter pip install; * fix bug * use ubuntu:22.04 for eventstream integration tests * add todo * update testcase * remove redundant code * fix unit test * reduce dependency for runtime * try making llama-index an optional dependency that's not installed by default * remove pip install since it seemd not needed * log ipython execution; await write message since it returns a future * update ipy testcase * do not install llama-index in CI * do not install llama-index in the app docker as well * set sandbox container image in the integration test script * log plugins & env var for runtime * update conftest for sha256 * add git * remove all non-alphanumeric chalracters * add working ipy module tests! 
* default to use host network * remove is_async from browser to make thing a little more reliable; retry loading browser when error; * add sleep to wait a bit for http server * kill http server before regenerate browsing tests * fix browsing * only set sandbox container image if undefined * skip empty config value * update evaluation to use the latest run_controller * revert logger in execute_server to be compatible with server runtime * revert logging level to fix jupyter * set logger level * revert the logging * chmod for workspace to fix permission * support getting timeout from action * update test for server runtime * try to fix file permission * fix test_cmd_run_action_serialization_deserialization test (added timeout) * poetry: pip 24.2, torch 2.2.2 * revert adding pip to pyproject.toml * add build to dependencies in pyproject.toml * forgot poetry lock --no-update * fix a DelegatorAgent prompt_002.log (timeout) * fix a DelegatorAgent prompt_003.log (timeout) * couple more timeout attribs in prompt files * some more prompt files * prompts galore * add clarification comment for timeout * default timeout to config * add assert * update integraton tests for eventstream * update integration tests * fix timeout for action<->dict * remove redundant on_event * default to use instance image * update run_controller interface * add logging for copy * refactor swe_bench for the new design * fix action execution timeout * updatelock * remove build sandbox locally * fix runtime * use plain for-loop for single process * remove extra print * get swebench inference working * print whole `test_result` dict * got swebench patch post-process working * update swe-bench evaluation readme * refactor using shared reset_logger function * move messy swebench prompt to a different file * support the ability to specify whether to keep prompt * support the ability to specify whether to keep prompt * fix dockerfile * fix import and remove unnecessary strip logic * fix action 
serialization * get agentbench running * remove extra ls for agent bench * fix agentbench metric * factor out common documentation for eval * update biocoder doc * remove swe_env_box since it is no longer needed * get biocoder working * add func timeout for bird * fix jupyter pwd with ~ as user name * fix jupyter pwd with ~ as user name * get bird working * get browsing evaluation working * make eda runnable * fix id column * fix eda run_infer * unify eval output using a structured format; make swebench compatible with that format; update client source code for every swebench run; do not inject testcmd for swebench * standardize existing benchmarks for the new eval output * set update source code = true * get gaia standardized * fix gaia * gorilla refactored but stuck at language.so to test * refactor and make gpqa work * refactor humanevalfix and get it working * refactor logic reasoning and get it working * refactor browser env so it works with eventstream runtime for eval * add initial version of miniwob refactor * fix browsergym environment * get miniwob working!! 
* allowing injecting additional dependency to OD runtime docker image * allowing injecting additional dependency to OD runtime docker image * support logic reasoning with pre-injected dependency * get mint working * update runtime build * fix mint docker * add test for keep_prompt; add missing await close for some tests * update integration tests for eventstream runtime * fix integration tests for server runtime * refactor ml bench and toolqa * refactor webarena * fix default factory * Update run_infer.py * add APIError to retry * increase timeout for swebench * make sure to hide api key when dump eval output * update the behavior of put source code to put files instead of tarball * add dishash to dependency * sendintr when timeout * fix dockerfile copy * reduce timeout * use dirhash to avoid repeat building for update source * fix runtime_build testcase * add dir_hash to docker build pipeline * revert api error * update poetry lock * add retries for swebench run infer * fix git patch * update poetry lock * adjust config order * fix mount volumns * enforce all eval to use "instance_id" * remove file store from runtime * make file_store public inside eventstream * move the runtime logic inside `main` out * support using async function for process_instance_fn * refactor run_infer with the create_time * fix file store * Update evaluation/toolqa/utils.py Co-authored-by: Graham Neubig <neubig@gmail.com> * fix typo --------- Co-authored-by: tobitege <tobitege@gmx.de> Co-authored-by: super-dainiu <78588128+super-dainiu@users.noreply.github.com> Co-authored-by: Graham Neubig <neubig@gmail.com>
221 lines
7.3 KiB
Python
221 lines
7.3 KiB
Python
import os
|
|
import random
|
|
import re
|
|
import shutil
|
|
|
|
from pyke import knowledge_engine
|
|
|
|
|
|
class PykeProgram:
    """Turn a textual logic program into Pyke fact/rule files and answer its query.

    The program text is split on the section keywords ``Predicates:``,
    ``Facts:``, ``Rules:`` and ``Query:``. Each section is stored as a list of
    statements on the attribute of the same name (``None`` when the section
    could not be extracted). ``flag`` is ``True`` once the fact and rule files
    have been written into ``cache_dir``.
    """

    def __init__(
        self, logic_program: str, dataset_name='ProntoQA', workspace_mount_path='./'
    ) -> None:
        """Parse ``logic_program`` and write its Pyke files.

        Args:
            logic_program: Full text of the logic program.
            dataset_name: Key into ``answer_map`` (``'ProntoQA'`` or
                ``'ProofWriter'``) selecting how results become answers.
            workspace_mount_path: Directory under which the
                ``.cache_program`` folder holding the Pyke files lives.
        """
        self.logic_program = logic_program
        self.flag = self.parse_logic_program()
        self.dataset_name = dataset_name
        self.cache_dir = os.path.join(workspace_mount_path, '.cache_program')

        # Prepare the files for facts and rules. validate_program() may have
        # repaired self.Facts / self.Rules even when it returned False, so the
        # files are written regardless and `flag` ends up reflecting whether
        # writing them succeeded.
        try:
            # Without the directory every open() below fails and the program
            # would be reported as unparsable for an unrelated reason.
            os.makedirs(self.cache_dir, exist_ok=True)
            self.create_fact_file(self.Facts)
            self.create_rule_file(self.Rules)
            self.flag = True
        except Exception:
            # Deliberate best-effort: missing sections (None), malformed rules
            # and I/O failures all just mark the program as unusable.
            self.flag = False

        self.answer_map = {
            'ProntoQA': self.answer_map_prontoqa,
            'ProofWriter': self.answer_map_proofwriter,
        }

    def parse_logic_program(self):
        """Split ``self.logic_program`` into sections and validate them.

        The keywords are processed in the order ``Query:``, ``Rules:``,
        ``Facts:``, ``Predicates:``; after each successful split only the text
        before the keyword is kept for the next one. A section that cannot be
        extracted is stored as ``None``.

        Returns:
            The result of :meth:`validate_program`.
        """
        keywords = ['Query:', 'Rules:', 'Facts:', 'Predicates:']
        program_str = self.logic_program
        for keyword in keywords:
            attr_name = keyword[:-1]  # drop the trailing ':'
            try:
                program_str, segment_list = self._parse_segment(program_str, keyword)
            except Exception:
                setattr(self, attr_name, None)
            else:
                setattr(self, attr_name, segment_list)

        return self.validate_program()

    def _parse_segment(self, program_str, key_phrase):
        """Split ``program_str`` at ``key_phrase`` and clean the trailing part.

        Returns:
            ``(text_before_key_phrase, statements)`` where each statement is
            one line of the section with anything after ``:::`` removed and
            surrounding whitespace stripped.

        Raises:
            ValueError: If ``key_phrase`` does not occur exactly once.
        """
        remain_program_str, segment = program_str.split(key_phrase)
        segment_list = [
            line.split(':::')[0].strip() for line in segment.strip().split('\n')
        ]
        return remain_program_str, segment_list

    def validate_program(self):
        """Check if the program is valid; if not, try to fix it.

        Returns:
            ``True`` when both ``Rules`` and ``Facts`` were found and neither
            starts with an empty statement. Otherwise ``False``; in that case,
            if at least one of the two sections exists, its statements are
            re-sorted into ``self.Rules`` (those containing ``>>>``) and
            ``self.Facts`` (the rest).
        """
        if self.Rules is not None and self.Facts is not None:
            if self.Rules[0] != '' and self.Facts[0] != '':
                return True

        # try to fix the program
        statements = self.Facts if self.Facts is not None else self.Rules
        if statements is None:
            return False

        tmp_rules = []
        tmp_facts = []
        for statement in statements:
            if '>>>' in statement:  # this is a rule
                tmp_rules.append(statement)
            else:
                tmp_facts.append(statement)
        self.Rules = tmp_rules
        self.Facts = tmp_facts
        return False

    def create_fact_file(self, facts):
        """Write ``facts`` to ``facts.kfb`` in the cache directory, one per line.

        Facts containing the variable ``$x`` are considered invalid and skipped.
        """
        with open(os.path.join(self.cache_dir, 'facts.kfb'), 'w') as f:
            for fact in facts:
                # check for invalid facts
                if '$x' not in fact:
                    f.write(fact + '\n')

    def create_rule_file(self, rules):
        """Convert ``rules`` to Pyke syntax and write them to ``rules.krb``.

        Rules are numbered from 1 and separated by a blank line. All rules are
        converted before the file is opened, so a malformed rule raises
        without touching the file.
        """
        pyke_rules = [
            self.parse_forward_rule(idx, rule) for idx, rule in enumerate(rules, 1)
        ]

        with open(os.path.join(self.cache_dir, 'rules.krb'), 'w') as f:
            f.write('\n\n'.join(pyke_rules))

    def parse_forward_rule(self, f_index, rule):
        """Render one ``premise >>> conclusion`` rule as a Pyke forward rule.

        Example rule: ``Furry($x, True) && Quite($x, True) >>> White($x, True)``

        Args:
            f_index: Number used in the generated rule name (``fact<f_index>``).
            rule: Rule text; premise and conclusion may each join several
                facts with ``&&``.

        Raises:
            ValueError: If ``rule`` does not contain exactly one ``>>>``.
        """
        premise, conclusion = rule.split('>>>')
        # split the premise / conclusion into multiple facts if needed
        premise_list = [p.strip() for p in premise.strip().split('&&')]
        conclusion_list = [c.strip() for c in conclusion.strip().split('&&')]

        # create the Pyke rule
        lines = [f'fact{f_index}', '\tforeach']
        lines.extend(f'\t\tfacts.{p}' for p in premise_list)
        lines.append('\tassert')
        lines.extend(f'\t\tfacts.{c}' for c in conclusion_list)
        return '\n'.join(lines)

    def check_specific_predicate(self, subject_name, predicate_name, engine):
        """Look up the label of ``predicate_name(subject_name, $label)``.

        For example, "Is Marvin from Mars?" is the query
        ``FromMars(Marvin, $label)``.

        The goal is proven against the ``facts`` knowledge base first and then
        against ``rules``; every ``$label`` binding found is collected.

        Returns:
            The single label when exactly one was found, ``first and second``
            when exactly two were found, and ``None`` otherwise (no result, or
            more than two results).
        """
        results = []
        for kb_name in ('facts', 'rules'):
            goal = f'{kb_name}.{predicate_name}({subject_name}, $label)'
            with engine.prove_goal(goal) as gen:
                for bindings, _plan in gen:
                    results.append(bindings['label'])

        if len(results) == 1:
            return results[0]
        if len(results) == 2:
            return results[0] and results[1]
        return None

    def parse_query(self, query):
        """Split a query such as ``Metallic(Wren, False)`` into its parts.

        Returns:
            ``(predicate_name, subject, expected_value)`` where
            ``expected_value`` is ``True`` only when the second argument is the
            literal ``True`` (surrounding whitespace ignored).

        Raises:
            ValueError: If ``query`` does not look like ``Name(arg1, arg2)``.
        """
        pattern = r'(\w+)\(([^,]+),\s*([^)]+)\)'
        match = re.match(pattern, query)
        if not match:
            raise ValueError(f'Invalid query: {query}')

        function_name = match.group(1)
        arg1 = match.group(2)
        # The capture may carry whitespace before ')'; strip it so 'True '
        # is not misread as False.
        arg2 = match.group(3).strip() == 'True'
        return function_name, arg1, arg2

    def execute_program(self):
        """Run the query against the generated Pyke knowledge bases.

        Returns:
            ``(answer, '')`` on success, or ``(None, exception)`` when anything
            in loading the engine, parsing the query or mapping the answer
            raised.
        """
        # delete the compiled_krb dir
        compiled_krb_dir = './models/compiled_krb'
        if os.path.exists(compiled_krb_dir):
            print('removing compiled_krb')
            shutil.rmtree(compiled_krb_dir)

        try:
            engine = knowledge_engine.engine(self.cache_dir)
            engine.reset()
            engine.activate('rules')
            engine.get_kb('facts')

            # parse the logic query into pyke query
            predicate, subject, value_to_check = self.parse_query(self.Query[0])
            result = self.check_specific_predicate(subject, predicate, engine)
            answer = self.answer_map[self.dataset_name](result, value_to_check)
        except Exception as err:
            # The exception object itself (not its text) is handed back.
            return None, err

        return answer, ''

    def answer_mapping(self, answer):
        """Return ``answer`` unchanged."""
        return answer

    def answer_map_prontoqa(self, result, value_to_check):
        """Return ``'A'`` when ``result`` equals ``value_to_check``, else ``'B'``."""
        return 'A' if result == value_to_check else 'B'

    def answer_map_proofwriter(self, result, value_to_check):
        """Return ``'C'`` for no result, ``'A'`` on a match and ``'B'`` otherwise."""
        if result is None:
            return 'C'
        return 'A' if result == value_to_check else 'B'
|
|
|
|
|
|
class LogicInferenceEngine:
    """Execute logic programs with PykeProgram, guessing randomly on failure."""

    def __init__(self, dataset_name=None, workspace_mount_path='/workspace'):
        """Configure the engine.

        Args:
            dataset_name: Dataset whose answer scheme is used. When ``None``
                (the default) it is read from the ``DATASET_NAME`` environment
                variable, falling back to ``'ProofWriter'``.
            workspace_mount_path: Directory handed to ``PykeProgram`` as its
                workspace; defaults to ``'/workspace'``.
        """
        if dataset_name is None:
            dataset_name = os.environ.get('DATASET_NAME', 'ProofWriter')
        self.dataset_name = dataset_name
        self.workspace_mount_path = workspace_mount_path

    def random_backup(self):
        """Return a random answer label for the configured dataset.

        Returns:
            One of ``'A'``/``'B'`` for ProntoQA, one of ``'A'``/``'B'``/``'C'``
            for ProofWriter, and ``None`` for any other dataset name.
        """
        if self.dataset_name == 'ProntoQA':
            return random.choice(['A', 'B'])
        if self.dataset_name == 'ProofWriter':
            return random.choice(['A', 'B', 'C'])
        return None

    def safe_execute_program(self, logic_program):
        """Run ``logic_program`` and always come back with an answer.

        Returns:
            ``(answer, status, error_message)`` where ``status`` is
            ``'parsing error'`` (program could not be prepared; random
            answer), ``'execution error'`` (execution failed; random answer,
            ``error_message`` is whatever ``execute_program`` reported) or
            ``'success'``. ``error_message`` is ``''`` unless execution failed.
        """
        program = PykeProgram(
            logic_program, self.dataset_name, self.workspace_mount_path
        )
        # cannot parse the program
        if not program.flag:
            return self.random_backup(), 'parsing error', ''

        # execute the program
        answer, error_message = program.execute_program()
        # not executable
        if answer is None:
            return self.random_backup(), 'execution error', error_message

        # successfully executed
        return program.answer_mapping(answer), 'success', ''
|