Support Logic Reasoning Benchmark (#1973)

Ren Ma 2024-05-30 16:35:15 +08:00 committed by GitHub
parent 01ef90205d
commit a9823491e6
10 changed files with 836 additions and 0 deletions

View File

@@ -0,0 +1,10 @@
{
"Dataset": "ProntoQA",
"Data split": "validation",
"Number of Samples": 6,
"Agent class": "CodeActAgent",
"Model name": "gpt-4o-2024-05-13",
"Start_time": "2024-05-29 17:51:09",
"End_time": "2024-05-29 17:52:24",
"Final Accuracy": "0.83"
}

View File

@@ -0,0 +1,12 @@
Cold(Bob, True)
Quiet(Bob, True)
Red(Bob, True)
Smart(Bob, True)
Kind(Charlie, True)
Quiet(Charlie, True)
Red(Charlie, True)
Rough(Charlie, True)
Cold(Dave, True)
Kind(Dave, True)
Smart(Dave, True)
Quiet(Fiona, True)

View File

@@ -0,0 +1,52 @@
fact1
foreach
facts.Quiet($x, True)
facts.Cold($x, True)
assert
facts.Smart($x, True)
fact2
foreach
facts.Red($x, True)
facts.Cold($x, True)
assert
facts.Round($x, True)
fact3
foreach
facts.Kind($x, True)
facts.Rough($x, True)
assert
facts.Red($x, True)
fact4
foreach
facts.Quiet($x, True)
assert
facts.Rough($x, True)
fact5
foreach
facts.Cold($x, True)
facts.Smart($x, True)
assert
facts.Red($x, True)
fact6
foreach
facts.Rough($x, True)
assert
facts.Cold($x, True)
fact7
foreach
facts.Red($x, True)
assert
facts.Rough($x, True)
fact8
foreach
facts.Smart(Dave, True)
facts.Kind(Dave, True)
assert
facts.Quiet(Dave, True)

View File

@@ -0,0 +1,43 @@
# Logic Reasoning Evaluation
This folder contains the evaluation harness for evaluating agents on the logic reasoning benchmarks [ProntoQA](https://github.com/asaparov/prontoqa) and [ProofWriter](https://allenai.org/data/proofwriter).
## Configure OpenDevin and your LLM
Create a `config.toml` file at the root of the workspace if it does not already exist, and add the following configuration:
```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
ssh_hostname = "localhost"
enable_auto_lint = true
# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```
## Run Inference on logic_reasoning
The following command runs inference on the first example of the ProntoQA dataset, using the LLM config named `gpt-4o` from your `config.toml`:
```bash
./evaluation/logic_reasoning/scripts/run_infer.sh ProntoQA gpt-4o 1
```
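The second argument names an LLM config group from your `config.toml`, and `run_infer.sh` also takes an optional fourth argument selecting the agent class (it defaults to `CodeActAgent`). For example, to evaluate the first 10 ProofWriter examples with the `eval_gpt4_1106_preview` config:
```bash
./evaluation/logic_reasoning/scripts/run_infer.sh ProofWriter eval_gpt4_1106_preview 10 CodeActAgent
```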
## Examples
See example output in
`./evaluation_outputs/outputs/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/output.jsonl`
and final evaluation performance in
`./evaluation_outputs/outputs/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/metadata.json`
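To re-derive the final accuracy from an existing `output.jsonl` (the same computation `run_infer.py` performs when it writes `metadata.json`), here is a minimal sketch; the path below is the example output path above, so point it at your own run:
```python
import json

# Illustrative path; replace with the output.jsonl of your own evaluation run.
output_file = './evaluation_outputs/outputs/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/output.jsonl'

with open(output_file) as f:
    results = [json.loads(line)['test_result']['result'] for line in f]

print(f'Accuracy: {sum(results) / len(results):.2f} over {len(results)} samples')
```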

View File

View File

@@ -0,0 +1,20 @@
You are a helpful assistant assigned a logic reasoning task. You need to determine the correctness of a query given some facts and rules.
You can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed in <execute_ipython> tags.
In this task, you need to use the code in [[logic_inference_path.py]] to help you. Specifically, first instantiate a **LogicInferenceEngine** and use its **safe_execute_program** method to prove the **logic programs**. The call returns *answer*, *flag*, and *error_message*.
An example looks like this:
<execute_ipython>
import sys
sys.path.append(workspace_mount_path)
from logic_inference import LogicInferenceEngine
engine = LogicInferenceEngine(dataset_name, workspace_mount_path)
answer, flag, error_message = engine.safe_execute_program(logic_programs)
</execute_ipython>
Please send the *answer* variable to the user through a message.
dataset_name:
[[dataset_name]]
logic_programs:
[[logic_programs]]
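For illustration only (the actual content comes from the dataset, and ':::' comments, if present, are stripped during parsing), a logic_programs string follows this layout:
Predicates:
Cold($x, bool)
Facts:
Cold(Bob, True)
Quiet(Bob, True)
Rules:
Quiet($x, True) && Cold($x, True) >>> Smart($x, True)
Query:
Smart(Bob, True)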

View File

@@ -0,0 +1,220 @@
import os
import random
import re
import shutil
from pyke import knowledge_engine
class PykeProgram:
def __init__(
self, logic_program: str, dataset_name='ProntoQA', workspace_mount_path='./'
) -> None:
self.logic_program = logic_program
self.flag = self.parse_logic_program()
self.dataset_name = dataset_name
self.cache_dir = os.path.join(workspace_mount_path, '.cache_program')
# prepare the files for facts and rules
try:
self.create_fact_file(self.Facts)
self.create_rule_file(self.Rules)
self.flag = True
except Exception:
self.flag = False
self.answer_map = {
'ProntoQA': self.answer_map_prontoqa,
'ProofWriter': self.answer_map_proofwriter,
}
def parse_logic_program(self):
keywords = ['Query:', 'Rules:', 'Facts:', 'Predicates:']
program_str = self.logic_program
for keyword in keywords:
try:
program_str, segment_list = self._parse_segment(program_str, keyword)
setattr(self, keyword[:-1], segment_list)
except Exception:
setattr(self, keyword[:-1], None)
return self.validate_program()
def _parse_segment(self, program_str, key_phrase):
remain_program_str, segment = program_str.split(key_phrase)
segment_list = segment.strip().split('\n')
for i in range(len(segment_list)):
segment_list[i] = segment_list[i].split(':::')[0].strip()
return remain_program_str, segment_list
# check if the program is valid; if not, try to fix it
def validate_program(self):
if self.Rules is not None and self.Facts is not None:
if not self.Rules[0] == '' and not self.Facts[0] == '':
return True
# try to fix the program
tmp_rules = []
tmp_facts = []
statements = self.Facts if self.Facts is not None else self.Rules
if statements is None:
return False
for fact in statements:
if fact.find('>>>') >= 0: # this is a rule
tmp_rules.append(fact)
else:
tmp_facts.append(fact)
self.Rules = tmp_rules
self.Facts = tmp_facts
return False
def create_fact_file(self, facts):
with open(os.path.join(self.cache_dir, 'facts.kfb'), 'w') as f:
for fact in facts:
# check for invalid facts
if not fact.find('$x') >= 0:
f.write(fact + '\n')
def create_rule_file(self, rules):
pyke_rules = []
for idx, rule in enumerate(rules):
pyke_rules.append(self.parse_forward_rule(idx + 1, rule))
with open(os.path.join(self.cache_dir, 'rules.krb'), 'w') as f:
f.write('\n\n'.join(pyke_rules))
# example rule: Furry($x, True) && Quite($x, True) >>> White($x, True)
def parse_forward_rule(self, f_index, rule):
premise, conclusion = rule.split('>>>')
premise = premise.strip()
# split the premise into multiple facts if needed
premise = premise.split('&&')
premise_list = [p.strip() for p in premise]
conclusion = conclusion.strip()
# split the conclusion into multiple facts if needed
conclusion = conclusion.split('&&')
conclusion_list = [c.strip() for c in conclusion]
# create the Pyke rule
pyke_rule = f"""fact{f_index}\n\tforeach"""
for p in premise_list:
pyke_rule += f"""\n\t\tfacts.{p}"""
pyke_rule += """\n\tassert"""
for c in conclusion_list:
pyke_rule += f"""\n\t\tfacts.{c}"""
return pyke_rule
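# For the example rule above, parse_forward_rule(1, 'Furry($x, True) && Quite($x, True) >>> White($x, True)')
# emits a Pyke rule of the form written to rules.krb:
#   fact1
#       foreach
#           facts.Furry($x, True)
#           facts.Quite($x, True)
#       assert
#           facts.White($x, True)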
"""
for example: Is Marvin from Mars?
Query: FromMars(Marvin, $label)
"""
def check_specific_predicate(self, subject_name, predicate_name, engine):
results = []
with engine.prove_goal(
f'facts.{predicate_name}({subject_name}, $label)'
) as gen:
for vars, plan in gen:
results.append(vars['label'])
with engine.prove_goal(
f'rules.{predicate_name}({subject_name}, $label)'
) as gen:
for vars, plan in gen:
results.append(vars['label'])
if len(results) == 1:
return results[0]
elif len(results) == 2:
return results[0] and results[1]
elif len(results) == 0:
return None
"""
Input Example: Metallic(Wren, False)
"""
def parse_query(self, query):
pattern = r'(\w+)\(([^,]+),\s*([^)]+)\)'
match = re.match(pattern, query)
if match:
function_name = match.group(1)
arg1 = match.group(2)
arg2 = match.group(3)
arg2 = True if arg2 == 'True' else False
return function_name, arg1, arg2
else:
raise ValueError(f'Invalid query: {query}')
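# For example, parse_query('Metallic(Wren, False)') returns ('Metallic', 'Wren', False),
# i.e. (predicate name, subject, boolean value to check).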
def execute_program(self):
# delete the compiled_krb dir so that stale compiled rules are not reused
compiled_krb_dir = './models/compiled_krb'
if os.path.exists(compiled_krb_dir):
print('removing compiled_krb')
shutil.rmtree(compiled_krb_dir)
try:
engine = knowledge_engine.engine(self.cache_dir)
engine.reset()
engine.activate('rules')
engine.get_kb('facts')
# parse the logic query into pyke query
predicate, subject, value_to_check = self.parse_query(self.Query[0])
result = self.check_specific_predicate(subject, predicate, engine)
answer = self.answer_map[self.dataset_name](result, value_to_check)
except Exception as err:
return None, err
return answer, ''
def answer_mapping(self, answer):
return answer
def answer_map_prontoqa(self, result, value_to_check):
if result == value_to_check:
return 'A'
else:
return 'B'
def answer_map_proofwriter(self, result, value_to_check):
if result is None:
return 'C'
elif result == value_to_check:
return 'A'
else:
return 'B'
class LogicInferenceEngine:
def __init__(self, dataset_name, workspace_mount_path):
self.dataset_name = dataset_name
self.workspace_mount_path = workspace_mount_path
def random_backup(self):
if self.dataset_name == 'ProntoQA':
return random.choice(['A', 'B'])
elif self.dataset_name == 'ProofWriter':
return random.choice(['A', 'B', 'C'])
def safe_execute_program(self, logic_program):
program = PykeProgram(
logic_program, self.dataset_name, self.workspace_mount_path
)
# cannot parse the program
if not program.flag:
answer = self.random_backup()
return answer, 'parsing error', ''
# execute the program
answer, error_message = program.execute_program()
# not executable
if answer is None:
answer = self.random_backup()
return answer, 'execution error', error_message
# successfully executed
answer = program.answer_mapping(answer)
return answer, 'success', ''
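# Example usage (a minimal sketch, assuming this file has been copied into the agent
# workspace and `logic_programs` holds the dataset's Predicates/Facts/Rules/Query string):
#
#   engine = LogicInferenceEngine('ProofWriter', workspace_mount_path)
#   answer, flag, error_message = engine.safe_execute_program(logic_programs)
#   # answer is 'A'/'B' for ProntoQA and 'A'/'B'/'C' for ProofWriter;
#   # flag is 'success', 'parsing error', or 'execution error'.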

View File

@@ -0,0 +1,436 @@
import asyncio
import json
import logging
import multiprocessing as mp
import os
import pathlib
import shutil
import time
from concurrent.futures import ProcessPoolExecutor
from datasets import load_dataset
from tqdm import tqdm
from evaluation.swe_bench.swe_env_box import DockerSSHBox
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict
def cleanup():
logger.info('Cleaning up child processes...')
for process in mp.active_children():
logger.info(f'Terminating child process: {process.name}')
process.terminate()
process.join()
def codeact_user_response(state: State) -> str:
msg = (
'Please continue working on the task on whatever approach you think is suitable.\n'
'If you think you have solved the task, please run the following command: <execute_bash> exit </execute_bash>.\n'
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
)
if state.history:
user_msgs = [
action
for action, _ in state.history
if isinstance(action, MessageAction) and action.source == 'user'
]
if len(user_msgs) >= 2:
# let the agent know that it can give up when it has tried 3 times
return (
msg
+ 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
)
return msg
def monologue_user_response(state: State) -> str:
raise NotImplementedError('MonologueAgent should never ask for user responses.')
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
'MonologueAgent': monologue_user_response,
}
AGENT_CLS_TO_INST_SUFFIX = {
'CodeActAgent': 'When you think you have solved the question, please first send your answer to user through message and then exit.\n'
}
def get_choice(answer_str):
choices = [
'A',
'B',
'C',
'D',
'E',
'F',
'G',
'H',
'A)',
'B)',
'C)',
'D)',
'E)',
'F)',
'G)',
'H)',
'A.',
'B.',
'C.',
'D.',
'E.',
'F.',
'G.',
'H.',
]
for c in choices:
if answer_str.startswith(c):
return c.replace(')', '')
if answer_str.startswith(':'):
return answer_str.replace(':', '').replace('.', '').strip()
return None
def get_test_result(
model_answer: str,
ground_truth: str,
) -> dict:
gold_answer = ground_truth.replace('(', '').replace(')', '').strip()
answer_str = model_answer if model_answer is not None else ''
prediction = get_choice(answer_str)
indicators = [
'the correct option is',
'the correct answer is',
'The correct answer is',
'The correct option is',
'Thus, the answer is',
]
if prediction is None:
for indicator in indicators:
if answer_str.find(indicator) >= 0:
answer_str = answer_str.split(indicator)[1].strip()
prediction = get_choice(answer_str)
break
isTrue = prediction == gold_answer
test_result = {'result': isTrue}
return test_result
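# Illustrative behaviour: get_test_result('The correct answer is B.', '(B)') returns {'result': True},
# while get_test_result('A', '(B)') returns {'result': False}.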
def process_instance(
instance,
agent_class,
# metadata,
dataset_name,
skip_workspace_mount,
eval_output_dir,
reset_logger: bool = True,
):
old_workspace_mount_path = config.workspace_mount_path
old_workspace_base = config.workspace_base
workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
# create process-specific workspace dir
# if `not skip_workspace_mount` - we will create a workspace directory for EACH process
# so that different agents don't interfere with each other.
if not skip_workspace_mount:
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
# reset workspace to config
config.workspace_base = workspace_mount_path
config.workspace_mount_path = workspace_mount_path
# Set up the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
# Set up logger
log_file = os.path.join(
eval_output_dir, 'logs', f'instance_{instance["id"]}.log'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {instance["id"]}.\nLOG: tail -f {log_file}'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
if not skip_workspace_mount:
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
# sandbox = DockerSSHBox()
logic_inference_path = os.path.join(workspace_mount_path, 'logic_inference.py')
if not os.path.exists(logic_inference_path):
shutil.copyfile(
'./evaluation/logic_reasoning/logic_inference.py', logic_inference_path
)
logger.info(f'logic_inference.py copied to {workspace_mount_path}')
cache_dir = os.path.join(workspace_mount_path, '.cache_program')
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
# Prepare instruction
with open('./evaluation/logic_reasoning/instruction.txt', 'r') as f:
instruction = f.read()
instance_logic_programs = instance['raw_logic_programs'][0].strip()
instruction = instruction.replace('[[dataset_name]]', dataset_name)
instruction = instruction.replace('[[logic_programs]]', instance_logic_programs)
instruction = instruction.replace(
'[[logic_inference_path.py]]', logic_inference_path
)
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
sandbox = DockerSSHBox()
exit_code, command_output = sandbox.execute('pip install scitools-pyke')
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State = asyncio.run(
main(
instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
sandbox=sandbox,
)
)
# ======= Attempt to evaluate the agent's edits =======
# If you are working on a simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
final_message = ''
messages = []
for action, obs in reversed(state.history):
# if isinstance(act, MessageAction):
messages.append(obs.content)
# print("obs.content:", obs.content)
if str(obs.content) in ["'A'", "'B'", "'C'"]:
final_message = obs.content
break
final_message = final_message.strip("'")
logger.info(f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}')
test_result = get_test_result(
model_answer=final_message, ground_truth=instance['answer']
)
# Save the output
output = {
'id': instance['id'],
'instance': instance,
'instruction': instruction,
# 'metadata': metadata,
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],
'final_message': final_message,
'messages': messages,
'error': state.error if state and state.error else None,
'test_result': test_result,
}
config.workspace_mount_path = old_workspace_mount_path
config.workspace_base = old_workspace_base
# Close the sandbox
sandbox.close()
return output
if __name__ == '__main__':
parser = get_parser()
parser.add_argument(
'--dataset',
type=str,
help='the logic reasoning dataset to evaluate on {ProntoQA, ProofWriter}',
default='ProntoQA',
)
parser.add_argument(
'--data_split',
type=str,
help='data split to evaluate on {validation}', # right now we only support validation split
default='validation',
)
args, _ = parser.parse_known_args()
if args.directory:
config.workspace_base = os.path.abspath(args.directory)
print(f'Setting workspace base to {config.workspace_base}')
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
# so we don't need to manage file uploading to OpenDevin's repo
dataset_name = args.dataset
data_split = args.data_split
dataset = load_dataset(f'renma/{dataset_name}')
logic_reasoning_tests = dataset[data_split]
logger.info(f'Evaluating logic reasoning dataset {dataset_name} {data_split} split')
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
# for details of how to set `llm_config`
if args.llm_config:
specified_llm_config = get_llm_config_arg(args.llm_config)
if specified_llm_config:
config.llm = specified_llm_config
logger.info(f'Config for evaluation: {config}')
# TEST METADATA
agent_class = args.agent_cls
assert (
agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
), f'Unsupported agent class: {agent_class}'
model_name = config.llm.model.split('/')[-1]
max_iterations = args.max_iterations
eval_note = ''
if args.eval_note is not None:
eval_note += '_N_' + args.eval_note
eval_output_dir = os.path.join(
args.eval_output_dir,
'logic_reasoning',
agent_class,
dataset_name,
model_name + '_maxiter_' + str(max_iterations) + eval_note
)
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
parents=True, exist_ok=True
)
logger.info(f'Using evaluation output directory: {eval_output_dir}')
# LIMIT EVALUATION
eval_n_limit = args.eval_n_limit
if eval_n_limit:
logic_reasoning_tests = logic_reasoning_tests.select(list(range(eval_n_limit)))
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
start_time = time.strftime('%Y-%m-%d %H:%M:%S')
# OUTPUT FILE
output_file = os.path.join(eval_output_dir, 'output.jsonl')
logger.info(f'Writing evaluation output to {output_file}')
finished_task_ids = set()
if os.path.exists(output_file):
with open(output_file, 'r') as f:
for line in f:
data = json.loads(line)
finished_task_ids.add(data['id'])
logger.warning(
f'Output file {output_file} already exists. Loaded {len(finished_task_ids)} finished instances.'
)
output_fp = open(output_file, 'a')
logger.info(
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
)
# =============================================
# filter out finished instances
new_logic_reasoning_tests = []
for instance in logic_reasoning_tests:
if instance['id'] in finished_task_ids:
logger.info(
f'Skipping instance {instance["id"]} as it is already finished.'
)
continue
new_logic_reasoning_tests.append(instance)
logic_reasoning_tests = new_logic_reasoning_tests
logger.info(
f'Finished instances: {len(finished_task_ids)}, Remaining instances: {len(logic_reasoning_tests)}'
)
# =============================================
pbar = tqdm(total=len(logic_reasoning_tests))
# This function tracks the progress AND writes the output to a JSONL file
def update_progress(future):
pbar.update(1)
output = future.result()
pbar.set_description(f'Instance {output["id"]}')
pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
logger.info(
f'Finished evaluation for instance {output["id"]}: {output["test_result"]["result"]}'
)
output_fp.write(json.dumps(output) + '\n')
# json.dump(output, output_fp, indent=4)
output_fp.flush()
# This sets the multi-processing
num_workers = args.eval_num_workers
# num_workers = 1
logger.info(f'Using {num_workers} workers for evaluation.')
# This is SWE-Bench specific - CodeActAgent doesn't require a mounted workspace to work
skip_workspace_mount = False
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
try:
with ProcessPoolExecutor(num_workers) as executor:
futures = []
# This is how we perform multi-processing
for instance in logic_reasoning_tests:
future = executor.submit(
process_instance,
instance,
agent_class,
dataset_name,
skip_workspace_mount,
eval_output_dir,
reset_logger=bool(num_workers > 1),
)
future.add_done_callback(update_progress)
futures.append(future)
# Wait for all futures to complete
for future in futures:
future.result()
except KeyboardInterrupt:
print('KeyboardInterrupt received. Cleaning up...')
cleanup()
output_fp.close()
with open(output_file, 'r') as f:
test_result = [(json.loads(line))["test_result"]["result"] for line in f]
metadata = {
"Dataset": dataset_name,
"Data split": data_split,
"Number of Samples": len(test_result),
'Agent class': agent_class,
'Model name': model_name,
'Start_time': start_time,
"End_time": time.strftime('%Y-%m-%d %H:%M:%S'),
"Final Accuracy": f"{sum(test_result)/len(test_result):.2f}",
}
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
json.dump(metadata, f, indent=4)
logger.info(f'Metadata: {json.dumps(metadata, indent=4)}')
logger.info(f'Evaluation finished. Metadata saved to {eval_output_dir}/metadata.json')

View File

@@ -0,0 +1,37 @@
#!/bin/bash
DATASET=$1
MODEL_CONFIG=$2
EVAL_LIMIT=$3
AGENT=$4
# ################################################################################
if [ -z "$AGENT" ]; then
echo "Agent not specified, use default CodeActAgent"
AGENT="CodeActAgent"
fi
# IMPORTANT: Because the Agent's prompt changes fairly often in the rapidly evolving OpenDevin codebase,
# we need to track the Agent version in the evaluation so that results remain comparable
AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="poetry run python evaluation/logic_reasoning/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--dataset $DATASET \
--max-iterations 10 \
--max-chars 10000000 \
--eval-num-workers 1 \
--eval-note $AGENT_VERSION"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi
# Run the command
eval $COMMAND