Support MINT benchmark (MATH, GSM8K subset) (#1955)

* setup boilerplate and README

* setup test script and load dataset

* add temporary integration that works

* refactor code

* add solution evaluation through 'fake_user_response_fn'

* finish integrating MATH subset

* Update evaluation/mint/run_infer.py

* Update evaluation/mint/run_infer.sh

* Update opendevin/core/main.py

* remove redundant templates, add eval_note, update README

* use <execute_ipython> tag instead of <execute>

* hardcode AGENT option for run_infer.sh

* Update evaluation/mint/task.py

Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com>

* fix: no message returned when the task succeeds

* change message to make the agent exit

* import bash abstractmethod

* install all required packages inside sandbox before the agent runs, adjust prompt

* add subset eval folder separation and test for gsm8k

* fix bug in Reasoning task result check, add requirements.txt

* Fix syntax error in evaluation/mint/run_infer.py

* update README, add default values for `SUBSET` and `EVAL_LIMIT`

---------

Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com>
Co-authored-by: yufansong <yufan@risingwave-labs.com>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
Author: Ryan H. Tran, 2024-05-28 14:42:52 +07:00 (committed by GitHub)
Parent: 110c530582
Commit: 9434bcce48
14 changed files with 902 additions and 0 deletions

evaluation/README.md

@@ -16,6 +16,7 @@ all the preprocessing/evaluation/analysis scripts.
- HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix)
- GAIA: [`evaluation/gaia`](./gaia)
- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
- MINT: [`evaluation/mint`](./mint)
### Result Visualization

evaluation/mint/.gitignore (vendored, new file, 1 line)

@@ -0,0 +1 @@
!requirements.txt

evaluation/mint/README.md (new file, 45 lines)

@@ -0,0 +1,45 @@
# MINT Benchmark
This folder contains the evaluation harness for the [MINT benchmark](https://arxiv.org/abs/2309.10691) on LLMs' ability to solve tasks with multi-turn interactions.
## Configure OpenDevin and LM
Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.
## Start the evaluation
We are using the MINT dataset hosted on [Hugging Face](https://huggingface.co/datasets/ryanhoangt/xingyaoww-mint-bench).
The following is the basic command to start the evaluation. Currently, the only agent supported with MINT is `CodeActAgent`.
```bash
./evaluation/mint/scripts/run_infer.sh [model_config] [subset] [eval_limit]
```
where `model_config` is mandatory, while `subset` and `eval_limit` are optional.
- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`.
- `subset`, e.g. `math`, is the subset of the MINT benchmark to evaluate on, defaulting to `math`.
- `eval_limit`, e.g. `2`, limits the evaluation to the first `eval_limit` instances, defaulting to all instances.
Note: in order to use `eval_limit`, you must also set `subset`.
Let's say you'd like to run 3 instances on the `gsm8k` subset using `eval_gpt4_1106_preview`,
then your command would be:
```bash
./evaluation/mint/scripts/run_infer.sh eval_gpt4_1106_preview gsm8k 3
```
## Reference
```
@misc{wang2024mint,
title={MINT: Evaluating LLMs in Multi-turn Interaction with Tools and Language Feedback},
author={Xingyao Wang and Zihan Wang and Jiateng Liu and Yangyi Chen and Lifan Yuan and Hao Peng and Heng Ji},
year={2024},
eprint={2309.10691},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```

@@ -0,0 +1,5 @@
TASK_INFO_MAP = {
# === Reasoning ===
'gsm8k': {'class': 'ReasoningTask', 'type': 'reasoning'},
'math': {'class': 'ReasoningTask', 'type': 'reasoning'},
}

evaluation/mint/datatypes.py (new file, 82 lines)

@@ -0,0 +1,82 @@
import enum
from typing import Any, Dict, Tuple
class TaskState:
def __init__(
self,
finished: bool = False,
success: bool = False,
agent_action_count: dict = None,
terminate_reason: str = None,
latest_output: Dict[str, Any] = None,
):
self.finished = finished
self.success = success
self.agent_action_count: Dict[str, int] = agent_action_count or {
'propose_solution': 0,
'use_tool': 0,
'invalid_action': 0,
}
self.terminate_reason = terminate_reason
self.latest_output = latest_output
def to_dict(self) -> Dict[str, Any]:
return {
'finished': self.finished,
'success': self.success,
'agent_action_count': self.agent_action_count,
'terminate_reason': self.terminate_reason,
'latest_output': self.latest_output,
}
class ParseError(Exception):
pass
class FeedbackType(enum.Enum):
FEEDBACK_WITH_GT = 'feedback_with_gt'
FEEDBACK_WO_GT = 'feedback_wo_gt'
NO_FEEDBACK = 'no_feedback'
class StepOutput:
def __init__(
self,
observation: str = None,
success: bool = False,
extra: Dict[str, Any] = None,
turn_info: Tuple[int, int] = None,
):
self.observation: str = observation
self.success: bool = success
self.extra: Dict[str, Any] = extra
self.turn_info = turn_info
def __repr__(self) -> str:
return self.observation
def to_str(self) -> str:
output = 'Observation:\n'
if self.observation is not None:
output += self.observation + '\n'
else:
if not self.success:
output += 'Your answer is wrong.\n'
if self.turn_info is not None:
n_steps_left, n_propose_solution_left = self.turn_info
output += 'You have {} steps left and {} chances to propose a solution left.\n'.format(
n_steps_left, n_propose_solution_left
)
if n_steps_left <= 1:
output += 'You should take the last step to propose a solution.\n'
return output
def to_dict(self) -> Dict[str, Any]:
return {
'observation': self.observation,
'success': self.success,
}
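
A minimal usage sketch of the containers defined above (illustration only, not part of this PR; it assumes `evaluation/mint` is on `PYTHONPATH` so `datatypes` resolves to the new module):

```python
# Illustration only: exercise StepOutput/TaskState from evaluation/mint/datatypes.py.
from datatypes import StepOutput, TaskState

# Feedback for a wrong proposed solution, with 3 steps and 1 solution attempt remaining.
output = StepOutput(observation=None, success=False, turn_info=(3, 1))
print(output.to_str())  # "Observation:\nYour answer is wrong.\nYou have 3 steps left ..."

# Track the attempt in the running task state and inspect it.
state = TaskState()
state.agent_action_count['propose_solution'] += 1
print(state.to_dict())
```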

evaluation/mint/env.py (new file, 119 lines)

@@ -0,0 +1,119 @@
import re
import traceback
from typing import Dict, Optional
from datatypes import ParseError, StepOutput, TaskState
from task import Task
from opendevin.controller.state.state import State
class SimplifiedEnv:
INVALID_INPUT_MESSAGE = (
"I don't understand your input. \n"
'If you want to execute code, please use <execute_ipython> YOUR_CODE_HERE </execute_ipython>.\n'
'If you want to give me an answer, please use <solution> YOUR_SOLUTION_HERE </solution>.\n'
'For example: The answer to the question is <solution> 42 </solution>. \n'
)
def __init__(self, agent_state: State, task: Task, task_config: Dict[str, int]):
self.agent_state = agent_state
self.task = task
self.task_state = TaskState()
self.task_config = task_config
def step(self, lm_message: str):
observation = self.handle_propose_solution(lm_message)
self.check_max_iteration()
turn_info = (
self.task_config['max_iterations'] - self.agent_state.iteration,
self.task_config['max_propose_solution']
- self.task_state.agent_action_count['propose_solution'],
)
output = StepOutput(
observation=observation,
success=self.task_state.success,
turn_info=turn_info,
)
self.log_output(output)
return self.task_state
def handle_propose_solution(self, lm_message) -> Optional[str]:
"""Propose answer to check the task success.
It might set self.state.finished = True if the task is successful.
"""
self.task_state.agent_action_count['propose_solution'] += 1
try:
parsed = self.parse_propose_solution(lm_message)
task_success = self.check_task_success(parsed['answer'])
if task_success:
self.task_state.finished = True
self.task_state.success = True
self.task_state.terminate_reason = 'task_success'
# NOTE: do not return from the function yet, because we still need to log the output.
# Setting state.finished = True will terminate the episode.
except ParseError:
return SimplifiedEnv.INVALID_INPUT_MESSAGE
except Exception:
error_traceback = traceback.format_exc()
return f'{error_traceback}'
def parse_propose_solution(self, lm_message: str) -> dict:
"""Define the parsing logic."""
lm_output = '\n' + lm_message + '\n'
answer = '\n'.join(
[
i.strip()
for i in re.findall(r'<solution>(.*?)</solution>', lm_output, re.DOTALL)
]
)
if answer == '':
raise ParseError('No answer found.')
return {'answer': answer}
def log_output(self, output: StepOutput) -> None:
if self.task_state.finished:
return
content = output.to_str()
# self.state.history.append({"role": "user", "content": content})
self.task_state.latest_output = output.to_dict()
self.task_state.latest_output['content'] = content
def check_task_success(self, answer: str) -> bool:
# log_message.info(f"STUDENT ANSWER: [{answer}]")
# log_message.info(f"REFERENCE ANSWER: [{self.task.reference}]")
return self.task.success(answer)
def check_max_iteration(self):
"""Check if the agent has reached the max iteration limit.
It might set self.state.finished = True if the agent has reached the max iteration limit.
"""
if self.task_state.finished:
# ignore if the episode is already finished (e.g., task success)
return
if (
# propose solution > max output solution
self.task_state.agent_action_count['propose_solution']
>= self.task_config['max_propose_solution']
):
self.task_state.finished = True
self.task_state.success = False
self.task_state.terminate_reason = 'max_propose_steps'
elif (
# (propose_solution + use_tool) > max iteration limit
sum(self.task_state.agent_action_count.values())
>= self.task_config['max_iterations']
):
self.task_state.finished = True
self.task_state.success = False
self.task_state.terminate_reason = 'max_iterations'
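
The `<solution>` tag extraction in `parse_propose_solution` drives the answer checking; below is a standalone sketch of that parsing step (illustration only, not part of the committed file):

```python
import re

def extract_solution(lm_message: str) -> str:
    """Standalone restatement of the tag extraction in SimplifiedEnv.parse_propose_solution."""
    lm_output = '\n' + lm_message + '\n'
    answer = '\n'.join(
        m.strip()
        for m in re.findall(r'<solution>(.*?)</solution>', lm_output, re.DOTALL)
    )
    if answer == '':
        raise ValueError('No answer found.')  # the real module raises ParseError
    return answer

print(extract_solution('The answer to the question is <solution> 42 </solution>.'))  # -> 42
```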

evaluation/mint/in_context_examples/reasoning/with_tool.txt (new file, 48 lines)

@@ -0,0 +1,48 @@
Task:
Evaluate $\int\int_C (x+y)^2 \, dx \, dy$, where C is the circle defined by $x^2 + y^2 = M$, and M is the birth year of Li Bai.
Output format requirement: Round to three decimal places, using half up rounding rule.
Assistant:
<thought> To solve this problem, let's start by determining Li Bai's birth year. </thought>
<execute_ipython> print(wikipedia_search('Li Bai')) </execute_ipython>
Observation:
Page: Li Bai
Summary: Li Bai (Chinese: 李白; pinyin: Lǐ Bái, 701–762), also pronounced as Li Bo, courtesy name Taibai (Chinese: 太白), was a Chinese poet, acclaimed from his own time to the present as a brilliant and romantic figure who took traditional poetic forms to new heights.
Assistant:
<thought>
From the wiki_search result, we can see that Li Bai was born in 701, thus M = 701.
To evaluate the double integral of $(x+y)^2$ over the region $C$, where $C$ is the circle defined by $x^2 + y^2 = 701$, we should use polar coordinates.
In polar coordinates, $x = r\cos(\theta)$ and $y = r\sin(\theta)$, and the differential area element $dx \, dy$ becomes $r \, dr \, d\theta$.
$\int\int_C (x+y)^2 \, dx \, dy = \int_0^{2\pi}\int_0^{\sqrt{701}} (r\cos(\theta) + r\sin(\theta))^2 \, r \, dr \, d\theta$
Let's use the SymPy package to calculate this.
</thought>
<execute_ipython>
from sympy import symbols, integrate, cos, sin, pi
r, theta = symbols('r theta')
# the integrand
integrand = (r*cos(theta) + r*sin(theta))**2 * r
# perform the double integration
double_integral = integrate(integrate(integrand, (r, 0, 701 ** 0.5)), (theta, 0, 2*pi))
print(double_integral)
</execute_ipython>
Observation:
Out[2]: 245700.5*pi
Assistant:
<thought> Let's round this result to three decimal places using half up rounding rule. </thought>
<execute_ipython>
import math
print(round(245700.5*math.pi, 3))
</execute_ipython>
Observation:
771890.886
Assistant:
The answer is <solution> 771890.886 </solution>.
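
As a sanity check on the arithmetic in this example, the integral evaluates in closed form to

$$
\int\!\!\int_C (x+y)^2 \, dx \, dy
= \int_0^{2\pi}\!\int_0^{\sqrt{701}} \bigl(r\cos\theta + r\sin\theta\bigr)^2 \, r \, dr \, d\theta
= \int_0^{2\pi} (1+\sin 2\theta)\, d\theta \cdot \int_0^{\sqrt{701}} r^3 \, dr
= 2\pi \cdot \frac{701^2}{4}
= 245700.5\,\pi \approx 771890.886,
$$

which matches the rounded value proposed above.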

@@ -0,0 +1,25 @@
import os
from utils import load_file
PROMPT_DIR = os.path.dirname(__file__)
TEMPLATE_WITH_TOOL = load_file(os.path.join(PROMPT_DIR, 'template_with_tool.txt'))
class PromptTemplate:
"""A prompt template."""
def __init__(self, template: str):
self.template: str = template
def __call__(self, **kwargs) -> str:
return self.template.format(**kwargs)
class ToolPromptTemplate(PromptTemplate):
def __init__(self, use_tool: bool):
if use_tool:
template = TEMPLATE_WITH_TOOL
else:
raise NotImplementedError('Evaluation without tool is not supported yet.')
super().__init__(template)
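
A minimal sketch of how an instruction is assembled with `ToolPromptTemplate` (mirroring the instruction-building code in `run_infer.py` further below; it assumes `evaluation/mint` is on `PYTHONPATH` and its template and in-context-example files are present on disk):

```python
# Illustration only: build the agent instruction the same way run_infer.py does.
from prompts import ToolPromptTemplate
from task import ReasoningTask

task = ReasoningTask(id='demo-1', prompt='What is 12 * 12?', reference='144')
instruction = ToolPromptTemplate(use_tool=True)(
    max_total_steps=5,
    max_propose_solution=2,
    in_context_example=task.in_context_example(use_tool=True, with_feedback=False),
    task_prompt='Task:\n' + task.prompt,
)
print(instruction)
```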

@@ -0,0 +1,19 @@
You are a helpful assistant assigned with the task of problem-solving.
To solve the task, you can only interact with the interactive Python (Jupyter Notebook) environment using <execute_ipython> tag. Other tools cannot be used.
At each turn, you should first provide your step-by-step thinking for solving the task. Your thought process should be enclosed using "<thought>" tag, for example: <thought> I need to print "Hello World!" </thought>.
After that, you have two options:
1) Interact with a Python programming environment and receive the corresponding output.
2) Directly provide a solution by sending your answer to the user through a message that adheres to the required format for the given task. Your solution should be enclosed using the "<solution>" tag, for example: The answer is <solution> A </solution>.
Whether you choose to interact with the Python environment or provide a solution, you need to send a message to the user so they can evaluate your response and provide feedback.
You have {max_total_steps} chances to interact with the environment or propose a solution. You can only propose a solution {max_propose_solution} times.
---
{in_context_example}
---
# Problem statement:
{task_prompt}

evaluation/mint/requirements.txt (new file, 32 lines)

@@ -0,0 +1,32 @@
pre-commit
openai
datasets
backoff
charset-normalizer==3.1.0
# Alfworld
pandas==1.4.4
opencv-python
networkx
tqdm
vocab
revtok
Click
ai2thor==2.1.0
transformers
tokenizers
scipy==1.10.1
ipython
matplotlib
cython
nltk
gym==0.15.4
pipreqs
pyyaml
pytz
visdom
sympy
pycocotools
seaborn
google-generativeai
python-dateutil
statsmodels

evaluation/mint/run_infer.py (new file, 357 lines)

@@ -0,0 +1,357 @@
import asyncio
import functools
import json
import logging
import multiprocessing as mp
import os
import pathlib
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor
from typing import Dict
from datasets import load_dataset
from datatypes import TaskState
from env import SimplifiedEnv
from prompts import ToolPromptTemplate
from task import ReasoningTask, Task
from tqdm import tqdm
from evaluation.swe_bench.swe_env_box import DockerSSHBox
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.serialization.event import event_to_dict
def cleanup():
print('Cleaning up child processes...')
for process in mp.active_children():
print(f'Terminating child process: {process.name}')
process.terminate()
process.join()
def codeact_user_response(state: State, task: Task, task_config: Dict[str, int]):
logger.info(f'Gold reference: {task.reference}')
logger.info(f'Task config: {task_config}')
env = SimplifiedEnv(
agent_state=state,
task=task,
task_config=task_config,
)
last_action, _ = state.history[-1]
result_state: TaskState = env.step(last_action.message)
state.task_state = result_state
if not result_state.latest_output:
if result_state.success:
msg = 'Your answer is correct. Please EXIT using the following command: <execute_bash> exit </execute_bash>.'
else:
msg = 'Something went wrong! No output from the model.'
else:
msg = result_state.latest_output['content']
logger.info('User response:' + msg)
return msg
def monologue_user_response(state: State) -> str:
raise NotImplementedError('MonologueAgent should never ask for user responses.')
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
'MonologueAgent': monologue_user_response,
}
AGENT_CLS_TO_INST_SUFFIX = {
'CodeActAgent': '\nIMPORTANT: When your answer is confirmed by the user to be correct, you can exit using the following command: <execute_bash> exit </execute_bash>.\n'
}
def process_instance(
instance: Task,
agent_class,
metadata,
skip_workspace_mount,
eval_output_dir,
reset_logger: bool = True,
):
workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
# create process-specific workspace dir
# if `not skip_workspace_mount` - we will create a workspace directory for EACH process
# so that different agents don't interfere with each other.
if not skip_workspace_mount:
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
# Set up the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
# Set up logger
log_file = os.path.join(
eval_output_dir, 'logs', f'instance_{instance.task_id}.log'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {instance.task_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
if not skip_workspace_mount:
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
sandbox = DockerSSHBox()
requirements_host_src = 'evaluation/mint/requirements.txt'
requirements_sandbox_dest = '/opendevin/plugins/mint/requirements.txt'
sandbox.copy_to(
host_src=requirements_host_src,
sandbox_dest=requirements_sandbox_dest,
recursive=False,
)
logger.info(
f'Copied files from [{requirements_host_src}] to [{requirements_sandbox_dest}] inside sandbox.'
)
exit_code, output = sandbox.execute(f'pip install -r {requirements_sandbox_dest}')
# Prepare instruction
instruction = ToolPromptTemplate(use_tool=True)(
max_total_steps=metadata['max_iterations'],
max_propose_solution=metadata['max_propose_solution'],
in_context_example=instance.in_context_example(
use_tool=True, with_feedback=False
),
task_prompt='Task:\n' + instance.prompt,
)
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you or provide the solution inside <solution> tag AND NEVER ASK FOR HUMAN HELP.\n'
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
# Here's how you can run the agent (similar to the `main` function) and get the final task state
fake_user_response_fn = functools.partial(
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
task=instance,
task_config={
'max_iterations': metadata['max_iterations'],
'max_propose_solution': metadata['max_propose_solution'],
},
)
state: State = asyncio.run(
main(
instruction,
fake_user_response_fn=fake_user_response_fn,
sandbox=sandbox,
)
)
if state is None:
raise ValueError('State should not be None.')
logger.info('Msgs: ' + str(state.history))
task_state: TaskState = state.task_state
logger.info('Task state: ' + str(task_state.to_dict()))
# Save the output
output = {
'id': instance.task_id,
'instance': instance.to_dict(),
'instruction': instruction,
'metadata': metadata,
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],
'error': state.error if state and state.error else None,
'test_result': task_state.success,
}
# Close the sandbox
sandbox.close()
return output
if __name__ == '__main__':
parser = get_parser()
parser.add_argument(
'--subset',
default='math',
choices=['math', 'gsm8k'],
type=str,
help='subset of the dataset to be used',
)
parser.add_argument(
'--max-propose-solution',
default=2,
type=int,
help='maximum number of times the agent can propose a solution',
)
args, _ = parser.parse_known_args()
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
# so we don't need to manage file uploading to OpenDevin's repo
mint_dataset = load_dataset(
'ryanhoangt/xingyaoww-mint-bench', name=args.subset, split='test'
)
logger.info(f'Evaluating MINT - {args.subset} subset')
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
# for details of how to set `llm_config`
if args.llm_config:
specified_llm_config = get_llm_config_arg(args.llm_config)
if specified_llm_config:
config.llm = specified_llm_config
logger.info(f'Config for evaluation: {config}')
# TEST METADATA
agent_class = args.agent_cls
assert (
agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
), f'Unsupported agent class: {agent_class}'
model_name = config.llm.model.split('/')[-1]
max_iterations = args.max_iterations
eval_note = ''
if args.eval_note is not None:
eval_note += '_N_' + args.eval_note
eval_output_dir = os.path.join(
args.eval_output_dir,
'mint',
agent_class,
model_name + '_maxiter_' + str(max_iterations) + eval_note,
args.subset,
)
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
parents=True, exist_ok=True
)
logger.info(f'Using evaluation output directory: {eval_output_dir}')
metadata = {
'agent_class': agent_class,
'model_name': model_name,
'max_iterations': max_iterations,
'max_propose_solution': args.max_propose_solution,
'eval_output_dir': eval_output_dir,
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
# get the commit id of the current repo for reproducibility
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
.decode('utf-8')
.strip(),
}
logger.info(f'Metadata: {metadata}')
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
json.dump(metadata, f)
# LIMIT EVALUATION
eval_n_limit = args.eval_n_limit
if eval_n_limit:
mint_dataset = mint_dataset.select(range(eval_n_limit))
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
# OUTPUT FILE
output_file = os.path.join(eval_output_dir, 'output.jsonl')
logger.info(f'Writing evaluation output to {output_file}')
finished_instance_ids = set()
if os.path.exists(output_file):
with open(output_file, 'r') as f:
for line in f:
data = json.loads(line)
finished_instance_ids.add(data['id'])
logger.warning(
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
)
output_fp = open(output_file, 'a')
logger.info(
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}, max propose solution {args.max_propose_solution}.'
)
# =============================================
# filter out finished instances
task_class = ReasoningTask
new_mint_tests: list[ReasoningTask] = []
for instance in mint_dataset:
if instance['id'] in finished_instance_ids:
logger.info(
f'Skipping instance {instance["id"]} as it is already finished.'
)
continue
# convert to Task object
instance = ReasoningTask(**instance)
new_mint_tests.append(instance)
mint_dataset = new_mint_tests
logger.info(
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(mint_dataset)}'
)
# =============================================
pbar = tqdm(total=len(mint_dataset))
# This function tracks the progress AND writes the output to a JSONL file
def update_progress(future):
pbar.update(1)
output = future.result()
# logger.info('Output: ', output)
# pbar.set_description(f'Instance {output["instance_id"]}')
# pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
# logger.info(
# f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}'
# )
output_fp.write(json.dumps(output) + '\n')
output_fp.flush()
# This sets the multi-processing
num_workers = args.eval_num_workers
logger.info(f'Using {num_workers} workers for evaluation.')
# CodeActAgent doesn't require a mounted workspace to work
skip_workspace_mount = agent_class == 'CodeActAgent'
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
try:
with ProcessPoolExecutor(num_workers) as executor:
futures = []
# This is how we perform multi-processing
for instance in mint_dataset:
future = executor.submit(
process_instance,
instance,
agent_class,
metadata,
skip_workspace_mount,
eval_output_dir,
reset_logger=bool(num_workers > 1),
)
future.add_done_callback(update_progress)
futures.append(future)
# Wait for all futures to complete
for future in futures:
future.result()
except KeyboardInterrupt:
print('KeyboardInterrupt received. Cleaning up...')
cleanup()
output_fp.close()
logger.info('Evaluation finished.')
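
Each record appended to `output.jsonl` carries the instance, the full interaction history, and a boolean `test_result`; a hypothetical post-processing snippet (not part of the PR; the path below is a placeholder, adjust it to your own `eval_output_dir`) could compute accuracy as follows:

```python
# Hypothetical post-processing sketch: compute accuracy from an output.jsonl produced by run_infer.py.
import json

# Placeholder path; adjust to the eval_output_dir layout constructed above.
path = 'evaluation/outputs/mint/CodeActAgent/gpt-4-1106-preview_maxiter_5/math/output.jsonl'

with open(path) as f:
    records = [json.loads(line) for line in f]

solved = sum(1 for r in records if r['test_result'])
print(f'{solved}/{len(records)} instances solved ({solved / len(records):.1%})')
```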

@@ -0,0 +1,37 @@
#!/bin/bash
MODEL_CONFIG=$1
SUBSET=$2
EVAL_LIMIT=$3
# Only 'CodeActAgent' is supported for MINT now
AGENT="CodeActAgent"
# We need to track the version of Agent in the evaluation to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
export PYTHONPATH=$(pwd)
COMMAND="poetry run python ./evaluation/mint/run_infer.py \
--max-iterations 5 \
--max-propose-solution 2 \
--eval-note $AGENT_VERSION"
if [ -n "$SUBSET" ]; then
echo "SUBSET: $SUBSET"
COMMAND="$COMMAND --subset $SUBSET"
# otherwise default to the math subset
else
echo "SUBSET: math"
COMMAND="$COMMAND --subset math"
fi
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi
# Run the command
eval $COMMAND

evaluation/mint/task.py (new file, 121 lines)

@@ -0,0 +1,121 @@
import json
import logging
import os
from abc import ABC, abstractmethod
from typing import List, Optional, Tuple
from utils import load_file
LOGGER = logging.getLogger('MINT')
class Task(ABC):
"""Base class for a task instance."""
task_name: str = 'base'
in_context_example_dir = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'in_context_examples',
)
def __init__(self, **kwargs) -> None:
if 'loaded_history' in kwargs:
self.loaded_history = kwargs['loaded_history']
else:
self.loaded_history = None
# pre-load the in-context example
task_dir = os.path.join(self.in_context_example_dir, self.task_name)
self._in_context_example = {
'with_tool': load_file(os.path.join(task_dir, 'with_tool.txt')),
}
self.metadata = {}
@property
def task_id(self) -> str:
"""Return the task id."""
assert hasattr(self, '_id'), 'Task does not have an id.'
return self._id
def in_context_example(
self, use_tool: bool = True, with_feedback: bool = False
) -> str:
"""Return the in-context example for the task."""
if use_tool and not with_feedback:
return self._in_context_example['with_tool']
else:
raise NotImplementedError
@property
def prompt(self) -> str:
"""Return the task prompt."""
assert hasattr(self, '_prompt'), 'Task does not have a prompt.'
return self._prompt
@property
def reference(self) -> str:
"""Return the reference solution for the task."""
assert hasattr(self, '_reference'), 'Task does not have a reference solution.'
return self._reference
@abstractmethod
def extract_answer(self, solution: str) -> Optional[str]:
"""Extract the answer from the given solution."""
pass
@abstractmethod
def success(self, solution: str) -> bool:
"""This checks whether the given solution can complete the current task.
Can be used to provide binary feedback.
"""
answer = self.extract_answer(solution)
return answer == self.reference
@classmethod
def load_tasks(cls, path: str) -> Tuple[List['Task'], int]:
"""Load all the tasks from a given jsonl file."""
assert path.endswith('.jsonl') or path.endswith('.json')
with open(path, 'r') as f:
tasks = [cls(**json.loads(line)) for line in f.readlines()]
LOGGER.info(f'Loaded {len(tasks)} tasks from {path}')
return tasks, len(tasks)
def to_dict(self) -> dict:
"""Convert the task to a dictionary."""
return {
'task_name': self.task_name,
'task_id': self.task_id,
'prompt': self.prompt,
'reference': self.reference,
'metadata': self.metadata,
}
class ReasoningTask(Task):
task_name = 'reasoning'
def __init__(self, id: str, prompt: str, reference: str, **kwargs):
super().__init__(**kwargs)
self._id = id
self._prompt = prompt.strip()
self._reference = str(reference).strip().lower()
def extract_answer(self, solution: str) -> Optional[str]:
"""Extract the answer from the given solution."""
return solution.lower().strip()
def compare_w_digits(self, reference: str, answer: str) -> bool:
"""Compare the reference and answer with digits."""
# if both reference and answer can be converted to floats by float()
try:
float(reference)
float(answer)
return abs(float(reference) - float(answer)) <= 0.05 * abs(float(reference))
except ValueError:
return reference in answer
except Exception:
raise ValueError(f'Cannot compare {reference} and {answer}')
def success(self, solution: str) -> bool:
answer = self.extract_answer(solution)
return self.compare_w_digits(self._reference, answer)
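
The grading rule above accepts numeric answers within 5% relative error of the reference and falls back to a substring check otherwise; a standalone restatement (illustration only, not part of the committed file):

```python
def within_tolerance(reference: str, answer: str, rel_tol: float = 0.05) -> bool:
    """Standalone restatement of ReasoningTask.compare_w_digits."""
    try:
        # Numeric answers: accept within 5% relative error of the reference.
        return abs(float(reference) - float(answer)) <= rel_tol * abs(float(reference))
    except ValueError:
        # Non-numeric answers: fall back to a substring match against the reference.
        return reference in answer

print(within_tolerance('4', '4.1'))             # True  (|4 - 4.1| = 0.1 <= 0.2)
print(within_tolerance('4', '5'))               # False (1.0 > 0.2)
print(within_tolerance('701', 'the year 701'))  # True via the substring fallback
```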

evaluation/mint/utils.py (new file, 10 lines)

@@ -0,0 +1,10 @@
import functools
# use cache to avoid loading the same file multiple times
# which can lead to a 'too many open files' error
@functools.lru_cache(maxsize=128)
def load_file(filepath: str) -> str:
with open(filepath, 'r') as f:
content = f.read()
return content