mirror of https://github.com/OpenHands/OpenHands.git (synced 2025-12-26 05:48:36 +08:00)
Support MINT benchmark (MATH, GSM8K subset) (#1955)
* setup boilerplate and README
* setup test script and load dataset
* add temp integration that works
* refactor code
* add solution evaluation through 'fake_user_response_fn'
* finish integrating MATH subset
* Update evaluation/mint/run_infer.py
* Update evaluation/mint/run_infer.sh
* Update opendevin/core/main.py
* remove redundant templates, add eval_note, update README
* use <execute_ipython> tag instead of <execute>
* hardcode AGENT option for run_infer.sh
* Update evaluation/mint/task.py
  Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com>
* fix: bug where no message was returned when the task succeeds
* change message to make the agent exit
* import bash abstractmethod
* install all required packages inside the sandbox before the agent runs, adjust prompt
* add subset eval folder separation and test for gsm8k
* fix bug in Reasoning task result check, add requirements.txt
* Fix syntax error in evaluation/mint/run_infer.py
* update README, add default values for `SUBSET` and `EVAL_LIMIT`

---------

Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com>
Co-authored-by: yufansong <yufan@risingwave-labs.com>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
parent 110c530582
commit 9434bcce48
@@ -16,6 +16,7 @@ all the preprocessing/evaluation/analysis scripts.
- HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix)
- GAIA: [`evaluation/gaia`](./gaia)
- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
- MINT: [`evaluation/mint`](./mint)

### Result Visualization
1 evaluation/mint/.gitignore vendored Normal file
@@ -0,0 +1 @@
!requirements.txt
45 evaluation/mint/README.md Normal file
@@ -0,0 +1,45 @@
# MINT Benchmark

This folder contains the evaluation harness for the [MINT benchmark](https://arxiv.org/abs/2309.10691), which measures LLMs' ability to solve tasks through multi-turn interaction.

## Configure OpenDevin and LM

Create a `config.toml` file at the root of the workspace if it does not exist. Please check [README.md](../../README.md) for how to set this up.

## Start the evaluation

We use the MINT dataset hosted on [Hugging Face](https://huggingface.co/datasets/ryanhoangt/xingyaoww-mint-bench).

The following is the basic command to start the evaluation. Currently, the only agent supported with MINT is `CodeActAgent`.

```bash
./evaluation/mint/scripts/run_infer.sh [model_config] [subset] [eval_limit]
```

where `model_config` is mandatory, while `subset` and `eval_limit` are optional.

- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`.

- `subset`, e.g. `math`, is the subset of the MINT benchmark to evaluate on, defaulting to `math`.

- `eval_limit`, e.g. `2`, limits the evaluation to the first `eval_limit` instances, defaulting to all instances.

Note: in order to use `eval_limit`, you must also set `subset`.

For example, to run 3 instances of the `gsm8k` subset using `eval_gpt4_1106_preview`, your command would be:

```bash
./evaluation/mint/scripts/run_infer.sh eval_gpt4_1106_preview gsm8k 3
```

## Reference

```
@misc{wang2024mint,
      title={MINT: Evaluating LLMs in Multi-turn Interaction with Tools and Language Feedback},
      author={Xingyao Wang and Zihan Wang and Jiateng Liu and Yangyi Chen and Lifan Yuan and Hao Peng and Heng Ji},
      year={2024},
      eprint={2309.10691},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
```
5 evaluation/mint/config_variables.py Normal file
@@ -0,0 +1,5 @@
TASK_INFO_MAP = {
    # === Reasoning ===
    'gsm8k': {'class': 'ReasoningTask', 'type': 'reasoning'},
    'math': {'class': 'ReasoningTask', 'type': 'reasoning'},
}
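`TASK_INFO_MAP` stores class names as strings. As an illustration only (not part of this commit), here is a minimal sketch of how such a map could be resolved to a concrete class at runtime, assuming it is run from `evaluation/mint` so the local modules import; `resolve_task_class` is a hypothetical helper:

```python
# Hypothetical helper: resolve a subset name from TASK_INFO_MAP to the
# Task subclass defined in task.py (run_infer.py currently hardcodes ReasoningTask).
import task as task_module

from config_variables import TASK_INFO_MAP


def resolve_task_class(subset: str):
    info = TASK_INFO_MAP[subset]                 # e.g. {'class': 'ReasoningTask', 'type': 'reasoning'}
    return getattr(task_module, info['class'])   # -> ReasoningTask


# Usage sketch: task_cls = resolve_task_class('gsm8k'); instance = task_cls(**row)
```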
82 evaluation/mint/datatypes.py Normal file
@@ -0,0 +1,82 @@
import enum
from typing import Any, Dict, Tuple


class TaskState:
    def __init__(
        self,
        finished: bool = False,
        success: bool = False,
        agent_action_count: dict = None,
        terminate_reason: str = None,
        latest_output: Dict[str, Any] = None,
    ):
        self.finished = finished
        self.success = success
        self.agent_action_count: Dict[str, int] = agent_action_count or {
            'propose_solution': 0,
            'use_tool': 0,
            'invalid_action': 0,
        }
        self.terminate_reason = terminate_reason
        self.latest_output = latest_output

    def to_dict(self) -> Dict[str, Any]:
        return {
            'finished': self.finished,
            'success': self.success,
            'agent_action_count': self.agent_action_count,
            'terminate_reason': self.terminate_reason,
            'latest_output': self.latest_output,
        }


class ParseError(Exception):
    pass


class FeedbackType(enum.Enum):
    FEEDBACK_WITH_GT = 'feedback_with_gt'
    FEEDBACK_WO_GT = 'feedback_wo_gt'
    NO_FEEDBACK = 'no_feedback'


class StepOutput:
    def __init__(
        self,
        observation: str = None,
        success: bool = False,
        extra: Dict[str, Any] = None,
        turn_info: Tuple[int, int] = None,
    ):
        self.observation: str = observation
        self.success: bool = success
        self.extra: Dict[str, Any] = extra
        self.turn_info = turn_info

    def __repr__(self) -> str:
        return self.observation

    def to_str(self) -> str:
        output = 'Observation:\n'
        if self.observation is not None:
            output += self.observation + '\n'
        else:
            if not self.success:
                output += 'Your answer is wrong.\n'

        if self.turn_info is not None:
            n_steps_left, n_propose_solution_left = self.turn_info
            output += 'You have {} steps left and {} chances to propose solution left.\n'.format(
                n_steps_left, n_propose_solution_left
            )
            if n_steps_left <= 1:
                output += 'You should take the last step to propose a solution.\n'

        return output

    def to_dict(self) -> Dict[str, Any]:
        return {
            'observation': self.observation,
            'success': self.success,
        }
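For illustration only (not part of the commit), this sketch shows the feedback string `StepOutput.to_str()` produces when the proposed answer is wrong and the remaining budget is 4 steps and 1 proposal; it assumes it is run from `evaluation/mint` so `datatypes` imports directly:

```python
# Illustrative only: how the environment's feedback message is assembled
# from a StepOutput (see datatypes.py above).
from datatypes import StepOutput

step = StepOutput(observation=None, success=False, turn_info=(4, 1))
print(step.to_str())
# Output:
# Observation:
# Your answer is wrong.
# You have 4 steps left and 1 chances to propose solution left.
```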
119 evaluation/mint/env.py Normal file
@@ -0,0 +1,119 @@
import re
import traceback
from typing import Dict, Optional

from datatypes import ParseError, StepOutput, TaskState
from task import Task

from opendevin.controller.state.state import State


class SimplifiedEnv:
    INVALID_INPUT_MESSAGE = (
        "I don't understand your input. \n"
        'If you want to execute code, please use <execute_ipython> YOUR_CODE_HERE </execute_ipython>.\n'
        'If you want to give me an answer, please use <solution> YOUR_SOLUTION_HERE </solution>.\n'
        'For example: The answer to the question is <solution> 42 </solution>. \n'
    )

    def __init__(self, agent_state: State, task: Task, task_config: Dict[str, int]):
        self.agent_state = agent_state
        self.task = task
        self.task_state = TaskState()
        self.task_config = task_config

    def step(self, lm_message: str):
        observation = self.handle_propose_solution(lm_message)

        self.check_max_iteration()

        turn_info = (
            self.task_config['max_iterations'] - self.agent_state.iteration,
            self.task_config['max_propose_solution']
            - self.task_state.agent_action_count['propose_solution'],
        )

        output = StepOutput(
            observation=observation,
            success=self.task_state.success,
            turn_info=turn_info,
        )

        self.log_output(output)
        return self.task_state

    def handle_propose_solution(self, lm_message) -> Optional[str]:
        """Propose an answer and check whether it solves the task.

        It might set self.task_state.finished = True if the task is successful.
        """
        self.task_state.agent_action_count['propose_solution'] += 1
        try:
            parsed = self.parse_propose_solution(lm_message)
            task_success = self.check_task_success(parsed['answer'])
            if task_success:
                self.task_state.finished = True
                self.task_state.success = True
                self.task_state.terminate_reason = 'task_success'
                # NOTE: do not return from the function yet, because we still need to log the output.
                # Setting task_state.finished = True will terminate the episode.
        except ParseError:
            return SimplifiedEnv.INVALID_INPUT_MESSAGE
        except Exception:
            error_traceback = traceback.format_exc()
            return f'{error_traceback}'

    def parse_propose_solution(self, lm_message: str) -> dict:
        """Define the parsing logic."""
        lm_output = '\n' + lm_message + '\n'

        answer = '\n'.join(
            [
                i.strip()
                for i in re.findall(r'<solution>(.*?)</solution>', lm_output, re.DOTALL)
            ]
        )
        if answer == '':
            raise ParseError('No answer found.')

        return {'answer': answer}

    def log_output(self, output: StepOutput) -> None:
        if self.task_state.finished:
            return

        content = output.to_str()
        # self.state.history.append({"role": "user", "content": content})
        self.task_state.latest_output = output.to_dict()
        self.task_state.latest_output['content'] = content

    def check_task_success(self, answer: str) -> bool:
        # log_message.info(f"STUDENT ANSWER: [{answer}]")
        # log_message.info(f"REFERENCE ANSWER: [{self.task.reference}]")
        return self.task.success(answer)

    def check_max_iteration(self):
        """Check if the agent has reached the max iteration limit.

        It might set self.task_state.finished = True if the agent has reached the limit.
        """
        if self.task_state.finished:
            # ignore if the episode is already finished (e.g., task success)
            return

        if (
            # the number of proposed solutions has reached the limit
            self.task_state.agent_action_count['propose_solution']
            >= self.task_config['max_propose_solution']
        ):
            self.task_state.finished = True
            self.task_state.success = False
            self.task_state.terminate_reason = 'max_propose_steps'
        elif (
            # the total number of actions (propose_solution + use_tool + invalid_action)
            # has reached the iteration limit
            sum(self.task_state.agent_action_count.values())
            >= self.task_config['max_iterations']
        ):
            self.task_state.finished = True
            self.task_state.success = False
            self.task_state.terminate_reason = 'max_iterations'
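As a quick illustration (not part of the commit), `parse_propose_solution` joins every `<solution>...</solution>` span it finds in the agent's message; the same regex in a standalone sketch:

```python
# Standalone illustration of the regex used by SimplifiedEnv.parse_propose_solution.
import re

lm_message = 'The answer is <solution> 771890.886 </solution>.'
lm_output = '\n' + lm_message + '\n'
spans = re.findall(r'<solution>(.*?)</solution>', lm_output, re.DOTALL)
answer = '\n'.join(s.strip() for s in spans)
print(answer)  # -> 771890.886
```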
48 evaluation/mint/in_context_examples/reasoning/with_tool.txt Normal file
@@ -0,0 +1,48 @@
Task:
Evaluate $\int\int_C (x+y)^2 dx,dy$, where C is the circle defined by $x^2 + y^2 = M$, and M is the birth year of Li Bai.
Output format requirement: Round to three decimal places, using half up rounding rule.

Assistant:
<thought> To solve this problem, let's start by determining Li Bai's birth year. </thought>
<execute_ipython> print(wikipedia_search('Li Bai')) </execute_ipython>

Observation:
Page: Li Bai
Summary: Li Bai (Chinese: 李白; pinyin: Lǐ Bái, 701–762), also pronounced as Li Bo, courtesy name Taibai (Chinese: 太白), was a Chinese poet, acclaimed from his own time to the present as a brilliant and romantic figure who took traditional poetic forms to new heights.

Assistant:
<thought>
From the wiki_search result, we can see that Li Bai was born in 701, thus M = 701.
To evaluate the double integral of $(x+y)^2$ over the region $C$, where $C$ is the circle defined by $x^2 + y^2 = 701$, we should use polar coordinates.
In polar coordinates, $x = r\cos(\theta)$ and $y = r\sin(\theta)$, and the differential area element $dx,dy$ becomes $r,dr,d\theta$.
$\int\int_C (x+y)^2 dx,dy = \int_0^{2\pi}\int_0^{\sqrt{701}} (r\cos(\theta) + r\sin(\theta))^2r,dr,d\theta$
Let's use the SymPy package to calculate this.
</thought>
<execute_ipython>
from sympy import symbols, integrate, cos, sin, pi

r, theta = symbols('r theta')

# the integrand
integrand = (r*cos(theta) + r*sin(theta))**2 * r

# perform the double integration
double_integral = integrate(integrate(integrand, (r, 0, 701 ** 0.5)), (theta, 0, 2*pi))
print(double_integral)
</execute_ipython>

Observation:
Out[2]: 245700.5*pi

Assistant:
<thought> Let's round this result to three decimal places using half up rounding rule. </thought>
<execute_ipython>
import math
print(round(245700.5*math.pi, 3))
</execute_ipython>

Observation:
771890.886

Assistant:
The answer is <solution> 771890.886 </solution>.
25 evaluation/mint/prompts/__init__.py Normal file
@@ -0,0 +1,25 @@
import os

from utils import load_file

PROMPT_DIR = os.path.dirname(__file__)
TEMPLATE_WITH_TOOL = load_file(os.path.join(PROMPT_DIR, 'template_with_tool.txt'))


class PromptTemplate:
    """A prompt template."""

    def __init__(self, template: str):
        self.template: str = template

    def __call__(self, **kwargs) -> str:
        return self.template.format(**kwargs)


class ToolPromptTemplate(PromptTemplate):
    def __init__(self, use_tool: bool):
        if use_tool:
            template = TEMPLATE_WITH_TOOL
        else:
            raise NotImplementedError('Evaluation without tool is not supported yet.')
        super().__init__(template)
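For reference, a hedged usage sketch mirroring how `run_infer.py` fills the template below; the placeholder values and example strings are assumptions, and it presumes running from `evaluation/mint` so the local `prompts` package imports:

```python
# Illustrative usage of ToolPromptTemplate; placeholder values are made up.
from prompts import ToolPromptTemplate

instruction = ToolPromptTemplate(use_tool=True)(
    max_total_steps=5,            # fills {max_total_steps} in template_with_tool.txt
    max_propose_solution=2,       # fills {max_propose_solution}
    in_context_example='<in-context example goes here>',
    task_prompt='Task:\nWhat is 2 + 2?',
)
print(instruction[:200])
```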
19 evaluation/mint/prompts/template_with_tool.txt Normal file
@@ -0,0 +1,19 @@
You are a helpful assistant assigned with the task of problem-solving.
To solve the task, you can only interact with the interactive Python (Jupyter Notebook) environment using the <execute_ipython> tag. Other tools cannot be used.
At each turn, you should first provide your step-by-step thinking for solving the task. Your thought process should be enclosed using the "<thought>" tag, for example: <thought> I need to print "Hello World!" </thought>.

After that, you have two options:
1) Interact with the Python programming environment and receive the corresponding output.
2) Directly provide a solution by sending your answer to the user through a message that adheres to the required format for the given task. Your solution should be enclosed using the "<solution>" tag, for example: The answer is <solution> A </solution>.
Whether you choose to interact with the Python environment or to provide a solution, you need to send a message to the user so that they can evaluate your response and provide feedback.

You have {max_total_steps} chances to interact with the environment or propose a solution. You can only propose a solution {max_propose_solution} times.

---

{in_context_example}

---

# Problem statement:
{task_prompt}
32 evaluation/mint/requirements.txt Normal file
@@ -0,0 +1,32 @@
pre-commit
openai
datasets
backoff
charset-normalizer==3.1.0
# Alfworld
pandas==1.4.4
opencv-python
networkx
tqdm
vocab
revtok
Click
ai2thor==2.1.0
transformers
tokenizers
scipy==1.10.1
ipython
matplotlib
cython
nltk
gym==0.15.4
pipreqs
pyyaml
pytz
visdom
sympy
pycocotools
seaborn
google-generativeai
python-dateutil
statsmodels
357 evaluation/mint/run_infer.py Normal file
@@ -0,0 +1,357 @@
import asyncio
import functools
import json
import logging
import multiprocessing as mp
import os
import pathlib
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor
from typing import Dict

from datasets import load_dataset
from datatypes import TaskState
from env import SimplifiedEnv
from prompts import ToolPromptTemplate
from task import ReasoningTask, Task
from tqdm import tqdm

from evaluation.swe_bench.swe_env_box import DockerSSHBox
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.serialization.event import event_to_dict


def cleanup():
    print('Cleaning up child processes...')
    for process in mp.active_children():
        print(f'Terminating child process: {process.name}')
        process.terminate()
        process.join()


def codeact_user_response(state: State, task: Task, task_config: Dict[str, int]):
    logger.info(f'Gold reference: {task.reference}')
    logger.info(f'Task config: {task_config}')

    env = SimplifiedEnv(
        agent_state=state,
        task=task,
        task_config=task_config,
    )
    last_action, _ = state.history[-1]
    result_state: TaskState = env.step(last_action.message)
    state.task_state = result_state

    if not result_state.latest_output:
        if result_state.success:
            msg = 'Your answer is correct. Please EXIT using the following command: <execute_bash> exit </execute_bash>.'
        else:
            msg = 'Something went wrong! No output from the model.'
    else:
        msg = result_state.latest_output['content']

    logger.info('User response:' + msg)
    return msg


def monologue_user_response(state: State) -> str:
    raise NotImplementedError('MonologueAgent should never ask for user responses.')


AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
    'MonologueAgent': monologue_user_response,
}

AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': '\nIMPORTANT: When your answer is confirmed by the user to be correct, you can exit using the following command: <execute_bash> exit </execute_bash>.\n'
}


def process_instance(
    instance: Task,
    agent_class,
    metadata,
    skip_workspace_mount,
    eval_output_dir,
    reset_logger: bool = True,
):
    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
    # create process-specific workspace dir
    # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
    # so that different agents don't interfere with each other.
    if not skip_workspace_mount:
        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)

    # Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
    if reset_logger:
        # Set up logger
        log_file = os.path.join(
            eval_output_dir, 'logs', f'instance_{instance.task_id}.log'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {instance.task_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)

    if not skip_workspace_mount:
        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')

    sandbox = DockerSSHBox()

    requirements_host_src = 'evaluation/mint/requirements.txt'
    requirements_sandbox_dest = '/opendevin/plugins/mint/requirements.txt'
    sandbox.copy_to(
        host_src=requirements_host_src,
        sandbox_dest=requirements_sandbox_dest,
        recursive=False,
    )
    logger.info(
        f'Copied files from [{requirements_host_src}] to [{requirements_sandbox_dest}] inside sandbox.'
    )
    exit_code, output = sandbox.execute(f'pip install -r {requirements_sandbox_dest}')

    # Prepare instruction
    instruction = ToolPromptTemplate(use_tool=True)(
        max_total_steps=metadata['max_iterations'],
        max_propose_solution=metadata['max_propose_solution'],
        in_context_example=instance.in_context_example(
            use_tool=True, with_feedback=False
        ),
        task_prompt='Task:\n' + instance.prompt,
    )
    instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you or provide the solution inside <solution> tag AND NEVER ASK FOR HUMAN HELP.\n'

    # NOTE: You can actually set slightly different instructions for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    fake_user_response_fn = functools.partial(
        AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
        task=instance,
        task_config={
            'max_iterations': metadata['max_iterations'],
            'max_propose_solution': metadata['max_propose_solution'],
        },
    )

    state: State = asyncio.run(
        main(
            instruction,
            fake_user_response_fn=fake_user_response_fn,
            sandbox=sandbox,
        )
    )

    if state is None:
        raise ValueError('State should not be None.')

    logger.info('Msgs: ' + str(state.history))

    task_state: TaskState = state.task_state
    logger.info('Task state: ' + str(task_state.to_dict()))

    # Save the output
    output = {
        'id': instance.task_id,
        'instance': instance.to_dict(),
        'instruction': instruction,
        'metadata': metadata,
        'history': [
            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
        ],
        'error': state.error if state and state.error else None,
        'test_result': task_state.success,
    }

    # Close the sandbox
    sandbox.close()

    return output


if __name__ == '__main__':
    parser = get_parser()

    parser.add_argument(
        '--subset',
        default='math',
        choices=['math', 'gsm8k'],
        type=str,
        help='subset of the dataset to be used',
    )
    parser.add_argument(
        '--max-propose-solution',
        default=2,
        type=int,
        help='maximum number of times the agent can propose a solution',
    )

    args, _ = parser.parse_known_args()

    # NOTE: It is preferable to load datasets from Hugging Face datasets and perform post-processing
    # so we don't need to manage file uploading to OpenDevin's repo
    mint_dataset = load_dataset(
        'ryanhoangt/xingyaoww-mint-bench', name=args.subset, split='test'
    )
    logger.info(f'Evaluating MINT - {args.subset} subset')

    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
    # for details of how to set `llm_config`
    if args.llm_config:
        specified_llm_config = get_llm_config_arg(args.llm_config)
        if specified_llm_config:
            config.llm = specified_llm_config
    logger.info(f'Config for evaluation: {config}')

    # TEST METADATA
    agent_class = args.agent_cls
    assert (
        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
    ), f'Unsupported agent class: {agent_class}'
    model_name = config.llm.model.split('/')[-1]
    max_iterations = args.max_iterations
    eval_note = ''
    if args.eval_note is not None:
        eval_note += '_N_' + args.eval_note
    eval_output_dir = os.path.join(
        args.eval_output_dir,
        'mint',
        agent_class,
        model_name + '_maxiter_' + str(max_iterations) + eval_note,
        args.subset,
    )

    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
        parents=True, exist_ok=True
    )
    logger.info(f'Using evaluation output directory: {eval_output_dir}')

    metadata = {
        'agent_class': agent_class,
        'model_name': model_name,
        'max_iterations': max_iterations,
        'max_propose_solution': args.max_propose_solution,
        'eval_output_dir': eval_output_dir,
        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
        # get the commit id of the current repo for reproducibility
        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
        .decode('utf-8')
        .strip(),
    }
    logger.info(f'Metadata: {metadata}')
    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
        json.dump(metadata, f)

    # LIMIT EVALUATION
    eval_n_limit = args.eval_n_limit
    if eval_n_limit:
        mint_dataset = mint_dataset.select(range(eval_n_limit))
        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')

    # OUTPUT FILE
    output_file = os.path.join(eval_output_dir, 'output.jsonl')
    logger.info(f'Writing evaluation output to {output_file}')
    finished_instance_ids = set()
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            for line in f:
                data = json.loads(line)
                finished_instance_ids.add(data['id'])
        logger.warning(
            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
        )
    output_fp = open(output_file, 'a')

    logger.info(
        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}, max propose solution {args.max_propose_solution}.'
    )

    # =============================================
    # filter out finished instances
    task_class = ReasoningTask
    new_mint_tests: list[ReasoningTask] = []
    for instance in mint_dataset:
        if instance['id'] in finished_instance_ids:
            logger.info(
                f'Skipping instance {instance["id"]} as it is already finished.'
            )
            continue
        # convert to Task object
        instance = ReasoningTask(**instance)
        new_mint_tests.append(instance)

    mint_dataset = new_mint_tests
    logger.info(
        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(mint_dataset)}'
    )
    # =============================================

    pbar = tqdm(total=len(mint_dataset))

    # This function tracks the progress AND writes the output to a JSONL file
    def update_progress(future):
        pbar.update(1)
        output = future.result()
        # logger.info('Output: ', output)
        # pbar.set_description(f'Instance {output["instance_id"]}')
        # pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
        # logger.info(
        #     f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}'
        # )
        output_fp.write(json.dumps(output) + '\n')
        output_fp.flush()

    # This sets up the multiprocessing
    num_workers = args.eval_num_workers
    logger.info(f'Using {num_workers} workers for evaluation.')

    # This is SWE-Bench specific - CodeActAgent doesn't require a mounted workspace to work
    skip_workspace_mount = agent_class == 'CodeActAgent'
    logger.info(f'Skipping workspace mount: {skip_workspace_mount}')

    try:
        with ProcessPoolExecutor(num_workers) as executor:
            futures = []
            # This is how we perform multiprocessing
            for instance in mint_dataset:
                future = executor.submit(
                    process_instance,
                    instance,
                    agent_class,
                    metadata,
                    skip_workspace_mount,
                    eval_output_dir,
                    reset_logger=bool(num_workers > 1),
                )
                future.add_done_callback(update_progress)
                futures.append(future)

            # Wait for all futures to complete
            for future in futures:
                future.result()
    except KeyboardInterrupt:
        print('KeyboardInterrupt received. Cleaning up...')
        cleanup()

    output_fp.close()
    logger.info('Evaluation finished.')
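The script appends one JSON record per instance to `output.jsonl`, with `test_result` holding the boolean success flag. A small sketch (not part of the commit; `summarize` is a hypothetical post-processing helper) of how those records could be aggregated:

```python
# Hypothetical helper: compute accuracy from the output.jsonl written by
# run_infer.py, where each line carries a boolean 'test_result' field.
import json


def summarize(output_file: str = 'output.jsonl') -> float:
    results = []
    with open(output_file) as f:
        for line in f:
            record = json.loads(line)
            results.append(bool(record['test_result']))
    accuracy = sum(results) / len(results) if results else 0.0
    print(f'{sum(results)}/{len(results)} solved, accuracy={accuracy:.3f}')
    return accuracy


if __name__ == '__main__':
    summarize()
```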
37 evaluation/mint/scripts/run_infer.sh Normal file
@@ -0,0 +1,37 @@
#!/bin/bash

MODEL_CONFIG=$1
SUBSET=$2
EVAL_LIMIT=$3
# Only 'CodeActAgent' is supported for MINT now
AGENT="CodeActAgent"

# We need to track the version of Agent in the evaluation to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"

export PYTHONPATH=$(pwd)

COMMAND="poetry run python ./evaluation/mint/run_infer.py \
  --max-iterations 5 \
  --max-propose-solution 2 \
  --eval-note $AGENT_VERSION"

if [ -n "$SUBSET" ]; then
  echo "SUBSET: $SUBSET"
  COMMAND="$COMMAND --subset $SUBSET"
# otherwise default to use the math subset
else
  echo "SUBSET: math"
  COMMAND="$COMMAND --subset math"
fi

if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
eval $COMMAND
121 evaluation/mint/task.py Normal file
@@ -0,0 +1,121 @@
import json
import logging
import os
from abc import ABC, abstractmethod
from typing import List, Optional, Tuple

from utils import load_file

LOGGER = logging.getLogger('MINT')


class Task(ABC):
    """Base class for a task instance."""

    task_name: str = 'base'
    in_context_example_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'in_context_examples',
    )

    def __init__(self, **kwargs) -> None:
        if 'loaded_history' in kwargs:
            self.loaded_history = kwargs['loaded_history']
        else:
            self.loaded_history = None
        # pre-load the in-context example
        task_dir = os.path.join(self.in_context_example_dir, self.task_name)
        self._in_context_example = {
            'with_tool': load_file(os.path.join(task_dir, 'with_tool.txt')),
        }
        self.metadata = {}

    @property
    def task_id(self) -> str:
        """Return the task id."""
        assert hasattr(self, '_id'), 'Task does not have an id.'
        return self._id

    def in_context_example(
        self, use_tool: bool = True, with_feedback: bool = False
    ) -> str:
        """Return the in-context example for the task."""
        if use_tool and not with_feedback:
            return self._in_context_example['with_tool']
        else:
            raise NotImplementedError

    @property
    def prompt(self) -> str:
        """Return the task prompt."""
        assert hasattr(self, '_prompt'), 'Task does not have a prompt.'
        return self._prompt

    @property
    def reference(self) -> str:
        """Return the reference solution for the task."""
        assert hasattr(self, '_reference'), 'Task does not have a reference solution.'
        return self._reference

    @abstractmethod
    def extract_answer(self, solution: str) -> Optional[str]:
        """Extract the answer from the given solution."""
        pass

    @abstractmethod
    def success(self, solution: str) -> bool:
        """Check whether the given solution can complete the current task.

        Can be used to provide binary feedback.
        """
        answer = self.extract_answer(solution)
        return answer == self.reference

    @classmethod
    def load_tasks(cls, path: str) -> Tuple[List['Task'], int]:
        """Load all the tasks from a given jsonl file."""
        assert path.endswith('.jsonl') or path.endswith('.json')
        with open(path, 'r') as f:
            tasks = [cls(**json.loads(line)) for line in f.readlines()]
        LOGGER.info(f'Loaded {len(tasks)} tasks from {path}')
        return tasks, len(tasks)

    def to_dict(self) -> dict:
        """Convert the task to a dictionary."""
        return {
            'task_name': self.task_name,
            'task_id': self.task_id,
            'prompt': self.prompt,
            'reference': self.reference,
            'metadata': self.metadata,
        }


class ReasoningTask(Task):
    task_name = 'reasoning'

    def __init__(self, id: str, prompt: str, reference: str, **kwargs):
        super().__init__(**kwargs)
        self._id = id
        self._prompt = prompt.strip()
        self._reference = str(reference).strip().lower()

    def extract_answer(self, solution: str) -> Optional[str]:
        """Extract the answer from the given solution."""
        return solution.lower().strip()

    def compare_w_digits(self, reference: str, answer: str) -> bool:
        """Compare the reference and answer with digits."""
        # if reference and answer can both be converted to floats by float()
        try:
            float(reference)
            float(answer)
            return abs(float(reference) - float(answer)) <= 0.05 * abs(float(reference))
        except ValueError:
            return reference in answer
        except Exception:
            raise ValueError(f'Cannot compare {reference} and {answer}')

    def success(self, solution: str) -> bool:
        answer = self.extract_answer(solution)
        return self.compare_w_digits(self._reference, answer)
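The numeric check in `compare_w_digits` accepts any answer within 5% relative error of the reference. An illustrative sketch (not part of the commit; the id, prompt, and reference values are made up, and it assumes running from `evaluation/mint` so the in-context example file resolves):

```python
# Illustrative only: the 5% relative tolerance used by ReasoningTask.success.
from task import ReasoningTask

task = ReasoningTask(id='demo-1', prompt='What is 20 * 5?', reference='100')
print(task.success('100'))   # True: exact match
print(task.success('98'))    # True: within the 5% relative tolerance
print(task.success('90'))    # False: more than 5% away from the reference
```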
10 evaluation/mint/utils.py Normal file
@@ -0,0 +1,10 @@
import functools


# use a cache to avoid loading the same file multiple times,
# which can lead to a "too many open files" error
@functools.lru_cache(maxsize=128)
def load_file(filepath: str) -> str:
    with open(filepath, 'r') as f:
        content = f.read()
    return content