diff --git a/evaluation/benchmarks/swe_bench/README.md b/evaluation/benchmarks/swe_bench/README.md
index 68335a0d9f..169abe4637 100644
--- a/evaluation/benchmarks/swe_bench/README.md
+++ b/evaluation/benchmarks/swe_bench/README.md
@@ -2,6 +2,8 @@
 This folder contains the evaluation harness that we built on top of the original [SWE-Bench benchmark](https://www.swebench.com/) ([paper](https://arxiv.org/abs/2310.06770)).
 
+**UPDATE (5/26/2025): We now support running interactive SWE-Bench evaluation (see the paper [here](https://arxiv.org/abs/2502.13069))! For instructions on how to run it, checkout [this README](./SWE-Interact.md).**
+
 **UPDATE (4/8/2025): We now support running SWT-Bench evaluation! For more details, checkout [the corresponding section](#SWT-Bench-Evaluation).**
 
 **UPDATE (03/27/2025): We now support SWE-Bench multimodal evaluation! Simply use "princeton-nlp/SWE-bench_Multimodal" as the dataset name in the `run_infer.sh` script to evaluate on multimodal instances.**
diff --git a/evaluation/benchmarks/swe_bench/SWE-Interact.md b/evaluation/benchmarks/swe_bench/SWE-Interact.md
new file mode 100644
index 0000000000..c56e302863
--- /dev/null
+++ b/evaluation/benchmarks/swe_bench/SWE-Interact.md
@@ -0,0 +1,92 @@
+# SWE-Interact Benchmark
+
+This document explains how to use the [Interactive SWE-Bench](https://arxiv.org/abs/2502.13069) benchmark scripts to run and evaluate interactive software engineering tasks.
+
+## Setting things up
+After following the [README](./README.md) to set up the environment, you will additionally need to add an LLM configuration for the simulated human user. The original [paper](https://arxiv.org/abs/2502.13069) uses gpt-4o as the simulated human user. Add the following to your `config.toml` file:
+
+```toml
+[llm.fake_user]
+model = "litellm_proxy/gpt-4o-2024-08-06"
+api_key = ""
+temperature = 0.0
+base_url = "https://llm-proxy.eval.all-hands.dev"
+```
+
+## Running the Benchmark
+
+The main script for running the benchmark is `run_infer_interact.sh`.
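+Before launching it, you may want to confirm that the `[llm.fake_user]` section resolves correctly. The snippet below is a minimal sketch (not part of the harness) that calls the same helper the harness itself uses, `get_llm_config_arg`; run it from the repository root:
+
+```python
+from openhands.core.config import get_llm_config_arg
+
+# Load the simulated-user LLM settings from config.toml;
+# get_llm_config_arg returns None if the section is missing.
+cfg = get_llm_config_arg('llm.fake_user')
+assert cfg is not None, 'No [llm.fake_user] section found in config.toml'
+print(cfg.model, cfg.base_url, cfg.temperature)
+```
+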
Here's how to use it: + +```bash +bash ./evaluation/benchmarks/swe_bench/scripts/run_infer_interact.sh +``` + +### Parameters: + +- `model_config`: Path to the LLM configuration file (e.g., `llm.claude-3-7-sonnet`) +- `commit_hash`: Git commit hash to use (e.g., `HEAD`) +- `agent`: The agent class to use (e.g., `CodeActAgent`) +- `eval_limit`: Number of examples to evaluate (e.g., `500`) +- `max_iter`: Maximum number of iterations per task (e.g., `100`) +- `num_workers`: Number of parallel workers (e.g., `1`) +- `split`: Dataset split to use (e.g., `test`) + +### Example: + +```bash +bash ./evaluation/benchmarks/swe_bench/scripts/run_infer_interact.sh llm.claude-3-7-sonnet HEAD CodeActAgent 500 100 1 test +``` + +### Additional Environment Variables: + +You can customize the behavior using these environment variables: + +- `RUN_WITH_BROWSING`: Enable/disable web browsing (default: false) +- `USE_HINT_TEXT`: Enable/disable hint text (default: false) +- `EVAL_CONDENSER`: Specify a condenser configuration +- `EXP_NAME`: Add a custom experiment name to the output +- `N_RUNS`: Number of runs to perform (default: 1) +- `SKIP_RUNS`: Comma-separated list of run numbers to skip + +## Evaluating Results + +After running the benchmark, you can evaluate the results using `eval_infer.sh`: + +```bash +./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh +``` + +### Parameters: + +- `output_file`: Path to the output JSONL file +- `instance_id`: The specific instance ID to evaluate +- `dataset`: Dataset name (e.g., `cmu-lti/interactive-swe`) +- `split`: Dataset split (e.g., `test`) + +### Example: + +```bash +./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/cmu-lti__interactive-swe-test/CodeActAgent/claude-3-7-sonnet-20250219_maxiter_100_N_v0.39.0-no-hint-run_1/output.jsonl sphinx-doc__sphinx-8721 cmu-lti/interactive-swe test +``` + +## Output Structure + +The benchmark outputs are stored in the `evaluation/evaluation_outputs/outputs/` directory with the following structure: + +``` +evaluation/evaluation_outputs/outputs/ +└── cmu-lti__interactive-swe-{split}/ + └── {agent}/ + └── {model}-{date}_maxiter_{max_iter}_N_{version}-{options}-run_{run_number}/ + └── output.jsonl +``` + +Where: +- `{split}` is the dataset split (e.g., test) +- `{agent}` is the agent class name +- `{model}` is the model name +- `{date}` is the run date +- `{max_iter}` is the maximum iterations +- `{version}` is the OpenHands version +- `{options}` includes any additional options (e.g., no-hint, with-browsing) +- `{run_number}` is the run number diff --git a/evaluation/benchmarks/swe_bench/run_infer_interact.py b/evaluation/benchmarks/swe_bench/run_infer_interact.py new file mode 100755 index 0000000000..1ed4cc4e2f --- /dev/null +++ b/evaluation/benchmarks/swe_bench/run_infer_interact.py @@ -0,0 +1,411 @@ +import asyncio +import json +import os + +import pandas as pd +from datasets import load_dataset +from litellm import completion as litellm_completion + +import openhands.agenthub +from evaluation.benchmarks.swe_bench.run_infer import ( + AgentFinishedCritic, + complete_runtime, + filter_dataset, + get_config, + initialize_runtime, +) +from evaluation.benchmarks.swe_bench.run_infer import ( + get_instruction as base_get_instruction, +) +from evaluation.utils.shared import ( + EvalException, + EvalMetadata, + EvalOutput, + make_metadata, + prepare_dataset, + reset_logger_for_multiprocessing, + run_evaluation, +) +from openhands.controller.state.state import State +from 
openhands.core.config import (
+    get_llm_config_arg,
+    get_parser,
+)
+from openhands.core.config.condenser_config import NoOpCondenserConfig
+from openhands.core.config.utils import get_condenser_config_arg
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import MessageAction
+from openhands.events.serialization.event import event_from_dict, event_to_dict
+from openhands.utils.async_utils import call_async_from_sync
+
+USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
+USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
+RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
+
+
+class FakeUser:
+    def __init__(self, issue, hints, files):
+        self.system_message = f"""
+        You are a GitHub user reporting an issue. Here are the details of your issue and environment:
+
+        Issue: {issue}
+
+        Hints: {hints}
+
+        Files relative to your current directory: {files}
+
+        Your task is to respond to questions from a coder who is trying to solve your issue. The coder has a summarized version of the issue you have. Follow these rules:
+        1. If the coder asks a question that is directly related to the information in the issue you have, provide that information.
+        2. Always stay in character as a user reporting an issue, not as an AI assistant.
+        3. Keep your responses concise and to the point.
+        4. The coder has limited turns to solve the issue. Do not interact with the coder beyond 3 turns.
+
+        Respond with "I don't have that information" if the question is unrelated or you're unsure.
+        """
+        self.chat_history = [{'role': 'system', 'content': self.system_message}]
+        self.turns = 0
+        # Get LLM config from config.toml
+        self.llm_config = get_llm_config_arg(
+            'llm.fake_user'
+        )  # You can change 'fake_user' to any config name you want
+
+    def generate_reply(self, question):
+        if self.turns > 3:
+            return 'Please continue working on the task. Do NOT ask for more help.'
+        self.chat_history.append({'role': 'user', 'content': question.content})
+
+        response = litellm_completion(
+            model=self.llm_config.model,
+            messages=self.chat_history,
+            api_key=self.llm_config.api_key.get_secret_value(),
+            temperature=self.llm_config.temperature,
+            base_url=self.llm_config.base_url,
+        )
+
+        reply = response.choices[0].message.content
+        self.chat_history.append({'role': 'assistant', 'content': reply})
+        self.turns += 1
+        return reply
+
+
+# Global variable for fake user
+fake_user = None
+
+
+def get_fake_user_response(state: State) -> str:
+    global fake_user
+    if not fake_user:
+        return 'Please continue working on the task.'
+    last_agent_message = state.get_last_agent_message()
+    if last_agent_message:
+        return fake_user.generate_reply(last_agent_message)
+    return 'Please continue working on the task.'
+
+
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': get_fake_user_response,
+}
+
+
+def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageAction:
+    instance_copy = instance.copy()
+    instance_copy.problem_statement = f'{instance.problem_statement}\n\nHints:\nThe user has not provided all the necessary details about the issue, and there are some hidden details that are helpful. Please ask the user specific questions using non-code commands to gather the relevant information that the user has to help you solve the issue. Ensure you have all the details you require to solve the issue.'
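+    # NOTE: the appended paragraph deliberately withholds the hidden details and instructs
+    # the agent to query the simulated user (FakeUser above) for them before attempting a fix.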
+ return base_get_instruction(instance_copy, metadata) + + +def process_instance( + instance: pd.Series, + metadata: EvalMetadata, + reset_logger: bool = True, +) -> EvalOutput: + config = get_config(instance, metadata) + global fake_user + original_issue = instance.original_issue + issue = str(original_issue) + fake_user = FakeUser(issue=issue, hints=instance.hints_text, files=instance.files) + + # Setup the logger properly, so you can run multi-processing to parallelize the evaluation + if reset_logger: + log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') + reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir) + else: + logger.info(f'Starting evaluation for instance {instance.instance_id}.') + + runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + + try: + initialize_runtime(runtime, instance, metadata) + + message_action = get_instruction(instance, metadata) + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=message_action, + runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[ + metadata.agent_class + ], + ) + ) + + # if fatal error, throw EvalError to trigger re-run + if ( + state + and state.last_error + and 'fatal error during agent execution' in state.last_error + and 'stuck in a loop' not in state.last_error + ): + raise EvalException('Fatal error detected: ' + state.last_error) + + # Get git patch + return_val = complete_runtime(runtime, instance) + git_patch = return_val['git_patch'] + logger.info( + f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------' + ) + finally: + runtime.close() + + # Prepare test result + test_result = { + 'git_patch': git_patch, + } + + if state is None: + raise ValueError('State should not be None.') + + histories = [event_to_dict(event) for event in state.history] + metrics = state.metrics.get() if state.metrics else None + + # Save the output + instruction = message_action.content + if message_action.image_urls: + instruction += ( + '\n\n' + '\n'.join(message_action.image_urls) + '' + ) + output = EvalOutput( + instance_id=instance.instance_id, + instruction=instruction, + instance=instance.to_dict(), + test_result=test_result, + metadata=metadata, + history=histories, + metrics=metrics, + error=state.last_error if state and state.last_error else None, + ) + return output + + +if __name__ == '__main__': + parser = get_parser() + parser.add_argument( + '--dataset', + type=str, + default='cmu-lti/interactive-swe', + help='dataset to evaluate on', + ) + parser.add_argument( + '--split', + type=str, + default='test', + help='split to evaluate on', + ) + + args, _ = parser.parse_known_args() + + # Load dataset from huggingface datasets + dataset = load_dataset(args.dataset, split=args.split) + swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id') + logger.info( + f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks' + ) + llm_config = None + if args.llm_config: + llm_config = get_llm_config_arg(args.llm_config) + llm_config.log_completions = True + # modify_params must be False for evaluation purpose, for reproducibility and accurancy of results + llm_config.modify_params = False + + if llm_config is None: + raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + + # Get condenser config from environment variable + condenser_name = 
os.environ.get('EVAL_CONDENSER') + if condenser_name: + condenser_config = get_condenser_config_arg(condenser_name) + if condenser_config is None: + raise ValueError( + f'Could not find Condenser config: EVAL_CONDENSER={condenser_name}' + ) + else: + # If no specific condenser config is provided via env var, default to NoOpCondenser + condenser_config = NoOpCondenserConfig() + logger.debug( + 'No Condenser config provided via EVAL_CONDENSER, using NoOpCondenser.' + ) + + details = {'mode': 'interact'} + _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls) + + dataset_descrption = ( + args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__') + ) + metadata = make_metadata( + llm_config, + dataset_descrption, + args.agent_cls, + args.max_iterations, + args.eval_note, + args.eval_output_dir, + details=details, + condenser_config=condenser_config, + ) + + output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + print(f'### OUTPUT FILE: {output_file} ###') + + # Run evaluation in iterative mode: + # If a rollout fails to output AgentFinishAction, we will try again until it succeeds OR total 3 attempts have been made. + ITERATIVE_EVAL_MODE = ( + os.environ.get('ITERATIVE_EVAL_MODE', 'false').lower() == 'true' + ) + ITERATIVE_EVAL_MODE_MAX_ATTEMPTS = int( + os.environ.get('ITERATIVE_EVAL_MODE_MAX_ATTEMPTS', '3') + ) + + if not ITERATIVE_EVAL_MODE: + # load the dataset + instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit) + if len(instances) > 0 and not isinstance( + instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str + ): + for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']: + instances[col] = instances[col].apply(lambda x: str(x)) + run_evaluation( + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, + timeout_seconds=8 + * 60 + * 60, # 8 hour PER instance should be more than enough + max_retries=5, + ) + else: + critic = AgentFinishedCritic() + + def get_cur_output_file_path(attempt: int) -> str: + return ( + f'{output_file.removesuffix(".jsonl")}.critic_attempt_{attempt}.jsonl' + ) + + eval_ids = None + for attempt in range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1): + cur_output_file = get_cur_output_file_path(attempt) + logger.info( + f'Running evaluation with critic {critic.__class__.__name__} for attempt {attempt} of {ITERATIVE_EVAL_MODE_MAX_ATTEMPTS}.' + ) + + # For deterministic eval, we set temperature to 0.1 for (>1) attempt + # so hopefully we get slightly different results + if attempt > 1 and metadata.llm_config.temperature == 0: + logger.info( + f'Detected temperature is 0 for (>1) attempt {attempt}. Setting temperature to 0.1...' + ) + metadata.llm_config.temperature = 0.1 + + # Load instances - at first attempt, we evaluate all instances + # On subsequent attempts, we only evaluate the instances that failed the previous attempt determined by critic + instances = prepare_dataset( + swe_bench_tests, cur_output_file, args.eval_n_limit, eval_ids=eval_ids + ) + if len(instances) > 0 and not isinstance( + instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str + ): + for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']: + instances[col] = instances[col].apply(lambda x: str(x)) + + # Run evaluation - but save them to cur_output_file + logger.info( + f'Evaluating {len(instances)} instances for attempt {attempt}...' 
+ ) + run_evaluation( + instances, + metadata, + cur_output_file, + args.eval_num_workers, + process_instance, + timeout_seconds=8 + * 60 + * 60, # 8 hour PER instance should be more than enough + max_retries=5, + ) + + # When eval is done, we update eval_ids to the instances that failed the current attempt + instances_failed = [] + logger.info( + f'Use critic {critic.__class__.__name__} to check {len(instances)} instances for attempt {attempt}...' + ) + with open(cur_output_file, 'r') as f: + for line in f: + instance = json.loads(line) + try: + history = [ + event_from_dict(event) for event in instance['history'] + ] + critic_result = critic.evaluate( + history, instance['test_result'].get('git_patch', '') + ) + if not critic_result.success: + instances_failed.append(instance['instance_id']) + except Exception as e: + logger.error( + f'Error loading history for instance {instance["instance_id"]}: {e}' + ) + instances_failed.append(instance['instance_id']) + logger.info( + f'{len(instances_failed)} instances failed the current attempt {attempt}: {instances_failed}' + ) + eval_ids = instances_failed + + # If no instances failed, we break + if len(instances_failed) == 0: + break + + # Then we should aggregate the results from all attempts into the original output file + # and remove the intermediate files + logger.info( + 'Aggregating results from all attempts into the original output file...' + ) + fout = open(output_file, 'w') + added_instance_ids = set() + for attempt in reversed(range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1)): + cur_output_file = get_cur_output_file_path(attempt) + if not os.path.exists(cur_output_file): + logger.warning( + f'Intermediate output file {cur_output_file} does not exist. Skipping...' + ) + continue + + with open(cur_output_file, 'r') as f: + for line in f: + instance = json.loads(line) + # Also make sure git_patch is not empty - otherwise we fall back to previous attempt (empty patch is worse than anything else) + if ( + instance['instance_id'] not in added_instance_ids + and instance['test_result'].get('git_patch', '').strip() + ): + fout.write(line) + added_instance_ids.add(instance['instance_id']) + logger.info( + f'Aggregated instances from {cur_output_file}. Total instances added so far: {len(added_instance_ids)}' + ) + fout.close() + logger.info( + f'Done! 
Total {len(added_instance_ids)} instances added to {output_file}'
+        )
diff --git a/evaluation/benchmarks/swe_bench/scripts/run_infer_interact.sh b/evaluation/benchmarks/swe_bench/scripts/run_infer_interact.sh
new file mode 100644
index 0000000000..6d97f7bb1f
--- /dev/null
+++ b/evaluation/benchmarks/swe_bench/scripts/run_infer_interact.sh
@@ -0,0 +1,131 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
+MODEL_CONFIG=$1
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
+MAX_ITER=$5
+NUM_WORKERS=$6
+SPLIT=$7
+N_RUNS=${8:-$N_RUNS}
+
+
+if [ -z "$NUM_WORKERS" ]; then
+  NUM_WORKERS=1
+  echo "Number of workers not specified, use default $NUM_WORKERS"
+fi
+checkout_eval_branch
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+if [ -z "$MAX_ITER" ]; then
+  echo "MAX_ITER not specified, use default 100"
+  MAX_ITER=100
+fi
+
+if [ -z "$RUN_WITH_BROWSING" ]; then
+  echo "RUN_WITH_BROWSING not specified, use default false"
+  RUN_WITH_BROWSING=false
+fi
+
+
+if [ -z "$DATASET" ]; then
+  echo "DATASET not specified, use default cmu-lti/interactive-swe"
+  DATASET="cmu-lti/interactive-swe"
+fi
+
+if [ -z "$SPLIT" ]; then
+  echo "SPLIT not specified, use default test"
+  SPLIT="test"
+fi
+
+if [ -n "$EVAL_CONDENSER" ]; then
+  echo "Using Condenser Config: $EVAL_CONDENSER"
+else
+  echo "No Condenser Config provided via EVAL_CONDENSER, use default (NoOpCondenser)."
+fi
+
+export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
+echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
+
+get_openhands_version
+
+echo "AGENT: $AGENT"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+echo "DATASET: $DATASET"
+echo "SPLIT: $SPLIT"
+echo "MAX_ITER: $MAX_ITER"
+echo "NUM_WORKERS: $NUM_WORKERS"
+echo "COMMIT_HASH: $COMMIT_HASH"
+echo "EVAL_CONDENSER: $EVAL_CONDENSER"
+
+# Default to NOT use Hint
+if [ -z "$USE_HINT_TEXT" ]; then
+  export USE_HINT_TEXT=false
+fi
+echo "USE_HINT_TEXT: $USE_HINT_TEXT"
+EVAL_NOTE="$OPENHANDS_VERSION"
+# if not using Hint, add -no-hint to the eval note
+if [ "$USE_HINT_TEXT" = false ]; then
+  EVAL_NOTE="$EVAL_NOTE-no-hint"
+fi
+
+if [ "$RUN_WITH_BROWSING" = true ]; then
+  EVAL_NOTE="$EVAL_NOTE-with-browsing"
+fi
+
+if [ -n "$EXP_NAME" ]; then
+  EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
+fi
+# Add condenser config to eval note if provided
+if [ -n "$EVAL_CONDENSER" ]; then
+  EVAL_NOTE="${EVAL_NOTE}-${EVAL_CONDENSER}"
+fi
+
+function run_eval() {
+  local eval_note="${1}"
+  COMMAND="poetry run python evaluation/benchmarks/swe_bench/run_infer_interact.py \
+    --agent-cls $AGENT \
+    --llm-config $MODEL_CONFIG \
+    --max-iterations $MAX_ITER \
+    --eval-num-workers $NUM_WORKERS \
+    --eval-note $eval_note \
+    --dataset $DATASET \
+    --split $SPLIT"
+
+  if [ -n "$EVAL_LIMIT" ]; then
+    echo "EVAL_LIMIT: $EVAL_LIMIT"
+    COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+  fi
+
+  # Run the command
+  eval $COMMAND
+}
+
+unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
+if [ -z "$N_RUNS" ]; then
+  N_RUNS=1
+  echo "N_RUNS not specified, use default $N_RUNS"
+fi
+
+# Skip runs if the run number is in the SKIP_RUNS list
+# read from env variable SKIP_RUNS as a comma separated list of run numbers
+SKIP_RUNS=(${SKIP_RUNS//,/ })
+for i in $(seq 1 $N_RUNS); do
+  if [[ " ${SKIP_RUNS[@]} " =~ " $i " ]]; then
+    echo "Skipping run $i"
+    continue
+  fi
+  current_eval_note="$EVAL_NOTE-run_$i"
+  echo "EVAL_NOTE: $current_eval_note"
+  run_eval $current_eval_note
+done
+
+checkout_original_branch
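For a quick look at a finished run before (or after) invoking `eval_infer.sh`, the sketch below tallies how many entries in an `output.jsonl` contain a non-empty `git_patch`, the same criterion the iterative-mode aggregation in `run_infer_interact.py` uses when merging attempts. It is a minimal sketch, not part of the harness; the path is a placeholder for the run directory created under `evaluation/evaluation_outputs/outputs/`.

```python
import json

# Placeholder path: point this at the output.jsonl of your run under
# evaluation/evaluation_outputs/outputs/.
output_file = 'output.jsonl'

total = 0
with_patch = 0
with open(output_file) as f:
    for line in f:
        record = json.loads(line)  # one EvalOutput per line
        total += 1
        patch = record.get('test_result', {}).get('git_patch', '')
        if patch.strip():  # same non-empty check the aggregation step applies
            with_patch += 1

print(f'{with_patch}/{total} instances produced a non-empty git patch')
```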