diff --git a/evaluation/benchmarks/multi_swe_bench/README.md b/evaluation/benchmarks/multi_swe_bench/README.md
index 843343f922..88843ca9ce 100644
--- a/evaluation/benchmarks/multi_swe_bench/README.md
+++ b/evaluation/benchmarks/multi_swe_bench/README.md
@@ -41,6 +41,10 @@ default, it is set to 1.
 - `language`, the language of your evaluating dataset.
 - `dataset`, the absolute position of the dataset jsonl.
 
+**Skipping errors on build**
+
+For debugging purposes, you can set `export EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=true` to continue evaluation even when instances reach maximum retries. After evaluation completes, check `maximum_retries_exceeded.jsonl` for a list of failed instances, fix those issues, and then run the evaluation again with `export EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=false`.
+
 The results will be generated in evaluation/evaluation_outputs/outputs/XXX/CodeActAgent/YYY/output.jsonl, you can refer to the [example](examples/output.jsonl).
 
 ## Runing evaluation
diff --git a/evaluation/benchmarks/multi_swe_bench/run_infer.py b/evaluation/benchmarks/multi_swe_bench/run_infer.py
index df0efd339d..1881b1e422 100644
--- a/evaluation/benchmarks/multi_swe_bench/run_infer.py
+++ b/evaluation/benchmarks/multi_swe_bench/run_infer.py
@@ -17,6 +17,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     assert_and_raise,
+    check_maximum_retries_exceeded,
     codeact_user_response,
     get_default_sandbox_config_for_eval,
     get_metrics,
@@ -843,3 +844,5 @@ if __name__ == '__main__':
         timeout_seconds=120 * 60,  # 2 hour PER instance should be more than enough
         max_retries=5,
     )
+    # Check if any instances reached maximum retries
+    check_maximum_retries_exceeded(metadata.eval_output_dir)
diff --git a/evaluation/benchmarks/swe_bench/README.md b/evaluation/benchmarks/swe_bench/README.md
index f968130211..b1858bf70a 100644
--- a/evaluation/benchmarks/swe_bench/README.md
+++ b/evaluation/benchmarks/swe_bench/README.md
@@ -38,6 +38,10 @@ Please follow instruction [here](../../README.md#setup) to setup your local deve
 > - If your LLM config has temperature=0, we will automatically use temperature=0.1 for the 2nd and 3rd attempts
 >
 > To enable this iterative protocol, set `export ITERATIVE_EVAL_MODE=true`
+>
+> **Skipping errors on build**
+>
+> For debugging purposes, you can set `export EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=true` to continue evaluation even when instances reach maximum retries. After evaluation completes, check `maximum_retries_exceeded.jsonl` for a list of failed instances, fix those issues, and then run the evaluation again with `export EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=false`.
 
 ### Running Locally with Docker
 
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index 30a40c8e66..876645b910 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -28,6 +28,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     assert_and_raise,
+    check_maximum_retries_exceeded,
     codeact_user_response,
     get_default_sandbox_config_for_eval,
     get_metrics,
@@ -968,3 +969,5 @@ if __name__ == '__main__':
     logger.info(
         f'Done! Total {len(added_instance_ids)} instances added to {output_file}'
     )
+    # Check if any instances reached maximum retries
+    check_maximum_retries_exceeded(metadata.eval_output_dir)
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index 2338277d48..aeb563240d 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -311,6 +311,76 @@ def assert_and_raise(condition: bool, msg: str):
         raise EvalException(msg)
 
 
+def log_skipped_maximum_retries_exceeded(instance, metadata, error, max_retries=5):
+    """Log and skip the instance when maximum retries are exceeded.
+
+    Args:
+        instance: The instance that failed
+        metadata: The evaluation metadata
+        error: The error that occurred
+        max_retries: The maximum number of retries that were attempted
+
+    Returns:
+        EvalOutput with the error information
+    """
+    from openhands.core.logger import openhands_logger as logger
+
+    # Log the error
+    logger.exception(error)
+    logger.error(
+        f'Maximum error retries reached for instance {instance.instance_id}. '
+        f'Check maximum_retries_exceeded.jsonl, fix the issue and run evaluation again. '
+        f'Skipping this instance and continuing with others.'
+    )
+
+    # Add the instance name to maximum_retries_exceeded.jsonl in the same folder as output.jsonl
+    if metadata and metadata.eval_output_dir:
+        retries_file_path = os.path.join(
+            metadata.eval_output_dir,
+            'maximum_retries_exceeded.jsonl',
+        )
+        try:
+            # Write the instance info as a JSON line
+            with open(retries_file_path, 'a') as f:
+                import json
+
+                # No need to get Docker image as we're not including it in the error entry
+
+                error_entry = {
+                    'instance_id': instance.instance_id,
+                    'error': str(error),
+                    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
+                }
+                f.write(json.dumps(error_entry) + '\n')
+            logger.info(f'Added instance {instance.instance_id} to {retries_file_path}')
+        except Exception as write_error:
+            logger.error(
+                f'Failed to write to maximum_retries_exceeded.jsonl: {write_error}'
+            )
+
+    return EvalOutput(
+        instance_id=instance.instance_id,
+        test_result={},
+        error=f'Maximum retries ({max_retries}) reached: {str(error)}',
+        status='error',
+    )
+
+
+def check_maximum_retries_exceeded(eval_output_dir):
+    """Check if maximum_retries_exceeded.jsonl exists and output a message."""
+    from openhands.core.logger import openhands_logger as logger
+
+    retries_file_path = os.path.join(eval_output_dir, 'maximum_retries_exceeded.jsonl')
+    if os.path.exists(retries_file_path):
+        logger.info(
+            'ATTENTION: Some instances reached maximum error retries and were skipped.'
+        )
+        logger.info(f'These instances are listed in: {retries_file_path}')
+        logger.info(
+            'Fix these instances and run evaluation again with EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=false'
+        )
+
+
 def _process_instance_wrapper(
     process_instance_func: Callable[[pd.Series, EvalMetadata, bool], EvalOutput],
     instance: pd.Series,
@@ -363,11 +433,26 @@
                     + f'[Encountered after {max_retries} retries. Please check the logs and report the issue.]'
                     + '-' * 10
                 )
-                # Raise an error after all retries & stop the evaluation
-                logger.exception(e)
-                raise RuntimeError(
-                    f'Maximum error retries reached for instance {instance.instance_id}'
-                ) from e
+
+                # Check if EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED is set to true
+                skip_errors = (
+                    os.environ.get(
+                        'EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED', 'false'
+                    ).lower()
+                    == 'true'
+                )
+
+                if skip_errors:
+                    # Use the dedicated function to log and skip maximum retries exceeded
+                    return log_skipped_maximum_retries_exceeded(
+                        instance, metadata, e, max_retries
+                    )
+                else:
+                    # Raise an error after all retries & stop the evaluation
+                    logger.exception(e)
+                    raise RuntimeError(
+                        f'Maximum error retries reached for instance {instance.instance_id}'
+                    ) from e
             msg = (
                 '-' * 10
                 + '\n'
@@ -456,6 +541,10 @@ def run_evaluation(
     output_fp.close()
     logger.info('\nEvaluation finished.\n')
 
+    # Check if any instances reached maximum retries
+    if metadata and metadata.eval_output_dir:
+        check_maximum_retries_exceeded(metadata.eval_output_dir)
+
 
 def reset_logger_for_multiprocessing(
     logger: logging.Logger, instance_id: str, log_dir: str
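For readers who want to exercise the new skip-and-record behavior outside a full evaluation run, here is a minimal, self-contained sketch of the flow the patch introduces: when `EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=true`, a failing instance is appended to `maximum_retries_exceeded.jsonl` instead of aborting the run, and that file can be read back after the run. The helper names (`record_failure`, `list_failed_instances`), the instance id, and the output directory below are illustrative only; the real logic lives in `log_skipped_maximum_retries_exceeded` and `check_maximum_retries_exceeded` in `evaluation/utils/shared.py`.

```python
# Minimal sketch (not part of the patch) of the skip-and-record flow.
# record_failure / list_failed_instances are hypothetical helper names.
import json
import os
import time


def record_failure(eval_output_dir: str, instance_id: str, error: Exception) -> str:
    """Append one failed instance to maximum_retries_exceeded.jsonl (one JSON object per line)."""
    path = os.path.join(eval_output_dir, 'maximum_retries_exceeded.jsonl')
    entry = {
        'instance_id': instance_id,
        'error': str(error),
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    }
    with open(path, 'a') as f:
        f.write(json.dumps(entry) + '\n')
    return path


def list_failed_instances(eval_output_dir: str) -> list[str]:
    """Return the instance ids recorded by a previous run, or [] if nothing was skipped."""
    path = os.path.join(eval_output_dir, 'maximum_retries_exceeded.jsonl')
    if not os.path.exists(path):
        return []
    with open(path) as f:
        return [json.loads(line)['instance_id'] for line in f if line.strip()]


if __name__ == '__main__':
    out_dir = '/tmp/eval_demo'  # placeholder for metadata.eval_output_dir
    os.makedirs(out_dir, exist_ok=True)
    skip = os.environ.get('EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED', 'false').lower() == 'true'
    if skip:
        # In the patch this branch runs once an instance exhausts max_retries.
        record_failure(out_dir, 'example__repo-123', RuntimeError('sandbox build failed'))
    print(list_failed_instances(out_dir))
```

Appending JSON Lines rather than rewriting a single JSON document means a run that dies partway through still leaves every previously recorded failure readable, which is why the patch writes one object per line and simply re-checks for the file at the end of evaluation.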