Fix: Continue evaluation when an instance fails after max retries (#8868)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Xingyao Wang <xingyaoww@gmail.com>
Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>
juanmichelini 2025-07-17 00:42:44 +02:00 committed by GitHub
parent b057af8d63
commit ea50fe4e3c
5 changed files with 108 additions and 5 deletions

View File

@@ -41,6 +41,10 @@ default, it is set to 1.
- `language`, the language of the dataset you are evaluating.
- `dataset`, the absolute path to the dataset jsonl file.

**Skipping errors on build**

For debugging purposes, you can set `export EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=true` to continue evaluation even when instances reach maximum retries. After evaluation completes, check `maximum_retries_exceeded.jsonl` for a list of failed instances, fix those issues, and then run the evaluation again with `export EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=false`.
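
If you just need a quick look at which instances were skipped, a minimal sketch along the following lines works; the path is a placeholder for your run's output directory, and the field names match the entries the harness appends (`instance_id`, `error`, `timestamp`):

```python
import json

# Placeholder: point this at the eval_output_dir of your run.
retries_file = 'evaluation/evaluation_outputs/outputs/XXX/CodeActAgent/YYY/maximum_retries_exceeded.jsonl'

with open(retries_file) as f:
    for line in f:
        entry = json.loads(line)
        # Each line records the failed instance, the error message, and when it was skipped.
        print(entry['instance_id'], '-', entry['error'])
```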

The results will be generated in `evaluation/evaluation_outputs/outputs/XXX/CodeActAgent/YYY/output.jsonl`; you can refer to the [example](examples/output.jsonl).

## Running evaluation

View File

@@ -17,6 +17,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    assert_and_raise,
    check_maximum_retries_exceeded,
    codeact_user_response,
    get_default_sandbox_config_for_eval,
    get_metrics,
@@ -843,3 +844,5 @@ if __name__ == '__main__':
        timeout_seconds=120 * 60,  # 2 hour PER instance should be more than enough
        max_retries=5,
    )
    # Check if any instances reached maximum retries
    check_maximum_retries_exceeded(metadata.eval_output_dir)

View File

@@ -38,6 +38,10 @@ Please follow instruction [here](../../README.md#setup) to setup your local deve
> - If your LLM config has temperature=0, we will automatically use temperature=0.1 for the 2nd and 3rd attempts
>
> To enable this iterative protocol, set `export ITERATIVE_EVAL_MODE=true`
>
> **Skipping errors on build**
>
> For debugging purposes, you can set `export EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=true` to continue evaluation even when instances reach maximum retries. After evaluation completes, check `maximum_retries_exceeded.jsonl` for a list of failed instances, fix those issues, and then run the evaluation again with `export EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=false`.
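
For orientation, the decision this flag controls boils down to something like the sketch below. It is a simplified mirror of the retry-handling logic in `evaluation.utils.shared`, not the harness code itself, and the helper names here are illustrative:

```python
import os

def _skip_on_exhausted_retries() -> bool:
    # Same opt-in check the harness performs via the environment variable.
    return os.environ.get('EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED', 'false').lower() == 'true'

def handle_exhausted_retries(instance_id: str, error: Exception):
    if _skip_on_exhausted_retries():
        # The harness also appends the instance to maximum_retries_exceeded.jsonl
        # and returns an error result so the rest of the run can continue.
        print(f'Skipping {instance_id}: {error}')
        return None
    # Default behavior: stop the whole evaluation run.
    raise RuntimeError(f'Maximum error retries reached for instance {instance_id}') from error
```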
### Running Locally with Docker

View File

@@ -28,6 +28,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    assert_and_raise,
    check_maximum_retries_exceeded,
    codeact_user_response,
    get_default_sandbox_config_for_eval,
    get_metrics,
@@ -968,3 +969,5 @@ if __name__ == '__main__':
    logger.info(
        f'Done! Total {len(added_instance_ids)} instances added to {output_file}'
    )
    # Check if any instances reached maximum retries
    check_maximum_retries_exceeded(metadata.eval_output_dir)

View File

@@ -311,6 +311,76 @@ def assert_and_raise(condition: bool, msg: str):
        raise EvalException(msg)


def log_skipped_maximum_retries_exceeded(instance, metadata, error, max_retries=5):
    """Log and skip the instance when maximum retries are exceeded.

    Args:
        instance: The instance that failed
        metadata: The evaluation metadata
        error: The error that occurred
        max_retries: The maximum number of retries that were attempted

    Returns:
        EvalOutput with the error information
    """
    from openhands.core.logger import openhands_logger as logger

    # Log the error
    logger.exception(error)
    logger.error(
        f'Maximum error retries reached for instance {instance.instance_id}. '
        f'Check maximum_retries_exceeded.jsonl, fix the issue and run evaluation again. '
        f'Skipping this instance and continuing with others.'
    )

    # Add the instance name to maximum_retries_exceeded.jsonl in the same folder as output.jsonl
    if metadata and metadata.eval_output_dir:
        retries_file_path = os.path.join(
            metadata.eval_output_dir,
            'maximum_retries_exceeded.jsonl',
        )
        try:
            # Write the instance info as a JSON line
            with open(retries_file_path, 'a') as f:
                import json

                # No need to get Docker image as we're not including it in the error entry
                error_entry = {
                    'instance_id': instance.instance_id,
                    'error': str(error),
                    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
                }
                f.write(json.dumps(error_entry) + '\n')
            logger.info(f'Added instance {instance.instance_id} to {retries_file_path}')
        except Exception as write_error:
            logger.error(
                f'Failed to write to maximum_retries_exceeded.jsonl: {write_error}'
            )

    return EvalOutput(
        instance_id=instance.instance_id,
        test_result={},
        error=f'Maximum retries ({max_retries}) reached: {str(error)}',
        status='error',
    )
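
# Example of a line appended to maximum_retries_exceeded.jsonl (illustrative values, not taken from a real run):
# {"instance_id": "example__instance-001", "error": "RuntimeError: ...", "timestamp": "2025-07-17 00:42:44"}
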
def check_maximum_retries_exceeded(eval_output_dir):
    """Check if maximum_retries_exceeded.jsonl exists and output a message."""
    from openhands.core.logger import openhands_logger as logger

    retries_file_path = os.path.join(eval_output_dir, 'maximum_retries_exceeded.jsonl')
    if os.path.exists(retries_file_path):
        logger.info(
            'ATTENTION: Some instances reached maximum error retries and were skipped.'
        )
        logger.info(f'These instances are listed in: {retries_file_path}')
        logger.info(
            'Fix these instances and run evaluation again with EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=false'
        )


def _process_instance_wrapper(
    process_instance_func: Callable[[pd.Series, EvalMetadata, bool], EvalOutput],
    instance: pd.Series,
@@ -363,11 +433,26 @@ def _process_instance_wrapper(
                    + f'[Encountered after {max_retries} retries. Please check the logs and report the issue.]'
                    + '-' * 10
                )
                # Raise an error after all retries & stop the evaluation
                logger.exception(e)
                raise RuntimeError(
                    f'Maximum error retries reached for instance {instance.instance_id}'
                ) from e
                # Check if EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED is set to true
                skip_errors = (
                    os.environ.get(
                        'EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED', 'false'
                    ).lower()
                    == 'true'
                )
                if skip_errors:
                    # Use the dedicated function to log and skip maximum retries exceeded
                    return log_skipped_maximum_retries_exceeded(
                        instance, metadata, e, max_retries
                    )
                else:
                    # Raise an error after all retries & stop the evaluation
                    logger.exception(e)
                    raise RuntimeError(
                        f'Maximum error retries reached for instance {instance.instance_id}'
                    ) from e
            msg = (
                '-' * 10
                + '\n'
@@ -456,6 +541,10 @@ def run_evaluation(
    output_fp.close()
    logger.info('\nEvaluation finished.\n')

    # Check if any instances reached maximum retries
    if metadata and metadata.eval_output_dir:
        check_maximum_retries_exceeded(metadata.eval_output_dir)


def reset_logger_for_multiprocessing(
    logger: logging.Logger, instance_id: str, log_dir: str