Fix: Continue evaluation when an instance fails after max retries (#8868)
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Xingyao Wang <xingyaoww@gmail.com>
Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>
parent b057af8d63
commit ea50fe4e3c
@@ -41,6 +41,10 @@ default, it is set to 1.
 - `language`, the language of your evaluating dataset.
 - `dataset`, the absolute path of the dataset jsonl.
+
+**Skipping errors on build**
+
+For debugging purposes, you can set `export EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=true` to continue evaluation even when instances reach maximum retries. After evaluation completes, check `maximum_retries_exceeded.jsonl` for a list of failed instances, fix those issues, and then run the evaluation again with `export EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=false`.
 
 The results will be generated in `evaluation/evaluation_outputs/outputs/XXX/CodeActAgent/YYY/output.jsonl`; you can refer to the [example](examples/output.jsonl).
 
 ## Running evaluation
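To act on these instructions, you only need to read `maximum_retries_exceeded.jsonl` back. Below is a minimal sketch (not part of this commit) that lists the failed instances; it assumes the entry format written by `log_skipped_maximum_retries_exceeded` later in this diff (`instance_id`, `error`, `timestamp`), and the output directory shown is a placeholder for your actual `eval_output_dir`.

```python
# Minimal sketch (not part of this commit): list the instances that hit the retry
# limit so they can be investigated before re-running the evaluation.
# Assumes the JSONL entry format written by log_skipped_maximum_retries_exceeded:
# {"instance_id": ..., "error": ..., "timestamp": ...}
import json
import os

# Placeholder output directory; substitute your actual eval_output_dir.
eval_output_dir = 'evaluation/evaluation_outputs/outputs/XXX/CodeActAgent/YYY'
retries_file = os.path.join(eval_output_dir, 'maximum_retries_exceeded.jsonl')

if os.path.exists(retries_file):
    with open(retries_file) as f:
        entries = [json.loads(line) for line in f if line.strip()]
    for entry in entries:
        print(f"{entry['instance_id']}: {entry['error']}")
    print(f'{len(entries)} instance(s) reached the retry limit')
else:
    print('No instances reached the retry limit.')
```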
@@ -17,6 +17,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     assert_and_raise,
+    check_maximum_retries_exceeded,
     codeact_user_response,
     get_default_sandbox_config_for_eval,
     get_metrics,
@@ -843,3 +844,5 @@ if __name__ == '__main__':
         timeout_seconds=120 * 60,  # 2 hour PER instance should be more than enough
         max_retries=5,
     )
+    # Check if any instances reached maximum retries
+    check_maximum_retries_exceeded(metadata.eval_output_dir)
@@ -38,6 +38,10 @@ Please follow instruction [here](../../README.md#setup) to setup your local deve
 > - If your LLM config has temperature=0, we will automatically use temperature=0.1 for the 2nd and 3rd attempts
 >
 > To enable this iterative protocol, set `export ITERATIVE_EVAL_MODE=true`
+>
+> **Skipping errors on build**
+>
+> For debugging purposes, you can set `export EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=true` to continue evaluation even when instances reach maximum retries. After evaluation completes, check `maximum_retries_exceeded.jsonl` for a list of failed instances, fix those issues, and then run the evaluation again with `export EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=false`.
 
 
 ### Running Locally with Docker
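The iterative protocol quoted above can be restated as a tiny helper. The sketch below is illustrative only and is not the repository's implementation; the function name and signature are hypothetical, and it simply encodes the documented rule that a base temperature of 0 becomes 0.1 on the 2nd and 3rd attempts.

```python
# Illustrative sketch only -- not the repository's implementation of the
# iterative protocol. Encodes the documented rule: a base temperature of 0 is
# bumped to 0.1 for the 2nd and 3rd attempts.
def temperature_for_attempt(base_temperature: float, attempt: int) -> float:
    """Return the sampling temperature for a given 1-indexed attempt."""
    if base_temperature == 0 and attempt in (2, 3):
        return 0.1
    return base_temperature


assert temperature_for_attempt(0.0, 1) == 0.0
assert temperature_for_attempt(0.0, 2) == 0.1
assert temperature_for_attempt(0.7, 3) == 0.7
```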
@@ -28,6 +28,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     assert_and_raise,
+    check_maximum_retries_exceeded,
     codeact_user_response,
     get_default_sandbox_config_for_eval,
     get_metrics,
@@ -968,3 +969,5 @@ if __name__ == '__main__':
     logger.info(
         f'Done! Total {len(added_instance_ids)} instances added to {output_file}'
     )
+    # Check if any instances reached maximum retries
+    check_maximum_retries_exceeded(metadata.eval_output_dir)
@@ -311,6 +311,76 @@ def assert_and_raise(condition: bool, msg: str):
         raise EvalException(msg)
 
 
+def log_skipped_maximum_retries_exceeded(instance, metadata, error, max_retries=5):
+    """Log and skip the instance when maximum retries are exceeded.
+
+    Args:
+        instance: The instance that failed
+        metadata: The evaluation metadata
+        error: The error that occurred
+        max_retries: The maximum number of retries that were attempted
+
+    Returns:
+        EvalOutput with the error information
+    """
+    from openhands.core.logger import openhands_logger as logger
+
+    # Log the error
+    logger.exception(error)
+    logger.error(
+        f'Maximum error retries reached for instance {instance.instance_id}. '
+        f'Check maximum_retries_exceeded.jsonl, fix the issue and run evaluation again. '
+        f'Skipping this instance and continuing with others.'
+    )
+
+    # Add the instance name to maximum_retries_exceeded.jsonl in the same folder as output.jsonl
+    if metadata and metadata.eval_output_dir:
+        retries_file_path = os.path.join(
+            metadata.eval_output_dir,
+            'maximum_retries_exceeded.jsonl',
+        )
+        try:
+            # Write the instance info as a JSON line
+            with open(retries_file_path, 'a') as f:
+                import json
+
+                # No need to get Docker image as we're not including it in the error entry
+
+                error_entry = {
+                    'instance_id': instance.instance_id,
+                    'error': str(error),
+                    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
+                }
+                f.write(json.dumps(error_entry) + '\n')
+            logger.info(f'Added instance {instance.instance_id} to {retries_file_path}')
+        except Exception as write_error:
+            logger.error(
+                f'Failed to write to maximum_retries_exceeded.jsonl: {write_error}'
+            )
+
+    return EvalOutput(
+        instance_id=instance.instance_id,
+        test_result={},
+        error=f'Maximum retries ({max_retries}) reached: {str(error)}',
+        status='error',
+    )
+
+
+def check_maximum_retries_exceeded(eval_output_dir):
+    """Check if maximum_retries_exceeded.jsonl exists and output a message."""
+    from openhands.core.logger import openhands_logger as logger
+
+    retries_file_path = os.path.join(eval_output_dir, 'maximum_retries_exceeded.jsonl')
+    if os.path.exists(retries_file_path):
+        logger.info(
+            'ATTENTION: Some instances reached maximum error retries and were skipped.'
+        )
+        logger.info(f'These instances are listed in: {retries_file_path}')
+        logger.info(
+            'Fix these instances and run evaluation again with EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED=false'
+        )
+
+
 def _process_instance_wrapper(
     process_instance_func: Callable[[pd.Series, EvalMetadata, bool], EvalOutput],
     instance: pd.Series,
@@ -363,11 +433,26 @@ def _process_instance_wrapper(
                     + f'[Encountered after {max_retries} retries. Please check the logs and report the issue.]'
                     + '-' * 10
                 )
-                # Raise an error after all retries & stop the evaluation
-                logger.exception(e)
-                raise RuntimeError(
-                    f'Maximum error retries reached for instance {instance.instance_id}'
-                ) from e
+
+                # Check if EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED is set to true
+                skip_errors = (
+                    os.environ.get(
+                        'EVAL_SKIP_MAXIMUM_RETRIES_EXCEEDED', 'false'
+                    ).lower()
+                    == 'true'
+                )
+
+                if skip_errors:
+                    # Use the dedicated function to log and skip maximum retries exceeded
+                    return log_skipped_maximum_retries_exceeded(
+                        instance, metadata, e, max_retries
+                    )
+                else:
+                    # Raise an error after all retries & stop the evaluation
+                    logger.exception(e)
+                    raise RuntimeError(
+                        f'Maximum error retries reached for instance {instance.instance_id}'
+                    ) from e
             msg = (
                 '-' * 10
                 + '\n'
@@ -456,6 +541,10 @@ def run_evaluation(
     output_fp.close()
     logger.info('\nEvaluation finished.\n')
 
+    # Check if any instances reached maximum retries
+    if metadata and metadata.eval_output_dir:
+        check_maximum_retries_exceeded(metadata.eval_output_dir)
+
 
 def reset_logger_for_multiprocessing(
     logger: logging.Logger, instance_id: str, log_dir: str
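With the helpers above in place, a natural follow-up is to re-run only the instances that were skipped. The sketch below is not part of this commit; it assumes the JSONL entry format written by `log_skipped_maximum_retries_exceeded` and a hypothetical pandas DataFrame of instances with an `instance_id` column.

```python
# Minimal sketch (not part of this commit): select only the instances that hit
# the retry limit so they can be re-run after the underlying issues are fixed.
# Assumes the entry format written by log_skipped_maximum_retries_exceeded and a
# hypothetical DataFrame layout with an 'instance_id' column.
import json
import os

import pandas as pd


def failed_instance_ids(eval_output_dir: str) -> set[str]:
    """Collect instance ids recorded in maximum_retries_exceeded.jsonl, if present."""
    path = os.path.join(eval_output_dir, 'maximum_retries_exceeded.jsonl')
    if not os.path.exists(path):
        return set()
    with open(path) as f:
        return {json.loads(line)['instance_id'] for line in f if line.strip()}


def select_retry_subset(dataset: pd.DataFrame, eval_output_dir: str) -> pd.DataFrame:
    """Return only the rows whose instance_id was recorded as exceeding max retries."""
    return dataset[dataset['instance_id'].isin(failed_instance_ids(eval_output_dir))]


# Hypothetical usage:
# retry_df = select_retry_subset(pd.read_json('dataset.jsonl', lines=True), output_dir)
```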