Mirror of https://github.com/OpenHands/OpenHands.git, synced 2025-12-26 13:52:43 +08:00
* add initial version of swebench-docker eval
* update the branch of git repo
* add poetry run
* download dev set too and pre-load f2p and p2p
* update eval infer script
* increase timeout
* add poetry run
* install swebench from our fork
* update script
* update loc
* support single instance debug
* replace \r\n from model patch
* replace eval docker from namespace xingyaoww
* update script to auto detect swe-bench format jsonl
* support eval infer on single instance id
* change log output dir to logs
* update summarise result script
* update README
* update readme
* tweak branch
* Update evaluation/swe_bench/scripts/eval/prep_eval.sh

Co-authored-by: Graham Neubig <neubig@gmail.com>
40 lines
1.2 KiB
Python
import json
import sys


def extract_test_results(json_file_path):
    """Parse a SWE-bench evaluation report and return (passed, all) instance ID sets."""
    passed_instances = set()
    all_instances = set()

    with open(json_file_path, 'r') as file:
        report = json.load(file)

    # Instances listed under 'resolved' are the ones that passed evaluation.
    for instance_id in report['resolved']:
        passed_instances.add(instance_id)

    # Every instance mentioned under any key of the report counts toward the total.
    for instance_ids in report.values():
        for instance_id in instance_ids:
            all_instances.add(instance_id)

    return passed_instances, all_instances


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print(
            'Usage: poetry run python summarise_results.py <path_to_report_json_file>'
        )
        sys.exit(1)
    json_file_path = sys.argv[1]
    passed_instances, all_instances = extract_test_results(json_file_path)
    succ_rate = len(passed_instances) / len(all_instances)
    print(
        f'\nPassed {len(passed_instances)} tests, total {len(all_instances)} tests, resolve rate = {succ_rate:.2%}'
    )
    print('PASSED TESTS:')
    print(sorted(passed_instances))
    print('FAILED TESTS:')
    print(sorted(all_instances - passed_instances))
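
For context, a minimal usage sketch of extract_test_results. It assumes this file is importable as summarise_results and that the report JSON maps category names to lists of instance IDs, with only 'resolved' treated specially; the 'applied' key and the instance IDs below are illustrative, not a documented schema.

# Minimal usage sketch (assumptions: the script is importable as
# `summarise_results`; the 'applied' key and instance IDs are illustrative).
import json
import tempfile

from summarise_results import extract_test_results

# A toy report: 'resolved' marks passing instances; every key's list
# contributes to the set of all instances.
report = {
    'resolved': ['django__django-11099', 'sympy__sympy-13480'],
    'applied': [
        'django__django-11099',
        'sympy__sympy-13480',
        'astropy__astropy-14365',
    ],
}

# Write the toy report to a temporary JSON file, as the script reads from disk.
with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
    json.dump(report, f)
    path = f.name

passed, all_ids = extract_test_results(path)
print(f'{len(passed)}/{len(all_ids)} resolved')  # prints: 2/3 resolved

Note the counting convention this implies: an instance only has to appear under some key of the report to count toward the total, so the resolve rate is computed against every instance the harness touched, not only those whose patches applied cleanly.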