import os
import tempfile
import time
from functools import partial

import pandas as pd
from report_utils import (
    check_coverage,
    check_mutation,
    count_methods,
    get_lines_of_code,
)

from evaluation.benchmarks.testgeneval.compute_readability import compute_readability
from evaluation.benchmarks.testgeneval.constants import (
    COVERAGE_PREFIX,
    MUTATION_BUFFER,
    MUTATION_TEMPLATE,
    MUTATION_TIMEOUT,
    TESTS_SUFFIX,
)
from evaluation.benchmarks.testgeneval.metrics import (
    bleu,
    edit_sim,
    exact_match,
    rouge_l,
)
from evaluation.benchmarks.testgeneval.pygments_utils import tokenize_code
from evaluation.benchmarks.testgeneval.run_infer import get_instance_docker_image
from evaluation.benchmarks.testgeneval.test_filter import filter_tests
from evaluation.benchmarks.testgeneval.test_spec import (
    TestGenEvalInstance,
    TestSpec,
    make_test_spec,
)
from evaluation.benchmarks.testgeneval.utils import load_testgeneval_dataset
from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
)
from openhands.core.config import OpenHandsConfig, SandboxConfig, get_evaluation_parser
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime
from openhands.events.action import CmdRunAction
from openhands.events.observation import CmdOutputObservation
from openhands.utils.async_utils import call_async_from_sync

DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/kdjain/')
logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')


def get_config(instance: pd.Series) -> OpenHandsConfig:
    base_container_image = get_instance_docker_image(instance['instance_id_swebench'])
    assert base_container_image, (
        f'Invalid container image for instance {instance["instance_id_swebench"]}.'
    )
    logger.info(f'Using instance container image: {base_container_image}.')
    return OpenHandsConfig(
        run_as_openhands=False,
        runtime=os.environ.get('RUNTIME', 'eventstream'),
        sandbox=SandboxConfig(
            base_container_image=base_container_image,
            use_host_network=False,
            timeout=1800,
            api_key=os.environ.get('ALLHANDS_API_KEY'),
            remote_runtime_api_url=os.environ.get(
                'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
            ),
        ),
        workspace_base=None,
        workspace_mount_path=None,
    )


def compute_lexical_metrics(pred_suite, gold_suite):
    pred_loc = get_lines_of_code(pred_suite)
    gold_loc = get_lines_of_code(gold_suite)
    pred_methods = count_methods(pred_suite)
    gold_methods = count_methods(gold_suite)
    readability_pred = compute_readability(pred_suite)
    readability_gold = compute_readability(gold_suite)

    preds = tokenize_code(pred_suite)
    golds = tokenize_code(gold_suite)

    return {
        'pred_loc': pred_loc,
        'gold_loc': gold_loc,
        'pred_readability': readability_pred,
        'gold_readability': readability_gold,
        'pred_methods': pred_methods,
        'gold_methods': gold_methods,
        'bleu': bleu(preds, golds),
        'xmatch': exact_match(preds, golds),
        'edit_sim': edit_sim(preds, golds),
        'rouge_f': rouge_l(golds, preds)['f'],
        'rouge_p': rouge_l(golds, preds)['p'],
        'rouge_r': rouge_l(golds, preds)['r'],
    }


def run_command(runtime, command, timeout=600):
    action = CmdRunAction(command=command)
    action.set_hard_timeout(timeout)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0
    return obs

def run_tests(runtime, instance, test_script, log_file='/tmp/test_output.log'):
    # Launch the test script in the background and capture its PID so it can be polled.
    action = CmdRunAction(command=f'bash {test_script} > {log_file} 2>&1 & echo $!')
    action.set_hard_timeout(60)
    obs = runtime.run_action(action)

    assert isinstance(obs, CmdOutputObservation), 'Failed to start test script.'
    pid = obs.content.split()[-1].strip()
    logger.info(f'[{instance.instance_id}] Test process started with PID: {pid}')

    # Poll every 30 seconds until the process exits or the timeout is reached.
    start_time = time.time()
    timeout = 1800
    while True:
        elapsed_time = time.time() - start_time
        if elapsed_time > timeout:
            logger.info(f'[{instance.instance_id}] Test process timed out.')
            instance['test_result']['report']['test_timeout'] = True
            break

        check_action = CmdRunAction(command=f'ps -p {pid} > /dev/null; echo $?')
        check_obs = runtime.run_action(check_action)
        if (
            isinstance(check_obs, CmdOutputObservation)
            and len(check_obs.content.split()) > 0
            and check_obs.content.split()[-1].strip() == '1'
        ):
            logger.info(f'[{instance.instance_id}] Test process completed.')
            break
        time.sleep(30)

    test_action = CmdRunAction(command=f'cat {log_file}')
    test_action.set_hard_timeout(300)
    test_obs = runtime.run_action(test_action)
    assert isinstance(test_obs, CmdOutputObservation), 'Failed to retrieve test output.'
    return test_obs.exit_code, test_obs.content, elapsed_time


def run_mutation_testing(
    runtime, instance, mutation_script, log_file='/tmp/mutation_output.log'
):
    # Launch the mutation script in the background and capture its PID so it can be polled.
    action = CmdRunAction(command=f'bash {mutation_script} > {log_file} 2>&1 & echo $!')
    action.set_hard_timeout(60)
    obs = runtime.run_action(action)

    assert isinstance(obs, CmdOutputObservation), 'Failed to start mutation script.'
    pid = obs.content.split()[-1].strip()
    logger.info(f'[{instance.instance_id}] Mutation process started with PID: {pid}')

    # Poll every 30 seconds until the process exits or the timeout is reached.
    start_time = time.time()
    timeout = 4000
    while True:
        elapsed_time = time.time() - start_time
        if elapsed_time > timeout:
            logger.info(f'[{instance.instance_id}] Mutation process timed out.')
            instance['test_result']['report']['mutation_timeout'] = True
            break

        check_action = CmdRunAction(command=f'ps -p {pid} > /dev/null; echo $?')
        check_obs = runtime.run_action(check_action)
        if (
            isinstance(check_obs, CmdOutputObservation)
            and len(check_obs.content.split()) > 0
            and check_obs.content.split()[-1].strip() == '1'
        ):
            logger.info(f'[{instance.instance_id}] Mutation process completed.')
            break
        time.sleep(30)

    assert isinstance(obs, CmdOutputObservation), 'Failed to run mutation script.'
    mutation_action = CmdRunAction(command=f'cat {log_file}')
    mutation_action.set_hard_timeout(300)
    mutation_obs = runtime.run_action(mutation_action)
    assert isinstance(mutation_obs, CmdOutputObservation), (
        'Failed to retrieve mutation output.'
    )
    return mutation_obs.exit_code, mutation_obs.content


def grade_test_output(
    test_suite: str, instance: pd.Series, test_output: str, test_spec: TestSpec, runtime
):
    """Two-pass test grading with short-circuiting:

    1. Run all tests to identify passing/failing tests
    2. If no failing tests, evaluate coverage immediately
    3. Otherwise, run only passing tests for coverage analysis
    """
    unit_test_output, coverage_output = '', ''
    if TESTS_SUFFIX in test_output:
        unit_test_output = test_output.split(TESTS_SUFFIX)[0]

    if not unit_test_output:
        return (
            False,
            0,
            '',
            '',
            {
                'total_tests': 0,
                'passing_tests': 0,
                'failing_tests': 0,
                'any_pass': False,
                'all_pass': False,
                'passing_test_names': [],
                'failing_test_names': [],
            },
        )

    logger.info('Calling filter unit tests')
    filtered_content, passing_tests, failing_tests = filter_tests(
        test_suite, unit_test_output, test_spec.repo
    )

    total_tests = len(passing_tests) + len(failing_tests)
    test_stats = {
        'total_tests': total_tests,
        'passing_tests': len(passing_tests),
        'failing_tests': len(failing_tests),
        'any_pass': len(passing_tests) > 0,
        'all_pass': len(failing_tests) == 0 and total_tests > 0,
        'passing_test_names': passing_tests,
        'failing_test_names': failing_tests,
    }

    if not passing_tests:
        return False, 0, unit_test_output, coverage_output, test_stats

    # If all tests pass, evaluate coverage immediately
    if not failing_tests:
        coverage = 0
        cov_success = False
        if COVERAGE_PREFIX in test_output:
            coverage_output = test_output.split(COVERAGE_PREFIX)[1]
            _, coverage = check_coverage(coverage_output, test_spec.code_file)
            cov_success = True
        # test_stats['filtered_suite'] = test_suite
        return cov_success, coverage, unit_test_output, coverage_output, test_stats

    cov_success = False
    coverage = 0
    # Second pass - run coverage on passing tests
    if filtered_content:
        with tempfile.TemporaryDirectory() as temp_dir:
            test_suite_path = os.path.join(temp_dir, 'test_suite.py')
            with open(test_suite_path, 'w') as f:
                f.write(filtered_content)
            runtime.copy_to(test_suite_path, '/tmp')

        run_command(runtime, f'cp /tmp/test_suite.py /testbed/{test_spec.test_file}')
        _, test_output_second_pass, _ = run_tests(runtime, instance, '/tmp/test.sh')

        coverage, coverage_output, unit_test_output = 0, '', test_output_second_pass

        if COVERAGE_PREFIX in test_output_second_pass:
            coverage_output = test_output_second_pass.split(COVERAGE_PREFIX)[1]
            unit_test_output = test_output_second_pass.split(TESTS_SUFFIX)[0]
            _, coverage = check_coverage(coverage_output, test_spec.code_file)
            cov_success = True

    # test_stats['filtered_suite'] = filtered_content
    return cov_success, coverage, unit_test_output, coverage_output, test_stats


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
    log_dir: str | None = None,
) -> EvalOutput:
    """Evaluate agent performance on a TestGenEval problem instance.

    Note that this signature differs from the expected input to `run_evaluation`. Use
    `functools.partial` to provide optional arguments before passing to the evaluation harness.

    Args:
        log_dir (str | None, default=None): Path to directory where log files will be written. Must
            be provided if `reset_logger` is set.

    Raises:
        AssertionError: if the `reset_logger` flag is set without a provided log directory.
    """
    if reset_logger:
        assert log_dir is not None, (
            "Can't reset logger without a provided log directory."
        )
        os.makedirs(log_dir, exist_ok=True)
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    config = get_config(instance)
    id = instance.instance_id
    logger.info(f'Starting evaluation for instance {id}.')

    instance['test_result']['id'] = id
    instance['test_result']['report'] = {
        'test_output': '',
        # 'coverage_output': '',
        # 'mutation_output': '',
        'empty_generation': False,
        'error_eval': False,
        'all_tests_pass': False,
        'tests_pass': False,
        'test_timeout': False,
        'mutation_timeout': False,
        'coverage_success': False,
        'mutation_success': False,
        'coverage': 0,
        'mutation_score': 0,
        'mutation_error_interval': -1,
        'num_mutants': -1,
    }

    instance['test_result']['lexical'] = {
        'pred_loc': -1,
        'gold_loc': -1,
        'pred_readability': -1,
        'gold_readability': -1,
        'pred_methods': -1,
        'gold_methods': -1,
        'bleu': -1,
        'xmatch': -1,
        'edit_sim': -1,
        'rouge_f': -1,
        'rouge_p': -1,
        'rouge_r': -1,
    }

    if instance['test_suite'] == '' or instance['test_suite'] is None:
        instance['test_result']['report']['empty_generation'] = True
        return EvalOutput(
            instance_id=instance.instance_id, test_result=instance['test_result']
        )

    if not args.skip_lexical:
        lexical_metrics = compute_lexical_metrics(
            instance['test_suite'], instance['instance']['test_src']
        )
        instance['test_result']['lexical'] = lexical_metrics

    test_suite = instance['test_suite']
    test_spec: TestSpec = instance['test_spec']
    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)
    with tempfile.TemporaryDirectory() as temp_dir:
        test_suite_path = os.path.join(temp_dir, 'test_suite.py')
        with open(test_suite_path, 'w') as f:
            f.write(test_suite)
        runtime.copy_to(test_suite_path, '/tmp')

        test_script_path = os.path.join(temp_dir, 'test.sh')
        with open(test_script_path, 'w') as f:
            f.write(test_spec.test_script)
        runtime.copy_to(test_script_path, '/tmp')

        mutation_script_path = os.path.join(temp_dir, 'mutation.sh')
        with open(mutation_script_path, 'w') as f:
            f.write(test_spec.mutation_script)
        runtime.copy_to(mutation_script_path, '/tmp')

    try:
        run_command(runtime, 'chmod +x /tmp/test.sh /tmp/mutation.sh')
        run_command(runtime, f'cp /tmp/test_suite.py /testbed/{test_spec.test_file}')

        # First pass - run all tests
        _, test_output, test_time = run_tests(runtime, instance, '/tmp/test.sh')

        # Grade tests with two-pass approach
        coverage_success, coverage, unit_test_output, coverage_output, test_stats = (
            grade_test_output(test_suite, instance, test_output, test_spec, runtime)
        )

        # Update report with test statistics
        instance['test_result']['report'].update(
            {
                'test_output': unit_test_output,
                # 'coverage_output': coverage_output,
                'tests_pass': test_stats['any_pass'],  # Changed to use any_pass
                'all_tests_pass': test_stats['all_pass'],  # Added all_pass metric
                'coverage_success': coverage_success,
                'coverage': coverage if coverage_success else 0,
                'test_stats': test_stats,
            }
        )

        # Only run mutation testing if we have passing tests and coverage
        if (
            not args.skip_mutation
            and coverage_success
            and test_stats['any_pass']
            and coverage > 0
        ):
            mutation_timeout = max(10, 1.5 * test_time)
            mutation_toml = MUTATION_TEMPLATE.format(
                test_cmd=test_spec.test_cmd,
                source_fp=test_spec.code_file,
                timeout=mutation_timeout,
            )

            with tempfile.TemporaryDirectory() as temp_dir:
                mutation_toml_path = os.path.join(temp_dir, 'mutation.toml')
                with open(mutation_toml_path, 'w') as f:
                    f.write(mutation_toml)
                runtime.copy_to(mutation_toml_path, '/tmp')

            run_command(runtime, 'cp /tmp/mutation.toml /testbed/mutation.toml')

            mutation_code, mutation_output = run_mutation_testing(
                runtime, instance, '/tmp/mutation.sh'
            )
            # instance['test_result']['report']['mutation_output'] = mutation_output
            if mutation_output and mutation_code == 0:
                (
                    mutation_success,
                    num_mutants,
                    mutation_score,
                    mutation_confidence_interval,
                ) = check_mutation(mutation_output)
                instance['test_result']['report']['num_mutants'] = num_mutants
                instance['test_result']['report']['mutation_success'] = mutation_success
                instance['test_result']['report']['mutation_score'] = mutation_score
                instance['test_result']['report']['mutation_error_interval'] = (
                    mutation_confidence_interval
                )

        return EvalOutput(
            instance_id=instance.instance_id, test_result=instance['test_result']
        )
    except Exception as e:
        logger.error(f'Error processing instance {instance.instance_id}: {e}')
        raise RuntimeError(
            instance.instance_id,
            'Unexpected output...',
            logger,
        )

    finally:
        runtime.close()


def count_and_log_fields(evaluated_predictions, fields, key):
    """Log, for each specified field, the number of valid entries and their mean
    value, ignoring entries equal to -1. If all values for a field are -1, log -1.

    :param evaluated_predictions: DataFrame containing evaluation results
    :param fields: List of field names to count
    :param key: Key to access the field values ('report' or 'lexical')
    """

    def count_field(row, field):
        value = row['test_result'][key][field]
        return (
            value if value != -1 else None
        )  # Ignore -1 fields by treating them as None

    for field in fields:
        # Extract the valid values for the field, ignoring -1
        valid_values = evaluated_predictions.apply(
            count_field, args=(field,), axis=1
        ).dropna()

        if valid_values.empty:  # If all values are -1
            logger.info(f'# {field}: -1 (All values are -1)')
        else:
            count = valid_values.sum()  # Sum of valid values
            length = len(valid_values)  # Count of valid entries
            logger.info(f'# {field}: {length}. ({count / length:.2f})')


if __name__ == '__main__':
    parser = get_evaluation_parser()
    parser.add_argument(
        '--input-file', type=str, required=True, help='Path to input predictions file'
    )
    parser.add_argument(
        '--dataset',
        type=str,
        default='kjain14/testgeneval',
        help='Dataset to evaluate on',
    )
    parser.add_argument(
        '--split', type=str, default='test', help='Split to evaluate on'
    )
    parser.add_argument(
        '--skip_mutation', action='store_true', help='Skip mutation testing'
    )
    parser.add_argument(
        '--skip_lexical', action='store_true', help='Skip lexical metrics'
    )
    parser.add_argument(
        '--mutation_timeout',
        type=int,
        default=MUTATION_TIMEOUT,
        help='Mutation timeout',
    )
    parser.add_argument(
        '--mutation_buffer',
        type=int,
        default=MUTATION_BUFFER,
        help='Mutation buffer',
    )
    args, _ = parser.parse_known_args()

    dataset: list[TestGenEvalInstance] = load_testgeneval_dataset(
        args.dataset, args.split
    )

    logger.info(
        f'Loaded dataset {args.dataset} with split {args.split} for evaluation.'
    )

    # Load predictions
    assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
    predictions = pd.read_json(args.input_file, lines=True)
    assert 'instance_id' in predictions.columns, (
        'Input file must contain instance_id column.'
    )
    # Require test_suite either as a top-level column or nested inside test_result.
    if 'test_suite' not in predictions.columns and not (
        'test_result' in predictions.columns
        and 'test_suite' in predictions['test_result'].iloc[0]
    ):
        raise ValueError(
            'Input file must contain test_suite column OR test_result column with test_suite field.'
        )

    if 'instance_id_swebench' not in predictions.columns:
        predictions['instance_id_swebench'] = predictions['instance'].apply(
            lambda x: x['instance_id_swebench']
        )

    # Require instance_id either as a top-level column or nested inside instance.
    if 'instance_id' not in predictions.columns and not (
        'instance_id' in predictions['instance'].iloc[0]
    ):
        raise ValueError(
            'Input file must contain id column OR instance column with id field.'
        )
    if 'test_suite' not in predictions.columns:
        predictions['test_suite'] = predictions['test_result'].apply(
            lambda x: x['test_suite']
        )

    assert len(predictions['instance_id'].unique()) == len(predictions), (
        'instance_id column must be unique.'
    )

    assert {'instance_id_swebench', 'test_suite', 'instance_id'}.issubset(
        set(predictions.columns)
    ), 'Input file must contain instance_id, instance_id_swebench and test_suite columns.'

    predictions['test_spec'] = predictions['instance'].apply(
        lambda x: make_test_spec(x, args.mutation_timeout, args.mutation_buffer)
    )

    output_file = args.input_file.replace('.jsonl', '.testgeneval.jsonl')
    instances = prepare_dataset(predictions, output_file, args.eval_n_limit)

    # If possible, load the relevant metadata to avoid issues with `run_evaluation`.
    metadata: EvalMetadata | None = None
    metadata_filepath = os.path.join(os.path.dirname(args.input_file), 'metadata.json')
    if os.path.exists(metadata_filepath):
        with open(metadata_filepath, 'r') as metadata_file:
            data = metadata_file.read()
            metadata = EvalMetadata.model_validate_json(data)

    # The evaluation harness constrains the signature of `process_instance_func` but we need to
    # pass extra information. Build a new function object to avoid issues with multiprocessing.
    process_instance_func = partial(
        process_instance, log_dir=output_file.replace('.jsonl', '.logs')
    )
    run_evaluation(
        instances,
        metadata=metadata,
        output_file=output_file,
        num_workers=args.eval_num_workers,
        process_instance_func=process_instance_func,
    )

    # Load evaluated predictions & print number of resolved predictions
    evaluated_predictions = pd.read_json(output_file, lines=True)
    report_fields = [
        'coverage',
        'mutation_score',
        'tests_pass',
        'all_tests_pass',
        'empty_generation',
        'coverage_success',
        'test_timeout',
        'error_eval',
    ]
    lexical_fields = [
        'pred_loc',
        'gold_loc',
        'pred_methods',
        'gold_methods',
        'bleu',
        'xmatch',
        'edit_sim',
        'rouge_f',
        'rouge_p',
        'rouge_r',
    ]

    # Log report and lexical fields
    count_and_log_fields(evaluated_predictions, report_fields, key='report')
    count_and_log_fields(evaluated_predictions, lexical_fields, key='lexical')