import asyncio import functools import os import re from datetime import datetime from pathlib import Path from typing import Any import pandas as pd from evaluation.utils.shared import ( EvalMetadata, EvalOutput, compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, run_evaluation, update_llm_config_for_completions_logging, ) from openhands.controller.state.state import State from openhands.core.config import ( AgentConfig, OpenHandsConfig, get_agent_config_arg, get_evaluation_parser, get_llm_config_arg, ) from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import CmdRunAction, MessageAction from openhands.runtime.base import Runtime from openhands.utils.async_utils import call_async_from_sync def discover_tasks(): """Automatically discover all available tasks by scanning directories.""" script_dir = Path(__file__).parent tasks_dir = script_dir / 'tasks' tasks = {} if not tasks_dir.exists(): return tasks for item in tasks_dir.iterdir(): if ( item.is_dir() and not item.name.startswith('.') and not item.name.startswith('__') ): # Check if it's a valid task directory evaluate_file = item / 'evaluator.py' test_outputs_file = item / 'test_outputs.py' solver_file = item / 'solution.sh' if all(f.exists() for f in [solver_file, evaluate_file, test_outputs_file]): tasks[item.name] = str(item) return tasks def algotune_user_response(state, runtime: Runtime, **kwargs): """User response function for algorithm optimization training with iterative feedback.""" base_msg = 'Please continue on whatever approach you think is suitable.\nIf you think you have solved the task, please finish the interaction.' base_msg += '\n\nIMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n' return base_msg def get_config( metadata: EvalMetadata, workspace_id: str = None, enable_volumes: bool = True ) -> OpenHandsConfig: """Configure OpenHands for algorithm optimization evaluation.""" sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.timeout = 600 # Set execution timeout to 10 minutes sandbox_config.remote_runtime_api_timeout = 600 sandbox_config.base_container_image = 'linhaowei1/algotune-openhands:v0.0.2' sandbox_config.use_host_network = True sandbox_config.enable_auto_lint = True # Set volumes based on enable_volumes parameter if enable_volumes: # Create unique workspace directory for the entire experiment if workspace_id: workspace_dir = os.path.join( os.getcwd(), 'external', 'algotune', workspace_id ) os.makedirs(workspace_dir, exist_ok=True) sandbox_config.volumes = f'{workspace_dir}:/workspace:rw' logger.info( f'Created workspace directory for {workspace_id}: {workspace_dir}' ) else: sandbox_config.volumes = 'external:/workspace:rw' else: sandbox_config.volumes = None logger.info('Volumes disabled - container will not have persistent storage') # Set unique container labels for complete isolation if workspace_id: container_labels = { 'algotune.experiment_id': workspace_id, 'algotune.model': metadata.llm_config.model.replace('/', '_').replace( ':', '_' ), 'algotune.agent': metadata.agent_class, 'algotune.pid': str(os.getpid()), 'algotune.timestamp': str(int(datetime.now().timestamp())), } else: container_labels = { 'algotune.experiment_id': 'default', 'algotune.pid': str(os.getpid()), 'algotune.timestamp': str(int(datetime.now().timestamp())), } sandbox_config.docker_runtime_kwargs = {'labels': container_labels} logger.info(f'Container labels: {container_labels}') config = OpenHandsConfig( default_agent=metadata.agent_class, run_as_openhands=False, runtime='docker', max_iterations=metadata.max_iterations, sandbox=sandbox_config, workspace_base=None, workspace_mount_path=None, debug=True, ) # Set up LLM config with logging config.set_llm_config( update_llm_config_for_completions_logging( metadata.llm_config, metadata.eval_output_dir, workspace_id or 'default' ) ) # Set up agent config agent_config = AgentConfig( enable_jupyter=False, enable_browsing=False, enable_mcp=False, condenser=metadata.condenser_config, enable_prompt_extensions=False, ) config.set_agent_config(agent_config) return config def initialize_runtime(runtime: Runtime, task_name: str): """Initialize the runtime for algorithm optimization training.""" logger.info(f'{"-" * 50} BEGIN Runtime Initialization {"-" * 50}') # Create workspace directory action = CmdRunAction(command='mkdir -p /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) assert obs.exit_code == 0 # Copy task-specific files task_dir = f'evaluation/benchmarks/algotune/tasks/{task_name}' # Copy evaluation script eval_path = f'{task_dir}/evaluator.py' runtime.copy_to(eval_path, '/workspace/') # Copy test outputs script test_outputs_path = f'{task_dir}/test_outputs.py' runtime.copy_to(test_outputs_path, '/workspace/') logger.info(f'Initialized runtime with data directory for {task_name}') logger.info(f'{"-" * 50} END Runtime Initialization {"-" * 50}') def _backup_solver_code(runtime: Runtime) -> str: """Reads the content of the solver.py file from the runtime.""" try: # Use run_action to cat the file content action = CmdRunAction(command='cat /workspace/solver.py') obs = runtime.run_action(action) if obs.exit_code == 0: return obs.content else: return f'Error reading solver file: {obs.content}' except Exception as e: return f'Failed to backup solver code: {str(e)}' def _parse_evaluation_output(output: str) -> dict[str, Any]: """Parses the complex output from the run-tests.sh script.""" results = { 'target_speedup': 0.0, 'solver_speedup': 0.0, 'validity': False, 'passed_tests': [], 'failed_tests': [], # Assuming future need 'error': 'None', } try: # Extract Target Speedup from the entire output target_speedup_match = re.search( r'--- TARGET SPEEDUP ---\s*([\d.]+)\s*--- TARGET SPEED END ---', output ) if target_speedup_match: results['target_speedup'] = float(target_speedup_match.group(1)) logger.info(f'Target speedup: {results["target_speedup"]}') # Isolate the final validation section to parse solver stats and test results if 'Running final validation on original solver...' not in output: results['error'] = 'Final validation section not found in output.' results['validity'] = False return results final_validation_section = output.split( 'Running final validation on original solver...' )[-1] logger.info(f'Final validation section: {final_validation_section}') # The second performance summary block relates to the final solver's stats perf_summary_match = re.search( r'--- Performance Summary ---\s*' r'Validity: (True|False)\s*' r'.*?' # Non-greedy match for lines between r'Calculated Speedup:\s*([\d.]+) x', final_validation_section, # Search only in the final section re.DOTALL, ) if perf_summary_match: results['solver_speedup'] = float(perf_summary_match.group(2)) logger.info(f'Solver speedup: {results["solver_speedup"]}') # Extract passed tests from the final validation block passed_tests = re.findall(r'PASSED\s+([^\n]+)', final_validation_section) results['passed_tests'] = [test.strip() for test in passed_tests] logger.info(f'Passed tests: {results["passed_tests"]}') # Determine overall validity based on the final test run summary summary_line_match = re.search( r'={2,}\s(\d+\spassed.*)\s={2,}', final_validation_section ) if summary_line_match: summary_line = summary_line_match.group(1).strip() logger.info(f'Summary line: {summary_line}') # If the summary contains "failed" or "errors", it's not valid. if ( 'failed' not in summary_line and 'errors' not in summary_line and 'passed' in summary_line ): results['validity'] = True else: results['error'] = f'Final validation failed: {summary_line}' else: results['error'] = 'Could not parse final test summary.' results['validity'] = False except Exception as e: results['error'] = f'Failed to parse output: {str(e)}' results['validity'] = False return results def evaluate_test_cases(runtime: Any, task_name: str) -> dict[str, Any]: """Evaluate the final solution on test instances using the evaluator.py script.""" logger.info(f'{"-" * 50} BEGIN Test Instance Evaluation {"-" * 50}') backup_solver_code = _backup_solver_code(runtime) # Prepare and run the evaluation script task_dir = f'evaluation/benchmarks/algotune/tasks/{task_name}' eval_script_path = f'{task_dir}/run-tests.sh' runtime.copy_to(eval_script_path, '/workspace/') action = CmdRunAction(command='cd /workspace && bash run-tests.sh', blocking=True) action.set_hard_timeout(600) test_results = [] summary = {} try: obs = runtime.run_action(action) full_output = obs.content if obs.exit_code == 0: parsed_data = _parse_evaluation_output(full_output) test_result = { 'instance_id': f'{task_name}_test', 'valid': parsed_data['validity'], 'score': parsed_data[ 'solver_speedup' ], # Main score is the achieved speedup 'target_speedup': parsed_data['target_speedup'], 'passed_tests': parsed_data['passed_tests'], 'error': parsed_data['error'], 'solver_code': backup_solver_code, 'timestamp': datetime.now().isoformat(), 'evaluation_output': full_output, } else: # Evaluation script itself failed to run test_result = { 'instance_id': f'{task_name}_test', 'valid': False, 'score': 0.0, 'target_speedup': 0.0, 'passed_tests': [], 'error': f'Evaluation script failed with exit code {obs.exit_code}: {full_output}', 'solver_code': backup_solver_code, 'timestamp': datetime.now().isoformat(), 'evaluation_output': full_output, } test_results.append(test_result) except Exception as e: test_result = { 'instance_id': f'{task_name}_test', 'valid': False, 'score': 0.0, 'target_speedup': 0.0, 'passed_tests': [], 'error': f'Unexpected error during evaluation: {str(e)}', 'solver_code': backup_solver_code, 'timestamp': datetime.now().isoformat(), 'evaluation_output': 'Execution failed before output could be captured.', } test_results.append(test_result) # Log detailed progress for res in test_results: status = '✓ PASSED' if res['valid'] else '✗ FAILED' logger.info(f'Test evaluation {status} for instance {res["instance_id"]}') logger.info( f' Solver Speedup: {res["score"]:.2f}x, Target Speedup: {res["target_speedup"]:.2f}x' ) if res['valid']: logger.info( f' Passed Tests ({len(res["passed_tests"])}): {", ".join(res["passed_tests"])}' ) else: logger.info(f' Error: {res["error"]}') # Calculate summary statistics valid_results = [r for r in test_results if r['valid']] summary = { 'total_test_instances': len(test_results), 'valid_solutions': len(valid_results), 'test_results': test_results, } if valid_results: scores = [r['score'] for r in valid_results] summary['average_score'] = sum(scores) / len(scores) summary['total_score'] = sum(scores) summary['min_score'] = min(scores) summary['max_score'] = max(scores) else: summary.update( { 'average_score': 0.0, 'total_score': 0.0, 'min_score': 0.0, 'max_score': 0.0, } ) logger.info( f'Test evaluation completed: {len(valid_results)}/{len(test_results)} instances solved' ) if valid_results: logger.info(f'Average speedup: {summary["average_score"]:.4f}x') logger.info(f'{"-" * 50} END Test Instance Evaluation {"-" * 50}') return summary def process_training_and_testing( metadata: EvalMetadata, task_name: str, reset_logger: bool = True, enable_volumes: bool = True, ) -> EvalOutput: """Process training on validation cases and testing on remaining cases.""" # Create unique workspace_id with timestamp to support multiple runs model_name = ( metadata.llm_config.model.split('/')[-1].replace(':', '_').replace('@', '-') ) agent_name = metadata.agent_class timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')[ :-3 ] # Include milliseconds for uniqueness workspace_id = f'{task_name}_{agent_name}_{model_name}_{timestamp}_experiment' config = get_config(metadata, workspace_id, enable_volumes) if reset_logger: log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') reset_logger_for_multiprocessing(logger, workspace_id, log_dir) else: logger.info(f'Starting training and testing for {task_name}.') # read from task_dir problem_statement = open( f'evaluation/benchmarks/algotune/tasks/{task_name}/problem_statement.txt' ).read() # Prepare instruction for training phase instruction = f"""You are tasked with developing and optimizing an algorithm. **Task Description:** {problem_statement} You should create Solver class and function solve() in `/workspace/solver.py` by yourself. The additional packages have been installed in this environment: /usr/local/bin/python. """ # Create unique session ID for container isolation unique_sid = f'{workspace_id}_{os.getpid()}_{int(datetime.now().timestamp())}' runtime = create_runtime(config, sid=unique_sid) call_async_from_sync(runtime.connect) try: initialize_runtime(runtime, task_name) # Create partial function for user response user_response_fn = functools.partial(algotune_user_response, runtime=runtime) # Run the controller for training phase state: State | None = asyncio.run( run_controller( config=config, initial_user_action=MessageAction(content=instruction), runtime=runtime, fake_user_response_fn=user_response_fn, ) ) if state is None: raise ValueError('State should not be None.') # After training, evaluate on test cases test_results = evaluate_test_cases(runtime, task_name) metrics = state.metrics.get() if state.metrics else None histories = compatibility_for_eval_history_pairs(state.history) # Save the output output = EvalOutput( instance_id=workspace_id, instance={'task_name': task_name}, instruction=instruction, metadata=metadata, history=histories, metrics=metrics, error=state.last_error if state and state.last_error else None, test_result={'result_summary': test_results}, ) return output finally: # Ensure runtime is properly closed to release resources try: runtime.close() logger.info(f'Runtime closed successfully for workspace: {workspace_id}') except Exception as e: logger.warning(f'Failed to close runtime for workspace {workspace_id}: {e}') def process_task( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, ) -> EvalOutput: """ Wrapper function to process a single task, called by run_evaluation. """ task_name = instance['task_name'] enable_volumes = metadata.details.get('enable_volumes', False) return process_training_and_testing( metadata=metadata, task_name=task_name, reset_logger=reset_logger, enable_volumes=enable_volumes, ) if __name__ == '__main__': parser = get_evaluation_parser() available_tasks = discover_tasks() optim_task_choices = ['all'] + list(available_tasks.keys()) parser.add_argument( '--optim_task', type=str, choices=optim_task_choices, default='all', help=f'Algorithm optimization task to run. Use "all" to run all tasks. Available: {optim_task_choices}', ) parser.add_argument( '--enable_volumes', type=str, choices=['true', 'false'], default='false', help='Enable persistent volumes for the container to store workspace data. Default: false', ) args, _ = parser.parse_known_args() if not available_tasks: logger.error('No valid tasks found in the algotune/tasks directory.') exit(1) # Determine which tasks to run if args.optim_task == 'all': tasks_to_run = list(available_tasks.keys()) dataset_name = 'algotune_all' else: tasks_to_run = [args.optim_task] dataset_name = f'algotune_{args.optim_task}' # Create a DataFrame for the tasks tasks_df = pd.DataFrame({'instance_id': tasks_to_run, 'task_name': tasks_to_run}) llm_config = None if args.llm_config: llm_config = get_llm_config_arg(args.llm_config) llm_config.log_completions = True llm_config.modify_params = False llm_config.num_retries = 10 if llm_config is None: raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') agent_config = ( get_agent_config_arg(args.agent_config) if args.agent_config else None ) timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') eval_output_dir_with_timestamp = os.path.join( args.eval_output_dir, 'algotune', timestamp ) details = {'enable_volumes': args.enable_volumes.lower() == 'true'} metadata = make_metadata( llm_config, dataset_name, args.agent_cls, args.max_iterations, args.eval_note, eval_output_dir_with_timestamp, agent_config=agent_config, details=details, ) output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') # Filter out tasks that have already been completed instances_to_run = prepare_dataset( tasks_df, output_file, args.eval_n_limit, ) # Use the evaluation utility to run the tasks run_evaluation( dataset=instances_to_run, metadata=metadata, output_file=output_file, num_workers=args.eval_num_workers, process_instance_func=process_task, ) logger.info(f'Evaluation finished. Results are in {output_file}')