diff --git a/docs/modules/usage/how-to/evaluation-harness.md b/docs/modules/usage/how-to/evaluation-harness.md index a15e1c3707..903d079728 100644 --- a/docs/modules/usage/how-to/evaluation-harness.md +++ b/docs/modules/usage/how-to/evaluation-harness.md @@ -126,7 +126,7 @@ To create an evaluation workflow for your benchmark, follow these steps: 3. Initialize the runtime and set up the evaluation environment: ```python - async def initialize_runtime(runtime: Runtime, instance: pd.Series): + def initialize_runtime(runtime: Runtime, instance: pd.Series): # Set up your evaluation environment here # For example, setting environment variables, preparing files, etc. pass @@ -134,14 +134,14 @@ To create an evaluation workflow for your benchmark, follow these steps: 4. Create a function to process each instance: ```python - async def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput: + def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput: config = get_config(instance, metadata) - runtime = await create_runtime(config, sid=instance.instance_id) - await initialize_runtime(runtime, instance) + runtime = create_runtime(config, sid=instance.instance_id) + initialize_runtime(runtime, instance) instruction = get_instruction(instance, metadata) - state = await run_controller( + state = run_controller( config=config, task_str=instruction, runtime=runtime, diff --git a/evaluation/EDA/run_infer.py b/evaluation/EDA/run_infer.py index 01c7bfa418..743cde3980 100644 --- a/evaluation/EDA/run_infer.py +++ b/evaluation/EDA/run_infer.py @@ -74,7 +74,7 @@ def get_config( return config -async def process_instance( +def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, @@ -117,13 +117,17 @@ async def process_instance( instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class] # Here's how you can run the agent (similar to the `main` function) and get the final task state - runtime = await create_runtime(config, sid=instance['text'].strip()) + runtime = create_runtime(config, sid=instance['text'].strip()) - state: State | None = await run_controller( - config=config, - task_str=instruction, - runtime=runtime, - fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class], + state: State | None = asyncio.run( + run_controller( + config=config, + task_str=instruction, + runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[ + metadata.agent_class + ], + ) ) # ======= Attempt to evaluate the agent's edits ======= # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction) @@ -214,12 +218,10 @@ if __name__ == '__main__': eda_dataset.to_pandas(), output_file, args.eval_n_limit ) - asyncio.run( - run_evaluation( - prepared_dataset, - metadata, - output_file, - args.eval_num_workers, - process_instance, - ) + run_evaluation( + prepared_dataset, + metadata, + output_file, + args.eval_num_workers, + process_instance, ) diff --git a/evaluation/agent_bench/run_infer.py b/evaluation/agent_bench/run_infer.py index a83b3b8780..e8c85ec02c 100644 --- a/evaluation/agent_bench/run_infer.py +++ b/evaluation/agent_bench/run_infer.py @@ -56,7 +56,7 @@ def get_config( return config -async def initialize_runtime( +def initialize_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required ): @@ -70,12 +70,12 @@ async def initialize_runtime( # Set instance id action = CmdRunAction(command='mkdir -p /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 action = CmdRunAction(command='cd /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 init_cmd = instance.init @@ -85,7 +85,7 @@ async def initialize_runtime( with tempfile.TemporaryDirectory() as tmpdir: host_script_path = os.path.join(tmpdir, script_name) create_sh_file(host_script_path, init_cmd) - await runtime.copy_to( + runtime.copy_to( host_script_path, '/workspace', ) @@ -93,14 +93,14 @@ async def initialize_runtime( logger.info(f'Running init script: {script_name}') action = CmdRunAction(command=f'chmod +x ./{script_name} && ./{script_name}') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") -async def complete_runtime( +def complete_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name ) -> dict[str, Any]: @@ -121,7 +121,7 @@ async def complete_runtime( with tempfile.TemporaryDirectory() as tmpdir: host_script_path = os.path.join(tmpdir, script_name) create_sh_file(host_script_path, get_agent_result_cmd) - await runtime.copy_to( + runtime.copy_to( host_script_path, '/workspace', ) @@ -132,7 +132,7 @@ async def complete_runtime( keep_prompt=False, ) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 agent_answer = obs.content @@ -149,7 +149,7 @@ async def complete_runtime( with tempfile.TemporaryDirectory() as tmpdir: host_script_path = os.path.join(tmpdir, script_name) create_sh_file(host_script_path, get_ground_truth_cmd) - await runtime.copy_to( + runtime.copy_to( host_script_path, '/workspace', ) @@ -160,7 +160,7 @@ async def complete_runtime( keep_prompt=False, ) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) final_ans = obs.content @@ -171,7 +171,7 @@ async def complete_runtime( } -async def process_instance( +def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, @@ -209,16 +209,18 @@ async def process_instance( # create sandbox and run the agent # ============================================= - runtime: Runtime = await create_runtime(config, sid=instance.instance_id) + runtime: Runtime = create_runtime(config, sid=instance.instance_id) - await initialize_runtime(runtime, instance=instance) + initialize_runtime(runtime, instance=instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state - state: State | None = await run_controller( - config=config, - task_str=instruction, - runtime=runtime, - fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class], + state: State | None = asyncio.run( + run_controller( + config=config, + task_str=instruction, + runtime=runtime, + fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class], + ) ) if state is None: raise ValueError('State should not be None.') @@ -227,7 +229,7 @@ async def process_instance( # result evaluation # ============================================= - return_val = await complete_runtime(runtime, instance) + return_val = complete_runtime(runtime, instance) agent_answer = return_val['agent_answer'] final_ans = return_val['final_ans'] @@ -313,8 +315,6 @@ if __name__ == '__main__': output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') instances = prepare_dataset(agent_bench_tests, output_file, args.eval_n_limit) - asyncio.run( - run_evaluation( - instances, metadata, output_file, args.eval_num_workers, process_instance - ) + run_evaluation( + instances, metadata, output_file, args.eval_num_workers, process_instance ) diff --git a/evaluation/aider_bench/run_infer.py b/evaluation/aider_bench/run_infer.py index 7776dd3d64..6457e11288 100644 --- a/evaluation/aider_bench/run_infer.py +++ b/evaluation/aider_bench/run_infer.py @@ -62,7 +62,7 @@ def get_config( return config -async def initialize_runtime( +def initialize_runtime( runtime: Runtime, instance: pd.Series, ): @@ -76,19 +76,19 @@ async def initialize_runtime( # Set instance id action = CmdRunAction(command='mkdir -p /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 action = CmdRunAction(command='cd /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 with tempfile.TemporaryDirectory() as tmpdir: file_path = os.path.join(tmpdir, f'{instance.instance_name}.py') with open(file_path, 'w') as f: f.write(instance.signature) - await runtime.copy_to( + runtime.copy_to( file_path, '/workspace', ) @@ -96,14 +96,14 @@ async def initialize_runtime( file_path = os.path.join(tmpdir, f'{instance.instance_name}_test.py') with open(file_path, 'w') as f: f.write(instance.test) - await runtime.copy_to( + runtime.copy_to( file_path, '/workspace', ) logger.info(f"\n{'-' * 50} END Runtime Initialization Fn {'-' * 50}\n") -async def complete_runtime( +def complete_runtime( runtime: Runtime, instance: pd.Series, ) -> dict[str, Any]: @@ -122,7 +122,7 @@ async def complete_runtime( file_path = os.path.join(tmpdir, script_name) with open(file_path, 'w') as f: f.write(instance.test) - await runtime.copy_to( + runtime.copy_to( file_path, '/workspace', ) @@ -133,7 +133,7 @@ async def complete_runtime( keep_prompt=False, ) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) exit_code = 1 @@ -142,7 +142,7 @@ async def complete_runtime( logger.info(f"\n{'-' * 50} END Runtime Completion Fn {'-' * 50}\n") - await runtime.close() + runtime.close() return { 'test_output': obs.content, @@ -150,7 +150,7 @@ async def complete_runtime( } -async def process_instance( +def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, @@ -193,16 +193,18 @@ async def process_instance( # create sandbox and run the agent # ============================================= - runtime: Runtime = await create_runtime(config, sid=str(instance.instance_id)) + runtime: Runtime = create_runtime(config, sid=str(instance.instance_id)) - await initialize_runtime(runtime, instance=instance) + initialize_runtime(runtime, instance=instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state - state: State | None = await run_controller( - config=config, - task_str=instruction, - runtime=runtime, - fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class], + state: State | None = asyncio.run( + run_controller( + config=config, + task_str=instruction, + runtime=runtime, + fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class], + ) ) if state is None: raise ValueError('State should not be None.') @@ -211,7 +213,7 @@ async def process_instance( # # result evaluation # # ============================================= - return_val = await complete_runtime(runtime, instance) + return_val = complete_runtime(runtime, instance) exit_code = return_val['exit_code'] test_output = return_val['test_output'] @@ -286,12 +288,10 @@ if __name__ == '__main__': skip_num=SKIP_NUM, ) - asyncio.run( - run_evaluation( - instances, - metadata, - output_file, - args.eval_num_workers, - process_instance, - ) + run_evaluation( + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, ) diff --git a/evaluation/biocoder/run_infer.py b/evaluation/biocoder/run_infer.py index 5252001ad1..33707ea187 100644 --- a/evaluation/biocoder/run_infer.py +++ b/evaluation/biocoder/run_infer.py @@ -74,7 +74,7 @@ def get_config( return config -async def initialize_runtime( +def initialize_runtime( runtime: Runtime, instance: BiocoderData, # this argument is not required ): @@ -89,19 +89,19 @@ async def initialize_runtime( action = CmdRunAction(command='mkdir -p /workspace && mkdir -p /testing_files') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 with tempfile.TemporaryDirectory() as tmpdir: context_path = os.path.join(tmpdir, 'context.' + file_ext) with open(context_path, 'w') as f: f.write(instance.contextCode) - await runtime.copy_to(context_path, '/testing_files') + runtime.copy_to(context_path, '/testing_files') golden_path = os.path.join(tmpdir, 'golden.' + file_ext) with open(golden_path, 'w') as f: f.write(instance.goldenCode) - await runtime.copy_to(golden_path, '/testing_files') + runtime.copy_to(golden_path, '/testing_files') testcase_json = { 'test_case_id': instance.test_case_id, @@ -112,36 +112,36 @@ async def initialize_runtime( with open(testcase_path, 'w') as f: f.write(json.dumps(testcase_json, indent=4)) - await runtime.copy_to(testcase_path, '/testing_files') + runtime.copy_to(testcase_path, '/testing_files') # setup paths remove_code_script = os.path.join( os.path.dirname(__file__), 'scripts', 'setup', 'remove_code.py' ) - await runtime.copy_to(remove_code_script, '/testing_files') + runtime.copy_to(remove_code_script, '/testing_files') action = CmdRunAction(command='cd /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 # download repository archive repository_url = f"https://biocoder.lilbillbiscuit.com/repos/{instance.repository.split('/')[1]}.zip" action = CmdRunAction(command='wget -O repo.zip ' + repository_url) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0, f'Failed to download the repository: {obs.content}' # unzip the repository action = CmdRunAction(command='unzip -o -q repo.zip && rm repo.zip') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0, f'Failed to unzip the repository: {obs.content}' # chmod 777 action = CmdRunAction(command='chmod -R 777 /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0, f'Failed to chmod the files: {obs.content}' # remove code for evaluation instance @@ -155,13 +155,13 @@ async def initialize_runtime( command=f'python3 /testing_files/remove_code.py --target_filepath {target_filepath} --line_start {line_start} --line_end {line_end} --language {language}' ) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0, f'Failed to remove the code: {obs.content}' logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") -async def complete_runtime( +def complete_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name ) -> dict[str, Any]: @@ -179,7 +179,7 @@ async def complete_runtime( copy_changed_code_script = os.path.join( os.path.dirname(__file__), 'scripts', 'setup', 'copy_changed_code.py' ) - await runtime.copy_to(copy_changed_code_script, '/testing_files') + runtime.copy_to(copy_changed_code_script, '/testing_files') file_ext = FILE_EXT_MAP[instance.language.lower()] target_filepath = os.path.join( @@ -191,13 +191,13 @@ async def complete_runtime( command=f'python3 /testing_files/copy_changed_code.py --target_filepath {target_filepath} --generated_code_filepath {generated_path} --line_start {instance.lineStart} --include_signature' ) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) if obs.exit_code == 0: test_result['metadata']['1_copy_change_success'] = True action = CmdRunAction(command=f'cat {generated_path}', keep_prompt=False) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 code = obs.content @@ -208,14 +208,14 @@ async def complete_runtime( action = CmdRunAction(command='cd /testing_files') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 action = CmdRunAction( command='/home/openhands/mambaforge/bin/mamba run -n test python3 /testing/start_test_openhands.py' ) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 @@ -223,7 +223,7 @@ async def complete_runtime( command='cat /testing_files/results_biocoder.json', keep_prompt=False ) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) if obs.exit_code == 0: test_result['metadata']['2_run_test_success'] = True test_result['metadata']['2_run_test_result'] = str(obs.content) @@ -237,7 +237,7 @@ async def complete_runtime( return test_result -async def process_instance( +def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, @@ -277,22 +277,26 @@ async def process_instance( # use a session id for concurrent evaluation sid = instance.instance_id.replace('/', '__') - runtime = await create_runtime(config, sid=sid) + runtime = create_runtime(config, sid=sid) - await initialize_runtime(runtime, instance) + initialize_runtime(runtime, instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state - state: State | None = await run_controller( - config=config, - task_str=instruction, - runtime=runtime, - fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class], + state: State | None = asyncio.run( + run_controller( + config=config, + task_str=instruction, + runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[ + metadata.agent_class + ], + ) ) if state is None: raise ValueError('State should not be None.') - test_result = await complete_runtime(runtime, instance) + test_result = complete_runtime(runtime, instance) metrics = state.metrics.get() if state.metrics else None # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here @@ -340,8 +344,6 @@ if __name__ == '__main__': output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') instances = prepare_dataset(biocoder_tests, output_file, args.eval_n_limit) - asyncio.run( - run_evaluation( - instances, metadata, output_file, args.eval_num_workers, process_instance - ) + run_evaluation( + instances, metadata, output_file, args.eval_num_workers, process_instance ) diff --git a/evaluation/bird/run_infer.py b/evaluation/bird/run_infer.py index a48e272b72..a297386bd8 100644 --- a/evaluation/bird/run_infer.py +++ b/evaluation/bird/run_infer.py @@ -242,7 +242,7 @@ def load_bird(): return bird_dataset -async def initialize_runtime( +def initialize_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required ): @@ -261,14 +261,14 @@ async def initialize_runtime( instance.db_id, f'{instance.db_id}.sqlite', ) - await runtime.copy_to(db_file, '/workspace') + runtime.copy_to(db_file, '/workspace') # Check the database is copied action = CmdRunAction( command='cd /workspace && ls -l', keep_prompt=False, ) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 assert f'{instance.db_id}.sqlite' in obs.content @@ -276,7 +276,7 @@ async def initialize_runtime( logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") -async def complete_runtime( +def complete_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name ) -> dict[str, Any]: @@ -300,7 +300,7 @@ async def complete_runtime( command=f'cat {path}', keep_prompt=False, ) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) if obs.exit_code != 0: @@ -350,7 +350,7 @@ async def complete_runtime( return test_result -async def process_instance( +def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, @@ -402,19 +402,23 @@ async def process_instance( # NOTE: You can actually set slightly different instruction for different agents instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class] - runtime = await create_runtime(config, sid=instance_id) - await initialize_runtime(runtime, instance) + runtime = create_runtime(config, sid=instance_id) + initialize_runtime(runtime, instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state - state: State | None = await run_controller( - config=config, - task_str=instruction, - fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class], - runtime=runtime, + state: State | None = asyncio.run( + run_controller( + config=config, + task_str=instruction, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[ + metadata.agent_class + ], + runtime=runtime, + ) ) # ======= Attempt to evaluate the agent's edits ======= - test_result = await complete_runtime(runtime, instance) + test_result = complete_runtime(runtime, instance) # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction) # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation. @@ -463,8 +467,6 @@ if __name__ == '__main__': output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') instances = prepare_dataset(dataset, output_file, args.eval_n_limit) - asyncio.run( - run_evaluation( - instances, metadata, output_file, args.eval_num_workers, process_instance - ) + run_evaluation( + instances, metadata, output_file, args.eval_num_workers, process_instance ) diff --git a/evaluation/browsing_delegation/run_infer.py b/evaluation/browsing_delegation/run_infer.py index 40cfa20cc9..08289ce588 100644 --- a/evaluation/browsing_delegation/run_infer.py +++ b/evaluation/browsing_delegation/run_infer.py @@ -51,7 +51,7 @@ def get_config( return config -async def process_instance( +def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, @@ -71,12 +71,14 @@ async def process_instance( f'NOTE: You should copy the "query" as is into the tag. DO NOT change ANYTHING in the query.' ) - runtime = await create_runtime(config, sid=instance.instance_id) + runtime = create_runtime(config, sid=instance.instance_id) - state: State | None = await run_controller( - config=config, - task_str=instruction, - runtime=runtime, + state: State | None = asyncio.run( + run_controller( + config=config, + task_str=instruction, + runtime=runtime, + ) ) if state is None: @@ -158,12 +160,10 @@ if __name__ == '__main__': output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') instances = prepare_dataset(dataset, output_file, args.eval_n_limit) - asyncio.run( - run_evaluation( - instances, - metadata, - output_file, - args.eval_num_workers, - process_instance, - ) + run_evaluation( + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, ) diff --git a/evaluation/gaia/run_infer.py b/evaluation/gaia/run_infer.py index ba9773e339..71bc0a87b3 100644 --- a/evaluation/gaia/run_infer.py +++ b/evaluation/gaia/run_infer.py @@ -63,7 +63,7 @@ def get_config( return config -async def initialize_runtime( +def initialize_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required ): @@ -76,7 +76,7 @@ async def initialize_runtime( action = CmdRunAction(command='mkdir -p /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 if instance['file_name'] != '': @@ -87,7 +87,7 @@ async def initialize_runtime( ) assert os.path.exists(src_file) dest_file = os.path.join('/workspace', instance['file_name']) - await runtime.copy_to(src_file, dest_file) + runtime.copy_to(src_file, dest_file) # rename to file.extension_name extension_name = instance['file_name'].split('.')[-1] @@ -95,18 +95,18 @@ async def initialize_runtime( command=f'mv /workspace/{instance["file_name"]} /workspace/file.{extension_name}' ) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 action = CmdRunAction(command='cd /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") -async def process_instance( +def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, @@ -141,15 +141,19 @@ async def process_instance( instruction += AGENT_CLS_TO_INST_SUFFIX.get(metadata.agent_class, '') logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'}) - runtime = await create_runtime(config, sid=instance['instance_id']) - await initialize_runtime(runtime, instance) + runtime = create_runtime(config, sid=instance['instance_id']) + initialize_runtime(runtime, instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state - state: State | None = await run_controller( - config=config, - task_str=instruction, - runtime=runtime, - fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class], + state: State | None = asyncio.run( + run_controller( + config=config, + task_str=instruction, + runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[ + metadata.agent_class + ], + ) ) # ======= Attempt to evaluate the agent's edits ======= # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction) @@ -257,12 +261,10 @@ if __name__ == '__main__': output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') prepared_dataset = prepare_dataset(gaia_tests, output_file, args.eval_n_limit) - asyncio.run( - run_evaluation( - dataset=prepared_dataset, - metadata=metadata, - output_file=output_file, - num_workers=args.eval_num_workers, - process_instance_func=process_instance, - ) + run_evaluation( + dataset=prepared_dataset, + metadata=metadata, + output_file=output_file, + num_workers=args.eval_num_workers, + process_instance_func=process_instance, ) diff --git a/evaluation/gorilla/run_infer.py b/evaluation/gorilla/run_infer.py index 7f17e4235c..988cdfeade 100644 --- a/evaluation/gorilla/run_infer.py +++ b/evaluation/gorilla/run_infer.py @@ -55,7 +55,7 @@ def get_config( return config -async def process_instance( +def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, @@ -79,14 +79,16 @@ async def process_instance( # logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'}) # Here's how you can run the agent (similar to the `main` function) and get the final task state - runtime = await create_runtime(config, sid=instance_id) - state: State | None = await run_controller( - config=config, - task_str=instruction, - runtime=runtime, - fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get( - metadata.agent_class - ), + runtime = create_runtime(config, sid=instance_id) + state: State | None = asyncio.run( + run_controller( + config=config, + task_str=instruction, + runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get( + metadata.agent_class + ), + ) ) # ======= Attempt to evaluate the agent's edits ======= # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction) @@ -179,14 +181,12 @@ if __name__ == '__main__': else: print('File already exists, skipping download.') - asyncio.run( - run_evaluation( - dataset=dataset, - metadata=metadata, - output_file=output_file, - num_workers=args.eval_num_workers, - process_instance_func=process_instance, - ) + run_evaluation( + dataset=dataset, + metadata=metadata, + output_file=output_file, + num_workers=args.eval_num_workers, + process_instance_func=process_instance, ) # Read the output file and calculate the accuracy diff --git a/evaluation/gpqa/run_infer.py b/evaluation/gpqa/run_infer.py index ce4fa7de23..aabd44bcd0 100644 --- a/evaluation/gpqa/run_infer.py +++ b/evaluation/gpqa/run_infer.py @@ -169,7 +169,7 @@ def convert_instance_dict(instance): return out_instance_dict -async def process_instance( +def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, @@ -214,15 +214,17 @@ Again do not quit without reporting the answer first. Ok now its time to start solving the question. Good luck! """ - runtime = await create_runtime(config, sid=f'gptq_{str(instance.instance_id)}') + runtime = create_runtime(config, sid=f'gptq_{str(instance.instance_id)}') - state: State | None = await run_controller( - config=config, - task_str=instruction, - runtime=runtime, - fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get( - metadata.agent_class - ), + state: State | None = asyncio.run( + run_controller( + config=config, + task_str=instruction, + runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get( + metadata.agent_class + ), + ) ) assert state is not None, 'State should not be None.' @@ -355,12 +357,10 @@ if __name__ == '__main__': output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') prepared_dataset = prepare_dataset(gpqa_dataset, output_file, args.eval_n_limit) - asyncio.run( - run_evaluation( - dataset=prepared_dataset, - metadata=metadata, - output_file=output_file, - num_workers=args.eval_num_workers, - process_instance_func=process_instance, - ) + run_evaluation( + dataset=prepared_dataset, + metadata=metadata, + output_file=output_file, + num_workers=args.eval_num_workers, + process_instance_func=process_instance, ) diff --git a/evaluation/humanevalfix/run_infer.py b/evaluation/humanevalfix/run_infer.py index 3e115b67b5..726ffa9dc9 100644 --- a/evaluation/humanevalfix/run_infer.py +++ b/evaluation/humanevalfix/run_infer.py @@ -102,7 +102,7 @@ def _get_instance_id(instance: pd.Series) -> str: return instance.task_id.replace('/', '__') -async def initialize_runtime( +def initialize_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required ): @@ -115,12 +115,12 @@ async def initialize_runtime( action = CmdRunAction(command='mkdir -p /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 action = CmdRunAction(command='cd /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 problem_statement = ( @@ -131,20 +131,20 @@ async def initialize_runtime( host_script_path = os.path.join(tmpdir, filename) with open(host_script_path, 'w') as f: f.write(problem_statement) - await runtime.copy_to( + runtime.copy_to( host_script_path, '/workspace', ) # check file exists action = CmdRunAction(command=f'ls /workspace/{_get_instance_id(instance)}.py') - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") -async def complete_runtime( +def complete_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name ) -> dict[str, Any]: @@ -170,7 +170,7 @@ async def complete_runtime( action = CmdRunAction( command=f'cat /workspace/{_get_instance_id(instance)}.py', keep_prompt=False ) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 function = obs.content.replace('\r\n', '\n') @@ -194,7 +194,7 @@ async def complete_runtime( return test_result -async def process_instance( +def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, @@ -232,21 +232,23 @@ async def process_instance( instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class] # Here's how you can run the agent (similar to the `main` function) and get the final task state - runtime = await create_runtime(config, sid=sid) - await initialize_runtime(runtime, instance) - state: State | None = await run_controller( - config=config, - task_str=instruction, - runtime=runtime, - fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get( - metadata.agent_class - ), + runtime = create_runtime(config, sid=sid) + initialize_runtime(runtime, instance) + state: State | None = asyncio.run( + run_controller( + config=config, + task_str=instruction, + runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get( + metadata.agent_class + ), + ) ) if state is None: raise ValueError('State should not be None.') metrics = state.metrics.get() if state.metrics else None - test_result = await complete_runtime(runtime, instance) + test_result = complete_runtime(runtime, instance) # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here @@ -294,12 +296,10 @@ if __name__ == '__main__': output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') instances = prepare_dataset(hefix_tests, output_file, args.eval_n_limit) - asyncio.run( - run_evaluation( - instances, - metadata, - output_file, - args.eval_num_workers, - process_instance, - ) + run_evaluation( + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, ) diff --git a/evaluation/logic_reasoning/run_infer.py b/evaluation/logic_reasoning/run_infer.py index 83e8642d4f..433559fb19 100644 --- a/evaluation/logic_reasoning/run_infer.py +++ b/evaluation/logic_reasoning/run_infer.py @@ -128,7 +128,7 @@ def get_test_result( CUR_EVAL_DIR = os.path.dirname(__file__) -async def initialize_runtime( +def initialize_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required ): @@ -142,33 +142,31 @@ async def initialize_runtime( # Set instance id action = CmdRunAction(command='mkdir -p /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 action = CmdRunAction(command='cd /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 # copy logic_inference.py to /workspace - await runtime.copy_to( - os.path.join(CUR_EVAL_DIR, 'logic_inference.py'), '/workspace' - ) + runtime.copy_to(os.path.join(CUR_EVAL_DIR, 'logic_inference.py'), '/workspace') # check if the file exists - obs = await runtime.run_action(CmdRunAction(command='ls /workspace')) + obs = runtime.run_action(CmdRunAction(command='ls /workspace')) assert obs.exit_code == 0 assert 'logic_inference.py' in obs.content - await runtime.add_env_vars({'DATASET_NAME': metadata.dataset}) + runtime.add_env_vars({'DATASET_NAME': metadata.dataset}) action = CmdRunAction(command='mkdir -p /workspace/.cache_program') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 action = IPythonRunCellAction(code='%pip install scitools-pyke') logger.info(action, extra={'msg_type': 'ACTION'}) - ipynb_obs = await runtime.run_action(action) + ipynb_obs = runtime.run_action(action) logger.info(ipynb_obs, extra={'msg_type': 'OBSERVATION'}) logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") @@ -179,7 +177,7 @@ with open(os.path.join(CUR_EVAL_DIR, 'instruction.txt'), 'r') as f: INSTRUCTION_TEMPLATE = f.read() -async def process_instance( +def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, @@ -206,8 +204,8 @@ async def process_instance( # use a session id for concurrent evaluation sid = instance['instance_id'] - runtime = await create_runtime(config, sid=sid) - await initialize_runtime(runtime, instance) + runtime = create_runtime(config, sid=sid) + initialize_runtime(runtime, instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state state: State | None = asyncio.run( @@ -303,8 +301,6 @@ if __name__ == '__main__': ) output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') instances = prepare_dataset(dataset_df, output_file, args.eval_n_limit) - asyncio.run( - run_evaluation( - instances, metadata, output_file, args.eval_num_workers, process_instance - ) + run_evaluation( + instances, metadata, output_file, args.eval_num_workers, process_instance ) diff --git a/evaluation/miniwob/run_infer.py b/evaluation/miniwob/run_infer.py index 5362ac6f5c..89c20154c7 100644 --- a/evaluation/miniwob/run_infer.py +++ b/evaluation/miniwob/run_infer.py @@ -62,7 +62,7 @@ def get_config( return config -async def initialize_runtime( +def initialize_runtime( runtime: Runtime, ) -> str: """Initialize the runtime for the agent. @@ -75,12 +75,12 @@ async def initialize_runtime( # Set instance id action = CmdRunAction(command='mkdir -p /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) goal = obs.content @@ -88,7 +88,7 @@ async def initialize_runtime( return goal -async def complete_runtime( +def complete_runtime( runtime: Runtime, ) -> dict[str, Any]: """Complete the runtime for the agent. @@ -102,7 +102,7 @@ async def complete_runtime( action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}") @@ -111,7 +111,7 @@ async def complete_runtime( } -async def process_instance( +def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, @@ -126,8 +126,8 @@ async def process_instance( else: logger.info(f'Starting evaluation for instance {env_id}.') - runtime = await create_runtime(config, sid=env_id) - task_str = await initialize_runtime(runtime) + runtime = create_runtime(config, sid=env_id) + task_str = initialize_runtime(runtime) state: State | None = asyncio.run( run_controller( @@ -154,7 +154,7 @@ async def process_instance( instruction = event.content break - return_val = await complete_runtime(runtime) + return_val = complete_runtime(runtime) logger.info(f'Return value from complete_runtime: {return_val}') reward = max(return_val['rewards']) @@ -208,8 +208,6 @@ if __name__ == '__main__': output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') instances = prepare_dataset(dataset, output_file, args.eval_n_limit) - asyncio.run( - run_evaluation( - instances, metadata, output_file, args.eval_num_workers, process_instance - ) + run_evaluation( + instances, metadata, output_file, args.eval_num_workers, process_instance ) diff --git a/evaluation/mint/run_infer.py b/evaluation/mint/run_infer.py index 081ab76370..be174b2b57 100644 --- a/evaluation/mint/run_infer.py +++ b/evaluation/mint/run_infer.py @@ -1,3 +1,4 @@ +import asyncio import functools import os from typing import Any @@ -114,7 +115,7 @@ def get_config( return config -async def initialize_runtime(runtime: Runtime): +def initialize_runtime(runtime: Runtime): """Initialize the runtime for the agent. This function is called before the runtime is used to run the agent. @@ -125,18 +126,18 @@ async def initialize_runtime(runtime: Runtime): # Set instance id action = CmdRunAction(command='mkdir -p /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 action = CmdRunAction(command='cd /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") -async def process_instance( +def process_instance( instance: Any, metadata: EvalMetadata, reset_logger: bool = True, @@ -173,14 +174,16 @@ async def process_instance( }, ) - runtime = await create_runtime(config, sid=instance.instance_id) - await initialize_runtime(runtime) + runtime = create_runtime(config, sid=instance.instance_id) + initialize_runtime(runtime) - state: State | None = await run_controller( - config=config, - task_str=instruction, - runtime=runtime, - fake_user_response_fn=fake_user_response_fn, + state: State | None = asyncio.run( + run_controller( + config=config, + task_str=instruction, + runtime=runtime, + fake_user_response_fn=fake_user_response_fn, + ) ) if state is None: diff --git a/evaluation/ml_bench/run_infer.py b/evaluation/ml_bench/run_infer.py index a927aba69d..7f1c12d12e 100644 --- a/evaluation/ml_bench/run_infer.py +++ b/evaluation/ml_bench/run_infer.py @@ -13,6 +13,7 @@ TODOs: - Clean up the code and docker image used for evaluation. """ +import asyncio import os from typing import Any @@ -92,7 +93,7 @@ def get_config( return config -async def initialize_runtime( +def initialize_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required ): @@ -106,38 +107,38 @@ async def initialize_runtime( # Set instance id action = CmdRunAction(command='mkdir -p /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 # Set up the task environment action = CmdRunAction(command=f'conda activate {ID2CONDA[instance["github_id"]]}') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 repo_url = instance['github'] repo_name = repo_url.split('/')[-1] action = CmdRunAction(command=f'git clone {repo_url} /workspace/{repo_name}') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 action = CmdRunAction(command=f'chmod -R 777 /workspace/{repo_name}') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 # Navigate to the task's code path task_path = os.path.join('/workspace', repo_name, instance['path'][2:]) action = CmdRunAction(command=f'cd {task_path}') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") -async def complete_runtime( +def complete_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name ) -> dict[str, Any]: @@ -160,7 +161,7 @@ async def complete_runtime( action = CmdRunAction(command=f'cat {eval_script}', keep_prompt=False) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) if obs.exit_code == 0: eval_script_content = obs.content else: @@ -172,7 +173,7 @@ async def complete_runtime( timeout=600, ) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) if obs.exit_code == 0: eval_output = obs.content else: @@ -200,9 +201,7 @@ async def complete_runtime( return outputs -async def process_instance( - instance: Any, metadata: EvalMetadata, reset_logger: bool = True -): +def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True): config = get_config(metadata) # Setup the logger properly, so you can run multi-processing to parallelize the evaluation @@ -236,22 +235,24 @@ async def process_instance( ) instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class] - runtime = await create_runtime(config, sid=sid) - await initialize_runtime(runtime, instance) + runtime = create_runtime(config, sid=sid) + initialize_runtime(runtime, instance) # Run the agent - state: State | None = await run_controller( - config=config, - task_str=instruction, - runtime=runtime, - fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get( - metadata.agent_class - ), + state: State | None = asyncio.run( + run_controller( + config=config, + task_str=instruction, + runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get( + metadata.agent_class + ), + ) ) assert state is not None metrics = state.metrics.get() if state.metrics else {} - test_result = await complete_runtime(runtime) + test_result = complete_runtime(runtime) # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 2a17980335..d0fcfbd8ab 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -2,6 +2,7 @@ import asyncio import json import os import tempfile +import time from typing import Any import pandas as pd @@ -141,7 +142,7 @@ def get_config( return config -async def initialize_runtime( +def initialize_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required ): @@ -160,13 +161,13 @@ async def initialize_runtime( command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc""" ) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 @@ -177,7 +178,7 @@ async def initialize_runtime( # inject the instance info action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert ( obs.exit_code == 0 @@ -195,35 +196,35 @@ async def initialize_runtime( json.dump([instance], f) # Copy the file to the desired location - await runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/') + runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/') # inject the instance swe entry - await runtime.copy_to( + runtime.copy_to( str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')), '/swe_util/', ) action = CmdRunAction(command='cat ~/.bashrc') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 action = CmdRunAction(command='source ~/.bashrc') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 else: action = CmdRunAction(command='source /swe_util/swe_entry.sh') action.timeout = 1800 logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert ( obs.exit_code == 0 @@ -231,13 +232,13 @@ async def initialize_runtime( action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 action = CmdRunAction(command='git reset --hard') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 @@ -245,7 +246,7 @@ async def initialize_runtime( command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done' ) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 @@ -254,7 +255,7 @@ async def initialize_runtime( logger.info('-' * 30) -async def complete_runtime( +def complete_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name ) -> dict[str, Any]: @@ -272,19 +273,19 @@ async def complete_runtime( action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 action = CmdRunAction(command='git config --global core.pager ""') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 action = CmdRunAction(command='git add -A') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 @@ -297,7 +298,7 @@ async def complete_runtime( ) action.timeout = 600 + 100 * n_retries logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) n_retries += 1 if isinstance(obs, CmdOutputObservation): @@ -306,10 +307,10 @@ async def complete_runtime( break else: logger.info('Failed to get git diff, retrying...') - await asyncio.sleep(10) + time.sleep(10) elif isinstance(obs, ErrorObservation): logger.error(f'Error occurred: {obs.content}. Retrying...') - await asyncio.sleep(10) + time.sleep(10) else: raise ValueError(f'Unexpected observation type: {type(obs)}') @@ -319,7 +320,7 @@ async def complete_runtime( return {'git_patch': git_patch} -async def process_instance( +def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, @@ -333,28 +334,32 @@ async def process_instance( else: logger.info(f'Starting evaluation for instance {instance.instance_id}.') - runtime = await create_runtime(config, sid=instance.instance_id) - await initialize_runtime(runtime, instance) + runtime = create_runtime(config, sid=instance.instance_id) + initialize_runtime(runtime, instance) instruction = get_instruction(instance, metadata) # Here's how you can run the agent (similar to the `main` function) and get the final task state - state: State | None = await run_controller( - config=config, - task_str=instruction, - runtime=runtime, - fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class], + state: State | None = asyncio.run( + run_controller( + config=config, + task_str=instruction, + runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[ + metadata.agent_class + ], + ) ) # ======= THIS IS SWE-Bench specific ======= # Get git patch - return_val = await complete_runtime(runtime, instance) + return_val = complete_runtime(runtime, instance) git_patch = return_val['git_patch'] logger.info( f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------' ) - await runtime.close() + runtime.close() # ========================================== # ======= Attempt to evaluate the agent's edits ======= @@ -440,8 +445,6 @@ if __name__ == '__main__': output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit) - asyncio.run( - run_evaluation( - instances, metadata, output_file, args.eval_num_workers, process_instance - ) + run_evaluation( + instances, metadata, output_file, args.eval_num_workers, process_instance ) diff --git a/evaluation/toolqa/run_infer.py b/evaluation/toolqa/run_infer.py index 7398354267..0a542cc0e3 100644 --- a/evaluation/toolqa/run_infer.py +++ b/evaluation/toolqa/run_infer.py @@ -57,7 +57,7 @@ def get_config( return config -async def initialize_runtime(runtime: Runtime): +def initialize_runtime(runtime: Runtime): """Initialize the runtime for the agent. This function is called before the runtime is used to run the agent. @@ -68,22 +68,20 @@ async def initialize_runtime(runtime: Runtime): # Set instance id action = CmdRunAction(command='mkdir -p /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 action = CmdRunAction(command='cd /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 - await runtime.add_env_vars({'WOLFRAM_ALPHA_APPID': args.wolfram_alpha_appid}) + runtime.add_env_vars({'WOLFRAM_ALPHA_APPID': args.wolfram_alpha_appid}) logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") -async def process_instance( - instance: Any, metadata: EvalMetadata, reset_logger: bool = True -): +def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True): config = get_config(metadata) qid = instance.qid @@ -104,15 +102,19 @@ async def process_instance( instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class] logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'}) - runtime = await create_runtime(config, sid=qid) - await initialize_runtime(runtime) + runtime = create_runtime(config, sid=qid) + initialize_runtime(runtime) # Here's how you can run the agent (similar to the `main` function) and get the final task state - state: State | None = await run_controller( - config=config, - task_str=instruction, - runtime=runtime, - fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class], + state: State | None = asyncio.run( + run_controller( + config=config, + task_str=instruction, + runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[ + metadata.agent_class + ], + ) ) # ======= Attempt to evaluate the agent's edits ======= # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction) @@ -210,8 +212,6 @@ if __name__ == '__main__': ) output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') instances = prepare_dataset(toolqa_test, output_file, args.eval_n_limit) - asyncio.run( - run_evaluation( - instances, metadata, output_file, args.eval_num_workers, process_instance - ) + run_evaluation( + instances, metadata, output_file, args.eval_num_workers, process_instance ) diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index 4cf570ace2..7a91402571 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -1,4 +1,3 @@ -import asyncio import json import logging import multiprocessing as mp @@ -227,7 +226,7 @@ def prepare_dataset( return pd.DataFrame(new_dataset) -async def run_evaluation( +def run_evaluation( dataset: pd.DataFrame, metadata: EvalMetadata, output_file: str, @@ -244,14 +243,14 @@ async def run_evaluation( pbar = tqdm(total=len(dataset)) output_fp = open(output_file, 'a') - async def update_progress(future): + def update_progress(future): pbar.update(1) - output: EvalOutput = await future if use_multiprocessing else future + output: EvalOutput = future.result() if use_multiprocessing else future pbar.set_description(f'Instance {output.instance_id}') pbar.set_postfix_str(f'Test Result: {output.test_result}') logger.info( - f'Finished evaluation for instance {output.instance_id}: {output.test_result}\n' + f'Finished evaluation for instance {output.instance_id}: {str(output.test_result)[:300]}...\n' ) output_fp.write(json.dumps(output.model_dump()) + '\n') output_fp.flush() @@ -259,25 +258,24 @@ async def run_evaluation( try: if use_multiprocessing: with ProcessPoolExecutor(num_workers) as executor: - loop = asyncio.get_event_loop() futures = [] for _, instance in dataset.iterrows(): - future = loop.run_in_executor( - executor, + future = executor.submit( process_instance_func, instance, metadata, bool(num_workers > 1), ) - futures.append(update_progress(future)) - - await asyncio.gather(*futures) + future.add_done_callback(update_progress) + futures.append(future) + for future in futures: + future.result() # Use plain for loop for single process for easier debugging else: assert num_workers == 1 for _, instance in dataset.iterrows(): - output = await process_instance_func(instance, metadata, False) - await update_progress(output) + output = process_instance_func(instance, metadata, False) + update_progress(output) except KeyboardInterrupt: print('\nKeyboardInterrupt received. Cleaning up...\n') diff --git a/evaluation/webarena/run_infer.py b/evaluation/webarena/run_infer.py index 53dfc7e431..026d6c7ad0 100644 --- a/evaluation/webarena/run_infer.py +++ b/evaluation/webarena/run_infer.py @@ -78,7 +78,7 @@ def get_config( return config -async def initialize_runtime( +def initialize_runtime( runtime: Runtime, ) -> dict: """Initialize the runtime for the agent. @@ -91,12 +91,12 @@ async def initialize_runtime( # Set instance id action = CmdRunAction(command='mkdir -p /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) assert obs.exit_code == 0 action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) goal = obs.content @@ -104,7 +104,7 @@ async def initialize_runtime( return goal -async def complete_runtime( +def complete_runtime( runtime: Runtime, ) -> dict[str, Any]: """Complete the runtime for the agent. @@ -118,7 +118,7 @@ async def complete_runtime( action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}") @@ -127,7 +127,7 @@ async def complete_runtime( } -async def process_instance( +def process_instance( instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True, @@ -142,15 +142,16 @@ async def process_instance( else: logger.info(f'Starting evaluation for instance {env_id}.') - runtime = await create_runtime(config, sid=env_id) - task_str = await initialize_runtime(runtime) + runtime = create_runtime(config, sid=env_id) + task_str = initialize_runtime(runtime) - state: State | None = await run_controller( - config=config, - task_str=task_str, - runtime=runtime, + state: State | None = asyncio.run( + run_controller( + config=config, + task_str=task_str, + runtime=runtime, + ) ) - # ======= Attempt to evaluate the agent's environment impact ======= # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction) @@ -168,7 +169,7 @@ async def process_instance( instruction = event.content break - return_val = await complete_runtime(runtime) + return_val = complete_runtime(runtime) logger.info(f'Return value from complete_runtime: {return_val}') reward = max(return_val['rewards']) @@ -222,12 +223,10 @@ if __name__ == '__main__': output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') instances = prepare_dataset(dataset, output_file, args.eval_n_limit) - asyncio.run( - run_evaluation( - instances, - metadata, - output_file, - args.eval_num_workers, - process_instance, - ) + run_evaluation( + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, ) diff --git a/openhands/core/cli.py b/openhands/core/cli.py index e2a8e5d7b4..4159363f26 100644 --- a/openhands/core/cli.py +++ b/openhands/core/cli.py @@ -79,13 +79,12 @@ async def main(): event_stream = EventStream(sid, file_store) runtime_cls = get_runtime_cls(config.runtime) - runtime: Runtime = runtime_cls( + runtime: Runtime = runtime_cls( # noqa: F841 config=config, event_stream=event_stream, sid=sid, plugins=agent_cls.sandbox_plugins, ) - await runtime.ainit() controller = AgentController( agent=agent, diff --git a/openhands/core/main.py b/openhands/core/main.py index d85ff4b4d0..fc59ba89cd 100644 --- a/openhands/core/main.py +++ b/openhands/core/main.py @@ -47,10 +47,9 @@ def read_task_from_stdin() -> str: return sys.stdin.read() -async def create_runtime( +def create_runtime( config: AppConfig, sid: str | None = None, - runtime_tools_config: dict | None = None, ) -> Runtime: """Create a runtime for the agent to run on. @@ -79,7 +78,6 @@ async def create_runtime( sid=session_id, plugins=agent_cls.sandbox_plugins, ) - await runtime.ainit() return runtime @@ -121,7 +119,7 @@ async def run_controller( sid = sid or generate_sid(config) if runtime is None: - runtime = await create_runtime(config, sid=sid) + runtime = create_runtime(config, sid=sid) event_stream = runtime.event_stream # restore cli session if enabled diff --git a/openhands/runtime/client/runtime.py b/openhands/runtime/client/runtime.py index 9df5381b64..521a2a1caf 100644 --- a/openhands/runtime/client/runtime.py +++ b/openhands/runtime/client/runtime.py @@ -1,12 +1,12 @@ -import asyncio import os import tempfile import threading +import time import uuid from zipfile import ZipFile -import aiohttp import docker +import requests import tenacity from openhands.core.config import AppConfig @@ -104,25 +104,23 @@ class EventStreamRuntime(Runtime): event_stream: EventStream, sid: str = 'default', plugins: list[PluginRequirement] | None = None, + env_vars: dict[str, str] | None = None, ): - super().__init__( - config, event_stream, sid, plugins - ) # will initialize the event stream + self.config = config self._port = find_available_tcp_port() self.api_url = f'http://{self.config.sandbox.api_hostname}:{self._port}' - self.session: aiohttp.ClientSession | None = None + self.session = requests.Session() self.instance_id = ( sid + '_' + str(uuid.uuid4()) if sid is not None else str(uuid.uuid4()) ) - # TODO: We can switch to aiodocker when `build_sandbox_image` is updated to use aiodocker self.docker_client: docker.DockerClient = self._init_docker_client() self.base_container_image = self.config.sandbox.base_container_image self.runtime_container_image = self.config.sandbox.runtime_container_image self.container_name = self.container_name_prefix + self.instance_id self.container = None - self.action_semaphore = asyncio.Semaphore(1) # Ensure one action at a time + self.action_semaphore = threading.Semaphore(1) # Ensure one action at a time self.runtime_builder = DockerRuntimeBuilder(self.docker_client) logger.debug(f'EventStreamRuntime `{sid}` config:\n{self.config}') @@ -131,7 +129,6 @@ class EventStreamRuntime(Runtime): self.log_buffer: LogBuffer | None = None self.startup_done = False - async def ainit(self, env_vars: dict[str, str] | None = None): if self.config.sandbox.runtime_extra_deps: logger.info( f'Installing extra user-provided dependencies in the runtime image: {self.config.sandbox.runtime_extra_deps}' @@ -147,14 +144,13 @@ class EventStreamRuntime(Runtime): self.runtime_builder, extra_deps=self.config.sandbox.runtime_extra_deps, ) - self.container = await self._init_container( + self.container = self._init_container( self.sandbox_workspace_dir, mount_dir=self.config.workspace_mount_path, - plugins=self.plugins, + plugins=plugins, ) - # MUST call super().ainit() to initialize both default env vars - # AND the ones in env vars! - await super().ainit(env_vars) + # will initialize both the event stream and the env vars + super().__init__(config, event_stream, sid, plugins, env_vars) logger.info( f'Container initialized with plugins: {[plugin.name for plugin in self.plugins]}' @@ -175,7 +171,7 @@ class EventStreamRuntime(Runtime): stop=tenacity.stop_after_attempt(5), wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), ) - async def _init_container( + def _init_container( self, sandbox_workspace_dir: str, mount_dir: str | None = None, @@ -242,19 +238,14 @@ class EventStreamRuntime(Runtime): except Exception as e: logger.error('Failed to start container') logger.exception(e) - await self.close(close_client=False) + self.close(close_client=False) raise e - async def _ensure_session(self): - if self.session is None or self.session.closed: - self.session = aiohttp.ClientSession() - return self.session - @tenacity.retry( stop=tenacity.stop_after_attempt(10), wait=tenacity.wait_exponential(multiplier=2, min=10, max=60), ) - async def _wait_until_alive(self): + def _wait_until_alive(self): init_msg = 'Runtime client initialized.' logger.debug('Getting container logs...') @@ -284,7 +275,7 @@ class EventStreamRuntime(Runtime): attempts = 0 while not self.startup_done and attempts < 10: attempts += 1 - await asyncio.sleep(1) + time.sleep(1) logs = self.log_buffer.get_and_clear() if logs: formatted_logs = '\n'.join([f' |{log}' for log in logs]) @@ -301,25 +292,24 @@ class EventStreamRuntime(Runtime): self.startup_done = True break - async with aiohttp.ClientSession() as session: - async with session.get(f'{self.api_url}/alive') as response: - if response.status == 200: - return - else: - msg = f'Action execution API is not alive. Response: {response}' - logger.error(msg) - raise RuntimeError(msg) + response = self.session.get(f'{self.api_url}/alive') + if response.status_code == 200: + return + else: + msg = f'Action execution API is not alive. Response: {response}' + logger.error(msg) + raise RuntimeError(msg) @property def sandbox_workspace_dir(self): return self.config.workspace_mount_path_in_sandbox - async def close(self, close_client: bool = True): + def close(self, close_client: bool = True): if self.log_buffer: self.log_buffer.close() - if self.session is not None and not self.session.closed: - await self.session.close() + if self.session: + self.session.close() containers = self.docker_client.containers.list(all=True) for container in containers: @@ -335,12 +325,12 @@ class EventStreamRuntime(Runtime): if close_client: self.docker_client.close() - async def run_action(self, action: Action) -> Observation: + def run_action(self, action: Action) -> Observation: # set timeout to default if not set if action.timeout is None: action.timeout = self.config.sandbox.timeout - async with self.action_semaphore: + with self.action_semaphore: if not action.runnable: return NullObservation('') action_type = action.action # type: ignore[attr-defined] @@ -352,30 +342,26 @@ class EventStreamRuntime(Runtime): ) logger.info('Awaiting session') - session = await self._ensure_session() - await self._wait_until_alive() + self._wait_until_alive() assert action.timeout is not None try: - logger.info('Executing command') - async with session.post( + response = self.session.post( f'{self.api_url}/execute_action', json={'action': event_to_dict(action)}, timeout=action.timeout, - ) as response: - if response.status == 200: - output = await response.json() - obs = observation_from_dict(output) - obs._cause = action.id # type: ignore[attr-defined] - return obs - else: - error_message = await response.text() - logger.error(f'Error from server: {error_message}') - obs = ErrorObservation( - f'Command execution failed: {error_message}' - ) - except asyncio.TimeoutError: + ) + if response.status_code == 200: + output = response.json() + obs = observation_from_dict(output) + obs._cause = action.id # type: ignore[attr-defined] + return obs + else: + error_message = response.text + logger.error(f'Error from server: {error_message}') + obs = ErrorObservation(f'Command execution failed: {error_message}') + except requests.Timeout: logger.error('No response received within the timeout period.') obs = ErrorObservation('Command execution timed out') except Exception as e: @@ -383,36 +369,35 @@ class EventStreamRuntime(Runtime): obs = ErrorObservation(f'Command execution failed: {str(e)}') return obs - async def run(self, action: CmdRunAction) -> Observation: - return await self.run_action(action) + def run(self, action: CmdRunAction) -> Observation: + return self.run_action(action) - async def run_ipython(self, action: IPythonRunCellAction) -> Observation: - return await self.run_action(action) + def run_ipython(self, action: IPythonRunCellAction) -> Observation: + return self.run_action(action) - async def read(self, action: FileReadAction) -> Observation: - return await self.run_action(action) + def read(self, action: FileReadAction) -> Observation: + return self.run_action(action) - async def write(self, action: FileWriteAction) -> Observation: - return await self.run_action(action) + def write(self, action: FileWriteAction) -> Observation: + return self.run_action(action) - async def browse(self, action: BrowseURLAction) -> Observation: - return await self.run_action(action) + def browse(self, action: BrowseURLAction) -> Observation: + return self.run_action(action) - async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation: - return await self.run_action(action) + def browse_interactive(self, action: BrowseInteractiveAction) -> Observation: + return self.run_action(action) # ==================================================================== # Implement these methods (for file operations) in the subclass # ==================================================================== - async def copy_to( + def copy_to( self, host_src: str, sandbox_dest: str, recursive: bool = False ) -> None: if not os.path.exists(host_src): raise FileNotFoundError(f'Source file {host_src} does not exist') - session = await self._ensure_session() - await self._wait_until_alive() + self._wait_until_alive() try: if recursive: # For recursive copy, create a zip file @@ -437,16 +422,16 @@ class EventStreamRuntime(Runtime): params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()} - async with session.post( - f'{self.api_url}/upload_file', data=upload_data, params=params - ) as response: - if response.status == 200: - return - else: - error_message = await response.text() - raise Exception(f'Copy operation failed: {error_message}') + response = self.session.post( + f'{self.api_url}/upload_file', files=upload_data, params=params + ) + if response.status_code == 200: + return + else: + error_message = response.text + raise Exception(f'Copy operation failed: {error_message}') - except asyncio.TimeoutError: + except requests.Timeout: raise TimeoutError('Copy operation timed out') except Exception as e: raise RuntimeError(f'Copy operation failed: {str(e)}') @@ -455,29 +440,26 @@ class EventStreamRuntime(Runtime): os.unlink(temp_zip_path) logger.info(f'Copy completed: host:{host_src} -> runtime:{sandbox_dest}') - async def list_files(self, path: str | None = None) -> list[str]: + def list_files(self, path: str | None = None) -> list[str]: """List files in the sandbox. If path is None, list files in the sandbox's initial working directory (e.g., /workspace). """ - session = await self._ensure_session() - await self._wait_until_alive() + self._wait_until_alive() try: data = {} if path is not None: data['path'] = path - async with session.post( - f'{self.api_url}/list_files', json=data - ) as response: - if response.status == 200: - response_json = await response.json() - assert isinstance(response_json, list) - return response_json - else: - error_message = await response.text() - raise Exception(f'List files operation failed: {error_message}') - except asyncio.TimeoutError: + response = self.session.post(f'{self.api_url}/list_files', json=data) + if response.status_code == 200: + response_json = response.json() + assert isinstance(response_json, list) + return response_json + else: + error_message = response.text + raise Exception(f'List files operation failed: {error_message}') + except requests.Timeout: raise TimeoutError('List files operation timed out') except Exception as e: raise RuntimeError(f'List files operation failed: {str(e)}') diff --git a/openhands/runtime/e2b/runtime.py b/openhands/runtime/e2b/runtime.py index 852e419f68..82ca16f939 100644 --- a/openhands/runtime/e2b/runtime.py +++ b/openhands/runtime/e2b/runtime.py @@ -33,13 +33,13 @@ class E2BRuntime(Runtime): raise ValueError('E2BRuntime requires an E2BSandbox') self.file_store = E2BFileStore(self.sandbox.filesystem) - async def read(self, action: FileReadAction) -> Observation: + def read(self, action: FileReadAction) -> Observation: content = self.file_store.read(action.path) lines = read_lines(content.split('\n'), action.start, action.end) code_view = ''.join(lines) return FileReadObservation(code_view, path=action.path) - async def write(self, action: FileWriteAction) -> Observation: + def write(self, action: FileWriteAction) -> Observation: if action.start == 0 and action.end == -1: self.file_store.write(action.path, action.content) return FileWriteObservation(content='', path=action.path) diff --git a/openhands/runtime/remote/runtime.py b/openhands/runtime/remote/runtime.py index 1ff01edb28..5931174e8d 100644 --- a/openhands/runtime/remote/runtime.py +++ b/openhands/runtime/remote/runtime.py @@ -1,14 +1,19 @@ -import asyncio import os import ssl import tempfile +import threading import uuid -from typing import Any, Optional, Type +from typing import Any, Type from zipfile import ZipFile -import aiohttp -import aiohttp.client_exceptions -import tenacity +import requests +from requests.exceptions import HTTPError, RequestException, Timeout +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) from openhands.core.config import AppConfig from openhands.core.logger import openhands_logger as logger @@ -36,11 +41,9 @@ from openhands.runtime.utils.runtime_build import build_runtime_image DEFAULT_RETRY_EXCEPTIONS = [ ssl.SSLCertVerificationError, - aiohttp.ClientError, - aiohttp.client_exceptions.ContentTypeError, - aiohttp.client_exceptions.ClientConnectorCertificateError, - ssl.SSLCertVerificationError, - asyncio.TimeoutError, + RequestException, + HTTPError, + Timeout, ] @@ -55,8 +58,9 @@ class RemoteRuntime(Runtime): event_stream: EventStream, sid: str = 'default', plugins: list[PluginRequirement] | None = None, + env_vars: dict[str, str] | None = None, ): - super().__init__(config, event_stream, sid, plugins) + self.config = config if self.config.sandbox.api_hostname == 'localhost': self.config.sandbox.api_hostname = 'api.all-hands.dev/v0/runtime' logger.warning( @@ -65,20 +69,20 @@ class RemoteRuntime(Runtime): ) self.api_url = f'https://{self.config.sandbox.api_hostname.rstrip("/")}' - self.session: Optional[aiohttp.ClientSession] = None - - self.action_semaphore = asyncio.Semaphore(1) # Ensure one action at a time + if self.config.sandbox.api_key is None: + raise ValueError( + 'API key is required to use the remote runtime. ' + 'Please set the API key in the config (config.toml) or as an environment variable (SANDBOX_API_KEY).' + ) + self.session = requests.Session() + self.session.headers.update({'X-API-Key': self.config.sandbox.api_key}) + self.action_semaphore = threading.Semaphore(1) if self.config.workspace_base is not None: logger.warning( 'Setting workspace_base is not supported in the remote runtime.' ) - if self.config.sandbox.api_key is None: - raise ValueError( - 'API key is required to use the remote runtime. ' - 'Please set the API key in the config (config.toml) or as an environment variable (SANDBOX_API_KEY).' - ) self.runtime_builder = RemoteRuntimeBuilder( self.api_url, self.config.sandbox.api_key ) @@ -95,48 +99,8 @@ class RemoteRuntime(Runtime): self.container_image: str = self.config.sandbox.base_container_image self.container_name = 'od-remote-runtime-' + self.instance_id logger.debug(f'RemoteRuntime `{sid}` config:\n{self.config}') - - async def _send_request( - self, - method: str, - url: str, - retry_exceptions: list[Type[Exception]] | None = None, - **kwargs: Any, - ) -> aiohttp.ClientResponse: - if retry_exceptions is None: - retry_exceptions = DEFAULT_RETRY_EXCEPTIONS - - session = await self._ensure_session() - - def log_retry(retry_state): - exception = retry_state.outcome.exception() - logger.warning( - f'Retry attempt {retry_state.attempt_number} failed with exception: {exception}' - ) - - @tenacity.retry( - stop=tenacity.stop_after_attempt(10), - wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), - retry=tenacity.retry_if_exception_type(tuple(retry_exceptions)), - reraise=True, - after=log_retry, - ) - async def _send_request_with_retry(): - async with session.request(method, url, **kwargs) as response: - await response.read() - return response - - return await _send_request_with_retry() - - async def ainit(self, env_vars: dict[str, str] | None = None): - # Check if the container image exists - # Use the /registry_prefix endpoint to get the registry prefix - response = await self._send_request('GET', f'{self.api_url}/registry_prefix') - if response.status != 200: - raise RuntimeError( - f'Failed to get registry prefix: {await response.text()}' - ) - response_json = await response.json() + response = self._send_request('GET', f'{self.api_url}/registry_prefix') + response_json = response.json() registry_prefix = response_json['registry_prefix'] os.environ['OD_RUNTIME_RUNTIME_IMAGE_REPO'] = ( registry_prefix.rstrip('/') + '/runtime' @@ -158,26 +122,23 @@ class RemoteRuntime(Runtime): ) # Use the /image_exists endpoint to check if the image exists - response = await self._send_request( + response = self._send_request( 'GET', f'{self.api_url}/image_exists', params={'image': self.container_image}, ) - if response.status != 200 or not (await response.json())['exists']: + if response.status_code != 200 or not response.json()['exists']: raise RuntimeError(f'Container image {self.container_image} does not exist') # Prepare the request body for the /start endpoint plugin_arg = '' - if self.plugins is not None and len(self.plugins) > 0: - plugin_arg = ( - f'--plugins {" ".join([plugin.name for plugin in self.plugins])} ' - ) - if self.config.sandbox.browsergym_eval_env is not None: - browsergym_arg = ( - f'--browsergym-eval-env {self.config.sandbox.browsergym_eval_env}' - ) - else: - browsergym_arg = '' + if plugins is not None and len(plugins) > 0: + plugin_arg = f'--plugins {" ".join([plugin.name for plugin in plugins])} ' + browsergym_arg = ( + f'--browsergym-eval-env {self.config.sandbox.browsergym_eval_env}' + if self.config.sandbox.browsergym_eval_env is not None + else '' + ) start_request = { 'image': self.container_image, 'command': ( @@ -196,12 +157,12 @@ class RemoteRuntime(Runtime): } # Start the sandbox using the /start endpoint - response = await self._send_request( + response = self._send_request( 'POST', f'{self.api_url}/start', json=start_request ) - if response.status != 201: - raise RuntimeError(f'Failed to start sandbox: {await response.text()}') - start_response = await response.json() + if response.status_code != 201: + raise RuntimeError(f'Failed to start sandbox: {response.text}') + start_response = response.json() self.runtime_id = start_response['runtime_id'] self.runtime_url = start_response['url'] @@ -209,8 +170,8 @@ class RemoteRuntime(Runtime): f'Sandbox started. Runtime ID: {self.runtime_id}, URL: {self.runtime_url}' ) - # Initialize environment variables - await super().ainit(env_vars) + # Initialize the eventstream and env vars + super().__init__(config, event_stream, sid, plugins, env_vars) logger.info( f'Runtime initialized with plugins: {[plugin.name for plugin in self.plugins]}' @@ -223,26 +184,40 @@ class RemoteRuntime(Runtime): self.runtime_url is not None ), 'Runtime URL is not set. This should never happen.' - async def _ensure_session(self): - if self.session is None or self.session.closed: - self.session = aiohttp.ClientSession( - headers={'X-API-Key': self.config.sandbox.api_key} - ) - return self.session + def _send_request( + self, + method: str, + url: str, + retry_exceptions: list[Type[Exception]] | None = None, + **kwargs: Any, + ) -> requests.Response: + if retry_exceptions is None: + retry_exceptions = DEFAULT_RETRY_EXCEPTIONS - @tenacity.retry( - stop=tenacity.stop_after_attempt(10), - wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), - retry=tenacity.retry_if_exception_type(RuntimeError), + @retry( + stop=stop_after_attempt(10), + wait=wait_exponential(multiplier=1, min=4, max=60), + retry=retry_if_exception_type(tuple(retry_exceptions)), + reraise=True, + ) + def _send_request_with_retry(): + response = self.session.request(method, url, **kwargs) + response.raise_for_status() + return response + + return _send_request_with_retry() + + @retry( + stop=stop_after_attempt(10), + wait=wait_exponential(multiplier=1, min=4, max=60), + retry=retry_if_exception_type(RuntimeError), reraise=True, ) - async def _wait_until_alive(self): + def _wait_until_alive(self): logger.info('Waiting for sandbox to be alive...') - response = await self._send_request('GET', f'{self.runtime_url}/alive') - if response.status == 200: - return - else: - msg = f'Runtime is not alive (id={self.runtime_id}). Status: {response.status}.' + response = self._send_request('GET', f'{self.runtime_url}/alive') + if response.status_code != 200: + msg = f'Runtime is not alive yet (id={self.runtime_id}). Status: {response.status_code}.' logger.warning(msg) raise RuntimeError(msg) @@ -250,28 +225,25 @@ class RemoteRuntime(Runtime): def sandbox_workspace_dir(self): return self.config.workspace_mount_path_in_sandbox - async def close(self): + def close(self): if self.runtime_id: try: - response = await self._send_request( + response = self._send_request( 'POST', f'{self.api_url}/stop', json={'runtime_id': self.runtime_id} ) - if response.status != 200: - logger.error(f'Failed to stop sandbox: {await response.text()}') + if response.status_code != 200: + logger.error(f'Failed to stop sandbox: {response.text}') else: logger.info(f'Sandbox stopped. Runtime ID: {self.runtime_id}') except Exception as e: raise e finally: - if self.session is not None: - await self.session.close() - self.session = None + self.session.close() - async def run_action(self, action: Action) -> Observation: + def run_action(self, action: Action) -> Observation: if action.timeout is None: action.timeout = self.config.sandbox.timeout - - async with self.action_semaphore: + with self.action_semaphore: if not action.runnable: return NullObservation('') action_type = action.action # type: ignore[attr-defined] @@ -282,7 +254,7 @@ class RemoteRuntime(Runtime): f'Action {action_type} is not supported in the current runtime.' ) - await self._wait_until_alive() + self._wait_until_alive() assert action.timeout is not None @@ -290,28 +262,25 @@ class RemoteRuntime(Runtime): logger.info('Executing action') request_body = {'action': event_to_dict(action)} logger.debug(f'Request body: {request_body}') - response = await self._send_request( + response = self._send_request( 'POST', f'{self.runtime_url}/execute_action', json=request_body, timeout=action.timeout, retry_exceptions=list( - filter( - lambda e: e != asyncio.TimeoutError, - DEFAULT_RETRY_EXCEPTIONS, - ) + filter(lambda e: e != TimeoutError, DEFAULT_RETRY_EXCEPTIONS) ), ) - if response.status == 200: - output = await response.json() + if response.status_code == 200: + output = response.json() obs = observation_from_dict(output) obs._cause = action.id # type: ignore[attr-defined] return obs else: - error_message = await response.text() + error_message = response.text logger.error(f'Error from server: {error_message}') obs = ErrorObservation(f'Action execution failed: {error_message}') - except asyncio.TimeoutError: + except Timeout: logger.error('No response received within the timeout period.') obs = ErrorObservation('Action execution timed out') except Exception as e: @@ -319,31 +288,31 @@ class RemoteRuntime(Runtime): obs = ErrorObservation(f'Action execution failed: {str(e)}') return obs - async def run(self, action: CmdRunAction) -> Observation: - return await self.run_action(action) + def run(self, action: CmdRunAction) -> Observation: + return self.run_action(action) - async def run_ipython(self, action: IPythonRunCellAction) -> Observation: - return await self.run_action(action) + def run_ipython(self, action: IPythonRunCellAction) -> Observation: + return self.run_action(action) - async def read(self, action: FileReadAction) -> Observation: - return await self.run_action(action) + def read(self, action: FileReadAction) -> Observation: + return self.run_action(action) - async def write(self, action: FileWriteAction) -> Observation: - return await self.run_action(action) + def write(self, action: FileWriteAction) -> Observation: + return self.run_action(action) - async def browse(self, action: BrowseURLAction) -> Observation: - return await self.run_action(action) + def browse(self, action: BrowseURLAction) -> Observation: + return self.run_action(action) - async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation: - return await self.run_action(action) + def browse_interactive(self, action: BrowseInteractiveAction) -> Observation: + return self.run_action(action) - async def copy_to( + def copy_to( self, host_src: str, sandbox_dest: str, recursive: bool = False ) -> None: if not os.path.exists(host_src): raise FileNotFoundError(f'Source file {host_src} does not exist') - await self._wait_until_alive() + self._wait_until_alive() try: if recursive: with tempfile.NamedTemporaryFile( @@ -366,26 +335,24 @@ class RemoteRuntime(Runtime): params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()} - response = await self._send_request( + response = self._send_request( 'POST', f'{self.runtime_url}/upload_file', - data=upload_data, + files=upload_data, params=params, retry_exceptions=list( - filter( - lambda e: e != asyncio.TimeoutError, DEFAULT_RETRY_EXCEPTIONS - ) + filter(lambda e: e != TimeoutError, DEFAULT_RETRY_EXCEPTIONS) ), ) - if response.status == 200: + if response.status_code == 200: logger.info( - f'Copy completed: host:{host_src} -> runtime:{sandbox_dest}. Response: {await response.text()}' + f'Copy completed: host:{host_src} -> runtime:{sandbox_dest}. Response: {response.text}' ) return else: - error_message = await response.text() + error_message = response.text raise Exception(f'Copy operation failed: {error_message}') - except asyncio.TimeoutError: + except TimeoutError: raise TimeoutError('Copy operation timed out') except Exception as e: raise RuntimeError(f'Copy operation failed: {str(e)}') @@ -394,31 +361,29 @@ class RemoteRuntime(Runtime): os.unlink(temp_zip_path) logger.info(f'Copy completed: host:{host_src} -> runtime:{sandbox_dest}') - async def list_files(self, path: str | None = None) -> list[str]: - await self._wait_until_alive() + def list_files(self, path: str | None = None) -> list[str]: + self._wait_until_alive() try: data = {} if path is not None: data['path'] = path - response = await self._send_request( + response = self._send_request( 'POST', f'{self.runtime_url}/list_files', json=data, retry_exceptions=list( - filter( - lambda e: e != asyncio.TimeoutError, DEFAULT_RETRY_EXCEPTIONS - ) + filter(lambda e: e != TimeoutError, DEFAULT_RETRY_EXCEPTIONS) ), ) - if response.status == 200: - response_json = await response.json() + if response.status_code == 200: + response_json = response.json() assert isinstance(response_json, list) return response_json else: - error_message = await response.text() + error_message = response.text raise Exception(f'List files operation failed: {error_message}') - except asyncio.TimeoutError: + except TimeoutError: raise TimeoutError('List files operation timed out') except Exception as e: raise RuntimeError(f'List files operation failed: {str(e)}') diff --git a/openhands/runtime/runtime.py b/openhands/runtime/runtime.py index d59c10c0fe..f694315c3f 100644 --- a/openhands/runtime/runtime.py +++ b/openhands/runtime/runtime.py @@ -1,4 +1,3 @@ -import asyncio import atexit import copy import json @@ -58,6 +57,7 @@ class Runtime: event_stream: EventStream, sid: str = 'default', plugins: list[PluginRequirement] | None = None, + env_vars: dict[str, str] | None = None, ): self.sid = sid self.event_stream = event_stream @@ -66,41 +66,22 @@ class Runtime: self.config = copy.deepcopy(config) self.DEFAULT_ENV_VARS = _default_env_vars(config.sandbox) - atexit.register(self.close_sync) + atexit.register(self.close) logger.debug(f'Runtime `{sid}` config:\n{self.config}') - async def ainit(self, env_vars: dict[str, str] | None = None) -> None: - """ - Initialize the runtime (asynchronously). - - This method should be called after the runtime's constructor. - """ if self.DEFAULT_ENV_VARS: logger.debug(f'Adding default env vars: {self.DEFAULT_ENV_VARS}') - await self.add_env_vars(self.DEFAULT_ENV_VARS) + self.add_env_vars(self.DEFAULT_ENV_VARS) if env_vars is not None: logger.debug(f'Adding provided env vars: {env_vars}') - await self.add_env_vars(env_vars) + self.add_env_vars(env_vars) - async def close(self) -> None: + def close(self) -> None: pass - def close_sync(self) -> None: - try: - loop = asyncio.get_event_loop() - if loop.is_closed(): - return - if loop.is_running(): - loop.create_task(self.close()) - else: - loop.run_until_complete(self.close()) - except RuntimeError: - # Event loop is already closed, nothing to do - pass - # ==================================================================== - async def add_env_vars(self, env_vars: dict[str, str]) -> None: + def add_env_vars(self, env_vars: dict[str, str]) -> None: # Add env vars to the IPython shell (if Jupyter is used) if any(isinstance(plugin, JupyterRequirement) for plugin in self.plugins): code = 'import os\n' @@ -108,7 +89,7 @@ class Runtime: # Note: json.dumps gives us nice escaping for free code += f'os.environ["{key}"] = {json.dumps(value)}\n' code += '\n' - obs = await self.run_ipython(IPythonRunCellAction(code)) + obs = self.run_ipython(IPythonRunCellAction(code)) logger.info(f'Added env vars to IPython: code={code}, obs={obs}') # Add env vars to the Bash shell @@ -120,7 +101,7 @@ class Runtime: return cmd = cmd.strip() logger.debug(f'Adding env var: {cmd}') - obs = await self.run(CmdRunAction(cmd)) + obs = self.run(CmdRunAction(cmd)) if not isinstance(obs, CmdOutputObservation) or obs.exit_code != 0: raise RuntimeError( f'Failed to add env vars [{env_vars}] to environment: {obs.content}' @@ -132,12 +113,12 @@ class Runtime: if event.timeout is None: event.timeout = self.config.sandbox.timeout assert event.timeout is not None - observation = await self.run_action(event) + observation = self.run_action(event) observation._cause = event.id # type: ignore[attr-defined] source = event.source if event.source else EventSource.AGENT self.event_stream.add_event(observation, source) # type: ignore[arg-type] - async def run_action(self, action: Action) -> Observation: + def run_action(self, action: Action) -> Observation: """Run an action and return the resulting observation. If the action is not runnable in any runtime, a NullObservation is returned. If the action is not supported by the current runtime, an ErrorObservation is returned. @@ -163,35 +144,45 @@ class Runtime: return UserRejectObservation( 'Action has been rejected by the user! Waiting for further user input.' ) - observation = await getattr(self, action_type)(action) + observation = getattr(self, action_type)(action) return observation + # ==================================================================== + # Context manager + # ==================================================================== + + def __enter__(self) -> 'Runtime': + return self + + def __exit__(self, exc_type, exc_value, traceback) -> None: + self.close() + # ==================================================================== # Action execution # ==================================================================== @abstractmethod - async def run(self, action: CmdRunAction) -> Observation: + def run(self, action: CmdRunAction) -> Observation: pass @abstractmethod - async def run_ipython(self, action: IPythonRunCellAction) -> Observation: + def run_ipython(self, action: IPythonRunCellAction) -> Observation: pass @abstractmethod - async def read(self, action: FileReadAction) -> Observation: + def read(self, action: FileReadAction) -> Observation: pass @abstractmethod - async def write(self, action: FileWriteAction) -> Observation: + def write(self, action: FileWriteAction) -> Observation: pass @abstractmethod - async def browse(self, action: BrowseURLAction) -> Observation: + def browse(self, action: BrowseURLAction) -> Observation: pass @abstractmethod - async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation: + def browse_interactive(self, action: BrowseInteractiveAction) -> Observation: pass # ==================================================================== @@ -199,11 +190,11 @@ class Runtime: # ==================================================================== @abstractmethod - async def copy_to(self, host_src: str, sandbox_dest: str, recursive: bool = False): + def copy_to(self, host_src: str, sandbox_dest: str, recursive: bool = False): raise NotImplementedError('This method is not implemented in the base class.') @abstractmethod - async def list_files(self, path: str | None = None) -> list[str]: + def list_files(self, path: str | None = None) -> list[str]: """List files in the sandbox. If path is None, list files in the sandbox's initial working directory (e.g., /workspace). diff --git a/openhands/server/listen.py b/openhands/server/listen.py index f2e0a4c11f..2b52390358 100644 --- a/openhands/server/listen.py +++ b/openhands/server/listen.py @@ -406,7 +406,7 @@ async def list_files(request: Request, path: str | None = None): content={'error': 'Runtime not yet initialized'}, ) runtime: Runtime = request.state.session.agent_session.runtime - file_list = await runtime.list_files(path) + file_list = runtime.list_files(path) return file_list @@ -440,7 +440,7 @@ async def select_file(file: str, request: Request): ) read_action = FileReadAction(file) - observation = await runtime.run_action(read_action) + observation = runtime.run_action(read_action) if isinstance(observation, FileReadObservation): content = observation.content @@ -519,7 +519,7 @@ async def upload_file(request: Request, files: list[UploadFile]): tmp_file.flush() runtime: Runtime = request.state.session.agent_session.runtime - await runtime.copy_to( + runtime.copy_to( tmp_file_path, runtime.config.workspace_mount_path_in_sandbox ) uploaded_files.append(safe_filename) @@ -686,7 +686,7 @@ async def save_file(request: Request): # Save the file to the agent's runtime file store runtime: Runtime = request.state.session.agent_session.runtime write_action = FileWriteAction(file_path, content) - observation = await runtime.run_action(write_action) + observation = runtime.run_action(write_action) if isinstance(observation, FileWriteObservation): return JSONResponse( diff --git a/openhands/server/session/agent.py b/openhands/server/session/agent.py index 0dab396dbe..7cf244c12f 100644 --- a/openhands/server/session/agent.py +++ b/openhands/server/session/agent.py @@ -69,7 +69,7 @@ class AgentSession: end_state.save_to_session(self.sid, self.file_store) await self.controller.close() if self.runtime is not None: - await self.runtime.close() + self.runtime.close() if self.security_analyzer is not None: await self.security_analyzer.close() self._closed = True @@ -95,7 +95,6 @@ class AgentSession: sid=self.sid, plugins=agent.sandbox_plugins, ) - await self.runtime.ainit() async def _create_controller( self, diff --git a/tests/runtime/conftest.py b/tests/runtime/conftest.py index 99fda5dd59..9b57e14aa4 100644 --- a/tests/runtime/conftest.py +++ b/tests/runtime/conftest.py @@ -1,4 +1,3 @@ -import asyncio import os import time @@ -91,14 +90,14 @@ def base_container_image(request): @pytest.fixture -async def runtime(temp_dir, box_class, run_as_openhands): - runtime = await _load_runtime(temp_dir, box_class, run_as_openhands) +def runtime(temp_dir, box_class, run_as_openhands): + runtime = _load_runtime(temp_dir, box_class, run_as_openhands) yield runtime - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -async def _load_runtime( +def _load_runtime( temp_dir, box_class, run_as_openhands: bool = True, @@ -135,8 +134,7 @@ async def _load_runtime( sid=sid, plugins=plugins, ) - await runtime.ainit() - await asyncio.sleep(1) + time.sleep(1) return runtime diff --git a/tests/runtime/test_bash.py b/tests/runtime/test_bash.py index f8197609ff..34b9a5717b 100644 --- a/tests/runtime/test_bash.py +++ b/tests/runtime/test_bash.py @@ -1,8 +1,8 @@ """Bash-related tests for the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox.""" -import asyncio import os import tempfile +import time import pytest from conftest import _load_runtime @@ -16,15 +16,14 @@ from openhands.events.observation import CmdOutputObservation # ============================================================================================================================ -@pytest.mark.asyncio -async def test_bash_command_pexcept(temp_dir, box_class, run_as_openhands): - runtime = await _load_runtime(temp_dir, box_class, run_as_openhands) +def test_bash_command_pexcept(temp_dir, box_class, run_as_openhands): + runtime = _load_runtime(temp_dir, box_class, run_as_openhands) # We set env var PS1="\u@\h:\w $" # and construct the PEXCEPT prompt base on it. # When run `env`, bad implementation of CmdRunAction will be pexcepted by this # and failed to pexcept the right content, causing it fail to get error code. - obs = await runtime.run_action(CmdRunAction(command='env')) + obs = runtime.run_action(CmdRunAction(command='env')) # For example: # 02:16:13 - openhands:DEBUG: client.py:78 - Executing command: env @@ -43,58 +42,54 @@ async def test_bash_command_pexcept(temp_dir, box_class, run_as_openhands): ), 'The observation should be a CmdOutputObservation.' assert obs.exit_code == 0, 'The exit code should be 0.' - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_single_multiline_command(temp_dir, box_class): - runtime = await _load_runtime(temp_dir, box_class) +def test_single_multiline_command(temp_dir, box_class): + runtime = _load_runtime(temp_dir, box_class) action = CmdRunAction(command='echo \\\n -e "foo"') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0, 'The exit code should be 0.' assert 'foo' in obs.content - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_multiline_echo(temp_dir, box_class): - runtime = await _load_runtime(temp_dir, box_class) +def test_multiline_echo(temp_dir, box_class): + runtime = _load_runtime(temp_dir, box_class) action = CmdRunAction(command='echo -e "hello\nworld"') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0, 'The exit code should be 0.' assert 'hello\r\nworld' in obs.content - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_runtime_whitespace(temp_dir, box_class): - runtime = await _load_runtime(temp_dir, box_class) +def test_runtime_whitespace(temp_dir, box_class): + runtime = _load_runtime(temp_dir, box_class) action = CmdRunAction(command='echo -e "\\n\\n\\n"') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0, 'The exit code should be 0.' assert '\r\n\r\n\r\n' in obs.content - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_multiple_multiline_commands(temp_dir, box_class, run_as_openhands): +def test_multiple_multiline_commands(temp_dir, box_class, run_as_openhands): cmds = [ 'ls -l', 'echo -e "hello\nworld"', @@ -124,11 +119,11 @@ world " ] joined_cmds = '\n'.join(cmds) - runtime = await _load_runtime(temp_dir, box_class, run_as_openhands) + runtime = _load_runtime(temp_dir, box_class, run_as_openhands) action = CmdRunAction(command=joined_cmds) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) @@ -142,32 +137,30 @@ world " assert 'hello\r\nworld\r\nare\r\nyou\r\n\r\nthere?' in obs.content assert 'hello\r\nworld "\r\n' in obs.content - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_no_ps2_in_output(temp_dir, box_class, run_as_openhands): +def test_no_ps2_in_output(temp_dir, box_class, run_as_openhands): """Test that the PS2 sign is not added to the output of a multiline command.""" - runtime = await _load_runtime(temp_dir, box_class, run_as_openhands) + runtime = _load_runtime(temp_dir, box_class, run_as_openhands) action = CmdRunAction(command='echo -e "hello\nworld"') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert 'hello\r\nworld' in obs.content assert '>' not in obs.content - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_multiline_command_loop(temp_dir, box_class): +def test_multiline_command_loop(temp_dir, box_class): # https://github.com/All-Hands-AI/OpenHands/issues/3143 - runtime = await _load_runtime(temp_dir, box_class) + runtime = _load_runtime(temp_dir, box_class) init_cmd = """ mkdir -p _modules && \ @@ -180,7 +173,7 @@ echo "created files" """ action = CmdRunAction(command=init_cmd) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) @@ -196,24 +189,23 @@ echo "success" """ action = CmdRunAction(command=follow_up_cmd) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0, 'The exit code should be 0.' assert 'success' in obs.content - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_cmd_run(temp_dir, box_class, run_as_openhands): - runtime = await _load_runtime(temp_dir, box_class, run_as_openhands) +def test_cmd_run(temp_dir, box_class, run_as_openhands): + runtime = _load_runtime(temp_dir, box_class, run_as_openhands) action = CmdRunAction(command='ls -l') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -221,14 +213,14 @@ async def test_cmd_run(temp_dir, box_class, run_as_openhands): action = CmdRunAction(command='mkdir test') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 action = CmdRunAction(command='ls -l') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -240,14 +232,14 @@ async def test_cmd_run(temp_dir, box_class, run_as_openhands): action = CmdRunAction(command='touch test/foo.txt') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 action = CmdRunAction(command='ls -l test') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -258,22 +250,21 @@ async def test_cmd_run(temp_dir, box_class, run_as_openhands): # owned by root action = CmdRunAction(command='rm -rf test') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_run_as_user_correct_home_dir(temp_dir, box_class, run_as_openhands): - runtime = await _load_runtime(temp_dir, box_class, run_as_openhands) +def test_run_as_user_correct_home_dir(temp_dir, box_class, run_as_openhands): + runtime = _load_runtime(temp_dir, box_class, run_as_openhands) action = CmdRunAction(command='cd ~ && pwd') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -282,70 +273,67 @@ async def test_run_as_user_correct_home_dir(temp_dir, box_class, run_as_openhand else: assert '/root' in obs.content - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_multi_cmd_run_in_single_line(temp_dir, box_class): - runtime = await _load_runtime(temp_dir, box_class) +def test_multi_cmd_run_in_single_line(temp_dir, box_class): + runtime = _load_runtime(temp_dir, box_class) action = CmdRunAction(command='pwd && ls -l') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 assert '/workspace' in obs.content assert 'total 0' in obs.content - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_stateful_cmd(temp_dir, box_class): - runtime = await _load_runtime(temp_dir, box_class) +def test_stateful_cmd(temp_dir, box_class): + runtime = _load_runtime(temp_dir, box_class) action = CmdRunAction(command='mkdir test') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0, 'The exit code should be 0.' action = CmdRunAction(command='cd test') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0, 'The exit code should be 0.' action = CmdRunAction(command='pwd') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0, 'The exit code should be 0.' assert '/workspace/test' in obs.content - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_failed_cmd(temp_dir, box_class): - runtime = await _load_runtime(temp_dir, box_class) +def test_failed_cmd(temp_dir, box_class): + runtime = _load_runtime(temp_dir, box_class) action = CmdRunAction(command='non_existing_command') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code != 0, 'The exit code should not be 0 for a failed command.' - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) def _create_test_file(host_temp_dir): @@ -354,19 +342,16 @@ def _create_test_file(host_temp_dir): f.write('Hello, World!') -@pytest.mark.asyncio -async def test_copy_single_file(temp_dir, box_class): - runtime = await _load_runtime(temp_dir, box_class) +def test_copy_single_file(temp_dir, box_class): + runtime = _load_runtime(temp_dir, box_class) with tempfile.TemporaryDirectory() as host_temp_dir: _create_test_file(host_temp_dir) - await runtime.copy_to( - os.path.join(host_temp_dir, 'test_file.txt'), '/workspace' - ) + runtime.copy_to(os.path.join(host_temp_dir, 'test_file.txt'), '/workspace') action = CmdRunAction(command='ls -alh /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -374,14 +359,14 @@ async def test_copy_single_file(temp_dir, box_class): action = CmdRunAction(command='cat /workspace/test_file.txt') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 assert 'Hello, World!' in obs.content - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) def _create_test_dir_with_files(host_temp_dir): @@ -392,20 +377,19 @@ def _create_test_dir_with_files(host_temp_dir): f.write('File 2 content') -@pytest.mark.asyncio -async def test_copy_directory_recursively(temp_dir, box_class): - runtime = await _load_runtime(temp_dir, box_class) +def test_copy_directory_recursively(temp_dir, box_class): + runtime = _load_runtime(temp_dir, box_class) with tempfile.TemporaryDirectory() as host_temp_dir: # We need a separate directory, since temp_dir is mounted to /workspace _create_test_dir_with_files(host_temp_dir) - await runtime.copy_to( + runtime.copy_to( os.path.join(host_temp_dir, 'test_dir'), '/workspace', recursive=True ) action = CmdRunAction(command='ls -alh /workspace') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -415,7 +399,7 @@ async def test_copy_directory_recursively(temp_dir, box_class): action = CmdRunAction(command='ls -alh /workspace/test_dir') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -424,53 +408,51 @@ async def test_copy_directory_recursively(temp_dir, box_class): action = CmdRunAction(command='cat /workspace/test_dir/file1.txt') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 assert 'File 1 content' in obs.content - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_copy_to_non_existent_directory(temp_dir, box_class): - runtime = await _load_runtime(temp_dir, box_class) +def test_copy_to_non_existent_directory(temp_dir, box_class): + runtime = _load_runtime(temp_dir, box_class) with tempfile.TemporaryDirectory() as host_temp_dir: _create_test_file(host_temp_dir) - await runtime.copy_to( + runtime.copy_to( os.path.join(host_temp_dir, 'test_file.txt'), '/workspace/new_dir' ) action = CmdRunAction(command='cat /workspace/new_dir/test_file.txt') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 assert 'Hello, World!' in obs.content - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_overwrite_existing_file(temp_dir, box_class): - runtime = await _load_runtime(temp_dir, box_class) +def test_overwrite_existing_file(temp_dir, box_class): + runtime = _load_runtime(temp_dir, box_class) # touch a file in /workspace action = CmdRunAction(command='touch /workspace/test_file.txt') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 action = CmdRunAction(command='cat /workspace/test_file.txt') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -478,46 +460,42 @@ async def test_overwrite_existing_file(temp_dir, box_class): with tempfile.TemporaryDirectory() as host_temp_dir: _create_test_file(host_temp_dir) - await runtime.copy_to( - os.path.join(host_temp_dir, 'test_file.txt'), '/workspace' - ) + runtime.copy_to(os.path.join(host_temp_dir, 'test_file.txt'), '/workspace') action = CmdRunAction(command='cat /workspace/test_file.txt') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 assert 'Hello, World!' in obs.content - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_copy_non_existent_file(temp_dir, box_class): - runtime = await _load_runtime(temp_dir, box_class) +def test_copy_non_existent_file(temp_dir, box_class): + runtime = _load_runtime(temp_dir, box_class) with pytest.raises(FileNotFoundError): - await runtime.copy_to( + runtime.copy_to( os.path.join(temp_dir, 'non_existent_file.txt'), '/workspace/should_not_exist.txt', ) action = CmdRunAction(command='ls /workspace/should_not_exist.txt') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code != 0 # File should not exist - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_keep_prompt(box_class, temp_dir): - runtime = await _load_runtime( +def test_keep_prompt(box_class, temp_dir): + runtime = _load_runtime( temp_dir, box_class=box_class, run_as_openhands=False, @@ -525,7 +503,7 @@ async def test_keep_prompt(box_class, temp_dir): action = CmdRunAction(command='touch /workspace/test_file.txt') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -533,22 +511,21 @@ async def test_keep_prompt(box_class, temp_dir): action = CmdRunAction(command='cat /workspace/test_file.txt', keep_prompt=False) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 assert 'root@' not in obs.content - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_git_operation(box_class): +def test_git_operation(box_class): # do not mount workspace, since workspace mount by tests will be owned by root # while the user_id we get via os.getuid() is different from root # which causes permission issues - runtime = await _load_runtime( + runtime = _load_runtime( temp_dir=None, box_class=box_class, # Need to use non-root user to expose issues @@ -561,7 +538,7 @@ async def test_git_operation(box_class): # check the ownership of the current directory action = CmdRunAction(command='ls -alh .') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -581,7 +558,7 @@ async def test_git_operation(box_class): # make sure all git operations are allowed action = CmdRunAction(command='git init') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -589,7 +566,7 @@ async def test_git_operation(box_class): # create a file action = CmdRunAction(command='echo "hello" > test_file.txt') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -597,7 +574,7 @@ async def test_git_operation(box_class): # git add action = CmdRunAction(command='git add test_file.txt') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -605,7 +582,7 @@ async def test_git_operation(box_class): # git diff action = CmdRunAction(command='git diff') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -613,12 +590,12 @@ async def test_git_operation(box_class): # git commit action = CmdRunAction(command='git commit -m "test commit"') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 - await runtime.close() + runtime.close() - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) diff --git a/tests/runtime/test_browsing.py b/tests/runtime/test_browsing.py index b8b3a3d05f..5051b15a24 100644 --- a/tests/runtime/test_browsing.py +++ b/tests/runtime/test_browsing.py @@ -1,9 +1,8 @@ """Browsing-related tests for the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox.""" -import asyncio import json +import time -import pytest from conftest import _load_runtime from openhands.core.logger import openhands_logger as logger @@ -24,16 +23,15 @@ from openhands.events.observation import ( PY3_FOR_TESTING = '/openhands/miniforge3/bin/mamba run -n base python3' -@pytest.mark.asyncio -async def test_simple_browse(temp_dir, box_class, run_as_openhands): - runtime = await _load_runtime(temp_dir, box_class, run_as_openhands) +def test_simple_browse(temp_dir, box_class, run_as_openhands): + runtime = _load_runtime(temp_dir, box_class, run_as_openhands) # Test browse action_cmd = CmdRunAction( command=f'{PY3_FOR_TESTING} -m http.server 8000 > server.log 2>&1 &' ) logger.info(action_cmd, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action_cmd) + obs = runtime.run_action(action_cmd) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) @@ -42,13 +40,13 @@ async def test_simple_browse(temp_dir, box_class, run_as_openhands): action_cmd = CmdRunAction(command='sleep 5 && cat server.log') logger.info(action_cmd, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action_cmd) + obs = runtime.run_action(action_cmd) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 action_browse = BrowseURLAction(url='http://localhost:8000') logger.info(action_browse, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action_browse) + obs = runtime.run_action(action_browse) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, BrowserOutputObservation) @@ -64,17 +62,16 @@ async def test_simple_browse(temp_dir, box_class, run_as_openhands): # clean up action = CmdRunAction(command='rm -rf server.log') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_browsergym_eval_env(box_class, temp_dir): - runtime = await _load_runtime( +def test_browsergym_eval_env(box_class, temp_dir): + runtime = _load_runtime( temp_dir, box_class=box_class, run_as_openhands=False, # need root permission to access file @@ -89,7 +86,7 @@ async def test_browsergym_eval_env(box_class, temp_dir): # Test browse action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, BrowserOutputObservation) @@ -100,7 +97,7 @@ async def test_browsergym_eval_env(box_class, temp_dir): # Make sure the browser can produce observation in eva[l action = BrowseInteractiveAction(browser_actions='noop()') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert ( obs.url.strip() @@ -110,9 +107,9 @@ async def test_browsergym_eval_env(box_class, temp_dir): # Make sure the rewards are working action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert json.loads(obs.content) == [0.0] - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) diff --git a/tests/runtime/test_env_vars.py b/tests/runtime/test_env_vars.py index 40ed4a5641..74ba15284f 100644 --- a/tests/runtime/test_env_vars.py +++ b/tests/runtime/test_env_vars.py @@ -1,10 +1,9 @@ """Env vars related tests for the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox.""" -import asyncio import os +import time from unittest.mock import patch -import pytest from conftest import _load_runtime from openhands.events.action import CmdRunAction @@ -15,17 +14,14 @@ from openhands.events.observation import CmdOutputObservation # ============================================================================================================================ -@pytest.mark.asyncio -async def test_env_vars_os_environ(temp_dir, box_class, run_as_openhands): +def test_env_vars_os_environ(temp_dir, box_class, run_as_openhands): with patch.dict(os.environ, {'SANDBOX_ENV_FOOBAR': 'BAZ'}): - runtime = await _load_runtime(temp_dir, box_class, run_as_openhands) + runtime = _load_runtime(temp_dir, box_class, run_as_openhands) - obs: CmdOutputObservation = await runtime.run_action( - CmdRunAction(command='env') - ) + obs: CmdOutputObservation = runtime.run_action(CmdRunAction(command='env')) print(obs) - obs: CmdOutputObservation = await runtime.run_action( + obs: CmdOutputObservation = runtime.run_action( CmdRunAction(command='echo $FOOBAR') ) print(obs) @@ -34,55 +30,50 @@ async def test_env_vars_os_environ(temp_dir, box_class, run_as_openhands): obs.content.strip().split('\n\r')[0].strip() == 'BAZ' ), f'Output: [{obs.content}] for {box_class}' - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_env_vars_runtime_add_env_vars(temp_dir, box_class): - runtime = await _load_runtime(temp_dir, box_class) - await runtime.add_env_vars({'QUUX': 'abc"def'}) +def test_env_vars_runtime_add_env_vars(temp_dir, box_class): + runtime = _load_runtime(temp_dir, box_class) + runtime.add_env_vars({'QUUX': 'abc"def'}) - obs: CmdOutputObservation = await runtime.run_action( - CmdRunAction(command='echo $QUUX') - ) + obs: CmdOutputObservation = runtime.run_action(CmdRunAction(command='echo $QUUX')) print(obs) assert obs.exit_code == 0, 'The exit code should be 0.' assert ( obs.content.strip().split('\r\n')[0].strip() == 'abc"def' ), f'Output: [{obs.content}] for {box_class}' - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_env_vars_runtime_add_empty_dict(temp_dir, box_class): - runtime = await _load_runtime(temp_dir, box_class) +def test_env_vars_runtime_add_empty_dict(temp_dir, box_class): + runtime = _load_runtime(temp_dir, box_class) - prev_obs = await runtime.run_action(CmdRunAction(command='env')) + prev_obs = runtime.run_action(CmdRunAction(command='env')) assert prev_obs.exit_code == 0, 'The exit code should be 0.' print(prev_obs) - await runtime.add_env_vars({}) + runtime.add_env_vars({}) - obs = await runtime.run_action(CmdRunAction(command='env')) + obs = runtime.run_action(CmdRunAction(command='env')) assert obs.exit_code == 0, 'The exit code should be 0.' print(obs) assert ( obs.content == prev_obs.content ), 'The env var content should be the same after adding an empty dict.' - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_env_vars_runtime_add_multiple_env_vars(temp_dir, box_class): - runtime = await _load_runtime(temp_dir, box_class) - await runtime.add_env_vars({'QUUX': 'abc"def', 'FOOBAR': 'xyz'}) +def test_env_vars_runtime_add_multiple_env_vars(temp_dir, box_class): + runtime = _load_runtime(temp_dir, box_class) + runtime.add_env_vars({'QUUX': 'abc"def', 'FOOBAR': 'xyz'}) - obs: CmdOutputObservation = await runtime.run_action( + obs: CmdOutputObservation = runtime.run_action( CmdRunAction(command='echo $QUUX $FOOBAR') ) print(obs) @@ -91,17 +82,16 @@ async def test_env_vars_runtime_add_multiple_env_vars(temp_dir, box_class): obs.content.strip().split('\r\n')[0].strip() == 'abc"def xyz' ), f'Output: [{obs.content}] for {box_class}' - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_env_vars_runtime_add_env_vars_overwrite(temp_dir, box_class): +def test_env_vars_runtime_add_env_vars_overwrite(temp_dir, box_class): with patch.dict(os.environ, {'SANDBOX_ENV_FOOBAR': 'BAZ'}): - runtime = await _load_runtime(temp_dir, box_class) - await runtime.add_env_vars({'FOOBAR': 'xyz'}) + runtime = _load_runtime(temp_dir, box_class) + runtime.add_env_vars({'FOOBAR': 'xyz'}) - obs: CmdOutputObservation = await runtime.run_action( + obs: CmdOutputObservation = runtime.run_action( CmdRunAction(command='echo $FOOBAR') ) print(obs) @@ -110,5 +100,5 @@ async def test_env_vars_runtime_add_env_vars_overwrite(temp_dir, box_class): obs.content.strip().split('\r\n')[0].strip() == 'xyz' ), f'Output: [{obs.content}] for {box_class}' - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) diff --git a/tests/runtime/test_images.py b/tests/runtime/test_images.py index 24d0aa664b..294c9b2b2c 100644 --- a/tests/runtime/test_images.py +++ b/tests/runtime/test_images.py @@ -1,6 +1,6 @@ """Image-related tests for the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox.""" -import asyncio +import time import pytest from conftest import _load_runtime @@ -13,8 +13,7 @@ from openhands.events.action import CmdRunAction # ============================================================================================================================ -@pytest.mark.asyncio -async def test_bash_python_version(temp_dir, box_class, base_container_image): +def test_bash_python_version(temp_dir, box_class, base_container_image): """Make sure Python is available in bash.""" if base_container_image not in [ 'python:3.11-bookworm', @@ -22,36 +21,35 @@ async def test_bash_python_version(temp_dir, box_class, base_container_image): ]: pytest.skip('This test is only for python-related images') - runtime = await _load_runtime( + runtime = _load_runtime( temp_dir, box_class, base_container_image=base_container_image ) action = CmdRunAction(command='which python') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 action = CmdRunAction(command='python --version') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 assert 'Python 3.11' in obs.content # Check for specific version action = CmdRunAction(command='pip --version') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 assert 'pip' in obs.content # Check that pip is available - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_nodejs_22_version(temp_dir, box_class, base_container_image): +def test_nodejs_22_version(temp_dir, box_class, base_container_image): """Make sure Node.js is available in bash.""" if base_container_image not in [ 'node:22-bookworm', @@ -59,39 +57,38 @@ async def test_nodejs_22_version(temp_dir, box_class, base_container_image): ]: pytest.skip('This test is only for nodejs-related images') - runtime = await _load_runtime( + runtime = _load_runtime( temp_dir, box_class, base_container_image=base_container_image ) action = CmdRunAction(command='node --version') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 assert 'v22' in obs.content # Check for specific version - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_go_version(temp_dir, box_class, base_container_image): +def test_go_version(temp_dir, box_class, base_container_image): """Make sure Go is available in bash.""" if base_container_image not in [ 'golang:1.23-bookworm', ]: pytest.skip('This test is only for go-related images') - runtime = await _load_runtime( + runtime = _load_runtime( temp_dir, box_class, base_container_image=base_container_image ) action = CmdRunAction(command='go version') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 assert 'go1.23' in obs.content # Check for specific version - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) diff --git a/tests/runtime/test_ipython.py b/tests/runtime/test_ipython.py index 7cefcba87a..656eba63ba 100644 --- a/tests/runtime/test_ipython.py +++ b/tests/runtime/test_ipython.py @@ -1,8 +1,7 @@ """Test the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox.""" -import asyncio +import time -import pytest from conftest import _load_runtime from openhands.core.logger import openhands_logger as logger @@ -26,14 +25,13 @@ from openhands.runtime.client.runtime import EventStreamRuntime # ============================================================================================================================ -@pytest.mark.asyncio -async def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhands): - runtime = await _load_runtime(temp_dir, box_class, run_as_openhands) +def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhands): + runtime = _load_runtime(temp_dir, box_class, run_as_openhands) # Test run command action_cmd = CmdRunAction(command='ls -l') logger.info(action_cmd, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action_cmd) + obs = runtime.run_action(action_cmd) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) @@ -44,7 +42,7 @@ async def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhan test_code = "print('Hello, `World`!\\n')" action_ipython = IPythonRunCellAction(code=test_code) logger.info(action_ipython, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action_ipython) + obs = runtime.run_action(action_ipython) assert isinstance(obs, IPythonRunCellObservation) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -57,7 +55,7 @@ async def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhan # Test read file (file should not exist) action_read = FileReadAction(path='hello.sh') logger.info(action_read, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action_read) + obs = runtime.run_action(action_read) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, ErrorObservation) assert 'File not found' in obs.content @@ -65,7 +63,7 @@ async def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhan # Test write file action_write = FileWriteAction(content='echo "Hello, World!"', path='hello.sh') logger.info(action_write, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action_write) + obs = runtime.run_action(action_write) assert isinstance(obs, FileWriteObservation) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -76,7 +74,7 @@ async def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhan # Test read file (file should exist) action_read = FileReadAction(path='hello.sh') logger.info(action_read, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action_read) + obs = runtime.run_action(action_read) assert isinstance( obs, FileReadObservation ), 'The observation should be a FileReadObservation.' @@ -88,24 +86,23 @@ async def test_simple_cmd_ipython_and_fileop(temp_dir, box_class, run_as_openhan # clean up action = CmdRunAction(command='rm -rf hello.sh') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_ipython_multi_user(temp_dir, box_class, run_as_openhands): - runtime = await _load_runtime(temp_dir, box_class, run_as_openhands) +def test_ipython_multi_user(temp_dir, box_class, run_as_openhands): + runtime = _load_runtime(temp_dir, box_class, run_as_openhands) # Test run ipython # get username test_code = "import os; print(os.environ['USER'])" action_ipython = IPythonRunCellAction(code=test_code) logger.info(action_ipython, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action_ipython) + obs = runtime.run_action(action_ipython) assert isinstance(obs, IPythonRunCellObservation) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -118,7 +115,7 @@ async def test_ipython_multi_user(temp_dir, box_class, run_as_openhands): test_code = 'import os; print(os.getcwd())' action_ipython = IPythonRunCellAction(code=test_code) logger.info(action_ipython, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action_ipython) + obs = runtime.run_action(action_ipython) assert isinstance(obs, IPythonRunCellObservation) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert ( @@ -134,7 +131,7 @@ async def test_ipython_multi_user(temp_dir, box_class, run_as_openhands): test_code = "with open('test.txt', 'w') as f: f.write('Hello, world!')" action_ipython = IPythonRunCellAction(code=test_code) logger.info(action_ipython, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action_ipython) + obs = runtime.run_action(action_ipython) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, IPythonRunCellObservation) assert ( @@ -149,7 +146,7 @@ async def test_ipython_multi_user(temp_dir, box_class, run_as_openhands): # check file owner via bash action = CmdRunAction(command='ls -alh test.txt') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 if run_as_openhands: @@ -163,24 +160,23 @@ async def test_ipython_multi_user(temp_dir, box_class, run_as_openhands): # clean up action = CmdRunAction(command='rm -rf test') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_ipython_simple(temp_dir, box_class): - runtime = await _load_runtime(temp_dir, box_class) +def test_ipython_simple(temp_dir, box_class): + runtime = _load_runtime(temp_dir, box_class) # Test run ipython # get username test_code = 'print(1)' action_ipython = IPythonRunCellAction(code=test_code) logger.info(action_ipython, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action_ipython) + obs = runtime.run_action(action_ipython) assert isinstance(obs, IPythonRunCellObservation) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert ( @@ -192,30 +188,30 @@ async def test_ipython_simple(temp_dir, box_class): ).strip() ) - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -async def _test_ipython_agentskills_fileop_pwd_impl( +def _test_ipython_agentskills_fileop_pwd_impl( runtime: EventStreamRuntime, enable_auto_lint: bool ): # remove everything in /workspace action = CmdRunAction(command='rm -rf /workspace/*') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 action = CmdRunAction(command='mkdir test') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 action = IPythonRunCellAction(code="create_file('hello.py')") logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, IPythonRunCellObservation) assert obs.content.replace('\r\n', '\n').strip().split('\n') == ( @@ -230,7 +226,7 @@ async def _test_ipython_agentskills_fileop_pwd_impl( action = CmdRunAction(command='cd test') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -239,7 +235,7 @@ async def _test_ipython_agentskills_fileop_pwd_impl( # i.e., /workspace/test/hello.py instead of /workspace/hello.py action = IPythonRunCellAction(code="create_file('hello.py')") logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, IPythonRunCellObservation) assert obs.content.replace('\r\n', '\n').strip().split('\n') == ( @@ -258,7 +254,7 @@ async def _test_ipython_agentskills_fileop_pwd_impl( code="insert_content_at_line('hello.py', 1, ' print(\"hello world\")')" ) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, IPythonRunCellObservation) assert obs.content.replace('\r\n', '\n').strip().split('\n') == ( @@ -292,7 +288,7 @@ DO NOT re-run the same failed edit command. Running it again will lead to the sa code="insert_content_at_line('hello.py', 1, 'print(\"hello world\")')" ) logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, IPythonRunCellObservation) assert obs.content.replace('\r\n', '\n').strip().split('\n') == ( @@ -309,38 +305,36 @@ DO NOT re-run the same failed edit command. Running it again will lead to the sa action = CmdRunAction(command='rm -rf /workspace/*') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_ipython_agentskills_fileop_pwd( +def test_ipython_agentskills_fileop_pwd( temp_dir, box_class, run_as_openhands, enable_auto_lint ): """Make sure that cd in bash also update the current working directory in ipython.""" - runtime = await _load_runtime( + runtime = _load_runtime( temp_dir, box_class, run_as_openhands, enable_auto_lint=enable_auto_lint ) - await _test_ipython_agentskills_fileop_pwd_impl(runtime, enable_auto_lint) + _test_ipython_agentskills_fileop_pwd_impl(runtime, enable_auto_lint) - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_ipython_agentskills_fileop_pwd_with_userdir(temp_dir, box_class): +def test_ipython_agentskills_fileop_pwd_with_userdir(temp_dir, box_class): """Make sure that cd in bash also update the current working directory in ipython. Handle special case where the pwd is provided as "~", which should be expanded using os.path.expanduser on the client side. """ - runtime = await _load_runtime( + runtime = _load_runtime( temp_dir, box_class, run_as_openhands=False, @@ -348,20 +342,20 @@ async def test_ipython_agentskills_fileop_pwd_with_userdir(temp_dir, box_class): action = CmdRunAction(command='cd ~') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 action = CmdRunAction(command='mkdir test && ls -la') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 action = IPythonRunCellAction(code="create_file('hello.py')") logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, IPythonRunCellObservation) assert obs.content.replace('\r\n', '\n').strip().split('\n') == ( @@ -376,7 +370,7 @@ async def test_ipython_agentskills_fileop_pwd_with_userdir(temp_dir, box_class): action = CmdRunAction(command='cd test') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 @@ -385,7 +379,7 @@ async def test_ipython_agentskills_fileop_pwd_with_userdir(temp_dir, box_class): # i.e., /workspace/test/hello.py instead of /workspace/hello.py action = IPythonRunCellAction(code="create_file('hello.py')") logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, IPythonRunCellObservation) assert obs.content.replace('\r\n', '\n').strip().split('\n') == ( @@ -398,26 +392,25 @@ async def test_ipython_agentskills_fileop_pwd_with_userdir(temp_dir, box_class): '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]' ).strip().split('\n') - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1) -@pytest.mark.asyncio -async def test_ipython_package_install(temp_dir, box_class, run_as_openhands): +def test_ipython_package_install(temp_dir, box_class, run_as_openhands): """Make sure that cd in bash also update the current working directory in ipython.""" - runtime = await _load_runtime(temp_dir, box_class, run_as_openhands) + runtime = _load_runtime(temp_dir, box_class, run_as_openhands) # It should error out since pymsgbox is not installed action = IPythonRunCellAction(code='import pymsgbox') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert "ModuleNotFoundError: No module named 'pymsgbox'" in obs.content # Install pymsgbox in Jupyter action = IPythonRunCellAction(code='%pip install pymsgbox==1.0.9') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert ( 'Successfully installed pymsgbox-1.0.9' in obs.content @@ -426,7 +419,7 @@ async def test_ipython_package_install(temp_dir, box_class, run_as_openhands): action = IPythonRunCellAction(code='import pymsgbox') logger.info(action, extra={'msg_type': 'ACTION'}) - obs = await runtime.run_action(action) + obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) # import should not error out assert obs.content.strip() == ( @@ -435,5 +428,5 @@ async def test_ipython_package_install(temp_dir, box_class, run_as_openhands): '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]' ) - await runtime.close() - await asyncio.sleep(1) + runtime.close() + time.sleep(1)