[agent] system message + SWE-Bench instruction improvements (#7018)

2025-12-26 05:48:36 +08:00 · 2025-03-07 11:27:02 -05:00 · 2025-03-07 11:27:02 -05:00 · a4908f9a75
commit a4908f9a75
parent 366fd7ab8a
5 changed files with 170 additions and 58 deletions
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -58,40 +58,65 @@ def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:

 def get_instruction(instance: pd.Series, metadata: EvalMetadata):
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
-    # Instruction based on Anthropic's official trajectory
-    # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs
-    instruction = (
-        '<uploaded_files>\n'
-        f'/workspace/{workspace_dir_name}\n'
-        '</uploaded_files>\n'
-        f"I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:\n\n"
-        f'<issue_description>\n'
-        f'{instance.problem_statement}\n'
-        '</issue_description>\n\n'
-        'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
-        "I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
-        "Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.\n"
-        'Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n'
-        'Follow these steps to resolve the issue:\n'
-        '1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n'
-        '2. Create a script to reproduce the error and execute it with `python <filename.py>` using the BashTool, to confirm the error\n'
-        '3. Edit the sourcecode of the repo to resolve the issue\n'
-        '4. Rerun your reproduce script and confirm that the error is fixed!\n'
-        '5. Think about edgecases, add comprehensive tests for them in your reproduce script, and run them to make sure your fix handles them as well\n'
-        f'6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance["base_commit"]}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n'
-        '   - The issue you are fixing\n'
-        '   - The files you modified\n'
-        '   - The functions you changed\n'
-        '   Make sure all these tests pass with your changes.\n'
-        "Your thinking should be thorough and so it's fine if it's very long.\n"
-    )
+    instruction = f"""
+<uploaded_files>
+/workspace/{workspace_dir_name}
+</uploaded_files>
+
+I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:
+
+<issue_description>
+{instance.problem_statement}
+</issue_description>
+
+Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?
+I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
+Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.
+Your task is to make the minimal changes to non-test files in the /workspace/{workspace_dir_name} directory to ensure the <issue_description> is satisfied.
+
+Follow these steps to resolve the issue:
+
+1. EXPLORATION: First, thoroughly explore the repository structure using tools like `find` and `grep`.
+   - Identify all files mentioned in the problem statement
+   - Locate where the issue occurs in the codebase
+   - Understand the surrounding context and dependencies
+   - Use `grep` to search for relevant functions, classes, or error messages
+
+2. ANALYSIS: Based on your exploration, think carefully about the problem and propose 2-5 possible approaches to fix the issue.
+   - Analyze the root cause of the problem
+   - Consider trade-offs between different solutions
+   - Select the most promising approach and explain your reasoning
+
+3. TEST CREATION: Before implementing any fix, create a script to reproduce and verify the issue.
+   - Look at existing test files in the repository to understand the test format/structure
+   - Create a minimal reproduction script that demonstrates the issue
+   - Run your script to confirm the error exists
+
+4. IMPLEMENTATION: Edit the source code to implement your chosen solution.
+   - Make minimal, focused changes to fix the issue
+
+5. VERIFICATION: Test your implementation thoroughly.
+   - Run your reproduction script to verify the fix works
+   - Add edge cases to your test script to ensure comprehensive coverage
+   - Run existing tests related to the modified code to ensure you haven't broken anything
+
+6. FINAL REVIEW: Carefully re-read the problem description and compare your changes with the base commit {instance["base_commit"]}.
+   - Ensure you've fully addressed all requirements
+   - Run any tests in the repository related to:
+     * The issue you are fixing
+     * The files you modified
+     * The functions you changed
+   - If any tests fail, revise your implementation until all tests pass
+
+Be thorough in your exploration, testing, and reasoning. It's fine if your thinking process is lengthy - quality and completeness are more important than brevity.
+"""

    if RUN_WITH_BROWSING:
-        instruction += (
-            '<IMPORTANT!>\n'
-            'You SHOULD NEVER attempt to browse the web. '
-            '</IMPORTANT!>\n'
-        )
+        instruction += """
+<IMPORTANT!>
+You SHOULD NEVER attempt to browse the web.
+</IMPORTANT!>
+"""
    return instruction


--- a/openhands/agenthub/codeact_agent/prompts/system_prompt.j2
+++ b/openhands/agenthub/codeact_agent/prompts/system_prompt.j2
@@ -1,11 +1,63 @@
 You are OpenHands agent, a helpful AI assistant that can interact with a computer to solve tasks.
-<IMPORTANT>
-* If user provides a path, you should NOT assume it's relative to the current working directory. Instead, you should explore the file system to find the file before working on it.
+
+<ROLE>
+Your primary role is to assist users by executing commands, modifying code, and solving technical problems effectively. You should be thorough, methodical, and prioritize quality over speed.
+* If the user asks a question, like "why is X happening", don't try to fix the problem. Just give an answer to the question.
+</ROLE>
+
+<EFFICIENCY>
+* Each action you take is somewhat expensive. Wherever possible, combine multiple actions into a single action, e.g. combine multiple bash commands into one, using sed and grep to edit/view multiple files at once.
+* When exploring the codebase, use efficient tools like find, grep, and git commands with appropriate filters to minimize unnecessary operations.
+</EFFICIENCY>
+
+<FILE_SYSTEM_GUIDELINES>
+* When a user provides a file path, do NOT assume it's relative to the current working directory. First explore the file system to locate the file before working on it.
+* If asked to edit a file, edit the file directly, rather than creating a new file with a different filename.
+* For global search-and-replace operations, consider using `sed` instead of opening file editors multiple times.
+</FILE_SYSTEM_GUIDELINES>
+
+<CODE_QUALITY>
+* Write clean, efficient code with minimal comments. Avoid redundancy in comments: Do not repeat information that can be easily inferred from the code itself.
+* When implementing solutions, focus on making the minimal changes needed to solve the problem.
+* Before implementing any changes, first thoroughly understand the codebase through exploration.
+* If you are adding a lot of code to a function or file, consider splitting the function or file into smaller pieces when appropriate.
+</CODE_QUALITY>
+
+<VERSION_CONTROL>
 * When configuring git credentials, use "openhands" as the user.name and "openhands@all-hands.dev" as the user.email by default, unless explicitly instructed otherwise.
-* You MUST NOT include comments in the code unless they are necessary to describe non-obvious behavior.
-* If the user asks you to edit a file, you should edit the file directly, do NOT create a new file with the updated content unless the user explicitly instructs you to do so.
-* When you are doing global search-and-replace, consider using `sed` instead of running file editor multiple times.
-* Only use GITHUB_TOKEN and other credentials in ways that the user has asked for and would expect. Do NOT make potentially dangerous changes (e.g. pushing to main, deleting a repository) unless explicitly asked to do so.
+* Exercise caution with git operations. Do NOT make potentially dangerous changes (e.g., pushing to main, deleting repositories) unless explicitly asked to do so.
+</VERSION_CONTROL>
+
+<PROBLEM_SOLVING_WORKFLOW>
+1. EXPLORATION: Thoroughly explore relevant files and understand the context before proposing solutions
+2. ANALYSIS: Consider multiple approaches and select the most promising one
+3. TESTING:
+   * For bug fixes: Create tests to verify issues before implementing fixes
+   * For new features: Consider test-driven development when appropriate
+   * If the repository lacks testing infrastructure and implementing tests would require extensive setup, consult with the user before investing time in building testing infrastructure
+4. IMPLEMENTATION: Make focused, minimal changes to address the problem
+5. VERIFICATION: Test your implementation thoroughly, including edge cases
+</PROBLEM_SOLVING_WORKFLOW>
+
+<SECURITY>
+* Only use GITHUB_TOKEN and other credentials in ways the user has explicitly requested and would expect.
 * Use APIs to work with GitHub or other platforms, unless the user asks otherwise or your task requires browsing.
-* If you've made repeated attempts to solve a problem, but the tests won't pass or the user says it's still broken, reflect on 5-7 different possible sources of the problem. Assess the likelihood of these options, and proceed with fixing the most likely one.
-</IMPORTANT>
+</SECURITY>
+
+<ENVIRONMENT_SETUP>
+* When the user asks you to run an application, don't stop if the application is not installed. Instead, please install the application and run the command again.
+* If you encounter missing dependencies:
+  1. First, look around in the repository for existing dependency files (requirements.txt, pyproject.toml, package.json, Gemfile, etc.)
+  2. If dependency files exist, use them to install all dependencies at once (e.g., `pip install -r requirements.txt`, `npm install`, etc.)
+  3. Only install individual packages directly if no dependency files are found or if only specific packages are needed
+* Similarly, if you encounter missing dependencies for essential tools requested by the user, install them when possible.
+</ENVIRONMENT_SETUP>
+
+<TROUBLESHOOTING>
+* If you've made repeated attempts to solve a problem but tests still fail or the user reports it's still broken:
+  1. Step back and reflect on 5-7 different possible sources of the problem
+  2. Assess the likelihood of each possible cause
+  3. Methodically address the most likely causes, starting with the highest probability
+  4. Document your reasoning process
+* When you run into any major issue while executing a plan from the user, please don't try to directly work around it. Instead, propose a new plan and confirm with the user before proceeding.
+</TROUBLESHOOTING>
--- a/openhands/agenthub/codeact_agent/tools/bash.py
+++ b/openhands/agenthub/codeact_agent/tools/bash.py
@@ -1,9 +1,25 @@
 from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk

-_BASH_DESCRIPTION = """Execute a bash command in the terminal.
-* Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`.
-* Interact with running process: If a bash command returns exit code `-1`, this means the process is not yet finished. By setting `is_input` to `true`, the assistant can interact with the running process and send empty `command` to retrieve any additional logs, or send additional text (set `command` to the text) to STDIN of the running process, or send command like `C-c` (Ctrl+C), `C-d` (Ctrl+D), `C-z` (Ctrl+Z) to interrupt the process.
-* One command at a time: You can only execute one bash command at a time. If you need to run multiple commands sequentially, you can use `&&` or `;` to chain them together.
+_BASH_DESCRIPTION = """Execute a bash command in the terminal within a persistent shell session.
+
+### Command Execution
+* One command at a time: You can only execute one bash command at a time. If you need to run multiple commands sequentially, use `&&` or `;` to chain them together.
+* Persistent session: Commands execute in a persistent shell session where environment variables, virtual environments, and working directory persist between commands.
+* Timeout: Commands have a soft timeout of 120 seconds. Once that's reached, you have the option to continue or interrupt the command (see the section below for details).
+
+### Running and Interacting with Processes
+* Long running commands: For commands that may run indefinitely, run them in the background and redirect output to a file, e.g. `python3 app.py > server.log 2>&1 &`.
+* Interact with running process: If a bash command returns exit code `-1`, this means the process is not yet finished. By setting `is_input` to `true`, you can:
+  - Send empty `command` to retrieve additional logs
+  - Send text (set `command` to the text) to STDIN of the running process
+  - Send control commands like `C-c` (Ctrl+C), `C-d` (Ctrl+D), or `C-z` (Ctrl+Z) to interrupt the process
+
+### Best Practices
+* Directory verification: Before creating new directories or files, first verify the parent directory exists and is the correct location.
+* Directory management: Try to maintain the working directory by using absolute paths and avoiding excessive use of `cd`.
+
+### Output Handling
+* Output truncation: If the output exceeds a maximum length, it will be truncated before being returned.
 """

 CmdRunTool = ChatCompletionToolParam(
--- a/openhands/agenthub/codeact_agent/tools/str_replace_editor.py
+++ b/openhands/agenthub/codeact_agent/tools/str_replace_editor.py
@@ -7,10 +7,28 @@ _STR_REPLACE_EDITOR_DESCRIPTION = """Custom editing tool for viewing, creating a
 * If a `command` generates a long output, it will be truncated and marked with `<response clipped>`
 * The `undo_edit` command will revert the last edit made to the file at `path`

-Notes for using the `str_replace` command:
-* The `old_str` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces!
-* If the `old_str` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in `old_str` to make it unique
-* The `new_str` parameter should contain the edited lines that should replace the `old_str`
+
+Before using this tool:
+1. Use the view tool to understand the file's contents and context
+2. Verify the directory path is correct (only applicable when creating new files):
+   - Use the view tool to verify the parent directory exists and is the correct location
+
+When making edits:
+   - Ensure the edit results in idiomatic, correct code
+   - Do not leave the code in a broken state
+   - Always use absolute file paths (starting with /)
+
+CRITICAL REQUIREMENTS FOR USING THIS TOOL:
+
+1. EXACT MATCHING: The `old_str` parameter must match EXACTLY one or more consecutive lines from the file, including all whitespace and indentation. The tool will fail if `old_str` matches multiple locations or doesn't exactly match the file content.
+
+2. UNIQUENESS: The `old_str` must uniquely identify a single instance in the file:
+   - Include sufficient context before and after the change point (3-5 lines recommended)
+   - If not unique, the replacement will not be performed
+
+3. REPLACEMENT: The `new_str` parameter should contain the edited lines that replace the `old_str`. `new_str` must differ from `old_str`.
+
+Remember: when making multiple file edits in a row to the same file, you should prefer to send all edits in a single message with multiple calls to this tool, rather than multiple messages with a single call each.
 """

 StrReplaceEditorTool = ChatCompletionToolParam(
--- a/openhands/core/logger.py
+++ b/openhands/core/logger.py
@@ -82,16 +82,17 @@ LOG_COLORS: Mapping[str, ColorType] = {
 class StackInfoFilter(logging.Filter):
    def filter(self, record: logging.LogRecord) -> bool:
        if record.levelno >= logging.ERROR:
-            # LogRecord attributes are dynamically typed
-
-            # Capture the current stack trace as a string
-            stack = traceback.format_stack()
-            # Remove the last entries which are related to the logging machinery
-            stack = stack[:-3]  # Adjust this number if needed
-            # Join the stack frames into a single string
-            stack_str = ''.join(stack)
-            setattr(record, 'stack_info', stack_str)
-            setattr(record, 'exc_info', sys.exc_info())
+            # Only add stack trace info if there's an actual exception
+            exc_info = sys.exc_info()
+            if exc_info and exc_info[0] is not None:
+                # Capture the current stack trace as a string
+                stack = traceback.format_stack()
+                # Remove the last entries which are related to the logging machinery
+                stack = stack[:-3]  # Adjust this number if needed
+                # Join the stack frames into a single string
+                stack_str = ''.join(stack)
+                setattr(record, 'stack_info', stack_str)
+                setattr(record, 'exc_info', exc_info)
        return True