Add task tracking tool for long-horizon tasks (#10166)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>
2026-03-22 13:47:19 +08:00 · 2025-08-16 20:05:59 +07:00
parent 0ec6ed20cb
commit fe486ad1f1
32 changed files with 1017 additions and 39 deletions
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -21,6 +21,9 @@ from openhands.agenthub.codeact_agent.tools.llm_based_edit import LLMBasedFileEd
 from openhands.agenthub.codeact_agent.tools.str_replace_editor import (
    create_str_replace_editor_tool,
 )
+from openhands.agenthub.codeact_agent.tools.task_tracker import (
+    create_task_tracker_tool,
+)
 from openhands.agenthub.codeact_agent.tools.think import ThinkTool
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
@@ -98,7 +101,7 @@ class CodeActAgent(Agent):
        if self._prompt_manager is None:
            self._prompt_manager = PromptManager(
                prompt_dir=os.path.join(os.path.dirname(__file__), 'prompts'),
-                system_prompt_filename=self.config.system_prompt_filename,
+                system_prompt_filename=self.config.resolved_system_prompt_filename,
            )

        return self._prompt_manager
@@ -136,6 +139,9 @@ class CodeActAgent(Agent):
                tools.append(BrowserTool)
        if self.config.enable_jupyter:
            tools.append(IPythonTool)
+        if self.config.enable_plan_mode:
+            # In plan mode, we use the task_tracker tool for task management
+            tools.append(create_task_tracker_tool(use_short_tool_desc))
        if self.config.enable_llm_editor:
            tools.append(LLMBasedFileEditTool)
        elif self.config.enable_editor:
--- a/openhands/agenthub/codeact_agent/function_calling.py
+++ b/openhands/agenthub/codeact_agent/function_calling.py
@@ -35,11 +35,13 @@ from openhands.events.action import (
    FileReadAction,
    IPythonRunCellAction,
    MessageAction,
+    TaskTrackingAction,
 )
 from openhands.events.action.agent import CondensationRequestAction
 from openhands.events.action.mcp import MCPAction
 from openhands.events.event import FileEditSource, FileReadSource
 from openhands.events.tool import ToolCallMetadata
+from openhands.llm.tool_names import TASK_TRACKER_TOOL_NAME


 def combine_thought(action: Action, thought: str) -> Action:
@@ -220,6 +222,24 @@ def response_to_actions(
                    )
                action = BrowseInteractiveAction(browser_actions=arguments['code'])

+            # ================================================
+            # TaskTrackingAction
+            # ================================================
+            elif tool_call.function.name == TASK_TRACKER_TOOL_NAME:
+                if 'command' not in arguments:
+                    raise FunctionCallValidationError(
+                        f'Missing required argument "command" in tool call {tool_call.function.name}'
+                    )
+                if arguments['command'] == 'plan' and 'task_list' not in arguments:
+                    raise FunctionCallValidationError(
+                        f'Missing required argument "task_list" for "plan" command in tool call {tool_call.function.name}'
+                    )
+
+                action = TaskTrackingAction(
+                    command=arguments['command'],
+                    task_list=arguments.get('task_list', []),
+                )
+
            # ================================================
            # MCPAction (MCP)
            # ================================================
--- a/openhands/agenthub/codeact_agent/prompts/system_prompt_long_horizon.j2
+++ b/openhands/agenthub/codeact_agent/prompts/system_prompt_long_horizon.j2
@@ -1,39 +1,40 @@
 {% include "system_prompt.j2" %}

 <TASK_MANAGEMENT>
-* For complex, long-horizon tasks, create a TODO.md file to track progress:
-  1. Start by creating a detailed plan in TODO.md with clear steps
-  2. Check TODO.md before each new action to maintain context and track progress
-  3. Update TODO.md as you complete steps or discover new requirements
-  4. Mark completed items with ✓ or [x] to maintain a clear record of progress
-  5. For each major step, add sub-tasks as needed to break down complex work
-  6. If you discover the plan needs significant changes, propose updates and confirm with the user before proceeding and update TODO.md
-  7. IMPORTANT: Do NOT add TODO.md to git commits or version control systems
-
-* Example TODO.md format:
-```markdown
-# Task: [Brief description of the overall task]
-
-## Plan
- [ ] Step 1: [Description]
-  - [ ] Sub-task 1.1
-  - [ ] Sub-task 1.2
- [ ] Step 2: [Description]
- [x] Step 3: [Description] (Completed)
-
-## Notes
- Important discovery: [Details about something you learned]
- Potential issue: [Description of a potential problem]
-```
-
-* When working on a task:
-  - Read the README to understand how the system works
-  - Create TODO.md with every major step unchecked
-  - Add TODO.md to .gitignore if it's not already ignored
-  - Until every item in TODO.md is checked:
-    a. Pick the next unchecked item and work on it
-    b. Run appropriate tests to verify your work
-    c. If issues arise, fix them until tests pass
-    d. Once complete, check off the item in TODO.md
-    e. Proceed to the next unchecked item
+* You have access to the `task_tracker` tool to help you organize and monitor development work. Use this tool REGULARLY to maintain task visibility and provide users with clear progress updates. This tool is ESSENTIAL for systematic planning and decomposing complex development work into manageable components. Failing to use this tool for planning may result in overlooked requirements - which is unacceptable.
+* It is crucial that you update task status to "done" immediately upon completion of each work item. Do not accumulate multiple finished tasks before updating their status.
+* For complex, multi-phase development work, use `task_tracker` to establish a comprehensive plan with well-defined steps:
+  1. Begin by decomposing the overall objective into primary phases using `task_tracker`
+  2. Include detailed work items as necessary to break complex activities into actionable units
+  3. Update tasks to "in_progress" status when commencing work on them
+  4. Update tasks to "done" status immediately after completing each item
+  5. For each primary phase, incorporate additional work items as you identify new requirements
+  6. If you determine the plan requires substantial modifications, suggest revisions and obtain user confirmation before proceeding
+* Example workflow for debugging and resolution:
+  ```
+  User: "Execute the test suite and resolve any validation failures"
+  Assistant: I'm going to use the task_tracker tool to organize the following work items:
+  - Execute the test suite
+  - Resolve any validation failures
+  I'm now going to run the test suite using the terminal.
+  [After running tests and discovering 8 validation failures]
+  I found 8 validation failures that need attention. I'm going to use the task_tracker tool to add 8 specific items to the task list.
+  [Updating first task to in_progress]
+  Let me begin addressing the first validation issue...
+  [After resolving first failure]
+  The first validation issue has been resolved, let me mark that task as done and proceed to the second item...
+  ```
+* Example workflow for component development:
+  ```
+  User: "Build a dashboard component that displays analytics data with interactive charts and filtering options"
+  Assistant: I'll help you create an analytics dashboard with interactive charts and filtering. Let me first use the task_tracker tool to organize this development work.
+  Adding the following tasks to the tracker:
+  1. Analyze existing analytics data structure and requirements
+  2. Design dashboard layout and component architecture
+  3. Implement data visualization charts with interactivity
+  4. Create filtering and search functionality
+  5. Integrate components and perform testing
+  Let me start by examining the current analytics data structure to understand what we're working with...
+  [Assistant proceeds with implementation step by step, updating tasks to in_progress and done as work progresses]
+  ```
 </TASK_MANAGEMENT>
--- a/openhands/agenthub/codeact_agent/tools/task_tracker.py
+++ b/openhands/agenthub/codeact_agent/tools/task_tracker.py
@@ -0,0 +1,203 @@
+from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk
+
+from openhands.llm.tool_names import TASK_TRACKER_TOOL_NAME
+
+_DETAILED_TASK_TRACKER_DESCRIPTION = """This tool provides structured task management capabilities for development workflows.
+It enables systematic tracking of work items, progress monitoring, and efficient
+organization of complex development activities.
+
+The tool maintains visibility into project status and helps communicate
+progress effectively to users.
+
+## Application Guidelines
+
+Utilize this tool in the following situations:
+
+1. Multi-phase development work - When projects involve multiple sequential or
+   parallel activities
+2. Complex implementation tasks - Work requiring systematic planning and
+   coordination across multiple components
+3. Explicit user request for task organization - When users specifically ask
+   for structured task management
+4. Multiple concurrent requirements - When users present several work items
+   that need coordination
+5. Project initiation - Capture and organize user requirements at project start
+6. Work commencement - Update task status to in_progress before beginning
+   implementation. Maintain focus by limiting active work to one task
+7. Task completion - Update status to done and identify any additional work
+   that emerged during implementation
+
+## Situations Where Tool Usage Is Unnecessary
+
+Avoid using this tool when:
+
+1. Single atomic tasks that require no decomposition
+2. Trivial operations where tracking adds no organizational value
+3. Simple activities completable in minimal steps
+4. Pure information exchange or discussion
+
+Note: For single straightforward tasks, proceed with direct implementation
+rather than creating tracking overhead.
+
+## Usage Scenarios
+
+**Scenario A: Feature Development with Validation**
+User request: "Build a user authentication system with login/logout functionality.
+Don't forget to include input validation and error handling!"
+
+Response approach: I'll implement a user authentication system with comprehensive
+validation. Let me organize this work systematically.
+
+*Task breakdown includes:*
+1. Design authentication API endpoints and data models
+2. Implement user login/logout controllers with session management
+3. Add client-side form validation and error display
+4. Create backend input sanitization and security checks
+5. Implement comprehensive error handling for edge cases
+6. Perform integration testing and fix any issues
+
+**Scenario B: Codebase Refactoring**
+User request: "I need to update all instances of the method 'fetchData' to
+'retrieveInformation' throughout the entire project"
+
+Response approach: Let me first analyze your codebase to identify all locations
+where 'fetchData' appears.
+
+*After code analysis*
+
+I've located 12 occurrences of 'fetchData' across 6 files in your project.
+I'll create a systematic plan to update these references.
+
+*Organized task list includes specific file-by-file updates*
+
+**Scenario C: Multi-feature Development**
+User request: "Set up a blog platform with these components: admin dashboard,
+article management, comment system, and user profiles."
+
+Response approach: I'll help build your blog platform. Let me break down these
+components into manageable implementation phases.
+
+*Creates structured plan with each feature decomposed into specific development tasks*
+
+## Counter-examples - Direct Implementation Preferred
+
+**Counter-example A: Simple inquiry**
+User request: "What's the syntax for a for loop in JavaScript?"
+
+Response approach: Here's how to write a for loop in JavaScript:
+
+```javascript
+for (let i = 0; i < array.length; i++) {
+    console.log(array[i]);
+}
+```
+
+*No task tracking needed - single informational response*
+
+**Counter-example B: Minor modification**
+User request: "Please add a docstring to the processPayment function explaining
+its parameters."
+
+Response approach: I'll add a docstring to explain the processPayment function.
+
+*Direct edit without task tracking for simple modification*
+
+## Status Management and Workflow
+
+1. **Status Values**: Track work using these states:
+   - todo: Not yet initiated
+   - in_progress: Currently active (maintain single focus)
+   - done: Successfully completed
+
+2. **Workflow Practices**:
+   - Update status dynamically as work progresses
+   - Mark completion immediately upon task finish
+   - Limit active work to ONE task at any given time
+   - Complete current activities before initiating new ones
+   - Remove obsolete tasks from tracking entirely
+
+3. **Completion Criteria**:
+   - Mark tasks as done only when fully achieved
+   - Keep status as in_progress if errors, blocks, or partial completion exist
+   - Create new tasks for discovered issues or dependencies
+   - Never mark done when:
+       - Test suites are failing
+       - Implementation remains incomplete
+       - Unresolved errors persist
+       - Required resources are unavailable
+
+4. **Task Organization**:
+   - Write precise, actionable descriptions
+   - Decompose complex work into manageable units
+   - Use descriptive, clear naming conventions
+
+When uncertain, favor using this tool. Proactive task management demonstrates
+systematic approach and ensures comprehensive requirement fulfillment.
+"""
+
+_SHORT_TASK_TRACKER_DESCRIPTION = """Provides structured task management for development workflows, enabling progress
+tracking and systematic organization of complex coding activities.
+
+* Apply to multi-phase projects (3+ distinct steps) or when managing multiple user requirements
+* Update status (todo/in_progress/done) dynamically throughout work
+* Maintain single active task focus at any time
+* Mark completion immediately upon task finish
+* Decompose complex work into manageable, actionable units
+"""
+
+
+def create_task_tracker_tool(  # Build the function-calling tool definition for the task tracker.
+    use_short_description: bool = False,  # True selects the short description text; the default is the detailed one.
+) -> ChatCompletionToolParam:  # Returns a ChatCompletionToolParam whose type is 'function'.
+    description = (  # Pick which module-level description string is attached to the tool.
+        _SHORT_TASK_TRACKER_DESCRIPTION
+        if use_short_description
+        else _DETAILED_TASK_TRACKER_DESCRIPTION
+    )
+    return ChatCompletionToolParam(
+        type='function',
+        function=ChatCompletionToolParamFunctionChunk(
+            name=TASK_TRACKER_TOOL_NAME,
+            description=description,
+            parameters={  # JSON-schema-style description of the arguments the tool accepts.
+                'type': 'object',
+                'properties': {
+                    'command': {
+                        'type': 'string',
+                        'enum': ['view', 'plan'],  # Only these two commands are declared.
+                        'description': 'The command to execute. `view` shows the current task list. `plan` creates or updates the task list based on provided requirements and progress. Always `view` the current list before making changes.',
+                    },
+                    'task_list': {  # Array of task objects; its description says it is required for `plan`.
+                        'type': 'array',
+                        'description': 'The full task list. Required parameter of `plan` command.',
+                        'items': {
+                            'type': 'object',
+                            'properties': {
+                                'id': {
+                                    'type': 'string',
+                                    'description': 'Unique task identifier',
+                                },
+                                'title': {
+                                    'type': 'string',
+                                    'description': 'Brief task description',
+                                },
+                                'status': {
+                                    'type': 'string',
+                                    'description': 'Current task status',
+                                    'enum': ['todo', 'in_progress', 'done'],
+                                },
+                                'notes': {
+                                    'type': 'string',
+                                    'description': 'Optional additional context or details',
+                                },
+                            },
+                            'required': ['title', 'status', 'id'],  # `notes` is the only task field not listed as required.
+                            'additionalProperties': False,  # Task items may not carry undeclared keys.
+                        },
+                    },
+                },
+                'required': ['command'],  # Only `command` is schema-required; `task_list` is not listed here.
+                'additionalProperties': False,  # Undeclared top-level arguments are disallowed by the schema.
+            },
+        ),
+    )