Enhance dead-loop recovery by pausing agent and reprompting (#11439)

Co-authored-by: Engel Nyst <enyst@users.noreply.github.com> Co-authored-by: openhands <openhands@all-hands.dev>
2026-03-22 13:47:19 +08:00 · 2025-10-24 19:25:14 +08:00
parent 4b303ec9b4
commit 17e32af6fe
21 changed files with 932 additions and 43 deletions
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -64,6 +64,7 @@ from openhands.events.action import (
    MessageAction,
    NullAction,
    SystemMessageAction,
+    LoopRecoveryAction,
 )
 from openhands.events.action.agent import (
    CondensationAction,
@@ -77,6 +78,7 @@ from openhands.events.observation import (
    ErrorObservation,
    NullObservation,
    Observation,
+    LoopDetectionObservation,
 )
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.metrics import Metrics
@@ -523,6 +525,8 @@ class AgentController:
        elif isinstance(action, AgentRejectAction):
            self.state.outputs = action.outputs
            await self.set_agent_state_to(AgentState.REJECTED)
+        elif isinstance(action, LoopRecoveryAction):
+            await self._handle_loop_recovery_action(action)

    async def _handle_observation(self, observation: Observation) -> None:
        """Handles observation from the event stream.
@@ -595,6 +599,25 @@ class AgentController:
            if action.wait_for_response:
                await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)

+    async def _handle_loop_recovery_action(self, action: LoopRecoveryAction) -> None:
+        """Run the recovery option (1, 2 or 3) chosen via a LoopRecoveryAction.
+
+        No-op when no loop analysis is recorded or the option is unrecognized.
+        """
+        analysis = self._stuck_detector.stuck_analysis
+        if not analysis:
+            return
+        choice = action.option
+        if choice == 1:
+            # Option 1: restart from before the loop.
+            await self._perform_loop_recovery(analysis)
+        elif choice == 2:
+            # Option 2: restart with the last user message.
+            await self._restart_with_last_user_message(analysis)
+        elif choice == 3:
+            # Option 3: stop the agent completely.
+            await self.set_agent_state_to(AgentState.STOPPED)
+
    def _reset(self) -> None:
        """Resets the agent controller."""
        # Runnable actions need an Observation
@@ -1084,6 +1107,45 @@ class AgentController:

        return self._stuck_detector.is_stuck(self.headless_mode)

+    def attempt_loop_recovery(self) -> bool:
+        """Pause a stuck agent and present the loop-recovery options.
+        Only supports CLI for now.
+
+        Returns:
+            bool: True if a loop analysis was available, the options were posted
+                  and a pause was requested; False if no loop analysis exists.
+        """
+        # Without a recorded loop analysis there is nothing to recover from.
+        if not self._stuck_detector.stuck_analysis:
+            return False
+
+        # CLI flow: announce the loop, list the recovery options, then pause the agent.
+        recovery_point = self._stuck_detector.stuck_analysis.loop_start_idx
+
+        # Present loop detection message
+        self.event_stream.add_event(
+            LoopDetectionObservation(
+                content=f"""⚠️  Agent detected in a loop!
+Loop type: {self._stuck_detector.stuck_analysis.loop_type}
+Loop detected at iteration {self.state.iteration_flag.current_value}
+\nRecovery options:
+/resume 1. Restart from before loop (preserves {recovery_point} events)
+/resume 2. Restart with last user message (reuses your most recent instruction)
+/exit. Quit directly
+\nThe agent has been paused. Type '/resume 1', '/resume 2', or '/exit' to choose an option.
+"""
+            ),
+            source=EventSource.ENVIRONMENT,
+        )
+
+        # Pause the agent using the same mechanism as Ctrl+P
+        # This ensures consistent behavior and avoids event loop conflicts
+        self.event_stream.add_event(
+            ChangeAgentStateAction(AgentState.PAUSED),
+            EventSource.ENVIRONMENT,  # Use ENVIRONMENT source to distinguish from user pause
+        )
+        return True
+
    def _prepare_metrics_for_frontend(self, action: Action) -> None:
        """Create a minimal metrics object for frontend display and log it.

@@ -1208,5 +1270,92 @@ class AgentController:
        )
        return self._cached_first_user_message

+    async def _perform_loop_recovery(
+        self, stuck_analysis: StuckDetector.StuckAnalysis
+    ) -> None:
+        """Roll history back to the start of the loop and wait for the user.
+
+        Truncates state history at ``stuck_analysis.loop_start_idx``, moves the
+        agent to AWAITING_USER_INPUT, then posts a confirmation observation.
+        """
+        await self._truncate_memory_to_point(stuck_analysis.loop_start_idx)
+
+        # Hand control back so the user can provide new instructions.
+        await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
+
+        # Confirm the reset on the event stream.
+        recovery_notice = LoopDetectionObservation(
+            content="""✅ Loop recovery completed. Agent has been reset to before the loop.
+You can now provide new instructions to continue.
+"""
+        )
+        self.event_stream.add_event(recovery_notice, source=EventSource.ENVIRONMENT)
+
+    async def _truncate_memory_to_point(self, recovery_point: int) -> None:
+        """Drop every history event at or after index ``recovery_point``.
+
+        No-op when ``recovery_point`` is not smaller than the history length.
+        Otherwise ``state.history`` becomes the kept prefix, ``state.end_id``
+        becomes the id of the last kept event (-1 if none remain) and the cached
+        first user message is cleared.
+        """
+        history = self.state.history
+        # A recovery point at or past the end leaves nothing to cut.
+        if recovery_point >= len(history):
+            return
+
+        # Keep only the events that precede the recovery point.
+        kept = history[:recovery_point]
+        self.state.history = kept
+
+        # end_id follows the last retained event; -1 when nothing is retained.
+        self.state.end_id = kept[-1].id if kept else -1
+
+        # Invalidate the cached first user message.
+        self._cached_first_user_message = None
+
+    async def _restart_with_last_user_message(
+        self, stuck_analysis: StuckDetector.StuckAnalysis
+    ) -> None:
+        """Rewind to before the loop and replay the latest user message.
+
+        Finds the newest user-sourced MessageAction in history; without one it
+        falls back to ``_perform_loop_recovery``. Otherwise it truncates history
+        at ``stuck_analysis.loop_start_idx``, sets the agent to RUNNING, posts a
+        notice and handles a new user MessageAction with the same content.
+        """
+        user_messages = (
+            evt
+            for evt in reversed(self.state.history)
+            if isinstance(evt, MessageAction) and evt.source == EventSource.USER
+        )
+        last_user_message = next(user_messages, None)
+        if not last_user_message:
+            # No user message to replay: fall back to regular recovery.
+            print('\n⚠️  No previous user message found. Using standard recovery.')
+            await self._perform_loop_recovery(stuck_analysis)
+            return
+
+        # Cut history back to just before the loop started, then resume running.
+        await self._truncate_memory_to_point(stuck_analysis.loop_start_idx)
+        await self.set_agent_state_to(AgentState.RUNNING)
+
+        # Tell the user which instruction is being replayed.
+        self.event_stream.add_event(
+            LoopDetectionObservation(
+                content=f"""\n✅ Restarting with your last instruction: {last_user_message.content}
+Agent is now continuing with the same task...
+"""
+            ),
+            source=EventSource.ENVIRONMENT,
+        )
+
+        # Re-submit the instruction as a new user-sourced message.
+        replayed = MessageAction(
+            content=last_user_message.content, wait_for_response=False
+        )
+        replayed._source = EventSource.USER  # type: ignore [attr-defined]
+        await self._handle_action(replayed)
+
    def save_state(self):
        self.state_tracker.save_state()