diff --git a/openhands/core/config/condenser_config.py b/openhands/core/config/condenser_config.py
index 609ba3725d..3e7f183128 100644
--- a/openhands/core/config/condenser_config.py
+++ b/openhands/core/config/condenser_config.py
@@ -126,6 +126,33 @@ class LLMAttentionCondenserConfig(BaseModel):
model_config = {'extra': 'forbid'}
+class StructuredSummaryCondenserConfig(BaseModel):
+    """Settings from which a StructuredSummaryCondenser is built.
+
+    Unknown keys are rejected (`extra='forbid'`).
+    """
+
+    # Discriminator value identifying this condenser kind.
+    type: Literal['structured'] = Field(default='structured')
+
+    # Required: there is no default LLM for condensing.
+    llm_config: LLMConfig = Field(
+        ..., description='Configuration for the LLM to use for condensing.'
+    )
+
+    # Defaults to keeping one event, on the assumption that the first event
+    # is most likely the user's task.
+    keep_first: int = Field(
+        ge=0,
+        default=1,
+        description='Number of initial events to always keep in history.',
+    )
+    max_size: int = Field(
+        ge=2,
+        default=100,
+        description='Maximum size of the condensed history before triggering forgetting.',
+    )
+    max_event_length: int = Field(
+        description='Maximum length of the event representations to be passed to the LLM.',
+        default=10_000,
+    )
+
+    model_config = {'extra': 'forbid'}
+
+
# Type alias for convenience
CondenserConfig = (
NoOpCondenserConfig
@@ -135,6 +162,7 @@ CondenserConfig = (
| LLMSummarizingCondenserConfig
| AmortizedForgettingCondenserConfig
| LLMAttentionCondenserConfig
+ | StructuredSummaryCondenserConfig
)
@@ -237,6 +265,7 @@ def create_condenser_config(condenser_type: str, data: dict) -> CondenserConfig:
'llm': LLMSummarizingCondenserConfig,
'amortized': AmortizedForgettingCondenserConfig,
'llm_attention': LLMAttentionCondenserConfig,
+ 'structured': StructuredSummaryCondenserConfig,
}
if condenser_type not in condenser_classes:
diff --git a/openhands/memory/condenser/impl/__init__.py b/openhands/memory/condenser/impl/__init__.py
index 0a2150cc76..3fd8b3234c 100644
--- a/openhands/memory/condenser/impl/__init__.py
+++ b/openhands/memory/condenser/impl/__init__.py
@@ -18,6 +18,9 @@ from openhands.memory.condenser.impl.observation_masking_condenser import (
from openhands.memory.condenser.impl.recent_events_condenser import (
RecentEventsCondenser,
)
+from openhands.memory.condenser.impl.structured_summary_condenser import (
+ StructuredSummaryCondenser,
+)
__all__ = [
'AmortizedForgettingCondenser',
@@ -28,4 +31,5 @@ __all__ = [
'ObservationMaskingCondenser',
'BrowserOutputCondenser',
'RecentEventsCondenser',
+ 'StructuredSummaryCondenser',
]
diff --git a/openhands/memory/condenser/impl/structured_summary_condenser.py b/openhands/memory/condenser/impl/structured_summary_condenser.py
new file mode 100644
index 0000000000..fa9fb7f630
--- /dev/null
+++ b/openhands/memory/condenser/impl/structured_summary_condenser.py
@@ -0,0 +1,322 @@
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from openhands.core.config.condenser_config import (
+ StructuredSummaryCondenserConfig,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.message import Message, TextContent
+from openhands.events.action.agent import CondensationAction
+from openhands.events.observation.agent import AgentCondensationObservation
+from openhands.events.serialization.event import truncate_content
+from openhands.llm import LLM
+from openhands.memory.condenser.condenser import (
+ Condensation,
+ RollingCondenser,
+ View,
+)
+
+
+class StateSummary(BaseModel):
+    """Structured snapshot of the agent's progress on the current task.
+
+    Every field is a free-form string that defaults to empty. The field
+    descriptions double as the parameter descriptions of the
+    `create_state_summary` tool (see `tool_description`), so their order and
+    wording are part of what the LLM sees.
+    """
+
+    # --- Core fields (the tool schema marks the first three as required) ---
+    user_context: str = Field(
+        description='Essential user requirements, goals, and clarifications in concise form.',
+        default='',
+    )
+    completed_tasks: str = Field(
+        description='List of tasks completed so far with brief results.', default=''
+    )
+    pending_tasks: str = Field(
+        description='List of tasks that still need to be done.', default=''
+    )
+    current_state: str = Field(
+        description='Current variables, data structures, or other relevant state information.',
+        default='',
+    )
+
+    # --- Code state ---
+    files_modified: str = Field(
+        description='List of files that have been created or modified.', default=''
+    )
+    function_changes: str = Field(
+        description='List of functions that have been created or modified.', default=''
+    )
+    data_structures: str = Field(
+        description='List of key data structures in use or modified.', default=''
+    )
+
+    # --- Test status ---
+    tests_written: str = Field(
+        description='Whether tests have been written for the changes. True, false, or unknown.',
+        default='',
+    )
+    tests_passing: str = Field(
+        description='Whether all tests are currently passing. True, false, or unknown.',
+        default='',
+    )
+    failing_tests: str = Field(
+        description='List of names or descriptions of any failing tests.', default=''
+    )
+    error_messages: str = Field(
+        description='List of key error messages encountered.', default=''
+    )
+
+    # --- Version control ---
+    branch_created: str = Field(
+        description='Whether a branch has been created for this work. True, false, or unknown.',
+        default='',
+    )
+    branch_name: str = Field(
+        description='Name of the current working branch if known.', default=''
+    )
+    commits_made: str = Field(
+        description='Whether any commits have been made. True, false, or unknown.',
+        default='',
+    )
+    pr_created: str = Field(
+        description='Whether a pull request has been created. True, false, or unknown.',
+        default='',
+    )
+    pr_status: str = Field(
+        description="Status of any pull request: 'draft', 'open', 'merged', 'closed', or 'unknown'.",
+        default='',
+    )
+
+    # --- Everything else ---
+    dependencies: str = Field(
+        description='List of dependencies or imports that have been added or modified.',
+        default='',
+    )
+    other_relevant_context: str = Field(
+        description="Any other important information that doesn't fit into the categories above.",
+        default='',
+    )
+
+    @classmethod
+    def tool_description(cls) -> dict[str, Any]:
+        """Return a function-calling tool spec whose arguments mirror this model.
+
+        Each model field becomes a string parameter described by the field's
+        own description (empty when the field has none). Passing the result as
+        a tool to an LLM forces it to answer in this structure.
+        """
+        parameter_specs = {
+            name: {'type': 'string', 'description': info.description or ''}
+            for name, info in cls.model_fields.items()
+        }
+        function_spec = {
+            'name': 'create_state_summary',
+            'description': 'Creates a comprehensive summary of the current state of the interaction to preserve context when history grows too large. You must include non-empty values for user_context, completed_tasks, and pending_tasks.',
+            'parameters': {
+                'type': 'object',
+                'properties': parameter_specs,
+                'required': ['user_context', 'completed_tasks', 'pending_tasks'],
+            },
+        }
+        return {'type': 'function', 'function': function_spec}
+
+    def __str__(self) -> str:
+        """Render the summary as markdown: a title, then headed groups of `**Label**: value` lines."""
+        layout = (
+            (
+                '## Core Information',
+                (
+                    ('User Context', self.user_context),
+                    ('Completed Tasks', self.completed_tasks),
+                    ('Pending Tasks', self.pending_tasks),
+                    ('Current State', self.current_state),
+                ),
+            ),
+            (
+                '## Code Changes',
+                (
+                    ('Files Modified', self.files_modified),
+                    ('Function Changes', self.function_changes),
+                    ('Data Structures', self.data_structures),
+                    ('Dependencies', self.dependencies),
+                ),
+            ),
+            (
+                '## Testing Status',
+                (
+                    ('Tests Written', self.tests_written),
+                    ('Tests Passing', self.tests_passing),
+                    ('Failing Tests', self.failing_tests),
+                    ('Error Messages', self.error_messages),
+                ),
+            ),
+            (
+                '## Version Control',
+                (
+                    ('Branch Created', self.branch_created),
+                    ('Branch Name', self.branch_name),
+                    ('Commits Made', self.commits_made),
+                    ('PR Created', self.pr_created),
+                    ('PR Status', self.pr_status),
+                ),
+            ),
+            (
+                '## Additional Context',
+                (('Other Relevant Context', self.other_relevant_context),),
+            ),
+        )
+
+        blocks = ['# State Summary']
+        for heading, entries in layout:
+            blocks.append(heading)
+            blocks.extend(f'**{label}**: {value}' for label, value in entries)
+
+        # Every heading and entry is separated by a blank line.
+        return '\n\n'.join(blocks)
+
+
+class StructuredSummaryCondenser(RollingCondenser):
+    """A condenser that replaces forgotten events with a structured summary.
+
+    Maintains a condensed history and forgets old events when it grows too
+    large. Uses structured generation via function-calling (the
+    `create_state_summary` tool described by `StateSummary`) to produce the
+    summary that replaces the forgotten events.
+    """
+
+    def __init__(
+        self,
+        llm: LLM,
+        max_size: int = 100,
+        keep_first: int = 1,
+        max_event_length: int = 10_000,
+    ):
+        """Validate and store the condenser settings.
+
+        Args:
+            llm: LLM used to generate summaries; function calling must be active.
+            max_size: View length above which `should_condense` returns True.
+            keep_first: Number of leading events that are never forgotten.
+            max_event_length: Character budget handed to `truncate_content` for
+                each piece of text placed in the summarization prompt.
+
+        Raises:
+            ValueError: If `keep_first` is negative, `max_size` is not
+                positive, `keep_first` is not less than half of `max_size`, or
+                the LLM does not have function calling active.
+        """
+        # Range checks run first so that an out-of-range argument is reported
+        # as such, instead of tripping the keep_first/max_size ratio check
+        # with a misleading message (e.g. max_size=0).
+        if keep_first < 0:
+            raise ValueError(f'keep_first ({keep_first}) cannot be negative')
+        if max_size < 1:
+            raise ValueError(f'max_size ({max_size}) cannot be non-positive')
+        if keep_first >= max_size // 2:
+            raise ValueError(
+                f'keep_first ({keep_first}) must be less than half of max_size ({max_size})'
+            )
+
+        if not llm.is_function_calling_active():
+            raise ValueError(
+                'LLM must support function calling to use StructuredSummaryCondenser'
+            )
+
+        self.max_size = max_size
+        self.keep_first = keep_first
+        self.max_event_length = max_event_length
+        self.llm = llm
+
+        super().__init__()
+
+    def _truncate(self, content: str) -> str:
+        """Truncate the content to fit within the specified maximum event length."""
+        return truncate_content(content, max_chars=self.max_event_length)
+
+    def _build_summary_prompt(
+        self, summary_event: Any, forgotten_events: list[Any]
+    ) -> str:
+        """Assemble the summarization prompt from the previous summary and the forgotten events."""
+        instructions = """You are maintaining a context-aware state summary for an interactive software agent. This summary is critical because it:
+1. Preserves essential context when conversation history grows too large
+2. Prevents lost work when the session length exceeds token limits
+3. Helps maintain continuity across multiple interactions
+
+You will be given:
+- A list of events (actions taken by the agent)
+- The most recent previous summary (if one exists)
+
+Capture all relevant information, especially:
+- User requirements that were explicitly stated
+- Work that has been completed
+- Tasks that remain pending
+- Current state of code, variables, and data structures
+- The status of any version control operations"""
+
+        # We always have a summary event, but the types aren't precise enough
+        # to guarantee that its message is set, hence the fallback to ''.
+        previous_summary = self._truncate(
+            summary_event.message if summary_event.message else ''
+        )
+
+        # The instructions promise two distinct inputs, so each one is wrapped
+        # in an explicit delimiter; otherwise the model cannot tell where the
+        # previous summary ends and the events begin.
+        parts = [
+            instructions,
+            '\n\n',
+            f'<PREVIOUS SUMMARY>\n{previous_summary}\n</PREVIOUS SUMMARY>\n',
+            '\n\n',
+        ]
+
+        # Each forgotten event contributes its own string representation,
+        # truncated if necessary.
+        for forgotten_event in forgotten_events:
+            event_content = self._truncate(str(forgotten_event))
+            parts.append(
+                f'<EVENT id={forgotten_event.id}>\n{event_content}\n</EVENT>\n'
+            )
+
+        return ''.join(parts)
+
+    def _parse_state_summary(self, response: Any) -> StateSummary:
+        """Extract the StateSummary carried by the `create_state_summary` tool call.
+
+        Any malformed response (no choices, no tool calls, wrong tool, missing
+        or invalid JSON arguments, arguments that fail validation) is logged
+        and replaced by an empty summary rather than aborting the condensation.
+        """
+        try:
+            message = response.choices[0].message
+
+            tool_calls = getattr(message, 'tool_calls', None)
+            if not tool_calls:
+                raise ValueError('No tool calls found in response')
+
+            summary_tool_call = next(
+                (
+                    tool_call
+                    for tool_call in tool_calls
+                    if tool_call.function.name == 'create_state_summary'
+                ),
+                None,
+            )
+            if not summary_tool_call:
+                raise ValueError('create_state_summary tool call not found')
+
+            args_dict = json.loads(summary_tool_call.function.arguments)
+            return StateSummary.model_validate(args_dict)
+
+        # IndexError covers an empty `choices` list; TypeError covers
+        # `arguments` being None (json.loads rejects non-string input).
+        except (
+            ValueError,
+            AttributeError,
+            KeyError,
+            IndexError,
+            TypeError,
+            json.JSONDecodeError,
+        ) as e:
+            logger.warning(
+                f'Failed to parse summary tool call: {e}. Using empty summary.'
+            )
+            return StateSummary()
+
+    def get_condensation(self, view: View) -> Condensation:
+        """Summarize the middle of the view and describe which events to forget.
+
+        The first `keep_first` events and a tail of recent events are kept;
+        everything in between (except an existing summary event) is sent to
+        the LLM together with the previous summary, and the resulting
+        structured summary is returned in a `CondensationAction`.
+
+        Raises:
+            ValueError: If there are no events to forget.
+        """
+        head = view[: self.keep_first]
+        target_size = self.max_size // 2
+        # Number of events to keep from the tail -- target size, minus however many
+        # prefix events from the head, minus one for the summarization event
+        events_from_tail = target_size - len(head) - 1
+
+        # Use an explicit end index: with `events_from_tail == 0` the slice
+        # `view[start:-0]` is `view[start:0]`, i.e. empty, which would leave
+        # nothing to forget. Clamp at 0 so short views behave as before.
+        forget_end = max(len(view) - events_from_tail, 0)
+
+        summary_event = (
+            view[self.keep_first]
+            if isinstance(view[self.keep_first], AgentCondensationObservation)
+            else AgentCondensationObservation('No events summarized')
+        )
+
+        # Identify events to be forgotten (those not in head or tail). An
+        # existing summary event is folded into the prompt separately.
+        forgotten_events = [
+            event
+            for event in view[self.keep_first : forget_end]
+            if not isinstance(event, AgentCondensationObservation)
+        ]
+        if not forgotten_events:
+            # Fail before spending an LLM call; min()/max() below would raise
+            # ValueError on an empty sequence anyway.
+            raise ValueError('No events to forget; cannot build a condensation')
+
+        prompt = self._build_summary_prompt(summary_event, forgotten_events)
+        messages = [Message(role='user', content=[TextContent(text=prompt)])]
+
+        # Force the model to answer through the create_state_summary tool.
+        response = self.llm.completion(
+            messages=self.llm.format_messages_for_llm(messages),
+            tools=[StateSummary.tool_description()],
+            tool_choice={
+                'type': 'function',
+                'function': {'name': 'create_state_summary'},
+            },
+        )
+
+        summary = self._parse_state_summary(response)
+
+        self.add_metadata('response', response.model_dump())
+        self.add_metadata('metrics', self.llm.metrics.get())
+
+        return Condensation(
+            action=CondensationAction(
+                forgotten_events_start_id=min(event.id for event in forgotten_events),
+                forgotten_events_end_id=max(event.id for event in forgotten_events),
+                summary=str(summary),
+                summary_offset=self.keep_first,
+            )
+        )
+
+    def should_condense(self, view: View) -> bool:
+        """Return True once the view holds more than `max_size` events."""
+        return len(view) > self.max_size
+
+    @classmethod
+    def from_config(
+        cls, config: StructuredSummaryCondenserConfig
+    ) -> StructuredSummaryCondenser:
+        """Build a condenser (and its LLM) from a StructuredSummaryCondenserConfig."""
+        # Instantiate via `cls` so subclasses get an instance of themselves.
+        return cls(
+            llm=LLM(config=config.llm_config),
+            max_size=config.max_size,
+            keep_first=config.keep_first,
+            max_event_length=config.max_event_length,
+        )
+
+
+# Associate the config class with this condenser at import time.
+# NOTE(review): presumably this is what lets `Condenser.from_config` resolve a
+# StructuredSummaryCondenserConfig to this class -- confirm against
+# `register_config` in the condenser base module.
+StructuredSummaryCondenser.register_config(StructuredSummaryCondenserConfig)
diff --git a/openhands/server/session/session.py b/openhands/server/session/session.py
index 0ddfd44775..fbc4f9a5c9 100644
--- a/openhands/server/session/session.py
+++ b/openhands/server/session/session.py
@@ -9,6 +9,7 @@ from openhands.controller.agent import Agent
from openhands.core.config import AppConfig
from openhands.core.config.condenser_config import (
LLMSummarizingCondenserConfig,
+ StructuredSummaryCondenserConfig,
)
from openhands.core.logger import OpenHandsLoggerAdapter
from openhands.core.schema import AgentState
@@ -19,7 +20,6 @@ from openhands.events.observation import (
CmdOutputObservation,
NullObservation,
)
-from openhands.events.observation.agent import RecallObservation
from openhands.events.observation.error import ErrorObservation
from openhands.events.serialization import event_from_dict, event_to_dict
from openhands.events.stream import EventStreamSubscriber
@@ -128,9 +128,21 @@ class Session:
agent_config = self.config.get_agent_config(agent_cls)
if settings.enable_default_condenser:
- default_condenser_config = LLMSummarizingCondenserConfig(
- llm_config=llm.config, keep_first=3, max_size=40
- )
+ # If function-calling is active we can use the structured summary
+ # condenser for more reliable summaries.
+ if llm.is_function_calling_active():
+ default_condenser_config = StructuredSummaryCondenserConfig(
+ llm_config=llm.config, keep_first=3, max_size=80
+ )
+
+ # Otherwise, we'll fall back to the unstructured summary condenser.
+ # This is a good default but struggles more than the structured
+ # summary condenser with long messages.
+ else:
+ default_condenser_config = LLMSummarizingCondenserConfig(
+ llm_config=llm.config, keep_first=3, max_size=80
+ )
+
self.logger.info(f'Enabling default condenser: {default_condenser_config}')
agent_config.condenser = default_condenser_config
@@ -200,7 +212,7 @@ class Session:
await self.send(event_to_dict(event))
# NOTE: ipython observations are not sent here currently
elif event.source == EventSource.ENVIRONMENT and isinstance(
- event, (CmdOutputObservation, AgentStateChangedObservation, RecallObservation)
+ event, (CmdOutputObservation, AgentStateChangedObservation)
):
# feedback from the environment to agent actions is understood as agent events by the UI
event_dict = event_to_dict(event)
diff --git a/tests/unit/test_condenser.py b/tests/unit/test_condenser.py
index f051ff41aa..c0b82fd53b 100644
--- a/tests/unit/test_condenser.py
+++ b/tests/unit/test_condenser.py
@@ -13,6 +13,7 @@ from openhands.core.config.condenser_config import (
NoOpCondenserConfig,
ObservationMaskingCondenserConfig,
RecentEventsCondenserConfig,
+ StructuredSummaryCondenserConfig,
)
from openhands.core.config.llm_config import LLMConfig
from openhands.core.message import Message, TextContent
@@ -32,6 +33,7 @@ from openhands.memory.condenser.impl import (
NoOpCondenser,
ObservationMaskingCondenser,
RecentEventsCondenser,
+ StructuredSummaryCondenser,
)
@@ -85,6 +87,8 @@ def mock_llm() -> LLM:
Message(role='user', content=[TextContent(text=str(event))]) for event in events
]
+ mock_llm.is_function_calling_active.return_value = True
+
return mock_llm
@@ -600,3 +604,93 @@ def test_llm_attention_condenser_handles_keep_first_events(mock_llm):
for i, view in enumerate(harness.views(events)):
assert len(view) == harness.expected_size(i, max_size)
assert view[:keep_first] == events[: min(keep_first, i + 1)]
+
+
+def test_structured_summary_condenser_from_config():
+    """A StructuredSummaryCondenserConfig yields a matching StructuredSummaryCondenser."""
+    llm_config = LLMConfig(
+        model='gpt-4o',
+        api_key='test_key',
+    )
+    config = StructuredSummaryCondenserConfig(
+        max_size=50,
+        keep_first=10,
+        llm_config=llm_config,
+    )
+
+    condenser = Condenser.from_config(config)
+
+    # The right implementation is selected...
+    assert isinstance(condenser, StructuredSummaryCondenser)
+
+    # ...and both the LLM settings and the size settings are carried over.
+    built_llm_config = condenser.llm.config
+    assert built_llm_config.model == 'gpt-4o'
+    assert built_llm_config.api_key.get_secret_value() == 'test_key'
+    assert condenser.max_size == 50
+    assert condenser.keep_first == 10
+
+
+def test_structured_summary_condenser_invalid_config():
+    """StructuredSummaryCondenser rejects invalid sizes and LLMs without function calling."""
+    # The condenser only works when function calling is on, so the size
+    # checks are exercised with a mock that reports it as active.
+    llm = MagicMock()
+    llm.is_function_calling_active.return_value = True
+
+    # keep_first must be less than half of max_size.
+    with pytest.raises(ValueError):
+        StructuredSummaryCondenser(llm=llm, max_size=4, keep_first=2)
+
+    # max_size must be positive.
+    with pytest.raises(ValueError):
+        StructuredSummaryCondenser(llm=llm, max_size=0)
+
+    # keep_first must not be negative.
+    with pytest.raises(ValueError):
+        StructuredSummaryCondenser(llm=llm, keep_first=-1)
+
+    # Otherwise-valid parameters are still rejected when the LLM has no
+    # function calling.
+    llm.is_function_calling_active.return_value = False
+    with pytest.raises(ValueError):
+        StructuredSummaryCondenser(llm=llm, max_size=40, keep_first=2)
+
+
+def test_structured_summary_condenser_gives_expected_view_size(mock_llm):
+    """The view produced after each event has the size the harness predicts."""
+    max_size = 10
+    condenser = StructuredSummaryCondenser(max_size=max_size, llm=mock_llm)
+
+    # Ten times more events than fit, so several condensations take place.
+    event_count = max_size * 10
+    events = [create_test_event(f'Event {i}', id=i) for i in range(event_count)]
+
+    # Give the mocked LLM something to answer with.
+    mock_llm.set_mock_response_content('Summary of forgotten events')
+
+    harness = RollingCondenserTestHarness(condenser)
+
+    for step, view in enumerate(harness.views(events)):
+        assert len(view) == harness.expected_size(step, max_size)
+
+
+def test_structured_summary_condenser_keeps_first_and_summary_events(mock_llm):
+    """The event prefix is preserved and a summary event follows it once condensed."""
+    max_size = 10
+    keep_first = 3
+    condenser = StructuredSummaryCondenser(
+        max_size=max_size, keep_first=keep_first, llm=mock_llm
+    )
+
+    mock_llm.set_mock_response_content('Summary of forgotten events')
+
+    events = [create_test_event(f'Event {i}', id=i) for i in range(max_size * 10)]
+    harness = RollingCondenserTestHarness(condenser)
+
+    for step, view in enumerate(harness.views(events)):
+        assert len(view) == harness.expected_size(step, max_size)
+
+        # The summarizing LLM is called exactly once per condensation.
+        expected_calls = harness.expected_condensations(step, max_size)
+        assert mock_llm.completion.call_count == expected_calls
+
+        # The prefix is appropriately maintained.
+        expected_prefix = events[: min(keep_first, step + 1)]
+        assert view[:keep_first] == expected_prefix
+
+        # Once a condensation has happened, the summary event sits right
+        # after the prefix.
+        if step > max_size:
+            assert isinstance(view[keep_first], AgentCondensationObservation)