fix: Disable prompt caching in default condenser (#7781)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Calvin Smith <calvin@all-hands.dev>
Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>
Author: Calvin Smith (committed by GitHub) on 2025-04-11 10:09:23 -06:00
Commit: 36e092e0ac (parent: e2bb69908a)
4 changed files with 36 additions and 7 deletions


@@ -114,8 +114,14 @@ class LLMAttentionCondenser(RollingCondenser):
     @classmethod
     def from_config(cls, config: LLMAttentionCondenserConfig) -> LLMAttentionCondenser:
+        # This condenser cannot take advantage of prompt caching. If it happens
+        # to be set, we'll pay for the cache writes but never get a chance to
+        # save on a read.
+        llm_config = config.llm_config.model_copy()
+        llm_config.caching_prompt = False
+
         return LLMAttentionCondenser(
-            llm=LLM(config=config.llm_config),
+            llm=LLM(config=llm_config),
             max_size=config.max_size,
             keep_first=config.keep_first,
         )
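
The hunk above copies the config with model_copy() rather than flipping caching_prompt on config.llm_config directly, presumably so the change stays local to the condenser's own LLM (the two condensers below get the same treatment). A minimal sketch of that distinction follows; LLMConfig here is a simplified pydantic stand-in with assumed fields and defaults, not the actual OpenHands class.

from pydantic import BaseModel


class LLMConfig(BaseModel):
    # Simplified stand-in for the real config class; fields and defaults are
    # assumptions for illustration only.
    model: str = 'gpt-4o'
    caching_prompt: bool = True


shared = LLMConfig()

# Flipping the flag on the shared object would silently disable prompt caching
# for every other consumer of the same config. Copying first keeps the change
# local to the condenser's LLM.
local = shared.model_copy()
local.caching_prompt = False

assert shared.caching_prompt is True
assert local.caching_prompt is False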


@@ -155,8 +155,14 @@ CURRENT_STATE: Last flip: Heads, Haiku count: 15/20"""
     def from_config(
         cls, config: LLMSummarizingCondenserConfig
     ) -> LLMSummarizingCondenser:
+        # This condenser cannot take advantage of prompt caching. If it happens
+        # to be set, we'll pay for the cache writes but never get a chance to
+        # save on a read.
+        llm_config = config.llm_config.model_copy()
+        llm_config.caching_prompt = False
+
         return LLMSummarizingCondenser(
-            llm=LLM(config=config.llm_config),
+            llm=LLM(config=llm_config),
             max_size=config.max_size,
             keep_first=config.keep_first,
             max_event_length=config.max_event_length,


@@ -311,8 +311,14 @@ Capture all relevant information, especially:
     def from_config(
         cls, config: StructuredSummaryCondenserConfig
     ) -> StructuredSummaryCondenser:
+        # This condenser cannot take advantage of prompt caching. If it happens
+        # to be set, we'll pay for the cache writes but never get a chance to
+        # save on a read.
+        llm_config = config.llm_config.model_copy()
+        llm_config.caching_prompt = False
+
        return StructuredSummaryCondenser(
-            llm=LLM(config=config.llm_config),
+            llm=LLM(config=llm_config),
             max_size=config.max_size,
             keep_first=config.keep_first,
             max_event_length=config.max_event_length,
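
The comment repeated in each of the three hunks gives the rationale: a condensation prompt is sent once and never shares a prefix with later requests, so any cache entry it writes is never read back. As a rough, assumed illustration of why that matters (the multipliers below are not from this repo; they approximate the write-premium/read-discount scheme Anthropic documents for prompt caching):

# Illustration only: relative per-token cost of a prompt prefix, with assumed
# multipliers (roughly: cache writes ~1.25x a normal input token, reads ~0.1x).
BASE = 1.00
CACHE_WRITE = 1.25
CACHE_READ = 0.10


def relative_cost(caching: bool, reuses: int) -> float:
    """Cost of sending the same prefix (1 + reuses) times, per prefix token."""
    if not caching:
        return BASE * (1 + reuses)
    return CACHE_WRITE + CACHE_READ * reuses


# An agent loop reuses its prefix on every step, so caching pays off quickly:
print(relative_cost(caching=True, reuses=10))   # 2.25 vs 11.0 uncached
# A condenser sends each prompt exactly once, so caching is pure overhead:
print(relative_cost(caching=True, reuses=0))    # 1.25 vs 1.0 uncached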


@@ -331,10 +331,7 @@ def test_llm_summarizing_condenser_from_config():
     config = LLMSummarizingCondenserConfig(
         max_size=50,
         keep_first=10,
-        llm_config=LLMConfig(
-            model='gpt-4o',
-            api_key='test_key',
-        ),
+        llm_config=LLMConfig(model='gpt-4o', api_key='test_key', caching_prompt=True),
     )
     condenser = Condenser.from_config(config)
@@ -344,6 +341,10 @@ def test_llm_summarizing_condenser_from_config():
     assert condenser.max_size == 50
     assert condenser.keep_first == 10
+    # Since this condenser can't take advantage of caching, we intercept the
+    # passed config and manually flip the caching prompt to False.
+    assert not condenser.llm.config.caching_prompt

 def test_llm_summarizing_condenser_invalid_config():
     """Test that LLMSummarizingCondenser raises error when keep_first > max_size."""
@@ -474,6 +475,7 @@ def test_llm_attention_condenser_from_config():
         llm_config=LLMConfig(
             model='gpt-4o',
             api_key='test_key',
+            caching_prompt=True,
         ),
     )
     condenser = Condenser.from_config(config)
@@ -484,6 +486,10 @@ def test_llm_attention_condenser_from_config():
     assert condenser.max_size == 50
     assert condenser.keep_first == 10
+    # Since this condenser can't take advantage of caching, we intercept the
+    # passed config and manually flip the caching prompt to False.
+    assert not condenser.llm.config.caching_prompt

 def test_llm_attention_condenser_invalid_config():
     """Test that LLMAttentionCondenser raises an error if the configured LLM doesn't support response schema."""
@@ -614,6 +620,7 @@ def test_structured_summary_condenser_from_config():
         llm_config=LLMConfig(
             model='gpt-4o',
             api_key='test_key',
+            caching_prompt=True,
         ),
     )
     condenser = Condenser.from_config(config)
@@ -624,6 +631,10 @@ def test_structured_summary_condenser_from_config():
     assert condenser.max_size == 50
     assert condenser.keep_first == 10
+    # Since this condenser can't take advantage of caching, we intercept the
+    # passed config and manually flip the caching prompt to False.
+    assert not condenser.llm.config.caching_prompt

 def test_structured_summary_condenser_invalid_config():
     """Test that StructuredSummaryCondenser raises error when keep_first > max_size."""