diff --git a/openhands/server/services/conversation_stats.py b/openhands/server/services/conversation_stats.py
index 89ed789f6c..ff72681db9 100644
--- a/openhands/server/services/conversation_stats.py
+++ b/openhands/server/services/conversation_stats.py
@@ -36,7 +36,30 @@ class ConversationStats:
             return
 
         with self._save_lock:
-            pickled = pickle.dumps(self.service_to_metrics)
+            # Check for duplicate service IDs between restored and service metrics
+            duplicate_services = set(self.restored_metrics.keys()) & set(
+                self.service_to_metrics.keys()
+            )
+            if duplicate_services:
+                logger.error(
+                    f'Duplicate service IDs found between restored and service metrics: {duplicate_services}. '
+                    'This should not happen as registered services should be removed from restored_metrics. '
+                    'Proceeding by preferring service_to_metrics values for duplicates.',
+                    extra={
+                        'conversation_id': self.conversation_id,
+                        'duplicate_services': list(duplicate_services),
+                    },
+                )
+
+            # Combine both restored metrics and service metrics to avoid data loss
+            # Start with restored metrics (for services not yet registered)
+            combined_metrics = self.restored_metrics.copy()
+
+            # Add service metrics (for registered services)
+            # Since we checked for duplicates above, this is safe
+            combined_metrics.update(self.service_to_metrics)
+
+            pickled = pickle.dumps(combined_metrics)
             serialized_metrics = base64.b64encode(pickled).decode('utf-8')
             self.file_store.write(self.metrics_path, serialized_metrics)
             logger.info(
diff --git a/tests/unit/test_conversation_stats.py b/tests/unit/test_conversation_stats.py
index 61de343f89..59b656684d 100644
--- a/tests/unit/test_conversation_stats.py
+++ b/tests/unit/test_conversation_stats.py
@@ -488,3 +488,125 @@ def test_save_and_restore_workflow(mock_file_store):
     assert llm.metrics.accumulated_cost == 0.05
     assert llm.metrics.accumulated_token_usage.prompt_tokens == 100
     assert llm.metrics.accumulated_token_usage.completion_tokens == 50
+
+
+def test_save_metrics_preserves_restored_metrics_fix(mock_file_store):
+    """Test that save_metrics correctly preserves restored metrics for unregistered services."""
+    conversation_id = 'test-conversation-id'
+    user_id = 'test-user-id'
+
+    # Step 1: Create initial conversation stats with multiple services
+    stats1 = ConversationStats(
+        file_store=mock_file_store, conversation_id=conversation_id, user_id=user_id
+    )
+
+    # Add metrics for multiple services
+    service_a = 'service-a'
+    service_b = 'service-b'
+    service_c = 'service-c'
+
+    metrics_a = Metrics(model_name='gpt-4')
+    metrics_a.add_cost(0.10)
+
+    metrics_b = Metrics(model_name='gpt-3.5')
+    metrics_b.add_cost(0.05)
+
+    metrics_c = Metrics(model_name='claude-3')
+    metrics_c.add_cost(0.08)
+
+    stats1.service_to_metrics[service_a] = metrics_a
+    stats1.service_to_metrics[service_b] = metrics_b
+    stats1.service_to_metrics[service_c] = metrics_c
+
+    # Save metrics (all three services should be saved)
+    stats1.save_metrics()
+
+    # Step 2: Create new conversation stats instance (simulates app restart)
+    stats2 = ConversationStats(
+        file_store=mock_file_store, conversation_id=conversation_id, user_id=user_id
+    )
+
+    # Verify all metrics were restored
+    assert service_a in stats2.restored_metrics
+    assert service_b in stats2.restored_metrics
+    assert service_c in stats2.restored_metrics
+    assert stats2.restored_metrics[service_a].accumulated_cost == 0.10
+    assert stats2.restored_metrics[service_b].accumulated_cost == 0.05
+    assert stats2.restored_metrics[service_c].accumulated_cost == 0.08
+
+    # Step 3: Register only one LLM service (simulates partial LLM activation)
+    llm_config = LLMConfig(
+        model='gpt-4o',
+        api_key='test_key',
+        num_retries=2,
+        retry_min_wait=1,
+        retry_max_wait=2,
+    )
+
+    with patch('openhands.llm.llm.litellm_completion'):
+        llm_a = LLM(service_id=service_a, config=llm_config)
+        event_a = RegistryEvent(llm=llm_a, service_id=service_a)
+        stats2.register_llm(event_a)
+
+    # Verify service_a was moved from restored_metrics to service_to_metrics
+    assert service_a in stats2.service_to_metrics
+    assert service_a not in stats2.restored_metrics
+    assert stats2.service_to_metrics[service_a].accumulated_cost == 0.10
+
+    # Verify services B and C are still in restored_metrics (not yet registered)
+    assert service_b in stats2.restored_metrics
+    assert service_c in stats2.restored_metrics
+    assert stats2.restored_metrics[service_b].accumulated_cost == 0.05
+    assert stats2.restored_metrics[service_c].accumulated_cost == 0.08
+
+    # Step 4: Save metrics again (this is where the bug occurred before the fix)
+    stats2.save_metrics()
+
+    # Step 5: Create a third conversation stats instance to verify what was saved
+    stats3 = ConversationStats(
+        file_store=mock_file_store, conversation_id=conversation_id, user_id=user_id
+    )
+
+    # FIXED: All services should be restored because save_metrics now combines both dictionaries
+    # Service A should be restored with its current metrics from service_to_metrics
+    assert service_a in stats3.restored_metrics
+    assert stats3.restored_metrics[service_a].accumulated_cost == 0.10
+
+    # Services B and C should be preserved from restored_metrics
+    assert service_b in stats3.restored_metrics  # FIXED: Now preserved
+    assert service_c in stats3.restored_metrics  # FIXED: Now preserved
+    assert stats3.restored_metrics[service_b].accumulated_cost == 0.05
+    assert stats3.restored_metrics[service_c].accumulated_cost == 0.08
+
+
+def test_save_metrics_handles_duplicate_service_ids(mock_file_store):
+    """Test that save_metrics does not raise on duplicate service IDs and prefers service_to_metrics."""
+    conversation_id = 'test-conversation-id'
+    user_id = 'test-user-id'
+
+    stats = ConversationStats(
+        file_store=mock_file_store, conversation_id=conversation_id, user_id=user_id
+    )
+
+    # Manually create a scenario with duplicate service IDs (this should never happen in normal operation)
+    service_id = 'duplicate-service'
+
+    # Add to both restored_metrics and service_to_metrics
+    restored_metrics = Metrics(model_name='gpt-4')
+    restored_metrics.add_cost(0.10)
+    stats.restored_metrics[service_id] = restored_metrics
+
+    service_metrics = Metrics(model_name='gpt-3.5')
+    service_metrics.add_cost(0.05)
+    stats.service_to_metrics[service_id] = service_metrics
+
+    # Should not raise. Should save with service_to_metrics preferred.
+    stats.save_metrics()
+
+    # Verify the saved content prefers service_to_metrics for duplicates
+    encoded = mock_file_store.read(stats.metrics_path)
+    pickled = base64.b64decode(encoded)
+    restored = pickle.loads(pickled)
+
+    assert service_id in restored
+    assert restored[service_id].accumulated_cost == 0.05  # prefers service_to_metrics