import base64
import pickle
from unittest.mock import patch

import pytest

from openhands.core.config import LLMConfig, OpenHandsConfig
from openhands.llm.llm import LLM
from openhands.llm.llm_registry import LLMRegistry, RegistryEvent
from openhands.llm.metrics import Metrics
from openhands.server.services.conversation_stats import ConversationStats
from openhands.storage.memory import InMemoryFileStore


@pytest.fixture
def mock_file_store():
    """Create a mock file store for testing."""
    return InMemoryFileStore({})


@pytest.fixture
def conversation_stats(mock_file_store):
    """Create a ConversationStats instance for testing."""
    return ConversationStats(
        file_store=mock_file_store,
        conversation_id='test-conversation-id',
        user_id='test-user-id',
    )


@pytest.fixture
def mock_llm_registry():
    """Create a mock LLM registry that properly simulates LLM registration."""
    config = OpenHandsConfig()
    registry = LLMRegistry(config=config, agent_cls=None, retry_listener=None)
    return registry


@pytest.fixture
def connected_registry_and_stats(mock_llm_registry, conversation_stats):
    """Connect the LLMRegistry and ConversationStats properly."""
    # Subscribe to LLM registry events to track metrics
    mock_llm_registry.subscribe(conversation_stats.register_llm)
    return mock_llm_registry, conversation_stats


def test_conversation_stats_initialization(conversation_stats):
    """Test that ConversationStats initializes correctly."""
    assert conversation_stats.conversation_id == 'test-conversation-id'
    assert conversation_stats.user_id == 'test-user-id'
    assert conversation_stats.service_to_metrics == {}
    assert isinstance(conversation_stats.restored_metrics, dict)


def test_save_metrics(conversation_stats, mock_file_store):
    """Test that metrics are saved correctly."""
    # Add a service with metrics
    service_id = 'test-service'
    metrics = Metrics(model_name='gpt-4')
    metrics.add_cost(0.05)
    conversation_stats.service_to_metrics[service_id] = metrics

    # Save metrics
    conversation_stats.save_metrics()

    # Verify that metrics were saved to the file store
    try:
        # Verify the saved content can be decoded and unpickled
        encoded = mock_file_store.read(conversation_stats.metrics_path)
        pickled = base64.b64decode(encoded)
        restored = pickle.loads(pickled)
        assert service_id in restored
        assert restored[service_id].accumulated_cost == 0.05
    except FileNotFoundError:
        pytest.fail(f'File not found: {conversation_stats.metrics_path}')


def test_maybe_restore_metrics(mock_file_store):
    """Test that metrics are restored correctly."""
    # Create metrics to save
    service_id = 'test-service'
    metrics = Metrics(model_name='gpt-4')
    metrics.add_cost(0.1)
    service_to_metrics = {service_id: metrics}

    # Serialize and save metrics
    pickled = pickle.dumps(service_to_metrics)
    serialized_metrics = base64.b64encode(pickled).decode('utf-8')

    # Create a new ConversationStats with pre-populated file store
    conversation_id = 'test-conversation-id'
    user_id = 'test-user-id'

    # Get the correct path using the same function as ConversationStats
    from openhands.storage.locations import get_conversation_stats_filename

    metrics_path = get_conversation_stats_filename(conversation_id, user_id)

    # Write to the correct path
    mock_file_store.write(metrics_path, serialized_metrics)

    # Create ConversationStats which should restore metrics
    stats = ConversationStats(
        file_store=mock_file_store, conversation_id=conversation_id, user_id=user_id
    )

    # Verify metrics were restored
    assert service_id in stats.restored_metrics
    assert stats.restored_metrics[service_id].accumulated_cost == 0.1


def test_get_combined_metrics(conversation_stats):
    """Test that combined metrics are calculated correctly."""
    # Add multiple services with metrics
    service1 = 'service1'
    metrics1 = Metrics(model_name='gpt-4')
    metrics1.add_cost(0.05)
    metrics1.add_token_usage(
        prompt_tokens=100,
        completion_tokens=50,
        cache_read_tokens=0,
        cache_write_tokens=0,
        context_window=8000,
        response_id='resp1',
    )

    service2 = 'service2'
    metrics2 = Metrics(model_name='gpt-3.5')
    metrics2.add_cost(0.02)
    metrics2.add_token_usage(
        prompt_tokens=200,
        completion_tokens=100,
        cache_read_tokens=0,
        cache_write_tokens=0,
        context_window=4000,
        response_id='resp2',
    )

    conversation_stats.service_to_metrics[service1] = metrics1
    conversation_stats.service_to_metrics[service2] = metrics2

    # Get combined metrics
    combined = conversation_stats.get_combined_metrics()

    # Verify combined metrics
    assert combined.accumulated_cost == 0.07  # 0.05 + 0.02
    assert combined.accumulated_token_usage.prompt_tokens == 300  # 100 + 200
    assert combined.accumulated_token_usage.completion_tokens == 150  # 50 + 100
    assert (
        combined.accumulated_token_usage.context_window == 8000
    )  # max of 8000 and 4000


def test_get_metrics_for_service(conversation_stats):
    """Test that metrics for a specific service are retrieved correctly."""
    # Add a service with metrics
    service_id = 'test-service'
    metrics = Metrics(model_name='gpt-4')
    metrics.add_cost(0.05)
    conversation_stats.service_to_metrics[service_id] = metrics

    # Get metrics for the service
    retrieved_metrics = conversation_stats.get_metrics_for_service(service_id)

    # Verify metrics
    assert retrieved_metrics.accumulated_cost == 0.05
    assert retrieved_metrics is metrics  # Should be the same object

    # Test getting metrics for non-existent service
    # Use a specific exception message pattern instead of a blind Exception
    with pytest.raises(Exception, match='LLM service does not exist'):
        conversation_stats.get_metrics_for_service('non-existent-service')


def test_register_llm_with_new_service(conversation_stats):
    """Test registering a new LLM service."""
    # Create a real LLM instance with a mock config
    llm_config = LLMConfig(
        model='gpt-4o',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    # Patch the LLM class to avoid actual API calls
    with patch('openhands.llm.llm.litellm_completion'):
        llm = LLM(service_id='new-service', config=llm_config)

        # Create a registry event
        service_id = 'new-service'
        event = RegistryEvent(llm=llm, service_id=service_id)

        # Register the LLM
        conversation_stats.register_llm(event)

        # Verify the service was registered
        assert service_id in conversation_stats.service_to_metrics
        assert conversation_stats.service_to_metrics[service_id] is llm.metrics


def test_register_llm_with_restored_metrics(conversation_stats):
    """Test registering an LLM service with restored metrics."""
    # Create restored metrics
    service_id = 'restored-service'
    restored_metrics = Metrics(model_name='gpt-4')
    restored_metrics.add_cost(0.1)
    conversation_stats.restored_metrics = {service_id: restored_metrics}

    # Create a real LLM instance with a mock config
    llm_config = LLMConfig(
        model='gpt-4o',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    # Patch the LLM class to avoid actual API calls
    with patch('openhands.llm.llm.litellm_completion'):
        llm = LLM(service_id=service_id, config=llm_config)

        # Create a registry event
        event = RegistryEvent(llm=llm, service_id=service_id)

        # Register the LLM
        conversation_stats.register_llm(event)

        # Verify the service was registered with restored metrics
        assert service_id in conversation_stats.service_to_metrics
        assert conversation_stats.service_to_metrics[service_id] is llm.metrics
        assert llm.metrics.accumulated_cost == 0.1  # Restored cost

        # Verify the specific service was removed from restored_metrics
        assert service_id not in conversation_stats.restored_metrics
        assert hasattr(
            conversation_stats, 'restored_metrics'
        )  # The dict should still exist


def test_llm_registry_notifications(connected_registry_and_stats):
    """Test that LLM registry notifications update conversation stats."""
    mock_llm_registry, conversation_stats = connected_registry_and_stats

    # Create a new LLM through the registry
    service_id = 'test-service'
    llm_config = LLMConfig(
        model='gpt-4o',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    # Get LLM from registry (this should trigger the notification)
    llm = mock_llm_registry.get_llm(service_id, llm_config)

    # Verify the service was registered in conversation stats
    assert service_id in conversation_stats.service_to_metrics
    assert conversation_stats.service_to_metrics[service_id] is llm.metrics

    # Add some metrics to the LLM
    llm.metrics.add_cost(0.05)
    llm.metrics.add_token_usage(
        prompt_tokens=100,
        completion_tokens=50,
        cache_read_tokens=0,
        cache_write_tokens=0,
        context_window=8000,
        response_id='resp1',
    )

    # Verify the metrics are reflected in conversation stats
    assert conversation_stats.service_to_metrics[service_id].accumulated_cost == 0.05
    assert (
        conversation_stats.service_to_metrics[
            service_id
        ].accumulated_token_usage.prompt_tokens
        == 100
    )
    assert (
        conversation_stats.service_to_metrics[
            service_id
        ].accumulated_token_usage.completion_tokens
        == 50
    )

    # Get combined metrics and verify
    combined = conversation_stats.get_combined_metrics()
    assert combined.accumulated_cost == 0.05
    assert combined.accumulated_token_usage.prompt_tokens == 100
    assert combined.accumulated_token_usage.completion_tokens == 50


def test_multiple_llm_services(connected_registry_and_stats):
    """Test tracking metrics for multiple LLM services."""
    mock_llm_registry, conversation_stats = connected_registry_and_stats

    # Create multiple LLMs through the registry
    service1 = 'service1'
    service2 = 'service2'

    llm_config1 = LLMConfig(
        model='gpt-4o',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    llm_config2 = LLMConfig(
        model='gpt-3.5-turbo',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    # Get LLMs from registry (this should trigger notifications)
    llm1 = mock_llm_registry.get_llm(service1, llm_config1)
    llm2 = mock_llm_registry.get_llm(service2, llm_config2)

    # Add different metrics to each LLM
    llm1.metrics.add_cost(0.05)
    llm1.metrics.add_token_usage(
        prompt_tokens=100,
        completion_tokens=50,
        cache_read_tokens=0,
        cache_write_tokens=0,
        context_window=8000,
        response_id='resp1',
    )

    llm2.metrics.add_cost(0.02)
    llm2.metrics.add_token_usage(
        prompt_tokens=200,
        completion_tokens=100,
        cache_read_tokens=0,
        cache_write_tokens=0,
        context_window=4000,
        response_id='resp2',
    )

    # Verify services were registered in conversation stats
    assert service1 in conversation_stats.service_to_metrics
    assert service2 in conversation_stats.service_to_metrics

    # Verify individual metrics
    assert conversation_stats.service_to_metrics[service1].accumulated_cost == 0.05
    assert conversation_stats.service_to_metrics[service2].accumulated_cost == 0.02

    # Get combined metrics and verify
    combined = conversation_stats.get_combined_metrics()
    assert combined.accumulated_cost == 0.07  # 0.05 + 0.02
    assert combined.accumulated_token_usage.prompt_tokens == 300  # 100 + 200
    assert combined.accumulated_token_usage.completion_tokens == 150  # 50 + 100
    assert (
        combined.accumulated_token_usage.context_window == 8000
    )  # max of 8000 and 4000


def test_register_llm_with_multiple_restored_services_bug(conversation_stats):
    """Test that reproduces the bug where del self.restored_metrics deletes entire dict instead of specific service."""
    # Create restored metrics for multiple services
    service_id_1 = 'service-1'
    service_id_2 = 'service-2'

    restored_metrics_1 = Metrics(model_name='gpt-4')
    restored_metrics_1.add_cost(0.1)

    restored_metrics_2 = Metrics(model_name='gpt-3.5')
    restored_metrics_2.add_cost(0.05)

    # Set up restored metrics for both services
    conversation_stats.restored_metrics = {
        service_id_1: restored_metrics_1,
        service_id_2: restored_metrics_2,
    }

    # Create LLM configs
    llm_config_1 = LLMConfig(
        model='gpt-4o',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    llm_config_2 = LLMConfig(
        model='gpt-3.5-turbo',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    # Patch the LLM class to avoid actual API calls
    with patch('openhands.llm.llm.litellm_completion'):
        # Register first LLM
        llm_1 = LLM(service_id=service_id_1, config=llm_config_1)
        event_1 = RegistryEvent(llm=llm_1, service_id=service_id_1)
        conversation_stats.register_llm(event_1)

        # Verify first service was registered with restored metrics
        assert service_id_1 in conversation_stats.service_to_metrics
        assert llm_1.metrics.accumulated_cost == 0.1

        # After registering first service, restored_metrics should still contain service_id_2
        assert service_id_2 in conversation_stats.restored_metrics

        # Register second LLM - this should also work with restored metrics
        llm_2 = LLM(service_id=service_id_2, config=llm_config_2)
        event_2 = RegistryEvent(llm=llm_2, service_id=service_id_2)
        conversation_stats.register_llm(event_2)

        # Verify second service was registered with restored metrics
        assert service_id_2 in conversation_stats.service_to_metrics
        assert llm_2.metrics.accumulated_cost == 0.05

        # After both services are registered, restored_metrics should be empty
        assert len(conversation_stats.restored_metrics) == 0


def test_save_and_restore_workflow(mock_file_store):
    """Test the full workflow of saving and restoring metrics."""
    # Create initial conversation stats
    conversation_id = 'test-conversation-id'
    user_id = 'test-user-id'
    stats1 = ConversationStats(
        file_store=mock_file_store, conversation_id=conversation_id, user_id=user_id
    )

    # Add a service with metrics
    service_id = 'test-service'
    metrics = Metrics(model_name='gpt-4')
    metrics.add_cost(0.05)
    metrics.add_token_usage(
        prompt_tokens=100,
        completion_tokens=50,
        cache_read_tokens=0,
        cache_write_tokens=0,
        context_window=8000,
        response_id='resp1',
    )
    stats1.service_to_metrics[service_id] = metrics

    # Save metrics
    stats1.save_metrics()

    # Create a new conversation stats instance that should restore the metrics
    stats2 = ConversationStats(
        file_store=mock_file_store, conversation_id=conversation_id, user_id=user_id
    )

    # Verify metrics were restored
    assert service_id in stats2.restored_metrics
    assert stats2.restored_metrics[service_id].accumulated_cost == 0.05
    assert (
        stats2.restored_metrics[service_id].accumulated_token_usage.prompt_tokens == 100
    )
    assert (
        stats2.restored_metrics[service_id].accumulated_token_usage.completion_tokens
        == 50
    )

    # Create a real LLM instance with a mock config
    llm_config = LLMConfig(
        model='gpt-4o',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    # Patch the LLM class to avoid actual API calls
    with patch('openhands.llm.llm.litellm_completion'):
        llm = LLM(service_id=service_id, config=llm_config)

        # Create a registry event
        event = RegistryEvent(llm=llm, service_id=service_id)

        # Register the LLM to trigger restoration
        stats2.register_llm(event)

        # Verify metrics were applied to the LLM
        assert llm.metrics.accumulated_cost == 0.05
        assert llm.metrics.accumulated_token_usage.prompt_tokens == 100
        assert llm.metrics.accumulated_token_usage.completion_tokens == 50