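"""Tests for ConversationStats: per-service metric tracking, persistence to the
file store, restoration of saved metrics, and integration with LLMRegistry events."""
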
import base64
import pickle
from unittest.mock import patch

import pytest

from openhands.core.config import LLMConfig, OpenHandsConfig
from openhands.llm.llm import LLM
from openhands.llm.llm_registry import LLMRegistry, RegistryEvent
from openhands.llm.metrics import Metrics
from openhands.server.services.conversation_stats import ConversationStats
from openhands.storage.memory import InMemoryFileStore


@pytest.fixture
def mock_file_store():
    """Create a mock file store for testing."""
    return InMemoryFileStore({})


@pytest.fixture
def conversation_stats(mock_file_store):
    """Create a ConversationStats instance for testing."""
    return ConversationStats(
        file_store=mock_file_store,
        conversation_id='test-conversation-id',
        user_id='test-user-id',
    )


@pytest.fixture
def mock_llm_registry():
    """Create a real LLMRegistry instance used to drive LLM registration in tests."""
    config = OpenHandsConfig()
    registry = LLMRegistry(config=config, agent_cls=None, retry_listener=None)
    return registry


@pytest.fixture
def connected_registry_and_stats(mock_llm_registry, conversation_stats):
    """Connect the LLMRegistry and ConversationStats properly."""
    # Subscribe to LLM registry events to track metrics
    mock_llm_registry.subscribe(conversation_stats.register_llm)
    return mock_llm_registry, conversation_stats


def test_conversation_stats_initialization(conversation_stats):
    """Test that ConversationStats initializes correctly."""
    assert conversation_stats.conversation_id == 'test-conversation-id'
    assert conversation_stats.user_id == 'test-user-id'
    assert conversation_stats.service_to_metrics == {}
    assert isinstance(conversation_stats.restored_metrics, dict)


def test_save_metrics(conversation_stats, mock_file_store):
    """Test that metrics are saved correctly."""
    # Add a service with metrics
    service_id = 'test-service'
    metrics = Metrics(model_name='gpt-4')
    metrics.add_cost(0.05)
    conversation_stats.service_to_metrics[service_id] = metrics

    # Save metrics
    conversation_stats.save_metrics()

    # Verify that metrics were saved to the file store
    try:
        # Verify the saved content can be decoded and unpickled
        encoded = mock_file_store.read(conversation_stats.metrics_path)
        pickled = base64.b64decode(encoded)
        restored = pickle.loads(pickled)

        assert service_id in restored
        assert restored[service_id].accumulated_cost == 0.05
    except FileNotFoundError:
        pytest.fail(f'File not found: {conversation_stats.metrics_path}')


def test_maybe_restore_metrics(mock_file_store):
    """Test that metrics are restored correctly."""
    # Create metrics to save
    service_id = 'test-service'
    metrics = Metrics(model_name='gpt-4')
    metrics.add_cost(0.1)
    service_to_metrics = {service_id: metrics}

    # Serialize and save metrics
    pickled = pickle.dumps(service_to_metrics)
    serialized_metrics = base64.b64encode(pickled).decode('utf-8')

    # Create a new ConversationStats with a pre-populated file store
    conversation_id = 'test-conversation-id'
    user_id = 'test-user-id'

    # Get the correct path using the same function as ConversationStats
    from openhands.storage.locations import get_conversation_stats_filename

    metrics_path = get_conversation_stats_filename(conversation_id, user_id)

    # Write to the correct path
    mock_file_store.write(metrics_path, serialized_metrics)

    # Create a ConversationStats instance, which should restore the metrics
    stats = ConversationStats(
        file_store=mock_file_store, conversation_id=conversation_id, user_id=user_id
    )

    # Verify metrics were restored
    assert service_id in stats.restored_metrics
    assert stats.restored_metrics[service_id].accumulated_cost == 0.1


def test_get_combined_metrics(conversation_stats):
    """Test that combined metrics are calculated correctly."""
    # Add multiple services with metrics
    service1 = 'service1'
    metrics1 = Metrics(model_name='gpt-4')
    metrics1.add_cost(0.05)
    metrics1.add_token_usage(
        prompt_tokens=100,
        completion_tokens=50,
        cache_read_tokens=0,
        cache_write_tokens=0,
        context_window=8000,
        response_id='resp1',
    )

    service2 = 'service2'
    metrics2 = Metrics(model_name='gpt-3.5')
    metrics2.add_cost(0.02)
    metrics2.add_token_usage(
        prompt_tokens=200,
        completion_tokens=100,
        cache_read_tokens=0,
        cache_write_tokens=0,
        context_window=4000,
        response_id='resp2',
    )

    conversation_stats.service_to_metrics[service1] = metrics1
    conversation_stats.service_to_metrics[service2] = metrics2

    # Get combined metrics
    combined = conversation_stats.get_combined_metrics()

    # Verify combined metrics
    assert combined.accumulated_cost == 0.07  # 0.05 + 0.02
    assert combined.accumulated_token_usage.prompt_tokens == 300  # 100 + 200
    assert combined.accumulated_token_usage.completion_tokens == 150  # 50 + 100
    assert (
        combined.accumulated_token_usage.context_window == 8000
    )  # max of 8000 and 4000


def test_get_metrics_for_service(conversation_stats):
    """Test that metrics for a specific service are retrieved correctly."""
    # Add a service with metrics
    service_id = 'test-service'
    metrics = Metrics(model_name='gpt-4')
    metrics.add_cost(0.05)
    conversation_stats.service_to_metrics[service_id] = metrics

    # Get metrics for the service
    retrieved_metrics = conversation_stats.get_metrics_for_service(service_id)

    # Verify metrics
    assert retrieved_metrics.accumulated_cost == 0.05
    assert retrieved_metrics is metrics  # Should be the same object

    # Test getting metrics for a non-existent service
    # Use a specific exception message pattern instead of a blind Exception
    with pytest.raises(Exception, match='LLM service does not exist'):
        conversation_stats.get_metrics_for_service('non-existent-service')


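# The tests below build near-identical LLMConfig objects with fast retry settings.
# A small helper such as this sketch could factor out that repetition; it is
# illustrative only (hypothetical name) and intentionally unused by the tests as
# written.
def _make_test_llm_config(model: str = 'gpt-4o') -> LLMConfig:
    """Hypothetical helper: build a throwaway LLMConfig with fast retry settings."""
    return LLMConfig(
        model=model,
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

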
def test_register_llm_with_new_service(conversation_stats):
    """Test registering a new LLM service."""
    # Create a real LLM instance with a mock config
    llm_config = LLMConfig(
        model='gpt-4o',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    # Patch the LLM class to avoid actual API calls
    with patch('openhands.llm.llm.litellm_completion'):
        llm = LLM(service_id='new-service', config=llm_config)

        # Create a registry event
        service_id = 'new-service'
        event = RegistryEvent(llm=llm, service_id=service_id)

        # Register the LLM
        conversation_stats.register_llm(event)

        # Verify the service was registered
        assert service_id in conversation_stats.service_to_metrics
        assert conversation_stats.service_to_metrics[service_id] is llm.metrics


def test_register_llm_with_restored_metrics(conversation_stats):
    """Test registering an LLM service with restored metrics."""
    # Create restored metrics
    service_id = 'restored-service'
    restored_metrics = Metrics(model_name='gpt-4')
    restored_metrics.add_cost(0.1)
    conversation_stats.restored_metrics = {service_id: restored_metrics}

    # Create a real LLM instance with a mock config
    llm_config = LLMConfig(
        model='gpt-4o',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    # Patch the LLM class to avoid actual API calls
    with patch('openhands.llm.llm.litellm_completion'):
        llm = LLM(service_id=service_id, config=llm_config)

        # Create a registry event
        event = RegistryEvent(llm=llm, service_id=service_id)

        # Register the LLM
        conversation_stats.register_llm(event)

        # Verify the service was registered with restored metrics
        assert service_id in conversation_stats.service_to_metrics
        assert conversation_stats.service_to_metrics[service_id] is llm.metrics
        assert llm.metrics.accumulated_cost == 0.1  # Restored cost

        # Verify the specific service was removed from restored_metrics
        assert service_id not in conversation_stats.restored_metrics
        assert hasattr(
            conversation_stats, 'restored_metrics'
        )  # The dict should still exist


def test_llm_registry_notifications(connected_registry_and_stats):
    """Test that LLM registry notifications update conversation stats."""
    mock_llm_registry, conversation_stats = connected_registry_and_stats

    # Create a new LLM through the registry
    service_id = 'test-service'
    llm_config = LLMConfig(
        model='gpt-4o',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    # Get LLM from registry (this should trigger the notification)
    llm = mock_llm_registry.get_llm(service_id, llm_config)

    # Verify the service was registered in conversation stats
    assert service_id in conversation_stats.service_to_metrics
    assert conversation_stats.service_to_metrics[service_id] is llm.metrics

    # Add some metrics to the LLM
    llm.metrics.add_cost(0.05)
    llm.metrics.add_token_usage(
        prompt_tokens=100,
        completion_tokens=50,
        cache_read_tokens=0,
        cache_write_tokens=0,
        context_window=8000,
        response_id='resp1',
    )

    # Verify the metrics are reflected in conversation stats
    assert conversation_stats.service_to_metrics[service_id].accumulated_cost == 0.05
    assert (
        conversation_stats.service_to_metrics[
            service_id
        ].accumulated_token_usage.prompt_tokens
        == 100
    )
    assert (
        conversation_stats.service_to_metrics[
            service_id
        ].accumulated_token_usage.completion_tokens
        == 50
    )

    # Get combined metrics and verify
    combined = conversation_stats.get_combined_metrics()
    assert combined.accumulated_cost == 0.05
    assert combined.accumulated_token_usage.prompt_tokens == 100
    assert combined.accumulated_token_usage.completion_tokens == 50


def test_multiple_llm_services(connected_registry_and_stats):
    """Test tracking metrics for multiple LLM services."""
    mock_llm_registry, conversation_stats = connected_registry_and_stats

    # Create multiple LLMs through the registry
    service1 = 'service1'
    service2 = 'service2'

    llm_config1 = LLMConfig(
        model='gpt-4o',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    llm_config2 = LLMConfig(
        model='gpt-3.5-turbo',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    # Get LLMs from registry (this should trigger notifications)
    llm1 = mock_llm_registry.get_llm(service1, llm_config1)
    llm2 = mock_llm_registry.get_llm(service2, llm_config2)

    # Add different metrics to each LLM
    llm1.metrics.add_cost(0.05)
    llm1.metrics.add_token_usage(
        prompt_tokens=100,
        completion_tokens=50,
        cache_read_tokens=0,
        cache_write_tokens=0,
        context_window=8000,
        response_id='resp1',
    )

    llm2.metrics.add_cost(0.02)
    llm2.metrics.add_token_usage(
        prompt_tokens=200,
        completion_tokens=100,
        cache_read_tokens=0,
        cache_write_tokens=0,
        context_window=4000,
        response_id='resp2',
    )

    # Verify services were registered in conversation stats
    assert service1 in conversation_stats.service_to_metrics
    assert service2 in conversation_stats.service_to_metrics

    # Verify individual metrics
    assert conversation_stats.service_to_metrics[service1].accumulated_cost == 0.05
    assert conversation_stats.service_to_metrics[service2].accumulated_cost == 0.02

    # Get combined metrics and verify
    combined = conversation_stats.get_combined_metrics()
    assert combined.accumulated_cost == 0.07  # 0.05 + 0.02
    assert combined.accumulated_token_usage.prompt_tokens == 300  # 100 + 200
    assert combined.accumulated_token_usage.completion_tokens == 150  # 50 + 100
    assert (
        combined.accumulated_token_usage.context_window == 8000
    )  # max of 8000 and 4000


def test_register_llm_with_multiple_restored_services_bug(conversation_stats):
    """Reproduce the bug where `del self.restored_metrics` deleted the entire
    dict instead of removing only the specific service."""
    # Create restored metrics for multiple services
    service_id_1 = 'service-1'
    service_id_2 = 'service-2'

    restored_metrics_1 = Metrics(model_name='gpt-4')
    restored_metrics_1.add_cost(0.1)

    restored_metrics_2 = Metrics(model_name='gpt-3.5')
    restored_metrics_2.add_cost(0.05)

    # Set up restored metrics for both services
    conversation_stats.restored_metrics = {
        service_id_1: restored_metrics_1,
        service_id_2: restored_metrics_2,
    }

    # Create LLM configs
    llm_config_1 = LLMConfig(
        model='gpt-4o',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    llm_config_2 = LLMConfig(
        model='gpt-3.5-turbo',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    # Patch the LLM class to avoid actual API calls
    with patch('openhands.llm.llm.litellm_completion'):
        # Register first LLM
        llm_1 = LLM(service_id=service_id_1, config=llm_config_1)
        event_1 = RegistryEvent(llm=llm_1, service_id=service_id_1)
        conversation_stats.register_llm(event_1)

        # Verify first service was registered with restored metrics
        assert service_id_1 in conversation_stats.service_to_metrics
        assert llm_1.metrics.accumulated_cost == 0.1

        # After registering the first service, restored_metrics should still contain service_id_2
        assert service_id_2 in conversation_stats.restored_metrics

        # Register second LLM - this should also work with restored metrics
        llm_2 = LLM(service_id=service_id_2, config=llm_config_2)
        event_2 = RegistryEvent(llm=llm_2, service_id=service_id_2)
        conversation_stats.register_llm(event_2)

        # Verify second service was registered with restored metrics
        assert service_id_2 in conversation_stats.service_to_metrics
        assert llm_2.metrics.accumulated_cost == 0.05

        # After both services are registered, restored_metrics should be empty
        assert len(conversation_stats.restored_metrics) == 0


def test_save_and_restore_workflow(mock_file_store):
    """Test the full workflow of saving and restoring metrics."""
    # Create initial conversation stats
    conversation_id = 'test-conversation-id'
    user_id = 'test-user-id'

    stats1 = ConversationStats(
        file_store=mock_file_store, conversation_id=conversation_id, user_id=user_id
    )

    # Add a service with metrics
    service_id = 'test-service'
    metrics = Metrics(model_name='gpt-4')
    metrics.add_cost(0.05)
    metrics.add_token_usage(
        prompt_tokens=100,
        completion_tokens=50,
        cache_read_tokens=0,
        cache_write_tokens=0,
        context_window=8000,
        response_id='resp1',
    )
    stats1.service_to_metrics[service_id] = metrics

    # Save metrics
    stats1.save_metrics()

    # Create a new conversation stats instance that should restore the metrics
    stats2 = ConversationStats(
        file_store=mock_file_store, conversation_id=conversation_id, user_id=user_id
    )

    # Verify metrics were restored
    assert service_id in stats2.restored_metrics
    assert stats2.restored_metrics[service_id].accumulated_cost == 0.05
    assert (
        stats2.restored_metrics[service_id].accumulated_token_usage.prompt_tokens == 100
    )
    assert (
        stats2.restored_metrics[service_id].accumulated_token_usage.completion_tokens
        == 50
    )

    # Create a real LLM instance with a mock config
    llm_config = LLMConfig(
        model='gpt-4o',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )

    # Patch the LLM class to avoid actual API calls
    with patch('openhands.llm.llm.litellm_completion'):
        llm = LLM(service_id=service_id, config=llm_config)

        # Create a registry event
        event = RegistryEvent(llm=llm, service_id=service_id)

        # Register the LLM to trigger restoration
        stats2.register_llm(event)

        # Verify metrics were applied to the LLM
        assert llm.metrics.accumulated_cost == 0.05
        assert llm.metrics.accumulated_token_usage.prompt_tokens == 100
        assert llm.metrics.accumulated_token_usage.completion_tokens == 50