import copy
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest
from litellm import PromptTokensDetails
from litellm.exceptions import (
    RateLimitError,
)

from openhands.core.config import LLMConfig
from openhands.core.exceptions import LLMNoResponseError, OperationCancelled
from openhands.core.message import Message, TextContent
from openhands.llm.llm import LLM
from openhands.llm.metrics import Metrics, TokenUsage


@pytest.fixture(autouse=True)
def mock_logger(monkeypatch):
    # suppress logging of completion data to file
    mock_logger = MagicMock()
    monkeypatch.setattr('openhands.llm.debug_mixin.llm_prompt_logger', mock_logger)
    monkeypatch.setattr('openhands.llm.debug_mixin.llm_response_logger', mock_logger)
    monkeypatch.setattr('openhands.llm.llm.logger', mock_logger)
    return mock_logger


@pytest.fixture
def default_config():
    return LLMConfig(
        model='gpt-4o',
        api_key='test_key',
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=2,
    )


def test_llm_init_with_default_config(default_config):
    llm = LLM(default_config, service_id='test-service')
    assert llm.config.model == 'gpt-4o'
    assert llm.config.api_key.get_secret_value() == 'test_key'
    assert isinstance(llm.metrics, Metrics)
    assert llm.metrics.model_name == 'gpt-4o'


def test_token_usage_add():
    """Test that TokenUsage instances can be added together."""
    # Create two TokenUsage instances
    usage1 = TokenUsage(
        model='model1',
        prompt_tokens=10,
        completion_tokens=5,
        cache_read_tokens=3,
        cache_write_tokens=2,
        response_id='response-1',
    )

    usage2 = TokenUsage(
        model='model2',
        prompt_tokens=8,
        completion_tokens=6,
        cache_read_tokens=2,
        cache_write_tokens=4,
        response_id='response-2',
    )

    # Add them together
    combined = usage1 + usage2

    # Verify the result
    assert combined.model == 'model1'  # Should keep the model from the first instance
    assert combined.prompt_tokens == 18  # 10 + 8
    assert combined.completion_tokens == 11  # 5 + 6
    assert combined.cache_read_tokens == 5  # 3 + 2
    assert combined.cache_write_tokens == 6  # 2 + 4
    assert (
        combined.response_id == 'response-1'
    )  # Should keep the response_id from the first instance


def test_metrics_merge_accumulated_token_usage():
    """Test that accumulated token usage is properly merged between two Metrics instances."""
    # Create two Metrics instances
    metrics1 = Metrics(model_name='model1')
    metrics2 = Metrics(model_name='model2')

    # Add token usage to each
    metrics1.add_token_usage(10, 5, 3, 2, 1000, 'response-1')
    metrics2.add_token_usage(8, 6, 2, 4, 1000, 'response-2')

    # Verify initial accumulated token usage
    metrics1_data = metrics1.get()
    accumulated1 = metrics1_data['accumulated_token_usage']
    assert accumulated1['prompt_tokens'] == 10
    assert accumulated1['completion_tokens'] == 5
    assert accumulated1['cache_read_tokens'] == 3
    assert accumulated1['cache_write_tokens'] == 2

    metrics2_data = metrics2.get()
    accumulated2 = metrics2_data['accumulated_token_usage']
    assert accumulated2['prompt_tokens'] == 8
    assert accumulated2['completion_tokens'] == 6
    assert accumulated2['cache_read_tokens'] == 2
    assert accumulated2['cache_write_tokens'] == 4

    # Merge metrics2 into metrics1
    metrics1.merge(metrics2)

    # Verify merged accumulated token usage
    merged_data = metrics1.get()
    merged_accumulated = merged_data['accumulated_token_usage']
    assert merged_accumulated['prompt_tokens'] == 18  # 10 + 8
    assert merged_accumulated['completion_tokens'] == 11  # 5 + 6
    assert merged_accumulated['cache_read_tokens'] == 5  # 3 + 2
    assert merged_accumulated['cache_write_tokens'] == 6  # 2 + 4

    # Verify individual token usage records are maintained
    token_usages = merged_data['token_usages']
    assert len(token_usages) == 2
    assert token_usages[0]['response_id'] == 'response-1'
    assert token_usages[1]['response_id'] == 'response-2'


@patch('openhands.llm.llm.litellm.get_model_info')
def test_llm_init_with_model_info(mock_get_model_info, default_config):
    mock_get_model_info.return_value = {
        'max_input_tokens': 8000,
        'max_output_tokens': 2000,
    }
    llm = LLM(default_config, service_id='test-service')
    llm.init_model_info()
    assert llm.config.max_input_tokens == 8000
    assert llm.config.max_output_tokens == 2000


@patch('openhands.llm.llm.litellm.get_model_info')
def test_llm_init_without_model_info(mock_get_model_info, default_config):
    mock_get_model_info.side_effect = Exception('Model info not available')
    llm = LLM(default_config, service_id='test-service')
    llm.init_model_info()
    assert llm.config.max_input_tokens is None
    assert llm.config.max_output_tokens is None


def test_llm_init_with_custom_config():
    custom_config = LLMConfig(
        model='custom-model',
        api_key='custom_key',
        max_input_tokens=5000,
        max_output_tokens=1500,
        temperature=0.8,
        top_p=0.9,
        top_k=None,
    )
    llm = LLM(custom_config, service_id='test-service')
    assert llm.config.model == 'custom-model'
    assert llm.config.api_key.get_secret_value() == 'custom_key'
    assert llm.config.max_input_tokens == 5000
    assert llm.config.max_output_tokens == 1500
    assert llm.config.temperature == 0.8
    assert llm.config.top_p == 0.9
    assert llm.config.top_k is None


@patch('openhands.llm.llm.litellm_completion')
def test_llm_top_k_in_completion_when_set(mock_litellm_completion):
    # Create a config with top_k set
    config_with_top_k = LLMConfig(top_k=50)
    llm = LLM(config_with_top_k, service_id='test-service')

    # Define a side effect function to check top_k
    def side_effect(*args, **kwargs):
        assert 'top_k' in kwargs
        assert kwargs['top_k'] == 50
        return {'choices': [{'message': {'content': 'Mocked response'}}]}

    mock_litellm_completion.side_effect = side_effect

    # Call completion
    llm.completion(messages=[{'role': 'system', 'content': 'Test message'}])


@patch('openhands.llm.llm.litellm_completion')
def test_llm_top_k_not_in_completion_when_none(mock_litellm_completion):
    # Create a config with top_k set to None
    config_without_top_k = LLMConfig(top_k=None)
    llm = LLM(config_without_top_k, service_id='test-service')

    # Define a side effect function to check top_k
    def side_effect(*args, **kwargs):
        assert 'top_k' not in kwargs
        return {'choices': [{'message': {'content': 'Mocked response'}}]}

    mock_litellm_completion.side_effect = side_effect

    # Call completion
    llm.completion(messages=[{'role': 'system', 'content': 'Test message'}])


def test_llm_init_with_metrics():
    config = LLMConfig(model='gpt-4o', api_key='test_key')
    metrics = Metrics()
    llm = LLM(config, metrics=metrics, service_id='test-service')
    assert llm.metrics is metrics
    assert (
        llm.metrics.model_name == 'default'
    )  # because we didn't specify model_name in Metrics init


@patch('openhands.llm.llm.litellm_completion')
@patch('time.time')
def test_response_latency_tracking(mock_time, mock_litellm_completion):
    # Mock time.time() to return controlled values
    mock_time.side_effect = [1000.0, 1002.5]  # Start time, end time (2.5s difference)

    # Mock the completion response with a specific ID
    mock_response = {
        'id': 'test-response-123',
        'choices': [{'message': {'content': 'Test response'}}],
    }
    mock_litellm_completion.return_value = mock_response

    # Create LLM instance and make a completion call
    config = LLMConfig(model='gpt-4o', api_key='test_key')
    llm = LLM(config, service_id='test-service')
    response = llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}])

    # Verify the response latency was tracked correctly
    assert len(llm.metrics.response_latencies) == 1
    latency_record = llm.metrics.response_latencies[0]
    assert latency_record.model == 'gpt-4o'
    assert (
        latency_record.latency == 2.5
    )  # Should be the difference between our mocked times
    assert latency_record.response_id == 'test-response-123'

    # Verify the completion response was returned correctly
    assert response['id'] == 'test-response-123'
    assert response['choices'][0]['message']['content'] == 'Test response'

    # To make sure the metrics fail gracefully, set the start/end time to go backwards.
    mock_time.side_effect = [1000.0, 999.0]
    llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}])

    # There should now be 2 latencies, the last of which has the value clipped to 0
    assert len(llm.metrics.response_latencies) == 2
    latency_record = llm.metrics.response_latencies[-1]
    assert latency_record.latency == 0.0  # Should be lifted to 0 instead of being -1!


@patch('openhands.llm.llm.litellm.get_model_info')
def test_llm_init_with_openrouter_model(mock_get_model_info, default_config):
    default_config.model = 'openrouter:gpt-4o-mini'
    mock_get_model_info.return_value = {
        'max_input_tokens': 7000,
        'max_output_tokens': 1500,
    }
    llm = LLM(default_config, service_id='test-service')
    llm.init_model_info()
    assert llm.config.max_input_tokens == 7000
    assert llm.config.max_output_tokens == 1500
    mock_get_model_info.assert_called_once_with('openrouter:gpt-4o-mini')


@patch('openhands.llm.llm.litellm_completion')
def test_stop_parameter_handling(mock_litellm_completion, default_config):
    """Test that stop parameter is only added for supported models."""
    from litellm.types.utils import ModelResponse

    mock_response = ModelResponse(
        id='test-id',
        choices=[{'message': {'content': 'Test response'}}],
        model='test-model',
    )
    mock_litellm_completion.return_value = mock_response

    # Test with a model that supports stop parameter
    default_config.model = (
        'custom-model'  # Use a model not in FUNCTION_CALLING_SUPPORTED_MODELS
    )
    llm = LLM(default_config, service_id='test-service')
    llm.completion(
        messages=[{'role': 'user', 'content': 'Hello!'}],
        tools=[
            {'type': 'function', 'function': {'name': 'test', 'description': 'test'}}
        ],
    )
    # Verify stop parameter was included
    assert 'stop' in mock_litellm_completion.call_args[1]

    # Test with Grok-4 model that doesn't support stop parameter
    default_config.model = 'xai/grok-4-0709'
    llm = LLM(default_config, service_id='test-service')
    llm.completion(
        messages=[{'role': 'user', 'content': 'Hello!'}],
        tools=[
            {'type': 'function', 'function': {'name': 'test', 'description': 'test'}}
        ],
    )
    # Verify stop parameter was not included
    assert 'stop' not in mock_litellm_completion.call_args[1]


# Tests involving completion and retries


@patch('openhands.llm.llm.litellm_completion')
def test_completion_with_mocked_logger(
    mock_litellm_completion, default_config, mock_logger
):
    mock_litellm_completion.return_value = {
        'choices': [{'message': {'content': 'Test response'}}]
    }

    llm = LLM(config=default_config, service_id='test-service')
    response = llm.completion(
        messages=[{'role': 'user', 'content': 'Hello!'}],
        stream=False,
    )

    assert response['choices'][0]['message']['content'] == 'Test response'
    assert mock_litellm_completion.call_count == 1

    mock_logger.debug.assert_called()


@pytest.mark.parametrize(
    'exception_class,extra_args,expected_retries',
    [
        (RateLimitError, {'llm_provider': 'test_provider', 'model': 'test_model'}, 2),
    ],
)
@patch('openhands.llm.llm.litellm_completion')
def test_completion_retries(
    mock_litellm_completion,
    default_config,
    exception_class,
    extra_args,
    expected_retries,
):
    mock_litellm_completion.side_effect = [
        exception_class('Test error message', **extra_args),
        {'choices': [{'message': {'content': 'Retry successful'}}]},
    ]

    llm = LLM(config=default_config, service_id='test-service')
    response = llm.completion(
        messages=[{'role': 'user', 'content': 'Hello!'}],
        stream=False,
    )

    assert response['choices'][0]['message']['content'] == 'Retry successful'
    assert mock_litellm_completion.call_count == expected_retries


@patch('openhands.llm.llm.litellm_completion')
def test_completion_rate_limit_wait_time(mock_litellm_completion, default_config):
    with patch('time.sleep') as mock_sleep:
        mock_litellm_completion.side_effect = [
            RateLimitError(
                'Rate limit exceeded', llm_provider='test_provider', model='test_model'
            ),
            {'choices': [{'message': {'content': 'Retry successful'}}]},
        ]

        llm = LLM(config=default_config, service_id='test-service')
        response = llm.completion(
            messages=[{'role': 'user', 'content': 'Hello!'}],
            stream=False,
        )

        assert response['choices'][0]['message']['content'] == 'Retry successful'
        assert mock_litellm_completion.call_count == 2

        mock_sleep.assert_called_once()
        wait_time = mock_sleep.call_args[0][0]
        assert (
            default_config.retry_min_wait <= wait_time <= default_config.retry_max_wait
        ), (
            f'Expected wait time between {default_config.retry_min_wait} and {default_config.retry_max_wait} seconds, but got {wait_time}'
        )


@patch('openhands.llm.llm.litellm_completion')
def test_completion_operation_cancelled(mock_litellm_completion, default_config):
    mock_litellm_completion.side_effect = OperationCancelled('Operation cancelled')

    llm = LLM(config=default_config, service_id='test-service')
    with pytest.raises(OperationCancelled):
        llm.completion(
            messages=[{'role': 'user', 'content': 'Hello!'}],
            stream=False,
        )

    assert mock_litellm_completion.call_count == 1


@patch('openhands.llm.llm.litellm_completion')
def test_completion_keyboard_interrupt(mock_litellm_completion, default_config):
    def side_effect(*args, **kwargs):
        raise KeyboardInterrupt('Simulated KeyboardInterrupt')

    mock_litellm_completion.side_effect = side_effect

    llm = LLM(config=default_config, service_id='test-service')
    with pytest.raises(OperationCancelled):
        try:
            llm.completion(
                messages=[{'role': 'user', 'content': 'Hello!'}],
                stream=False,
            )
        except KeyboardInterrupt:
            raise OperationCancelled('Operation cancelled due to KeyboardInterrupt')

    assert mock_litellm_completion.call_count == 1


@patch('openhands.llm.llm.litellm_completion')
def test_completion_keyboard_interrupt_handler(mock_litellm_completion, default_config):
    global _should_exit

    def side_effect(*args, **kwargs):
        global _should_exit
        _should_exit = True
        return {'choices': [{'message': {'content': 'Simulated interrupt response'}}]}

    mock_litellm_completion.side_effect = side_effect

    llm = LLM(config=default_config, service_id='test-service')
    result = llm.completion(
        messages=[{'role': 'user', 'content': 'Hello!'}],
        stream=False,
    )

    assert mock_litellm_completion.call_count == 1
    assert result['choices'][0]['message']['content'] == 'Simulated interrupt response'
    assert _should_exit

    _should_exit = False


@patch('openhands.llm.llm.litellm_completion')
def test_completion_retry_with_llm_no_response_error_zero_temp(
    mock_litellm_completion, default_config
):
    """Test that the retry decorator properly handles LLMNoResponseError by:
    1. First call to llm_completion uses temperature=0 and throws LLMNoResponseError
    2. Second call should have temperature=1.0 and return a successful response
    """

    # Define a side effect function that checks the temperature parameter
    # and returns different responses based on it
    def side_effect(*args, **kwargs):
        temperature = kwargs.get('temperature', 0)

        # First call with temperature=0 should raise LLMNoResponseError
        if temperature == 0:
            raise LLMNoResponseError('LLM did not return a response')

        else:
            return {
                'choices': [
                    {'message': {'content': f'Response with temperature={temperature}'}}
                ]
            }

    mock_litellm_completion.side_effect = side_effect

    # Create LLM instance and make a completion call
    llm = LLM(config=default_config, service_id='test-service')
    response = llm.completion(
        messages=[{'role': 'user', 'content': 'Hello!'}],
        stream=False,
        temperature=0,  # Initial temperature is 0
    )

    # Verify the response after retry
    assert (
        response['choices'][0]['message']['content'] == 'Response with temperature=1.0'
    )

    # Verify that litellm_completion was called twice
    assert mock_litellm_completion.call_count == 2

    # Check the temperature in the first call (should be 0)
    first_call_kwargs = mock_litellm_completion.call_args_list[0][1]
    assert first_call_kwargs.get('temperature') == 0

    # Check the temperature in the second call (should be 1.0)
    second_call_kwargs = mock_litellm_completion.call_args_list[1][1]
    assert second_call_kwargs.get('temperature') == 1.0


@patch('openhands.llm.llm.litellm_completion')
def test_completion_retry_with_llm_no_response_error_nonzero_temp(
    mock_litellm_completion, default_config
):
    """Test that the retry decorator works for LLMNoResponseError when initial temperature is non-zero,
    and keeps the original temperature on retry.

    This test verifies that when LLMNoResponseError is raised with a non-zero temperature:
    1. The retry mechanism is triggered
    2. The temperature remains unchanged (not overridden on retry)
    3. After all retries are exhausted, the error is raised
    """
    mock_litellm_completion.side_effect = LLMNoResponseError(
        'LLM did not return a response'
    )

    llm = LLM(config=default_config, service_id='test-service')
    with pytest.raises(LLMNoResponseError):
        llm.completion(
            messages=[{'role': 'user', 'content': 'Hello!'}],
            stream=False,
            temperature=0.7,  # Non-zero temperature
        )

    # Verify that litellm_completion was called the expected number of times
    assert mock_litellm_completion.call_count == default_config.num_retries

    # Check that all calls used the original temperature
    for call in mock_litellm_completion.call_args_list:
        assert call[1].get('temperature') == 0.7


@patch('openhands.llm.llm.litellm.get_model_info')
@patch('openhands.llm.llm.httpx.get')
def test_gemini_25_pro_function_calling(mock_httpx_get, mock_get_model_info):
    """Test that Gemini 2.5 Pro models have function calling enabled by default.
    This includes testing various model name formats with different prefixes.
    """
    # Mock the model info response
    mock_get_model_info.return_value = {
        'max_input_tokens': 8000,
        'max_output_tokens': 2000,
    }

    # Mock the httpx response for litellm proxy
    mock_response = MagicMock()
    mock_response.json.return_value = {
        'data': [
            {
                'model_name': 'gemini-2.5-pro-preview-03-25',
                'model_info': {
                    'max_input_tokens': 8000,
                    'max_output_tokens': 2000,
                },
            }
        ]
    }
    mock_httpx_get.return_value = mock_response

    # Test cases with model names and expected function calling support
    test_cases = [
        # Base model names
        ('gemini-2.5-pro-preview-03-25', True),
        ('gemini-2.5-pro-exp-03-25', True),
        # With gemini/ prefix
        ('gemini/gemini-2.5-pro-preview-03-25', True),
        ('gemini/gemini-2.5-pro-exp-03-25', True),
        # With litellm_proxy/ prefix
        ('litellm_proxy/gemini-2.5-pro-preview-03-25', True),
        ('litellm_proxy/gemini-2.5-pro-exp-03-25', True),
        # With openrouter/gemini/ prefix
        ('openrouter/gemini/gemini-2.5-pro-preview-03-25', True),
        ('openrouter/gemini/gemini-2.5-pro-exp-03-25', True),
        # With litellm_proxy/gemini/ prefix
        ('litellm_proxy/gemini/gemini-2.5-pro-preview-03-25', True),
        ('litellm_proxy/gemini/gemini-2.5-pro-exp-03-25', True),
        # Control case - a model that shouldn't have function calling
        ('gemini-1.0-pro', False),
    ]

    for model_name, expected_support in test_cases:
        config = LLMConfig(model=model_name, api_key='test_key')
        llm = LLM(config, service_id='test-service')

        assert llm.is_function_calling_active() == expected_support, (
            f'Expected function calling support to be {expected_support} for model {model_name}'
        )


@patch('openhands.llm.llm.litellm_completion')
def test_completion_retry_with_llm_no_response_error_nonzero_temp_successful_retry(
    mock_litellm_completion, default_config
):
    """Test that the retry decorator works for LLMNoResponseError with non-zero temperature
    and successfully retries while preserving the original temperature.

    This test verifies that:
    1. First call to llm_completion with temperature=0.7 throws LLMNoResponseError
    2. Second call with the same temperature=0.7 returns a successful response
    """

    # Define a side effect function that raises LLMNoResponseError on first call
    # and returns a successful response on second call
    def side_effect(*args, **kwargs):
        temperature = kwargs.get('temperature', 0)

        if mock_litellm_completion.call_count == 1:
            # First call should raise LLMNoResponseError
            raise LLMNoResponseError('LLM did not return a response')
        else:
            # Second call should return a successful response
            return {
                'choices': [
                    {
                        'message': {
                            'content': f'Successful response with temperature={temperature}'
                        }
                    }
                ]
            }

    mock_litellm_completion.side_effect = side_effect

    # Create LLM instance and make a completion call with non-zero temperature
    llm = LLM(config=default_config, service_id='test-service')
    response = llm.completion(
        messages=[{'role': 'user', 'content': 'Hello!'}],
        stream=False,
        temperature=0.7,  # Non-zero temperature
    )

    # Verify the response after retry
    assert (
        response['choices'][0]['message']['content']
        == 'Successful response with temperature=0.7'
    )

    # Verify that litellm_completion was called twice
    assert mock_litellm_completion.call_count == 2

    # Check the temperature in the first call (should be 0.7)
    first_call_kwargs = mock_litellm_completion.call_args_list[0][1]
    assert first_call_kwargs.get('temperature') == 0.7

    # Check the temperature in the second call (should still be 0.7)
    second_call_kwargs = mock_litellm_completion.call_args_list[1][1]
    assert second_call_kwargs.get('temperature') == 0.7


@patch('openhands.llm.llm.litellm_completion')
def test_completion_retry_with_llm_no_response_error_successful_retry(
    mock_litellm_completion, default_config
):
    """Test that the retry decorator works for LLMNoResponseError with zero temperature
    and successfully retries with temperature=1.0.

    This test verifies that:
    1. First call to llm_completion with temperature=0 throws LLMNoResponseError
    2. Second call with temperature=1.0 returns a successful response
    """

    # Define a side effect function that raises LLMNoResponseError on first call
    # and returns a successful response on second call
    def side_effect(*args, **kwargs):
        temperature = kwargs.get('temperature', 0)

        if mock_litellm_completion.call_count == 1:
            # First call should raise LLMNoResponseError
            raise LLMNoResponseError('LLM did not return a response')
        else:
            # Second call should return a successful response
            return {
                'choices': [
                    {
                        'message': {
                            'content': f'Successful response with temperature={temperature}'
                        }
                    }
                ]
            }

    mock_litellm_completion.side_effect = side_effect

    # Create LLM instance and make a completion call with explicit temperature=0
    llm = LLM(config=default_config, service_id='test-service')
    response = llm.completion(
        messages=[{'role': 'user', 'content': 'Hello!'}],
        stream=False,
        temperature=0,  # Explicitly set temperature to 0
    )

    # Verify the response after retry
    assert (
        response['choices'][0]['message']['content']
        == 'Successful response with temperature=1.0'
    )

    # Verify that litellm_completion was called twice
    assert mock_litellm_completion.call_count == 2

    # Check the temperature in the first call (should be 0)
    first_call_kwargs = mock_litellm_completion.call_args_list[0][1]
    assert first_call_kwargs.get('temperature') == 0

    # Check the temperature in the second call (should be 1.0)
    second_call_kwargs = mock_litellm_completion.call_args_list[1][1]
    assert second_call_kwargs.get('temperature') == 1.0


@patch('openhands.llm.llm.litellm_completion')
def test_completion_with_litellm_mock(mock_litellm_completion, default_config):
    mock_response = {
        'choices': [{'message': {'content': 'This is a mocked response.'}}]
    }
    mock_litellm_completion.return_value = mock_response

    test_llm = LLM(config=default_config, service_id='test-service')
    response = test_llm.completion(
        messages=[{'role': 'user', 'content': 'Hello!'}],
        stream=False,
        drop_params=True,
    )

    # Assertions
    assert response['choices'][0]['message']['content'] == 'This is a mocked response.'
    mock_litellm_completion.assert_called_once()

    # Check if the correct arguments were passed to litellm_completion
    call_args = mock_litellm_completion.call_args[1]  # Get keyword arguments
    assert call_args['model'] == default_config.model
    assert call_args['messages'] == [{'role': 'user', 'content': 'Hello!'}]
    assert not call_args['stream']


@patch('openhands.llm.llm.litellm_completion')
def test_llm_gemini_thinking_parameter(mock_litellm_completion, default_config):
    """Test that the 'thinking' parameter is correctly passed to litellm_completion
    when a Gemini model is used with 'low' reasoning_effort.
    """
    # Configure for Gemini model with low reasoning effort
    gemini_config = copy.deepcopy(default_config)
    gemini_config.model = 'gemini-2.5-pro'
    gemini_config.reasoning_effort = 'low'

    # Mock the response from litellm
    mock_litellm_completion.return_value = {
        'choices': [{'message': {'content': 'Test response'}}]
    }

    # Initialize LLM and call completion
    llm = LLM(config=gemini_config, service_id='test-service')
    llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}])

    # Verify that litellm_completion was called with the 'thinking' parameter
    mock_litellm_completion.assert_called_once()
    call_args, call_kwargs = mock_litellm_completion.call_args
    assert 'thinking' in call_kwargs
    assert call_kwargs['thinking'] == {'budget_tokens': 128}
    assert 'reasoning_effort' not in call_kwargs

    # Ensure the first positional argument (model) was ignored
    assert (
        len(call_args) == 0
    )  # No positional args should be passed to litellm_completion here


@patch('openhands.llm.llm.litellm.token_counter')
def test_get_token_count_with_dict_messages(mock_token_counter, default_config):
    mock_token_counter.return_value = 42
    llm = LLM(default_config, service_id='test-service')
    messages = [{'role': 'user', 'content': 'Hello!'}]

    token_count = llm.get_token_count(messages)

    assert token_count == 42
    mock_token_counter.assert_called_once_with(
        model=default_config.model, messages=messages, custom_tokenizer=None
    )


@patch('openhands.llm.llm.litellm.token_counter')
def test_get_token_count_with_message_objects(
    mock_token_counter, default_config, mock_logger
):
    llm = LLM(default_config, service_id='test-service')

    # Create a Message object and its equivalent dict
    message_obj = Message(role='user', content=[TextContent(text='Hello!')])
    message_dict = {'role': 'user', 'content': 'Hello!'}

    # Mock token counter to return different values for each call
    mock_token_counter.side_effect = [42, 42]  # Same value for both cases

    # Get token counts for both formats
    token_count_obj = llm.get_token_count([message_obj])
    token_count_dict = llm.get_token_count([message_dict])

    # Verify both formats get the same token count
    assert token_count_obj == token_count_dict
    assert mock_token_counter.call_count == 2


@patch('openhands.llm.llm.litellm.token_counter')
@patch('openhands.llm.llm.create_pretrained_tokenizer')
def test_get_token_count_with_custom_tokenizer(
    mock_create_tokenizer, mock_token_counter, default_config
):
    mock_tokenizer = MagicMock()
    mock_create_tokenizer.return_value = mock_tokenizer
    mock_token_counter.return_value = 42

    config = copy.deepcopy(default_config)
    config.custom_tokenizer = 'custom/tokenizer'
    llm = LLM(config, service_id='test-service')
    messages = [{'role': 'user', 'content': 'Hello!'}]

    token_count = llm.get_token_count(messages)

    assert token_count == 42
    mock_create_tokenizer.assert_called_once_with('custom/tokenizer')
    mock_token_counter.assert_called_once_with(
        model=config.model, messages=messages, custom_tokenizer=mock_tokenizer
    )


@patch('openhands.llm.llm.litellm.token_counter')
def test_get_token_count_error_handling(
    mock_token_counter, default_config, mock_logger
):
    mock_token_counter.side_effect = Exception('Token counting failed')
    llm = LLM(default_config, service_id='test-service')
    messages = [{'role': 'user', 'content': 'Hello!'}]

    token_count = llm.get_token_count(messages)

    assert token_count == 0
    mock_token_counter.assert_called_once()
    mock_logger.error.assert_called_once_with(
        'Error getting token count for\n model gpt-4o\nToken counting failed'
    )


@patch('openhands.llm.llm.litellm_completion')
def test_llm_token_usage(mock_litellm_completion, default_config):
    # This mock response includes usage details with prompt_tokens,
    # completion_tokens, prompt_tokens_details.cached_tokens, and model_extra.cache_creation_input_tokens
    mock_response_1 = {
        'id': 'test-response-usage',
        'choices': [{'message': {'content': 'Usage test response'}}],
        'usage': {
            'prompt_tokens': 12,
            'completion_tokens': 3,
            'prompt_tokens_details': PromptTokensDetails(cached_tokens=2),
            'model_extra': {'cache_creation_input_tokens': 5},
        },
    }

    # Create a second usage scenario to test accumulation and a different response_id
    mock_response_2 = {
        'id': 'test-response-usage-2',
        'choices': [{'message': {'content': 'Second usage test response'}}],
        'usage': {
            'prompt_tokens': 7,
            'completion_tokens': 2,
            'prompt_tokens_details': PromptTokensDetails(cached_tokens=1),
            'model_extra': {'cache_creation_input_tokens': 3},
        },
    }

    # We'll make mock_litellm_completion return these responses in sequence
    mock_litellm_completion.side_effect = [mock_response_1, mock_response_2]

    llm = LLM(config=default_config, service_id='test-service')

    # First call
    llm.completion(messages=[{'role': 'user', 'content': 'Hello usage!'}])

    # Verify we have exactly one usage record after first call
    token_usage_list = llm.metrics.get()['token_usages']
    assert len(token_usage_list) == 1
    usage_entry_1 = token_usage_list[0]
    assert usage_entry_1['prompt_tokens'] == 12
    assert usage_entry_1['completion_tokens'] == 3
    assert usage_entry_1['cache_read_tokens'] == 2
    assert usage_entry_1['cache_write_tokens'] == 5
    assert usage_entry_1['response_id'] == 'test-response-usage'

    # Second call
    llm.completion(messages=[{'role': 'user', 'content': 'Hello again!'}])

    # Now we expect two usage records total
    token_usage_list = llm.metrics.get()['token_usages']
    assert len(token_usage_list) == 2
    usage_entry_2 = token_usage_list[-1]
    assert usage_entry_2['prompt_tokens'] == 7
    assert usage_entry_2['completion_tokens'] == 2
    assert usage_entry_2['cache_read_tokens'] == 1
    assert usage_entry_2['cache_write_tokens'] == 3
    assert usage_entry_2['response_id'] == 'test-response-usage-2'


@patch('openhands.llm.llm.litellm_completion')
def test_accumulated_token_usage(mock_litellm_completion, default_config):
    """Test that token usage is properly accumulated across multiple LLM calls."""
    # Mock responses with token usage information
    mock_response_1 = {
        'id': 'test-response-1',
        'choices': [{'message': {'content': 'First response'}}],
        'usage': {
            'prompt_tokens': 10,
            'completion_tokens': 5,
            'prompt_tokens_details': PromptTokensDetails(cached_tokens=3),
            'model_extra': {'cache_creation_input_tokens': 4},
        },
    }

    mock_response_2 = {
        'id': 'test-response-2',
        'choices': [{'message': {'content': 'Second response'}}],
        'usage': {
            'prompt_tokens': 8,
            'completion_tokens': 6,
            'prompt_tokens_details': PromptTokensDetails(cached_tokens=2),
            'model_extra': {'cache_creation_input_tokens': 3},
        },
    }

    # Set up the mock to return these responses in sequence
    mock_litellm_completion.side_effect = [mock_response_1, mock_response_2]

    # Create LLM instance
    llm = LLM(config=default_config, service_id='test-service')

    # First call
    llm.completion(messages=[{'role': 'user', 'content': 'First message'}])

    # Check accumulated token usage after first call
    metrics_data = llm.metrics.get()
    accumulated_usage = metrics_data['accumulated_token_usage']

    assert accumulated_usage['prompt_tokens'] == 10
    assert accumulated_usage['completion_tokens'] == 5
    assert accumulated_usage['cache_read_tokens'] == 3
    assert accumulated_usage['cache_write_tokens'] == 4

    # Second call
    llm.completion(messages=[{'role': 'user', 'content': 'Second message'}])

    # Check accumulated token usage after second call
    metrics_data = llm.metrics.get()
    accumulated_usage = metrics_data['accumulated_token_usage']

    # Values should be the sum of both calls
    assert accumulated_usage['prompt_tokens'] == 18  # 10 + 8
    assert accumulated_usage['completion_tokens'] == 11  # 5 + 6
    assert accumulated_usage['cache_read_tokens'] == 5  # 3 + 2
    assert accumulated_usage['cache_write_tokens'] == 7  # 4 + 3

    # Verify individual token usage records are still maintained
    token_usages = metrics_data['token_usages']
    assert len(token_usages) == 2

    # First record
    assert token_usages[0]['prompt_tokens'] == 10
    assert token_usages[0]['completion_tokens'] == 5
    assert token_usages[0]['cache_read_tokens'] == 3
    assert token_usages[0]['cache_write_tokens'] == 4
    assert token_usages[0]['response_id'] == 'test-response-1'

    # Second record
    assert token_usages[1]['prompt_tokens'] == 8
    assert token_usages[1]['completion_tokens'] == 6
    assert token_usages[1]['cache_read_tokens'] == 2
    assert token_usages[1]['cache_write_tokens'] == 3
    assert token_usages[1]['response_id'] == 'test-response-2'


@patch('openhands.llm.llm.litellm_completion')
def test_completion_with_log_completions(mock_litellm_completion, default_config):
    with tempfile.TemporaryDirectory() as temp_dir:
        default_config.log_completions = True
        default_config.log_completions_folder = temp_dir
        mock_response = {
            'choices': [{'message': {'content': 'This is a mocked response.'}}]
        }
        mock_litellm_completion.return_value = mock_response

        test_llm = LLM(config=default_config, service_id='test-service')
        response = test_llm.completion(
            messages=[{'role': 'user', 'content': 'Hello!'}],
            stream=False,
            drop_params=True,
        )
        assert (
            response['choices'][0]['message']['content'] == 'This is a mocked response.'
        )
        files = list(Path(temp_dir).iterdir())
        # Expect a log to be generated
        assert len(files) == 1


@patch('httpx.get')
def test_llm_base_url_auto_protocol_patch(mock_get):
    """Test that LLM base_url without protocol is automatically fixed with 'http://'."""
    config = LLMConfig(
        model='litellm_proxy/test-model',
        api_key='fake-key',
        base_url=' api.example.com ',
    )

    mock_get.return_value.status_code = 200
    mock_get.return_value.json.return_value = {'model': 'fake'}

    llm = LLM(config=config, service_id='test-service')
    llm.init_model_info()

    called_url = mock_get.call_args[0][0]
    assert called_url.startswith('http://') or called_url.startswith('https://')


# Tests for max_output_tokens configuration and usage


def test_unknown_model_token_limits():
    """Test that models without known token limits get None for both max_output_tokens and max_input_tokens."""
    # Create LLM instance with a non-existent model to avoid litellm having model info for it
    config = LLMConfig(model='non-existent-model', api_key='test_key')
    llm = LLM(config, service_id='test-service')

    # Verify max_output_tokens and max_input_tokens are initialized to None (default value)
    assert llm.config.max_output_tokens is None
    assert llm.config.max_input_tokens is None


def test_max_tokens_from_model_info():
    """Test that max_output_tokens and max_input_tokens are correctly initialized from model info."""
    # Create LLM instance with GPT-4 model which has known token limits
    config = LLMConfig(model='gpt-4', api_key='test_key')
    llm = LLM(config, service_id='test-service')

    # GPT-4 has specific token limits
    # These are the expected values from litellm
    assert llm.config.max_output_tokens == 4096
    assert llm.config.max_input_tokens == 8192


def test_claude_3_7_sonnet_max_output_tokens():
    """Test that Claude 3.7 Sonnet models get the special 64000 max_output_tokens value and default max_input_tokens."""
    # Create LLM instance with Claude 3.7 Sonnet model
    config = LLMConfig(model='claude-3-7-sonnet', api_key='test_key')
    llm = LLM(config, service_id='test-service')

    # Verify max_output_tokens is set to 64000 for Claude 3.7 Sonnet
    assert llm.config.max_output_tokens == 64000
    # Verify max_input_tokens is set to None (default value)
    assert llm.config.max_input_tokens is None


def test_claude_sonnet_4_max_output_tokens():
    """Test that Claude Sonnet 4 models get the correct max_output_tokens and max_input_tokens values."""
    # Create LLM instance with a Claude Sonnet 4 model
    config = LLMConfig(model='claude-sonnet-4-20250514', api_key='test_key')
    llm = LLM(config, service_id='test-service')

    # Verify max_output_tokens is set to the expected value
    assert llm.config.max_output_tokens == 64000
    # Verify max_input_tokens is set to the expected value
    # For Claude models, we expect a specific value from litellm
    assert llm.config.max_input_tokens == 200000


def test_sambanova_deepseek_model_max_output_tokens():
    """Test that SambaNova DeepSeek-V3-0324 model gets the correct max_output_tokens value."""
    # Create LLM instance with SambaNova DeepSeek model
    config = LLMConfig(model='sambanova/DeepSeek-V3-0324', api_key='test_key')
    llm = LLM(config, service_id='test-service')

    # SambaNova DeepSeek model has specific token limits
    # This is the expected value from litellm
    assert llm.config.max_output_tokens == 32768


def test_max_output_tokens_override_in_config():
    """Test that max_output_tokens can be overridden in the config."""
    # Create LLM instance with minimal config and overridden max_output_tokens
    config = LLMConfig(
        model='claude-sonnet-4-20250514', api_key='test_key', max_output_tokens=2048
    )
    llm = LLM(config, service_id='test-service')

    # Verify the config has the overridden max_output_tokens value
    assert llm.config.max_output_tokens == 2048


def test_azure_model_default_max_tokens():
    """Test that Azure models have the default max_output_tokens value."""
    # Create minimal config for Azure model (without specifying max_output_tokens)
    azure_config = LLMConfig(
        model='azure/non-existent-model',  # Use a non-existent model to avoid litellm having model info for it
        api_key='test_key',
        base_url='https://test.openai.azure.com/',
        api_version='2024-12-01-preview',
    )

    # Create LLM instance with Azure model
    llm = LLM(azure_config, service_id='test-service')

    # Verify the config has the default max_output_tokens value
    assert llm.config.max_output_tokens is None  # Default value


# Gemini Performance Optimization Tests


def test_gemini_model_keeps_none_reasoning_effort():
    """Test that Gemini models keep reasoning_effort=None for optimization."""
    config = LLMConfig(model='gemini-2.5-pro', api_key='test_key')
    # reasoning_effort should remain None for Gemini models
    assert config.reasoning_effort is None


def test_non_gemini_model_gets_high_reasoning_effort():
    """Test that non-Gemini models get reasoning_effort='high' by default."""
    config = LLMConfig(model='gpt-4o', api_key='test_key')
    # Non-Gemini models should get reasoning_effort='high'
    assert config.reasoning_effort == 'high'


def test_explicit_reasoning_effort_preserved():
    """Test that explicitly set reasoning_effort is preserved."""
    config = LLMConfig(
        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='medium'
    )
    # Explicitly set reasoning_effort should be preserved
    assert config.reasoning_effort == 'medium'


@patch('openhands.llm.llm.litellm_completion')
def test_gemini_none_reasoning_effort_uses_thinking_budget(mock_completion):
    """Test that Gemini with reasoning_effort=None uses thinking budget."""
    config = LLMConfig(
        model='gemini-2.5-pro', api_key='test_key', reasoning_effort=None
    )

    # Mock the completion response
    mock_completion.return_value = {
        'choices': [{'message': {'content': 'Test response'}}],
        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
    }

    llm = LLM(config, service_id='test-service')
    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
    llm.completion(messages=sample_messages)

    # Verify that thinking budget was set and reasoning_effort was None
    call_kwargs = mock_completion.call_args[1]
    assert 'thinking' in call_kwargs
    assert call_kwargs['thinking'] == {'budget_tokens': 128}
    assert call_kwargs.get('reasoning_effort') is None


@patch('openhands.llm.llm.litellm_completion')
def test_gemini_low_reasoning_effort_uses_thinking_budget(mock_completion):
    """Test that Gemini with reasoning_effort='low' uses thinking budget."""
    config = LLMConfig(
        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='low'
    )

    # Mock the completion response
    mock_completion.return_value = {
        'choices': [{'message': {'content': 'Test response'}}],
        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
    }

    llm = LLM(config, service_id='test-service')
    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
    llm.completion(messages=sample_messages)

    # Verify that thinking budget was set and reasoning_effort was None
    call_kwargs = mock_completion.call_args[1]
    assert 'thinking' in call_kwargs
    assert call_kwargs['thinking'] == {'budget_tokens': 128}
    assert call_kwargs.get('reasoning_effort') is None


@patch('openhands.llm.llm.litellm_completion')
def test_gemini_medium_reasoning_effort_passes_through(mock_completion):
    """Test that Gemini with reasoning_effort='medium' passes through to litellm."""
    config = LLMConfig(
        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='medium'
    )

    # Mock the completion response
    mock_completion.return_value = {
        'choices': [{'message': {'content': 'Test response'}}],
        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
    }

    llm = LLM(config, service_id='test-service')
    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
    llm.completion(messages=sample_messages)

    # Verify that reasoning_effort was passed through and thinking budget was not set
    call_kwargs = mock_completion.call_args[1]
    assert 'thinking' not in call_kwargs
    assert call_kwargs.get('reasoning_effort') == 'medium'


@patch('openhands.llm.llm.litellm_completion')
def test_gemini_high_reasoning_effort_passes_through(mock_completion):
    """Test that Gemini with reasoning_effort='high' passes through to litellm."""
    config = LLMConfig(
        model='gemini-2.5-pro', api_key='test_key', reasoning_effort='high'
    )

    # Mock the completion response
    mock_completion.return_value = {
        'choices': [{'message': {'content': 'Test response'}}],
        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
    }

    llm = LLM(config, service_id='test-service')
    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
    llm.completion(messages=sample_messages)

    # Verify that reasoning_effort was passed through and thinking budget was not set
    call_kwargs = mock_completion.call_args[1]
    assert 'thinking' not in call_kwargs
    assert call_kwargs.get('reasoning_effort') == 'high'


@patch('openhands.llm.llm.litellm_completion')
def test_non_gemini_uses_reasoning_effort(mock_completion):
    """Test that non-Gemini models use reasoning_effort instead of thinking budget."""
    config = LLMConfig(model='o1', api_key='test_key', reasoning_effort='high')

    # Mock the completion response
    mock_completion.return_value = {
        'choices': [{'message': {'content': 'Test response'}}],
        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
    }

    llm = LLM(config, service_id='test-service')
    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
    llm.completion(messages=sample_messages)

    # Verify that reasoning_effort was used and thinking budget was not set
    call_kwargs = mock_completion.call_args[1]
    assert 'thinking' not in call_kwargs
    assert call_kwargs.get('reasoning_effort') == 'high'


@patch('openhands.llm.llm.litellm_completion')
def test_non_reasoning_model_no_optimization(mock_completion):
    """Test that non-reasoning models don't get optimization parameters."""
    config = LLMConfig(
        model='gpt-3.5-turbo',  # Not in REASONING_EFFORT_SUPPORTED_MODELS
        api_key='test_key',
    )

    # Mock the completion response
    mock_completion.return_value = {
        'choices': [{'message': {'content': 'Test response'}}],
        'usage': {'prompt_tokens': 10, 'completion_tokens': 5},
    }

    llm = LLM(config, service_id='test-service')
    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
    llm.completion(messages=sample_messages)

    # Verify that neither thinking budget nor reasoning_effort were set
    call_kwargs = mock_completion.call_args[1]
    assert 'thinking' not in call_kwargs
    assert 'reasoning_effort' not in call_kwargs


@patch('openhands.llm.llm.litellm_completion')
def test_gemini_performance_optimization_end_to_end(mock_completion):
    """Test the complete Gemini performance optimization flow end-to-end."""
    # Mock the completion response
    mock_completion.return_value = {
        'choices': [{'message': {'content': 'Optimized response'}}],
        'usage': {'prompt_tokens': 50, 'completion_tokens': 25},
    }

    # Create Gemini configuration
    config = LLMConfig(model='gemini-2.5-pro', api_key='test_key')

    # Verify config has optimized defaults
    assert config.reasoning_effort is None

    # Create LLM and make completion
    llm = LLM(config, service_id='test-service')
    messages = [{'role': 'user', 'content': 'Solve this complex problem'}]

    response = llm.completion(messages=messages)

    # Verify response was generated
    assert response['choices'][0]['message']['content'] == 'Optimized response'

    # Verify optimization parameters were applied
    call_kwargs = mock_completion.call_args[1]
    assert 'thinking' in call_kwargs
    assert call_kwargs['thinking'] == {'budget_tokens': 128}
    assert call_kwargs.get('reasoning_effort') is None

    # Verify temperature and top_p were removed for reasoning models
    assert 'temperature' not in call_kwargs
    assert 'top_p' not in call_kwargs