Centralize model feature checks (#10414)

Co-authored-by: OpenHands-GPT-5 <openhands@all-hands.dev>
This commit is contained in:
Engel Nyst
2025-08-19 22:30:07 +02:00
committed by GitHub
parent aa6b454772
commit bb0e24d23b
8 changed files with 596 additions and 112 deletions

View File

@@ -12,8 +12,10 @@ from litellm.exceptions import (
from openhands.core.config import LLMConfig
from openhands.core.exceptions import LLMNoResponseError, OperationCancelled
from openhands.core.message import Message, TextContent
from openhands.llm.async_llm import AsyncLLM
from openhands.llm.llm import LLM
from openhands.llm.metrics import Metrics, TokenUsage
from openhands.llm.streaming_llm import StreamingLLM
@pytest.fixture(autouse=True)
@@ -252,7 +254,7 @@ def test_response_latency_tracking(mock_time, mock_litellm_completion):
@patch('openhands.llm.llm.litellm.get_model_info')
def test_llm_init_with_openrouter_model(mock_get_model_info, default_config):
default_config.model = 'openrouter:gpt-4o-mini'
default_config.model = 'openrouter/gpt-4o-mini'
mock_get_model_info.return_value = {
'max_input_tokens': 7000,
'max_output_tokens': 1500,
@@ -261,7 +263,7 @@ def test_llm_init_with_openrouter_model(mock_get_model_info, default_config):
llm.init_model_info()
assert llm.config.max_input_tokens == 7000
assert llm.config.max_output_tokens == 1500
mock_get_model_info.assert_called_once_with('openrouter:gpt-4o-mini')
mock_get_model_info.assert_called_once_with('openrouter/gpt-4o-mini')
@patch('openhands.llm.llm.litellm_completion')
@@ -1201,6 +1203,92 @@ def test_gemini_medium_reasoning_effort_passes_through(mock_completion):
assert call_kwargs.get('reasoning_effort') == 'medium'
@patch('openhands.llm.llm.litellm_completion')
def test_opus_41_reasoning_pops_temperature_top_p(mock_completion):
    """Claude Opus 4.1 requests must reach litellm without temperature or top_p."""
    mock_completion.return_value = {'choices': [{'message': {'content': 'ok'}}]}
    opus_41_config = LLMConfig(
        model='anthropic/claude-opus-4-1-20250805',
        api_key='k',
        temperature=0.7,
        top_p=0.9,
    )
    user_messages = [{'role': 'user', 'content': 'hi'}]

    LLM(opus_41_config, service_id='svc').completion(messages=user_messages)

    # call_args is an (args, kwargs) pair; only the keyword arguments matter here.
    _, sent_kwargs = mock_completion.call_args
    for sampling_param in ('temperature', 'top_p'):
        assert sampling_param not in sent_kwargs
@patch('openhands.llm.llm.litellm_completion')
def test_opus_4_keeps_temperature_top_p(mock_completion):
    """Claude Opus 4 requests keep the configured temperature and top_p."""
    mock_completion.return_value = {'choices': [{'message': {'content': 'ok'}}]}
    opus_4_config = LLMConfig(
        model='anthropic/claude-opus-4-20250514',
        api_key='k',
        temperature=0.7,
        top_p=0.9,
    )
    user_messages = [{'role': 'user', 'content': 'hi'}]

    LLM(opus_4_config, service_id='svc').completion(messages=user_messages)

    # call_args is an (args, kwargs) pair; only the keyword arguments matter here.
    _, sent_kwargs = mock_completion.call_args
    assert sent_kwargs.get('temperature') == 0.7
    assert sent_kwargs.get('top_p') == 0.9
@patch('openhands.llm.llm.litellm.get_model_info')
def test_is_caching_prompt_active_anthropic_prefixed(mock_get_model_info):
    """Prompt caching is reported active for an ``anthropic/``-prefixed Claude model."""
    # Force the model-info lookup to fail: this avoids external calls, and the
    # caching decision is not supposed to depend on model info anyway.
    mock_get_model_info.side_effect = Exception('skip')

    llm = LLM(
        LLMConfig(
            model='anthropic/claude-3-7-sonnet', api_key='k', caching_prompt=True
        ),
        service_id='svc',
    )

    assert llm.is_caching_prompt_active() is True
@patch('openhands.llm.llm.httpx.get')
@patch('openhands.llm.llm.litellm.get_model_info')
def test_openhands_provider_rewrite_and_caching_prompt(
    mock_get_model_info, mock_httpx_get
):
    """An ``openhands/`` model is rewritten to ``litellm_proxy/`` and keeps prompt caching."""

    class Resp:
        """Stand-in for the LiteLLM proxy /v1/model/info HTTP response."""

        def json(self=None):
            return {
                'data': [
                    {
                        'model_name': 'claude-3.7-sonnet',
                        'model_info': {
                            'max_input_tokens': 200000,
                            'max_output_tokens': 64000,
                            'supports_vision': True,
                        },
                    }
                ]
            }

    mock_httpx_get.return_value = Resp()
    mock_get_model_info.return_value = {
        'max_input_tokens': 200000,
        'max_output_tokens': 64000,
    }

    llm = LLM(
        LLMConfig(
            model='openhands/claude-3.7-sonnet', api_key='k', caching_prompt=True
        ),
        service_id='svc',
    )

    # The provider prefix should have been swapped for litellm_proxy/...
    rewritten_model = llm.config.model
    assert rewritten_model.startswith('litellm_proxy/claude-3.7-sonnet')
    # ...and prompt caching should still be active for Claude.
    assert llm.is_caching_prompt_active() is True
@patch('openhands.llm.llm.litellm_completion')
def test_gemini_high_reasoning_effort_passes_through(mock_completion):
"""Test that Gemini with reasoning_effort='high' passes through to litellm."""
@@ -1239,10 +1327,61 @@ def test_non_gemini_uses_reasoning_effort(mock_completion):
sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
llm.completion(messages=sample_messages)
# Verify that reasoning_effort was used and thinking budget was not set
call_kwargs = mock_completion.call_args[1]
@patch('openhands.llm.async_llm.litellm_acompletion')
@pytest.mark.asyncio
async def test_async_reasoning_effort_passthrough(mock_acompletion):
    """AsyncLLM forwards reasoning_effort, temperature and top_p for ``o3``."""
    mock_acompletion.return_value = {'choices': [{'message': {'content': 'ok'}}]}
    o3_config = LLMConfig(
        model='o3', api_key='k', temperature=0.7, top_p=0.9, reasoning_effort='low'
    )
    user_messages = [{'role': 'user', 'content': 'hi'}]

    await AsyncLLM(o3_config, service_id='svc').async_completion(
        messages=user_messages
    )

    # call_args is an (args, kwargs) pair; only the keyword arguments matter here.
    _, sent_kwargs = mock_acompletion.call_args
    assert sent_kwargs.get('reasoning_effort') == 'low'
    # Async path does not pop temperature/top_p (parity with main)
    assert sent_kwargs.get('temperature') == 0.7
    assert sent_kwargs.get('top_p') == 0.9
@patch('openhands.llm.streaming_llm.AsyncLLM._call_acompletion')
@pytest.mark.asyncio
async def test_streaming_reasoning_effort_passthrough(mock_call):
    """StreamingLLM forwards reasoning_effort, temperature and top_p for ``o3``."""

    class Dummy:
        """Async-iterable stand-in that yields a single streamed chunk."""

        async def __aiter__(self):
            yield {'choices': [{'delta': {'content': 'x'}}]}

    async def fake_stream(*args, **kwargs):
        # Whatever it is called with, hand back a fresh one-chunk stream.
        return Dummy()

    mock_call.side_effect = fake_stream

    o3_config = LLMConfig(
        model='o3', api_key='k', temperature=0.7, top_p=0.9, reasoning_effort='low'
    )
    sllm = StreamingLLM(o3_config, service_id='svc')
    user_messages = [{'role': 'user', 'content': 'hi'}]

    # Pull at most one chunk; that is enough to trigger the patched call.
    async for _chunk in sllm.async_streaming_completion(messages=user_messages):
        break

    # call_args is an (args, kwargs) pair; only the keyword arguments matter here.
    _, sent_kwargs = mock_call.call_args
    assert sent_kwargs.get('reasoning_effort') == 'low'
    assert sent_kwargs.get('temperature') == 0.7
    assert sent_kwargs.get('top_p') == 0.9
@patch('openhands.llm.async_llm.litellm_acompletion')
@pytest.mark.asyncio
async def test_async_streaming_no_thinking_for_gemini(mock_acompletion):
    """AsyncLLM sends no ``thinking`` kwarg for Gemini and forwards reasoning_effort.

    The config requests ``reasoning_effort='low'``, so ``'low'`` is the value
    that must reach litellm, matching the pass-through asserted for ``o3`` in
    ``test_async_reasoning_effort_passthrough``.
    """
    mock_acompletion.return_value = {
        'choices': [{'message': {'content': 'ok'}}],
    }
    config = LLMConfig(model='gemini-2.5-pro', api_key='k', reasoning_effort='low')
    llm = AsyncLLM(config, service_id='svc')

    await llm.async_completion(messages=[{'role': 'user', 'content': 'hi'}])

    # call_args is an (args, kwargs) pair; index 1 holds the keyword arguments.
    call_kwargs = mock_acompletion.call_args[1]
    assert 'thinking' not in call_kwargs
    # Must equal the configured value: 'low' was requested, so 'low' is expected.
    assert call_kwargs.get('reasoning_effort') == 'low'
@patch('openhands.llm.llm.litellm_completion')

View File

@@ -0,0 +1,293 @@
import pytest
from openhands.llm.model_features import (
ModelFeatures,
get_features,
model_matches,
normalize_model_name,
)
@pytest.mark.parametrize(
    'raw,expected',
    [
        # Surrounding whitespace, upper-case letters and provider prefixes
        (' OPENAI/gpt-4o ', 'gpt-4o'),
        ('anthropic/claude-3-7-sonnet', 'claude-3-7-sonnet'),
        ('litellm_proxy/gemini-2.5-pro', 'gemini-2.5-pro'),
        # Names that are already in normalized form
        ('qwen3-coder-480b-a35b-instruct', 'qwen3-coder-480b-a35b-instruct'),
        ('gpt-5', 'gpt-5'),
        # ":tag" and "-GGUF" suffixes
        ('deepseek/DeepSeek-R1-0528:671b-Q4_K_XL', 'deepseek-r1-0528'),
        ('openai/GLM-4.5-GGUF', 'glm-4.5'),
        ('openrouter/gpt-4o-mini', 'gpt-4o-mini'),
        (
            'bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0',
            'anthropic.claude-3-5-sonnet-20241022-v2',
        ),
        # Empty and missing input
        ('', ''),
        (None, ''),  # type: ignore[arg-type]
    ],
)
def test_normalize_model_name(raw, expected):
    """Each raw model identifier normalizes to the expected canonical name."""
    normalized = normalize_model_name(raw)
    assert normalized == expected
@pytest.mark.parametrize(
    'name,pattern,expected',
    [
        # Wildcard patterns, with and without a provider prefix on the name
        ('gpt-4o', 'gpt-4o*', True),
        ('openai/gpt-4o', 'gpt-4o*', True),
        ('litellm_proxy/gpt-4o-mini', 'gpt-4o*', True),
        ('claude-3-7-sonnet-20250219', 'claude-3-7-sonnet*', True),
        ('o1-2024-12-17', 'o1*', True),
        # Patterns without a wildcard
        ('grok-4-0709', 'grok-4-0709', True),
        ('grok-4-0801', 'grok-4-0709', False),
    ],
)
def test_model_matches(name, pattern, expected):
    """A single basename pattern matches (or rejects) the given model name."""
    matched = model_matches(name, [pattern])
    assert matched is expected
@pytest.mark.parametrize(
    'name,pattern,expected',
    [
        ('openai/gpt-4o', 'openai/gpt-4o*', True),
        ('openrouter/gpt-4o', 'openai/gpt-4o*', False),
        ('litellm_proxy/gpt-4o-mini', 'litellm_proxy/gpt-4o*', True),
        # basename alone should not match provider-qualified
        ('gpt-4o', 'openai/gpt-4o*', False),
        ('unknown-model', 'gpt-5*', False),
    ],
)
def test_model_matches_provider_qualified(name, pattern, expected):
    """Provider-qualified patterns are checked against the given model name."""
    matched = model_matches(name, [pattern])
    assert matched is expected
@pytest.mark.parametrize(
    'model,expect',
    [
        # OpenAI GPT families
        (
            'gpt-4o',
            ModelFeatures(
                supports_function_calling=True,
                supports_reasoning_effort=False,
                supports_prompt_cache=False,
                supports_stop_words=True,
            ),
        ),
        (
            'gpt-5',
            ModelFeatures(
                supports_function_calling=True,
                supports_reasoning_effort=True,
                supports_prompt_cache=False,
                supports_stop_words=True,
            ),
        ),
        # OpenAI o-series
        (
            'o3-mini',
            ModelFeatures(
                supports_function_calling=True,
                supports_reasoning_effort=True,
                supports_prompt_cache=False,
                supports_stop_words=True,
            ),
        ),
        (
            'o1-2024-12-17',
            ModelFeatures(
                supports_function_calling=True,
                supports_reasoning_effort=True,
                supports_prompt_cache=False,
                supports_stop_words=False,
            ),
        ),
        # xAI Grok
        (
            'xai/grok-4-0709',
            ModelFeatures(
                supports_function_calling=False,
                supports_reasoning_effort=False,
                supports_prompt_cache=False,
                supports_stop_words=False,
            ),
        ),
        # Anthropic Claude, dashed and dotted spellings
        (
            'anthropic/claude-3-7-sonnet',
            ModelFeatures(
                supports_function_calling=True,
                supports_reasoning_effort=False,
                supports_prompt_cache=True,
                supports_stop_words=True,
            ),
        ),
        (
            'litellm_proxy/claude-3.7-sonnet',
            ModelFeatures(
                supports_function_calling=True,
                supports_reasoning_effort=False,
                supports_prompt_cache=True,
                supports_stop_words=True,
            ),
        ),
        # Google Gemini
        (
            'gemini-2.5-pro',
            ModelFeatures(
                supports_function_calling=True,
                supports_reasoning_effort=True,
                supports_prompt_cache=False,
                supports_stop_words=True,
            ),
        ),
        # provider-qualified still matches basename patterns
        (
            'openai/gpt-4o',
            ModelFeatures(
                supports_function_calling=True,
                supports_reasoning_effort=False,
                supports_prompt_cache=False,
                supports_stop_words=True,
            ),
        ),
    ],
)
def test_get_features(model, expect):
    """``get_features`` returns the complete expected feature set for each model."""
    assert get_features(model) == expect
@pytest.mark.parametrize(
    'model',
    [
        # Anthropic families
        'claude-3-7-sonnet-20250219',
        'claude-3.7-sonnet',
        'claude-sonnet-3-7-latest',
        'claude-3-5-sonnet',
        'claude-3.5-haiku',
        'claude-3-5-haiku-20241022',
        'claude-sonnet-4-latest',
        'claude-opus-4-1-20250805',
        # OpenAI families
        'gpt-4o',
        'gpt-4.1',
        'gpt-5',
        # o-series
        'o1-2024-12-17',
        'o3-mini',
        'o4-mini',
        # Google Gemini
        'gemini-2.5-pro',
        # Others
        'kimi-k2-0711-preview',
        'kimi-k2-instruct',
        'qwen3-coder',
        'qwen3-coder-480b-a35b-instruct',
    ],
)
def test_function_calling_models(model):
    """Every listed model is reported as supporting function calling."""
    assert get_features(model).supports_function_calling is True
@pytest.mark.parametrize(
    'model',
    [
        'o1-2024-12-17',
        'o3-mini',
        'o4-mini',
        'gemini-2.5-flash',
        'gemini-2.5-pro',
        'gpt-5',
        'claude-opus-4-1-20250805',
    ],
)
def test_reasoning_effort_models(model):
    """Every listed model is reported as supporting reasoning effort."""
    assert get_features(model).supports_reasoning_effort is True
@pytest.mark.parametrize(
    'model',
    [
        # Provider-prefixed, with a ":tag" suffix
        'deepseek/DeepSeek-R1-0528:671b-Q4_K_XL',
        # Bare mixed-case name
        'DeepSeek-R1-0528',
    ],
)
def test_deepseek_reasoning_effort_models(model):
    """DeepSeek R1 names are reported as supporting reasoning effort."""
    assert get_features(model).supports_reasoning_effort is True
@pytest.mark.parametrize(
    'model',
    [
        'claude-3-7-sonnet-20250219',
        'claude-3.7-sonnet',
        'claude-sonnet-3-7-latest',
        'claude-3-5-sonnet',
        'claude-3-5-haiku-20241022',
        'claude-3-haiku-20240307',
        'claude-3-opus-20240229',
        'claude-sonnet-4-latest',
        'claude-opus-4-1-20250805',
    ],
)
def test_prompt_cache_models(model):
    """Every listed Claude model is reported as supporting prompt caching."""
    assert get_features(model).supports_prompt_cache is True
@pytest.mark.parametrize(
    'model,expected',
    [
        # Positive cases: exactly those supported on main
        ('o1', True),
        ('o1-2024-12-17', True),
        ('o3', True),
        ('o3-2025-04-16', True),
        ('o3-mini', True),
        ('o3-mini-2025-01-31', True),
        ('o4-mini', True),
        ('o4-mini-2025-04-16', True),
        ('gemini-2.5-flash', True),
        ('gemini-2.5-pro', True),
        ('gpt-5', True),
        ('gpt-5-2025-08-07', True),
        ('claude-opus-4-1-20250805', True),
        # DeepSeek
        ('deepseek/DeepSeek-R1-0528:671b-Q4_K_XL', True),
        ('DeepSeek-R1-0528', True),
        # Negative cases: ensure we didn't unintentionally expand
        ('o1-mini', False),
        ('o1-preview', False),
        ('gemini-1.0-pro', False),
    ],
)
def test_reasoning_effort_parity_with_main(model, expected):
    """Reasoning-effort support matches the expected flag for each model."""
    supports_effort = get_features(model).supports_reasoning_effort
    assert supports_effort is expected
def test_prompt_cache_haiku_variants():
    """Dashed and dotted spellings of Claude 3.5 Haiku both support prompt caching."""
    for haiku_name in ('claude-3-5-haiku-20241022', 'claude-3.5-haiku-20241022'):
        assert get_features(haiku_name).supports_prompt_cache is True
def test_stop_words_grok_provider_prefixed():
    """grok-4-0709 lacks stop-word support with or without the ``xai/`` prefix."""
    for grok_name in ('xai/grok-4-0709', 'grok-4-0709'):
        assert get_features(grok_name).supports_stop_words is False
@pytest.mark.parametrize(
    'model',
    [
        'o1-mini',
        'o1-2024-12-17',
        'xai/grok-4-0709',
        'deepseek/DeepSeek-R1-0528:671b-Q4_K_XL',
        'DeepSeek-R1-0528',
    ],
)
def test_supports_stop_words_false_models(model):
    """Every listed model is reported as not supporting stop words."""
    assert get_features(model).supports_stop_words is False