Centralize model feature checks (#10414)

Co-authored-by: OpenHands-GPT-5 <openhands@all-hands.dev>
2026-03-22 13:47:19 +08:00 · 2025-08-19 22:30:07 +02:00
parent aa6b454772
commit bb0e24d23b
8 changed files with 596 additions and 112 deletions
--- a/tests/unit/llm/test_llm.py
+++ b/tests/unit/llm/test_llm.py
@@ -12,8 +12,10 @@ from litellm.exceptions import (
 from openhands.core.config import LLMConfig
 from openhands.core.exceptions import LLMNoResponseError, OperationCancelled
 from openhands.core.message import Message, TextContent
+from openhands.llm.async_llm import AsyncLLM
 from openhands.llm.llm import LLM
 from openhands.llm.metrics import Metrics, TokenUsage
+from openhands.llm.streaming_llm import StreamingLLM


@pytest.fixture(autouse=True)
@@ -252,7 +254,7 @@ def test_response_latency_tracking(mock_time, mock_litellm_completion):

@patch('openhands.llm.llm.litellm.get_model_info')
 def test_llm_init_with_openrouter_model(mock_get_model_info, default_config):
-    default_config.model = 'openrouter:gpt-4o-mini'
+    default_config.model = 'openrouter/gpt-4o-mini'
    mock_get_model_info.return_value = {
        'max_input_tokens': 7000,
        'max_output_tokens': 1500,
@@ -261,7 +263,7 @@ def test_llm_init_with_openrouter_model(mock_get_model_info, default_config):
    llm.init_model_info()
    assert llm.config.max_input_tokens == 7000
    assert llm.config.max_output_tokens == 1500
-    mock_get_model_info.assert_called_once_with('openrouter:gpt-4o-mini')
+    mock_get_model_info.assert_called_once_with('openrouter/gpt-4o-mini')


@patch('openhands.llm.llm.litellm_completion')
@@ -1201,6 +1203,92 @@ def test_gemini_medium_reasoning_effort_passes_through(mock_completion):
    assert call_kwargs.get('reasoning_effort') == 'medium'


+@patch('openhands.llm.llm.litellm_completion')  # swaps litellm_completion in openhands.llm.llm for a mock
+def test_opus_41_reasoning_pops_temperature_top_p(mock_completion):  # claude-opus-4-1: configured temperature/top_p must not reach the completion call
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'ok'}}],
+    }
+    config = LLMConfig(
+        model='anthropic/claude-opus-4-1-20250805',
+        api_key='k',
+        temperature=0.7,  # set explicitly so its absence from the call is meaningful
+        top_p=0.9,  # likewise set here and expected to be missing below
+    )
+    llm = LLM(config, service_id='svc')
+    llm.completion(messages=[{'role': 'user', 'content': 'hi'}])
+    call_kwargs = mock_completion.call_args[1]  # keyword arguments of the most recent mocked call
+    assert 'temperature' not in call_kwargs
+    assert 'top_p' not in call_kwargs
+
+
+@patch('openhands.llm.llm.litellm_completion')  # swaps litellm_completion in openhands.llm.llm for a mock
+def test_opus_4_keeps_temperature_top_p(mock_completion):  # claude-opus-4 (not 4-1): configured temperature/top_p are passed through
+    mock_completion.return_value = {
+        'choices': [{'message': {'content': 'ok'}}],
+    }
+    config = LLMConfig(
+        model='anthropic/claude-opus-4-20250514',
+        api_key='k',
+        temperature=0.7,
+        top_p=0.9,
+    )
+    llm = LLM(config, service_id='svc')
+    llm.completion(messages=[{'role': 'user', 'content': 'hi'}])
+    call_kwargs = mock_completion.call_args[1]  # keyword arguments of the most recent mocked call
+    assert call_kwargs.get('temperature') == 0.7
+    assert call_kwargs.get('top_p') == 0.9
+
+
+@patch('openhands.llm.llm.litellm.get_model_info')
+def test_is_caching_prompt_active_anthropic_prefixed(mock_get_model_info):  # 'anthropic/'-prefixed Claude id with caching_prompt=True must report caching active
+    # Avoid external calls, but behavior shouldn't depend on model info
+    mock_get_model_info.side_effect = Exception('skip')  # any call to the mocked get_model_info raises
+    config = LLMConfig(
+        model='anthropic/claude-3-7-sonnet', api_key='k', caching_prompt=True
+    )
+    llm = LLM(config, service_id='svc')
+    assert llm.is_caching_prompt_active() is True
+
+
+@patch('openhands.llm.llm.httpx.get')
+@patch('openhands.llm.llm.litellm.get_model_info')
+def test_openhands_provider_rewrite_and_caching_prompt(  # 'openhands/<model>' config: model id is rewritten and prompt caching stays on
+    mock_get_model_info, mock_httpx_get
+):
+    # Mock LiteLLM proxy /v1/model/info response
+    mock_httpx_get.return_value = type(  # three-argument type() builds a one-off 'Resp' class; only json() is defined
+        'Resp',
+        (),
+        {
+            'json': lambda self=None: {  # self defaults to None, so it works bound to the instance or called bare
+                'data': [
+                    {
+                        'model_name': 'claude-3.7-sonnet',
+                        'model_info': {
+                            'max_input_tokens': 200000,
+                            'max_output_tokens': 64000,
+                            'supports_vision': True,
+                        },
+                    }
+                ]
+            }
+        },
+    )()
+    mock_get_model_info.return_value = {
+        'max_input_tokens': 200000,
+        'max_output_tokens': 64000,
+    }
+
+    config = LLMConfig(
+        model='openhands/claude-3.7-sonnet', api_key='k', caching_prompt=True
+    )
+    llm = LLM(config, service_id='svc')
+    # Model should be rewritten to litellm_proxy/...
+    assert llm.config.model.startswith('litellm_proxy/claude-3.7-sonnet')  # prefix check only; anything after the model name is not pinned
+    # Caching prompt should be active for Claude
+    assert llm.is_caching_prompt_active() is True
+
+
@patch('openhands.llm.llm.litellm_completion')
 def test_gemini_high_reasoning_effort_passes_through(mock_completion):
    """Test that Gemini with reasoning_effort='high' passes through to litellm."""
@@ -1239,10 +1327,61 @@ def test_non_gemini_uses_reasoning_effort(mock_completion):
    sample_messages = [{'role': 'user', 'content': 'Hello, how are you?'}]
    llm.completion(messages=sample_messages)

-    # Verify that reasoning_effort was used and thinking budget was not set
-    call_kwargs = mock_completion.call_args[1]
+
+@patch('openhands.llm.async_llm.litellm_acompletion')  # swaps litellm_acompletion in openhands.llm.async_llm for a mock
+@pytest.mark.asyncio
+async def test_async_reasoning_effort_passthrough(mock_acompletion):  # o3 via AsyncLLM: reasoning_effort, temperature and top_p all reach the call
+    mock_acompletion.return_value = {
+        'choices': [{'message': {'content': 'ok'}}],
+    }
+    config = LLMConfig(
+        model='o3', api_key='k', temperature=0.7, top_p=0.9, reasoning_effort='low'
+    )
+    llm = AsyncLLM(config, service_id='svc')
+    await llm.async_completion(messages=[{'role': 'user', 'content': 'hi'}])
+    call_kwargs = mock_acompletion.call_args[1]  # keyword arguments of the most recent mocked call
+    assert call_kwargs.get('reasoning_effort') == 'low'
+    # Async path does not pop temperature/top_p (parity with main)
+    assert call_kwargs.get('temperature') == 0.7
+    assert call_kwargs.get('top_p') == 0.9
+
+
+@patch('openhands.llm.streaming_llm.AsyncLLM._call_acompletion')
+@pytest.mark.asyncio
+async def test_streaming_reasoning_effort_passthrough(mock_call):  # o3 via StreamingLLM: reasoning_effort, temperature and top_p reach _call_acompletion
+    async def fake_stream(*args, **kwargs):  # stand-in for _call_acompletion; its arguments are ignored
+        class Dummy:
+            async def __aiter__(self):  # async generator yielding a single delta chunk
+                yield {'choices': [{'delta': {'content': 'x'}}]}
+
+        return Dummy()
+
+    mock_call.side_effect = fake_stream  # the mock records the call, then delegates to fake_stream
+    config = LLMConfig(
+        model='o3', api_key='k', temperature=0.7, top_p=0.9, reasoning_effort='low'
+    )
+    sllm = StreamingLLM(config, service_id='svc')
+    async for _ in sllm.async_streaming_completion(
+        messages=[{'role': 'user', 'content': 'hi'}]
+    ):
+        break  # one chunk is enough; the assertions only inspect the mocked call's kwargs
+    call_kwargs = mock_call.call_args[1]  # keyword arguments of the most recent mocked call
+    assert call_kwargs.get('reasoning_effort') == 'low'
+    assert call_kwargs.get('temperature') == 0.7
+    assert call_kwargs.get('top_p') == 0.9
+
+
+@patch('openhands.llm.async_llm.litellm_acompletion')
+@pytest.mark.asyncio
+async def test_async_streaming_no_thinking_for_gemini(mock_acompletion):  # NOTE(review): named "streaming", but only AsyncLLM.async_completion is exercised here
+    mock_acompletion.return_value = {
+        'choices': [{'message': {'content': 'ok'}}],
+    }
+    config = LLMConfig(model='gemini-2.5-pro', api_key='k', reasoning_effort='low')
+    llm = AsyncLLM(config, service_id='svc')
+    await llm.async_completion(messages=[{'role': 'user', 'content': 'hi'}])
+    call_kwargs = mock_acompletion.call_args[1]  # keyword arguments of the most recent mocked call
    assert 'thinking' not in call_kwargs
-    assert call_kwargs.get('reasoning_effort') == 'high'


@patch('openhands.llm.llm.litellm_completion')
--- a/tests/unit/llm/test_model_features.py
+++ b/tests/unit/llm/test_model_features.py
@@ -0,0 +1,293 @@
+import pytest
+
+from openhands.llm.model_features import (
+    ModelFeatures,
+    get_features,
+    model_matches,
+    normalize_model_name,
+)
+
+
+@pytest.mark.parametrize(
+    'raw,expected',
+    [
+        ('  OPENAI/gpt-4o  ', 'gpt-4o'),  # surrounding whitespace and an upper-case provider prefix
+        ('anthropic/claude-3-7-sonnet', 'claude-3-7-sonnet'),
+        ('litellm_proxy/gemini-2.5-pro', 'gemini-2.5-pro'),
+        ('qwen3-coder-480b-a35b-instruct', 'qwen3-coder-480b-a35b-instruct'),  # no prefix: expected unchanged
+        ('gpt-5', 'gpt-5'),
+        ('deepseek/DeepSeek-R1-0528:671b-Q4_K_XL', 'deepseek-r1-0528'),  # prefix and ':' tag gone, lower-cased
+        ('openai/GLM-4.5-GGUF', 'glm-4.5'),  # '-GGUF' suffix gone as well
+        ('openrouter/gpt-4o-mini', 'gpt-4o-mini'),
+        (
+            'bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0',
+            'anthropic.claude-3-5-sonnet-20241022-v2',  # dotted 'anthropic.' part is kept; 'bedrock/' and ':0' are not
+        ),
+        ('', ''),
+        (None, ''),  # type: ignore[arg-type]
+    ],
+)
+def test_normalize_model_name(raw, expected):  # each raw model id must normalize to the paired expected string
+    assert normalize_model_name(raw) == expected
+
+
+@pytest.mark.parametrize(
+    'name,pattern,expected',
+    [
+        ('gpt-4o', 'gpt-4o*', True),
+        ('openai/gpt-4o', 'gpt-4o*', True),  # provider-prefixed name against a prefix-free pattern
+        ('litellm_proxy/gpt-4o-mini', 'gpt-4o*', True),
+        ('claude-3-7-sonnet-20250219', 'claude-3-7-sonnet*', True),
+        ('o1-2024-12-17', 'o1*', True),
+        ('grok-4-0709', 'grok-4-0709', True),  # pattern without '*' and an identical name
+        ('grok-4-0801', 'grok-4-0709', False),  # pattern without '*' and a different name
+    ],
+)
+def test_model_matches(name, pattern, expected):  # the single pattern is wrapped in a list for model_matches
+    assert model_matches(name, [pattern]) is expected
+
+
+@pytest.mark.parametrize(
+    'name,pattern,expected',
+    [
+        ('openai/gpt-4o', 'openai/gpt-4o*', True),
+        ('openrouter/gpt-4o', 'openai/gpt-4o*', False),  # same basename, different provider
+        ('litellm_proxy/gpt-4o-mini', 'litellm_proxy/gpt-4o*', True),
+        (
+            'gpt-4o',
+            'openai/gpt-4o*',
+            False,
+        ),  # basename alone should not match provider-qualified
+        ('unknown-model', 'gpt-5*', False),
+    ],
+)
+def test_model_matches_provider_qualified(name, pattern, expected):  # mostly patterns that carry a 'provider/' part
+    assert model_matches(name, [pattern]) is expected
+
+
+@pytest.mark.parametrize(
+    'model,expect',
+    [
+        (
+            'gpt-4o',
+            ModelFeatures(
+                supports_function_calling=True,
+                supports_reasoning_effort=False,
+                supports_prompt_cache=False,
+                supports_stop_words=True,
+            ),
+        ),
+        (
+            'gpt-5',
+            ModelFeatures(
+                supports_function_calling=True,
+                supports_reasoning_effort=True,
+                supports_prompt_cache=False,
+                supports_stop_words=True,
+            ),
+        ),
+        (
+            'o3-mini',
+            ModelFeatures(
+                supports_function_calling=True,
+                supports_reasoning_effort=True,
+                supports_prompt_cache=False,
+                supports_stop_words=True,
+            ),
+        ),
+        (
+            'o1-2024-12-17',
+            ModelFeatures(
+                supports_function_calling=True,
+                supports_reasoning_effort=True,
+                supports_prompt_cache=False,
+                supports_stop_words=False,
+            ),
+        ),
+        (
+            'xai/grok-4-0709',  # the only case in this table with every flag False
+            ModelFeatures(
+                supports_function_calling=False,
+                supports_reasoning_effort=False,
+                supports_prompt_cache=False,
+                supports_stop_words=False,
+            ),
+        ),
+        (
+            'anthropic/claude-3-7-sonnet',
+            ModelFeatures(
+                supports_function_calling=True,
+                supports_reasoning_effort=False,
+                supports_prompt_cache=True,
+                supports_stop_words=True,
+            ),
+        ),
+        (
+            'litellm_proxy/claude-3.7-sonnet',  # dotted '3.7' spelling; same flags as the 'anthropic/' case above
+            ModelFeatures(
+                supports_function_calling=True,
+                supports_reasoning_effort=False,
+                supports_prompt_cache=True,
+                supports_stop_words=True,
+            ),
+        ),
+        (
+            'gemini-2.5-pro',
+            ModelFeatures(
+                supports_function_calling=True,
+                supports_reasoning_effort=True,
+                supports_prompt_cache=False,
+                supports_stop_words=True,
+            ),
+        ),
+        (
+            'openai/gpt-4o',
+            ModelFeatures(
+                supports_function_calling=True,
+                supports_reasoning_effort=False,
+                supports_prompt_cache=False,
+                supports_stop_words=True,
+            ),
+        ),  # provider-qualified still matches basename patterns
+    ],
+)
+def test_get_features(model, expect):  # compares the whole ModelFeatures value, so all four flags are checked at once
+    features = get_features(model)
+    assert features == expect
+
+
+@pytest.mark.parametrize(
+    'model',
+    [
+        # Anthropic families
+        'claude-3-7-sonnet-20250219',
+        'claude-3.7-sonnet',
+        'claude-sonnet-3-7-latest',
+        'claude-3-5-sonnet',
+        'claude-3.5-haiku',
+        'claude-3-5-haiku-20241022',
+        'claude-sonnet-4-latest',
+        'claude-opus-4-1-20250805',
+        # OpenAI families
+        'gpt-4o',
+        'gpt-4.1',
+        'gpt-5',
+        # o-series
+        'o1-2024-12-17',
+        'o3-mini',
+        'o4-mini',
+        # Google Gemini
+        'gemini-2.5-pro',
+        # Others
+        'kimi-k2-0711-preview',
+        'kimi-k2-instruct',
+        'qwen3-coder',
+        'qwen3-coder-480b-a35b-instruct',
+    ],
+)
+def test_function_calling_models(model):  # every listed id must report supports_function_calling as exactly True
+    features = get_features(model)
+    assert features.supports_function_calling is True
+
+
+@pytest.mark.parametrize(
+    'model',
+    [
+        'o1-2024-12-17',
+        'o3-mini',
+        'o4-mini',
+        'gemini-2.5-flash',
+        'gemini-2.5-pro',
+        'gpt-5',
+        'claude-opus-4-1-20250805',
+    ],
+)
+def test_reasoning_effort_models(model):  # every listed id must report supports_reasoning_effort as exactly True
+    features = get_features(model)
+    assert features.supports_reasoning_effort is True
+
+
+@pytest.mark.parametrize(
+    'model',
+    [
+        'deepseek/DeepSeek-R1-0528:671b-Q4_K_XL',  # provider prefix plus a ':' tag
+        'DeepSeek-R1-0528',  # bare name, mixed case
+    ],
+)
+def test_deepseek_reasoning_effort_models(model):  # both spellings must report supports_reasoning_effort as exactly True
+    features = get_features(model)
+    assert features.supports_reasoning_effort is True
+
+
+@pytest.mark.parametrize(
+    'model',
+    [  # all Claude ids: dashed ('3-7'), dotted ('3.7') and reordered ('claude-sonnet-3-7') spellings
+        'claude-3-7-sonnet-20250219',
+        'claude-3.7-sonnet',
+        'claude-sonnet-3-7-latest',
+        'claude-3-5-sonnet',
+        'claude-3-5-haiku-20241022',
+        'claude-3-haiku-20240307',
+        'claude-3-opus-20240229',
+        'claude-sonnet-4-latest',
+        'claude-opus-4-1-20250805',
+    ],
+)
+def test_prompt_cache_models(model):  # every listed id must report supports_prompt_cache as exactly True
+    features = get_features(model)
+    assert features.supports_prompt_cache is True
+
+
+@pytest.mark.parametrize(
+    'model,expected',
+    [
+        # Positive cases: exactly those supported on main
+        ('o1', True),
+        ('o1-2024-12-17', True),
+        ('o3', True),
+        ('o3-2025-04-16', True),
+        ('o3-mini', True),
+        ('o3-mini-2025-01-31', True),
+        ('o4-mini', True),
+        ('o4-mini-2025-04-16', True),
+        ('gemini-2.5-flash', True),
+        ('gemini-2.5-pro', True),
+        ('gpt-5', True),
+        ('gpt-5-2025-08-07', True),
+        ('claude-opus-4-1-20250805', True),
+        # DeepSeek
+        ('deepseek/DeepSeek-R1-0528:671b-Q4_K_XL', True),
+        ('DeepSeek-R1-0528', True),
+        # Negative cases: ensure we didn't unintentionally expand
+        ('o1-mini', False),  # shares the 'o1' prefix with positive cases above, yet expected False
+        ('o1-preview', False),
+        ('gemini-1.0-pro', False),
+    ],
+)
+def test_reasoning_effort_parity_with_main(model, expected):  # 'is expected' pins the flag to the exact bool, not mere truthiness
+    assert get_features(model).supports_reasoning_effort is expected
+
+
+def test_prompt_cache_haiku_variants():  # dashed ('3-5') and dotted ('3.5') spellings of the same dated haiku id
+    assert get_features('claude-3-5-haiku-20241022').supports_prompt_cache is True
+    assert get_features('claude-3.5-haiku-20241022').supports_prompt_cache is True
+
+
+def test_stop_words_grok_provider_prefixed():  # grok-4-0709 must report no stop-word support with or without the 'xai/' prefix
+    assert get_features('xai/grok-4-0709').supports_stop_words is False
+    assert get_features('grok-4-0709').supports_stop_words is False
+
+
+@pytest.mark.parametrize(
+    'model',
+    [
+        'o1-mini',
+        'o1-2024-12-17',
+        'xai/grok-4-0709',
+        'deepseek/DeepSeek-R1-0528:671b-Q4_K_XL',  # provider prefix plus a ':' tag
+        'DeepSeek-R1-0528',  # same model id, bare and mixed case
+    ],
+)
+def test_supports_stop_words_false_models(model):  # every listed id must report supports_stop_words as exactly False
+    features = get_features(model)
+    assert features.supports_stop_words is False