diff --git a/config.template.toml b/config.template.toml
index e88150cd07..26291976fd 100644
--- a/config.template.toml
+++ b/config.template.toml
@@ -154,6 +154,10 @@ model = "gpt-4o"
 # Drop any unmapped (unsupported) params without causing an exception
 #drop_params = false
 
+# Allow litellm to modify params, e.g. by adding a default message when a message is empty
+# Note: unlike drop_params, this setting is global; it cannot be overridden in each call to litellm
+#modify_params = true
+
 # Using the prompt caching feature if provided by the LLM and supported
 #caching_prompt = true
 
diff --git a/evaluation/benchmarks/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py
index cce795e954..c866b5090b 100644
--- a/evaluation/benchmarks/EDA/run_infer.py
+++ b/evaluation/benchmarks/EDA/run_infer.py
@@ -202,6 +202,9 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
+
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
 
diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py
index 2fb7213ce8..f008c9dc8a 100644
--- a/evaluation/benchmarks/agent_bench/run_infer.py
+++ b/evaluation/benchmarks/agent_bench/run_infer.py
@@ -307,6 +307,8 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py
index f7796c7696..e059a6b46f 100644
--- a/evaluation/benchmarks/aider_bench/run_infer.py
+++ b/evaluation/benchmarks/aider_bench/run_infer.py
@@ -279,6 +279,8 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py
index f5cdd44471..2da7b09f0f 100644
--- a/evaluation/benchmarks/biocoder/run_infer.py
+++ b/evaluation/benchmarks/biocoder/run_infer.py
@@ -328,6 +328,8 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py
index 8392841480..d35084fdbc 100644
--- a/evaluation/benchmarks/bird/run_infer.py
+++ b/evaluation/benchmarks/bird/run_infer.py
@@ -456,6 +456,8 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/browsing_delegation/run_infer.py b/evaluation/benchmarks/browsing_delegation/run_infer.py
index 5c1ab8c062..38fb6cae25 100644
--- a/evaluation/benchmarks/browsing_delegation/run_infer.py
+++ b/evaluation/benchmarks/browsing_delegation/run_infer.py
@@ -142,6 +142,8 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py
index ef2df02031..1ef347931f 100644
--- a/evaluation/benchmarks/commit0_bench/run_infer.py
+++ b/evaluation/benchmarks/commit0_bench/run_infer.py
@@ -571,6 +571,8 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
         llm_config.log_completions = True
 
     if llm_config is None:
diff --git a/evaluation/benchmarks/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py
index 6d8dcbd89b..55e958d9fd 100644
--- a/evaluation/benchmarks/discoverybench/run_infer.py
+++ b/evaluation/benchmarks/discoverybench/run_infer.py
@@ -466,6 +466,8 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py
index fb6d4b3db0..99c29b211d 100644
--- a/evaluation/benchmarks/gaia/run_infer.py
+++ b/evaluation/benchmarks/gaia/run_infer.py
@@ -238,6 +238,9 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
+
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
 
diff --git a/evaluation/benchmarks/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py
index 6f5b6c9d43..64263242d7 100644
--- a/evaluation/benchmarks/gorilla/run_infer.py
+++ b/evaluation/benchmarks/gorilla/run_infer.py
@@ -146,6 +146,8 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/gpqa/run_infer.py b/evaluation/benchmarks/gpqa/run_infer.py
index de41248599..d9e1caec77 100644
--- a/evaluation/benchmarks/gpqa/run_infer.py
+++ b/evaluation/benchmarks/gpqa/run_infer.py
@@ -326,6 +326,9 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
+
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
 
diff --git a/evaluation/benchmarks/humanevalfix/run_infer.py b/evaluation/benchmarks/humanevalfix/run_infer.py
index fff2e23730..3b5a5bca2f 100644
--- a/evaluation/benchmarks/humanevalfix/run_infer.py
+++ b/evaluation/benchmarks/humanevalfix/run_infer.py
@@ -285,6 +285,8 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/logic_reasoning/run_infer.py b/evaluation/benchmarks/logic_reasoning/run_infer.py
index 116b438b3e..87334de0e4 100644
--- a/evaluation/benchmarks/logic_reasoning/run_infer.py
+++ b/evaluation/benchmarks/logic_reasoning/run_infer.py
@@ -288,6 +288,8 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/miniwob/run_infer.py b/evaluation/benchmarks/miniwob/run_infer.py
index 95e4c93575..dd93fbaf0a 100644
--- a/evaluation/benchmarks/miniwob/run_infer.py
+++ b/evaluation/benchmarks/miniwob/run_infer.py
@@ -231,6 +231,8 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py
index 4414e1c462..7106f4a59d 100644
--- a/evaluation/benchmarks/mint/run_infer.py
+++ b/evaluation/benchmarks/mint/run_infer.py
@@ -279,6 +279,8 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/ml_bench/run_analysis.py b/evaluation/benchmarks/ml_bench/run_analysis.py
index eda8fd4bdd..19de2fc42c 100644
--- a/evaluation/benchmarks/ml_bench/run_analysis.py
+++ b/evaluation/benchmarks/ml_bench/run_analysis.py
@@ -124,6 +124,9 @@ if __name__ == '__main__':
     # for details of how to set `llm_config`
     if args.llm_config:
         specified_llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        specified_llm_config.modify_params = False
+
     if specified_llm_config:
         config.llm = specified_llm_config
     logger.info(f'Config for evaluation: {config}')
diff --git a/evaluation/benchmarks/ml_bench/run_infer.py b/evaluation/benchmarks/ml_bench/run_infer.py
index e97746bb76..ab94b925ab 100644
--- a/evaluation/benchmarks/ml_bench/run_infer.py
+++ b/evaluation/benchmarks/ml_bench/run_infer.py
@@ -292,6 +292,8 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/scienceagentbench/run_infer.py b/evaluation/benchmarks/scienceagentbench/run_infer.py
index efa6c9e42c..7e7c7919c0 100644
--- a/evaluation/benchmarks/scienceagentbench/run_infer.py
+++ b/evaluation/benchmarks/scienceagentbench/run_infer.py
@@ -272,6 +272,8 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index 134c98cb96..b97b5d9361 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -490,6 +490,8 @@ if __name__ == '__main__':
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
         llm_config.log_completions = True
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py
index c99f15a89a..32a830e2a6 100644
--- a/evaluation/benchmarks/toolqa/run_infer.py
+++ b/evaluation/benchmarks/toolqa/run_infer.py
@@ -181,6 +181,9 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
+
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
 
diff --git a/evaluation/benchmarks/webarena/run_infer.py b/evaluation/benchmarks/webarena/run_infer.py
index 531f134fd9..d18918cf96 100644
--- a/evaluation/benchmarks/webarena/run_infer.py
+++ b/evaluation/benchmarks/webarena/run_infer.py
@@ -212,6 +212,8 @@ if __name__ == '__main__':
     llm_config = None
     if args.llm_config:
         llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
+        llm_config.modify_params = False
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py
index 4e60d4a281..7db3731e72 100644
--- a/openhands/core/config/llm_config.py
+++ b/openhands/core/config/llm_config.py
@@ -38,6 +38,7 @@ class LLMConfig:
         output_cost_per_token: The cost per output token. This will available in logs for the user to check.
         ollama_base_url: The base URL for the OLLAMA API.
         drop_params: Drop any unmapped (unsupported) params without causing an exception.
+        modify_params: Allow litellm to modify params, e.g. by adding a default message when a message is empty.
         disable_vision: If model is vision capable, this option allows to disable image processing (useful for cost reduction).
         caching_prompt: Use the prompt caching feature if provided by the LLM and supported by the provider.
         log_completions: Whether to log LLM completions to the state.
@@ -72,7 +73,10 @@ class LLMConfig:
     input_cost_per_token: float | None = None
     output_cost_per_token: float | None = None
     ollama_base_url: str | None = None
+    # This setting can be sent in each call to litellm
     drop_params: bool = True
+    # Note: unlike drop_params, this setting is global
+    modify_params: bool = True
     disable_vision: bool | None = None
     caching_prompt: bool = True
     log_completions: bool = False
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index d7c7309eff..85fa1d667c 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -101,7 +101,6 @@ class LLM(RetryMixin, DebugMixin):
         self.cost_metric_supported: bool = True
         self.config: LLMConfig = copy.deepcopy(config)
 
-        # litellm actually uses base Exception here for unknown model
         self.model_info: ModelInfo | None = None
 
         if self.config.log_completions:
@@ -206,6 +205,11 @@ class LLM(RetryMixin, DebugMixin):
                        'anthropic-beta': 'prompt-caching-2024-07-31',
                    }
 
+            # set litellm modify_params to the configured value
+            # True by default, to allow litellm to do transformations like adding a default message when a message is empty
+            # NOTE: this setting is global; unlike drop_params, it cannot be overridden in the litellm completion partial
+            litellm.modify_params = self.config.modify_params
+
            try:
                # Record start time for latency measurement
                start_time = time.time()
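Below is a minimal sketch, not part of the patch, of how the new flag is intended to flow from an OpenHands LLM config into litellm. The config section name 'llm.eval_gpt4o' and the standalone completion call are illustrative assumptions; only LLMConfig.modify_params, the module-level litellm.modify_params assignment, and the per-call nature of drop_params come from the change above.

# Sketch: propagate modify_params from an OpenHands LLMConfig to litellm.
# Assumes an [llm.eval_gpt4o] section in config.toml and valid API credentials.
import litellm

from openhands.core.config import get_llm_config_arg

llm_config = get_llm_config_arg('llm.eval_gpt4o')  # hypothetical config section name
if llm_config is None:
    raise ValueError('Could not find LLM config: llm.eval_gpt4o')

# Evaluation scripts pin the flag for reproducibility and accuracy of results.
llm_config.modify_params = False

# modify_params is a global litellm setting, so it is assigned once on the
# module before calling completion; it cannot be overridden per call.
litellm.modify_params = llm_config.modify_params

# drop_params, by contrast, can be sent in each call to litellm.
response = litellm.completion(
    model=llm_config.model,
    messages=[{'role': 'user', 'content': 'Hello'}],
    drop_params=True,
)
print(response.choices[0].message.content)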