diff --git a/config.template.toml b/config.template.toml
index e88150cd07..4e5b0d870a 100644
--- a/config.template.toml
+++ b/config.template.toml
@@ -154,6 +154,11 @@ model = "gpt-4o"
 # Drop any unmapped (unsupported) params without causing an exception
 #drop_params = false
 
+# Allow litellm to modify parameters to make them compatible with providers
+# for example by inserting a default message (like 'continue') when a message is empty
+# and the provider's API would give an error otherwise
+#modify_params = true
+
 # Using the prompt caching feature if provided by the LLM and supported
 #caching_prompt = true
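For illustration only, a `config.toml` group that sets the new option might look like the sketch below; the group name `eval` and all values are hypothetical, not part of this patch:

```toml
# Hypothetical [llm] group; the name and values are examples only.
[llm.eval]
model = "gpt-4o"
# Opt out of litellm's parameter rewriting for this group
# (loading with evaluation=True, below, forces this off anyway):
modify_params = false
```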
diff --git a/evaluation/benchmarks/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py
index cce795e954..cf63f37e78 100644
--- a/evaluation/benchmarks/EDA/run_infer.py
+++ b/evaluation/benchmarks/EDA/run_infer.py
@@ -201,7 +201,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py
index 2fb7213ce8..6833402741 100644
--- a/evaluation/benchmarks/agent_bench/run_infer.py
+++ b/evaluation/benchmarks/agent_bench/run_infer.py
@@ -306,7 +306,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py
index f7796c7696..23aab08dc6 100644
--- a/evaluation/benchmarks/aider_bench/run_infer.py
+++ b/evaluation/benchmarks/aider_bench/run_infer.py
@@ -278,7 +278,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py
index f5cdd44471..9b973a9bae 100644
--- a/evaluation/benchmarks/biocoder/run_infer.py
+++ b/evaluation/benchmarks/biocoder/run_infer.py
@@ -327,7 +327,7 @@ if __name__ == '__main__':
 
     llm_config = None
    if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py
index 8392841480..b43bc53416 100644
--- a/evaluation/benchmarks/bird/run_infer.py
+++ b/evaluation/benchmarks/bird/run_infer.py
@@ -455,7 +455,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/browsing_delegation/run_infer.py b/evaluation/benchmarks/browsing_delegation/run_infer.py
index 5c1ab8c062..a52082db1e 100644
--- a/evaluation/benchmarks/browsing_delegation/run_infer.py
+++ b/evaluation/benchmarks/browsing_delegation/run_infer.py
@@ -141,7 +141,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py
index ef2df02031..2f703356c2 100644
--- a/evaluation/benchmarks/commit0_bench/run_infer.py
+++ b/evaluation/benchmarks/commit0_bench/run_infer.py
@@ -570,7 +570,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
         llm_config.log_completions = True
 
     if llm_config is None:
diff --git a/evaluation/benchmarks/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py
index 6d8dcbd89b..73f4f65903 100644
--- a/evaluation/benchmarks/discoverybench/run_infer.py
+++ b/evaluation/benchmarks/discoverybench/run_infer.py
@@ -465,7 +465,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py
index fb6d4b3db0..582f70dd04 100644
--- a/evaluation/benchmarks/gaia/run_infer.py
+++ b/evaluation/benchmarks/gaia/run_infer.py
@@ -237,7 +237,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py
index 6f5b6c9d43..22db1e5a13 100644
--- a/evaluation/benchmarks/gorilla/run_infer.py
+++ b/evaluation/benchmarks/gorilla/run_infer.py
@@ -145,7 +145,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/gpqa/run_infer.py b/evaluation/benchmarks/gpqa/run_infer.py
index de41248599..30c1224560 100644
--- a/evaluation/benchmarks/gpqa/run_infer.py
+++ b/evaluation/benchmarks/gpqa/run_infer.py
@@ -325,7 +325,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/humanevalfix/run_infer.py b/evaluation/benchmarks/humanevalfix/run_infer.py
index fff2e23730..f60c1696be 100644
--- a/evaluation/benchmarks/humanevalfix/run_infer.py
+++ b/evaluation/benchmarks/humanevalfix/run_infer.py
@@ -284,7 +284,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/logic_reasoning/run_infer.py b/evaluation/benchmarks/logic_reasoning/run_infer.py
index 116b438b3e..d7e5ad8684 100644
--- a/evaluation/benchmarks/logic_reasoning/run_infer.py
+++ b/evaluation/benchmarks/logic_reasoning/run_infer.py
@@ -287,7 +287,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/miniwob/run_infer.py b/evaluation/benchmarks/miniwob/run_infer.py
index 95e4c93575..e85d0fd2ab 100644
--- a/evaluation/benchmarks/miniwob/run_infer.py
+++ b/evaluation/benchmarks/miniwob/run_infer.py
@@ -230,7 +230,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py
index 4414e1c462..e27aa679f8 100644
--- a/evaluation/benchmarks/mint/run_infer.py
+++ b/evaluation/benchmarks/mint/run_infer.py
@@ -278,7 +278,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/ml_bench/run_infer.py b/evaluation/benchmarks/ml_bench/run_infer.py
index e97746bb76..39eca2d670 100644
--- a/evaluation/benchmarks/ml_bench/run_infer.py
+++ b/evaluation/benchmarks/ml_bench/run_infer.py
@@ -291,7 +291,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/scienceagentbench/run_infer.py b/evaluation/benchmarks/scienceagentbench/run_infer.py
index efa6c9e42c..7e7d5ee556 100644
--- a/evaluation/benchmarks/scienceagentbench/run_infer.py
+++ b/evaluation/benchmarks/scienceagentbench/run_infer.py
@@ -271,7 +271,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index 01111f75d1..8b1c36e32e 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -9,7 +9,6 @@ import toml
 from datasets import load_dataset
 
 import openhands.agenthub
-
 from evaluation.utils.shared import (
     EvalException,
     EvalMetadata,
@@ -76,7 +75,7 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
         '4. Rerun your reproduce script and confirm that the error is fixed!\n'
         '5. Think about edgecases and make sure your fix handles them as well\n'
         "Your thinking should be thorough and so it's fine if it's very long.\n"
-    )
+    )
 
     if RUN_WITH_BROWSING:
         instruction += (
@@ -489,7 +488,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
         llm_config.log_completions = True
 
     if llm_config is None:
diff --git a/evaluation/benchmarks/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py
index c99f15a89a..c730966bb7 100644
--- a/evaluation/benchmarks/toolqa/run_infer.py
+++ b/evaluation/benchmarks/toolqa/run_infer.py
@@ -180,7 +180,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/webarena/run_infer.py b/evaluation/benchmarks/webarena/run_infer.py
index 531f134fd9..1d2eae37f7 100644
--- a/evaluation/benchmarks/webarena/run_infer.py
+++ b/evaluation/benchmarks/webarena/run_infer.py
@@ -211,7 +211,7 @@ if __name__ == '__main__':
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py
index 4e60d4a281..8cd2b71714 100644
--- a/openhands/core/config/llm_config.py
+++ b/openhands/core/config/llm_config.py
@@ -44,6 +44,7 @@ class LLMConfig:
         log_completions_folder: The folder to log LLM completions to. Required if log_completions is True.
         draft_editor: A more efficient LLM to use for file editing. Introduced in [PR 3985](https://github.com/All-Hands-AI/OpenHands/pull/3985).
         custom_tokenizer: A custom tokenizer to use for token counting.
+        modify_params: Allow litellm to modify parameters to make them compatible with the provider. For example, insert default messages when empty. Defaults to True.
     """
 
     model: str = 'claude-3-5-sonnet-20241022'
@@ -79,6 +80,7 @@ class LLMConfig:
     log_completions_folder: str = os.path.join(LOG_DIR, 'completions')
     draft_editor: Optional['LLMConfig'] = None
     custom_tokenizer: str | None = None
+    modify_params: bool = True
 
     def defaults_to_dict(self) -> dict:
         """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
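Since every field on the config class has a default, the new flag is active unless explicitly disabled. A minimal sketch of the intended behavior, assuming `LLMConfig` is a plain dataclass as the field defaults above suggest (constructor usage here is illustrative):

```python
from openhands.core.config.llm_config import LLMConfig

# modify_params defaults to True, preserving litellm's existing behavior
# of adapting requests to provider quirks.
config = LLMConfig(model='gpt-4o')
assert config.modify_params is True

# Evaluation runs flip it off (see get_llm_config_arg below), presumably so
# provider requests are not silently rewritten mid-benchmark.
config.modify_params = False
```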
diff --git a/openhands/core/config/utils.py b/openhands/core/config/utils.py
index 3aedaf9523..bbfe7d3084 100644
--- a/openhands/core/config/utils.py
+++ b/openhands/core/config/utils.py
@@ -243,9 +243,9 @@ def finalize_config(cfg: AppConfig):
     )
 
 
-# Utility function for command line --group argument
+# Utility function for command line -l (--llm-config) argument
 def get_llm_config_arg(
-    llm_config_arg: str, toml_file: str = 'config.toml'
+    llm_config_arg: str, toml_file: str = 'config.toml', evaluation: bool = False
 ) -> LLMConfig | None:
     """Get a group of llm settings from the config file.
 
@@ -268,6 +268,7 @@ def get_llm_config_arg(
     Args:
         llm_config_arg: The group of llm settings to get from the config.toml file.
         toml_file: Path to the configuration file to read from. Defaults to 'config.toml'.
+        evaluation: If True, sets modify_params=False for evaluation purposes. Defaults to False.
 
     Returns:
         LLMConfig: The LLMConfig object with the settings from the config file.
@@ -296,7 +297,10 @@ def get_llm_config_arg(
 
     # update the llm config with the specified section
     if 'llm' in toml_config and llm_config_arg in toml_config['llm']:
-        return LLMConfig.from_dict(toml_config['llm'][llm_config_arg])
+        config = LLMConfig.from_dict(toml_config['llm'][llm_config_arg])
+        if evaluation:
+            config.modify_params = False
+        return config
     logger.openhands_logger.debug(f'Loading from toml failed for {llm_config_arg}')
     return None
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index d7c7309eff..dfa16d977a 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -142,6 +142,7 @@ class LLM(RetryMixin, DebugMixin):
             temperature=self.config.temperature,
             top_p=self.config.top_p,
             drop_params=self.config.drop_params,
+            modify_params=self.config.modify_params,
         )
 
         self._completion_unwrapped = self._completion
diff --git a/pyproject.toml b/pyproject.toml
index 2b0d3ca1e8..20c54d64cd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -100,6 +100,7 @@ reportlab = "*"
 [tool.coverage.run]
 concurrency = ["gevent"]
 
+
 [tool.poetry.group.runtime.dependencies]
 jupyterlab = "*"
 notebook = "*"
@@ -130,6 +131,7 @@ ignore = ["D1"]
 [tool.ruff.lint.pydocstyle]
 convention = "google"
 
+
 [tool.poetry.group.evaluation.dependencies]
 streamlit = "*"
 whatthepatch = "*"
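Taken together, the flag travels from the TOML group through `get_llm_config_arg` into the wrapped litellm call. A hedged end-to-end sketch of that flow (the `eval` group name is hypothetical; the `partial` wiring mirrors the `llm.py` hunk above, not the full class):

```python
from functools import partial

from litellm import completion as litellm_completion

from openhands.core.config.utils import get_llm_config_arg

# evaluation=True forces modify_params=False on the returned config.
llm_config = get_llm_config_arg('eval', evaluation=True)
if llm_config is None:
    raise ValueError('Could not find LLM config: eval')

# As in llm.py: bake the flag into the completion callable, so litellm
# stops inserting placeholder messages (e.g. 'continue') during evals.
eval_completion = partial(
    litellm_completion,
    model=llm_config.model,
    modify_params=llm_config.modify_params,
)
```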