Customize LLM config per agent (#2756)

Currently, OpenDevin uses a global singleton LLM config and a global singleton agent config. This PR allows users to configure an LLM config for each agent. One potentially useful scenario is to use a cheaper LLM for repo exploration / code search, and a more powerful LLM for the actual problem solving (CodeActAgent).

Partially solves #2075 (web GUI improvement is not the goal of this PR)
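
For illustration, here is a minimal sketch (not code from this PR) of how the per-agent lookup is meant to be used, assuming a `config.toml` with an `[llm.gpt3]` group and an `[agent.RepoExplorerAgent]` group whose `llm_config = 'gpt3'`, as in the updated `config.template.toml`:

```python
from opendevin.core.config import config
from opendevin.llm.llm import LLM

# Resolve the LLM config mapped to a given agent. Agents without a dedicated
# [agent.<name>] group (or without an llm_config field) fall back to the
# default [llm] group.
explorer_llm_config = config.get_llm_config_from_agent('RepoExplorerAgent')  # -> [llm.gpt3]
solver_llm_config = config.get_llm_config_from_agent('CodeActAgent')         # -> default [llm]

# Each agent can then be driven by its own LLM instance.
explorer_llm = LLM(llm_config=explorer_llm_config)
solver_llm = LLM(llm_config=solver_llm_config)
```

The defaults remain the `[llm]` and `[agent]` groups, so existing single-LLM setups keep working unchanged.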
Boxuan Li 2024-07-09 22:05:54 -07:00 committed by GitHub
parent 23e2d01cf5
commit c68478f470
35 changed files with 522 additions and 227 deletions

View File

@ -31,7 +31,7 @@ jobs:
- name: Run tests
run: |
set -e
poetry run python opendevin/core/main.py -t "do a flip" -m ollama/not-a-model -d ./workspace/ -c DummyAgent
poetry run python opendevin/core/main.py -t "do a flip" -d ./workspace/ -c DummyAgent
- name: Check exit code
run: |
if [ $? -ne 0 ]; then

View File

@ -8,6 +8,7 @@ from agenthub.codeact_agent.prompt import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config
from opendevin.events.action import (
Action,
AgentDelegateAction,
@ -60,8 +61,11 @@ def get_action_message(action: Action) -> dict[str, str] | None:
def get_observation_message(obs) -> dict[str, str] | None:
max_message_chars = config.get_llm_config_from_agent(
'CodeActAgent'
).max_message_chars
if isinstance(obs, CmdOutputObservation):
content = 'OBSERVATION:\n' + truncate_content(obs.content)
content = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
content += (
f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
)
@ -76,10 +80,12 @@ def get_observation_message(obs) -> dict[str, str] | None:
'![image](data:image/png;base64, ...) already displayed to user'
)
content = '\n'.join(splitted)
content = truncate_content(content)
content = truncate_content(content, max_message_chars)
return {'role': 'user', 'content': content}
elif isinstance(obs, AgentDelegateObservation):
content = 'OBSERVATION:\n' + truncate_content(str(obs.outputs))
content = 'OBSERVATION:\n' + truncate_content(
str(obs.outputs), max_message_chars
)
return {'role': 'user', 'content': content}
return None

View File

@ -7,6 +7,7 @@ from agenthub.codeact_swe_agent.prompt import (
from agenthub.codeact_swe_agent.response_parser import CodeActSWEResponseParser
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config
from opendevin.events.action import (
Action,
AgentFinishAction,
@ -52,8 +53,11 @@ def get_action_message(action: Action) -> dict[str, str] | None:
def get_observation_message(obs) -> dict[str, str] | None:
max_message_chars = config.get_llm_config_from_agent(
'CodeActSWEAgent'
).max_message_chars
if isinstance(obs, CmdOutputObservation):
content = 'OBSERVATION:\n' + truncate_content(obs.content)
content = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
content += (
f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
)
@ -68,7 +72,7 @@ def get_observation_message(obs) -> dict[str, str] | None:
'![image](data:image/png;base64, ...) already displayed to user'
)
content = '\n'.join(splitted)
content = truncate_content(content)
content = truncate_content(content, max_message_chars)
return {'role': 'user', 'content': content}
return None

View File

@ -2,6 +2,7 @@ from jinja2 import BaseLoader, Environment
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config
from opendevin.core.utils import json
from opendevin.events.action import Action
from opendevin.events.serialization.action import action_from_dict
@ -32,6 +33,9 @@ def history_to_json(history: ShortTermHistory, max_events=20, **kwargs):
"""
Serialize and simplify history to str format
"""
# TODO: get agent specific llm config
llm_config = config.get_llm_config()
max_message_chars = llm_config.max_message_chars
processed_history = []
event_count = 0
@ -39,7 +43,7 @@ def history_to_json(history: ShortTermHistory, max_events=20, **kwargs):
for event in history.get_events(reverse=True):
if event_count >= max_events:
break
processed_history.append(event_to_memory(event))
processed_history.append(event_to_memory(event, max_message_chars))
event_count += 1
# history is in reverse order, let's fix it

View File

@ -29,7 +29,7 @@ from opendevin.llm.llm import LLM
from opendevin.memory.condenser import MemoryCondenser
from opendevin.runtime.tools import RuntimeTool
if config.agent.memory_enabled:
if config.get_agent_config('MonologueAgent').memory_enabled:
from opendevin.memory.memory import LongTermMemory
@ -78,7 +78,7 @@ class MonologueAgent(Agent):
raise AgentNoInstructionError()
self.initial_thoughts = []
if config.agent.memory_enabled:
if config.get_agent_config('MonologueAgent').memory_enabled:
self.memory = LongTermMemory()
else:
self.memory = None
@ -89,6 +89,9 @@ class MonologueAgent(Agent):
self._initialized = True
def _add_initial_thoughts(self, task):
max_message_chars = config.get_llm_config_from_agent(
'MonologueAgent'
).max_message_chars
previous_action = ''
for thought in INITIAL_THOUGHTS:
thought = thought.replace('$TASK', task)
@ -106,7 +109,9 @@ class MonologueAgent(Agent):
observation = BrowserOutputObservation(
content=thought, url='', screenshot=''
)
self.initial_thoughts.append(event_to_memory(observation))
self.initial_thoughts.append(
event_to_memory(observation, max_message_chars)
)
previous_action = ''
else:
action: Action = NullAction()
@ -133,7 +138,7 @@ class MonologueAgent(Agent):
previous_action = ActionType.BROWSE
else:
action = MessageAction(thought)
self.initial_thoughts.append(event_to_memory(action))
self.initial_thoughts.append(event_to_memory(action, max_message_chars))
def step(self, state: State) -> Action:
"""
@ -145,7 +150,9 @@ class MonologueAgent(Agent):
Returns:
- Action: The next action to take based on LLM response
"""
max_message_chars = config.get_llm_config_from_agent(
'MonologueAgent'
).max_message_chars
goal = state.get_current_user_intent()
self._initialize(goal)
@ -153,7 +160,7 @@ class MonologueAgent(Agent):
# add the events from state.history
for event in state.history.get_events():
recent_events.append(event_to_memory(event))
recent_events.append(event_to_memory(event, max_message_chars))
# add the last messages to long term memory
if self.memory is not None:
@ -163,9 +170,11 @@ class MonologueAgent(Agent):
# this should still work
# we will need to do this differently: find out if there really is an action or an observation in this step
if last_action:
self.memory.add_event(event_to_memory(last_action))
self.memory.add_event(event_to_memory(last_action, max_message_chars))
if last_observation:
self.memory.add_event(event_to_memory(last_observation))
self.memory.add_event(
event_to_memory(last_observation, max_message_chars)
)
# the action prompt with initial thoughts and recent events
prompt = prompts.get_request_action_prompt(

View File

@ -1,4 +1,5 @@
from opendevin.controller.state.state import State
from opendevin.core.config import config
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.schema import ActionType
from opendevin.core.utils import json
@ -128,6 +129,9 @@ def get_prompt(state: State) -> str:
Returns:
- str: The formatted string prompt with historical values
"""
max_message_chars = config.get_llm_config_from_agent(
'PlannerAgent'
).max_message_chars
# the plan
plan_str = json.dumps(state.root_task.to_dict(), indent=2)
@ -142,7 +146,7 @@ def get_prompt(state: State) -> str:
break
if latest_action == NullAction() and isinstance(event, Action):
latest_action = event
history_dicts.append(event_to_memory(event))
history_dicts.append(event_to_memory(event, max_message_chars))
# history_dicts is in reverse order, lets fix it
history_dicts.reverse()
@ -160,7 +164,7 @@ def get_prompt(state: State) -> str:
plan_status = "You're not currently working on any tasks. Your next action MUST be to mark a task as in_progress."
# the hint, based on the last action
hint = get_hint(event_to_memory(latest_action).get('action', ''))
hint = get_hint(event_to_memory(latest_action, max_message_chars).get('action', ''))
logger.info('HINT:\n' + hint, extra={'msg_type': 'DETAIL'})
# the last relevant user message (the task)

View File

@ -79,8 +79,12 @@ persist_sandbox = false
# Use host network
#use_host_network = false
# Name of the default agent
#default_agent = "CodeActAgent"
#################################### LLM #####################################
# Configuration for the LLM model
# Configuration for LLM models (group name starts with 'llm')
# use 'llm' for the default LLM config
##############################################################################
[llm]
# AWS access key ID
@ -149,8 +153,18 @@ model = "gpt-4o"
# Top p for the API
#top_p = 0.5
[llm.gpt3]
# API key to use
api_key = "your-api-key"
# Model to use
model = "gpt-3.5"
#################################### Agent ###################################
# Configuration for the agent
# Configuration for agents (group name starts with 'agent')
# Use 'agent' for the default agent config
# otherwise, group name must be `agent.<agent_name>` (case-sensitive), e.g.
# agent.CodeActAgent
##############################################################################
[agent]
# Memory enabled
@ -159,8 +173,13 @@ model = "gpt-4o"
# Memory maximum threads
#memory_max_threads = 2
# Name of the agent
#name = "CodeActAgent"
# LLM config group to use
#llm_config = 'llm'
[agent.RepoExplorerAgent]
# Example: use a cheaper model for RepoExplorerAgent to reduce cost, especially
# useful when an agent doesn't demand high quality but uses a lot of tokens
llm_config = 'gpt3'
#################################### Sandbox ###################################
# Configuration for the sandbox

View File

@ -0,0 +1,75 @@
---
sidebar_position: 8
---
# Changelog
## 0.8 (release date: ??)
### Config breaking changes
In this release we introduced a few breaking changes to backend configurations.
If you have only been using OpenDevin via the frontend (web GUI), no action is needed.
Here's a list of the config breaking changes. They only apply to users who use the OpenDevin CLI via `main.py`. For more details, see [#2756](https://github.com/OpenDevin/OpenDevin/pull/2756).
#### Removal of --model-name option from main.py
Please note that the `--model-name` (`-m`) option no longer exists. You should set up the LLM
configs in `config.toml` or via environment variables.
#### LLM config groups must be subgroups of 'llm'
Prior to release 0.8, you could use an arbitrary name for an LLM config group in `config.toml`, e.g.
```toml
[gpt-4o]
model="gpt-4o"
api_key="<your_api_key>"
```
and then use the `--llm-config` CLI argument to specify the desired LLM config group
by name. This no longer works. Instead, the config group must be a subgroup of `llm`,
e.g.:
```toml
[llm.gpt-4o]
model="gpt-4o"
api_key="<your_api_key>"
```
If you already have a config group named `llm`, there is no need to change it; it will be used
as the default LLM config group.
#### 'agent' group no longer contains 'name' field
Prior to release 0.8, you may have had a config group named `agent` that
looks like this:
```toml
[agent]
name="CodeActAgent"
memory_max_threads=2
```
Note that the `name` field has been removed. Instead, you should put a `default_agent` field
under the `core` group, e.g.
```toml
[core]
# other configs
default_agent='CodeActAgent'
[agent]
llm_config='llm'
memory_max_threads=2
[agent.CodeActAgent]
llm_config='gpt-4o'
```
Note that, similar to `llm` subgroups, you can also define `agent` subgroups.
Moreover, an agent can be associated with a specific LLM config group. For more
details, see the examples in `config.template.toml`.
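
As a rough illustration, assuming the `[llm.gpt-4o]` group from the earlier snippet together with the `[core]` and `[agent]` groups above, the lookup resolves like this:

```python
from opendevin.core.config import config

# CodeActAgent has a dedicated [agent.CodeActAgent] group, so its llm_config
# ('gpt-4o') wins over the default [agent] group.
assert config.get_llm_config_from_agent('CodeActAgent').model == 'gpt-4o'

# An agent without a dedicated group inherits the default [agent] group and
# therefore resolves to the default [llm] config.
default_llm = config.get_llm_config_from_agent('SomeOtherAgent')
assert default_llm == config.get_llm_config()
```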

View File

@ -53,14 +53,14 @@ api_key = "sk-XXX"
In this section, for the purpose of building an evaluation task, we don't use the standard OpenDevin web-based GUI, but rather run OpenDevin backend from CLI.
For example, you can run the following, which performs the specified task `-t`, with a particular model `-m` and agent `-c`, for a maximum number of iterations `-i`:
For example, you can run the following, which performs the specified task `-t`, with a particular model config `-l` and agent `-c`, for a maximum number of iterations `-i`:
```bash
poetry run python ./opendevin/core/main.py \
-i 10 \
-t "Write me a bash script that print hello world." \
-c CodeActAgent \
-m gpt-4o-2024-05-13
-l llm
```
After running the script, you will observe the following:

View File

@ -29,12 +29,12 @@ enable_auto_lint = true
box_type = "ssh"
timeout = 120
[eval_gpt35_turbo]
[llm.eval_gpt35_turbo]
model = "gpt-3.5-turbo"
api_key = "sk-123"
temperature = 0.0
[eval_gpt4o]
[llm.eval_gpt4o]
model = "gpt-4o"
api_key = "sk-123"
temperature = 0.0

View File

@ -21,12 +21,12 @@ ssh_hostname = "localhost"
enable_auto_lint = true
# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
[llm.eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[eval_some_openai_compatible_model]
[llm.eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"

View File

@ -39,12 +39,12 @@ ssh_hostname = "localhost"
enable_auto_lint = true
# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
[llm.eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[eval_azure_openai_compatible_model]
[llm.eval_azure_openai_compatible_model]
model = "AZURE_OPENAI_EXACT_DEPLOYMENT_MODEL_NAME"
base_url = "AZURE_OPENAI_ENDPOINT"
api_key = "AZURE_ENDPOINT_API_KEY"

View File

@ -21,12 +21,12 @@ ssh_hostname = "localhost"
enable_auto_lint = true
# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
[llm.eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[eval_some_openai_compatible_model]
[llm.eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"

View File

@ -16,12 +16,12 @@ ssh_hostname = "localhost"
enable_auto_lint = true
# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
[llm.eval_gpt4_1106_preview_llm]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[eval_some_openai_compatible_model]
[llm.eval_some_openai_compatible_model_llm]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
@ -29,9 +29,9 @@ temperature = 0.0
```
## Run Inference on logic_reasoning
The following code will run inference on the first example of the ProntoQA dataset with model gpt-4o,
The following code will run inference on the first example of the ProntoQA dataset,
using OpenDevin 0.6.2 version.
```bash
./evaluation/logic_reasoning/scripts/run_infer.sh ProntoQA gpt-4o 0.6.2 1
./evaluation/logic_reasoning/scripts/run_infer.sh ProntoQA eval_gpt4_1106_preview_llm 0.6.2 1
```

View File

@ -23,12 +23,12 @@ box_type = "ssh"
timeout = 120
# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
[llm.eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[eval_some_openai_compatible_model]
[llm.eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"

View File

@ -30,12 +30,12 @@ run_as_devin = false
sandbox_container_image = "public.ecr.aws/i5g0m1f6/ml-bench" # Use the latest image from the ML-Bench repository
# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
[llm.eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[eval_some_openai_compatible_model]
[llm.eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"

View File

@ -57,12 +57,12 @@ enable_auto_lint = true
max_budget_per_task = 4 # 4 USD
# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
[llm.eval_gpt4_1106_preview_llm]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[eval_some_openai_compatible_model]
[llm.eval_some_openai_compatible_model_llm]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
@ -86,7 +86,7 @@ If you see an error, please make sure your `config.toml` contains all
```bash
./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
# e.g., ./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 300
# e.g., ./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview_llm HEAD CodeActAgent 300
```
where `model_config` is mandatory, while `agent` and `eval_limit` are optional.
@ -104,11 +104,11 @@ to `CodeActAgent`.
default, the script evaluates the entire SWE-bench_Lite test set (300 issues). Note:
in order to use `eval_limit`, you must also set `agent`.
Let's say you'd like to run 10 instances using `eval_gpt4_1106_preview` and CodeActAgent,
Let's say you'd like to run 10 instances using `eval_gpt4_1106_preview_llm` and CodeActAgent,
then your command would be:
```bash
./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 10
./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview_llm HEAD CodeActAgent 10
```
If you would like to specify a list of tasks you'd like to benchmark on, you could

View File

@ -33,6 +33,7 @@ from opendevin.events.observation import (
ErrorObservation,
Observation,
)
from opendevin.llm.llm import LLM
MAX_ITERATIONS = config.max_iterations
MAX_BUDGET_PER_TASK = config.max_budget_per_task
@ -218,7 +219,9 @@ class AgentController:
async def start_delegate(self, action: AgentDelegateAction):
agent_cls: Type[Agent] = Agent.get_cls(action.agent)
agent = agent_cls(llm=self.agent.llm)
llm_config = config.get_llm_config_from_agent(action.agent)
llm = LLM(llm_config=llm_config)
delegate_agent = agent_cls(llm=llm)
state = State(
inputs=action.inputs or {},
iteration=0,
@ -227,10 +230,12 @@ class AgentController:
# metrics should be shared between parent and child
metrics=self.state.metrics,
)
logger.info(f'[Agent Controller {self.id}]: start delegate')
logger.info(
f'[Agent Controller {self.id}]: start delegate, creating agent {delegate_agent.name} using LLM {llm}'
)
self.delegate = AgentController(
sid=self.id + '-delegate',
agent=agent,
agent=delegate_agent,
event_stream=self.event_stream,
max_iterations=self.state.max_iterations,
max_budget_per_task=self.max_budget_per_task,

View File

@ -20,7 +20,7 @@ load_dotenv()
@dataclass
class LLMConfig(metaclass=Singleton):
class LLMConfig:
"""
Configuration for the LLM model.
@ -101,19 +101,19 @@ class LLMConfig(metaclass=Singleton):
@dataclass
class AgentConfig(metaclass=Singleton):
class AgentConfig:
"""
Configuration for the agent.
Attributes:
name: The name of the agent.
memory_enabled: Whether long-term memory (embeddings) is enabled.
memory_max_threads: The maximum number of threads indexing at the same time for embeddings.
llm_config: The name of the llm config to use. If specified, this will override global llm config.
"""
name: str = 'CodeActAgent'
memory_enabled: bool = False
memory_max_threads: int = 2
llm_config: str | None = None
def defaults_to_dict(self) -> dict:
"""
@ -180,8 +180,9 @@ class AppConfig(metaclass=Singleton):
Configuration for the app.
Attributes:
llm: The LLM configuration.
agent: The agent configuration.
llms: A dictionary of name -> LLM configuration. Default config is under 'llm' key.
agents: A dictionary of name -> Agent configuration. Default config is under 'agent' key.
default_agent: The name of the default agent to use.
sandbox: The sandbox configuration.
runtime: The runtime environment.
file_store: The file store to use.
@ -207,8 +208,9 @@ class AppConfig(metaclass=Singleton):
file_uploads_allowed_extensions: List of allowed file extensions for uploads. ['.*'] means all extensions are allowed.
"""
llm: LLMConfig = field(default_factory=LLMConfig)
agent: AgentConfig = field(default_factory=AgentConfig)
llms: dict = field(default_factory=dict)
agents: dict = field(default_factory=dict)
default_agent: str = 'CodeActAgent'
sandbox: SandboxConfig = field(default_factory=SandboxConfig)
runtime: str = 'server'
file_store: str = 'memory'
@ -243,6 +245,39 @@ class AppConfig(metaclass=Singleton):
defaults_dict: ClassVar[dict] = {}
def get_llm_config(self, name='llm') -> LLMConfig:
"""
llm is the name for default config (for backward compatibility prior to 0.8)
"""
if name in self.llms:
return self.llms[name]
if name is not None and name != 'llm':
logger.warning(f'llm config group {name} not found, using default config')
if 'llm' not in self.llms:
self.llms['llm'] = LLMConfig()
return self.llms['llm']
def set_llm_config(self, value: LLMConfig, name='llm'):
self.llms[name] = value
def get_agent_config(self, name='agent') -> AgentConfig:
"""
agent is the name for default config (for backward compatibility prior to 0.8)
"""
if name in self.agents:
return self.agents[name]
if 'agent' not in self.agents:
self.agents['agent'] = AgentConfig()
return self.agents['agent']
def set_agent_config(self, value: AgentConfig, name='agent'):
self.agents[name] = value
def get_llm_config_from_agent(self, name='agent') -> LLMConfig:
agent_config: AgentConfig = self.get_agent_config(name)
llm_config_name = agent_config.llm_config
return self.get_llm_config(llm_config_name)
def __post_init__(self):
"""
Post-initialization hook, called when the instance is created with only default values.
@ -346,11 +381,6 @@ def load_from_env(cfg: AppConfig, env_or_toml_dict: dict | MutableMapping[str, s
if is_dataclass(field_type):
# nested dataclass
nested_sub_config = getattr(sub_config, field_name)
# the agent field: the env var for agent.name is just 'AGENT'
if field_name == 'agent' and 'AGENT' in env_or_toml_dict:
setattr(nested_sub_config, 'name', env_or_toml_dict[env_var_name])
set_attr_from_env(nested_sub_config, prefix=field_name + '_')
elif env_var_name in env_or_toml_dict:
# convert the env var to the correct type and set it
@ -377,6 +407,13 @@ def load_from_env(cfg: AppConfig, env_or_toml_dict: dict | MutableMapping[str, s
# Start processing from the root of the config object
set_attr_from_env(cfg)
# load default LLM config from env
default_llm_config = config.get_llm_config()
set_attr_from_env(default_llm_config, 'LLM_')
# load default agent config from env
default_agent_config = config.get_agent_config()
set_attr_from_env(default_agent_config, 'AGENT_')
def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
"""Load the config from the toml file. Supports both styles of config vars.
@ -408,17 +445,45 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
core_config = toml_config['core']
# load llm configs and agent configs
for key, value in toml_config.items():
if isinstance(value, dict):
try:
if key is not None and key.lower() == 'agent':
logger.info('Attempt to load default agent config from config toml')
non_dict_fields = {
k: v for k, v in value.items() if not isinstance(v, dict)
}
agent_config = AgentConfig(**non_dict_fields)
cfg.set_agent_config(agent_config, 'agent')
for nested_key, nested_value in value.items():
if isinstance(nested_value, dict):
logger.info(
f'Attempt to load group {nested_key} from config toml as agent config'
)
agent_config = AgentConfig(**nested_value)
cfg.set_agent_config(agent_config, nested_key)
if key is not None and key.lower() == 'llm':
logger.info('Attempt to load default LLM config from config toml')
non_dict_fields = {
k: v for k, v in value.items() if not isinstance(v, dict)
}
llm_config = LLMConfig(**non_dict_fields)
cfg.set_llm_config(llm_config, 'llm')
for nested_key, nested_value in value.items():
if isinstance(nested_value, dict):
logger.info(
f'Attempt to load group {nested_key} from config toml as llm config'
)
llm_config = LLMConfig(**nested_value)
cfg.set_llm_config(llm_config, nested_key)
except (TypeError, KeyError) as e:
logger.warning(
f'Cannot parse config from toml, toml values have not been applied.\n Error: {e}',
exc_info=False,
)
try:
# set llm config from the toml file
llm_config = cfg.llm
if 'llm' in toml_config:
llm_config = LLMConfig(**toml_config['llm'])
# set agent config from the toml file
agent_config = cfg.agent
if 'agent' in toml_config:
agent_config = AgentConfig(**toml_config['agent'])
# set sandbox config from the toml file
sandbox_config = config.sandbox
@ -439,12 +504,7 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
sandbox_config = SandboxConfig(**toml_config['sandbox'])
# update the config object with the new values
AppConfig(
llm=llm_config,
agent=agent_config,
sandbox=sandbox_config,
**core_config,
)
AppConfig(sandbox=sandbox_config, **core_config)
except (TypeError, KeyError) as e:
logger.warning(
f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
@ -472,8 +532,9 @@ def finalize_config(cfg: AppConfig):
parts = cfg.workspace_mount_rewrite.split(':')
cfg.workspace_mount_path = base.replace(parts[0], parts[1])
if cfg.llm.embedding_base_url is None:
cfg.llm.embedding_base_url = cfg.llm.base_url
for llm in cfg.llms.values():
if llm.embedding_base_url is None:
llm.embedding_base_url = llm.base_url
if cfg.use_host_network and platform.system() == 'Darwin':
logger.warning(
@ -493,14 +554,16 @@ finalize_config(config)
# Utility function for command line --group argument
def get_llm_config_arg(llm_config_arg: str):
def get_llm_config_arg(
llm_config_arg: str, toml_file: str = 'config.toml'
) -> LLMConfig | None:
"""
Get a group of llm settings from the config file.
A group in config.toml can look like this:
```
[gpt-3.5-for-eval]
[llm.gpt-3.5-for-eval]
model = 'gpt-3.5-turbo'
api_key = '...'
temperature = 0.5
@ -511,6 +574,8 @@ def get_llm_config_arg(llm_config_arg: str):
The user-defined group name, like "gpt-3.5-for-eval", is the argument to this function. The function will load the LLMConfig object
with the settings of this group, from the config file, and set it as the LLMConfig object for the app.
Note that the group must be under "llm" group, or in other words, the group name must start with "llm.".
Args:
llm_config_arg: The group of llm settings to get from the config.toml file.
@ -520,12 +585,17 @@ def get_llm_config_arg(llm_config_arg: str):
# keep only the name, just in case
llm_config_arg = llm_config_arg.strip('[]')
# truncate the prefix, just in case
if llm_config_arg.startswith('llm.'):
llm_config_arg = llm_config_arg[4:]
logger.info(f'Loading llm config from {llm_config_arg}')
# load the toml file
try:
with open('config.toml', 'r', encoding='utf-8') as toml_file:
toml_config = toml.load(toml_file)
with open(toml_file, 'r', encoding='utf-8') as toml_contents:
toml_config = toml.load(toml_contents)
except FileNotFoundError as e:
logger.error(f'Config file not found: {e}')
return None
@ -534,8 +604,8 @@ def get_llm_config_arg(llm_config_arg: str):
return None
# update the llm config with the specified section
if llm_config_arg in toml_config:
return LLMConfig(**toml_config[llm_config_arg])
if 'llm' in toml_config and llm_config_arg in toml_config['llm']:
return LLMConfig(**toml_config['llm'][llm_config_arg])
logger.debug(f'Loading from toml failed for {llm_config_arg}')
return None
@ -564,16 +634,9 @@ def get_parser() -> argparse.ArgumentParser:
parser.add_argument(
'-c',
'--agent-cls',
default=config.agent.name,
default=config.default_agent,
type=str,
help='The agent class to use',
)
parser.add_argument(
'-m',
'--model-name',
default=config.llm.model,
type=str,
help='The (litellm) model name to use',
help='Name of the default agent to use',
)
parser.add_argument(
'-i',
@ -619,7 +682,7 @@ def get_parser() -> argparse.ArgumentParser:
'--llm-config',
default=None,
type=str,
help='The group of llm settings, e.g. a [llama3] section in the toml file. Overrides model if both are provided.',
help='The group of llm settings, e.g. "llama3" for [llm.llama3] section in the toml file. Overrides model if both are provided.',
)
return parser

View File

@ -53,7 +53,7 @@ async def run_agent_controller(
# Logging
logger.info(
f'Running agent {type(agent)}, model {agent.llm.model_name}, with task: "{task_str}"'
f'Running agent {agent.name}, model {agent.llm.model_name}, with task: "{task_str}"'
)
# set up the event stream
@ -163,7 +163,7 @@ if __name__ == '__main__':
raise ValueError(f'Invalid toml file, cannot read {args.llm_config}')
llm = LLM(llm_config=llm_config)
else:
llm = LLM(model=args.model_name)
llm = LLM(llm_config=config.get_llm_config_from_agent(args.agent_cls))
# Create the agent
AgentCls: Type[Agent] = Agent.get_cls(args.agent_cls)

View File

@ -21,8 +21,10 @@ class Singleton(type):
# used by pytest to reset the state of the singleton instances
for instance_type, instance in cls._instances.items():
print('resetting... ', instance_type)
for field in dataclasses.fields(instance_type):
if dataclasses.is_dataclass(field.type):
setattr(instance, field.name, field.type())
for field_info in dataclasses.fields(instance_type):
if dataclasses.is_dataclass(field_info.type):
setattr(instance, field_info.name, field_info.type())
elif field_info.default_factory is not dataclasses.MISSING:
setattr(instance, field_info.name, field_info.default_factory())
else:
setattr(instance, field.name, field.default)
setattr(instance, field_info.name, field_info.default)

View File

@ -1,7 +1,6 @@
from dataclasses import asdict
from datetime import datetime
from opendevin.core.config import config
from opendevin.events import Event, EventSource
from opendevin.events.observation.observation import Observation
@ -70,7 +69,7 @@ def event_to_dict(event: 'Event') -> dict:
return d
def event_to_memory(event: 'Event') -> dict:
def event_to_memory(event: 'Event', max_message_chars: int) -> dict:
d = event_to_dict(event)
d.pop('id', None)
d.pop('cause', None)
@ -79,17 +78,14 @@ def event_to_memory(event: 'Event') -> dict:
if 'extras' in d:
remove_fields(d['extras'], DELETE_FROM_MEMORY_EXTRAS)
if isinstance(event, Observation) and 'content' in d:
d['content'] = truncate_content(d['content'])
d['content'] = truncate_content(d['content'], max_message_chars)
return d
def truncate_content(content: str, max_chars: int = -1) -> str:
def truncate_content(content: str, max_chars: int) -> str:
"""
Truncate the middle of the observation content if it is too long.
"""
if max_chars == -1:
max_chars = config.llm.max_message_chars
if len(content) <= max_chars:
return content

View File

@ -5,9 +5,10 @@ import boto3
from opendevin.core.config import config
from opendevin.core.logger import opendevin_logger as logger
AWS_ACCESS_KEY_ID = config.llm.aws_access_key_id
AWS_SECRET_ACCESS_KEY = config.llm.aws_secret_access_key
AWS_REGION_NAME = config.llm.aws_region_name
# TODO: this assumes AWS-specific configs are under default 'llm' group
AWS_ACCESS_KEY_ID = config.get_llm_config().aws_access_key_id
AWS_SECRET_ACCESS_KEY = config.get_llm_config().aws_secret_access_key
AWS_REGION_NAME = config.get_llm_config().aws_region_name
# It needs to be set as an environment variable, if the variable is configured in the Config file.
if AWS_ACCESS_KEY_ID is not None:

View File

@ -63,6 +63,8 @@ class LLM:
llm_config=None,
metrics=None,
cost_metric_supported=True,
input_cost_per_token=None,
output_cost_per_token=None,
):
"""
Initializes the LLM. If LLMConfig is passed, its values will be the fallback.
@ -84,9 +86,11 @@ class LLM:
llm_temperature (float, optional): The temperature for LLM sampling. Defaults to LLM_TEMPERATURE.
metrics (Metrics, optional): The metrics object to use. Defaults to None.
cost_metric_supported (bool, optional): Whether the cost metric is supported. Defaults to True.
input_cost_per_token (float, optional): The cost per input token.
output_cost_per_token (float, optional): The cost per output token.
"""
if llm_config is None:
llm_config = config.llm
llm_config = config.get_llm_config()
model = model if model is not None else llm_config.model
api_key = api_key if api_key is not None else llm_config.api_key
base_url = base_url if base_url is not None else llm_config.base_url
@ -118,6 +122,16 @@ class LLM:
if max_output_tokens is not None
else llm_config.max_output_tokens
)
input_cost_per_token = (
input_cost_per_token
if input_cost_per_token is not None
else llm_config.input_cost_per_token
)
output_cost_per_token = (
output_cost_per_token
if output_cost_per_token is not None
else llm_config.output_cost_per_token
)
metrics = metrics if metrics is not None else Metrics()
logger.info(f'Initializing LLM with model: {model}')
@ -127,6 +141,8 @@ class LLM:
self.api_version = api_version
self.max_input_tokens = max_input_tokens
self.max_output_tokens = max_output_tokens
self.input_cost_per_token = input_cost_per_token
self.output_cost_per_token = output_cost_per_token
self.llm_timeout = llm_timeout
self.custom_llm_provider = custom_llm_provider
self.metrics = metrics
@ -292,12 +308,12 @@ class LLM:
extra_kwargs = {}
if (
config.llm.input_cost_per_token is not None
and config.llm.output_cost_per_token is not None
self.input_cost_per_token is not None
and self.output_cost_per_token is not None
):
cost_per_token = CostPerToken(
input_cost_per_token=config.llm.input_cost_per_token,
output_cost_per_token=config.llm.output_cost_per_token,
input_cost_per_token=self.input_cost_per_token,
output_cost_per_token=self.output_cost_per_token,
)
logger.info(f'Using custom cost per token: {cost_per_token}')
extra_kwargs['custom_cost_per_token'] = cost_per_token

View File

@ -13,13 +13,14 @@ from tenacity import (
wait_random_exponential,
)
from opendevin.core.config import config
from opendevin.core.config import LLMConfig, config
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.utils import json
num_retries = config.llm.num_retries
retry_min_wait = config.llm.retry_min_wait
retry_max_wait = config.llm.retry_max_wait
# TODO: this should depend on specific agent setting
num_retries = config.get_llm_config().num_retries
retry_min_wait = config.get_llm_config().retry_min_wait
retry_max_wait = config.get_llm_config().retry_max_wait
# llama-index includes a retry decorator around openai.get_embeddings() function
# it is initialized with hard-coded values and errors
@ -62,7 +63,7 @@ class EmbeddingsLoader:
"""Loader for embedding model initialization."""
@staticmethod
def get_embedding_model(strategy: str):
def get_embedding_model(strategy: str, llm_config: LLMConfig):
supported_ollama_embed_models = [
'llama2',
'mxbai-embed-large',
@ -75,7 +76,7 @@ class EmbeddingsLoader:
return OllamaEmbedding(
model_name=strategy,
base_url=config.llm.embedding_base_url,
base_url=llm_config.embedding_base_url,
ollama_additional_kwargs={'mirostat': 0},
)
elif strategy == 'openai':
@ -83,17 +84,17 @@ class EmbeddingsLoader:
return OpenAIEmbedding(
model='text-embedding-ada-002',
api_key=config.llm.api_key,
api_key=llm_config.api_key,
)
elif strategy == 'azureopenai':
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
return AzureOpenAIEmbedding(
model='text-embedding-ada-002',
deployment_name=config.llm.embedding_deployment_name,
api_key=config.llm.api_key,
azure_endpoint=config.llm.base_url,
api_version=config.llm.api_version,
deployment_name=llm_config.embedding_deployment_name,
api_key=llm_config.api_key,
azure_endpoint=llm_config.base_url,
api_version=llm_config.api_version,
)
elif (strategy is not None) and (strategy.lower() == 'none'):
# TODO: this works but is not elegant enough. The incentive is when
@ -106,24 +107,26 @@ class EmbeddingsLoader:
return HuggingFaceEmbedding(model_name='BAAI/bge-small-en-v1.5')
sema = threading.Semaphore(value=config.agent.memory_max_threads)
class LongTermMemory:
"""
Handles storing information for the agent to access later, using chromadb.
"""
def __init__(self):
def __init__(self, agent_config_name='agent'):
"""
Initialize the chromadb and set up ChromaVectorStore for later use.
"""
db = chromadb.Client(chromadb.Settings(anonymized_telemetry=False))
self.collection = db.get_or_create_collection(name='memories')
vector_store = ChromaVectorStore(chroma_collection=self.collection)
embedding_strategy = config.llm.embedding_model
embed_model = EmbeddingsLoader.get_embedding_model(embedding_strategy)
agent_config = config.get_agent_config(agent_config_name)
llm_config = config.get_llm_config(agent_config.llm_config)
embedding_strategy = llm_config.embedding_model
embed_model = EmbeddingsLoader.get_embedding_model(
embedding_strategy, llm_config
)
self.index = VectorStoreIndex.from_vector_store(vector_store, embed_model)
self.sema = threading.Semaphore(value=agent_config.memory_max_threads)
self.thought_idx = 0
self._add_threads = []
@ -158,7 +161,7 @@ class LongTermMemory:
thread.start() # We add the doc concurrently so we don't have to wait ~500ms for the insert
def _add_doc(self, doc):
with sema:
with self.sema:
self.index.insert(doc)
def search(self, query: str, k: int = 10):

View File

@ -308,18 +308,22 @@ async def get_litellm_models():
)
bedrock_model_list = bedrock.list_foundation_models()
model_list = litellm_model_list_without_bedrock + bedrock_model_list
ollama_base_url = config.llm.ollama_base_url
if config.llm.model.startswith('ollama'):
if not ollama_base_url:
ollama_base_url = config.llm.base_url
if ollama_base_url:
ollama_url = ollama_base_url.strip('/') + '/api/tags'
try:
ollama_models_list = requests.get(ollama_url, timeout=3).json()['models']
for model in ollama_models_list:
model_list.append('ollama/' + model['name'])
except requests.exceptions.RequestException as e:
logger.error(f'Error getting OLLAMA models: {e}', exc_info=True)
for llm_config in config.llms.values():
ollama_base_url = llm_config.ollama_base_url
if llm_config.model.startswith('ollama'):
if not ollama_base_url:
ollama_base_url = llm_config.base_url
if ollama_base_url:
ollama_url = ollama_base_url.strip('/') + '/api/tags'
try:
ollama_models_list = requests.get(ollama_url, timeout=3).json()[
'models'
]
for model in ollama_models_list:
model_list.append('ollama/' + model['name'])
break
except requests.exceptions.RequestException as e:
logger.error(f'Error getting OLLAMA models: {e}', exc_info=True)
return list(sorted(set(model_list)))

View File

@ -86,10 +86,11 @@ class AgentSession:
for key, value in start_event.get('args', {}).items()
if value != ''
} # remove empty values, prevent FE from sending empty strings
agent_cls = args.get(ConfigType.AGENT, config.agent.name)
model = args.get(ConfigType.LLM_MODEL, config.llm.model)
api_key = args.get(ConfigType.LLM_API_KEY, config.llm.api_key)
api_base = config.llm.base_url
agent_cls = args.get(ConfigType.AGENT, config.default_agent)
llm_config = config.get_llm_config_from_agent(agent_cls)
model = args.get(ConfigType.LLM_MODEL, llm_config.model)
api_key = args.get(ConfigType.LLM_API_KEY, llm_config.api_key)
api_base = llm_config.base_url
max_iterations = args.get(ConfigType.MAX_ITERATIONS, config.max_iterations)
logger.info(f'Creating agent {agent_cls} using LLM {model}')

View File

@ -49,7 +49,9 @@ def apply_prompt_and_get_mock_response(test_name: str, messages: str, id: int) -
Note: this function blindly replaces existing prompt file with the given
input without checking the contents.
"""
mock_dir = os.path.join(script_dir, 'mock', os.environ.get('AGENT'), test_name)
mock_dir = os.path.join(
script_dir, 'mock', os.environ.get('DEFAULT_AGENT'), test_name
)
prompt_file_path = os.path.join(mock_dir, f'prompt_{"{0:03}".format(id)}.log')
resp_file_path = os.path.join(mock_dir, f'response_{"{0:03}".format(id)}.log')
try:
@ -82,7 +84,9 @@ def get_mock_response(test_name: str, messages: str, id: int) -> str:
makes test code harder to understand.
"""
prompt = filter_out_symbols(messages)
mock_dir = os.path.join(script_dir, 'mock', os.environ.get('AGENT'), test_name)
mock_dir = os.path.join(
script_dir, 'mock', os.environ.get('DEFAULT_AGENT'), test_name
)
prompt_file_path = os.path.join(mock_dir, f'prompt_{"{0:03}".format(id)}.log')
resp_file_path = os.path.join(mock_dir, f'response_{"{0:03}".format(id)}.log')
# Open the prompt file and compare its contents
@ -130,7 +134,11 @@ def mock_user_response(*args, test_name, **kwargs):
STDIN input for the agent to read.
"""
user_response_file = os.path.join(
script_dir, 'mock', os.environ.get('AGENT'), test_name, 'user_responses.log'
script_dir,
'mock',
os.environ.get('DEFAULT_AGENT'),
test_name,
'user_responses.log',
)
if not os.path.exists(user_response_file):
return ''

View File

@ -78,7 +78,7 @@ run_test() {
WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH \
WORKSPACE_MOUNT_PATH_IN_SANDBOX=$WORKSPACE_MOUNT_PATH_IN_SANDBOX \
MAX_ITERATIONS=$MAX_ITERATIONS \
AGENT=$agent \
DEFAULT_AGENT=$agent \
$pytest_cmd 2>&1 | tee $TMP_FILE
# Capture the exit code of pytest
@ -148,7 +148,7 @@ regenerate_without_llm() {
WORKSPACE_MOUNT_PATH_IN_SANDBOX=$WORKSPACE_MOUNT_PATH_IN_SANDBOX \
MAX_ITERATIONS=$MAX_ITERATIONS \
FORCE_APPLY_PROMPTS=true \
AGENT=$agent \
DEFAULT_AGENT=$agent \
poetry run pytest -s ./tests/integration/test_agent.py::$test_name
set +x
}

View File

@ -29,16 +29,19 @@ print(f'workspace_mount_path_in_sandbox: {workspace_mount_path_in_sandbox}')
@pytest.mark.skipif(
os.getenv('AGENT') == 'BrowsingAgent',
os.getenv('DEFAULT_AGENT') == 'BrowsingAgent',
reason='BrowsingAgent is a specialized agent',
)
@pytest.mark.skipif(
(os.getenv('AGENT') == 'CodeActAgent' or os.getenv('AGENT') == 'CodeActSWEAgent')
(
os.getenv('DEFAULT_AGENT') == 'CodeActAgent'
or os.getenv('DEFAULT_AGENT') == 'CodeActSWEAgent'
)
and os.getenv('SANDBOX_BOX_TYPE', '').lower() != 'ssh',
reason='CodeActAgent/CodeActSWEAgent only supports ssh sandbox which is stateful',
)
@pytest.mark.skipif(
os.getenv('AGENT') == 'ManagerAgent',
os.getenv('DEFAULT_AGENT') == 'ManagerAgent',
reason='Manager agent is not capable of finishing this in reasonable steps yet',
)
def test_write_simple_script():
@ -46,7 +49,7 @@ def test_write_simple_script():
args = parse_arguments()
# Create the agent
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
agent = Agent.get_cls(args.agent_cls)(llm=LLM())
final_state: State | None = asyncio.run(
run_agent_controller(agent, task, exit_on_message=True)
@ -68,16 +71,20 @@ def test_write_simple_script():
@pytest.mark.skipif(
os.getenv('AGENT') == 'BrowsingAgent',
os.getenv('DEFAULT_AGENT') == 'BrowsingAgent',
reason='BrowsingAgent is a specialized agent',
)
@pytest.mark.skipif(
(os.getenv('AGENT') == 'CodeActAgent' or os.getenv('AGENT') == 'CodeActSWEAgent')
(
os.getenv('DEFAULT_AGENT') == 'CodeActAgent'
or os.getenv('DEFAULT_AGENT') == 'CodeActSWEAgent'
)
and os.getenv('SANDBOX_BOX_TYPE', '').lower() != 'ssh',
reason='CodeActAgent/CodeActSWEAgent only supports ssh sandbox which is stateful',
)
@pytest.mark.skipif(
os.getenv('AGENT') == 'MonologueAgent' or os.getenv('AGENT') == 'PlannerAgent',
os.getenv('DEFAULT_AGENT') == 'MonologueAgent'
or os.getenv('DEFAULT_AGENT') == 'PlannerAgent',
reason='We only keep basic tests for MonologueAgent and PlannerAgent',
)
@pytest.mark.skipif(
@ -96,7 +103,7 @@ def test_edits():
shutil.copy(os.path.join(source_dir, file), dest_file)
# Create the agent
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
agent = Agent.get_cls(args.agent_cls)(llm=LLM())
# Execute the task
task = 'Fix typos in bad.txt. Do not ask me for confirmation at any point.'
@ -118,7 +125,8 @@ Enjoy!
@pytest.mark.skipif(
os.getenv('AGENT') != 'CodeActAgent' and os.getenv('AGENT') != 'CodeActSWEAgent',
os.getenv('DEFAULT_AGENT') != 'CodeActAgent'
and os.getenv('DEFAULT_AGENT') != 'CodeActSWEAgent',
reason='currently only CodeActAgent and CodeActSWEAgent have IPython (Jupyter) execution by default',
)
@pytest.mark.skipif(
@ -129,7 +137,7 @@ def test_ipython():
args = parse_arguments()
# Create the agent
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
agent = Agent.get_cls(args.agent_cls)(llm=LLM())
# Execute the task
task = "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'. Do not ask me for confirmation at any point."
@ -152,7 +160,7 @@ def test_ipython():
@pytest.mark.skipif(
os.getenv('AGENT') != 'ManagerAgent',
os.getenv('DEFAULT_AGENT') != 'ManagerAgent',
reason='Currently, only ManagerAgent supports task rejection',
)
@pytest.mark.skipif(
@ -163,7 +171,7 @@ def test_simple_task_rejection():
args = parse_arguments()
# Create the agent
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
agent = Agent.get_cls(args.agent_cls)(llm=LLM())
# Give an impossible task to do: cannot write a commit message because
# the workspace is not a git repo
@ -175,7 +183,8 @@ def test_simple_task_rejection():
@pytest.mark.skipif(
os.getenv('AGENT') != 'CodeActAgent' and os.getenv('AGENT') != 'CodeActSWEAgent',
os.getenv('DEFAULT_AGENT') != 'CodeActAgent'
and os.getenv('DEFAULT_AGENT') != 'CodeActSWEAgent',
reason='currently only CodeActAgent and CodeActSWEAgent have IPython (Jupyter) execution by default',
)
@pytest.mark.skipif(
@ -186,7 +195,7 @@ def test_ipython_module():
args = parse_arguments()
# Create the agent
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
agent = Agent.get_cls(args.agent_cls)(llm=LLM())
# Execute the task
task = "Install and import pymsgbox==1.0.9 and print it's version in /workspace/test.txt. Do not ask me for confirmation at any point."
@ -210,11 +219,15 @@ def test_ipython_module():
@pytest.mark.skipif(
os.getenv('AGENT') != 'BrowsingAgent' and os.getenv('AGENT') != 'CodeActAgent',
os.getenv('DEFAULT_AGENT') != 'BrowsingAgent'
and os.getenv('DEFAULT_AGENT') != 'CodeActAgent',
reason='currently only BrowsingAgent and CodeActAgent are capable of searching the internet',
)
@pytest.mark.skipif(
(os.getenv('AGENT') == 'CodeActAgent' or os.getenv('AGENT') == 'CodeActSWEAgent')
(
os.getenv('DEFAULT_AGENT') == 'CodeActAgent'
or os.getenv('DEFAULT_AGENT') == 'CodeActSWEAgent'
)
and os.getenv('SANDBOX_BOX_TYPE', '').lower() != 'ssh',
reason='CodeActAgent/CodeActSWEAgent only supports ssh sandbox which is stateful',
)
@ -222,7 +235,7 @@ def test_browse_internet(http_server):
args = parse_arguments()
# Create the agent
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
agent = Agent.get_cls(args.agent_cls)(llm=LLM())
# Execute the task
task = 'Browse localhost:8000, and tell me the ultimate answer to life. Do not ask me for confirmation at any point.'

View File

@ -1,3 +1,4 @@
from opendevin.core.config import config
from opendevin.events.action import (
Action,
AddTaskAction,
@ -28,7 +29,9 @@ def serialization_deserialization(original_action_dict, cls):
action_instance, cls
), f'The action instance should be an instance of {cls.__name__}.'
serialized_action_dict = event_to_dict(action_instance)
serialized_action_memory = event_to_memory(action_instance)
serialized_action_memory = event_to_memory(
action_instance, config.get_llm_config().max_message_chars
)
serialized_action_dict.pop('message')
assert (
serialized_action_dict == original_action_dict

View File

@ -10,7 +10,7 @@ def test_help_message(capsys):
captured = capsys.readouterr()
expected_help_message = """
usage: pytest [-h] [-d DIRECTORY] [-t TASK] [-f FILE] [-c AGENT_CLS]
[-m MODEL_NAME] [-i MAX_ITERATIONS] [-b MAX_BUDGET_PER_TASK]
[-i MAX_ITERATIONS] [-b MAX_BUDGET_PER_TASK]
[--eval-output-dir EVAL_OUTPUT_DIR]
[--eval-n-limit EVAL_N_LIMIT]
[--eval-num-workers EVAL_NUM_WORKERS] [--eval-note EVAL_NOTE]
@ -26,9 +26,7 @@ options:
-f FILE, --file FILE Path to a file containing the task. Overrides -t if
both are provided.
-c AGENT_CLS, --agent-cls AGENT_CLS
The agent class to use
-m MODEL_NAME, --model-name MODEL_NAME
The (litellm) model name to use
Name of the default agent to use
-i MAX_ITERATIONS, --max-iterations MAX_ITERATIONS
The maximum number of iterations to run the agent
-b MAX_BUDGET_PER_TASK, --max-budget-per-task MAX_BUDGET_PER_TASK
@ -43,8 +41,9 @@ options:
--eval-note EVAL_NOTE
The note to add to the evaluation directory
-l LLM_CONFIG, --llm-config LLM_CONFIG
The group of llm settings, e.g. a [llama3] section in
the toml file. Overrides model if both are provided.
The group of llm settings, e.g. "llama3" for
[llm.llama3] section in the toml file. Overrides model
if both are provided.
"""
actual_lines = captured.out.strip().split('\n')

View File

@ -8,6 +8,7 @@ from opendevin.core.config import (
LLMConfig,
UndefinedString,
finalize_config,
get_llm_config_arg,
load_from_env,
load_from_toml,
)
@ -50,7 +51,7 @@ def test_compat_env_to_config(monkeypatch, setup_env):
monkeypatch.setenv('LLM_MODEL', 'gpt-4o')
monkeypatch.setenv('AGENT_MEMORY_MAX_THREADS', '4')
monkeypatch.setenv('AGENT_MEMORY_ENABLED', 'True')
monkeypatch.setenv('AGENT', 'CodeActAgent')
monkeypatch.setenv('DEFAULT_AGENT', 'CodeActAgent')
monkeypatch.setenv('SANDBOX_TYPE', 'local')
monkeypatch.setenv('SANDBOX_TIMEOUT', '10')
@ -58,14 +59,14 @@ def test_compat_env_to_config(monkeypatch, setup_env):
load_from_env(config, os.environ)
assert config.workspace_base == '/repos/opendevin/workspace'
assert isinstance(config.llm, LLMConfig)
assert config.llm.api_key == 'sk-proj-rgMV0...'
assert config.llm.model == 'gpt-4o'
assert isinstance(config.agent, AgentConfig)
assert isinstance(config.agent.memory_max_threads, int)
assert config.agent.memory_max_threads == 4
assert config.agent.memory_enabled is True
assert config.agent.name == 'CodeActAgent'
assert isinstance(config.get_llm_config(), LLMConfig)
assert config.get_llm_config().api_key == 'sk-proj-rgMV0...'
assert config.get_llm_config().model == 'gpt-4o'
assert isinstance(config.get_agent_config(), AgentConfig)
assert isinstance(config.get_agent_config().memory_max_threads, int)
assert config.get_agent_config().memory_max_threads == 4
assert config.get_agent_config().memory_enabled is True
assert config.default_agent == 'CodeActAgent'
assert config.sandbox.box_type == 'local'
assert config.sandbox.timeout == 10
@ -74,15 +75,15 @@ def test_load_from_old_style_env(monkeypatch, default_config):
# Test loading configuration from old-style environment variables using monkeypatch
monkeypatch.setenv('LLM_API_KEY', 'test-api-key')
monkeypatch.setenv('AGENT_MEMORY_ENABLED', 'True')
monkeypatch.setenv('AGENT_NAME', 'PlannerAgent')
monkeypatch.setenv('DEFAULT_AGENT', 'PlannerAgent')
monkeypatch.setenv('WORKSPACE_BASE', '/opt/files/workspace')
monkeypatch.setenv('SANDBOX_CONTAINER_IMAGE', 'custom_image')
load_from_env(default_config, os.environ)
assert default_config.llm.api_key == 'test-api-key'
assert default_config.agent.memory_enabled is True
assert default_config.agent.name == 'PlannerAgent'
assert default_config.get_llm_config().api_key == 'test-api-key'
assert default_config.get_agent_config().memory_enabled is True
assert default_config.default_agent == 'PlannerAgent'
assert default_config.workspace_base == '/opt/files/workspace'
assert (
default_config.workspace_mount_path is UndefinedString.UNDEFINED
@ -102,25 +103,52 @@ def test_load_from_new_style_toml(default_config, temp_toml_file):
model = "test-model"
api_key = "toml-api-key"
[llm.cheap]
model = "some-cheap-model"
api_key = "cheap-model-api-key"
[agent]
name = "TestAgent"
memory_enabled = true
[agent.BrowsingAgent]
llm_config = "cheap"
memory_enabled = false
[sandbox]
timeout = 1
[core]
workspace_base = "/opt/files2/workspace"
default_agent = "TestAgent"
sandbox_type = "local"
"""
)
load_from_toml(default_config, temp_toml_file)
assert default_config.llm.model == 'test-model'
assert default_config.llm.api_key == 'toml-api-key'
assert default_config.agent.name == 'TestAgent'
assert default_config.agent.memory_enabled is True
# default llm & agent configs
assert default_config.default_agent == 'TestAgent'
assert default_config.get_llm_config().model == 'test-model'
assert default_config.get_llm_config().api_key == 'toml-api-key'
assert default_config.get_agent_config().memory_enabled is True
# undefined agent config inherits default ones
assert (
default_config.get_llm_config_from_agent('CodeActAgent')
== default_config.get_llm_config()
)
assert default_config.get_agent_config('CodeActAgent').memory_enabled is True
# defined agent config overrides default ones
assert default_config.get_llm_config_from_agent(
'BrowsingAgent'
) == default_config.get_llm_config('cheap')
assert (
default_config.get_llm_config_from_agent('BrowsingAgent').model
== 'some-cheap-model'
)
assert default_config.get_agent_config('BrowsingAgent').memory_enabled is False
assert default_config.workspace_base == '/opt/files2/workspace'
assert default_config.sandbox.box_type == 'local'
assert default_config.sandbox.timeout == 1
@ -152,7 +180,6 @@ def test_compat_load_sandbox_from_toml(default_config, temp_toml_file):
model = "test-model"
[agent]
name = "TestAgent"
memory_enabled = true
[core]
@ -161,14 +188,16 @@ sandbox_type = "local"
sandbox_timeout = 500
sandbox_container_image = "node:14"
sandbox_user_id = 1001
default_agent = "TestAgent"
"""
)
load_from_toml(default_config, temp_toml_file)
assert default_config.llm.model == 'test-model'
assert default_config.agent.name == 'TestAgent'
assert default_config.agent.memory_enabled is True
assert default_config.get_llm_config().model == 'test-model'
assert default_config.get_llm_config_from_agent().model == 'test-model'
assert default_config.default_agent == 'TestAgent'
assert default_config.get_agent_config().memory_enabled is True
assert default_config.workspace_base == '/opt/files2/workspace'
assert default_config.sandbox.box_type == 'local'
assert default_config.sandbox.timeout == 500
@ -220,8 +249,10 @@ sandbox_user_id = 1001
load_from_env(default_config, os.environ)
assert os.environ.get('LLM_MODEL') is None
assert default_config.llm.model == 'test-model'
assert default_config.llm.api_key == 'env-api-key'
assert default_config.get_llm_config().model == 'test-model'
assert default_config.get_llm_config('llm').model == 'test-model'
assert default_config.get_llm_config_from_agent().model == 'test-model'
assert default_config.get_llm_config().api_key == 'env-api-key'
# after we set workspace_base to 'UNDEFINED' in the environment,
# workspace_base should be set to that
@ -271,7 +302,7 @@ user_id = 1001
assert default_config.workspace_mount_path is UndefinedString.UNDEFINED
# before load_from_env, values are set to the values from the toml file
assert default_config.llm.api_key == 'toml-api-key'
assert default_config.get_llm_config().api_key == 'toml-api-key'
assert default_config.sandbox.box_type == 'e2b'
assert default_config.sandbox.timeout == 500
assert default_config.sandbox.user_id == 1001
@ -280,8 +311,8 @@ user_id = 1001
# values from env override values from toml
assert os.environ.get('LLM_MODEL') is None
assert default_config.llm.model == 'test-model'
assert default_config.llm.api_key == 'env-api-key'
assert default_config.get_llm_config().model == 'test-model'
assert default_config.get_llm_config().api_key == 'env-api-key'
assert default_config.sandbox.box_type == 'local'
assert default_config.sandbox.timeout == 1000
@ -315,7 +346,7 @@ user_id = 1001
load_from_env(default_config, os.environ)
finalize_config(default_config)
assert default_config.llm.model == 'test-model'
assert default_config.get_llm_config().model == 'test-model'
assert default_config.sandbox.box_type == 'local'
assert default_config.sandbox.timeout == 1
assert default_config.sandbox.container_image == 'custom_image'
@ -328,16 +359,19 @@ def test_defaults_dict_after_updates(default_config):
assert (
initial_defaults['workspace_mount_path']['default'] is UndefinedString.UNDEFINED
)
assert initial_defaults['llm']['api_key']['default'] is None
assert initial_defaults['agent']['name']['default'] == 'CodeActAgent'
assert initial_defaults['default_agent']['default'] == 'CodeActAgent'
updated_config = AppConfig()
updated_config.llm.api_key = 'updated-api-key'
updated_config.agent.name = 'MonologueAgent'
updated_config.get_llm_config().api_key = 'updated-api-key'
updated_config.get_llm_config('llm').api_key = 'updated-api-key'
updated_config.get_llm_config_from_agent('agent').api_key = 'updated-api-key'
updated_config.get_llm_config_from_agent(
'MonologueAgent'
).api_key = 'updated-api-key'
updated_config.default_agent = 'MonologueAgent'
defaults_after_updates = updated_config.defaults_dict
assert defaults_after_updates['llm']['api_key']['default'] is None
assert defaults_after_updates['agent']['name']['default'] == 'CodeActAgent'
assert defaults_after_updates['default_agent']['default'] == 'CodeActAgent'
assert (
defaults_after_updates['workspace_mount_path']['default']
is UndefinedString.UNDEFINED
@ -363,10 +397,10 @@ def test_invalid_toml_format(monkeypatch, temp_toml_file, default_config):
load_from_env(default_config, os.environ)
default_config.ssh_password = None # prevent leak
default_config.jwt_secret = None # prevent leak
assert default_config.llm.model == 'gpt-5-turbo-1106'
assert default_config.llm.custom_llm_provider is None
if default_config.llm.api_key is not None: # prevent leak
pytest.fail('LLM API key should be empty.')
for llm in default_config.llms.values():
llm.api_key = None # prevent leak
assert default_config.get_llm_config().model == 'gpt-5-turbo-1106'
assert default_config.get_llm_config().custom_llm_provider is None
assert default_config.workspace_mount_path == '/home/user/project'
@ -413,9 +447,12 @@ def test_workspace_mount_rewrite(default_config, monkeypatch):
def test_embedding_base_url_default(default_config):
default_config.llm.base_url = 'https://api.exampleapi.com'
default_config.get_llm_config().base_url = 'https://api.exampleapi.com'
finalize_config(default_config)
assert default_config.llm.embedding_base_url == 'https://api.exampleapi.com'
assert (
default_config.get_llm_config().embedding_base_url
== 'https://api.exampleapi.com'
)
def test_cache_dir_creation(default_config, tmpdir):
@ -461,9 +498,7 @@ def test_api_keys_repr_str():
# Test AgentConfig
# No attrs in AgentConfig have 'key' or 'token' in their name
agent_config = AgentConfig(
name='my_agent', memory_enabled=True, memory_max_threads=4
)
agent_config = AgentConfig(memory_enabled=True, memory_max_threads=4)
for attr_name in dir(AgentConfig):
if not attr_name.startswith('__'):
assert (
@ -475,8 +510,8 @@ def test_api_keys_repr_str():
# Test AppConfig
app_config = AppConfig(
llm=llm_config,
agent=agent_config,
llms={'llm': llm_config},
agents={'agent': agent_config},
e2b_api_key='my_e2b_api_key',
jwt_secret='my_jwt_secret',
ssh_password='my_ssh_password',
@ -519,3 +554,28 @@ max_budget_per_task = 4.0
assert config.max_iterations == 100
assert config.max_budget_per_task == 4.0
def test_get_llm_config_arg(temp_toml_file):
temp_toml = """
[core]
max_iterations = 100
max_budget_per_task = 4.0
[llm.gpt3]
model="gpt-3.5-turbo"
api_key="redacted"
embedding_model="openai"
[llm.gpt4o]
model="gpt-4o"
api_key="redacted"
embedding_model="openai"
"""
with open(temp_toml_file, 'w') as f:
f.write(temp_toml)
llm_config = get_llm_config_arg('gpt3', temp_toml_file)
assert llm_config.model == 'gpt-3.5-turbo'
assert llm_config.embedding_model == 'openai'

View File

@ -76,9 +76,6 @@ def test_llm_config_attributes_masking(test_handler):
assert 'AKIAIOSFODNN7EXAMPLE' not in log_output
assert 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' not in log_output
# reset the LLMConfig
LLMConfig.reset()
def test_app_config_attributes_masking(test_handler):
logger, stream = test_handler

View File

@ -1,3 +1,4 @@
from opendevin.core.config import config
from opendevin.events.observation import (
CmdOutputObservation,
Observation,
@ -18,7 +19,9 @@ def serialization_deserialization(original_observation_dict, cls):
observation_instance, cls
), 'The observation instance should be an instance of CmdOutputObservation.'
serialized_observation_dict = event_to_dict(observation_instance)
serialized_observation_memory = event_to_memory(observation_instance)
serialized_observation_memory = event_to_memory(
observation_instance, config.get_llm_config().max_message_chars
)
assert (
serialized_observation_dict == original_observation_dict
), 'The serialized observation should match the original observation dict.'