diff --git a/agenthub/SWE_agent/agent.py b/agenthub/SWE_agent/agent.py
index d62681579d..6a3297e229 100644
--- a/agenthub/SWE_agent/agent.py
+++ b/agenthub/SWE_agent/agent.py
@@ -42,7 +42,7 @@ class SWEAgent(Agent):
         self.running_memory.append(memory)
 
     def _think_act(self, messages: list[dict]) -> tuple[Action, str]:
-        resp = self.llm.completion(
+        resp = self.llm.do_completion(
             messages=messages,
             temperature=0.05,
         )
diff --git a/agenthub/codeact_agent/codeact_agent.py b/agenthub/codeact_agent/codeact_agent.py
index b797dd1cc7..e7f2c12690 100644
--- a/agenthub/codeact_agent/codeact_agent.py
+++ b/agenthub/codeact_agent/codeact_agent.py
@@ -9,7 +9,6 @@ from agenthub.codeact_agent.prompt import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.logger import opendevin_logger as logger
 from opendevin.events.action import (
     Action,
     AgentFinishAction,
@@ -173,7 +172,6 @@ class CodeActAgent(Agent):
         Resets the CodeAct Agent.
         """
         super().reset()
-        self.cost_accumulator = 0
 
     def step(self, state: State) -> Action:
         """
@@ -215,7 +213,7 @@ class CodeActAgent(Agent):
             f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task.'
         )
 
-        response = self.llm.completion(
+        response = self.llm.do_completion(
             messages=messages,
             stop=[
                 '</execute_ipython>',
@@ -225,8 +223,6 @@ class CodeActAgent(Agent):
             temperature=0.0,
         )
-        self.log_cost(response)
-
         action_str: str = parse_response(response)
         state.num_of_chars += sum(
             len(message['content']) for message in messages
         )
@@ -269,15 +265,3 @@ class CodeActAgent(Agent):
 
     def search_memory(self, query: str) -> list[str]:
         raise NotImplementedError('Implement this abstract method')
-
-    def log_cost(self, response):
-        try:
-            cur_cost = self.llm.completion_cost(response)
-        except Exception:
-            cur_cost = 0
-        self.cost_accumulator += cur_cost
-        logger.info(
-            'Cost: %.2f USD | Accumulated Cost: %.2f USD',
-            cur_cost,
-            self.cost_accumulator,
-        )
diff --git a/agenthub/micro/agent.py b/agenthub/micro/agent.py
index 6e5a73e0d6..e4ac0cceb6 100644
--- a/agenthub/micro/agent.py
+++ b/agenthub/micro/agent.py
@@ -65,7 +65,7 @@ class MicroAgent(Agent):
             latest_user_message=latest_user_message,
         )
         messages = [{'content': prompt, 'role': 'user'}]
-        resp = self.llm.completion(messages=messages)
+        resp = self.llm.do_completion(messages=messages)
         action_resp = resp['choices'][0]['message']['content']
         state.num_of_chars += len(prompt) + len(action_resp)
         action = parse_response(action_resp)
diff --git a/agenthub/monologue_agent/agent.py b/agenthub/monologue_agent/agent.py
index a8e7d73847..2fb1fe6a83 100644
--- a/agenthub/monologue_agent/agent.py
+++ b/agenthub/monologue_agent/agent.py
@@ -242,7 +242,7 @@ class MonologueAgent(Agent):
             state.background_commands_obs,
         )
         messages = [{'content': prompt, 'role': 'user'}]
-        resp = self.llm.completion(messages=messages)
+        resp = self.llm.do_completion(messages=messages)
         action_resp = resp['choices'][0]['message']['content']
         state.num_of_chars += len(prompt) + len(action_resp)
         action = prompts.parse_action_response(action_resp)
diff --git a/agenthub/planner_agent/agent.py b/agenthub/planner_agent/agent.py
index 3979b0a30c..2dd4ec8be0 100644
--- a/agenthub/planner_agent/agent.py
+++ b/agenthub/planner_agent/agent.py
@@ -43,7 +43,7 @@ class PlannerAgent(Agent):
             return AgentFinishAction()
         prompt = get_prompt(state)
         messages = [{'content': prompt, 'role': 'user'}]
-        resp = self.llm.completion(messages=messages)
+        resp = self.llm.do_completion(messages=messages)
         action_resp = resp['choices'][0]['message']['content']
         state.num_of_chars += len(prompt) + len(action_resp)
         action = parse_response(action_resp)
diff --git a/opendevin/controller/agent_controller.py b/opendevin/controller/agent_controller.py
index 67b2e86dd8..cb374f9a80 100644
--- a/opendevin/controller/agent_controller.py
+++ b/opendevin/controller/agent_controller.py
@@ -89,6 +89,8 @@ class AgentController:
 
     def update_state_after_step(self):
         self.state.updated_info = []
+        # update metrics especially for cost
+        self.state.metrics = self.agent.llm.metrics
 
     async def report_error(self, message: str, exception: Exception | None = None):
         self.state.error = message
diff --git a/opendevin/controller/state/state.py b/opendevin/controller/state/state.py
index 50e3d3d18a..334328b7f5 100644
--- a/opendevin/controller/state/state.py
+++ b/opendevin/controller/state/state.py
@@ -4,6 +4,7 @@ from dataclasses import dataclass, field
 
 from opendevin.controller.state.task import RootTask
 from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.metrics import Metrics
 from opendevin.core.schema import AgentState
 from opendevin.events.action import (
     Action,
@@ -30,6 +31,7 @@ class State:
     outputs: dict = field(default_factory=dict)
     error: str | None = None
     agent_state: AgentState = AgentState.LOADING
+    metrics: Metrics = Metrics()
 
     def save_to_session(self, sid: str):
         fs = get_file_store()
diff --git a/opendevin/core/metrics.py b/opendevin/core/metrics.py
new file mode 100644
index 0000000000..8bfe4e5524
--- /dev/null
+++ b/opendevin/core/metrics.py
@@ -0,0 +1,46 @@
+class Metrics:
+    """
+    Metrics class can record various metrics during running and evaluation.
+    Currently we define the following metrics:
+        accumulated_cost: the total cost (USD $) of the current LLM.
+    """
+
+    def __init__(self) -> None:
+        self._accumulated_cost: float = 0.0
+        self._costs: list[float] = []
+
+    @property
+    def accumulated_cost(self) -> float:
+        return self._accumulated_cost
+
+    @accumulated_cost.setter
+    def accumulated_cost(self, value: float) -> None:
+        if value < 0:
+            raise ValueError('Total cost cannot be negative.')
+        self._accumulated_cost = value
+
+    @property
+    def costs(self) -> list:
+        return self._costs
+
+    def add_cost(self, value: float) -> None:
+        if value < 0:
+            raise ValueError('Added cost cannot be negative.')
+        self._accumulated_cost += value
+        self._costs.append(value)
+
+    def get(self):
+        """
+        Return the metrics in a dictionary.
+        """
+        return {'accumulated_cost': self._accumulated_cost, 'costs': self._costs}
+
+    def log(self):
+        """
+        Log the metrics.
+        """
+        metrics = self.get()
+        logs = ''
+        for key, value in metrics.items():
+            logs += f'{key}: {value}\n'
+        return logs
diff --git a/opendevin/llm/llm.py b/opendevin/llm/llm.py
index da938823b7..9d4477d6a2 100644
--- a/opendevin/llm/llm.py
+++ b/opendevin/llm/llm.py
@@ -21,6 +21,7 @@ from tenacity import (
 from opendevin.core.config import config
 from opendevin.core.logger import llm_prompt_logger, llm_response_logger
 from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.metrics import Metrics
 
 __all__ = ['LLM']
 
@@ -58,6 +59,7 @@ class LLM:
         max_input_tokens=None,
         max_output_tokens=None,
         llm_config=None,
+        metrics=None,
     ):
         """
         Initializes the LLM. If LLMConfig is passed, its values will be the fallback.
@@ -77,7 +79,7 @@ class LLM:
             custom_llm_provider (str, optional): A custom LLM provider. Defaults to LLM_CUSTOM_LLM_PROVIDER.
             llm_timeout (int, optional): The maximum time to wait for a response in seconds. Defaults to LLM_TIMEOUT.
             llm_temperature (float, optional): The temperature for LLM sampling. Defaults to LLM_TEMPERATURE.
-
+            metrics (Metrics, optional): The metrics object to use. Defaults to None.
         """
         if llm_config is None:
             llm_config = config.llm
@@ -112,6 +114,7 @@ class LLM:
             if max_output_tokens is not None
             else llm_config.max_output_tokens
         )
+        metrics = metrics if metrics is not None else Metrics()
 
         logger.info(f'Initializing LLM with model: {model}')
         self.model_name = model
@@ -122,6 +125,7 @@ class LLM:
         self.max_output_tokens = max_output_tokens
         self.llm_timeout = llm_timeout
         self.custom_llm_provider = custom_llm_provider
+        self.metrics = metrics
 
         # litellm actually uses base Exception here for unknown model
         self.model_info = None
@@ -200,6 +204,30 @@ class LLM:
         """
         return self._completion
 
+    def do_completion(self, *args, **kwargs):
+        """
+        Wrapper for the litellm completion function.
+
+        Check the complete documentation at https://litellm.vercel.app/docs/completion
+        """
+        resp = self._completion(*args, **kwargs)
+        self.post_completion(resp)
+        return resp
+
+    def post_completion(self, response: str) -> None:
+        """
+        Post-process the completion response.
+        """
+        try:
+            cur_cost = self.completion_cost(response)
+        except Exception:
+            cur_cost = 0
+        logger.info(
+            'Cost: %.2f USD | Accumulated Cost: %.2f USD',
+            cur_cost,
+            self.metrics.accumulated_cost,
+        )
+
     def get_token_count(self, messages):
         """
         Get the number of tokens in a list of messages.
@@ -231,6 +259,7 @@ class LLM:
     def completion_cost(self, response):
         """
         Calculate the cost of a completion response based on the model. Local models are treated as free.
+        Add the current cost into total cost in metrics.
 
         Args:
             response (list): A response from a model invocation.
@@ -241,6 +270,7 @@ class LLM:
         if not self.is_local():
             try:
                 cost = litellm_completion_cost(completion_response=response)
+                self.metrics.add_cost(cost)
                 return cost
             except Exception:
                 logger.warning('Cost calculation not supported for this model.')
diff --git a/opendevin/memory/condenser.py b/opendevin/memory/condenser.py
index b8b1842dc7..1587283006 100644
--- a/opendevin/memory/condenser.py
+++ b/opendevin/memory/condenser.py
@@ -16,7 +16,7 @@ class MemoryCondenser:
 
         try:
             messages = [{'content': summarize_prompt, 'role': 'user'}]
-            resp = llm.completion(messages=messages)
+            resp = llm.do_completion(messages=messages)
             summary_response = resp['choices'][0]['message']['content']
             return summary_response
         except Exception as e:
diff --git a/tests/unit/test_micro_agents.py b/tests/unit/test_micro_agents.py
index 39d5bec186..881bb2d3d8 100644
--- a/tests/unit/test_micro_agents.py
+++ b/tests/unit/test_micro_agents.py
@@ -31,7 +31,7 @@ def test_coder_agent_with_summary():
     """
     mock_llm = MagicMock()
     content = json.dumps({'action': 'finish', 'args': {}})
-    mock_llm.completion.return_value = {'choices': [{'message': {'content': content}}]}
+    mock_llm.do_completion.return_value = {'choices': [{'message': {'content': content}}]}
 
     coder_agent = Agent.get_cls('CoderAgent')(llm=mock_llm)
     assert coder_agent is not None
@@ -43,8 +43,8 @@ def test_coder_agent_with_summary():
     state = State(history=history, inputs={'summary': summary})
     coder_agent.step(state)
 
-    mock_llm.completion.assert_called_once()
-    _, kwargs = mock_llm.completion.call_args
+    mock_llm.do_completion.assert_called_once()
+    _, kwargs = mock_llm.do_completion.call_args
     prompt = kwargs['messages'][0]['content']
     assert task in prompt
     assert "Here's a summary of the codebase, as it relates to this task" in prompt
@@ -58,7 +58,7 @@ def test_coder_agent_without_summary():
     """
    mock_llm = MagicMock()
     content = json.dumps({'action': 'finish', 'args': {}})
-    mock_llm.completion.return_value = {'choices': [{'message': {'content': content}}]}
+    mock_llm.do_completion.return_value = {'choices': [{'message': {'content': content}}]}
 
     coder_agent = Agent.get_cls('CoderAgent')(llm=mock_llm)
     assert coder_agent is not None
@@ -69,8 +69,8 @@ def test_coder_agent_without_summary():
     state = State(history=history)
     coder_agent.step(state)
 
-    mock_llm.completion.assert_called_once()
-    _, kwargs = mock_llm.completion.call_args
+    mock_llm.do_completion.assert_called_once()
+    _, kwargs = mock_llm.do_completion.call_args
     prompt = kwargs['messages'][0]['content']
     assert task in prompt
     assert "Here's a summary of the codebase, as it relates to this task" not in prompt
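
Reviewer's note, not part of the patch: a minimal, self-contained sketch of how the cost accounting behaves once this change is applied. It exercises only the Metrics class added in opendevin/core/metrics.py; the wiring shown in the diff (LLM.do_completion calling completion_cost, which calls Metrics.add_cost, with AgentController copying llm.metrics onto State after each step) is assumed but not invoked here, and the dollar amounts are made up.

from opendevin.core.metrics import Metrics  # module introduced by this patch

metrics = Metrics()

# Each completion's cost (in USD) is recorded individually and accumulated.
metrics.add_cost(0.25)
metrics.add_cost(0.75)

assert metrics.accumulated_cost == 1.0
assert metrics.costs == [0.25, 0.75]

# get() returns a plain dict; log() returns the same data formatted as a string.
print(metrics.get())  # {'accumulated_cost': 1.0, 'costs': [0.25, 0.75]}
print(metrics.log())  # prints "accumulated_cost: 1.0" and "costs: [0.25, 0.75]" on separate lines

# Negative values are rejected by both add_cost and the accumulated_cost setter.
# metrics.add_cost(-0.01)  # would raise ValueError('Added cost cannot be negative.')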