feat: add metrics related to cost for better observability (#1944)
* add metrics for total_cost
* make lint
* refact codeact
* change metrics into llm
* add costs list, add into state
* refactor log completion
* refactor and test others
* make lint
* Update opendevin/core/metrics.py
  Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
* Update opendevin/llm/llm.py
  Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
* refactor
* add code

Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
parent 34cccfe9cc
commit d18e6c85a0
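Taken together, the change routes every LLM call through a cost-tracking wrapper and surfaces the totals on the agent state. A minimal sketch of the intended usage, with an illustrative model name (the other constructor arguments fall back to the project config):

# Sketch only: the model name is illustrative; cost is recorded per call
# whenever litellm can price the model.
from opendevin.llm.llm import LLM

llm = LLM(model='gpt-4o')
resp = llm.do_completion(messages=[{'role': 'user', 'content': 'hello'}])

# Each do_completion() call records its cost, so the running total is
# available at any point:
print(llm.metrics.accumulated_cost)  # float, in USD
print(llm.metrics.get())             # {'accumulated_cost': ..., 'costs': [...]}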
@@ -42,7 +42,7 @@ class SWEAgent(Agent):
         self.running_memory.append(memory)
 
     def _think_act(self, messages: list[dict]) -> tuple[Action, str]:
-        resp = self.llm.completion(
+        resp = self.llm.do_completion(
             messages=messages,
             temperature=0.05,
         )
@@ -9,7 +9,6 @@ from agenthub.codeact_agent.prompt import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.logger import opendevin_logger as logger
 from opendevin.events.action import (
     Action,
     AgentFinishAction,
@@ -173,7 +172,6 @@ class CodeActAgent(Agent):
         Resets the CodeAct Agent.
         """
         super().reset()
-        self.cost_accumulator = 0
 
     def step(self, state: State) -> Action:
         """
@@ -215,7 +213,7 @@ class CodeActAgent(Agent):
             f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task.'
         )
 
-        response = self.llm.completion(
+        response = self.llm.do_completion(
             messages=messages,
             stop=[
                 '</execute_ipython>',
@@ -225,8 +223,6 @@ class CodeActAgent(Agent):
             temperature=0.0,
         )
 
-        self.log_cost(response)
-
         action_str: str = parse_response(response)
         state.num_of_chars += sum(
             len(message['content']) for message in messages
@@ -269,15 +265,3 @@ class CodeActAgent(Agent):
 
     def search_memory(self, query: str) -> list[str]:
         raise NotImplementedError('Implement this abstract method')
-
-    def log_cost(self, response):
-        try:
-            cur_cost = self.llm.completion_cost(response)
-        except Exception:
-            cur_cost = 0
-        self.cost_accumulator += cur_cost
-        logger.info(
-            'Cost: %.2f USD | Accumulated Cost: %.2f USD',
-            cur_cost,
-            self.cost_accumulator,
-        )
@@ -65,7 +65,7 @@ class MicroAgent(Agent):
             latest_user_message=latest_user_message,
         )
         messages = [{'content': prompt, 'role': 'user'}]
-        resp = self.llm.completion(messages=messages)
+        resp = self.llm.do_completion(messages=messages)
         action_resp = resp['choices'][0]['message']['content']
         state.num_of_chars += len(prompt) + len(action_resp)
         action = parse_response(action_resp)
@@ -242,7 +242,7 @@ class MonologueAgent(Agent):
             state.background_commands_obs,
         )
         messages = [{'content': prompt, 'role': 'user'}]
-        resp = self.llm.completion(messages=messages)
+        resp = self.llm.do_completion(messages=messages)
         action_resp = resp['choices'][0]['message']['content']
         state.num_of_chars += len(prompt) + len(action_resp)
         action = prompts.parse_action_response(action_resp)
@@ -43,7 +43,7 @@ class PlannerAgent(Agent):
             return AgentFinishAction()
         prompt = get_prompt(state)
         messages = [{'content': prompt, 'role': 'user'}]
-        resp = self.llm.completion(messages=messages)
+        resp = self.llm.do_completion(messages=messages)
         action_resp = resp['choices'][0]['message']['content']
         state.num_of_chars += len(prompt) + len(action_resp)
         action = parse_response(action_resp)
@@ -89,6 +89,8 @@ class AgentController:
 
     def update_state_after_step(self):
         self.state.updated_info = []
+        # update metrics especially for cost
+        self.state.metrics = self.agent.llm.metrics
 
     async def report_error(self, message: str, exception: Exception | None = None):
         self.state.error = message
@@ -4,6 +4,7 @@ from dataclasses import dataclass, field
 
 from opendevin.controller.state.task import RootTask
 from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.metrics import Metrics
 from opendevin.core.schema import AgentState
 from opendevin.events.action import (
     Action,
@@ -30,6 +31,7 @@ class State:
     outputs: dict = field(default_factory=dict)
     error: str | None = None
     agent_state: AgentState = AgentState.LOADING
+    metrics: Metrics = Metrics()
 
     def save_to_session(self, sid: str):
         fs = get_file_store()
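Because the controller copies the agent's LLM metrics onto the state after each step, evaluation code can read the run cost straight from the final State. A hedged sketch; run_controller_somehow is a hypothetical stand-in for whatever drives the AgentController:

# Sketch only: run_controller_somehow is hypothetical; the State and Metrics
# attributes are the ones introduced in this diff.
final_state = run_controller_somehow(task='fix the failing test')

print(final_state.metrics.accumulated_cost)   # total USD spent on LLM calls
for i, cost in enumerate(final_state.metrics.costs):
    print(f'call {i}: {cost:.4f} USD')        # per-completion cost history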
opendevin/core/metrics.py (new file, 46 lines)
@@ -0,0 +1,46 @@
+class Metrics:
+    """
+    Metrics class can record various metrics during running and evaluation.
+    Currently we define the following metrics:
+        accumulated_cost: the total cost (USD $) of the current LLM.
+    """
+
+    def __init__(self) -> None:
+        self._accumulated_cost: float = 0.0
+        self._costs: list[float] = []
+
+    @property
+    def accumulated_cost(self) -> float:
+        return self._accumulated_cost
+
+    @accumulated_cost.setter
+    def accumulated_cost(self, value: float) -> None:
+        if value < 0:
+            raise ValueError('Total cost cannot be negative.')
+        self._accumulated_cost = value
+
+    @property
+    def costs(self) -> list:
+        return self._costs
+
+    def add_cost(self, value: float) -> None:
+        if value < 0:
+            raise ValueError('Added cost cannot be negative.')
+        self._accumulated_cost += value
+        self._costs.append(value)
+
+    def get(self):
+        """
+        Return the metrics in a dictionary.
+        """
+        return {'accumulated_cost': self._accumulated_cost, 'costs': self._costs}
+
+    def log(self):
+        """
+        Log the metrics.
+        """
+        metrics = self.get()
+        logs = ''
+        for key, value in metrics.items():
+            logs += f'{key}: {value}\n'
+        return logs
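The new Metrics class has no dependencies, so it can be used (and sanity-checked) on its own. A small usage example based on the code above:

from opendevin.core.metrics import Metrics

metrics = Metrics()
metrics.add_cost(0.25)
metrics.add_cost(0.125)

print(metrics.accumulated_cost)  # 0.375
print(metrics.costs)             # [0.25, 0.125]
print(metrics.get())             # {'accumulated_cost': 0.375, 'costs': [0.25, 0.125]}
print(metrics.log())             # one "key: value" line per metric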
@@ -21,6 +21,7 @@ from tenacity import (
 from opendevin.core.config import config
 from opendevin.core.logger import llm_prompt_logger, llm_response_logger
 from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.metrics import Metrics
 
 __all__ = ['LLM']
 
@@ -58,6 +59,7 @@ class LLM:
         max_input_tokens=None,
         max_output_tokens=None,
         llm_config=None,
+        metrics=None,
     ):
         """
         Initializes the LLM. If LLMConfig is passed, its values will be the fallback.
@@ -77,7 +79,7 @@ class LLM:
             custom_llm_provider (str, optional): A custom LLM provider. Defaults to LLM_CUSTOM_LLM_PROVIDER.
             llm_timeout (int, optional): The maximum time to wait for a response in seconds. Defaults to LLM_TIMEOUT.
             llm_temperature (float, optional): The temperature for LLM sampling. Defaults to LLM_TEMPERATURE.
-
+            metrics (Metrics, optional): The metrics object to use. Defaults to None.
         """
         if llm_config is None:
             llm_config = config.llm
@@ -112,6 +114,7 @@ class LLM:
             if max_output_tokens is not None
             else llm_config.max_output_tokens
         )
+        metrics = metrics if metrics is not None else Metrics()
 
         logger.info(f'Initializing LLM with model: {model}')
         self.model_name = model
@@ -122,6 +125,7 @@ class LLM:
         self.max_output_tokens = max_output_tokens
         self.llm_timeout = llm_timeout
         self.custom_llm_provider = custom_llm_provider
+        self.metrics = metrics
 
         # litellm actually uses base Exception here for unknown model
         self.model_info = None
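Since the constructor now accepts an optional metrics object, a caller could in principle share one accumulator across several LLM instances. A hedged sketch, assuming only the metrics keyword shown above and illustrative model names; every other argument is left to its config fallback:

# Sketch only: model names are illustrative; other constructor arguments
# fall back to the configured defaults as in the diff above.
from opendevin.core.metrics import Metrics
from opendevin.llm.llm import LLM

shared_metrics = Metrics()
main_llm = LLM(model='gpt-4o', metrics=shared_metrics)
helper_llm = LLM(model='gpt-4o-mini', metrics=shared_metrics)

# After any number of do_completion() calls on either instance, the shared
# object holds the combined spend:
print(shared_metrics.accumulated_cost)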
@@ -200,6 +204,30 @@ class LLM:
         """
         return self._completion
 
+    def do_completion(self, *args, **kwargs):
+        """
+        Wrapper for the litellm completion function.
+
+        Check the complete documentation at https://litellm.vercel.app/docs/completion
+        """
+        resp = self._completion(*args, **kwargs)
+        self.post_completion(resp)
+        return resp
+
+    def post_completion(self, response: str) -> None:
+        """
+        Post-process the completion response.
+        """
+        try:
+            cur_cost = self.completion_cost(response)
+        except Exception:
+            cur_cost = 0
+        logger.info(
+            'Cost: %.2f USD | Accumulated Cost: %.2f USD',
+            cur_cost,
+            self.metrics.accumulated_cost,
+        )
+
     def get_token_count(self, messages):
         """
         Get the number of tokens in a list of messages.
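The wrapper can be exercised without a real provider by stubbing the underlying completion callable and the cost lookup. A hedged sketch, not a test from this commit; it assumes _completion is the attribute backing the completion property, as the context above suggests, and uses an illustrative model name:

from unittest.mock import MagicMock, patch

from opendevin.llm.llm import LLM

llm = LLM(model='gpt-4o')  # illustrative model name
llm._completion = MagicMock(
    return_value={'choices': [{'message': {'content': 'hi'}}]}
)

with patch.object(LLM, 'completion_cost', return_value=0.02) as mock_cost:
    resp = llm.do_completion(messages=[{'role': 'user', 'content': 'hi'}])

assert resp['choices'][0]['message']['content'] == 'hi'
mock_cost.assert_called_once()  # do_completion routed the response through post_completion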
@@ -231,6 +259,7 @@ class LLM:
     def completion_cost(self, response):
         """
         Calculate the cost of a completion response based on the model. Local models are treated as free.
+        Add the current cost into total cost in metrics.
 
         Args:
             response (list): A response from a model invocation.
@@ -241,6 +270,7 @@ class LLM:
         if not self.is_local():
             try:
                 cost = litellm_completion_cost(completion_response=response)
+                self.metrics.add_cost(cost)
                 return cost
             except Exception:
                 logger.warning('Cost calculation not supported for this model.')
@@ -16,7 +16,7 @@ class MemoryCondenser:
 
         try:
             messages = [{'content': summarize_prompt, 'role': 'user'}]
-            resp = llm.completion(messages=messages)
+            resp = llm.do_completion(messages=messages)
             summary_response = resp['choices'][0]['message']['content']
             return summary_response
         except Exception as e:
@@ -31,7 +31,7 @@ def test_coder_agent_with_summary():
     """
    mock_llm = MagicMock()
    content = json.dumps({'action': 'finish', 'args': {}})
-    mock_llm.completion.return_value = {'choices': [{'message': {'content': content}}]}
+    mock_llm.do_completion.return_value = {'choices': [{'message': {'content': content}}]}
 
    coder_agent = Agent.get_cls('CoderAgent')(llm=mock_llm)
    assert coder_agent is not None
@@ -43,8 +43,8 @@ def test_coder_agent_with_summary():
    state = State(history=history, inputs={'summary': summary})
    coder_agent.step(state)
 
-    mock_llm.completion.assert_called_once()
-    _, kwargs = mock_llm.completion.call_args
+    mock_llm.do_completion.assert_called_once()
+    _, kwargs = mock_llm.do_completion.call_args
    prompt = kwargs['messages'][0]['content']
    assert task in prompt
    assert "Here's a summary of the codebase, as it relates to this task" in prompt
@@ -58,7 +58,7 @@ def test_coder_agent_without_summary():
     """
    mock_llm = MagicMock()
    content = json.dumps({'action': 'finish', 'args': {}})
-    mock_llm.completion.return_value = {'choices': [{'message': {'content': content}}]}
+    mock_llm.do_completion.return_value = {'choices': [{'message': {'content': content}}]}
 
    coder_agent = Agent.get_cls('CoderAgent')(llm=mock_llm)
    assert coder_agent is not None
@@ -69,8 +69,8 @@ def test_coder_agent_without_summary():
    state = State(history=history)
    coder_agent.step(state)
 
-    mock_llm.completion.assert_called_once()
-    _, kwargs = mock_llm.completion.call_args
+    mock_llm.do_completion.assert_called_once()
+    _, kwargs = mock_llm.do_completion.call_args
    prompt = kwargs['messages'][0]['content']
    assert task in prompt
    assert "Here's a summary of the codebase, as it relates to this task" not in prompt
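The test changes above only migrate the existing agent tests to do_completion; the validation logic in Metrics itself could be covered with a couple of pytest cases along these lines (a sketch, not part of this commit):

import pytest

from opendevin.core.metrics import Metrics


def test_add_cost_accumulates():
    metrics = Metrics()
    metrics.add_cost(0.25)
    metrics.add_cost(0.5)
    assert metrics.accumulated_cost == 0.75
    assert metrics.costs == [0.25, 0.5]


def test_negative_values_are_rejected():
    metrics = Metrics()
    with pytest.raises(ValueError):
        metrics.add_cost(-1.0)
    with pytest.raises(ValueError):
        metrics.accumulated_cost = -0.5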