Tweak connect exceptions (#1120)
* Clean up manual sleep
* Add default retries and document them
* Add docstrings to llm
* Add exponential backoff for rate limiting errors
* Get embeddings for the action and its own content, not the user message
* Add a few bad exceptions to stop the loop
* Stop the loop when the step has no action
* Add action with content, no message, to history
* Make retry settings customizable
* Fix condense to stop the loop for the same reasons as completion
* Add 500-504 exceptions to retries
* Document the retry variables
* Add retries and limits for embeddings, replacing llama-index's hard-coded decorator
* Rename to retry_min_wait and retry_max_wait
This commit is contained in:
parent 1f2a845feb
commit 464bf7ee23
@@ -177,6 +177,11 @@ can only be as powerful as the models driving it--fortunately folks on our team
are actively working on building better open source models!

**Note on API retries and rate limits:**
Some LLMs have rate limits and may require retries. OpenDevin will automatically retry requests if it receives a 429 error or an API connection error.
You can set the LLM_NUM_RETRIES, LLM_RETRY_MIN_WAIT, and LLM_RETRY_MAX_WAIT environment variables to control the number of retries and the time between retries.
By default, LLM_NUM_RETRIES is 5, and LLM_RETRY_MIN_WAIT and LLM_RETRY_MAX_WAIT are 3 and 60 seconds, respectively.
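For example, a minimal Python sketch (assuming it runs inside an OpenDevin checkout, since it imports the project's own config module) that prints the effective retry settings:

```python
# Minimal sketch: print the retry settings as OpenDevin's config module resolves them.
# The printed values reflect any LLM_NUM_RETRIES / LLM_RETRY_MIN_WAIT / LLM_RETRY_MAX_WAIT
# overrides from the environment or config.toml.
from opendevin import config

print('retries:', config.get('LLM_NUM_RETRIES'))      # default 5
print('min wait:', config.get('LLM_RETRY_MIN_WAIT'))  # default 3 seconds
print('max wait:', config.get('LLM_RETRY_MAX_WAIT'))  # default 60 seconds
```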

## ⭐️ Research Strategy

Achieving full replication of production-grade applications with LLMs is a complex endeavor. Our strategy involves:
@@ -1,3 +1,4 @@
import llama_index.embeddings.openai.base as llama_openai
from threading import Thread

import chromadb
@@ -5,11 +6,46 @@ from llama_index.core import Document
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential
from openai._exceptions import APIConnectionError, RateLimitError, InternalServerError

from opendevin import config
from opendevin.logger import opendevin_logger as logger
from . import json

num_retries = config.get('LLM_NUM_RETRIES')
retry_min_wait = config.get('LLM_RETRY_MIN_WAIT')
retry_max_wait = config.get('LLM_RETRY_MAX_WAIT')

# llama-index includes a retry decorator around the openai get_embeddings() function.
# It is initialized with hard-coded values and errors, and this non-customizable behavior
# creates issues when it retries faster than providers' rate limits allow.
# This block attempts to banish it and replace it with our own decorator, so users can set their own limits.

if hasattr(llama_openai.get_embeddings, '__wrapped__'):
    original_get_embeddings = llama_openai.get_embeddings.__wrapped__
else:
    logger.warning('Cannot set custom retry limits.')
    num_retries = 1
    original_get_embeddings = llama_openai.get_embeddings


def attempt_on_error(retry_state):
    logger.error(f'{retry_state.outcome.exception()}. Attempt #{retry_state.attempt_number} | You can customize these settings in the configuration.', exc_info=False)
    return True


@retry(reraise=True,
       stop=stop_after_attempt(num_retries),
       wait=wait_random_exponential(min=retry_min_wait, max=retry_max_wait),
       retry=retry_if_exception_type((RateLimitError, APIConnectionError, InternalServerError)),
       after=attempt_on_error)
def wrapper_get_embeddings(*args, **kwargs):
    return original_get_embeddings(*args, **kwargs)


llama_openai.get_embeddings = wrapper_get_embeddings

embedding_strategy = config.get('LLM_EMBEDDING_MODEL')

# TODO: More embeddings: https://docs.llamaindex.ai/en/stable/examples/embeddings/OpenAI/
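For context on the decorator above, here is a small self-contained tenacity sketch, independent of OpenDevin; FlakyError and the attempt counter are made up for illustration, only the tenacity API is real:

```python
# Self-contained illustration of the tenacity retry pattern used above.
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential


class FlakyError(Exception):
    """Stand-in for a transient provider error such as RateLimitError."""


attempts = {'count': 0}


@retry(reraise=True,
       stop=stop_after_attempt(3),                  # give up after 3 attempts
       wait=wait_random_exponential(min=1, max=5),  # random exponential backoff, 1-5 s
       retry=retry_if_exception_type(FlakyError))
def flaky_call():
    attempts['count'] += 1
    if attempts['count'] < 3:
        raise FlakyError('transient failure')
    return 'ok'


print(flaky_call(), 'after', attempts['count'], 'attempts')  # -> ok after 3 attempts
```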
@@ -1,9 +1,9 @@
import traceback

from opendevin.llm.llm import LLM
from opendevin.exceptions import AgentEventTypeError
import agenthub.monologue_agent.utils.json as json
import agenthub.monologue_agent.utils.prompts as prompts
from opendevin.logger import opendevin_logger as logger


class Monologue:
@@ -53,7 +53,7 @@ class Monologue:
        try:
            total_length += len(json.dumps(t))
        except TypeError as e:
            print(f'Error serializing thought: {e}')
            logger.error('Error serializing thought: %s', str(e), exc_info=False)
        return total_length

    def condense(self, llm: LLM):
@@ -64,7 +64,7 @@
        - llm (LLM): llm to be used for summarization

        Raises:
        - RunTimeError: When the condensing process fails for any reason
        - Exception: the same exception as it got from the llm or processing the response
        """

        try:
@@ -74,5 +74,7 @@
            summary_resp = resp['choices'][0]['message']['content']
            self.thoughts = prompts.parse_summary_response(summary_resp)
        except Exception as e:
            traceback.print_exc()
            raise RuntimeError(f'Error condensing thoughts: {e}')
            logger.error('Error condensing thoughts: %s', str(e), exc_info=False)

            # TODO If the llm fails with ContextWindowExceededError, we can try to condense the monologue chunk by chunk
            raise
@@ -5,6 +5,9 @@ import toml
from dotenv import load_dotenv

from opendevin.schema import ConfigType
import logging

logger = logging.getLogger(__name__)

load_dotenv()

@@ -21,8 +24,9 @@ DEFAULT_CONFIG: dict = {
    ConfigType.LLM_EMBEDDING_MODEL: 'local',
    ConfigType.LLM_EMBEDDING_DEPLOYMENT_NAME: None,
    ConfigType.LLM_API_VERSION: None,
    ConfigType.LLM_NUM_RETRIES: 1,
    ConfigType.LLM_COOLDOWN_TIME: 1,
    ConfigType.LLM_NUM_RETRIES: 5,
    ConfigType.LLM_RETRY_MIN_WAIT: 3,
    ConfigType.LLM_RETRY_MAX_WAIT: 60,
    ConfigType.MAX_ITERATIONS: 100,
    # GPT-4 pricing is $10 per 1M input tokens. Since tokenization happens on LLM side,
    # we cannot easily count number of tokens, but we can count characters.
@@ -41,6 +45,16 @@ if os.path.exists('config.toml'):
    with open('config.toml', 'rb') as f:
        config_str = f.read().decode('utf-8')


def int_value(value, default, config_key):
    # FIXME use a library
    try:
        return int(value)
    except ValueError:
        logger.warning(f'Invalid value for {config_key}: {value} not applied. Using default value {default}')
        return default


tomlConfig = toml.loads(config_str)
config = DEFAULT_CONFIG.copy()
for k, v in config.items():
@@ -48,6 +62,8 @@ for k, v in config.items():
        config[k] = os.environ[k]
    elif k in tomlConfig:
        config[k] = tomlConfig[k]
    if k in [ConfigType.LLM_NUM_RETRIES, ConfigType.LLM_RETRY_MIN_WAIT, ConfigType.LLM_RETRY_MAX_WAIT]:
        config[k] = int_value(config[k], v, config_key=k)


def get_parser():
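As a standalone sanity check, the int_value helper from the hunk above can be exercised on its own; the logging setup and sample values below are illustrative only:

```python
# Standalone sketch of the int_value helper shown in the diff above.
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


def int_value(value, default, config_key):
    # Coerce a config value to int, falling back to the default on bad input.
    try:
        return int(value)
    except ValueError:
        logger.warning(f'Invalid value for {config_key}: {value} not applied. Using default value {default}')
        return default


print(int_value('7', 5, config_key='LLM_NUM_RETRIES'))         # -> 7
print(int_value('sixty', 60, config_key='LLM_RETRY_MAX_WAIT'))  # warns, -> 60
```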
@@ -1,10 +1,9 @@
import asyncio
import time
import traceback
from typing import Callable, List

from litellm.exceptions import APIConnectionError
from openai import AuthenticationError
from openai import AuthenticationError, APIConnectionError
from litellm import ContextWindowExceededError

from opendevin import config
from opendevin.action import (
@@ -170,26 +169,23 @@ class AgentController:
        observation: Observation = NullObservation('')
        try:
            action = self.agent.step(self.state)
            logger.info(action, extra={'msg_type': 'ACTION'})
            if action is None:
                raise AgentNoActionError()
            logger.info(action, extra={'msg_type': 'ACTION'})
        except Exception as e:
            observation = AgentErrorObservation(str(e))
            logger.error(e)
            logger.debug(traceback.format_exc())

            if isinstance(e, APIConnectionError):
                time.sleep(3)

            # raise specific exceptions that need to be handled outside
            # note: we are using AuthenticationError class from openai rather than
            # litellm because:
            # note: we are using classes from openai rather than litellm because:
            # 1) litellm.exceptions.AuthenticationError is a subclass of openai.AuthenticationError
            # 2) embeddings call, initiated by llama-index, has no wrapper for authentication
            # errors. This means we have to catch individual authentication errors
            # 2) embeddings call, initiated by llama-index, has no wrapper for errors.
            # This means we have to catch individual authentication errors
            # from different providers, and OpenAI is one of these.
            if isinstance(e, (AuthenticationError, AgentNoActionError)):
            if isinstance(e, (AuthenticationError, ContextWindowExceededError, APIConnectionError)):
                raise

        self.update_state_after_step()

        await self._run_callbacks(action)
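The class relationship asserted in the comment above can be verified with a short check (assuming both the openai and litellm packages are installed; per the comment, this should print True):

```python
# Quick check of the subclass relationship described in the comment above.
import litellm.exceptions
import openai

print(issubclass(litellm.exceptions.AuthenticationError, openai.AuthenticationError))  # expected: True
```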
@@ -1,30 +1,55 @@

from litellm import completion as litellm_completion
from tenacity import retry, retry_if_exception_type, stop_after_attempt
from litellm.exceptions import APIConnectionError, RateLimitError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential
from litellm.exceptions import APIConnectionError, RateLimitError, ServiceUnavailableError
from functools import partial

from opendevin import config
from opendevin.logger import llm_prompt_logger, llm_response_logger, opendevin_logger
from opendevin.logger import llm_prompt_logger, llm_response_logger
from opendevin.logger import opendevin_logger as logger


DEFAULT_API_KEY = config.get('LLM_API_KEY')
DEFAULT_BASE_URL = config.get('LLM_BASE_URL')
DEFAULT_MODEL_NAME = config.get('LLM_MODEL')
DEFAULT_LLM_NUM_RETRIES = config.get('LLM_NUM_RETRIES')
DEFAULT_LLM_COOLDOWN_TIME = config.get('LLM_COOLDOWN_TIME')
DEFAULT_API_VERSION = config.get('LLM_API_VERSION')
LLM_NUM_RETRIES = config.get('LLM_NUM_RETRIES')
LLM_RETRY_MIN_WAIT = config.get('LLM_RETRY_MIN_WAIT')
LLM_RETRY_MAX_WAIT = config.get('LLM_RETRY_MAX_WAIT')


class LLM:
    """
    The LLM class represents a Language Model instance.
    """

    def __init__(self,
                 model=DEFAULT_MODEL_NAME,
                 api_key=DEFAULT_API_KEY,
                 base_url=DEFAULT_BASE_URL,
                 num_retries=DEFAULT_LLM_NUM_RETRIES,
                 cooldown_time=DEFAULT_LLM_COOLDOWN_TIME,
                 api_version=DEFAULT_API_VERSION,
                 num_retries=LLM_NUM_RETRIES,
                 retry_min_wait=LLM_RETRY_MIN_WAIT,
                 retry_max_wait=LLM_RETRY_MAX_WAIT,
                 ):
        opendevin_logger.info(f'Initializing LLM with model: {model}')
        """
        Args:
            model (str, optional): The name of the language model. Defaults to LLM_MODEL.
            api_key (str, optional): The API key for accessing the language model. Defaults to LLM_API_KEY.
            base_url (str, optional): The base URL for the language model API. Defaults to LLM_BASE_URL. Not necessary for OpenAI.
            api_version (str, optional): The version of the API to use. Defaults to LLM_API_VERSION. Not necessary for OpenAI.
            num_retries (int, optional): The number of retries for API calls. Defaults to LLM_NUM_RETRIES.
            retry_min_wait (int, optional): The minimum time to wait between retries in seconds. Defaults to LLM_RETRY_MIN_WAIT.
            retry_max_wait (int, optional): The maximum time to wait between retries in seconds. Defaults to LLM_RETRY_MAX_WAIT.

        Attributes:
            model_name (str): The name of the language model.
            api_key (str): The API key for accessing the language model.
            base_url (str): The base URL for the language model API.
            api_version (str): The version of the API to use.
            completion (function): A decorator for the litellm completion function.
        """
        logger.info(f'Initializing LLM with model: {model}')
        self.model_name = model
        self.api_key = api_key
        self.base_url = base_url
@@ -35,15 +60,13 @@ class LLM:

        completion_unwrapped = self._completion

        def my_wait(retry_state):
            seconds = (retry_state.attempt_number) * cooldown_time
            opendevin_logger.warning(f'LLM error: {retry_state.outcome.exception()}')
            opendevin_logger.info(f'Attempt #{retry_state.attempt_number} | Sleeping for {seconds}s')
            return seconds
        def attempt_on_error(retry_state):
            logger.error(f'{retry_state.outcome.exception()}. Attempt #{retry_state.attempt_number} | You can customize these settings in the configuration.', exc_info=False)
            return True

        @retry(reraise=True,
               stop=stop_after_attempt(num_retries),
               wait=my_wait, retry=retry_if_exception_type((APIConnectionError, RateLimitError)))
               wait=wait_random_exponential(min=retry_min_wait, max=retry_max_wait), retry=retry_if_exception_type((RateLimitError, APIConnectionError, ServiceUnavailableError)), after=attempt_on_error)
        def wrapper(*args, **kwargs):
            if 'messages' in kwargs:
                messages = kwargs['messages']
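Putting the pieces together, here is a hypothetical usage sketch of the LLM wrapper defined above; the model name, prompt, and response handling are placeholders, and an API key would normally be supplied via LLM_API_KEY:

```python
# Hypothetical usage sketch of the LLM wrapper; model name and prompt are placeholders.
from opendevin.llm.llm import LLM

llm = LLM(
    model='gpt-4',      # placeholder model name
    num_retries=5,      # same knob as LLM_NUM_RETRIES
    retry_min_wait=3,   # seconds, as LLM_RETRY_MIN_WAIT
    retry_max_wait=60,  # seconds, as LLM_RETRY_MAX_WAIT
)

# llm.completion wraps litellm's completion with the retry decorator shown above.
resp = llm.completion(messages=[{'role': 'user', 'content': 'Say hello.'}])
print(resp['choices'][0]['message']['content'])
```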
@@ -15,7 +15,8 @@ class ConfigType(str, Enum):
    LLM_EMBEDDING_DEPLOYMENT_NAME = 'LLM_EMBEDDING_DEPLOYMENT_NAME'
    LLM_API_VERSION = 'LLM_API_VERSION'
    LLM_NUM_RETRIES = 'LLM_NUM_RETRIES'
    LLM_COOLDOWN_TIME = 'LLM_COOLDOWN_TIME'
    LLM_RETRY_MIN_WAIT = 'LLM_RETRY_MIN_WAIT'
    LLM_RETRY_MAX_WAIT = 'LLM_RETRY_MAX_WAIT'
    MAX_ITERATIONS = 'MAX_ITERATIONS'
    MAX_CHARS = 'MAX_CHARS'
    AGENT = 'AGENT'