Tweak connect exceptions (#1120)

* Clean up manual sleep

* Add default retries and document them.

* Add docstrings to llm

* Add exponential backoff for rate limiting errors

* Get embeddings for the action and its own content, not the user message

* Add a few fatal exceptions to stop the loop

* Stop loop when the step has no action

* Add action with content, no message, to history

* Make retry settings customizable

* Fix condense to stop the loop for the same reasons as completion

* Add 500-504 exceptions to retries

* Document the retry variables

* Add retries and limits for embeddings. Replaces llama-index hard-coded decorator.

* Rename to retry_min_wait and retry_max_wait
Engel Nyst 2024-04-22 04:00:01 +02:00 committed by GitHub
parent 1f2a845feb
commit 464bf7ee23
7 changed files with 113 additions and 34 deletions

View File

@@ -177,6 +177,11 @@ can only be as powerful as the models driving it--fortunately folks on our team
are actively working on building better open source models!
**Note on API retries and rate limits:**
Some LLMs have rate limits and may require retries. OpenDevin will automatically retry requests if it receives a 429 error or an API connection error.
You can set the `LLM_NUM_RETRIES`, `LLM_RETRY_MIN_WAIT`, and `LLM_RETRY_MAX_WAIT` environment variables to control the number of retries and the wait time between them.
By default, `LLM_NUM_RETRIES` is 5, and `LLM_RETRY_MIN_WAIT` and `LLM_RETRY_MAX_WAIT` are 3 and 60 seconds respectively.
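For illustration, a minimal sketch (not part of the README diff) of how these settings are consumed internally, mirroring the `config.get` calls introduced by this change; the commented values are the defaults:

```python
# illustrative sketch: reading the retry settings, as this change does internally
from opendevin import config

num_retries = config.get('LLM_NUM_RETRIES')        # default: 5
retry_min_wait = config.get('LLM_RETRY_MIN_WAIT')  # default: 3 seconds
retry_max_wait = config.get('LLM_RETRY_MAX_WAIT')  # default: 60 seconds
```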
## ⭐️ Research Strategy
Achieving full replication of production-grade applications with LLMs is a complex endeavor. Our strategy involves:

View File

@@ -1,3 +1,4 @@
import llama_index.embeddings.openai.base as llama_openai
from threading import Thread
import chromadb
@@ -5,11 +6,46 @@ from llama_index.core import Document
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential
from openai._exceptions import APIConnectionError, RateLimitError, InternalServerError
from opendevin import config
from opendevin.logger import opendevin_logger as logger
from . import json
num_retries = config.get('LLM_NUM_RETRIES')
retry_min_wait = config.get('LLM_RETRY_MIN_WAIT')
retry_max_wait = config.get('LLM_RETRY_MAX_WAIT')
# llama-index includes a retry decorator around the openai.get_embeddings() function.
# It is initialized with hard-coded values and error types; this non-customizable
# behavior causes issues when it retries faster than a provider's rate limits allow.
# This block attempts to banish it and replace it with our own decorator, so users can set their own limits.
if hasattr(llama_openai.get_embeddings, '__wrapped__'):
original_get_embeddings = llama_openai.get_embeddings.__wrapped__
else:
logger.warning('Cannot set custom retry limits.')
num_retries = 1
original_get_embeddings = llama_openai.get_embeddings
def attempt_on_error(retry_state):
logger.error(f'{retry_state.outcome.exception()}. Attempt #{retry_state.attempt_number} | You can customize these settings in the configuration.', exc_info=False)
return True
@retry(reraise=True,
stop=stop_after_attempt(num_retries),
wait=wait_random_exponential(min=retry_min_wait, max=retry_max_wait),
retry=retry_if_exception_type((RateLimitError, APIConnectionError, InternalServerError)),
after=attempt_on_error)
def wrapper_get_embeddings(*args, **kwargs):
return original_get_embeddings(*args, **kwargs)
llama_openai.get_embeddings = wrapper_get_embeddings
embedding_strategy = config.get('LLM_EMBEDDING_MODEL')
# TODO: More embeddings: https://docs.llamaindex.ai/en/stable/examples/embeddings/OpenAI/
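As a sanity check on the unwrap trick above: tenacity-based decorators (which the `__wrapped__` check assumes llama-index's retry decorator to be) apply `functools.wraps`, so the undecorated function stays reachable via `__wrapped__`. A toy demonstration with a stand-in function, not the real embeddings call:

```python
from tenacity import retry, stop_after_attempt

@retry(stop=stop_after_attempt(2))
def get_data():  # stand-in for llama_openai.get_embeddings
    return 42

# functools.wraps (used internally by tenacity) exposes the original function
assert hasattr(get_data, '__wrapped__')
assert get_data.__wrapped__() == 42  # calls the original, bypassing the built-in retry policy
```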

View File

@@ -1,9 +1,9 @@
import traceback
from opendevin.llm.llm import LLM
from opendevin.exceptions import AgentEventTypeError
import agenthub.monologue_agent.utils.json as json
import agenthub.monologue_agent.utils.prompts as prompts
from opendevin.logger import opendevin_logger as logger
class Monologue:
@@ -53,7 +53,7 @@ class Monologue:
try:
total_length += len(json.dumps(t))
except TypeError as e:
print(f'Error serializing thought: {e}')
logger.error('Error serializing thought: %s', str(e), exc_info=False)
return total_length
def condense(self, llm: LLM):
@@ -64,7 +64,7 @@
- llm (LLM): llm to be used for summarization
Raises:
- RunTimeError: When the condensing process fails for any reason
- Exception: the same exception as it got from the llm or processing the response
"""
try:
@@ -74,5 +74,7 @@
summary_resp = resp['choices'][0]['message']['content']
self.thoughts = prompts.parse_summary_response(summary_resp)
except Exception as e:
traceback.print_exc()
raise RuntimeError(f'Error condensing thoughts: {e}')
logger.error('Error condensing thoughts: %s', str(e), exc_info=False)
# TODO If the llm fails with ContextWindowExceededError, we can try to condense the monologue chunk by chunk
raise
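The pattern adopted here, log then bare `raise`, propagates the original exception with its traceback instead of masking it behind a new `RuntimeError`. A minimal standalone sketch (the function name is illustrative):

```python
import logging

logger = logging.getLogger(__name__)

def summarize():  # stand-in for the llm completion call in condense()
    raise ValueError('boom')

try:
    summarize()
except Exception as e:
    logger.error('Error condensing thoughts: %s', str(e), exc_info=False)
    raise  # re-raise the original exception unchanged for the caller to handle
```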

View File

@@ -5,6 +5,9 @@ import toml
from dotenv import load_dotenv
from opendevin.schema import ConfigType
import logging
logger = logging.getLogger(__name__)
load_dotenv()
@@ -21,8 +24,9 @@ DEFAULT_CONFIG: dict = {
ConfigType.LLM_EMBEDDING_MODEL: 'local',
ConfigType.LLM_EMBEDDING_DEPLOYMENT_NAME: None,
ConfigType.LLM_API_VERSION: None,
ConfigType.LLM_NUM_RETRIES: 1,
ConfigType.LLM_COOLDOWN_TIME: 1,
ConfigType.LLM_NUM_RETRIES: 5,
ConfigType.LLM_RETRY_MIN_WAIT: 3,
ConfigType.LLM_RETRY_MAX_WAIT: 60,
ConfigType.MAX_ITERATIONS: 100,
# GPT-4 pricing is $10 per 1M input tokens. Since tokenization happens on LLM side,
# we cannot easily count number of tokens, but we can count characters.
@@ -41,6 +45,16 @@ if os.path.exists('config.toml'):
with open('config.toml', 'rb') as f:
config_str = f.read().decode('utf-8')
def int_value(value, default, config_key):
# FIXME use a library
try:
return int(value)
except ValueError:
logger.warning(f'Invalid value for {config_key}: {value} not applied. Using default value {default}')
return default
tomlConfig = toml.loads(config_str)
config = DEFAULT_CONFIG.copy()
for k, v in config.items():
@@ -48,6 +62,8 @@ for k, v in config.items():
config[k] = os.environ[k]
elif k in tomlConfig:
config[k] = tomlConfig[k]
if k in [ConfigType.LLM_NUM_RETRIES, ConfigType.LLM_RETRY_MIN_WAIT, ConfigType.LLM_RETRY_MAX_WAIT]:
config[k] = int_value(config[k], v, config_key=k)
def get_parser():
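The fallback behavior of `int_value` above: a parseable string is converted, anything else logs a warning and keeps the default. A self-contained sketch that restates the helper so the asserts run on their own:

```python
import logging

logger = logging.getLogger(__name__)

def int_value(value, default, config_key):
    # same logic as the helper in the diff above
    try:
        return int(value)
    except ValueError:
        logger.warning(f'Invalid value for {config_key}: {value} not applied. Using default value {default}')
        return default

assert int_value('7', 5, 'LLM_NUM_RETRIES') == 7
assert int_value('not-a-number', 5, 'LLM_NUM_RETRIES') == 5  # warning logged, default kept
```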

View File

@@ -1,10 +1,9 @@
import asyncio
import time
import traceback
from typing import Callable, List
from litellm.exceptions import APIConnectionError
from openai import AuthenticationError
from openai import AuthenticationError, APIConnectionError
from litellm import ContextWindowExceededError
from opendevin import config
from opendevin.action import (
@@ -170,26 +169,23 @@ class AgentController:
observation: Observation = NullObservation('')
try:
action = self.agent.step(self.state)
logger.info(action, extra={'msg_type': 'ACTION'})
if action is None:
raise AgentNoActionError()
logger.info(action, extra={'msg_type': 'ACTION'})
except Exception as e:
observation = AgentErrorObservation(str(e))
logger.error(e)
logger.debug(traceback.format_exc())
if isinstance(e, APIConnectionError):
time.sleep(3)
# raise specific exceptions that need to be handled outside
# note: we are using AuthenticationError class from openai rather than
# litellm because:
# note: we are using classes from openai rather than litellm because:
# 1) litellm.exceptions.AuthenticationError is a subclass of openai.AuthenticationError
# 2) embeddings call, initiated by llama-index, has no wrapper for authentication
# errors. This means we have to catch individual authentication errors
# 2) embeddings call, initiated by llama-index, has no wrapper for errors.
# This means we have to catch individual authentication errors
# from different providers, and OpenAI is one of these.
if isinstance(e, (AuthenticationError, AgentNoActionError)):
if isinstance(e, (AuthenticationError, ContextWindowExceededError, APIConnectionError)):
raise
self.update_state_after_step()
await self._run_callbacks(action)
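A quick check of the subclass relationship the comment above relies on, assuming both packages are installed: because litellm's `AuthenticationError` subclasses openai's, one `except` clause on the openai class covers both.

```python
import litellm.exceptions
import openai

# catching openai.AuthenticationError also catches litellm's variant
assert issubclass(litellm.exceptions.AuthenticationError, openai.AuthenticationError)
```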

View File

@@ -1,30 +1,55 @@
from litellm import completion as litellm_completion
from tenacity import retry, retry_if_exception_type, stop_after_attempt
from litellm.exceptions import APIConnectionError, RateLimitError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential
from litellm.exceptions import APIConnectionError, RateLimitError, ServiceUnavailableError
from functools import partial
from opendevin import config
from opendevin.logger import llm_prompt_logger, llm_response_logger, opendevin_logger
from opendevin.logger import llm_prompt_logger, llm_response_logger
from opendevin.logger import opendevin_logger as logger
DEFAULT_API_KEY = config.get('LLM_API_KEY')
DEFAULT_BASE_URL = config.get('LLM_BASE_URL')
DEFAULT_MODEL_NAME = config.get('LLM_MODEL')
DEFAULT_LLM_NUM_RETRIES = config.get('LLM_NUM_RETRIES')
DEFAULT_LLM_COOLDOWN_TIME = config.get('LLM_COOLDOWN_TIME')
DEFAULT_API_VERSION = config.get('LLM_API_VERSION')
LLM_NUM_RETRIES = config.get('LLM_NUM_RETRIES')
LLM_RETRY_MIN_WAIT = config.get('LLM_RETRY_MIN_WAIT')
LLM_RETRY_MAX_WAIT = config.get('LLM_RETRY_MAX_WAIT')
class LLM:
"""
The LLM class represents a Language Model instance.
"""
def __init__(self,
model=DEFAULT_MODEL_NAME,
api_key=DEFAULT_API_KEY,
base_url=DEFAULT_BASE_URL,
num_retries=DEFAULT_LLM_NUM_RETRIES,
cooldown_time=DEFAULT_LLM_COOLDOWN_TIME,
api_version=DEFAULT_API_VERSION,
num_retries=LLM_NUM_RETRIES,
retry_min_wait=LLM_RETRY_MIN_WAIT,
retry_max_wait=LLM_RETRY_MAX_WAIT,
):
opendevin_logger.info(f'Initializing LLM with model: {model}')
"""
Args:
model (str, optional): The name of the language model. Defaults to LLM_MODEL.
api_key (str, optional): The API key for accessing the language model. Defaults to LLM_API_KEY.
base_url (str, optional): The base URL for the language model API. Defaults to LLM_BASE_URL. Not necessary for OpenAI.
api_version (str, optional): The version of the API to use. Defaults to LLM_API_VERSION. Not necessary for OpenAI.
num_retries (int, optional): The number of retries for API calls. Defaults to LLM_NUM_RETRIES.
retry_min_wait (int, optional): The minimum time to wait between retries in seconds. Defaults to LLM_RETRY_MIN_WAIT.
retry_max_wait (int, optional): The maximum time to wait between retries in seconds. Defaults to LLM_RETRY_MAX_WAIT.
Attributes:
model_name (str): The name of the language model.
api_key (str): The API key for accessing the language model.
base_url (str): The base URL for the language model API.
api_version (str): The version of the API to use.
completion (function): A retrying wrapper around the litellm completion function.
"""
logger.info(f'Initializing LLM with model: {model}')
self.model_name = model
self.api_key = api_key
self.base_url = base_url
@@ -35,15 +60,13 @@ class LLM:
completion_unwrapped = self._completion
def my_wait(retry_state):
seconds = (retry_state.attempt_number) * cooldown_time
opendevin_logger.warning(f'LLM error: {retry_state.outcome.exception()}')
opendevin_logger.info(f'Attempt #{retry_state.attempt_number} | Sleeping for {seconds}s')
return seconds
def attempt_on_error(retry_state):
logger.error(f'{retry_state.outcome.exception()}. Attempt #{retry_state.attempt_number} | You can customize these settings in the configuration.', exc_info=False)
return True
@retry(reraise=True,
stop=stop_after_attempt(num_retries),
wait=my_wait, retry=retry_if_exception_type((APIConnectionError, RateLimitError)))
wait=wait_random_exponential(min=retry_min_wait, max=retry_max_wait), retry=retry_if_exception_type((RateLimitError, APIConnectionError, ServiceUnavailableError)), after=attempt_on_error)
def wrapper(*args, **kwargs):
if 'messages' in kwargs:
messages = kwargs['messages']
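For reference, the retry policy above in isolation: stop after `num_retries` attempts, wait a random exponential interval bounded by `retry_min_wait`/`retry_max_wait`, retry only the listed transient errors, and re-raise the last exception. A minimal standalone sketch using the defaults from this change and a stand-in exception class:

```python
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential

class TransientError(Exception):
    """Stand-in for RateLimitError / APIConnectionError / ServiceUnavailableError."""

@retry(reraise=True,
       stop=stop_after_attempt(5),                   # LLM_NUM_RETRIES default
       wait=wait_random_exponential(min=3, max=60),  # LLM_RETRY_MIN_WAIT / LLM_RETRY_MAX_WAIT defaults
       retry=retry_if_exception_type(TransientError))
def flaky_completion():
    raise TransientError('simulated rate limit')  # always fails: tenacity retries, then re-raises
```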

View File

@@ -15,7 +15,8 @@ class ConfigType(str, Enum):
LLM_EMBEDDING_DEPLOYMENT_NAME = 'LLM_EMBEDDING_DEPLOYMENT_NAME'
LLM_API_VERSION = 'LLM_API_VERSION'
LLM_NUM_RETRIES = 'LLM_NUM_RETRIES'
LLM_COOLDOWN_TIME = 'LLM_COOLDOWN_TIME'
LLM_RETRY_MIN_WAIT = 'LLM_RETRY_MIN_WAIT'
LLM_RETRY_MAX_WAIT = 'LLM_RETRY_MAX_WAIT'
MAX_ITERATIONS = 'MAX_ITERATIONS'
MAX_CHARS = 'MAX_CHARS'
AGENT = 'AGENT'