Tweak connect exceptions (#1120)
* Clean up manual sleep
* Add default retries and document them
* Add docstrings to llm
* Add exponential backoff for rate limiting errors
* Get embeddings for the action and its own content, not the user message
* Add a few bad exceptions to stop the loop
* Stop the loop when the step has no action
* Add action with content, no message, to history
* Make retry settings customizable
* Fix condense to stop the loop for the same reasons as completion
* Add 500-504 exceptions to retries
* Document the retry variables
* Add retries and limits for embeddings, replacing llama-index's hard-coded decorator
* Rename to retry_min_wait and retry_max_wait
This commit is contained in:
parent 1f2a845feb
commit 464bf7ee23
@@ -177,6 +177,11 @@ can only be as powerful as the models driving it--fortunately folks on our team
are actively working on building better open source models!

**Note on API retries and rate limits:**
Some LLMs have rate limits and may require retries. OpenDevin will automatically retry requests if it receives a 429 error or an API connection error.
You can set the LLM_NUM_RETRIES, LLM_RETRY_MIN_WAIT, and LLM_RETRY_MAX_WAIT environment variables to control the number of retries and the time between retries.
By default, LLM_NUM_RETRIES is 5, and LLM_RETRY_MIN_WAIT and LLM_RETRY_MAX_WAIT are 3 and 60 seconds, respectively.
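For example, a minimal Python sketch (assuming it runs inside an OpenDevin checkout, since it imports the project's own config module) that prints the effective retry settings:

```python
# Minimal sketch: print the retry settings as OpenDevin's config module resolves them.
# The printed values reflect any LLM_NUM_RETRIES / LLM_RETRY_MIN_WAIT / LLM_RETRY_MAX_WAIT
# overrides from the environment or config.toml.
from opendevin import config

print('retries:', config.get('LLM_NUM_RETRIES'))      # default 5
print('min wait:', config.get('LLM_RETRY_MIN_WAIT'))  # default 3 seconds
print('max wait:', config.get('LLM_RETRY_MAX_WAIT'))  # default 60 seconds
```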

## ⭐️ Research Strategy

Achieving full replication of production-grade applications with LLMs is a complex endeavor. Our strategy involves:
@@ -1,3 +1,4 @@
import llama_index.embeddings.openai.base as llama_openai
from threading import Thread

import chromadb
@@ -5,11 +6,46 @@ from llama_index.core import Document
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential
from openai._exceptions import APIConnectionError, RateLimitError, InternalServerError

from opendevin import config
from opendevin.logger import opendevin_logger as logger
from . import json

num_retries = config.get('LLM_NUM_RETRIES')
retry_min_wait = config.get('LLM_RETRY_MIN_WAIT')
retry_max_wait = config.get('LLM_RETRY_MAX_WAIT')

# llama-index includes a retry decorator around the openai get_embeddings() function.
# It is initialized with hard-coded values and errors, and this non-customizable behavior
# creates issues when it retries faster than providers' rate limits allow.
# This block attempts to banish it and replace it with our own decorator, so users can set their own limits.

if hasattr(llama_openai.get_embeddings, '__wrapped__'):
    original_get_embeddings = llama_openai.get_embeddings.__wrapped__
else:
    logger.warning('Cannot set custom retry limits.')
    num_retries = 1
    original_get_embeddings = llama_openai.get_embeddings


def attempt_on_error(retry_state):
    logger.error(f'{retry_state.outcome.exception()}. Attempt #{retry_state.attempt_number} | You can customize these settings in the configuration.', exc_info=False)
    return True


@retry(reraise=True,
       stop=stop_after_attempt(num_retries),
       wait=wait_random_exponential(min=retry_min_wait, max=retry_max_wait),
       retry=retry_if_exception_type((RateLimitError, APIConnectionError, InternalServerError)),
       after=attempt_on_error)
def wrapper_get_embeddings(*args, **kwargs):
    return original_get_embeddings(*args, **kwargs)


llama_openai.get_embeddings = wrapper_get_embeddings

embedding_strategy = config.get('LLM_EMBEDDING_MODEL')

# TODO: More embeddings: https://docs.llamaindex.ai/en/stable/examples/embeddings/OpenAI/
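For context on the decorator above, here is a small self-contained tenacity sketch, independent of OpenDevin; FlakyError and the attempt counter are made up for illustration, only the tenacity API is real:

```python
# Self-contained illustration of the tenacity retry pattern used above.
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential


class FlakyError(Exception):
    """Stand-in for a transient provider error such as RateLimitError."""


attempts = {'count': 0}


@retry(reraise=True,
       stop=stop_after_attempt(3),                  # give up after 3 attempts
       wait=wait_random_exponential(min=1, max=5),  # random exponential backoff, 1-5 s
       retry=retry_if_exception_type(FlakyError))
def flaky_call():
    attempts['count'] += 1
    if attempts['count'] < 3:
        raise FlakyError('transient failure')
    return 'ok'


print(flaky_call(), 'after', attempts['count'], 'attempts')  # -> ok after 3 attempts
```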
@@ -1,9 +1,9 @@
import traceback

from opendevin.llm.llm import LLM
from opendevin.exceptions import AgentEventTypeError
import agenthub.monologue_agent.utils.json as json
import agenthub.monologue_agent.utils.prompts as prompts
from opendevin.logger import opendevin_logger as logger


class Monologue:
@@ -53,7 +53,7 @@ class Monologue:
        try:
            total_length += len(json.dumps(t))
        except TypeError as e:
            print(f'Error serializing thought: {e}')
            logger.error('Error serializing thought: %s', str(e), exc_info=False)
        return total_length

    def condense(self, llm: LLM):
@@ -64,7 +64,7 @@
        - llm (LLM): llm to be used for summarization

        Raises:
        - RunTimeError: When the condensing process fails for any reason
        - Exception: the same exception as it got from the llm or processing the response
        """

        try:
@@ -74,5 +74,7 @@
            summary_resp = resp['choices'][0]['message']['content']
            self.thoughts = prompts.parse_summary_response(summary_resp)
        except Exception as e:
            traceback.print_exc()
            raise RuntimeError(f'Error condensing thoughts: {e}')
            logger.error('Error condensing thoughts: %s', str(e), exc_info=False)

            # TODO If the llm fails with ContextWindowExceededError, we can try to condense the monologue chunk by chunk
            raise
@@ -5,6 +5,9 @@ import toml
from dotenv import load_dotenv

from opendevin.schema import ConfigType
import logging

logger = logging.getLogger(__name__)

load_dotenv()

@@ -21,8 +24,9 @@ DEFAULT_CONFIG: dict = {
    ConfigType.LLM_EMBEDDING_MODEL: 'local',
    ConfigType.LLM_EMBEDDING_DEPLOYMENT_NAME: None,
    ConfigType.LLM_API_VERSION: None,
    ConfigType.LLM_NUM_RETRIES: 1,
    ConfigType.LLM_COOLDOWN_TIME: 1,
    ConfigType.LLM_NUM_RETRIES: 5,
    ConfigType.LLM_RETRY_MIN_WAIT: 3,
    ConfigType.LLM_RETRY_MAX_WAIT: 60,
    ConfigType.MAX_ITERATIONS: 100,
    # GPT-4 pricing is $10 per 1M input tokens. Since tokenization happens on LLM side,
    # we cannot easily count number of tokens, but we can count characters.
@@ -41,6 +45,16 @@ if os.path.exists('config.toml'):
    with open('config.toml', 'rb') as f:
        config_str = f.read().decode('utf-8')


def int_value(value, default, config_key):
    # FIXME use a library
    try:
        return int(value)
    except ValueError:
        logger.warning(f'Invalid value for {config_key}: {value} not applied. Using default value {default}')
        return default


tomlConfig = toml.loads(config_str)
config = DEFAULT_CONFIG.copy()
for k, v in config.items():
@@ -48,6 +62,8 @@ for k, v in config.items():
        config[k] = os.environ[k]
    elif k in tomlConfig:
        config[k] = tomlConfig[k]
    if k in [ConfigType.LLM_NUM_RETRIES, ConfigType.LLM_RETRY_MIN_WAIT, ConfigType.LLM_RETRY_MAX_WAIT]:
        config[k] = int_value(config[k], v, config_key=k)


def get_parser():
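As a standalone sanity check, the int_value helper from the hunk above can be exercised on its own; the logging setup and sample values below are illustrative only:

```python
# Standalone sketch of the int_value helper shown in the diff above.
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


def int_value(value, default, config_key):
    # Coerce a config value to int, falling back to the default on bad input.
    try:
        return int(value)
    except ValueError:
        logger.warning(f'Invalid value for {config_key}: {value} not applied. Using default value {default}')
        return default


print(int_value('7', 5, config_key='LLM_NUM_RETRIES'))         # -> 7
print(int_value('sixty', 60, config_key='LLM_RETRY_MAX_WAIT'))  # warns, -> 60
```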
@@ -1,10 +1,9 @@
import asyncio
import time
import traceback
from typing import Callable, List

from litellm.exceptions import APIConnectionError
from openai import AuthenticationError
from openai import AuthenticationError, APIConnectionError
from litellm import ContextWindowExceededError

from opendevin import config
from opendevin.action import (
@@ -170,26 +169,23 @@ class AgentController:
        observation: Observation = NullObservation('')
        try:
            action = self.agent.step(self.state)
            logger.info(action, extra={'msg_type': 'ACTION'})
            if action is None:
                raise AgentNoActionError()
            logger.info(action, extra={'msg_type': 'ACTION'})
        except Exception as e:
            observation = AgentErrorObservation(str(e))
            logger.error(e)
            logger.debug(traceback.format_exc())

            if isinstance(e, APIConnectionError):
                time.sleep(3)

            # raise specific exceptions that need to be handled outside
            # note: we are using AuthenticationError class from openai rather than
            # litellm because:
            # note: we are using classes from openai rather than litellm because:
            # 1) litellm.exceptions.AuthenticationError is a subclass of openai.AuthenticationError
            # 2) embeddings call, initiated by llama-index, has no wrapper for authentication
            # errors. This means we have to catch individual authentication errors
            # 2) embeddings call, initiated by llama-index, has no wrapper for errors.
            # This means we have to catch individual authentication errors
            # from different providers, and OpenAI is one of these.
            if isinstance(e, (AuthenticationError, AgentNoActionError)):
            if isinstance(e, (AuthenticationError, ContextWindowExceededError, APIConnectionError)):
                raise

        self.update_state_after_step()

        await self._run_callbacks(action)
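The class relationship asserted in the comment above can be verified with a short check (assuming both the openai and litellm packages are installed; per the comment, this should print True):

```python
# Quick check of the subclass relationship described in the comment above.
import litellm.exceptions
import openai

print(issubclass(litellm.exceptions.AuthenticationError, openai.AuthenticationError))  # expected: True
```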
@@ -1,30 +1,55 @@

from litellm import completion as litellm_completion
from tenacity import retry, retry_if_exception_type, stop_after_attempt
from litellm.exceptions import APIConnectionError, RateLimitError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential
from litellm.exceptions import APIConnectionError, RateLimitError, ServiceUnavailableError
from functools import partial

from opendevin import config
from opendevin.logger import llm_prompt_logger, llm_response_logger, opendevin_logger
from opendevin.logger import llm_prompt_logger, llm_response_logger
from opendevin.logger import opendevin_logger as logger


DEFAULT_API_KEY = config.get('LLM_API_KEY')
DEFAULT_BASE_URL = config.get('LLM_BASE_URL')
DEFAULT_MODEL_NAME = config.get('LLM_MODEL')
DEFAULT_LLM_NUM_RETRIES = config.get('LLM_NUM_RETRIES')
DEFAULT_LLM_COOLDOWN_TIME = config.get('LLM_COOLDOWN_TIME')
DEFAULT_API_VERSION = config.get('LLM_API_VERSION')
LLM_NUM_RETRIES = config.get('LLM_NUM_RETRIES')
LLM_RETRY_MIN_WAIT = config.get('LLM_RETRY_MIN_WAIT')
LLM_RETRY_MAX_WAIT = config.get('LLM_RETRY_MAX_WAIT')


class LLM:
    """
    The LLM class represents a Language Model instance.
    """

    def __init__(self,
                 model=DEFAULT_MODEL_NAME,
                 api_key=DEFAULT_API_KEY,
                 base_url=DEFAULT_BASE_URL,
                 num_retries=DEFAULT_LLM_NUM_RETRIES,
                 cooldown_time=DEFAULT_LLM_COOLDOWN_TIME,
                 api_version=DEFAULT_API_VERSION,
                 num_retries=LLM_NUM_RETRIES,
                 retry_min_wait=LLM_RETRY_MIN_WAIT,
                 retry_max_wait=LLM_RETRY_MAX_WAIT,
                 ):
        opendevin_logger.info(f'Initializing LLM with model: {model}')
        """
        Args:
            model (str, optional): The name of the language model. Defaults to LLM_MODEL.
            api_key (str, optional): The API key for accessing the language model. Defaults to LLM_API_KEY.
            base_url (str, optional): The base URL for the language model API. Defaults to LLM_BASE_URL. Not necessary for OpenAI.
            api_version (str, optional): The version of the API to use. Defaults to LLM_API_VERSION. Not necessary for OpenAI.
            num_retries (int, optional): The number of retries for API calls. Defaults to LLM_NUM_RETRIES.
            retry_min_wait (int, optional): The minimum time to wait between retries in seconds. Defaults to LLM_RETRY_MIN_WAIT.
            retry_max_wait (int, optional): The maximum time to wait between retries in seconds. Defaults to LLM_RETRY_MAX_WAIT.

        Attributes:
            model_name (str): The name of the language model.
            api_key (str): The API key for accessing the language model.
            base_url (str): The base URL for the language model API.
            api_version (str): The version of the API to use.
            completion (function): A decorator for the litellm completion function.
        """
        logger.info(f'Initializing LLM with model: {model}')
        self.model_name = model
        self.api_key = api_key
        self.base_url = base_url
@@ -35,15 +60,13 @@ class LLM:

        completion_unwrapped = self._completion

        def my_wait(retry_state):
            seconds = (retry_state.attempt_number) * cooldown_time
            opendevin_logger.warning(f'LLM error: {retry_state.outcome.exception()}')
            opendevin_logger.info(f'Attempt #{retry_state.attempt_number} | Sleeping for {seconds}s')
            return seconds
        def attempt_on_error(retry_state):
            logger.error(f'{retry_state.outcome.exception()}. Attempt #{retry_state.attempt_number} | You can customize these settings in the configuration.', exc_info=False)
            return True

        @retry(reraise=True,
               stop=stop_after_attempt(num_retries),
               wait=my_wait, retry=retry_if_exception_type((APIConnectionError, RateLimitError)))
               wait=wait_random_exponential(min=retry_min_wait, max=retry_max_wait), retry=retry_if_exception_type((RateLimitError, APIConnectionError, ServiceUnavailableError)), after=attempt_on_error)
        def wrapper(*args, **kwargs):
            if 'messages' in kwargs:
                messages = kwargs['messages']
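Putting the pieces together, here is a hypothetical usage sketch of the LLM wrapper defined above; the model name, prompt, and response handling are placeholders, and an API key would normally be supplied via LLM_API_KEY:

```python
# Hypothetical usage sketch of the LLM wrapper; model name and prompt are placeholders.
from opendevin.llm.llm import LLM

llm = LLM(
    model='gpt-4',      # placeholder model name
    num_retries=5,      # same knob as LLM_NUM_RETRIES
    retry_min_wait=3,   # seconds, as LLM_RETRY_MIN_WAIT
    retry_max_wait=60,  # seconds, as LLM_RETRY_MAX_WAIT
)

# llm.completion wraps litellm's completion with the retry decorator shown above.
resp = llm.completion(messages=[{'role': 'user', 'content': 'Say hello.'}])
print(resp['choices'][0]['message']['content'])
```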
@@ -15,7 +15,8 @@ class ConfigType(str, Enum):
    LLM_EMBEDDING_DEPLOYMENT_NAME = 'LLM_EMBEDDING_DEPLOYMENT_NAME'
    LLM_API_VERSION = 'LLM_API_VERSION'
    LLM_NUM_RETRIES = 'LLM_NUM_RETRIES'
    LLM_COOLDOWN_TIME = 'LLM_COOLDOWN_TIME'
    LLM_RETRY_MIN_WAIT = 'LLM_RETRY_MIN_WAIT'
    LLM_RETRY_MAX_WAIT = 'LLM_RETRY_MAX_WAIT'
    MAX_ITERATIONS = 'MAX_ITERATIONS'
    MAX_CHARS = 'MAX_CHARS'
    AGENT = 'AGENT'