Refactor response to action in agent step (#2350)

* refactor action parser * Fix typos * fix typo --------- Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
2025-12-26 05:48:36 +08:00 · 2024-06-10 18:17:30 +08:00 · 2024-06-10 18:17:30 +08:00 · f7491bd2fa
commit f7491bd2fa
parent 7fc57650f3
3 changed files with 263 additions and 52 deletions
--- a/agenthub/codeact_agent/action_parser.py
+++ b/agenthub/codeact_agent/action_parser.py
@ -0,0 +1,182 @@
+import re
+
+from opendevin.controller.action_parser import ActionParser, ResponseParser
+from opendevin.events.action import (
+    Action,
+    AgentDelegateAction,
+    AgentFinishAction,
+    CmdRunAction,
+    IPythonRunCellAction,
+    MessageAction,
+)
+
+
+class CodeActResponseParser(ResponseParser):
+    """
+    Parses an LLM response into exactly one of the following actions:
+        - CmdRunAction(command) - bash command to run
+        - IPythonRunCellAction(code) - IPython code to run
+        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+        - AgentFinishAction() - end the interaction
+    """
+
+    def __init__(
+        self,
+    ):
+        # Order matters: parse_action returns from the first parser whose check_condition is True
+        self.action_parsers = [
+            CodeActActionParserFinish(),
+            CodeActActionParserCmdRun(),
+            CodeActActionParserIPythonRunCell(),
+            CodeActActionParserAgentDelegate(),
+        ]
+        self.default_parser = CodeActActionParserMessage()  # used when no parser above matches
+
+    def parse(self, response: str) -> Action:
+        action_str = self.parse_response(response)
+        return self.parse_action(action_str)
+
+    def parse_response(self, response) -> str:
+        action = response.choices[0].message.content  # NOTE(review): read as a completion object, not a plain str
+        for lang in ['bash', 'ipython', 'browse']:
+            if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
+                action += f'</execute_{lang}>'  # opening tag without a closing tag: append the closing tag
+        return action
+
+    def parse_action(self, action_str: str) -> Action:
+        for action_parser in self.action_parsers:
+            if action_parser.check_condition(action_str):
+                return action_parser.parse(action_str)
+        return self.default_parser.parse(action_str)  # nothing matched: fall back to the default parser
+
+
+class CodeActActionParserFinish(ActionParser):
+    """
+    Handles action strings containing a <finish>...</finish> span:
+        - AgentFinishAction() - end the interaction
+    """
+
+    def __init__(
+        self,
+    ):
+        self.finish_command = None  # last regex match found by check_condition, if any
+
+    def check_condition(self, action_str: str) -> bool:
+        self.finish_command = re.search(r'<finish>.*</finish>', action_str, re.DOTALL)  # cached for parse()
+        return self.finish_command is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.finish_command is not None
+        ), 'self.finish_command should not be None when parse is called'
+        thought = action_str.replace(self.finish_command.group(0), '').strip()  # text outside the matched span
+        return AgentFinishAction(thought=thought)
+
+
+class CodeActActionParserCmdRun(ActionParser):
+    """
+    Handles action strings containing an <execute_bash>...</execute_bash> block:
+        - CmdRunAction(command) - bash command to run
+        - AgentFinishAction() - end the interaction (when the stripped command is `exit`)
+    """
+
+    def __init__(
+        self,
+    ):
+        self.bash_command = None  # last regex match found by check_condition, if any
+
+    def check_condition(self, action_str: str) -> bool:
+        self.bash_command = re.search(
+            r'<execute_bash>(.*?)</execute_bash>', action_str, re.DOTALL
+        )
+        return self.bash_command is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.bash_command is not None
+        ), 'self.bash_command should not be None when parse is called'
+        thought = action_str.replace(self.bash_command.group(0), '').strip()
+        # the thought is the text around the matched block; the command is the block's inner text
+        command_group = self.bash_command.group(1).strip()
+        if command_group.strip() == 'exit':
+            return AgentFinishAction()  # NOTE(review): thought is not passed along on this path
+        return CmdRunAction(command=command_group, thought=thought)
+
+
+class CodeActActionParserIPythonRunCell(ActionParser):
+    """
+    Handles action strings containing an <execute_ipython>...</execute_ipython> block:
+        - IPythonRunCellAction(code) - IPython code to run
+    """
+
+    def __init__(
+        self,
+    ):
+        self.python_code = None  # last regex match found by check_condition, if any
+        self.jupyter_kernel_init_code: str = 'from agentskills import *'  # passed as kernel_init_code
+
+    def check_condition(self, action_str: str) -> bool:
+        self.python_code = re.search(
+            r'<execute_ipython>(.*?)</execute_ipython>', action_str, re.DOTALL
+        )
+        return self.python_code is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.python_code is not None
+        ), 'self.python_code should not be None when parse is called'
+        code_group = self.python_code.group(1).strip()  # inner text of the matched block
+        thought = action_str.replace(self.python_code.group(0), '').strip()  # text around the block
+        return IPythonRunCellAction(
+            code=code_group,
+            thought=thought,
+            kernel_init_code=self.jupyter_kernel_init_code,
+        )
+
+
+class CodeActActionParserAgentDelegate(ActionParser):
+    """
+    Handles action strings containing an <execute_browse>...</execute_browse> block:
+        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+    """
+
+    def __init__(
+        self,
+    ):
+        self.agent_delegate = None  # last regex match found by check_condition, if any
+
+    def check_condition(self, action_str: str) -> bool:
+        self.agent_delegate = re.search(
+            r'<execute_browse>(.*)</execute_browse>', action_str, re.DOTALL  # greedy, unlike bash/ipython
+        )
+        return self.agent_delegate is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.agent_delegate is not None
+        ), 'self.agent_delegate should not be None when parse is called'
+        thought = action_str.replace(self.agent_delegate.group(0), '').strip()  # text around the block
+        browse_actions = self.agent_delegate.group(1).strip()  # inner text of the matched block
+        task = f'{thought}. I should start with: {browse_actions}'
+        return AgentDelegateAction(agent='BrowsingAgent', inputs={'task': task})
+
+
+class CodeActActionParserMessage(ActionParser):
+    """
+    Catch-all parser that accepts any action string:
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    """
+
+    def __init__(
+        self,
+    ):
+        pass  # no state to initialize
+
+    def check_condition(self, action_str: str) -> bool:
+        # We assume the LLM is GOOD enough that when it returns pure natural language
+        # it wants to talk to the user, so every action string is accepted
+        return True
+
+    def parse(self, action_str: str) -> Action:
+        return MessageAction(content=action_str, wait_for_response=True)  # whole string becomes the content
--- a/agenthub/codeact_agent/codeact_agent.py
+++ b/agenthub/codeact_agent/codeact_agent.py
@ -1,5 +1,4 @@
-import re
-
+from agenthub.codeact_agent.action_parser import CodeActResponseParser
 from agenthub.codeact_agent.prompt import (
    COMMAND_DOCS,
    EXAMPLES,
@ -11,7 +10,6 @@ from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.events.action import (
    Action,
-    AgentDelegateAction,
    AgentFinishAction,
    BrowseInteractiveAction,
    CmdRunAction,
@ -35,14 +33,6 @@ from opendevin.runtime.tools import RuntimeTool
 ENABLE_GITHUB = True


-def parse_response(response) -> str:
-    action = response.choices[0].message.content
-    for lang in ['bash', 'ipython', 'browse']:
-        if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
-            action += f'</execute_{lang}>'
-    return action
-
-
 def action_to_str(action: Action) -> str:
    if isinstance(action, CmdRunAction):
        return f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
@ -169,11 +159,12 @@ class CodeActAgent(Agent):
        JupyterRequirement(),
    ]
    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]
-    jupyter_kernel_init_code: str = 'from agentskills import *'

    system_message: str = get_system_message()
    in_context_example: str = f"Here is an example of how you can interact with the environment for task solving:\n{get_in_context_example()}\n\nNOW, LET'S START!"

+    action_parser = CodeActResponseParser()
+
    def __init__(
        self,
        llm: LLM,
@ -239,48 +230,10 @@ class CodeActAgent(Agent):
            ],
            temperature=0.0,
        )
-
-        action_str: str = parse_response(response)
        state.num_of_chars += sum(
            len(message['content']) for message in messages
-        ) + len(action_str)
-
-        if finish_command := re.search(r'<finish>.*</finish>', action_str, re.DOTALL):
-            thought = action_str.replace(finish_command.group(0), '').strip()
-            return AgentFinishAction(thought=thought)
-        if bash_command := re.search(
-            r'<execute_bash>(.*?)</execute_bash>', action_str, re.DOTALL
-        ):
-            # remove the command from the action string to get thought
-            thought = action_str.replace(bash_command.group(0), '').strip()
-            # a command was found
-            command_group = bash_command.group(1).strip()
-
-            if command_group.strip() == 'exit':
-                return AgentFinishAction()
-            return CmdRunAction(command=command_group, thought=thought)
-        elif python_code := re.search(
-            r'<execute_ipython>(.*?)</execute_ipython>', action_str, re.DOTALL
-        ):
-            # a code block was found
-            code_group = python_code.group(1).strip()
-            thought = action_str.replace(python_code.group(0), '').strip()
-            return IPythonRunCellAction(
-                code=code_group,
-                thought=thought,
-                kernel_init_code=self.jupyter_kernel_init_code,
-            )
-        elif browse_command := re.search(
-            r'<execute_browse>(.*)</execute_browse>', action_str, re.DOTALL
-        ):
-            thought = action_str.replace(browse_command.group(0), '').strip()
-            browse_actions = browse_command.group(1).strip()
-            task = f'{thought}. I should start with: {browse_actions}'
-            return AgentDelegateAction(agent='BrowsingAgent', inputs={'task': task})
-        else:
-            # We assume the LLM is GOOD enough that when it returns pure natural language
-            # it want to talk to the user
-            return MessageAction(content=action_str, wait_for_response=True)
+        ) + len(response.choices[0].message.content)
+        return self.action_parser.parse(response)

    def search_memory(self, query: str) -> list[str]:
        raise NotImplementedError('Implement this abstract method')
--- a/opendevin/controller/action_parser.py
+++ b/opendevin/controller/action_parser.py
@ -0,0 +1,76 @@
+from abc import ABC, abstractmethod
+
+from opendevin.events.action import Action
+
+
+class ResponseParser(ABC):
+    """
+    This abstract base class is a general interface for a response parser dedicated to
+    parsing the action from the response from the LLM.
+    """
+
+    def __init__(
+        self,
+    ):
+        # Need to pay attention to the item order in self.action_parsers
+        self.action_parsers = []
+
+    @abstractmethod
+    def parse(self, response: str) -> Action:
+        """
+        Parses the action from the response from the LLM.
+
+        Parameters:
+        - response (str): The response from the LLM.
+
+        Returns:
+        - action (Action): The action parsed from the response.
+        """
+        pass
+
+    @abstractmethod
+    def parse_response(self, response) -> str:
+        """
+        Extracts the action string from the response from the LLM.
+
+        Parameters:
+        - response: The response from the LLM (left unannotated in the signature).
+
+        Returns:
+        - action_str (str): The action str parsed from the response.
+        """
+        pass
+
+    @abstractmethod
+    def parse_action(self, action_str: str) -> Action:
+        """
+        Parses the action from the action string.
+
+        Parameters:
+        - action_str (str): The action string to turn into an action.
+
+        Returns:
+        - action (Action): The action parsed from the action string.
+        """
+        pass
+
+
+class ActionParser(ABC):
+    """
+    This abstract base class is a general interface for an action parser dedicated to
+    parsing the action from the action str from the LLM.
+    """
+
+    @abstractmethod
+    def check_condition(self, action_str: str) -> bool:
+        """
+        Check if the action string can be parsed by this parser.
+        """
+        pass
+
+    @abstractmethod
+    def parse(self, action_str: str) -> Action:
+        """
+        Parses the action from the action string from the LLM response.
+        """
+        pass