refactor: codeact tools into separate files (#6978)

2025-12-26 05:48:36 +08:00 · 2025-02-26 17:57:14 -05:00 · 2025-02-26 17:57:14 -05:00 · c4ba54122e
commit c4ba54122e
parent 4b7cca9bdf
10 changed files with 476 additions and 443 deletions
--- a/openhands/agenthub/codeact_agent/function_calling.py
+++ b/openhands/agenthub/codeact_agent/function_calling.py
@@ -5,13 +5,20 @@ This is similar to the functionality of `CodeActResponseParser`.

 import json

-from browsergym.core.action.highlevel import HighLevelActionSet
 from litellm import (
    ChatCompletionToolParam,
-    ChatCompletionToolParamFunctionChunk,
    ModelResponse,
 )

+from openhands.agenthub.codeact_agent.tools import (
+    BrowserTool,
+    CmdRunTool,
+    FinishTool,
+    IPythonTool,
+    LLMBasedFileEditTool,
+    StrReplaceEditorTool,
+    WebReadTool,
+)
 from openhands.core.exceptions import (
    FunctionCallNotExistsError,
    FunctionCallValidationError,
@@ -31,438 +38,6 @@ from openhands.events.action import (
 from openhands.events.event import FileEditSource, FileReadSource
 from openhands.events.tool import ToolCallMetadata

-_BASH_DESCRIPTION = """Execute a bash command in the terminal.
-* Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`.
-* Interact with running process: If a bash command returns exit code `-1`, this means the process is not yet finished. By setting `is_input` to `true`, the assistant can interact with the running process and send empty `command` to retrieve any additional logs, or send additional text (set `command` to the text) to STDIN of the running process, or send command like `C-c` (Ctrl+C), `C-d` (Ctrl+D), `C-z` (Ctrl+Z) to interrupt the process.
-* One command at a time: You can only execute one bash command at a time. If you need to run multiple commands sequentially, you can use `&&` or `;` to chain them together.
-"""
-
-CmdRunTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='execute_bash',
-        description=_BASH_DESCRIPTION,
-        parameters={
-            'type': 'object',
-            'properties': {
-                'command': {
-                    'type': 'string',
-                    'description': 'The bash command to execute. Can be empty string to view additional logs when previous exit code is `-1`. Can be `C-c` (Ctrl+C) to interrupt the currently running process. Note: You can only execute one bash command at a time. If you need to run multiple commands sequentially, you can use `&&` or `;` to chain them together.',
-                },
-                'is_input': {
-                    'type': 'string',
-                    'description': 'If True, the command is an input to the running process. If False, the command is a bash command to be executed in the terminal. Default is False.',
-                    'enum': ['true', 'false'],
-                },
-            },
-            'required': ['command'],
-        },
-    ),
-)
-
-_IPYTHON_DESCRIPTION = """Run a cell of Python code in an IPython environment.
-* The assistant should define variables and import packages before using them.
-* The variable defined in the IPython environment will not be available outside the IPython environment (e.g., in terminal).
-"""
-
-IPythonTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='execute_ipython_cell',
-        description=_IPYTHON_DESCRIPTION,
-        parameters={
-            'type': 'object',
-            'properties': {
-                'code': {
-                    'type': 'string',
-                    'description': 'The Python code to execute. Supports magic commands like %pip.',
-                },
-            },
-            'required': ['code'],
-        },
-    ),
-)
-
-_FILE_EDIT_DESCRIPTION = """Edit a file in plain-text format.
-* The assistant can edit files by specifying the file path and providing a draft of the new file content.
-* The draft content doesn't need to be exactly the same as the existing file; the assistant may skip unchanged lines using comments like `# unchanged` to indicate unchanged sections.
-* IMPORTANT: For large files (e.g., > 300 lines), specify the range of lines to edit using `start` and `end` (1-indexed, inclusive). The range should be smaller than 300 lines.
-* To append to a file, set both `start` and `end` to `-1`.
-* If the file doesn't exist, a new file will be created with the provided content.
-
-**Example 1: general edit for short files**
-For example, given an existing file `/path/to/file.py` that looks like this:
-(this is the end of the file)
-1|class MyClass:
-2|    def __init__(self):
-3|        self.x = 1
-4|        self.y = 2
-5|        self.z = 3
-6|
-7|print(MyClass().z)
-8|print(MyClass().x)
-(this is the end of the file)
-
-The assistant wants to edit the file to look like this:
-(this is the end of the file)
-1|class MyClass:
-2|    def __init__(self):
-3|        self.x = 1
-4|        self.y = 2
-5|
-6|print(MyClass().y)
-(this is the end of the file)
-
-The assistant may produce an edit action like this:
-path="/path/to/file.txt" start=1 end=-1
-content=```
-class MyClass:
-    def __init__(self):
-        # no changes before
-        self.y = 2
-        # self.z is removed
-
-# MyClass().z is removed
-print(MyClass().y)
-```
-
-**Example 2: append to file for short files**
-For example, given an existing file `/path/to/file.py` that looks like this:
-(this is the end of the file)
-1|class MyClass:
-2|    def __init__(self):
-3|        self.x = 1
-4|        self.y = 2
-5|        self.z = 3
-6|
-7|print(MyClass().z)
-8|print(MyClass().x)
-(this is the end of the file)
-
-To append the following lines to the file:
-```python
-print(MyClass().y)
-```
-
-The assistant may produce an edit action like this:
-path="/path/to/file.txt" start=-1 end=-1
-content=```
-print(MyClass().y)
-```
-
-**Example 3: edit for long files**
-
-Given an existing file `/path/to/file.py` that looks like this:
-(1000 more lines above)
-1001|class MyClass:
-1002|    def __init__(self):
-1003|        self.x = 1
-1004|        self.y = 2
-1005|        self.z = 3
-1006|
-1007|print(MyClass().z)
-1008|print(MyClass().x)
-(2000 more lines below)
-
-The assistant wants to edit the file to look like this:
-
-(1000 more lines above)
-1001|class MyClass:
-1002|    def __init__(self):
-1003|        self.x = 1
-1004|        self.y = 2
-1005|
-1006|print(MyClass().y)
-(2000 more lines below)
-
-The assistant may produce an edit action like this:
-path="/path/to/file.txt" start=1001 end=1008
-content=```
-class MyClass:
-    def __init__(self):
-        # no changes before
-        self.y = 2
-        # self.z is removed
-
-# MyClass().z is removed
-print(MyClass().y)
-```
-"""
-
-LLMBasedFileEditTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='edit_file',
-        description=_FILE_EDIT_DESCRIPTION,
-        parameters={
-            'type': 'object',
-            'properties': {
-                'path': {
-                    'type': 'string',
-                    'description': 'The absolute path to the file to be edited.',
-                },
-                'content': {
-                    'type': 'string',
-                    'description': 'A draft of the new content for the file being edited. Note that the assistant may skip unchanged lines.',
-                },
-                'start': {
-                    'type': 'integer',
-                    'description': 'The starting line number for the edit (1-indexed, inclusive). Default is 1.',
-                },
-                'end': {
-                    'type': 'integer',
-                    'description': 'The ending line number for the edit (1-indexed, inclusive). Default is -1 (end of file).',
-                },
-            },
-            'required': ['path', 'content'],
-        },
-    ),
-)
-
-_STR_REPLACE_EDITOR_DESCRIPTION = """Custom editing tool for viewing, creating and editing files in plain-text format
-* State is persistent across command calls and discussions with the user
-* If `path` is a file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep
-* The `create` command cannot be used if the specified `path` already exists as a file
-* If a `command` generates a long output, it will be truncated and marked with `<response clipped>`
-* The `undo_edit` command will revert the last edit made to the file at `path`
-
-Notes for using the `str_replace` command:
-* The `old_str` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces!
-* If the `old_str` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in `old_str` to make it unique
-* The `new_str` parameter should contain the edited lines that should replace the `old_str`
-"""
-
-StrReplaceEditorTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='str_replace_editor',
-        description=_STR_REPLACE_EDITOR_DESCRIPTION,
-        parameters={
-            'type': 'object',
-            'properties': {
-                'command': {
-                    'description': 'The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.',
-                    'enum': ['view', 'create', 'str_replace', 'insert', 'undo_edit'],
-                    'type': 'string',
-                },
-                'path': {
-                    'description': 'Absolute path to file or directory, e.g. `/workspace/file.py` or `/workspace`.',
-                    'type': 'string',
-                },
-                'file_text': {
-                    'description': 'Required parameter of `create` command, with the content of the file to be created.',
-                    'type': 'string',
-                },
-                'old_str': {
-                    'description': 'Required parameter of `str_replace` command containing the string in `path` to replace.',
-                    'type': 'string',
-                },
-                'new_str': {
-                    'description': 'Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.',
-                    'type': 'string',
-                },
-                'insert_line': {
-                    'description': 'Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.',
-                    'type': 'integer',
-                },
-                'view_range': {
-                    'description': 'Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.',
-                    'items': {'type': 'integer'},
-                    'type': 'array',
-                },
-            },
-            'required': ['command', 'path'],
-        },
-    ),
-)
-
-
-_WEB_DESCRIPTION = """Read (convert to markdown) content from a webpage. You should prefer using the `web_read` tool over the `browser` tool, but do use the `browser` tool if you need to interact with a webpage (e.g., click a button, fill out a form, etc.).
-
-You may use the `web_read` tool to read content from a webpage, and even search the webpage content using a Google search query (e.g., url=`https://www.google.com/search?q=YOUR_QUERY`).
-"""
-
-WebReadTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='web_read',
-        description=_WEB_DESCRIPTION,
-        parameters={
-            'type': 'object',
-            'properties': {
-                'url': {
-                    'type': 'string',
-                    'description': 'The URL of the webpage to read. You can also use a Google search query here (e.g., `https://www.google.com/search?q=YOUR_QUERY`).',
-                }
-            },
-            'required': ['url'],
-        },
-    ),
-)
-
-# from browsergym/core/action/highlevel.py
-_browser_action_space = HighLevelActionSet(
-    subsets=['bid', 'nav'],
-    strict=False,  # less strict on the parsing of the actions
-    multiaction=True,  # enable to agent to take multiple actions at once
-)
-
-
-_BROWSER_DESCRIPTION = """Interact with the browser using Python code. Use it ONLY when you need to interact with a webpage.
-
-See the description of "code" parameter for more details.
-
-Multiple actions can be provided at once, but will be executed sequentially without any feedback from the page.
-More than 2-3 actions usually leads to failure or unexpected behavior. Example:
-fill('a12', 'example with "quotes"')
-click('a51')
-click('48', button='middle', modifiers=['Shift'])
-"""
-
-_BROWSER_TOOL_DESCRIPTION = """
-The following 15 functions are available. Nothing else is supported.
-
-goto(url: str)
-    Description: Navigate to a url.
-    Examples:
-        goto('http://www.example.com')
-
-go_back()
-    Description: Navigate to the previous page in history.
-    Examples:
-        go_back()
-
-go_forward()
-    Description: Navigate to the next page in history.
-    Examples:
-        go_forward()
-
-noop(wait_ms: float = 1000)
-    Description: Do nothing, and optionally wait for the given time (in milliseconds).
-    You can use this to get the current page content and/or wait for the page to load.
-    Examples:
-        noop()
-
-        noop(500)
-
-scroll(delta_x: float, delta_y: float)
-    Description: Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event.
-    Examples:
-        scroll(0, 200)
-
-        scroll(-50.2, -100.5)
-
-fill(bid: str, value: str)
-    Description: Fill out a form field. It focuses the element and triggers an input event with the entered text. It works for <input>, <textarea> and [contenteditable] elements.
-    Examples:
-        fill('237', 'example value')
-
-        fill('45', 'multi-line\nexample')
-
-        fill('a12', 'example with "quotes"')
-
-select_option(bid: str, options: str | list[str])
-    Description: Select one or multiple options in a <select> element. You can specify option value or label to select. Multiple options can be selected.
-    Examples:
-        select_option('a48', 'blue')
-
-        select_option('c48', ['red', 'green', 'blue'])
-
-click(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
-    Description: Click an element.
-    Examples:
-        click('a51')
-
-        click('b22', button='right')
-
-        click('48', button='middle', modifiers=['Shift'])
-
-dblclick(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
-    Description: Double click an element.
-    Examples:
-        dblclick('12')
-
-        dblclick('ca42', button='right')
-
-        dblclick('178', button='middle', modifiers=['Shift'])
-
-hover(bid: str)
-    Description: Hover over an element.
-    Examples:
-        hover('b8')
-
-press(bid: str, key_comb: str)
-    Description: Focus the matching element and press a combination of keys. It accepts the logical key names that are emitted in the keyboardEvent.key property of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab, Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp, ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can alternatively specify a single character you'd like to produce such as "a" or "#". Following modification shortcuts are also supported: Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on Windows and Linux and to Meta on macOS.
-    Examples:
-        press('88', 'Backspace')
-
-        press('a26', 'ControlOrMeta+a')
-
-        press('a61', 'Meta+Shift+t')
-
-focus(bid: str)
-    Description: Focus the matching element.
-    Examples:
-        focus('b455')
-
-clear(bid: str)
-    Description: Clear the input field.
-    Examples:
-        clear('996')
-
-drag_and_drop(from_bid: str, to_bid: str)
-    Description: Perform a drag & drop. Hover the element that will be dragged. Press left mouse button. Move mouse to the element that will receive the drop. Release left mouse button.
-    Examples:
-        drag_and_drop('56', '498')
-
-upload_file(bid: str, file: str | list[str])
-    Description: Click an element and wait for a "filechooser" event, then select one or multiple input files for upload. Relative file paths are resolved relative to the current working directory. An empty list clears the selected files.
-    Examples:
-        upload_file('572', '/home/user/my_receipt.pdf')
-
-        upload_file('63', ['/home/bob/Documents/image.jpg', '/home/bob/Documents/file.zip'])
-"""
-
-
-for _, action in _browser_action_space.action_set.items():
-    assert (
-        action.signature in _BROWSER_TOOL_DESCRIPTION
-    ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}'
-    assert (
-        action.description in _BROWSER_TOOL_DESCRIPTION
-    ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}'
-
-BrowserTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='browser',
-        description=_BROWSER_DESCRIPTION,
-        parameters={
-            'type': 'object',
-            'properties': {
-                'code': {
-                    'type': 'string',
-                    'description': (
-                        'The Python code that interacts with the browser.\n'
-                        + _BROWSER_TOOL_DESCRIPTION
-                    ),
-                }
-            },
-            'required': ['code'],
-        },
-    ),
-)
-
-_FINISH_DESCRIPTION = """Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task."""
-
-FinishTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='finish',
-        description=_FINISH_DESCRIPTION,
-    ),
-)
-

 def combine_thought(action: Action, thought: str) -> Action:
    if not hasattr(action, 'thought'):
@@ -496,7 +71,7 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
                raise RuntimeError(
                    f'Failed to parse tool call arguments: {tool_call.function.arguments}'
                ) from e
-            if tool_call.function.name == 'execute_bash':
+            if tool_call.function.name == CmdRunTool['function']['name']:
                if 'command' not in arguments:
                    raise FunctionCallValidationError(
                        f'Missing required argument "command" in tool call {tool_call.function.name}'
@@ -504,7 +79,7 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
                # convert is_input to boolean
                is_input = arguments.get('is_input', 'false') == 'true'
                action = CmdRunAction(command=arguments['command'], is_input=is_input)
-            elif tool_call.function.name == 'execute_ipython_cell':
+            elif tool_call.function.name == IPythonTool['function']['name']:
                if 'code' not in arguments:
                    raise FunctionCallValidationError(
                        f'Missing required argument "code" in tool call {tool_call.function.name}'
@@ -515,9 +90,9 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
                    agent='BrowsingAgent',
                    inputs=arguments,
                )
-            elif tool_call.function.name == 'finish':
+            elif tool_call.function.name == FinishTool['function']['name']:
                action = AgentFinishAction()
-            elif tool_call.function.name == 'edit_file':
+            elif tool_call.function.name == LLMBasedFileEditTool['function']['name']:
                if 'path' not in arguments:
                    raise FunctionCallValidationError(
                        f'Missing required argument "path" in tool call {tool_call.function.name}'
@@ -532,7 +107,7 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
                    start=arguments.get('start', 1),
                    end=arguments.get('end', -1),
                )
-            elif tool_call.function.name == 'str_replace_editor':
+            elif tool_call.function.name == StrReplaceEditorTool['function']['name']:
                if 'command' not in arguments:
                    raise FunctionCallValidationError(
                        f'Missing required argument "command" in tool call {tool_call.function.name}'
@@ -563,13 +138,13 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
                        impl_source=FileEditSource.OH_ACI,
                        **other_kwargs,
                    )
-            elif tool_call.function.name == 'browser':
+            elif tool_call.function.name == BrowserTool['function']['name']:
                if 'code' not in arguments:
                    raise FunctionCallValidationError(
                        f'Missing required argument "code" in tool call {tool_call.function.name}'
                    )
                action = BrowseInteractiveAction(browser_actions=arguments['code'])
-            elif tool_call.function.name == 'web_read':
+            elif tool_call.function.name == WebReadTool['function']['name']:
                if 'url' not in arguments:
                    raise FunctionCallValidationError(
                        f'Missing required argument "url" in tool call {tool_call.function.name}'
--- a/openhands/agenthub/codeact_agent/tools/__init__.py
+++ b/openhands/agenthub/codeact_agent/tools/__init__.py
@@ -0,0 +1,17 @@
+from .bash import CmdRunTool
+from .browser import BrowserTool
+from .finish import FinishTool
+from .ipython import IPythonTool
+from .llm_based_edit import LLMBasedFileEditTool
+from .str_replace_editor import StrReplaceEditorTool
+from .web_read import WebReadTool
+
+__all__ = [
+    'BrowserTool',
+    'CmdRunTool',
+    'FinishTool',
+    'IPythonTool',
+    'LLMBasedFileEditTool',
+    'StrReplaceEditorTool',
+    'WebReadTool',
+]
--- a/openhands/agenthub/codeact_agent/tools/bash.py
+++ b/openhands/agenthub/codeact_agent/tools/bash.py
@@ -0,0 +1,30 @@
+from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk
+
+_BASH_DESCRIPTION = """Execute a bash command in the terminal.
+* Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`.
+* Interact with running process: If a bash command returns exit code `-1`, this means the process is not yet finished. By setting `is_input` to `true`, the assistant can interact with the running process and send empty `command` to retrieve any additional logs, or send additional text (set `command` to the text) to STDIN of the running process, or send command like `C-c` (Ctrl+C), `C-d` (Ctrl+D), `C-z` (Ctrl+Z) to interrupt the process.
+* One command at a time: You can only execute one bash command at a time. If you need to run multiple commands sequentially, you can use `&&` or `;` to chain them together.
+"""
+
+CmdRunTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='execute_bash',
+        description=_BASH_DESCRIPTION,
+        parameters={
+            'type': 'object',
+            'properties': {
+                'command': {
+                    'type': 'string',
+                    'description': 'The bash command to execute. Can be empty string to view additional logs when previous exit code is `-1`. Can be `C-c` (Ctrl+C) to interrupt the currently running process. Note: You can only execute one bash command at a time. If you need to run multiple commands sequentially, you can use `&&` or `;` to chain them together.',
+                },
+                'is_input': {
+                    'type': 'string',
+                    'description': 'If True, the command is an input to the running process. If False, the command is a bash command to be executed in the terminal. Default is False.',
+                    'enum': ['true', 'false'],
+                },
+            },
+            'required': ['command'],
+        },
+    ),
+)
--- a/openhands/agenthub/codeact_agent/tools/browser.py
+++ b/openhands/agenthub/codeact_agent/tools/browser.py
@@ -0,0 +1,155 @@
+from browsergym.core.action.highlevel import HighLevelActionSet
+from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk
+
+# from browsergym/core/action/highlevel.py
+_browser_action_space = HighLevelActionSet(
+    subsets=['bid', 'nav'],
+    strict=False,  # less strict on the parsing of the actions
+    multiaction=True,  # enable to agent to take multiple actions at once
+)
+
+
+_BROWSER_DESCRIPTION = """Interact with the browser using Python code. Use it ONLY when you need to interact with a webpage.
+
+See the description of "code" parameter for more details.
+
+Multiple actions can be provided at once, but will be executed sequentially without any feedback from the page.
+More than 2-3 actions usually leads to failure or unexpected behavior. Example:
+fill('a12', 'example with "quotes"')
+click('a51')
+click('48', button='middle', modifiers=['Shift'])
+"""
+
+_BROWSER_TOOL_DESCRIPTION = """
+The following 15 functions are available. Nothing else is supported.
+
+goto(url: str)
+    Description: Navigate to a url.
+    Examples:
+        goto('http://www.example.com')
+
+go_back()
+    Description: Navigate to the previous page in history.
+    Examples:
+        go_back()
+
+go_forward()
+    Description: Navigate to the next page in history.
+    Examples:
+        go_forward()
+
+noop(wait_ms: float = 1000)
+    Description: Do nothing, and optionally wait for the given time (in milliseconds).
+    You can use this to get the current page content and/or wait for the page to load.
+    Examples:
+        noop()
+
+        noop(500)
+
+scroll(delta_x: float, delta_y: float)
+    Description: Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event.
+    Examples:
+        scroll(0, 200)
+
+        scroll(-50.2, -100.5)
+
+fill(bid: str, value: str)
+    Description: Fill out a form field. It focuses the element and triggers an input event with the entered text. It works for <input>, <textarea> and [contenteditable] elements.
+    Examples:
+        fill('237', 'example value')
+
+        fill('45', 'multi-line\nexample')
+
+        fill('a12', 'example with "quotes"')
+
+select_option(bid: str, options: str | list[str])
+    Description: Select one or multiple options in a <select> element. You can specify option value or label to select. Multiple options can be selected.
+    Examples:
+        select_option('a48', 'blue')
+
+        select_option('c48', ['red', 'green', 'blue'])
+
+click(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
+    Description: Click an element.
+    Examples:
+        click('a51')
+
+        click('b22', button='right')
+
+        click('48', button='middle', modifiers=['Shift'])
+
+dblclick(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
+    Description: Double click an element.
+    Examples:
+        dblclick('12')
+
+        dblclick('ca42', button='right')
+
+        dblclick('178', button='middle', modifiers=['Shift'])
+
+hover(bid: str)
+    Description: Hover over an element.
+    Examples:
+        hover('b8')
+
+press(bid: str, key_comb: str)
+    Description: Focus the matching element and press a combination of keys. It accepts the logical key names that are emitted in the keyboardEvent.key property of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab, Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp, ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can alternatively specify a single character you'd like to produce such as "a" or "#". Following modification shortcuts are also supported: Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on Windows and Linux and to Meta on macOS.
+    Examples:
+        press('88', 'Backspace')
+
+        press('a26', 'ControlOrMeta+a')
+
+        press('a61', 'Meta+Shift+t')
+
+focus(bid: str)
+    Description: Focus the matching element.
+    Examples:
+        focus('b455')
+
+clear(bid: str)
+    Description: Clear the input field.
+    Examples:
+        clear('996')
+
+drag_and_drop(from_bid: str, to_bid: str)
+    Description: Perform a drag & drop. Hover the element that will be dragged. Press left mouse button. Move mouse to the element that will receive the drop. Release left mouse button.
+    Examples:
+        drag_and_drop('56', '498')
+
+upload_file(bid: str, file: str | list[str])
+    Description: Click an element and wait for a "filechooser" event, then select one or multiple input files for upload. Relative file paths are resolved relative to the current working directory. An empty list clears the selected files.
+    Examples:
+        upload_file('572', '/home/user/my_receipt.pdf')
+
+        upload_file('63', ['/home/bob/Documents/image.jpg', '/home/bob/Documents/file.zip'])
+"""
+
+
+for _, action in _browser_action_space.action_set.items():
+    assert (
+        action.signature in _BROWSER_TOOL_DESCRIPTION
+    ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}'
+    assert (
+        action.description in _BROWSER_TOOL_DESCRIPTION
+    ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}'
+
+BrowserTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='browser',
+        description=_BROWSER_DESCRIPTION,
+        parameters={
+            'type': 'object',
+            'properties': {
+                'code': {
+                    'type': 'string',
+                    'description': (
+                        'The Python code that interacts with the browser.\n'
+                        + _BROWSER_TOOL_DESCRIPTION
+                    ),
+                }
+            },
+            'required': ['code'],
+        },
+    ),
+)
--- a/openhands/agenthub/codeact_agent/tools/finish.py
+++ b/openhands/agenthub/codeact_agent/tools/finish.py
@ -0,0 +1,11 @@
+from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk
+
+# Description text attached to the `finish` tool.
+_FINISH_DESCRIPTION = (
+    'Finish the interaction when the task is complete OR if the assistant '
+    'cannot proceed further with the task.'
+)
+
+# Function-calling schema for the `finish` tool; it declares no parameters.
+FinishTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='finish',
+        description=_FINISH_DESCRIPTION,
+    ),
+)
--- a/openhands/agenthub/codeact_agent/tools/ipython.py
+++ b/openhands/agenthub/codeact_agent/tools/ipython.py
@ -0,0 +1,24 @@
+from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk
+
+# Description text attached to the `execute_ipython_cell` tool. Each piece ends
+# with an explicit newline so the assembled text keeps its bullet layout.
+_IPYTHON_DESCRIPTION = (
+    'Run a cell of Python code in an IPython environment.\n'
+    '* The assistant should define variables and import packages before using them.\n'
+    '* The variable defined in the IPython environment will not be available outside the IPython environment (e.g., in terminal).\n'
+)
+
+# Function-calling schema for the `execute_ipython_cell` tool: one required
+# string argument, `code`.
+IPythonTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='execute_ipython_cell',
+        description=_IPYTHON_DESCRIPTION,
+        parameters={
+            'type': 'object',
+            'properties': {
+                'code': {
+                    'type': 'string',
+                    'description': (
+                        'The Python code to execute. '
+                        'Supports magic commands like %pip.'
+                    ),
+                },
+            },
+            'required': ['code'],
+        },
+    ),
+)
--- a/openhands/agenthub/codeact_agent/tools/llm_based_edit.py
+++ b/openhands/agenthub/codeact_agent/tools/llm_based_edit.py
@ -0,0 +1,137 @@
+from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk
+
+# Description text attached to the `edit_file` tool. The three worked examples
+# show a file listing (with begin/end or "more lines" markers) followed by the
+# edit action the assistant may produce; every example edits the same
+# `/path/to/file.py` that its listing shows.
+_FILE_EDIT_DESCRIPTION = """Edit a file in plain-text format.
+* The assistant can edit files by specifying the file path and providing a draft of the new file content.
+* The draft content doesn't need to be exactly the same as the existing file; the assistant may skip unchanged lines using comments like `# unchanged` to indicate unchanged sections.
+* IMPORTANT: For large files (e.g., > 300 lines), specify the range of lines to edit using `start` and `end` (1-indexed, inclusive). The range should be smaller than 300 lines.
+* To append to a file, set both `start` and `end` to `-1`.
+* If the file doesn't exist, a new file will be created with the provided content.
+
+**Example 1: general edit for short files**
+For example, given an existing file `/path/to/file.py` that looks like this:
+(this is the beginning of the file)
+1|class MyClass:
+2|    def __init__(self):
+3|        self.x = 1
+4|        self.y = 2
+5|        self.z = 3
+6|
+7|print(MyClass().z)
+8|print(MyClass().x)
+(this is the end of the file)
+
+The assistant wants to edit the file to look like this:
+(this is the beginning of the file)
+1|class MyClass:
+2|    def __init__(self):
+3|        self.x = 1
+4|        self.y = 2
+5|
+6|print(MyClass().y)
+(this is the end of the file)
+
+The assistant may produce an edit action like this:
+path="/path/to/file.py" start=1 end=-1
+content=```
+class MyClass:
+    def __init__(self):
+        # no changes before
+        self.y = 2
+        # self.z is removed
+
+# MyClass().z is removed
+print(MyClass().y)
+```
+
+**Example 2: append to file for short files**
+For example, given an existing file `/path/to/file.py` that looks like this:
+(this is the beginning of the file)
+1|class MyClass:
+2|    def __init__(self):
+3|        self.x = 1
+4|        self.y = 2
+5|        self.z = 3
+6|
+7|print(MyClass().z)
+8|print(MyClass().x)
+(this is the end of the file)
+
+To append the following lines to the file:
+```python
+print(MyClass().y)
+```
+
+The assistant may produce an edit action like this:
+path="/path/to/file.py" start=-1 end=-1
+content=```
+print(MyClass().y)
+```
+
+**Example 3: edit for long files**
+
+Given an existing file `/path/to/file.py` that looks like this:
+(1000 more lines above)
+1001|class MyClass:
+1002|    def __init__(self):
+1003|        self.x = 1
+1004|        self.y = 2
+1005|        self.z = 3
+1006|
+1007|print(MyClass().z)
+1008|print(MyClass().x)
+(2000 more lines below)
+
+The assistant wants to edit the file to look like this:
+
+(1000 more lines above)
+1001|class MyClass:
+1002|    def __init__(self):
+1003|        self.x = 1
+1004|        self.y = 2
+1005|
+1006|print(MyClass().y)
+(2000 more lines below)
+
+The assistant may produce an edit action like this:
+path="/path/to/file.py" start=1001 end=1008
+content=```
+class MyClass:
+    def __init__(self):
+        # no changes before
+        self.y = 2
+        # self.z is removed
+
+# MyClass().z is removed
+print(MyClass().y)
+```
+"""
+
+# Function-calling schema for the `edit_file` tool. `path` and `content` are
+# required; `start` and `end` are optional integers whose documented defaults
+# are 1 and -1 (end of file).
+LLMBasedFileEditTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='edit_file',
+        description=_FILE_EDIT_DESCRIPTION,
+        parameters={
+            'type': 'object',
+            'properties': {
+                'path': {
+                    'type': 'string',
+                    'description': 'The absolute path to the file to be edited.',
+                },
+                'content': {
+                    'type': 'string',
+                    'description': 'A draft of the new content for the file being edited. Note that the assistant may skip unchanged lines.',
+                },
+                'start': {
+                    'type': 'integer',
+                    'description': 'The starting line number for the edit (1-indexed, inclusive). Default is 1.',
+                },
+                'end': {
+                    'type': 'integer',
+                    'description': 'The ending line number for the edit (1-indexed, inclusive). Default is -1 (end of file).',
+                },
+            },
+            'required': ['path', 'content'],
+        },
+    ),
+)
--- a/openhands/agenthub/codeact_agent/tools/str_replace_editor.py
+++ b/openhands/agenthub/codeact_agent/tools/str_replace_editor.py
@ -0,0 +1,58 @@
+from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk
+
+# Description text attached to the `str_replace_editor` tool. Each piece ends
+# with an explicit newline so the assembled text keeps its bullet layout.
+_STR_REPLACE_EDITOR_DESCRIPTION = (
+    'Custom editing tool for viewing, creating and editing files in plain-text format\n'
+    '* State is persistent across command calls and discussions with the user\n'
+    '* If `path` is a file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep\n'
+    '* The `create` command cannot be used if the specified `path` already exists as a file\n'
+    '* If a `command` generates a long output, it will be truncated and marked with `<response clipped>`\n'
+    '* The `undo_edit` command will revert the last edit made to the file at `path`\n'
+    '\n'
+    'Notes for using the `str_replace` command:\n'
+    '* The `old_str` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces!\n'
+    '* If the `old_str` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in `old_str` to make it unique\n'
+    '* The `new_str` parameter should contain the edited lines that should replace the `old_str`\n'
+)
+
+# Function-calling schema for the `str_replace_editor` tool. Only `command`
+# and `path` are required by the schema; the other arguments are described as
+# required or optional per command in their own description strings.
+StrReplaceEditorTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='str_replace_editor',
+        description=_STR_REPLACE_EDITOR_DESCRIPTION,
+        parameters={
+            'type': 'object',
+            'properties': {
+                # Selects which editor operation to perform.
+                'command': {
+                    'description': 'The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.',
+                    'enum': ['view', 'create', 'str_replace', 'insert', 'undo_edit'],
+                    'type': 'string',
+                },
+                'path': {
+                    'description': 'Absolute path to file or directory, e.g. `/workspace/file.py` or `/workspace`.',
+                    'type': 'string',
+                },
+                # Used by `create`.
+                'file_text': {
+                    'description': 'Required parameter of `create` command, with the content of the file to be created.',
+                    'type': 'string',
+                },
+                # Used by `str_replace`.
+                'old_str': {
+                    'description': 'Required parameter of `str_replace` command containing the string in `path` to replace.',
+                    'type': 'string',
+                },
+                # Used by `str_replace` and `insert`.
+                'new_str': {
+                    'description': 'Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.',
+                    'type': 'string',
+                },
+                # Used by `insert`.
+                'insert_line': {
+                    'description': 'Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.',
+                    'type': 'integer',
+                },
+                # Used by `view`.
+                'view_range': {
+                    'description': 'Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.',
+                    'items': {'type': 'integer'},
+                    'type': 'array',
+                },
+            },
+            'required': ['command', 'path'],
+        },
+    ),
+)
--- a/openhands/agenthub/codeact_agent/tools/web_read.py
+++ b/openhands/agenthub/codeact_agent/tools/web_read.py
@ -0,0 +1,24 @@
+from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk
+
+# Description text attached to the `web_read` tool: two paragraphs separated
+# by a blank line, with a trailing newline.
+_WEB_DESCRIPTION = (
+    'Read (convert to markdown) content from a webpage. You should prefer using the `web_read` tool over the `browser` tool, but do use the `browser` tool if you need to interact with a webpage (e.g., click a button, fill out a form, etc.).\n'
+    '\n'
+    'You may use the `web_read` tool to read content from a webpage, and even search the webpage content using a Google search query (e.g., url=`https://www.google.com/search?q=YOUR_QUERY`).\n'
+)
+
+# Function-calling schema for the `web_read` tool: one required string
+# argument, `url`.
+WebReadTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='web_read',
+        description=_WEB_DESCRIPTION,
+        parameters={
+            'type': 'object',
+            'properties': {
+                'url': {
+                    'type': 'string',
+                    'description': 'The URL of the webpage to read. You can also use a Google search query here (e.g., `https://www.google.com/search?q=YOUR_QUERY`).',
+                },
+            },
+            'required': ['url'],
+        },
+    ),
+)
--- a/tests/unit/test_codeact_agent.py
+++ b/tests/unit/test_codeact_agent.py
@ -5,8 +5,6 @@ from litellm import ChatCompletionMessageToolCall

 from openhands.agenthub.codeact_agent.codeact_agent import CodeActAgent
 from openhands.agenthub.codeact_agent.function_calling import (
-    _BROWSER_DESCRIPTION,
-    _BROWSER_TOOL_DESCRIPTION,
    BrowserTool,
    CmdRunTool,
    IPythonTool,
@ -16,6 +14,10 @@ from openhands.agenthub.codeact_agent.function_calling import (
    get_tools,
    response_to_actions,
 )
+from openhands.agenthub.codeact_agent.tools.browser import (
+    _BROWSER_DESCRIPTION,
+    _BROWSER_TOOL_DESCRIPTION,
+)
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig, LLMConfig
 from openhands.core.exceptions import FunctionCallNotExistsError