Merge pull request #526 from vvincent1234/fix/multi_tab

fix multiple tab
2026-03-22 11:17:17 +08:00 · 2025-04-12 21:10:02 +08:00
parent 2df50b308c d70db733a4
commit d390e68b1c
5 changed files with 45 additions and 25 deletions
--- a/src/agent/custom_message_manager.py
+++ b/src/agent/custom_message_manager.py
@@ -74,7 +74,8 @@ class CustomMessageManager(MessageManager):
        min_message_len = 2 if self.context_content is not None else 1

        while diff > 0 and len(self.state.history.messages) > min_message_len:
-            self.state.history.remove_message(min_message_len)  # always remove the oldest message
+            msg = self.state.history.messages.pop(min_message_len)
+            self.state.history.current_tokens -= msg.metadata.tokens
            diff = self.state.history.current_tokens - self.settings.max_input_tokens

    def add_state_message(
@@ -104,6 +105,7 @@ class CustomMessageManager(MessageManager):
            if isinstance(self.state.history.messages[i].message, HumanMessage):
                remove_cnt += 1
            if remove_cnt == abs(remove_ind):
-                self.state.history.messages.pop(i)
+                msg = self.state.history.messages.pop(i)
+                self.state.history.current_tokens -= msg.metadata.tokens
                break
            i -= 1
--- a/src/agent/custom_prompts.py
+++ b/src/agent/custom_prompts.py
@@ -21,6 +21,18 @@ class CustomSystemPrompt(SystemPrompt):
        except Exception as e:
            raise RuntimeError(f'Failed to load system prompt template: {e}')

+    def get_system_message(self) -> SystemMessage:
+        """
+        Build the agent's system prompt by filling in the loaded prompt template.
+
+        Returns:
+            SystemMessage: prompt_template formatted with max_actions and available_actions
+        """
+        prompt = self.prompt_template.format(max_actions=self.max_actions_per_step,
+                                             available_actions=self.default_action_description)
+
+        return SystemMessage(content=prompt)
+

 class CustomAgentMessagePrompt(AgentMessagePrompt):
    def __init__(
--- a/src/agent/custom_system_prompt.md
+++ b/src/agent/custom_system_prompt.md
@@ -30,7 +30,7 @@ Example:
 ]
 }}

-2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence.
+2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {max_actions} actions per sequence.
 Common action sequences:
 - Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}]
 - Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}]
@@ -39,6 +39,7 @@ Common action sequences:
 - Only provide the action sequence until an action which changes the page state significantly.
 - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page
 - only use multiple actions if it makes sense.
+- Only choose from the available actions listed below.

 3. ELEMENT INTERACTION:
 - Only use indexes of the interactive elements
@@ -73,4 +74,7 @@ Common action sequences:

 9. Extraction:
 - If your task is to find information - call extract_content on the specific pages to get and store the information.
-Your responses must be always JSON with the specified format. 
+Your responses must always be JSON in the specified format.
+
+Available Actions:
+{available_actions}
--- a/tests/test_browser_use.py
+++ b/tests/test_browser_use.py
@@ -118,26 +118,26 @@ async def test_browser_use_custom():
    #     api_key=os.getenv("OPENAI_API_KEY", ""),
    # )

+    llm = utils.get_llm_model(
+        provider="azure_openai",
+        model_name="gpt-4o",
+        temperature=0.5,
+        base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
+        api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
+    )
+
    # llm = utils.get_llm_model(
-    #     provider="azure_openai",
-    #     model_name="gpt-4o",
+    #     provider="google",
+    #     model_name="gemini-2.0-flash",
    #     temperature=0.6,
-    #     base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
-    #     api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
+    #     api_key=os.getenv("GOOGLE_API_KEY", "")
    # )

-    llm = utils.get_llm_model(
-        provider="google",
-        model_name="gemini-2.0-flash",
-        temperature=0.6,
-        api_key=os.getenv("GOOGLE_API_KEY", "")
-    )
-
-    llm = utils.get_llm_model(
-        provider="deepseek",
-        model_name="deepseek-reasoner",
-        temperature=0.8
-    )
+    # llm = utils.get_llm_model(
+    #     provider="deepseek",
+    #     model_name="deepseek-reasoner",
+    #     temperature=0.8
+    # )

    # llm = utils.get_llm_model(
    #     provider="deepseek",
@@ -156,9 +156,9 @@ async def test_browser_use_custom():
    controller = CustomController()
    use_own_browser = True
    disable_security = True
-    use_vision = False  # Set to False when using DeepSeek
+    use_vision = True  # Set to False when using DeepSeek

-    max_actions_per_step = 1
+    max_actions_per_step = 10
    playwright = None
    browser = None
    browser_context = None
@@ -193,7 +193,7 @@ async def test_browser_use_custom():
            )
        )
        agent = CustomAgent(
-            task="Give me stock price of Nvidia",
+            task="open youtube in tab 1 , open google email in tab 2, open facebook in tab 3",
            add_infos="",  # some hints for llm to complete the task
            llm=llm,
            browser=browser,
--- a/webui.py
+++ b/webui.py
@@ -332,7 +332,7 @@ async def run_org_agent(
    try:
        global _global_browser, _global_browser_context, _global_agent

-        extra_chromium_args = [f"--window-size={window_w},{window_h}"]
+        extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"]
        cdp_url = chrome_cdp

        if use_own_browser:
@@ -362,6 +362,7 @@ async def run_org_agent(
                config=BrowserContextConfig(
                    trace_path=save_trace_path if save_trace_path else None,
                    save_recording_path=save_recording_path if save_recording_path else None,
+                    save_downloads_path="./tmp/downloads",
                    no_viewport=False,
                    browser_window_size=BrowserContextWindowSize(
                        width=window_w, height=window_h
@@ -435,7 +436,7 @@ async def run_custom_agent(
    try:
        global _global_browser, _global_browser_context, _global_agent

-        extra_chromium_args = [f"--window-size={window_w},{window_h}"]
+        extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"]
        cdp_url = chrome_cdp
        if use_own_browser:
            cdp_url = os.getenv("CHROME_CDP", chrome_cdp)
@@ -470,6 +471,7 @@ async def run_custom_agent(
                    trace_path=save_trace_path if save_trace_path else None,
                    save_recording_path=save_recording_path if save_recording_path else None,
                    no_viewport=False,
+                    save_downloads_path="./tmp/downloads",
                    browser_window_size=BrowserContextWindowSize(
                        width=window_w, height=window_h
                    ),