From d70db733a4bd2529f0aa008f4194348d6a769e74 Mon Sep 17 00:00:00 2001 From: alex Date: Sat, 12 Apr 2025 21:05:02 +0800 Subject: [PATCH] fix multiple tab --- src/agent/custom_message_manager.py | 6 +++-- src/agent/custom_prompts.py | 12 +++++++++ src/agent/custom_system_prompt.md | 8 ++++-- tests/test_browser_use.py | 38 ++++++++++++++--------------- webui.py | 6 +++-- 5 files changed, 45 insertions(+), 25 deletions(-) diff --git a/src/agent/custom_message_manager.py b/src/agent/custom_message_manager.py index 212c3fb..99836b2 100644 --- a/src/agent/custom_message_manager.py +++ b/src/agent/custom_message_manager.py @@ -74,7 +74,8 @@ class CustomMessageManager(MessageManager): min_message_len = 2 if self.context_content is not None else 1 while diff > 0 and len(self.state.history.messages) > min_message_len: - self.state.history.remove_message(min_message_len) # always remove the oldest message + msg = self.state.history.messages.pop(min_message_len) + self.state.history.current_tokens -= msg.metadata.tokens diff = self.state.history.current_tokens - self.settings.max_input_tokens def add_state_message( @@ -104,6 +105,7 @@ class CustomMessageManager(MessageManager): if isinstance(self.state.history.messages[i].message, HumanMessage): remove_cnt += 1 if remove_cnt == abs(remove_ind): - self.state.history.messages.pop(i) + msg = self.state.history.messages.pop(i) + self.state.history.current_tokens -= msg.metadata.tokens break i -= 1 diff --git a/src/agent/custom_prompts.py b/src/agent/custom_prompts.py index 6ec6cff..02f1777 100644 --- a/src/agent/custom_prompts.py +++ b/src/agent/custom_prompts.py @@ -21,6 +21,18 @@ class CustomSystemPrompt(SystemPrompt): except Exception as e: raise RuntimeError(f'Failed to load system prompt template: {e}') + def get_system_message(self) -> SystemMessage: + """ + Get the system prompt for the agent. 
+ + Returns: + SystemMessage: Formatted system prompt + """ + prompt = self.prompt_template.format(max_actions=self.max_actions_per_step, + available_actions=self.default_action_description) + + return SystemMessage(content=prompt) + class CustomAgentMessagePrompt(AgentMessagePrompt): def __init__( diff --git a/src/agent/custom_system_prompt.md b/src/agent/custom_system_prompt.md index 9cefaa2..594fdc0 100644 --- a/src/agent/custom_system_prompt.md +++ b/src/agent/custom_system_prompt.md @@ -30,7 +30,7 @@ Example: ] }} -2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence. +2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {max_actions} actions per sequence. Common action sequences: - Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}] - Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}] @@ -39,6 +39,7 @@ Common action sequences: - Only provide the action sequence until an action which changes the page state significantly. - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page - only use multiple actions if it makes sense. +- Only choose from the available actions below. 3. ELEMENT INTERACTION: - Only use indexes of the interactive elements @@ -73,4 +74,7 @@ Common action sequences: 9. Extraction: - If your task is to find information - call extract_content on the specific pages to get and store the information. -Your responses must be always JSON with the specified format. \ No newline at end of file +Your responses must be always JSON with the specified format. 
+ +Available Actions: +{available_actions} \ No newline at end of file diff --git a/tests/test_browser_use.py b/tests/test_browser_use.py index 6ef4210..cb321db 100644 --- a/tests/test_browser_use.py +++ b/tests/test_browser_use.py @@ -118,26 +118,26 @@ async def test_browser_use_custom(): # api_key=os.getenv("OPENAI_API_KEY", ""), # ) + llm = utils.get_llm_model( + provider="azure_openai", + model_name="gpt-4o", + temperature=0.5, + base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), + api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + ) + # llm = utils.get_llm_model( - # provider="azure_openai", - # model_name="gpt-4o", + # provider="google", + # model_name="gemini-2.0-flash", # temperature=0.6, - # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), - # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + # api_key=os.getenv("GOOGLE_API_KEY", "") # ) - llm = utils.get_llm_model( - provider="google", - model_name="gemini-2.0-flash", - temperature=0.6, - api_key=os.getenv("GOOGLE_API_KEY", "") - ) - - llm = utils.get_llm_model( - provider="deepseek", - model_name="deepseek-reasoner", - temperature=0.8 - ) + # llm = utils.get_llm_model( + # provider="deepseek", + # model_name="deepseek-reasoner", + # temperature=0.8 + # ) # llm = utils.get_llm_model( # provider="deepseek", @@ -156,9 +156,9 @@ async def test_browser_use_custom(): controller = CustomController() use_own_browser = True disable_security = True - use_vision = False # Set to False when using DeepSeek + use_vision = True # Set to False when using DeepSeek - max_actions_per_step = 1 + max_actions_per_step = 10 playwright = None browser = None browser_context = None @@ -193,7 +193,7 @@ async def test_browser_use_custom(): ) ) agent = CustomAgent( - task="Give me stock price of Nvidia", + task="open youtube in tab 1 , open google email in tab 2, open facebook in tab 3", add_infos="", # some hints for llm to complete the task llm=llm, browser=browser, diff --git a/webui.py b/webui.py index bc68605..33d7ece 100644 --- 
a/webui.py +++ b/webui.py @@ -332,7 +332,7 @@ async def run_org_agent( try: global _global_browser, _global_browser_context, _global_agent - extra_chromium_args = [f"--window-size={window_w},{window_h}"] + extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"] cdp_url = chrome_cdp if use_own_browser: @@ -362,6 +362,7 @@ async def run_org_agent( config=BrowserContextConfig( trace_path=save_trace_path if save_trace_path else None, save_recording_path=save_recording_path if save_recording_path else None, + save_downloads_path="./tmp/downloads", no_viewport=False, browser_window_size=BrowserContextWindowSize( width=window_w, height=window_h @@ -435,7 +436,7 @@ async def run_custom_agent( try: global _global_browser, _global_browser_context, _global_agent - extra_chromium_args = [f"--window-size={window_w},{window_h}"] + extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"] cdp_url = chrome_cdp if use_own_browser: cdp_url = os.getenv("CHROME_CDP", chrome_cdp) @@ -470,6 +471,7 @@ async def run_custom_agent( trace_path=save_trace_path if save_trace_path else None, save_recording_path=save_recording_path if save_recording_path else None, no_viewport=False, + save_downloads_path="./tmp/downloads", browser_window_size=BrowserContextWindowSize( width=window_w, height=window_h ),