From d711c856441af1f8f4c0fa8803a9522e18a8c991 Mon Sep 17 00:00:00 2001 From: M87monster <2772762669@qq.com> Date: Thu, 3 Apr 2025 07:12:40 +0800 Subject: [PATCH 01/35] Added siliconflow API support --- .env.example | 3 ++ src/utils/llm.py | 90 +++++++++++++++++++++++++++++++++++++++++++++- src/utils/utils.py | 56 +++++++++++++++++++++++++++-- 3 files changed, 145 insertions(+), 4 deletions(-) diff --git a/.env.example b/.env.example index d4bf83f..d99f358 100644 --- a/.env.example +++ b/.env.example @@ -27,6 +27,9 @@ MOONSHOT_API_KEY= UNBOUND_ENDPOINT=https://api.getunbound.ai UNBOUND_API_KEY= +SiliconFLOW_ENDPOINT=https://api.siliconflow.cn/v1/ +SiliconFLOW_API_KEY= + # Set to false to disable anonymized telemetry ANONYMIZED_TELEMETRY=false diff --git a/src/utils/llm.py b/src/utils/llm.py index aada234..afb9def 100644 --- a/src/utils/llm.py +++ b/src/utils/llm.py @@ -37,7 +37,7 @@ from typing import ( Literal, Optional, Union, - cast, + cast, List, ) @@ -136,3 +136,91 @@ class DeepSeekR1ChatOllama(ChatOllama): if "**JSON Response:**" in content: content = content.split("**JSON Response:**")[-1] return AIMessage(content=content, reasoning_content=reasoning_content) + + +class SiliconFlowChat(ChatOpenAI): + """Wrapper for SiliconFlow Chat API, fully compatible with OpenAI-spec format.""" + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + # Ensure the API client is initialized with SiliconFlow's endpoint and key + self.client = OpenAI( + api_key=kwargs.get("api_key"), + base_url=kwargs.get("base_url") + ) + + async def ainvoke( + self, + input: LanguageModelInput, + config: Optional[RunnableConfig] = None, + *, + stop: Optional[List[str]] = None, + **kwargs: Any, + ) -> AIMessage: + """Async call SiliconFlow API.""" + + # Convert input messages into OpenAI-compatible format + message_history = [] + for input_msg in input: + if isinstance(input_msg, SystemMessage): + message_history.append({"role": "system", "content": 
input_msg.content}) + elif isinstance(input_msg, AIMessage): + message_history.append({"role": "assistant", "content": input_msg.content}) + else: # HumanMessage or similar + message_history.append({"role": "user", "content": input_msg.content}) + + # Send request to SiliconFlow API (OpenAI-spec endpoint) + response = await self.client.chat.completions.create( + model=self.model_name, + messages=message_history, + stop=stop, + **kwargs, + ) + + # Extract the AI response (SiliconFlow's response must match OpenAI format) + if hasattr(response.choices[0].message, "reasoning_content"): + reasoning_content = response.choices[0].message.reasoning_content + else: + reasoning_content = None + + content = response.choices[0].message.content + return AIMessage(content=content, reasoning_content=reasoning_content) # Return reasoning_content if needed + + def invoke( + self, + input: LanguageModelInput, + config: Optional[RunnableConfig] = None, + *, + stop: Optional[List[str]] = None, + **kwargs: Any, + ) -> AIMessage: + """Sync call SiliconFlow API.""" + + # Same conversion as async version + message_history = [] + for input_msg in input: + if isinstance(input_msg, SystemMessage): + message_history.append({"role": "system", "content": input_msg.content}) + elif isinstance(input_msg, AIMessage): + message_history.append({"role": "assistant", "content": input_msg.content}) + else: + message_history.append({"role": "user", "content": input_msg.content}) + + # Sync call + response = self.client.chat.completions.create( + model=self.model_name, + messages=message_history, + stop=stop, + **kwargs, + ) + + # Handle reasoning_content (if supported) + reasoning_content = None + if hasattr(response.choices[0].message, "reasoning_content"): + reasoning_content = response.choices[0].message.reasoning_content + + return AIMessage( + content=response.choices[0].message.content, + reasoning_content=reasoning_content, # Only if SiliconFlow supports it + ) diff --git a/src/utils/utils.py 
b/src/utils/utils.py index 07a6730..a6e346b 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -14,7 +14,7 @@ from langchain_google_genai import ChatGoogleGenerativeAI from langchain_ollama import ChatOllama from langchain_openai import AzureChatOpenAI, ChatOpenAI -from .llm import DeepSeekR1ChatOpenAI, DeepSeekR1ChatOllama +from .llm import DeepSeekR1ChatOpenAI, DeepSeekR1ChatOllama,SiliconFlowChat PROVIDER_DISPLAY_NAMES = { "openai": "OpenAI", @@ -165,9 +165,26 @@ def get_llm_model(provider: str, **kwargs): return ChatOpenAI( model=kwargs.get("model_name", "gpt-4o-mini"), temperature=kwargs.get("temperature", 0.0), - base_url = os.getenv("UNBOUND_ENDPOINT", "https://api.getunbound.ai"), + base_url=os.getenv("UNBOUND_ENDPOINT", "https://api.getunbound.ai"), api_key=api_key, ) + elif provider == "siliconflow": + if not kwargs.get("api_key", ""): + api_key = os.getenv("SiliconFLOW_API_KEY", "") + else: + api_key = kwargs.get("api_key") + if not kwargs.get("base_url", ""): + base_url = os.getenv("SiliconFLOW_ENDPOINT", "") + else: + base_url = kwargs.get("base_url") + return SiliconFlowChat( + api_key=api_key, + base_url=base_url, + model_name=kwargs.get("model_name", "Qwen/QwQ-32B"), + temperature=kwargs.get("temperature", 0.0), + max_tokens=kwargs.get("max_tokens", 512), + frequency_penalty=kwargs.get("frequency_penalty", 0.5), + ) else: raise ValueError(f"Unsupported provider: {provider}") @@ -185,7 +202,40 @@ model_names = { "mistral": ["pixtral-large-latest", "mistral-large-latest", "mistral-small-latest", "ministral-8b-latest"], "alibaba": ["qwen-plus", "qwen-max", "qwen-turbo", "qwen-long"], "moonshot": ["moonshot-v1-32k-vision-preview", "moonshot-v1-8k-vision-preview"], - "unbound": ["gemini-2.0-flash","gpt-4o-mini", "gpt-4o", "gpt-4.5-preview"] + "unbound": ["gemini-2.0-flash", "gpt-4o-mini", "gpt-4o", "gpt-4.5-preview"], + "siliconflow": [ + "deepseek-ai/DeepSeek-R1", + "deepseek-ai/DeepSeek-V3", + "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + 
"deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", + "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "deepseek-ai/DeepSeek-V2.5", + "deepseek-ai/deepseek-vl2", + "Qwen/Qwen2.5-72B-Instruct-128K", + "Qwen/Qwen2.5-72B-Instruct", + "Qwen/Qwen2.5-32B-Instruct", + "Qwen/Qwen2.5-14B-Instruct", + "Qwen/Qwen2.5-7B-Instruct", + "Qwen/Qwen2.5-Coder-32B-Instruct", + "Qwen/Qwen2.5-Coder-7B-Instruct", + "Qwen/Qwen2-7B-Instruct", + "Qwen/Qwen2-1.5B-Instruct", + "Qwen/QwQ-32B-Preview", + "Qwen/Qwen2-VL-72B-Instruct", + "Qwen/Qwen2.5-VL-32B-Instruct", + "Qwen/Qwen2.5-VL-72B-Instruct", + "TeleAI/TeleChat2", + "THUDM/glm-4-9b-chat", + "Vendor-A/Qwen/Qwen2.5-72B-Instruct", + "internlm/internlm2_5-7b-chat", + "internlm/internlm2_5-20b-chat", + "Pro/Qwen/Qwen2.5-7B-Instruct", + "Pro/Qwen/Qwen2-7B-Instruct", + "Pro/Qwen/Qwen2-1.5B-Instruct", + "Pro/THUDM/chatglm3-6b", + "Pro/THUDM/glm-4-9b-chat", + ], } From d70db733a4bd2529f0aa008f4194348d6a769e74 Mon Sep 17 00:00:00 2001 From: alex Date: Sat, 12 Apr 2025 21:05:02 +0800 Subject: [PATCH 02/35] fix multiple tab --- src/agent/custom_message_manager.py | 6 +++-- src/agent/custom_prompts.py | 12 +++++++++ src/agent/custom_system_prompt.md | 8 ++++-- tests/test_browser_use.py | 38 ++++++++++++++--------------- webui.py | 6 +++-- 5 files changed, 45 insertions(+), 25 deletions(-) diff --git a/src/agent/custom_message_manager.py b/src/agent/custom_message_manager.py index 212c3fb..99836b2 100644 --- a/src/agent/custom_message_manager.py +++ b/src/agent/custom_message_manager.py @@ -74,7 +74,8 @@ class CustomMessageManager(MessageManager): min_message_len = 2 if self.context_content is not None else 1 while diff > 0 and len(self.state.history.messages) > min_message_len: - self.state.history.remove_message(min_message_len) # always remove the oldest message + msg = self.state.history.messages.pop(min_message_len) + self.state.history.current_tokens -= msg.metadata.tokens diff = 
self.state.history.current_tokens - self.settings.max_input_tokens def add_state_message( @@ -104,6 +105,7 @@ class CustomMessageManager(MessageManager): if isinstance(self.state.history.messages[i].message, HumanMessage): remove_cnt += 1 if remove_cnt == abs(remove_ind): - self.state.history.messages.pop(i) + msg = self.state.history.messages.pop(i) + self.state.history.current_tokens -= msg.metadata.tokens break i -= 1 diff --git a/src/agent/custom_prompts.py b/src/agent/custom_prompts.py index 6ec6cff..02f1777 100644 --- a/src/agent/custom_prompts.py +++ b/src/agent/custom_prompts.py @@ -21,6 +21,18 @@ class CustomSystemPrompt(SystemPrompt): except Exception as e: raise RuntimeError(f'Failed to load system prompt template: {e}') + def get_system_message(self) -> SystemMessage: + """ + Get the system prompt for the agent. + + Returns: + SystemMessage: Formatted system prompt + """ + prompt = self.prompt_template.format(max_actions=self.max_actions_per_step, + available_actions=self.default_action_description) + + return SystemMessage(content=prompt) + class CustomAgentMessagePrompt(AgentMessagePrompt): def __init__( diff --git a/src/agent/custom_system_prompt.md b/src/agent/custom_system_prompt.md index 9cefaa2..594fdc0 100644 --- a/src/agent/custom_system_prompt.md +++ b/src/agent/custom_system_prompt.md @@ -30,7 +30,7 @@ Example: ] }} -2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence. +2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {max_actions} actions per sequence. 
Common action sequences: - Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}] - Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}] @@ -39,6 +39,7 @@ Common action sequences: - Only provide the action sequence until an action which changes the page state significantly. - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page - only use multiple actions if it makes sense. +- Only chose from below available actions. 3. ELEMENT INTERACTION: - Only use indexes of the interactive elements @@ -73,4 +74,7 @@ Common action sequences: 9. Extraction: - If your task is to find information - call extract_content on the specific pages to get and store the information. -Your responses must be always JSON with the specified format. \ No newline at end of file +Your responses must be always JSON with the specified format. 
+ +Available Actions: +{available_actions} \ No newline at end of file diff --git a/tests/test_browser_use.py b/tests/test_browser_use.py index 6ef4210..cb321db 100644 --- a/tests/test_browser_use.py +++ b/tests/test_browser_use.py @@ -118,26 +118,26 @@ async def test_browser_use_custom(): # api_key=os.getenv("OPENAI_API_KEY", ""), # ) + llm = utils.get_llm_model( + provider="azure_openai", + model_name="gpt-4o", + temperature=0.5, + base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), + api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + ) + # llm = utils.get_llm_model( - # provider="azure_openai", - # model_name="gpt-4o", + # provider="google", + # model_name="gemini-2.0-flash", # temperature=0.6, - # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), - # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + # api_key=os.getenv("GOOGLE_API_KEY", "") # ) - llm = utils.get_llm_model( - provider="google", - model_name="gemini-2.0-flash", - temperature=0.6, - api_key=os.getenv("GOOGLE_API_KEY", "") - ) - - llm = utils.get_llm_model( - provider="deepseek", - model_name="deepseek-reasoner", - temperature=0.8 - ) + # llm = utils.get_llm_model( + # provider="deepseek", + # model_name="deepseek-reasoner", + # temperature=0.8 + # ) # llm = utils.get_llm_model( # provider="deepseek", @@ -156,9 +156,9 @@ async def test_browser_use_custom(): controller = CustomController() use_own_browser = True disable_security = True - use_vision = False # Set to False when using DeepSeek + use_vision = True # Set to False when using DeepSeek - max_actions_per_step = 1 + max_actions_per_step = 10 playwright = None browser = None browser_context = None @@ -193,7 +193,7 @@ async def test_browser_use_custom(): ) ) agent = CustomAgent( - task="Give me stock price of Nvidia", + task="open youtube in tab 1 , open google email in tab 2, open facebook in tab 3", add_infos="", # some hints for llm to complete the task llm=llm, browser=browser, diff --git a/webui.py b/webui.py index bc68605..33d7ece 100644 --- 
a/webui.py +++ b/webui.py @@ -332,7 +332,7 @@ async def run_org_agent( try: global _global_browser, _global_browser_context, _global_agent - extra_chromium_args = [f"--window-size={window_w},{window_h}"] + extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"] cdp_url = chrome_cdp if use_own_browser: @@ -362,6 +362,7 @@ async def run_org_agent( config=BrowserContextConfig( trace_path=save_trace_path if save_trace_path else None, save_recording_path=save_recording_path if save_recording_path else None, + save_downloads_path="./tmp/downloads", no_viewport=False, browser_window_size=BrowserContextWindowSize( width=window_w, height=window_h @@ -435,7 +436,7 @@ async def run_custom_agent( try: global _global_browser, _global_browser_context, _global_agent - extra_chromium_args = [f"--window-size={window_w},{window_h}"] + extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"] cdp_url = chrome_cdp if use_own_browser: cdp_url = os.getenv("CHROME_CDP", chrome_cdp) @@ -470,6 +471,7 @@ async def run_custom_agent( trace_path=save_trace_path if save_trace_path else None, save_recording_path=save_recording_path if save_recording_path else None, no_viewport=False, + save_downloads_path="./tmp/downloads", browser_window_size=BrowserContextWindowSize( width=window_w, height=window_h ), From 61de4e8631bf5e66bcf6358ff70fd491f1599f91 Mon Sep 17 00:00:00 2001 From: M87monster <2772762669@qq.com> Date: Sat, 12 Apr 2025 22:21:47 +0800 Subject: [PATCH 03/35] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=B8=BA=E7=9B=B4?= =?UTF-8?q?=E6=8E=A5=E4=BD=BF=E7=94=A8OpenAIChat?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/utils/llm.py | 88 ---------------------------------------------- src/utils/utils.py | 6 ++-- 2 files changed, 2 insertions(+), 92 deletions(-) diff --git a/src/utils/llm.py b/src/utils/llm.py index afb9def..0b601ed 100644 --- a/src/utils/llm.py +++ b/src/utils/llm.py @@ 
-136,91 +136,3 @@ class DeepSeekR1ChatOllama(ChatOllama): if "**JSON Response:**" in content: content = content.split("**JSON Response:**")[-1] return AIMessage(content=content, reasoning_content=reasoning_content) - - -class SiliconFlowChat(ChatOpenAI): - """Wrapper for SiliconFlow Chat API, fully compatible with OpenAI-spec format.""" - - def __init__(self, *args: Any, **kwargs: Any) -> None: - super().__init__(*args, **kwargs) - - # Ensure the API client is initialized with SiliconFlow's endpoint and key - self.client = OpenAI( - api_key=kwargs.get("api_key"), - base_url=kwargs.get("base_url") - ) - - async def ainvoke( - self, - input: LanguageModelInput, - config: Optional[RunnableConfig] = None, - *, - stop: Optional[List[str]] = None, - **kwargs: Any, - ) -> AIMessage: - """Async call SiliconFlow API.""" - - # Convert input messages into OpenAI-compatible format - message_history = [] - for input_msg in input: - if isinstance(input_msg, SystemMessage): - message_history.append({"role": "system", "content": input_msg.content}) - elif isinstance(input_msg, AIMessage): - message_history.append({"role": "assistant", "content": input_msg.content}) - else: # HumanMessage or similar - message_history.append({"role": "user", "content": input_msg.content}) - - # Send request to SiliconFlow API (OpenAI-spec endpoint) - response = await self.client.chat.completions.create( - model=self.model_name, - messages=message_history, - stop=stop, - **kwargs, - ) - - # Extract the AI response (SiliconFlow's response must match OpenAI format) - if hasattr(response.choices[0].message, "reasoning_content"): - reasoning_content = response.choices[0].message.reasoning_content - else: - reasoning_content = None - - content = response.choices[0].message.content - return AIMessage(content=content, reasoning_content=reasoning_content) # Return reasoning_content if needed - - def invoke( - self, - input: LanguageModelInput, - config: Optional[RunnableConfig] = None, - *, - stop: 
Optional[List[str]] = None, - **kwargs: Any, - ) -> AIMessage: - """Sync call SiliconFlow API.""" - - # Same conversion as async version - message_history = [] - for input_msg in input: - if isinstance(input_msg, SystemMessage): - message_history.append({"role": "system", "content": input_msg.content}) - elif isinstance(input_msg, AIMessage): - message_history.append({"role": "assistant", "content": input_msg.content}) - else: - message_history.append({"role": "user", "content": input_msg.content}) - - # Sync call - response = self.client.chat.completions.create( - model=self.model_name, - messages=message_history, - stop=stop, - **kwargs, - ) - - # Handle reasoning_content (if supported) - reasoning_content = None - if hasattr(response.choices[0].message, "reasoning_content"): - reasoning_content = response.choices[0].message.reasoning_content - - return AIMessage( - content=response.choices[0].message.content, - reasoning_content=reasoning_content, # Only if SiliconFlow supports it - ) diff --git a/src/utils/utils.py b/src/utils/utils.py index a6e346b..62fc8a8 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -14,7 +14,7 @@ from langchain_google_genai import ChatGoogleGenerativeAI from langchain_ollama import ChatOllama from langchain_openai import AzureChatOpenAI, ChatOpenAI -from .llm import DeepSeekR1ChatOpenAI, DeepSeekR1ChatOllama,SiliconFlowChat +from .llm import DeepSeekR1ChatOpenAI, DeepSeekR1ChatOllama PROVIDER_DISPLAY_NAMES = { "openai": "OpenAI", @@ -177,13 +177,11 @@ def get_llm_model(provider: str, **kwargs): base_url = os.getenv("SiliconFLOW_ENDPOINT", "") else: base_url = kwargs.get("base_url") - return SiliconFlowChat( + return ChatOpenAI( api_key=api_key, base_url=base_url, model_name=kwargs.get("model_name", "Qwen/QwQ-32B"), temperature=kwargs.get("temperature", 0.0), - max_tokens=kwargs.get("max_tokens", 512), - frequency_penalty=kwargs.get("frequency_penalty", 0.5), ) else: raise ValueError(f"Unsupported provider: {provider}") From 
69a4b675b2ebc82e4ad92023dc38784c71853dbd Mon Sep 17 00:00:00 2001 From: Madhuri Pednekar Date: Thu, 24 Apr 2025 17:17:20 +0530 Subject: [PATCH 04/35] Added IBM watsonx model support --- .env.example | 4 ++++ docker-compose.yml | 3 +++ requirements.txt | 1 + src/utils/utils.py | 22 +++++++++++++++++++++- tests/test_llm_api.py | 8 +++++++- 5 files changed, 36 insertions(+), 2 deletions(-) diff --git a/.env.example b/.env.example index d99f358..ad0bc6a 100644 --- a/.env.example +++ b/.env.example @@ -30,6 +30,10 @@ UNBOUND_API_KEY= SiliconFLOW_ENDPOINT=https://api.siliconflow.cn/v1/ SiliconFLOW_API_KEY= +IBM_ENDPOINT=https://us-south.ml.cloud.ibm.com +IBM_API_KEY= +IBM_PROJECT_ID= + # Set to false to disable anonymized telemetry ANONYMIZED_TELEMETRY=false diff --git a/docker-compose.yml b/docker-compose.yml index 9c907e6..75b0fd0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -28,6 +28,9 @@ services: - ALIBABA_API_KEY=${ALIBABA_API_KEY:-} - MOONSHOT_ENDPOINT=${MOONSHOT_ENDPOINT:-https://api.moonshot.cn/v1} - MOONSHOT_API_KEY=${MOONSHOT_API_KEY:-} + - IBM_API_KEY=${IBM_API_KEY:-} + - IBM_ENDPOINT=${IBM_ENDPOINT:-https://us-south.ml.cloud.ibm.com} + - IBM_PROJECT_ID=${IBM_PROJECT_ID:-} - BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info} - ANONYMIZED_TELEMETRY=${ANONYMIZED_TELEMETRY:-false} - CHROME_PATH=/usr/bin/google-chrome diff --git a/requirements.txt b/requirements.txt index 7f2d12c..14f1a6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ json-repair langchain-mistralai==0.2.4 langchain-google-genai==2.0.8 MainContentExtractor==0.0.4 +langchain-ibm==0.3.10 \ No newline at end of file diff --git a/src/utils/utils.py b/src/utils/utils.py index 62fc8a8..f39dda0 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -13,6 +13,7 @@ from langchain_mistralai import ChatMistralAI from langchain_google_genai import ChatGoogleGenerativeAI from langchain_ollama import ChatOllama from langchain_openai import AzureChatOpenAI, 
ChatOpenAI +from langchain_ibm import ChatWatsonx from .llm import DeepSeekR1ChatOpenAI, DeepSeekR1ChatOllama @@ -24,7 +25,8 @@ PROVIDER_DISPLAY_NAMES = { "google": "Google", "alibaba": "Alibaba", "moonshot": "MoonShot", - "unbound": "Unbound AI" + "unbound": "Unbound AI", + "ibm": "IBM" } @@ -154,6 +156,23 @@ def get_llm_model(provider: str, **kwargs): base_url=base_url, api_key=api_key, ) + elif provider == "ibm": + parameters = { + "temperature": kwargs.get("temperature", 0.0), + "max_tokens": kwargs.get("num_ctx", 32000) + } + if not kwargs.get("base_url", ""): + base_url = os.getenv("IBM_ENDPOINT", "https://us-south.ml.cloud.ibm.com") + else: + base_url = kwargs.get("base_url") + + return ChatWatsonx( + model_id=kwargs.get("model_name", "ibm/granite-vision-3.1-2b-preview"), + url=base_url, + project_id=os.getenv("IBM_PROJECT_ID"), + apikey=os.getenv("IBM_API_KEY"), + params=parameters + ) elif provider == "moonshot": return ChatOpenAI( model=kwargs.get("model_name", "moonshot-v1-32k-vision-preview"), @@ -234,6 +253,7 @@ model_names = { "Pro/THUDM/chatglm3-6b", "Pro/THUDM/glm-4-9b-chat", ], + "ibm": ["meta-llama/llama-4-maverick-17b-128e-instruct-fp8","meta-llama/llama-3-2-90b-vision-instruct"] } diff --git a/tests/test_llm_api.py b/tests/test_llm_api.py index 1eb45f4..05bc06e 100644 --- a/tests/test_llm_api.py +++ b/tests/test_llm_api.py @@ -41,6 +41,7 @@ def get_env_value(key, provider): "mistral": {"api_key": "MISTRAL_API_KEY", "base_url": "MISTRAL_ENDPOINT"}, "alibaba": {"api_key": "ALIBABA_API_KEY", "base_url": "ALIBABA_ENDPOINT"}, "moonshot":{"api_key": "MOONSHOT_API_KEY", "base_url": "MOONSHOT_ENDPOINT"}, + "ibm": {"api_key": "IBM_API_KEY", "base_url": "IBM_ENDPOINT"} } if provider in env_mappings and key in env_mappings[provider]: @@ -126,12 +127,17 @@ def test_moonshot_model(): config = LLMConfig(provider="moonshot", model_name="moonshot-v1-32k-vision-preview") test_llm(config, "Describe this image", "assets/examples/test.png") +def test_ibm_model(): + 
config = LLMConfig(provider="ibm", model_name="meta-llama/llama-4-maverick-17b-128e-instruct-fp8") + test_llm(config, "Describe this image", "assets/examples/test.png") + if __name__ == "__main__": # test_openai_model() # test_google_model() # test_azure_openai_model() #test_deepseek_model() # test_ollama_model() - test_deepseek_r1_model() + # test_deepseek_r1_model() # test_deepseek_r1_ollama_model() # test_mistral_model() + test_ibm_model() From e2083af25537b08b252cc7ae6a9405582072afb2 Mon Sep 17 00:00:00 2001 From: Madhuri Pednekar Date: Thu, 24 Apr 2025 17:30:55 +0530 Subject: [PATCH 05/35] Added ibm/granite-vision-3.1-2b-preview in the list of supported models --- src/utils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/utils.py b/src/utils/utils.py index f39dda0..10ebf7a 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -253,7 +253,7 @@ model_names = { "Pro/THUDM/chatglm3-6b", "Pro/THUDM/glm-4-9b-chat", ], - "ibm": ["meta-llama/llama-4-maverick-17b-128e-instruct-fp8","meta-llama/llama-3-2-90b-vision-instruct"] + "ibm": ["ibm/granite-vision-3.1-2b-preview", "meta-llama/llama-4-maverick-17b-128e-instruct-fp8","meta-llama/llama-3-2-90b-vision-instruct"] } From 3c0a089fc5eb9b76aa53d5a3aa833bc826bdf5b3 Mon Sep 17 00:00:00 2001 From: vvincent1234 Date: Sat, 26 Apr 2025 23:14:40 +0800 Subject: [PATCH 06/35] add mcp tool --- requirements.txt | 8 ++-- src/controller/custom_controller.py | 74 ++++++++++++++++++++++++----- src/utils/mcp_client.py | 42 ++++++++++++++++ tests/test_controller.py | 31 ++++++++++++ 4 files changed, 138 insertions(+), 17 deletions(-) create mode 100644 src/utils/mcp_client.py create mode 100644 tests/test_controller.py diff --git a/requirements.txt b/requirements.txt index 14f1a6a..462f010 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -browser-use==0.1.40 +browser-use==0.1.41 pyperclip==1.9.0 -gradio==5.23.1 +gradio==5.27.0 json-repair langchain-mistralai==0.2.4 
-langchain-google-genai==2.0.8 MainContentExtractor==0.0.4 -langchain-ibm==0.3.10 \ No newline at end of file +langchain-ibm==0.3.10 +langchain_mcp_adapters==0.0.9 \ No newline at end of file diff --git a/src/controller/custom_controller.py b/src/controller/custom_controller.py index 560befa..9f95fc6 100644 --- a/src/controller/custom_controller.py +++ b/src/controller/custom_controller.py @@ -1,7 +1,7 @@ import pdb import pyperclip -from typing import Optional, Type +from typing import Optional, Type, Callable, Dict, Any, Union, Awaitable from pydantic import BaseModel from browser_use.agent.views import ActionResult from browser_use.browser.context import BrowserContext @@ -20,30 +20,78 @@ from browser_use.controller.views import ( SwitchTabAction, ) import logging +import inspect +import os +from src.utils import utils logger = logging.getLogger(__name__) class CustomController(Controller): def __init__(self, exclude_actions: list[str] = [], - output_model: Optional[Type[BaseModel]] = None + output_model: Optional[Type[BaseModel]] = None, + ask_assistant_callback: Optional[Union[Callable[[str, BrowserContext], Dict[str, Any]], Callable[ + [str, BrowserContext], Awaitable[Dict[str, Any]]]]] = None, + ): super().__init__(exclude_actions=exclude_actions, output_model=output_model) self._register_custom_actions() + self.ask_assistant_callback = ask_assistant_callback def _register_custom_actions(self): """Register all custom browser actions""" - @self.registry.action("Copy text to clipboard") - def copy_to_clipboard(text: str): - pyperclip.copy(text) - return ActionResult(extracted_content=text) + @self.registry.action( + "When executing tasks, prioritize autonomous completion. 
However, if you encounter a definitive blocker " + "that prevents you from proceeding independently – such as needing credentials you don't possess, " + "requiring subjective human judgment, needing a physical action performed, encountering complex CAPTCHAs, " + "or facing limitations in your capabilities – you must request human assistance." + ) + async def ask_for_assistant(query: str, browser: BrowserContext): + if self.ask_assistant_callback: + if inspect.iscoroutinefunction(self.ask_assistant_callback): + user_response = await self.ask_assistant_callback(query, browser) + else: + user_response = self.ask_assistant_callback(query, browser) + msg = f"AI ask: {query}. User response: {user_response['response']}" + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + else: + return ActionResult(extracted_content="Human cannot help you. Please try another way.", + include_in_memory=True) - @self.registry.action("Paste text from clipboard") - async def paste_from_clipboard(browser: BrowserContext): - text = pyperclip.paste() - # send text to browser - page = await browser.get_current_page() - await page.keyboard.type(text) + @self.registry.action( + 'Upload file to interactive element with file path ', + ) + async def upload_file(index: int, path: str, browser: BrowserContext, available_file_paths: list[str]): + if path not in available_file_paths: + return ActionResult(error=f'File path {path} is not available') - return ActionResult(extracted_content=text) + if not os.path.exists(path): + return ActionResult(error=f'File {path} does not exist') + + dom_el = await browser.get_dom_element_by_index(index) + + file_upload_dom_el = dom_el.get_file_upload_element() + + if file_upload_dom_el is None: + msg = f'No file upload element found at index {index}' + logger.info(msg) + return ActionResult(error=msg) + + file_upload_el = await browser.get_locate_element(file_upload_dom_el) + + if file_upload_el is None: + msg = f'No file upload 
element found at index {index}' + logger.info(msg) + return ActionResult(error=msg) + + try: + await file_upload_el.set_input_files(path) + msg = f'Successfully uploaded file to index {index}' + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + except Exception as e: + msg = f'Failed to upload file to index {index}: {str(e)}' + logger.info(msg) + return ActionResult(error=msg) diff --git a/src/utils/mcp_client.py b/src/utils/mcp_client.py new file mode 100644 index 0000000..aa5de2b --- /dev/null +++ b/src/utils/mcp_client.py @@ -0,0 +1,42 @@ +import os +import asyncio +import base64 +import pdb +from typing import List, Tuple, Optional +from langchain_core.tools import BaseTool +from langchain_mcp_adapters.client import MultiServerMCPClient +import base64 +import json +import logging +from typing import Optional, Dict, Any, Type +from langchain_core.tools import BaseTool +from pydantic.v1 import BaseModel, Field +from langchain_core.runnables import RunnableConfig + +logger = logging.getLogger(__name__) + + +async def setup_mcp_client_and_tools(mcp_server_config: Dict[str, Any]) -> Tuple[ + Optional[List[BaseTool]], Optional[MultiServerMCPClient]]: + """ + Initializes the MultiServerMCPClient, connects to servers, fetches tools, + filters them, and returns a flat list of usable tools and the client instance. + + Returns: + A tuple containing: + - list[BaseTool]: The filtered list of usable LangChain tools. + - MultiServerMCPClient | None: The initialized and started client instance, or None on failure. 
+ """ + + logger.info("Initializing MultiServerMCPClient...") + + try: + client = MultiServerMCPClient(mcp_server_config) + await client.__aenter__() + mcp_tools = client.get_tools() + logger.info(f"Total usable MCP tools collected: {len(mcp_tools)}") + return mcp_tools, client + + except Exception as e: + logger.error(f"Failed to setup MCP client or fetch tools: {e}", exc_info=True) + return [], None diff --git a/tests/test_controller.py b/tests/test_controller.py new file mode 100644 index 0000000..93ed340 --- /dev/null +++ b/tests/test_controller.py @@ -0,0 +1,31 @@ +import asyncio +import pdb +import sys + +sys.path.append(".") + +from dotenv import load_dotenv + +load_dotenv() + + +async def test_mcp_client(): + from src.utils.mcp_client import setup_mcp_client_and_tools + + test_server_config = { + "playwright": { + "command": "npx", + "args": [ + "@playwright/mcp@latest", + ], + "transport": "stdio", + } + } + + mcp_tools, mcp_client = await setup_mcp_client_and_tools(test_server_config) + + pdb.set_trace() + + +if __name__ == '__main__': + asyncio.run(test_mcp_client()) From 70ac2f483a1f2f161ac98804b02facb63d6c1c80 Mon Sep 17 00:00:00 2001 From: vincent Date: Sun, 27 Apr 2025 21:21:56 +0800 Subject: [PATCH 07/35] refactor webui --- src/agent/custom_agent.py | 478 ------- src/agent/custom_message_manager.py | 111 -- src/agent/custom_prompts.py | 125 -- src/agent/custom_system_prompt.md | 80 -- src/agent/custom_views.py | 67 - .../deep_research_agent.py} | 9 +- src/controller/custom_controller.py | 87 +- src/utils/agent_state.py | 31 - src/utils/config.py | 62 + src/utils/llm.py | 138 -- src/utils/llm_provider.py | 325 +++++ src/utils/mcp_client.py | 231 +++- src/utils/utils.py | 257 ---- src/webui/__init__.py | 0 src/webui/components/__init__.py | 0 src/webui/components/agent_settings_tab.py | 228 ++++ src/webui/components/browser_settings_tab.py | 0 src/webui/components/load_save_config_tab.py | 0 src/webui/components/run_agent_tab.py | 4 + 
src/webui/components/run_deep_research_tab.py | 0 src/webui/interface.py | 68 + src/webui/webui_manager.py | 46 + tests/{test_browser_use.py => test_agents.py} | 4 +- tests/test_controller.py | 87 +- tests/test_deep_research.py | 30 - tests/test_llm_api.py | 28 +- webui.py | 1191 +--------------- webui2.py | 1202 +++++++++++++++++ 28 files changed, 2357 insertions(+), 2532 deletions(-) delete mode 100644 src/agent/custom_agent.py delete mode 100644 src/agent/custom_message_manager.py delete mode 100644 src/agent/custom_prompts.py delete mode 100644 src/agent/custom_system_prompt.md delete mode 100644 src/agent/custom_views.py rename src/{utils/deep_research.py => agent/deep_research_agent.py} (99%) delete mode 100644 src/utils/agent_state.py create mode 100644 src/utils/config.py delete mode 100644 src/utils/llm.py create mode 100644 src/utils/llm_provider.py create mode 100644 src/webui/__init__.py create mode 100644 src/webui/components/__init__.py create mode 100644 src/webui/components/agent_settings_tab.py create mode 100644 src/webui/components/browser_settings_tab.py create mode 100644 src/webui/components/load_save_config_tab.py create mode 100644 src/webui/components/run_agent_tab.py create mode 100644 src/webui/components/run_deep_research_tab.py create mode 100644 src/webui/interface.py create mode 100644 src/webui/webui_manager.py rename tests/{test_browser_use.py => test_agents.py} (99%) delete mode 100644 tests/test_deep_research.py create mode 100644 webui2.py diff --git a/src/agent/custom_agent.py b/src/agent/custom_agent.py deleted file mode 100644 index 4b0eff3..0000000 --- a/src/agent/custom_agent.py +++ /dev/null @@ -1,478 +0,0 @@ -import json -import logging -import pdb -import traceback -from typing import Any, Awaitable, Callable, Dict, Generic, List, Optional, Type, TypeVar -from PIL import Image, ImageDraw, ImageFont -import os -import base64 -import io -import asyncio -import time -import platform -from browser_use.agent.prompts import 
SystemPrompt, AgentMessagePrompt -from browser_use.agent.service import Agent -from browser_use.agent.message_manager.utils import convert_input_messages, extract_json_from_model_output, \ - save_conversation -from browser_use.agent.views import ( - ActionResult, - AgentError, - AgentHistory, - AgentHistoryList, - AgentOutput, - AgentSettings, - AgentState, - AgentStepInfo, - StepMetadata, - ToolCallingMethod, -) -from browser_use.agent.gif import create_history_gif -from browser_use.browser.browser import Browser -from browser_use.browser.context import BrowserContext -from browser_use.browser.views import BrowserStateHistory -from browser_use.controller.service import Controller -from browser_use.telemetry.views import ( - AgentEndTelemetryEvent, - AgentRunTelemetryEvent, - AgentStepTelemetryEvent, -) -from browser_use.utils import time_execution_async -from langchain_core.language_models.chat_models import BaseChatModel -from langchain_core.messages import ( - BaseMessage, - HumanMessage, - AIMessage -) -from browser_use.browser.views import BrowserState, BrowserStateHistory -from browser_use.agent.prompts import PlannerPrompt - -from json_repair import repair_json -from src.utils.agent_state import AgentState - -from .custom_message_manager import CustomMessageManager, CustomMessageManagerSettings -from .custom_views import CustomAgentOutput, CustomAgentStepInfo, CustomAgentState - -logger = logging.getLogger(__name__) - -Context = TypeVar('Context') - - -class CustomAgent(Agent): - def __init__( - self, - task: str, - llm: BaseChatModel, - add_infos: str = "", - # Optional parameters - browser: Browser | None = None, - browser_context: BrowserContext | None = None, - controller: Controller[Context] = Controller(), - # Initial agent run parameters - sensitive_data: Optional[Dict[str, str]] = None, - initial_actions: Optional[List[Dict[str, Dict[str, Any]]]] = None, - # Cloud Callbacks - register_new_step_callback: Callable[['BrowserState', 'AgentOutput', int], 
Awaitable[None]] | None = None, - register_done_callback: Callable[['AgentHistoryList'], Awaitable[None]] | None = None, - register_external_agent_status_raise_error_callback: Callable[[], Awaitable[bool]] | None = None, - # Agent settings - use_vision: bool = True, - use_vision_for_planner: bool = False, - save_conversation_path: Optional[str] = None, - save_conversation_path_encoding: Optional[str] = 'utf-8', - max_failures: int = 3, - retry_delay: int = 10, - system_prompt_class: Type[SystemPrompt] = SystemPrompt, - agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt, - max_input_tokens: int = 128000, - validate_output: bool = False, - message_context: Optional[str] = None, - generate_gif: bool | str = False, - available_file_paths: Optional[list[str]] = None, - include_attributes: list[str] = [ - 'title', - 'type', - 'name', - 'role', - 'aria-label', - 'placeholder', - 'value', - 'alt', - 'aria-expanded', - 'data-date-format', - ], - max_actions_per_step: int = 10, - tool_calling_method: Optional[ToolCallingMethod] = 'auto', - page_extraction_llm: Optional[BaseChatModel] = None, - planner_llm: Optional[BaseChatModel] = None, - planner_interval: int = 1, # Run planner every N steps - # Inject state - injected_agent_state: Optional[AgentState] = None, - context: Context | None = None, - ): - super(CustomAgent, self).__init__( - task=task, - llm=llm, - browser=browser, - browser_context=browser_context, - controller=controller, - sensitive_data=sensitive_data, - initial_actions=initial_actions, - register_new_step_callback=register_new_step_callback, - register_done_callback=register_done_callback, - register_external_agent_status_raise_error_callback=register_external_agent_status_raise_error_callback, - use_vision=use_vision, - use_vision_for_planner=use_vision_for_planner, - save_conversation_path=save_conversation_path, - save_conversation_path_encoding=save_conversation_path_encoding, - max_failures=max_failures, - retry_delay=retry_delay, - 
system_prompt_class=system_prompt_class, - max_input_tokens=max_input_tokens, - validate_output=validate_output, - message_context=message_context, - generate_gif=generate_gif, - available_file_paths=available_file_paths, - include_attributes=include_attributes, - max_actions_per_step=max_actions_per_step, - tool_calling_method=tool_calling_method, - page_extraction_llm=page_extraction_llm, - planner_llm=planner_llm, - planner_interval=planner_interval, - injected_agent_state=injected_agent_state, - context=context, - ) - self.state = injected_agent_state or CustomAgentState() - self.add_infos = add_infos - self._message_manager = CustomMessageManager( - task=task, - system_message=self.settings.system_prompt_class( - self.available_actions, - max_actions_per_step=self.settings.max_actions_per_step, - ).get_system_message(), - settings=CustomMessageManagerSettings( - max_input_tokens=self.settings.max_input_tokens, - include_attributes=self.settings.include_attributes, - message_context=self.settings.message_context, - sensitive_data=sensitive_data, - available_file_paths=self.settings.available_file_paths, - agent_prompt_class=agent_prompt_class - ), - state=self.state.message_manager_state, - ) - - def _log_response(self, response: CustomAgentOutput) -> None: - """Log the model's response""" - if "Success" in response.current_state.evaluation_previous_goal: - emoji = "āœ…" - elif "Failed" in response.current_state.evaluation_previous_goal: - emoji = "āŒ" - else: - emoji = "🤷" - - logger.info(f"{emoji} Eval: {response.current_state.evaluation_previous_goal}") - logger.info(f"🧠 New Memory: {response.current_state.important_contents}") - logger.info(f"šŸ¤” Thought: {response.current_state.thought}") - logger.info(f"šŸŽÆ Next Goal: {response.current_state.next_goal}") - for i, action in enumerate(response.action): - logger.info( - f"šŸ› ļø Action {i + 1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}" - ) - - def _setup_action_models(self) -> 
None: - """Setup dynamic action models from controller's registry""" - # Get the dynamic action model from controller's registry - self.ActionModel = self.controller.registry.create_action_model() - # Create output model with the dynamic actions - self.AgentOutput = CustomAgentOutput.type_with_custom_actions(self.ActionModel) - - def update_step_info( - self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None - ): - """ - update step info - """ - if step_info is None: - return - - step_info.step_number += 1 - important_contents = model_output.current_state.important_contents - if ( - important_contents - and "None" not in important_contents - and important_contents not in step_info.memory - ): - step_info.memory += important_contents + "\n" - - logger.info(f"🧠 All Memory: \n{step_info.memory}") - - @time_execution_async("--get_next_action") - async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput: - """Get next action from LLM based on current state""" - fixed_input_messages = self._convert_input_messages(input_messages) - ai_message = self.llm.invoke(fixed_input_messages) - self.message_manager._add_message_with_tokens(ai_message) - - if hasattr(ai_message, "reasoning_content"): - logger.info("🤯 Start Deep Thinking: ") - logger.info(ai_message.reasoning_content) - logger.info("🤯 End Deep Thinking") - - if isinstance(ai_message.content, list): - ai_content = ai_message.content[0] - else: - ai_content = ai_message.content - - try: - ai_content = ai_content.replace("```json", "").replace("```", "") - ai_content = repair_json(ai_content) - parsed_json = json.loads(ai_content) - parsed: AgentOutput = self.AgentOutput(**parsed_json) - except Exception as e: - import traceback - traceback.print_exc() - logger.debug(ai_message.content) - raise ValueError('Could not parse response.') - - if parsed is None: - logger.debug(ai_message.content) - raise ValueError('Could not parse response.') - - # cut the number of actions to 
max_actions_per_step if needed - if len(parsed.action) > self.settings.max_actions_per_step: - parsed.action = parsed.action[: self.settings.max_actions_per_step] - self._log_response(parsed) - return parsed - - async def _run_planner(self) -> Optional[str]: - """Run the planner to analyze state and suggest next steps""" - # Skip planning if no planner_llm is set - if not self.settings.planner_llm: - return None - - # Create planner message history using full message history - planner_messages = [ - PlannerPrompt(self.controller.registry.get_prompt_description()).get_system_message(), - *self.message_manager.get_messages()[1:], # Use full message history except the first - ] - - if not self.settings.use_vision_for_planner and self.settings.use_vision: - last_state_message: HumanMessage = planner_messages[-1] - # remove image from last state message - new_msg = '' - if isinstance(last_state_message.content, list): - for msg in last_state_message.content: - if msg['type'] == 'text': - new_msg += msg['text'] - elif msg['type'] == 'image_url': - continue - else: - new_msg = last_state_message.content - - planner_messages[-1] = HumanMessage(content=new_msg) - - # Get planner output - response = await self.settings.planner_llm.ainvoke(planner_messages) - plan = str(response.content) - last_state_message = self.message_manager.get_messages()[-1] - if isinstance(last_state_message, HumanMessage): - # remove image from last state message - if isinstance(last_state_message.content, list): - for msg in last_state_message.content: - if msg['type'] == 'text': - msg['text'] += f"\nPlanning Agent outputs plans:\n {plan}\n" - else: - last_state_message.content += f"\nPlanning Agent outputs plans:\n {plan}\n " - - try: - plan_json = json.loads(plan.replace("```json", "").replace("```", "")) - logger.info(f'šŸ“‹ Plans:\n{json.dumps(plan_json, indent=4)}') - - if hasattr(response, "reasoning_content"): - logger.info("🤯 Start Planning Deep Thinking: ") - 
logger.info(response.reasoning_content) - logger.info("🤯 End Planning Deep Thinking") - - except json.JSONDecodeError: - logger.info(f'šŸ“‹ Plans:\n{plan}') - except Exception as e: - logger.debug(f'Error parsing planning analysis: {e}') - logger.info(f'šŸ“‹ Plans: {plan}') - return plan - - @time_execution_async("--step") - async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None: - """Execute one step of the task""" - logger.info(f"\nšŸ“ Step {self.state.n_steps}") - state = None - model_output = None - result: list[ActionResult] = [] - step_start_time = time.time() - tokens = 0 - - try: - state = await self.browser_context.get_state() - await self._raise_if_stopped_or_paused() - - self.message_manager.add_state_message(state, self.state.last_action, self.state.last_result, step_info, - self.settings.use_vision) - - # Run planner at specified intervals if planner is configured - if self.settings.planner_llm and self.state.n_steps % self.settings.planner_interval == 0: - await self._run_planner() - input_messages = self.message_manager.get_messages() - tokens = self._message_manager.state.history.current_tokens - - try: - model_output = await self.get_next_action(input_messages) - self.update_step_info(model_output, step_info) - self.state.n_steps += 1 - - if self.register_new_step_callback: - await self.register_new_step_callback(state, model_output, self.state.n_steps) - - if self.settings.save_conversation_path: - target = self.settings.save_conversation_path + f'_{self.state.n_steps}.txt' - save_conversation(input_messages, model_output, target, - self.settings.save_conversation_path_encoding) - - if self.model_name != "deepseek-reasoner": - # remove prev message - self.message_manager._remove_state_message_by_index(-1) - await self._raise_if_stopped_or_paused() - except Exception as e: - # model call failed, remove last state message from history - self.message_manager._remove_state_message_by_index(-1) - raise e - - result: 
list[ActionResult] = await self.multi_act(model_output.action) - for ret_ in result: - if ret_.extracted_content and "Extracted page" in ret_.extracted_content: - # record every extracted page - if ret_.extracted_content[:100] not in self.state.extracted_content: - self.state.extracted_content += ret_.extracted_content - self.state.last_result = result - self.state.last_action = model_output.action - if len(result) > 0 and result[-1].is_done: - if not self.state.extracted_content: - self.state.extracted_content = step_info.memory - result[-1].extracted_content = self.state.extracted_content - logger.info(f"šŸ“„ Result: {result[-1].extracted_content}") - - self.state.consecutive_failures = 0 - - except InterruptedError: - logger.debug('Agent paused') - self.state.last_result = [ - ActionResult( - error='The agent was paused - now continuing actions might need to be repeated', - include_in_memory=True - ) - ] - return - - except Exception as e: - result = await self._handle_step_error(e) - self.state.last_result = result - - finally: - step_end_time = time.time() - actions = [a.model_dump(exclude_unset=True) for a in model_output.action] if model_output else [] - self.telemetry.capture( - AgentStepTelemetryEvent( - agent_id=self.state.agent_id, - step=self.state.n_steps, - actions=actions, - consecutive_failures=self.state.consecutive_failures, - step_error=[r.error for r in result if r.error] if result else ['No result'], - ) - ) - if not result: - return - - if state: - metadata = StepMetadata( - step_number=self.state.n_steps, - step_start_time=step_start_time, - step_end_time=step_end_time, - input_tokens=tokens, - ) - self._make_history_item(model_output, state, result, metadata) - - async def run(self, max_steps: int = 100) -> AgentHistoryList: - """Execute the task with maximum number of steps""" - try: - self._log_agent_run() - - # Execute initial actions if provided - if self.initial_actions: - result = await self.multi_act(self.initial_actions, 
check_for_new_elements=False) - self.state.last_result = result - - step_info = CustomAgentStepInfo( - task=self.task, - add_infos=self.add_infos, - step_number=1, - max_steps=max_steps, - memory="", - ) - - for step in range(max_steps): - # Check if we should stop due to too many failures - if self.state.consecutive_failures >= self.settings.max_failures: - logger.error(f'āŒ Stopping due to {self.settings.max_failures} consecutive failures') - break - - # Check control flags before each step - if self.state.stopped: - logger.info('Agent stopped') - break - - while self.state.paused: - await asyncio.sleep(0.2) # Small delay to prevent CPU spinning - if self.state.stopped: # Allow stopping while paused - break - - await self.step(step_info) - - if self.state.history.is_done(): - if self.settings.validate_output and step < max_steps - 1: - if not await self._validate_output(): - continue - - await self.log_completion() - break - else: - logger.info("āŒ Failed to complete task in maximum steps") - if not self.state.extracted_content: - self.state.history.history[-1].result[-1].extracted_content = step_info.memory - else: - self.state.history.history[-1].result[-1].extracted_content = self.state.extracted_content - - return self.state.history - - finally: - self.telemetry.capture( - AgentEndTelemetryEvent( - agent_id=self.state.agent_id, - is_done=self.state.history.is_done(), - success=self.state.history.is_successful(), - steps=self.state.n_steps, - max_steps_reached=self.state.n_steps >= max_steps, - errors=self.state.history.errors(), - total_input_tokens=self.state.history.total_input_tokens(), - total_duration_seconds=self.state.history.total_duration_seconds(), - ) - ) - - if not self.injected_browser_context: - await self.browser_context.close() - - if not self.injected_browser and self.browser: - await self.browser.close() - - if self.settings.generate_gif: - output_path: str = 'agent_history.gif' - if isinstance(self.settings.generate_gif, str): - 
output_path = self.settings.generate_gif - - create_history_gif(task=self.task, history=self.state.history, output_path=output_path) diff --git a/src/agent/custom_message_manager.py b/src/agent/custom_message_manager.py deleted file mode 100644 index 99836b2..0000000 --- a/src/agent/custom_message_manager.py +++ /dev/null @@ -1,111 +0,0 @@ -from __future__ import annotations - -import logging -import pdb -from typing import List, Optional, Type, Dict - -from browser_use.agent.message_manager.service import MessageManager -from browser_use.agent.message_manager.views import MessageHistory -from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt -from browser_use.agent.views import ActionResult, AgentStepInfo, ActionModel -from browser_use.browser.views import BrowserState -from browser_use.agent.message_manager.service import MessageManagerSettings -from browser_use.agent.views import ActionResult, AgentOutput, AgentStepInfo, MessageManagerState -from langchain_core.language_models import BaseChatModel -from langchain_anthropic import ChatAnthropic -from langchain_core.language_models import BaseChatModel -from langchain_core.messages import ( - AIMessage, - BaseMessage, - HumanMessage, - ToolMessage, - SystemMessage -) -from langchain_openai import ChatOpenAI -from ..utils.llm import DeepSeekR1ChatOpenAI -from .custom_prompts import CustomAgentMessagePrompt - -logger = logging.getLogger(__name__) - - -class CustomMessageManagerSettings(MessageManagerSettings): - agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt - - -class CustomMessageManager(MessageManager): - def __init__( - self, - task: str, - system_message: SystemMessage, - settings: MessageManagerSettings = MessageManagerSettings(), - state: MessageManagerState = MessageManagerState(), - ): - super().__init__( - task=task, - system_message=system_message, - settings=settings, - state=state - ) - - def _init_messages(self) -> None: - """Initialize the message history with system 
message, context, task, and other initial messages""" - self._add_message_with_tokens(self.system_prompt) - self.context_content = "" - - if self.settings.message_context: - self.context_content += 'Context for the task' + self.settings.message_context - - if self.settings.sensitive_data: - info = f'Here are placeholders for sensitive data: {list(self.settings.sensitive_data.keys())}' - info += 'To use them, write the placeholder name' - self.context_content += info - - if self.settings.available_file_paths: - filepaths_msg = f'Here are file paths you can use: {self.settings.available_file_paths}' - self.context_content += filepaths_msg - - if self.context_content: - context_message = HumanMessage(content=self.context_content) - self._add_message_with_tokens(context_message) - - def cut_messages(self): - """Get current message list, potentially trimmed to max tokens""" - diff = self.state.history.current_tokens - self.settings.max_input_tokens - min_message_len = 2 if self.context_content is not None else 1 - - while diff > 0 and len(self.state.history.messages) > min_message_len: - msg = self.state.history.messages.pop(min_message_len) - self.state.history.current_tokens -= msg.metadata.tokens - diff = self.state.history.current_tokens - self.settings.max_input_tokens - - def add_state_message( - self, - state: BrowserState, - actions: Optional[List[ActionModel]] = None, - result: Optional[List[ActionResult]] = None, - step_info: Optional[AgentStepInfo] = None, - use_vision=True, - ) -> None: - """Add browser state as human message""" - # otherwise add state message and result to next message (which will not stay in memory) - state_message = self.settings.agent_prompt_class( - state, - actions, - result, - include_attributes=self.settings.include_attributes, - step_info=step_info, - ).get_user_message(use_vision) - self._add_message_with_tokens(state_message) - - def _remove_state_message_by_index(self, remove_ind=-1) -> None: - """Remove state message by index 
from history""" - i = len(self.state.history.messages) - 1 - remove_cnt = 0 - while i >= 0: - if isinstance(self.state.history.messages[i].message, HumanMessage): - remove_cnt += 1 - if remove_cnt == abs(remove_ind): - msg = self.state.history.messages.pop(i) - self.state.history.current_tokens -= msg.metadata.tokens - break - i -= 1 diff --git a/src/agent/custom_prompts.py b/src/agent/custom_prompts.py deleted file mode 100644 index 02f1777..0000000 --- a/src/agent/custom_prompts.py +++ /dev/null @@ -1,125 +0,0 @@ -import pdb -from typing import List, Optional - -from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt -from browser_use.agent.views import ActionResult, ActionModel -from browser_use.browser.views import BrowserState -from langchain_core.messages import HumanMessage, SystemMessage -from datetime import datetime -import importlib - -from .custom_views import CustomAgentStepInfo - - -class CustomSystemPrompt(SystemPrompt): - def _load_prompt_template(self) -> None: - """Load the prompt template from the markdown file.""" - try: - # This works both in development and when installed as a package - with importlib.resources.files('src.agent').joinpath('custom_system_prompt.md').open('r') as f: - self.prompt_template = f.read() - except Exception as e: - raise RuntimeError(f'Failed to load system prompt template: {e}') - - def get_system_message(self) -> SystemMessage: - """ - Get the system prompt for the agent. 
- - Returns: - SystemMessage: Formatted system prompt - """ - prompt = self.prompt_template.format(max_actions=self.max_actions_per_step, - available_actions=self.default_action_description) - - return SystemMessage(content=prompt) - - -class CustomAgentMessagePrompt(AgentMessagePrompt): - def __init__( - self, - state: BrowserState, - actions: Optional[List[ActionModel]] = None, - result: Optional[List[ActionResult]] = None, - include_attributes: list[str] = [], - step_info: Optional[CustomAgentStepInfo] = None, - ): - super(CustomAgentMessagePrompt, self).__init__(state=state, - result=result, - include_attributes=include_attributes, - step_info=step_info - ) - self.actions = actions - - def get_user_message(self, use_vision: bool = True) -> HumanMessage: - if self.step_info: - step_info_description = f'Current step: {self.step_info.step_number}/{self.step_info.max_steps}\n' - else: - step_info_description = '' - - time_str = datetime.now().strftime("%Y-%m-%d %H:%M") - step_info_description += f"Current date and time: {time_str}" - - elements_text = self.state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes) - - has_content_above = (self.state.pixels_above or 0) > 0 - has_content_below = (self.state.pixels_below or 0) > 0 - - if elements_text != '': - if has_content_above: - elements_text = ( - f'... {self.state.pixels_above} pixels above - scroll or extract content to see more ...\n{elements_text}' - ) - else: - elements_text = f'[Start of page]\n{elements_text}' - if has_content_below: - elements_text = ( - f'{elements_text}\n... {self.state.pixels_below} pixels below - scroll or extract content to see more ...' - ) - else: - elements_text = f'{elements_text}\n[End of page]' - else: - elements_text = 'empty page' - - state_description = f""" -{step_info_description} -1. Task: {self.step_info.task}. -2. Hints(Optional): -{self.step_info.add_infos} -3. Memory: -{self.step_info.memory} -4. Current url: {self.state.url} -5. 
Available tabs: -{self.state.tabs} -6. Interactive elements: -{elements_text} - """ - - if self.actions and self.result: - state_description += "\n **Previous Actions** \n" - state_description += f'Previous step: {self.step_info.step_number - 1}/{self.step_info.max_steps} \n' - for i, result in enumerate(self.result): - action = self.actions[i] - state_description += f"Previous action {i + 1}/{len(self.result)}: {action.model_dump_json(exclude_unset=True)}\n" - if result.error: - # only use last 300 characters of error - error = result.error.split('\n')[-1] - state_description += ( - f"Error of previous action {i + 1}/{len(self.result)}: ...{error}\n" - ) - if result.include_in_memory: - if result.extracted_content: - state_description += f"Result of previous action {i + 1}/{len(self.result)}: {result.extracted_content}\n" - - if self.state.screenshot and use_vision == True: - # Format message for vision model - return HumanMessage( - content=[ - {'type': 'text', 'text': state_description}, - { - 'type': 'image_url', - 'image_url': {'url': f'data:image/png;base64,{self.state.screenshot}'}, - }, - ] - ) - - return HumanMessage(content=state_description) diff --git a/src/agent/custom_system_prompt.md b/src/agent/custom_system_prompt.md deleted file mode 100644 index 594fdc0..0000000 --- a/src/agent/custom_system_prompt.md +++ /dev/null @@ -1,80 +0,0 @@ -You are an AI agent designed to automate browser tasks. Your goal is to accomplish the ultimate task following the rules. - -# Input Format -Task -Previous steps -Current URL -Open Tabs -Interactive Elements -[index]text -- index: Numeric identifier for interaction -- type: HTML element type (button, input, etc.) -- text: Element description -Example: -[33] - -- Only elements with numeric indexes in [] are interactive -- elements without [] provide only context - -# Response Rules -1. 
RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format: -{{ - "current_state": {{ - "evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Mention if something unexpected happened. Shortly state why/why not.", - "important_contents": "Output important contents closely related to user\'s instruction on the current page. If there is, please output the contents. If not, please output empty string ''.", - "thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If your output of evaluation_previous_goal is 'Failed', please reflect and output your reflection here.", - "next_goal": "Please generate a brief natural language description for the goal of your next actions based on your thought." - }}, - "action": [ - {{"one_action_name": {{// action-specific parameter}}}}, // ... more actions in sequence - ] -}} - -2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {max_actions} actions per sequence. -Common action sequences: -- Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}] -- Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}] -- Actions are executed in the given order -- If the page changes after an action, the sequence is interrupted and you get the new state. -- Only provide the action sequence until an action which changes the page state significantly. -- Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page -- only use multiple actions if it makes sense. 
-- Only chose from below available actions. - -3. ELEMENT INTERACTION: -- Only use indexes of the interactive elements -- Elements marked with "[]Non-interactive text" are non-interactive - -4. NAVIGATION & ERROR HANDLING: -- If no suitable elements exist, use other functions to complete the task -- If stuck, try alternative approaches - like going back to a previous page, new search, new tab etc. -- Handle popups/cookies by accepting or closing them -- Use scroll to find elements you are looking for -- If you want to research something, open a new tab instead of using the current tab -- If captcha pops up, try to solve it - else try a different approach -- If the page is not fully loaded, use wait action - -5. TASK COMPLETION: -- Use the done action as the last action as soon as the ultimate task is complete -- Dont use "done" before you are done with everything the user asked you, except you reach the last step of max_steps. -- If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completly finished set success to true. If not everything the user asked for is completed set success in done to false! -- If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step. -- Don't hallucinate actions -- Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task. - -6. VISUAL CONTEXT: -- When an image is provided, use it to understand the page layout -- Bounding boxes with labels on their top right corner correspond to element indexes - -7. 
Form filling: -- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field. - -8. Long tasks: -- Keep track of the status and subresults in the memory. - -9. Extraction: -- If your task is to find information - call extract_content on the specific pages to get and store the information. -Your responses must be always JSON with the specified format. - -Available Actions: -{available_actions} \ No newline at end of file diff --git a/src/agent/custom_views.py b/src/agent/custom_views.py deleted file mode 100644 index 98c5d4a..0000000 --- a/src/agent/custom_views.py +++ /dev/null @@ -1,67 +0,0 @@ -from dataclasses import dataclass -from typing import Any, Dict, List, Literal, Optional, Type -import uuid - -from browser_use.agent.views import AgentOutput, AgentState, ActionResult, AgentHistoryList, MessageManagerState -from browser_use.controller.registry.views import ActionModel -from pydantic import BaseModel, ConfigDict, Field, create_model - - -@dataclass -class CustomAgentStepInfo: - step_number: int - max_steps: int - task: str - add_infos: str - memory: str - - -class CustomAgentBrain(BaseModel): - """Current state of the agent""" - - evaluation_previous_goal: str - important_contents: str - thought: str - next_goal: str - - -class CustomAgentOutput(AgentOutput): - """Output model for agent - - @dev note: this model is extended with custom actions in AgentService. You can also use some fields that are not in this model as provided by the linter, as long as they are registered in the DynamicActions model. 
- """ - - current_state: CustomAgentBrain - - @staticmethod - def type_with_custom_actions( - custom_actions: Type[ActionModel], - ) -> Type["CustomAgentOutput"]: - """Extend actions with custom actions""" - model_ = create_model( - "CustomAgentOutput", - __base__=CustomAgentOutput, - action=( - list[custom_actions], - Field(..., description='List of actions to execute', json_schema_extra={'min_items': 1}), - ), # Properly annotated field with no default - __module__=CustomAgentOutput.__module__, - ) - model_.__doc__ = 'AgentOutput model with custom actions' - return model_ - - -class CustomAgentState(BaseModel): - agent_id: str = Field(default_factory=lambda: str(uuid.uuid4())) - n_steps: int = 1 - consecutive_failures: int = 0 - last_result: Optional[List['ActionResult']] = None - history: AgentHistoryList = Field(default_factory=lambda: AgentHistoryList(history=[])) - last_plan: Optional[str] = None - paused: bool = False - stopped: bool = False - - message_manager_state: MessageManagerState = Field(default_factory=MessageManagerState) - - last_action: Optional[List['ActionModel']] = None - extracted_content: str = '' diff --git a/src/utils/deep_research.py b/src/agent/deep_research_agent.py similarity index 99% rename from src/utils/deep_research.py rename to src/agent/deep_research_agent.py index 0409385..d96125b 100644 --- a/src/utils/deep_research.py +++ b/src/agent/deep_research_agent.py @@ -10,7 +10,6 @@ import logging from pprint import pprint from uuid import uuid4 from src.utils import utils -from src.agent.custom_agent import CustomAgent import json import re from browser_use.agent.service import Agent @@ -27,7 +26,6 @@ from langchain_core.messages import ( SystemMessage ) from json_repair import repair_json -from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt from src.controller.custom_controller import CustomController from src.browser.custom_browser import CustomBrowser from src.browser.custom_context import 
BrowserContextConfig, BrowserContext @@ -35,6 +33,7 @@ from browser_use.browser.context import ( BrowserContextConfig, BrowserContextWindowSize, ) +from browser_use.agent.service import Agent logger = logging.getLogger(__name__) @@ -224,7 +223,7 @@ Provide your output as a JSON formatted list. Each item in the list must adhere add_infos = "1. Please click on the most relevant link to get information and go deeper, instead of just staying on the search page. \n" \ "2. When opening a PDF file, please remember to extract the content using extract_content instead of simply opening it for the user to view.\n" if use_own_browser: - agent = CustomAgent( + agent = Agent( task=query_tasks[0], llm=llm, add_infos=add_infos, @@ -246,7 +245,7 @@ Provide your output as a JSON formatted list. Each item in the list must adhere await page.close() else: - agents = [CustomAgent( + agents = [Agent( task=task, llm=llm, add_infos=add_infos, @@ -346,7 +345,7 @@ async def generate_final_report(task, history_infos, save_dir, llm, error_msg=No ``` **Furthermore, ensure that the reference list is free of duplicates. Each unique source should be listed only once, regardless of how many times it is cited in the text.** * **ABSOLUTE FINAL OUTPUT RESTRICTION:** **Your output must contain ONLY the finished, publication-ready Markdown report. Do not include ANY extraneous text, phrases, preambles, meta-commentary, or markdown code indicators (e.g., "```markdown```"). The report should begin directly with the title and introductory paragraph, and end directly after the conclusion and the reference list (if applicable).** **Your response will be deemed a failure if this instruction is not followed precisely.** - + **Inputs:** 1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful and how to structure your thinking. 
diff --git a/src/controller/custom_controller.py b/src/controller/custom_controller.py index 9f95fc6..7209e97 100644 --- a/src/controller/custom_controller.py +++ b/src/controller/custom_controller.py @@ -1,11 +1,12 @@ import pdb import pyperclip -from typing import Optional, Type, Callable, Dict, Any, Union, Awaitable +from typing import Optional, Type, Callable, Dict, Any, Union, Awaitable, TypeVar from pydantic import BaseModel from browser_use.agent.views import ActionResult from browser_use.browser.context import BrowserContext from browser_use.controller.service import Controller, DoneAction +from browser_use.controller.registry.service import Registry, RegisteredAction from main_content_extractor import MainContentExtractor from browser_use.controller.views import ( ClickElementAction, @@ -21,22 +22,53 @@ from browser_use.controller.views import ( ) import logging import inspect +import asyncio import os -from src.utils import utils +from langchain_core.language_models.chat_models import BaseChatModel +from browser_use.agent.views import ActionModel, ActionResult + +from src.utils.mcp_client import create_tool_param_model, setup_mcp_client_and_tools + +from browser_use.utils import time_execution_sync logger = logging.getLogger(__name__) +Context = TypeVar('Context') + class CustomController(Controller): def __init__(self, exclude_actions: list[str] = [], output_model: Optional[Type[BaseModel]] = None, ask_assistant_callback: Optional[Union[Callable[[str, BrowserContext], Dict[str, Any]], Callable[ [str, BrowserContext], Awaitable[Dict[str, Any]]]]] = None, - ): super().__init__(exclude_actions=exclude_actions, output_model=output_model) self._register_custom_actions() self.ask_assistant_callback = ask_assistant_callback + self.mcp_client = None + self.mcp_server_config = None + + async def setup_mcp_client(self, mcp_server_config: Optional[Dict[str, Any]] = None): + self.mcp_server_config = mcp_server_config + if self.mcp_server_config: + self.mcp_client = 
await setup_mcp_client_and_tools(self.mcp_server_config) + self.register_mcp_tools() + + def register_mcp_tools(self): + """ + Register the MCP tools used by this controller. + """ + if self.mcp_client: + for server_name in self.mcp_client.server_name_to_tools: + for tool in self.mcp_client.server_name_to_tools[server_name]: + tool_name = f"mcp.{server_name}.{tool.name}" + self.registry.registry.actions[tool_name] = RegisteredAction( + name=tool_name, + description=tool.description, + function=tool, + param_model=create_tool_param_model(tool), + ) + logger.info(f"Add mcp tool: {tool_name}") def _register_custom_actions(self): """Register all custom browser actions""" @@ -95,3 +127,52 @@ class CustomController(Controller): msg = f'Failed to upload file to index {index}: {str(e)}' logger.info(msg) return ActionResult(error=msg) + + @time_execution_sync('--act') + async def act( + self, + action: ActionModel, + browser_context: Optional[BrowserContext] = None, + # + page_extraction_llm: Optional[BaseChatModel] = None, + sensitive_data: Optional[Dict[str, str]] = None, + available_file_paths: Optional[list[str]] = None, + # + context: Context | None = None, + ) -> ActionResult: + """Execute an action""" + + try: + for action_name, params in action.model_dump(exclude_unset=True).items(): + if params is not None: + if action_name.startswith("mcp"): + # this is a mcp tool + logger.debug(f"Invoke MCP tool: {action_name}") + mcp_tool = self.registry.registry.actions.get(action_name).function + result = await mcp_tool.ainvoke(params) + else: + result = await self.registry.execute_action( + action_name, + params, + browser=browser_context, + page_extraction_llm=page_extraction_llm, + sensitive_data=sensitive_data, + available_file_paths=available_file_paths, + context=context, + ) + + if isinstance(result, str): + return ActionResult(extracted_content=result) + elif isinstance(result, ActionResult): + return result + elif result is None: + return ActionResult() + else: + 
raise ValueError(f'Invalid action result type: {type(result)} of {result}') + return ActionResult() + except Exception as e: + raise e + + async def close_mcp_client(self): + if self.mcp_client: + await self.mcp_client.__aexit__(None, None, None) diff --git a/src/utils/agent_state.py b/src/utils/agent_state.py deleted file mode 100644 index 2456a55..0000000 --- a/src/utils/agent_state.py +++ /dev/null @@ -1,31 +0,0 @@ -import asyncio - - -class AgentState: - _instance = None - - def __init__(self): - if not hasattr(self, '_stop_requested'): - self._stop_requested = asyncio.Event() - self.last_valid_state = None # store the last valid browser state - - def __new__(cls): - if cls._instance is None: - cls._instance = super(AgentState, cls).__new__(cls) - return cls._instance - - def request_stop(self): - self._stop_requested.set() - - def clear_stop(self): - self._stop_requested.clear() - self.last_valid_state = None - - def is_stop_requested(self): - return self._stop_requested.is_set() - - def set_last_valid_state(self, state): - self.last_valid_state = state - - def get_last_valid_state(self): - return self.last_valid_state diff --git a/src/utils/config.py b/src/utils/config.py new file mode 100644 index 0000000..0bfd028 --- /dev/null +++ b/src/utils/config.py @@ -0,0 +1,62 @@ +PROVIDER_DISPLAY_NAMES = { + "openai": "OpenAI", + "azure_openai": "Azure OpenAI", + "anthropic": "Anthropic", + "deepseek": "DeepSeek", + "google": "Google", + "alibaba": "Alibaba", + "moonshot": "MoonShot", + "unbound": "Unbound AI", + "ibm": "IBM" +} + +# Predefined model names for common providers +model_names = { + "anthropic": ["claude-3-5-sonnet-20241022", "claude-3-5-sonnet-20240620", "claude-3-opus-20240229"], + "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo", "o3-mini"], + "deepseek": ["deepseek-chat", "deepseek-reasoner"], + "google": ["gemini-2.0-flash", "gemini-2.0-flash-thinking-exp", "gemini-1.5-flash-latest", + "gemini-1.5-flash-8b-latest", 
"gemini-2.0-flash-thinking-exp-01-21", "gemini-2.0-pro-exp-02-05"], + "ollama": ["qwen2.5:7b", "qwen2.5:14b", "qwen2.5:32b", "qwen2.5-coder:14b", "qwen2.5-coder:32b", "llama2:7b", + "deepseek-r1:14b", "deepseek-r1:32b"], + "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"], + "mistral": ["pixtral-large-latest", "mistral-large-latest", "mistral-small-latest", "ministral-8b-latest"], + "alibaba": ["qwen-plus", "qwen-max", "qwen-turbo", "qwen-long"], + "moonshot": ["moonshot-v1-32k-vision-preview", "moonshot-v1-8k-vision-preview"], + "unbound": ["gemini-2.0-flash", "gpt-4o-mini", "gpt-4o", "gpt-4.5-preview"], + "siliconflow": [ + "deepseek-ai/DeepSeek-R1", + "deepseek-ai/DeepSeek-V3", + "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", + "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "deepseek-ai/DeepSeek-V2.5", + "deepseek-ai/deepseek-vl2", + "Qwen/Qwen2.5-72B-Instruct-128K", + "Qwen/Qwen2.5-72B-Instruct", + "Qwen/Qwen2.5-32B-Instruct", + "Qwen/Qwen2.5-14B-Instruct", + "Qwen/Qwen2.5-7B-Instruct", + "Qwen/Qwen2.5-Coder-32B-Instruct", + "Qwen/Qwen2.5-Coder-7B-Instruct", + "Qwen/Qwen2-7B-Instruct", + "Qwen/Qwen2-1.5B-Instruct", + "Qwen/QwQ-32B-Preview", + "Qwen/Qwen2-VL-72B-Instruct", + "Qwen/Qwen2.5-VL-32B-Instruct", + "Qwen/Qwen2.5-VL-72B-Instruct", + "TeleAI/TeleChat2", + "THUDM/glm-4-9b-chat", + "Vendor-A/Qwen/Qwen2.5-72B-Instruct", + "internlm/internlm2_5-7b-chat", + "internlm/internlm2_5-20b-chat", + "Pro/Qwen/Qwen2.5-7B-Instruct", + "Pro/Qwen/Qwen2-7B-Instruct", + "Pro/Qwen/Qwen2-1.5B-Instruct", + "Pro/THUDM/chatglm3-6b", + "Pro/THUDM/glm-4-9b-chat", + ], + "ibm": ["ibm/granite-vision-3.1-2b-preview", "meta-llama/llama-4-maverick-17b-128e-instruct-fp8", + "meta-llama/llama-3-2-90b-vision-instruct"] +} diff --git a/src/utils/llm.py b/src/utils/llm.py deleted file mode 100644 index 0b601ed..0000000 --- a/src/utils/llm.py +++ /dev/null @@ -1,138 +0,0 @@ -from openai import OpenAI 
-import pdb -from langchain_openai import ChatOpenAI -from langchain_core.globals import get_llm_cache -from langchain_core.language_models.base import ( - BaseLanguageModel, - LangSmithParams, - LanguageModelInput, -) -from langchain_core.load import dumpd, dumps -from langchain_core.messages import ( - AIMessage, - SystemMessage, - AnyMessage, - BaseMessage, - BaseMessageChunk, - HumanMessage, - convert_to_messages, - message_chunk_to_message, -) -from langchain_core.outputs import ( - ChatGeneration, - ChatGenerationChunk, - ChatResult, - LLMResult, - RunInfo, -) -from langchain_ollama import ChatOllama -from langchain_core.output_parsers.base import OutputParserLike -from langchain_core.runnables import Runnable, RunnableConfig -from langchain_core.tools import BaseTool - -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Literal, - Optional, - Union, - cast, List, -) - - -class DeepSeekR1ChatOpenAI(ChatOpenAI): - - def __init__(self, *args: Any, **kwargs: Any) -> None: - super().__init__(*args, **kwargs) - self.client = OpenAI( - base_url=kwargs.get("base_url"), - api_key=kwargs.get("api_key") - ) - - async def ainvoke( - self, - input: LanguageModelInput, - config: Optional[RunnableConfig] = None, - *, - stop: Optional[list[str]] = None, - **kwargs: Any, - ) -> AIMessage: - message_history = [] - for input_ in input: - if isinstance(input_, SystemMessage): - message_history.append({"role": "system", "content": input_.content}) - elif isinstance(input_, AIMessage): - message_history.append({"role": "assistant", "content": input_.content}) - else: - message_history.append({"role": "user", "content": input_.content}) - - response = self.client.chat.completions.create( - model=self.model_name, - messages=message_history - ) - - reasoning_content = response.choices[0].message.reasoning_content - content = response.choices[0].message.content - return AIMessage(content=content, reasoning_content=reasoning_content) - - def invoke( - self, - input: 
LanguageModelInput, - config: Optional[RunnableConfig] = None, - *, - stop: Optional[list[str]] = None, - **kwargs: Any, - ) -> AIMessage: - message_history = [] - for input_ in input: - if isinstance(input_, SystemMessage): - message_history.append({"role": "system", "content": input_.content}) - elif isinstance(input_, AIMessage): - message_history.append({"role": "assistant", "content": input_.content}) - else: - message_history.append({"role": "user", "content": input_.content}) - - response = self.client.chat.completions.create( - model=self.model_name, - messages=message_history - ) - - reasoning_content = response.choices[0].message.reasoning_content - content = response.choices[0].message.content - return AIMessage(content=content, reasoning_content=reasoning_content) - - -class DeepSeekR1ChatOllama(ChatOllama): - - async def ainvoke( - self, - input: LanguageModelInput, - config: Optional[RunnableConfig] = None, - *, - stop: Optional[list[str]] = None, - **kwargs: Any, - ) -> AIMessage: - org_ai_message = await super().ainvoke(input=input) - org_content = org_ai_message.content - reasoning_content = org_content.split("</think>")[0].replace("<think>", "") - content = org_content.split("</think>")[1] - if "**JSON Response:**" in content: - content = content.split("**JSON Response:**")[-1] - return AIMessage(content=content, reasoning_content=reasoning_content) - - def invoke( - self, - input: LanguageModelInput, - config: Optional[RunnableConfig] = None, - *, - stop: Optional[list[str]] = None, - **kwargs: Any, - ) -> AIMessage: - org_ai_message = super().invoke(input=input) - org_content = org_ai_message.content - reasoning_content = org_content.split("</think>")[0].replace("<think>", "") - content = org_content.split("</think>")[1] - if "**JSON Response:**" in content: - content = content.split("**JSON Response:**")[-1] - return AIMessage(content=content, reasoning_content=reasoning_content) diff --git a/src/utils/llm_provider.py b/src/utils/llm_provider.py new file mode 100644 index 0000000..33e9328
--- /dev/null +++ b/src/utils/llm_provider.py @@ -0,0 +1,325 @@ +from openai import OpenAI +import pdb +from langchain_openai import ChatOpenAI +from langchain_core.globals import get_llm_cache +from langchain_core.language_models.base import ( + BaseLanguageModel, + LangSmithParams, + LanguageModelInput, +) +import os +from langchain_core.load import dumpd, dumps +from langchain_core.messages import ( + AIMessage, + SystemMessage, + AnyMessage, + BaseMessage, + BaseMessageChunk, + HumanMessage, + convert_to_messages, + message_chunk_to_message, +) +from langchain_core.outputs import ( + ChatGeneration, + ChatGenerationChunk, + ChatResult, + LLMResult, + RunInfo, +) +from langchain_ollama import ChatOllama +from langchain_core.output_parsers.base import OutputParserLike +from langchain_core.runnables import Runnable, RunnableConfig +from langchain_core.tools import BaseTool + +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + Optional, + Union, + cast, List, +) +from langchain_anthropic import ChatAnthropic +from langchain_mistralai import ChatMistralAI +from langchain_google_genai import ChatGoogleGenerativeAI +from langchain_ollama import ChatOllama +from langchain_openai import AzureChatOpenAI, ChatOpenAI +from langchain_ibm import ChatWatsonx + +from src.utils import config + + +class DeepSeekR1ChatOpenAI(ChatOpenAI): + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.client = OpenAI( + base_url=kwargs.get("base_url"), + api_key=kwargs.get("api_key") + ) + + async def ainvoke( + self, + input: LanguageModelInput, + config: Optional[RunnableConfig] = None, + *, + stop: Optional[list[str]] = None, + **kwargs: Any, + ) -> AIMessage: + message_history = [] + for input_ in input: + if isinstance(input_, SystemMessage): + message_history.append({"role": "system", "content": input_.content}) + elif isinstance(input_, AIMessage): + message_history.append({"role": "assistant", "content": 
input_.content}) + else: + message_history.append({"role": "user", "content": input_.content}) + + response = self.client.chat.completions.create( + model=self.model_name, + messages=message_history + ) + + reasoning_content = response.choices[0].message.reasoning_content + content = response.choices[0].message.content + return AIMessage(content=content, reasoning_content=reasoning_content) + + def invoke( + self, + input: LanguageModelInput, + config: Optional[RunnableConfig] = None, + *, + stop: Optional[list[str]] = None, + **kwargs: Any, + ) -> AIMessage: + message_history = [] + for input_ in input: + if isinstance(input_, SystemMessage): + message_history.append({"role": "system", "content": input_.content}) + elif isinstance(input_, AIMessage): + message_history.append({"role": "assistant", "content": input_.content}) + else: + message_history.append({"role": "user", "content": input_.content}) + + response = self.client.chat.completions.create( + model=self.model_name, + messages=message_history + ) + + reasoning_content = response.choices[0].message.reasoning_content + content = response.choices[0].message.content + return AIMessage(content=content, reasoning_content=reasoning_content) + + +class DeepSeekR1ChatOllama(ChatOllama): + + async def ainvoke( + self, + input: LanguageModelInput, + config: Optional[RunnableConfig] = None, + *, + stop: Optional[list[str]] = None, + **kwargs: Any, + ) -> AIMessage: + org_ai_message = await super().ainvoke(input=input) + org_content = org_ai_message.content + reasoning_content = org_content.split("</think>")[0].replace("<think>", "") + content = org_content.split("</think>")[1] + if "**JSON Response:**" in content: + content = content.split("**JSON Response:**")[-1] + return AIMessage(content=content, reasoning_content=reasoning_content) + + def invoke( + self, + input: LanguageModelInput, + config: Optional[RunnableConfig] = None, + *, + stop: Optional[list[str]] = None, + **kwargs: Any, + ) -> AIMessage: + org_ai_message =
super().invoke(input=input) + org_content = org_ai_message.content + reasoning_content = org_content.split("</think>")[0].replace("<think>", "") + content = org_content.split("</think>")[1] + if "**JSON Response:**" in content: + content = content.split("**JSON Response:**")[-1] + return AIMessage(content=content, reasoning_content=reasoning_content) + + +def get_llm_model(provider: str, **kwargs): + """ + Get LLM model + :param provider: LLM provider + :param kwargs: + :return: + """ + if provider not in ["ollama"]: + env_var = f"{provider.upper()}_API_KEY" + api_key = kwargs.get("api_key", "") or os.getenv(env_var, "") + if not api_key: + provider_display = config.PROVIDER_DISPLAY_NAMES.get(provider, provider.upper()) + error_msg = f"💥 {provider_display} API key not found! 🔑 Please set the `{env_var}` environment variable or provide it in the UI." + raise ValueError(error_msg) + kwargs["api_key"] = api_key + + if provider == "anthropic": + if not kwargs.get("base_url", ""): + base_url = "https://api.anthropic.com" + else: + base_url = kwargs.get("base_url") + + return ChatAnthropic( + model=kwargs.get("model_name", "claude-3-5-sonnet-20241022"), + temperature=kwargs.get("temperature", 0.0), + base_url=base_url, + api_key=api_key, + ) + elif provider == 'mistral': + if not kwargs.get("base_url", ""): + base_url = os.getenv("MISTRAL_ENDPOINT", "https://api.mistral.ai/v1") + else: + base_url = kwargs.get("base_url") + if not kwargs.get("api_key", ""): + api_key = os.getenv("MISTRAL_API_KEY", "") + else: + api_key = kwargs.get("api_key") + + return ChatMistralAI( + model=kwargs.get("model_name", "mistral-large-latest"), + temperature=kwargs.get("temperature", 0.0), + base_url=base_url, + api_key=api_key, + ) + elif provider == "openai": + if not kwargs.get("base_url", ""): + base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1") + else: + base_url = kwargs.get("base_url") + + return ChatOpenAI( + model=kwargs.get("model_name", "gpt-4o"), +
temperature=kwargs.get("temperature", 0.0), + base_url=base_url, + api_key=api_key, + ) + elif provider == "deepseek": + if not kwargs.get("base_url", ""): + base_url = os.getenv("DEEPSEEK_ENDPOINT", "") + else: + base_url = kwargs.get("base_url") + + if kwargs.get("model_name", "deepseek-chat") == "deepseek-reasoner": + return DeepSeekR1ChatOpenAI( + model=kwargs.get("model_name", "deepseek-reasoner"), + temperature=kwargs.get("temperature", 0.0), + base_url=base_url, + api_key=api_key, + ) + else: + return ChatOpenAI( + model=kwargs.get("model_name", "deepseek-chat"), + temperature=kwargs.get("temperature", 0.0), + base_url=base_url, + api_key=api_key, + ) + elif provider == "google": + return ChatGoogleGenerativeAI( + model=kwargs.get("model_name", "gemini-2.0-flash-exp"), + temperature=kwargs.get("temperature", 0.0), + api_key=api_key, + ) + elif provider == "ollama": + if not kwargs.get("base_url", ""): + base_url = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434") + else: + base_url = kwargs.get("base_url") + + if "deepseek-r1" in kwargs.get("model_name", "qwen2.5:7b"): + return DeepSeekR1ChatOllama( + model=kwargs.get("model_name", "deepseek-r1:14b"), + temperature=kwargs.get("temperature", 0.0), + num_ctx=kwargs.get("num_ctx", 32000), + base_url=base_url, + ) + else: + return ChatOllama( + model=kwargs.get("model_name", "qwen2.5:7b"), + temperature=kwargs.get("temperature", 0.0), + num_ctx=kwargs.get("num_ctx", 32000), + num_predict=kwargs.get("num_predict", 1024), + base_url=base_url, + ) + elif provider == "azure_openai": + if not kwargs.get("base_url", ""): + base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "") + else: + base_url = kwargs.get("base_url") + api_version = kwargs.get("api_version", "") or os.getenv("AZURE_OPENAI_API_VERSION", "2025-01-01-preview") + return AzureChatOpenAI( + model=kwargs.get("model_name", "gpt-4o"), + temperature=kwargs.get("temperature", 0.0), + api_version=api_version, + azure_endpoint=base_url, + api_key=api_key, + ) + 
elif provider == "alibaba": + if not kwargs.get("base_url", ""): + base_url = os.getenv("ALIBABA_ENDPOINT", "https://dashscope.aliyuncs.com/compatible-mode/v1") + else: + base_url = kwargs.get("base_url") + + return ChatOpenAI( + model=kwargs.get("model_name", "qwen-plus"), + temperature=kwargs.get("temperature", 0.0), + base_url=base_url, + api_key=api_key, + ) + elif provider == "ibm": + parameters = { + "temperature": kwargs.get("temperature", 0.0), + "max_tokens": kwargs.get("num_ctx", 32000) + } + if not kwargs.get("base_url", ""): + base_url = os.getenv("IBM_ENDPOINT", "https://us-south.ml.cloud.ibm.com") + else: + base_url = kwargs.get("base_url") + + return ChatWatsonx( + model_id=kwargs.get("model_name", "ibm/granite-vision-3.1-2b-preview"), + url=base_url, + project_id=os.getenv("IBM_PROJECT_ID"), + apikey=os.getenv("IBM_API_KEY"), + params=parameters + ) + elif provider == "moonshot": + return ChatOpenAI( + model=kwargs.get("model_name", "moonshot-v1-32k-vision-preview"), + temperature=kwargs.get("temperature", 0.0), + base_url=os.getenv("MOONSHOT_ENDPOINT"), + api_key=os.getenv("MOONSHOT_API_KEY"), + ) + elif provider == "unbound": + return ChatOpenAI( + model=kwargs.get("model_name", "gpt-4o-mini"), + temperature=kwargs.get("temperature", 0.0), + base_url=os.getenv("UNBOUND_ENDPOINT", "https://api.getunbound.ai"), + api_key=api_key, + ) + elif provider == "siliconflow": + if not kwargs.get("api_key", ""): + api_key = os.getenv("SiliconFLOW_API_KEY", "") + else: + api_key = kwargs.get("api_key") + if not kwargs.get("base_url", ""): + base_url = os.getenv("SiliconFLOW_ENDPOINT", "") + else: + base_url = kwargs.get("base_url") + return ChatOpenAI( + api_key=api_key, + base_url=base_url, + model_name=kwargs.get("model_name", "Qwen/QwQ-32B"), + temperature=kwargs.get("temperature", 0.0), + ) + else: + raise ValueError(f"Unsupported provider: {provider}") diff --git a/src/utils/mcp_client.py b/src/utils/mcp_client.py index aa5de2b..a5d6fcd 100644 --- 
a/src/utils/mcp_client.py +++ b/src/utils/mcp_client.py @@ -12,12 +12,22 @@ from typing import Optional, Dict, Any, Type from langchain_core.tools import BaseTool from pydantic.v1 import BaseModel, Field from langchain_core.runnables import RunnableConfig +from pydantic import BaseModel, Field, create_model +from typing import Type, Dict, Any, Optional, get_type_hints, List, Union, Annotated, Set +from pydantic import BaseModel, ConfigDict, create_model, Field +from langchain.tools import BaseTool +import inspect +from datetime import datetime, date, time +import uuid +from enum import Enum +import inspect +from browser_use.controller.registry.views import ActionModel +from typing import Type, Dict, Any, Optional, get_type_hints logger = logging.getLogger(__name__) -async def setup_mcp_client_and_tools(mcp_server_config: Dict[str, Any]) -> Tuple[ - Optional[List[BaseTool]], Optional[MultiServerMCPClient]]: +async def setup_mcp_client_and_tools(mcp_server_config: Dict[str, Any]) -> Optional[MultiServerMCPClient]: """ Initializes the MultiServerMCPClient, connects to servers, fetches tools, filters them, and returns a flat list of usable tools and the client instance. 
@@ -33,10 +43,219 @@ async def setup_mcp_client_and_tools(mcp_server_config: Dict[str, Any]) -> Tuple try: client = MultiServerMCPClient(mcp_server_config) await client.__aenter__() - mcp_tools = client.get_tools() - logger.info(f"Total usable MCP tools collected: {len(mcp_tools)}") - return mcp_tools, client + return client except Exception as e: logger.error(f"Failed to setup MCP client or fetch tools: {e}", exc_info=True) - return [], None + return None + + +def create_tool_param_model(tool: BaseTool) -> Type[BaseModel]: + """Creates a Pydantic model from a LangChain tool's schema""" + + # Get tool schema information + json_schema = tool.args_schema + tool_name = tool.name + + # If the tool already has a schema defined, convert it to a new param_model + if json_schema is not None: + + # Create new parameter model + params = {} + + # Process properties if they exist + if 'properties' in json_schema: + # Find required fields + required_fields: Set[str] = set(json_schema.get('required', [])) + + for prop_name, prop_details in json_schema['properties'].items(): + field_type = resolve_type(prop_details, f"{tool_name}_{prop_name}") + + # Check if parameter is required + is_required = prop_name in required_fields + + # Get default value and description + default_value = prop_details.get('default', ... 
if is_required else None) + description = prop_details.get('description', '') + + # Add field constraints + field_kwargs = {'default': default_value} + if description: + field_kwargs['description'] = description + + # Add additional constraints if present + if 'minimum' in prop_details: + field_kwargs['ge'] = prop_details['minimum'] + if 'maximum' in prop_details: + field_kwargs['le'] = prop_details['maximum'] + if 'minLength' in prop_details: + field_kwargs['min_length'] = prop_details['minLength'] + if 'maxLength' in prop_details: + field_kwargs['max_length'] = prop_details['maxLength'] + if 'pattern' in prop_details: + field_kwargs['pattern'] = prop_details['pattern'] + + # Add to parameters dictionary + params[prop_name] = (field_type, Field(**field_kwargs)) + + return create_model( + f'{tool_name}_parameters', + __base__=ActionModel, + **params, # type: ignore + ) + + # If no schema is defined, extract parameters from the _run method + run_method = tool._run + sig = inspect.signature(run_method) + + # Get type hints for better type information + try: + type_hints = get_type_hints(run_method) + except Exception: + type_hints = {} + + params = {} + for name, param in sig.parameters.items(): + # Skip 'self' parameter and any other parameters you want to exclude + if name == 'self': + continue + + # Get annotation from type hints if available, otherwise from signature + annotation = type_hints.get(name, param.annotation) + if annotation == inspect.Parameter.empty: + annotation = Any + + # Use default value if available, otherwise make it required + if param.default != param.empty: + params[name] = (annotation, param.default) + else: + params[name] = (annotation, ...) 
+ + return create_model( + f'{tool_name}_parameters', + __base__=ActionModel, + **params, # type: ignore + ) + + +def resolve_type(prop_details: Dict[str, Any], prefix: str = "") -> Any: + """Recursively resolves JSON schema type to Python/Pydantic type""" + + # Handle reference types + if '$ref' in prop_details: + # In a real application, reference resolution would be needed + return Any + + # Basic type mapping + type_mapping = { + 'string': str, + 'integer': int, + 'number': float, + 'boolean': bool, + 'array': List, + 'object': Dict, + 'null': type(None), + } + + # Handle formatted strings + if prop_details.get('type') == 'string' and 'format' in prop_details: + format_mapping = { + 'date-time': datetime, + 'date': date, + 'time': time, + 'email': str, + 'uri': str, + 'url': str, + 'uuid': uuid.UUID, + 'binary': bytes, + } + return format_mapping.get(prop_details['format'], str) + + # Handle enum types + if 'enum' in prop_details: + enum_values = prop_details['enum'] + # Create dynamic enum class with safe names + enum_dict = {} + for i, v in enumerate(enum_values): + # Ensure enum names are valid Python identifiers + if isinstance(v, str): + key = v.upper().replace(' ', '_').replace('-', '_') + if not key.isidentifier(): + key = f"VALUE_{i}" + else: + key = f"VALUE_{i}" + enum_dict[key] = v + + # Only create enum if we have values + if enum_dict: + return Enum(f"{prefix}_Enum", enum_dict) + return str # Fallback + + # Handle array types + if prop_details.get('type') == 'array' and 'items' in prop_details: + item_type = resolve_type(prop_details['items'], f"{prefix}_item") + return List[item_type] # type: ignore + + # Handle object types with properties + if prop_details.get('type') == 'object' and 'properties' in prop_details: + nested_params = {} + for nested_name, nested_details in prop_details['properties'].items(): + nested_type = resolve_type(nested_details, f"{prefix}_{nested_name}") + # Get required field info + required_fields = 
prop_details.get('required', []) + is_required = nested_name in required_fields + default_value = nested_details.get('default', ... if is_required else None) + description = nested_details.get('description', '') + + field_kwargs = {'default': default_value} + if description: + field_kwargs['description'] = description + + nested_params[nested_name] = (nested_type, Field(**field_kwargs)) + + # Create nested model + nested_model = create_model(f"{prefix}_Model", **nested_params) + return nested_model + + # Handle union types (oneOf, anyOf) + if 'oneOf' in prop_details or 'anyOf' in prop_details: + union_schema = prop_details.get('oneOf') or prop_details.get('anyOf') + union_types = [] + for i, t in enumerate(union_schema): + union_types.append(resolve_type(t, f"{prefix}_{i}")) + + if union_types: + return Union.__getitem__(tuple(union_types)) # type: ignore + return Any + + # Handle allOf (intersection types) + if 'allOf' in prop_details: + nested_params = {} + for i, schema_part in enumerate(prop_details['allOf']): + if 'properties' in schema_part: + for nested_name, nested_details in schema_part['properties'].items(): + nested_type = resolve_type(nested_details, f"{prefix}_allOf_{i}_{nested_name}") + # Check if required + required_fields = schema_part.get('required', []) + is_required = nested_name in required_fields + nested_params[nested_name] = (nested_type, ... 
if is_required else None) + + # Create composite model + if nested_params: + composite_model = create_model(f"{prefix}_CompositeModel", **nested_params) + return composite_model + return Dict + + # Default to basic types + schema_type = prop_details.get('type', 'string') + if isinstance(schema_type, list): + # Handle multiple types (e.g., ["string", "null"]) + non_null_types = [t for t in schema_type if t != 'null'] + if non_null_types: + primary_type = type_mapping.get(non_null_types[0], Any) + if 'null' in schema_type: + return Optional[primary_type] # type: ignore + return primary_type + return Any + + return type_mapping.get(schema_type, Any) diff --git a/src/utils/utils.py b/src/utils/utils.py index 10ebf7a..8703c46 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -8,254 +8,6 @@ import json import gradio as gr import uuid -from langchain_anthropic import ChatAnthropic -from langchain_mistralai import ChatMistralAI -from langchain_google_genai import ChatGoogleGenerativeAI -from langchain_ollama import ChatOllama -from langchain_openai import AzureChatOpenAI, ChatOpenAI -from langchain_ibm import ChatWatsonx - -from .llm import DeepSeekR1ChatOpenAI, DeepSeekR1ChatOllama - -PROVIDER_DISPLAY_NAMES = { - "openai": "OpenAI", - "azure_openai": "Azure OpenAI", - "anthropic": "Anthropic", - "deepseek": "DeepSeek", - "google": "Google", - "alibaba": "Alibaba", - "moonshot": "MoonShot", - "unbound": "Unbound AI", - "ibm": "IBM" -} - - -def get_llm_model(provider: str, **kwargs): - """ - čŽ·å–LLM ęØ”åž‹ - :param provider: ęØ”åž‹ē±»åž‹ - :param kwargs: - :return: - """ - if provider not in ["ollama"]: - env_var = f"{provider.upper()}_API_KEY" - api_key = kwargs.get("api_key", "") or os.getenv(env_var, "") - if not api_key: - raise MissingAPIKeyError(provider, env_var) - kwargs["api_key"] = api_key - - if provider == "anthropic": - if not kwargs.get("base_url", ""): - base_url = "https://api.anthropic.com" - else: - base_url = kwargs.get("base_url") - - return 
ChatAnthropic( - model=kwargs.get("model_name", "claude-3-5-sonnet-20241022"), - temperature=kwargs.get("temperature", 0.0), - base_url=base_url, - api_key=api_key, - ) - elif provider == 'mistral': - if not kwargs.get("base_url", ""): - base_url = os.getenv("MISTRAL_ENDPOINT", "https://api.mistral.ai/v1") - else: - base_url = kwargs.get("base_url") - if not kwargs.get("api_key", ""): - api_key = os.getenv("MISTRAL_API_KEY", "") - else: - api_key = kwargs.get("api_key") - - return ChatMistralAI( - model=kwargs.get("model_name", "mistral-large-latest"), - temperature=kwargs.get("temperature", 0.0), - base_url=base_url, - api_key=api_key, - ) - elif provider == "openai": - if not kwargs.get("base_url", ""): - base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1") - else: - base_url = kwargs.get("base_url") - - return ChatOpenAI( - model=kwargs.get("model_name", "gpt-4o"), - temperature=kwargs.get("temperature", 0.0), - base_url=base_url, - api_key=api_key, - ) - elif provider == "deepseek": - if not kwargs.get("base_url", ""): - base_url = os.getenv("DEEPSEEK_ENDPOINT", "") - else: - base_url = kwargs.get("base_url") - - if kwargs.get("model_name", "deepseek-chat") == "deepseek-reasoner": - return DeepSeekR1ChatOpenAI( - model=kwargs.get("model_name", "deepseek-reasoner"), - temperature=kwargs.get("temperature", 0.0), - base_url=base_url, - api_key=api_key, - ) - else: - return ChatOpenAI( - model=kwargs.get("model_name", "deepseek-chat"), - temperature=kwargs.get("temperature", 0.0), - base_url=base_url, - api_key=api_key, - ) - elif provider == "google": - return ChatGoogleGenerativeAI( - model=kwargs.get("model_name", "gemini-2.0-flash-exp"), - temperature=kwargs.get("temperature", 0.0), - api_key=api_key, - ) - elif provider == "ollama": - if not kwargs.get("base_url", ""): - base_url = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434") - else: - base_url = kwargs.get("base_url") - - if "deepseek-r1" in kwargs.get("model_name", "qwen2.5:7b"): - 
return DeepSeekR1ChatOllama( - model=kwargs.get("model_name", "deepseek-r1:14b"), - temperature=kwargs.get("temperature", 0.0), - num_ctx=kwargs.get("num_ctx", 32000), - base_url=base_url, - ) - else: - return ChatOllama( - model=kwargs.get("model_name", "qwen2.5:7b"), - temperature=kwargs.get("temperature", 0.0), - num_ctx=kwargs.get("num_ctx", 32000), - num_predict=kwargs.get("num_predict", 1024), - base_url=base_url, - ) - elif provider == "azure_openai": - if not kwargs.get("base_url", ""): - base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "") - else: - base_url = kwargs.get("base_url") - api_version = kwargs.get("api_version", "") or os.getenv("AZURE_OPENAI_API_VERSION", "2025-01-01-preview") - return AzureChatOpenAI( - model=kwargs.get("model_name", "gpt-4o"), - temperature=kwargs.get("temperature", 0.0), - api_version=api_version, - azure_endpoint=base_url, - api_key=api_key, - ) - elif provider == "alibaba": - if not kwargs.get("base_url", ""): - base_url = os.getenv("ALIBABA_ENDPOINT", "https://dashscope.aliyuncs.com/compatible-mode/v1") - else: - base_url = kwargs.get("base_url") - - return ChatOpenAI( - model=kwargs.get("model_name", "qwen-plus"), - temperature=kwargs.get("temperature", 0.0), - base_url=base_url, - api_key=api_key, - ) - elif provider == "ibm": - parameters = { - "temperature": kwargs.get("temperature", 0.0), - "max_tokens": kwargs.get("num_ctx", 32000) - } - if not kwargs.get("base_url", ""): - base_url = os.getenv("IBM_ENDPOINT", "https://us-south.ml.cloud.ibm.com") - else: - base_url = kwargs.get("base_url") - - return ChatWatsonx( - model_id=kwargs.get("model_name", "ibm/granite-vision-3.1-2b-preview"), - url=base_url, - project_id=os.getenv("IBM_PROJECT_ID"), - apikey=os.getenv("IBM_API_KEY"), - params=parameters - ) - elif provider == "moonshot": - return ChatOpenAI( - model=kwargs.get("model_name", "moonshot-v1-32k-vision-preview"), - temperature=kwargs.get("temperature", 0.0), - base_url=os.getenv("MOONSHOT_ENDPOINT"), - 
api_key=os.getenv("MOONSHOT_API_KEY"), - ) - elif provider == "unbound": - return ChatOpenAI( - model=kwargs.get("model_name", "gpt-4o-mini"), - temperature=kwargs.get("temperature", 0.0), - base_url=os.getenv("UNBOUND_ENDPOINT", "https://api.getunbound.ai"), - api_key=api_key, - ) - elif provider == "siliconflow": - if not kwargs.get("api_key", ""): - api_key = os.getenv("SiliconFLOW_API_KEY", "") - else: - api_key = kwargs.get("api_key") - if not kwargs.get("base_url", ""): - base_url = os.getenv("SiliconFLOW_ENDPOINT", "") - else: - base_url = kwargs.get("base_url") - return ChatOpenAI( - api_key=api_key, - base_url=base_url, - model_name=kwargs.get("model_name", "Qwen/QwQ-32B"), - temperature=kwargs.get("temperature", 0.0), - ) - else: - raise ValueError(f"Unsupported provider: {provider}") - - -# Predefined model names for common providers -model_names = { - "anthropic": ["claude-3-5-sonnet-20241022", "claude-3-5-sonnet-20240620", "claude-3-opus-20240229"], - "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo", "o3-mini"], - "deepseek": ["deepseek-chat", "deepseek-reasoner"], - "google": ["gemini-2.0-flash", "gemini-2.0-flash-thinking-exp", "gemini-1.5-flash-latest", - "gemini-1.5-flash-8b-latest", "gemini-2.0-flash-thinking-exp-01-21", "gemini-2.0-pro-exp-02-05"], - "ollama": ["qwen2.5:7b", "qwen2.5:14b", "qwen2.5:32b", "qwen2.5-coder:14b", "qwen2.5-coder:32b", "llama2:7b", - "deepseek-r1:14b", "deepseek-r1:32b"], - "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"], - "mistral": ["pixtral-large-latest", "mistral-large-latest", "mistral-small-latest", "ministral-8b-latest"], - "alibaba": ["qwen-plus", "qwen-max", "qwen-turbo", "qwen-long"], - "moonshot": ["moonshot-v1-32k-vision-preview", "moonshot-v1-8k-vision-preview"], - "unbound": ["gemini-2.0-flash", "gpt-4o-mini", "gpt-4o", "gpt-4.5-preview"], - "siliconflow": [ - "deepseek-ai/DeepSeek-R1", - "deepseek-ai/DeepSeek-V3", - "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", - 
"deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", - "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", - "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", - "deepseek-ai/DeepSeek-V2.5", - "deepseek-ai/deepseek-vl2", - "Qwen/Qwen2.5-72B-Instruct-128K", - "Qwen/Qwen2.5-72B-Instruct", - "Qwen/Qwen2.5-32B-Instruct", - "Qwen/Qwen2.5-14B-Instruct", - "Qwen/Qwen2.5-7B-Instruct", - "Qwen/Qwen2.5-Coder-32B-Instruct", - "Qwen/Qwen2.5-Coder-7B-Instruct", - "Qwen/Qwen2-7B-Instruct", - "Qwen/Qwen2-1.5B-Instruct", - "Qwen/QwQ-32B-Preview", - "Qwen/Qwen2-VL-72B-Instruct", - "Qwen/Qwen2.5-VL-32B-Instruct", - "Qwen/Qwen2.5-VL-72B-Instruct", - "TeleAI/TeleChat2", - "THUDM/glm-4-9b-chat", - "Vendor-A/Qwen/Qwen2.5-72B-Instruct", - "internlm/internlm2_5-7b-chat", - "internlm/internlm2_5-20b-chat", - "Pro/Qwen/Qwen2.5-7B-Instruct", - "Pro/Qwen/Qwen2-7B-Instruct", - "Pro/Qwen/Qwen2-1.5B-Instruct", - "Pro/THUDM/chatglm3-6b", - "Pro/THUDM/glm-4-9b-chat", - ], - "ibm": ["ibm/granite-vision-3.1-2b-preview", "meta-llama/llama-4-maverick-17b-128e-instruct-fp8","meta-llama/llama-3-2-90b-vision-instruct"] -} - # Callback to update the model name dropdown based on the selected provider def update_model_dropdown(llm_provider, api_key=None, base_url=None): @@ -276,15 +28,6 @@ def update_model_dropdown(llm_provider, api_key=None, base_url=None): return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True) -class MissingAPIKeyError(Exception): - """Custom exception for missing API key.""" - - def __init__(self, provider: str, env_var: str): - provider_display = PROVIDER_DISPLAY_NAMES.get(provider, provider.upper()) - super().__init__(f"šŸ’„ {provider_display} API key not found! 
šŸ”‘ Please set the " - f"`{env_var}` environment variable or provide it in the UI.") - - def encode_image(img_path): if not img_path: return None diff --git a/src/webui/__init__.py b/src/webui/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/webui/components/__init__.py b/src/webui/components/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/webui/components/agent_settings_tab.py b/src/webui/components/agent_settings_tab.py new file mode 100644 index 0000000..4f69ac1 --- /dev/null +++ b/src/webui/components/agent_settings_tab.py @@ -0,0 +1,228 @@ +import gradio as gr +from gradio.components import Component + +from src.webui.webui_manager import WebuiManager +from src.utils import config + + +def update_model_dropdown(llm_provider): + """ + Update the model name dropdown with predefined models for the selected provider. + """ + # Use predefined models for the selected provider + if llm_provider in config.model_names: + return gr.Dropdown(choices=config.model_names[llm_provider], value=config.model_names[llm_provider][0], + interactive=True) + else: + return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True) + + +def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Component]: + """ + Creates an agent settings tab. 
+ """ + input_components = set(webui_manager.get_components()) + tab_components = {} + + with gr.Group(): + with gr.Column(): + override_system_prompt = gr.Textbox(label="Override system prompt", lines=4, interactive=True) + extend_system_prompt = gr.Textbox(label="Extend system prompt", lines=4, interactive=True) + + with gr.Group(): + with gr.Row(): + llm_provider = gr.Dropdown( + choices=[provider for provider, model in config.model_names.items()], + label="LLM Provider", + value="openai", + info="Select LLM provider for LLM", + interactive=True + ) + llm_model_name = gr.Dropdown( + label="LLM Model Name", + choices=config.model_names['openai'], + value="gpt-4o", + interactive=True, + allow_custom_value=True, + info="Select a model in the dropdown options or directly type a custom model name" + ) + with gr.Row(): + llm_temperature = gr.Slider( + minimum=0.0, + maximum=2.0, + value=0.6, + step=0.1, + label="LLM Temperature", + info="Controls randomness in model outputs", + interactive=True + ) + + use_vision = gr.Checkbox( + label="Use Vision", + value=True, + info="Enable Vision(Input highlighted screenshot into LLM)", + interactive=True + ) + + ollama_num_ctx = gr.Slider( + minimum=2 ** 8, + maximum=2 ** 16, + value=16000, + step=1, + label="Ollama Context Length", + info="Controls max context length model needs to handle (less = faster)", + visible=False, + interactive=True + ) + + with gr.Row(): + llm_base_url = gr.Textbox( + label="Base URL", + value="", + info="API endpoint URL (if required)" + ) + llm_api_key = gr.Textbox( + label="API Key", + type="password", + value="", + info="Your API key (leave blank to use .env)" + ) + + with gr.Group(): + with gr.Row(): + planner_llm_provider = gr.Dropdown( + choices=[provider for provider, model in config.model_names.items()], + value=None, + label="Planner LLM Provider", + info="Select LLM provider for LLM", + interactive=True + ) + planner_llm_model_name = gr.Dropdown( + label="Planner LLM Model Name", + 
interactive=True, + allow_custom_value=True, + info="Select a model in the dropdown options or directly type a custom model name" + ) + with gr.Row(): + planner_llm_temperature = gr.Slider( + minimum=0.0, + maximum=2.0, + value=0.6, + step=0.1, + label="Planner LLM Temperature", + info="Controls randomness in model outputs", + interactive=True + ) + + planner_use_vision = gr.Checkbox( + label="Use Vision(Planner LLM)", + value=False, + info="Enable Vision(Input highlighted screenshot into LLM)", + interactive=True + ) + + planner_ollama_num_ctx = gr.Slider( + minimum=2 ** 8, + maximum=2 ** 16, + value=16000, + step=1, + label="Ollama Context Length", + info="Controls max context length model needs to handle (less = faster)", + visible=False, + interactive=True + ) + + with gr.Row(): + planner_llm_base_url = gr.Textbox( + label="Base URL", + value="", + info="API endpoint URL (if required)" + ) + planner_llm_api_key = gr.Textbox( + label="API Key", + type="password", + value="", + info="Your API key (leave blank to use .env)" + ) + + with gr.Row(): + max_steps = gr.Slider( + minimum=1, + maximum=1000, + value=100, + step=1, + label="Max Run Steps", + info="Maximum number of steps the agent will take", + interactive=True + ) + max_actions = gr.Slider( + minimum=1, + maximum=100, + value=10, + step=1, + label="Max Number of Actions", + info="Maximum number of actions the agent will take per step", + interactive=True + ) + + with gr.Row(): + max_input_tokens = gr.Number( + label="Max Input Tokens", + value=128000, + precision=0, + interactive=True + ) + tool_calling_method = gr.Dropdown( + label="Tool Calling Method", + value="auto", + interactive=True, + allow_custom_value=True, + choices=["auto", "json_schema", "function_calling", "None"], + info="Tool Calls Function Name", + visible=False + ) + tab_components.update(dict( + override_system_prompt=override_system_prompt, + extend_system_prompt=extend_system_prompt, + llm_provider=llm_provider, + 
llm_model_name=llm_model_name, + llm_temperature=llm_temperature, + use_vision=use_vision, + ollama_num_ctx=ollama_num_ctx, + llm_base_url=llm_base_url, + llm_api_key=llm_api_key, + planner_llm_provider=planner_llm_provider, + planner_llm_model_name=planner_llm_model_name, + planner_llm_temperature=planner_llm_temperature, + planner_use_vision=planner_use_vision, + planner_ollama_num_ctx=planner_ollama_num_ctx, + planner_llm_base_url=planner_llm_base_url, + planner_llm_api_key=planner_llm_api_key, + max_steps=max_steps, + max_actions=max_actions, + max_input_tokens=max_input_tokens, + tool_calling_method=tool_calling_method, + + )) + llm_provider.change( + fn=lambda x: gr.update(visible=x == "ollama"), + inputs=llm_provider, + outputs=ollama_num_ctx + ) + llm_provider.change( + lambda provider: update_model_dropdown(provider), + inputs=[llm_provider], + outputs=llm_model_name + ) + planner_llm_provider.change( + fn=lambda x: gr.update(visible=x == "ollama"), + inputs=planner_llm_provider, + outputs=planner_ollama_num_ctx + ) + planner_llm_provider.change( + lambda provider: update_model_dropdown(provider), + inputs=[planner_llm_provider], + outputs=planner_llm_model_name + ) + + return tab_components diff --git a/src/webui/components/browser_settings_tab.py b/src/webui/components/browser_settings_tab.py new file mode 100644 index 0000000..e69de29 diff --git a/src/webui/components/load_save_config_tab.py b/src/webui/components/load_save_config_tab.py new file mode 100644 index 0000000..e69de29 diff --git a/src/webui/components/run_agent_tab.py b/src/webui/components/run_agent_tab.py new file mode 100644 index 0000000..a071a83 --- /dev/null +++ b/src/webui/components/run_agent_tab.py @@ -0,0 +1,4 @@ +import gradio as gr + +def creat_auto_agent_tab(): + pass \ No newline at end of file diff --git a/src/webui/components/run_deep_research_tab.py b/src/webui/components/run_deep_research_tab.py new file mode 100644 index 0000000..e69de29 diff --git 
a/src/webui/interface.py b/src/webui/interface.py new file mode 100644 index 0000000..e2690a9 --- /dev/null +++ b/src/webui/interface.py @@ -0,0 +1,68 @@ +import gradio as gr + +from src.webui.webui_manager import WebuiManager +from src.webui.components.agent_settings_tab import create_agent_settings_tab + +theme_map = { + "Default": gr.themes.Default(), + "Soft": gr.themes.Soft(), + "Monochrome": gr.themes.Monochrome(), + "Glass": gr.themes.Glass(), + "Origin": gr.themes.Origin(), + "Citrus": gr.themes.Citrus(), + "Ocean": gr.themes.Ocean(), + "Base": gr.themes.Base() +} + + +def create_ui(theme_name="Ocean"): + css = """ + .gradio-container { + width: 70vw !important; + max-width: 70% !important; + margin-left: auto !important; + margin-right: auto !important; + padding-top: 10px !important; + } + .header-text { + text-align: center; + margin-bottom: 20px; + } + .theme-section { + margin-bottom: 10px; + padding: 15px; + border-radius: 10px; + } + """ + + ui_manager = WebuiManager() + + with gr.Blocks( + title="Browser Use WebUI", theme=theme_map[theme_name], css=css + ) as demo: + with gr.Row(): + gr.Markdown( + """ + # 🌐 Browser Use WebUI + ### Control your browser with AI assistance + """, + elem_classes=["header-text"], + ) + + with gr.Tabs() as tabs: + with gr.TabItem("āš™ļø Agent Settings"): + ui_manager.add_components("agent_settings", create_agent_settings_tab(ui_manager)) + + with gr.TabItem("🌐 Browser Settings"): + pass + + with gr.TabItem("šŸ¤– Run Agent"): + pass + + with gr.TabItem("🧐 Deep Research"): + pass + + with gr.TabItem("šŸ“ UI Configuration"): + pass + + return demo diff --git a/src/webui/webui_manager.py b/src/webui/webui_manager.py new file mode 100644 index 0000000..ca5135f --- /dev/null +++ b/src/webui/webui_manager.py @@ -0,0 +1,46 @@ +from collections.abc import Generator +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from gradio.components import Component + +from browser_use.browser.browser import Browser +from 
browser_use.browser.context import BrowserContext +from browser_use.agent.service import Agent + + +class WebuiManager: + def __init__(self): + self.id_to_component: dict[str, Component] = {} + self.component_to_id: dict[Component, str] = {} + + self.browser: Browser = None + self.browser_context: BrowserContext = None + self.bu_agent: Agent = None + + def add_components(self, tab_name: str, components_dict: dict[str, "Component"]) -> None: + """ + Add tab components + """ + for comp_name, component in components_dict.items(): + comp_id = f"{tab_name}.{comp_name}" + self.id_to_component[comp_id] = component + self.component_to_id[component] = comp_id + + def get_components(self) -> list["Component"]: + """ + Get all components + """ + return list(self.id_to_component.values()) + + def get_component_by_id(self, comp_id: str) -> "Component": + """ + Get component by id + """ + return self.id_to_component[comp_id] + + def get_id_by_component(self, comp: "Component") -> str: + """ + Get id by component + """ + return self.component_to_id[comp] diff --git a/tests/test_browser_use.py b/tests/test_agents.py similarity index 99% rename from tests/test_browser_use.py rename to tests/test_agents.py index cb321db..27bb704 100644 --- a/tests/test_browser_use.py +++ b/tests/test_agents.py @@ -359,6 +359,6 @@ async def test_browser_use_parallel(): if __name__ == "__main__": - # asyncio.run(test_browser_use_org()) + asyncio.run(test_browser_use_org()) # asyncio.run(test_browser_use_parallel()) - asyncio.run(test_browser_use_custom()) + # asyncio.run(test_browser_use_custom()) diff --git a/tests/test_controller.py b/tests/test_controller.py index 93ed340..ef859ed 100644 --- a/tests/test_controller.py +++ b/tests/test_controller.py @@ -1,6 +1,7 @@ import asyncio import pdb import sys +import time sys.path.append(".") @@ -10,7 +11,7 @@ load_dotenv() async def test_mcp_client(): - from src.utils.mcp_client import setup_mcp_client_and_tools + from src.utils.mcp_client import 
setup_mcp_client_and_tools, create_tool_param_model test_server_config = { "playwright": { @@ -19,13 +20,95 @@ async def test_mcp_client(): "@playwright/mcp@latest", ], "transport": "stdio", + }, + "filesystem": { + "command": "npx", + "args": [ + "-y", + "@modelcontextprotocol/server-filesystem", + "/Users/warmshao/ai_workspace", + ] } } mcp_tools, mcp_client = await setup_mcp_client_and_tools(test_server_config) + for tool in mcp_tools: + tool_param_model = create_tool_param_model(tool) + print(tool.name) + print(tool.description) + print(tool_param_model.model_json_schema()) + pdb.set_trace() + +async def test_controller_with_mcp(): + import os + from src.controller.custom_controller import CustomController + from browser_use.controller.registry.views import ActionModel + + test_server_config = { + # "playwright": { + # "command": "npx", + # "args": [ + # "@playwright/mcp@latest", + # ], + # "transport": "stdio", + # }, + # "filesystem": { + # "command": "npx", + # "args": [ + # "-y", + # "@modelcontextprotocol/server-filesystem", + # "/Users/xxx/ai_workspace", + # ] + # }, + "desktop-commander": { + "command": "npx", + "args": [ + "-y", + "@wonderwhy-er/desktop-commander" + ] + } + } + + controller = CustomController() + await controller.setup_mcp_client(test_server_config) + action_name = "mcp.desktop-commander.execute_command" + action_info = controller.registry.registry.actions[action_name] + param_model = action_info.param_model + print(param_model.model_json_schema()) + params = {"command": f"python ./tmp/test.py" + } + validated_params = param_model(**params) + ActionModel_ = controller.registry.create_action_model() + # Create ActionModel instance with the validated parameters + action_model = ActionModel_(**{action_name: validated_params}) + result = await controller.act(action_model) + result = result.extracted_content + print(result) + if result and "Command is still running. Use read_output to get more output." 
in result and "PID" in result.split("\n")[0]: + pid = int(result.split("\n")[0].split("PID")[-1].strip()) + action_name = "mcp.desktop-commander.read_output" + action_info = controller.registry.registry.actions[action_name] + param_model = action_info.param_model + print(param_model.model_json_schema()) + params = {"pid": pid} + validated_params = param_model(**params) + action_model = ActionModel_(**{action_name: validated_params}) + output_result = "" + while True: + time.sleep(1) + result = await controller.act(action_model) + result = result.extracted_content + if result: + pdb.set_trace() + output_result = result + break + print(output_result) + pdb.set_trace() + await controller.close_mcp_client() pdb.set_trace() if __name__ == '__main__': - asyncio.run(test_mcp_client()) + # asyncio.run(test_mcp_client()) + asyncio.run(test_controller_with_mcp()) diff --git a/tests/test_deep_research.py b/tests/test_deep_research.py deleted file mode 100644 index 762345d..0000000 --- a/tests/test_deep_research.py +++ /dev/null @@ -1,30 +0,0 @@ -import asyncio -import os -from dotenv import load_dotenv - -load_dotenv() -import sys - -sys.path.append(".") - -async def test_deep_research(): - from src.utils.deep_research import deep_research - from src.utils import utils - - task = "write a report about DeepSeek-R1, get its pdf" - llm = utils.get_llm_model( - provider="gemini", - model_name="gemini-2.0-flash-thinking-exp-01-21", - temperature=1.0, - api_key=os.getenv("GOOGLE_API_KEY", "") - ) - - report_content, report_file_path = await deep_research(task=task, llm=llm, agent_state=None, - max_search_iterations=1, - max_query_num=3, - use_own_browser=False) - - - -if __name__ == "__main__": - asyncio.run(test_deep_research()) \ No newline at end of file diff --git a/tests/test_llm_api.py b/tests/test_llm_api.py index 05bc06e..bee1e6b 100644 --- a/tests/test_llm_api.py +++ b/tests/test_llm_api.py @@ -12,6 +12,7 @@ import sys sys.path.append(".") + @dataclass class LLMConfig: 
provider: str @@ -20,6 +21,7 @@ class LLMConfig: base_url: str = None api_key: str = None + def create_message_content(text, image_path=None): content = [{"type": "text", "text": text}] image_format = "png" if image_path and image_path.endswith(".png") else "jpeg" @@ -32,6 +34,7 @@ def create_message_content(text, image_path=None): }) return content + def get_env_value(key, provider): env_mappings = { "openai": {"api_key": "OPENAI_API_KEY", "base_url": "OPENAI_ENDPOINT"}, @@ -40,7 +43,7 @@ def get_env_value(key, provider): "deepseek": {"api_key": "DEEPSEEK_API_KEY", "base_url": "DEEPSEEK_ENDPOINT"}, "mistral": {"api_key": "MISTRAL_API_KEY", "base_url": "MISTRAL_ENDPOINT"}, "alibaba": {"api_key": "ALIBABA_API_KEY", "base_url": "ALIBABA_ENDPOINT"}, - "moonshot":{"api_key": "MOONSHOT_API_KEY", "base_url": "MOONSHOT_ENDPOINT"}, + "moonshot": {"api_key": "MOONSHOT_API_KEY", "base_url": "MOONSHOT_ENDPOINT"}, "ibm": {"api_key": "IBM_API_KEY", "base_url": "IBM_ENDPOINT"} } @@ -48,13 +51,14 @@ def get_env_value(key, provider): return os.getenv(env_mappings[provider][key], "") return "" + def test_llm(config, query, image_path=None, system_message=None): - from src.utils import utils + from src.utils import utils, llm_provider # Special handling for Ollama-based models if config.provider == "ollama": if "deepseek-r1" in config.model_name: - from src.utils.llm import DeepSeekR1ChatOllama + from src.utils.llm_provider import DeepSeekR1ChatOllama llm = DeepSeekR1ChatOllama(model=config.model_name) else: llm = ChatOllama(model=config.model_name) @@ -66,7 +70,7 @@ def test_llm(config, query, image_path=None, system_message=None): return # For other providers, use the standard configuration - llm = utils.get_llm_model( + llm = llm_provider.get_llm_model( provider=config.provider, model_name=config.model_name, temperature=config.temperature, @@ -86,56 +90,62 @@ def test_llm(config, query, image_path=None, system_message=None): print(ai_msg.reasoning_content) print(ai_msg.content) - 
if config.provider == "deepseek" and "deepseek-reasoner" in config.model_name: - print(llm.model_name) - pdb.set_trace() - def test_openai_model(): config = LLMConfig(provider="openai", model_name="gpt-4o") test_llm(config, "Describe this image", "assets/examples/test.png") + def test_google_model(): # Enable your API key first if you haven't: https://ai.google.dev/palm_docs/oauth_quickstart config = LLMConfig(provider="google", model_name="gemini-2.0-flash-exp") test_llm(config, "Describe this image", "assets/examples/test.png") + def test_azure_openai_model(): config = LLMConfig(provider="azure_openai", model_name="gpt-4o") test_llm(config, "Describe this image", "assets/examples/test.png") + def test_deepseek_model(): config = LLMConfig(provider="deepseek", model_name="deepseek-chat") test_llm(config, "Who are you?") + def test_deepseek_r1_model(): config = LLMConfig(provider="deepseek", model_name="deepseek-reasoner") test_llm(config, "Which is greater, 9.11 or 9.8?", system_message="You are a helpful AI assistant.") + def test_ollama_model(): config = LLMConfig(provider="ollama", model_name="qwen2.5:7b") test_llm(config, "Sing a ballad of LangChain.") + def test_deepseek_r1_ollama_model(): config = LLMConfig(provider="ollama", model_name="deepseek-r1:14b") test_llm(config, "How many 'r's are in the word 'strawberry'?") + def test_mistral_model(): config = LLMConfig(provider="mistral", model_name="pixtral-large-latest") test_llm(config, "Describe this image", "assets/examples/test.png") + def test_moonshot_model(): config = LLMConfig(provider="moonshot", model_name="moonshot-v1-32k-vision-preview") test_llm(config, "Describe this image", "assets/examples/test.png") + def test_ibm_model(): config = LLMConfig(provider="ibm", model_name="meta-llama/llama-4-maverick-17b-128e-instruct-fp8") test_llm(config, "Describe this image", "assets/examples/test.png") + if __name__ == "__main__": # test_openai_model() # test_google_model() # test_azure_openai_model() - 
#test_deepseek_model() + # test_deepseek_model() # test_ollama_model() # test_deepseek_r1_model() # test_deepseek_r1_ollama_model() diff --git a/webui.py b/webui.py index 33d7ece..3066ecb 100644 --- a/webui.py +++ b/webui.py @@ -1,1201 +1,16 @@ -import pdb -import logging - -from dotenv import load_dotenv - -load_dotenv() -import os -import glob -import asyncio import argparse -import os - -logger = logging.getLogger(__name__) - -import gradio as gr -import inspect -from functools import wraps - -from browser_use.agent.service import Agent -from playwright.async_api import async_playwright -from browser_use.browser.browser import Browser, BrowserConfig -from browser_use.browser.context import ( - BrowserContextConfig, - BrowserContextWindowSize, -) -from langchain_ollama import ChatOllama -from playwright.async_api import async_playwright -from src.utils.agent_state import AgentState - -from src.utils import utils -from src.agent.custom_agent import CustomAgent -from src.browser.custom_browser import CustomBrowser -from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt -from src.browser.custom_context import BrowserContextConfig, CustomBrowserContext -from src.controller.custom_controller import CustomController -from gradio.themes import Citrus, Default, Glass, Monochrome, Ocean, Origin, Soft, Base -from src.utils.utils import update_model_dropdown, get_latest_files, capture_screenshot, MissingAPIKeyError -from src.utils import utils - -# Global variables for persistence -_global_browser = None -_global_browser_context = None -_global_agent = None - -# Create the global agent state instance -_global_agent_state = AgentState() - -# webui config -webui_config_manager = utils.ConfigManager() - - -def scan_and_register_components(blocks): - """ę‰«ęäø€äøŖ Blocks åÆ¹č±”å¹¶ę³Øå†Œå…¶äø­ēš„ę‰€ęœ‰äŗ¤äŗ’å¼ē»„ä»¶ļ¼Œä½†äøåŒ…ę‹¬ęŒ‰é’®""" - global webui_config_manager - - def traverse_blocks(block, prefix=""): - registered = 0 - - # 处理 Blocks 
č‡Ŗčŗ«ēš„ē»„ä»¶ - if hasattr(block, "children"): - for i, child in enumerate(block.children): - if isinstance(child, gr.components.Component): - # ęŽ’é™¤ęŒ‰é’® (Button) 组件 - if getattr(child, "interactive", False) and not isinstance(child, gr.Button): - name = f"{prefix}component_{i}" - if hasattr(child, "label") and child.label: - # ä½æē”Øę ‡ē­¾ä½œäøŗåē§°ēš„äø€éƒØåˆ† - label = child.label - name = f"{prefix}{label}" - logger.debug(f"Registering component: {name}") - webui_config_manager.register_component(name, child) - registered += 1 - elif hasattr(child, "children"): - # é€’å½’å¤„ē†åµŒå„—ēš„ Blocks - new_prefix = f"{prefix}block_{i}_" - registered += traverse_blocks(child, new_prefix) - - return registered - - total = traverse_blocks(blocks) - logger.info(f"Total registered components: {total}") - - -def save_current_config(): - return webui_config_manager.save_current_config() - - -def update_ui_from_config(config_file): - return webui_config_manager.update_ui_from_config(config_file) - - -def resolve_sensitive_env_variables(text): - """ - Replace environment variable placeholders ($SENSITIVE_*) with their values. - Only replaces variables that start with SENSITIVE_. 
- """ - if not text: - return text - - import re - - # Find all $SENSITIVE_* patterns - env_vars = re.findall(r'\$SENSITIVE_[A-Za-z0-9_]*', text) - - result = text - for var in env_vars: - # Remove the $ prefix to get the actual environment variable name - env_name = var[1:] # removes the $ - env_value = os.getenv(env_name) - if env_value is not None: - # Replace $SENSITIVE_VAR_NAME with its value - result = result.replace(var, env_value) - - return result - - -async def stop_agent(): - """Request the agent to stop and update UI with enhanced feedback""" - global _global_agent - - try: - if _global_agent is not None: - # Request stop - _global_agent.stop() - # Update UI immediately - message = "Stop requested - the agent will halt at the next safe point" - logger.info(f"šŸ›‘ {message}") - - # Return UI updates - return ( - gr.update(value="Stopping...", interactive=False), # stop_button - gr.update(interactive=False), # run_button - ) - except Exception as e: - error_msg = f"Error during stop: {str(e)}" - logger.error(error_msg) - return ( - gr.update(value="Stop", interactive=True), - gr.update(interactive=True) - ) - - -async def stop_research_agent(): - """Request the agent to stop and update UI with enhanced feedback""" - global _global_agent_state - - try: - # Request stop - _global_agent_state.request_stop() - - # Update UI immediately - message = "Stop requested - the agent will halt at the next safe point" - logger.info(f"šŸ›‘ {message}") - - # Return UI updates - return ( # errors_output - gr.update(value="Stopping...", interactive=False), # stop_button - gr.update(interactive=False), # run_button - ) - except Exception as e: - error_msg = f"Error during stop: {str(e)}" - logger.error(error_msg) - return ( - gr.update(value="Stop", interactive=True), - gr.update(interactive=True) - ) - - -async def run_browser_agent( - agent_type, - llm_provider, - llm_model_name, - llm_num_ctx, - llm_temperature, - llm_base_url, - llm_api_key, - use_own_browser, - 
keep_browser_open, - headless, - disable_security, - window_w, - window_h, - save_recording_path, - save_agent_history_path, - save_trace_path, - enable_recording, - task, - add_infos, - max_steps, - use_vision, - max_actions_per_step, - tool_calling_method, - chrome_cdp, - max_input_tokens -): - try: - # Disable recording if the checkbox is unchecked - if not enable_recording: - save_recording_path = None - - # Ensure the recording directory exists if recording is enabled - if save_recording_path: - os.makedirs(save_recording_path, exist_ok=True) - - # Get the list of existing videos before the agent runs - existing_videos = set() - if save_recording_path: - existing_videos = set( - glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) - + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]")) - ) - - task = resolve_sensitive_env_variables(task) - - # Run the agent - llm = utils.get_llm_model( - provider=llm_provider, - model_name=llm_model_name, - num_ctx=llm_num_ctx, - temperature=llm_temperature, - base_url=llm_base_url, - api_key=llm_api_key, - ) - if agent_type == "org": - final_result, errors, model_actions, model_thoughts, trace_file, history_file = await run_org_agent( - llm=llm, - use_own_browser=use_own_browser, - keep_browser_open=keep_browser_open, - headless=headless, - disable_security=disable_security, - window_w=window_w, - window_h=window_h, - save_recording_path=save_recording_path, - save_agent_history_path=save_agent_history_path, - save_trace_path=save_trace_path, - task=task, - max_steps=max_steps, - use_vision=use_vision, - max_actions_per_step=max_actions_per_step, - tool_calling_method=tool_calling_method, - chrome_cdp=chrome_cdp, - max_input_tokens=max_input_tokens - ) - elif agent_type == "custom": - final_result, errors, model_actions, model_thoughts, trace_file, history_file = await run_custom_agent( - llm=llm, - use_own_browser=use_own_browser, - keep_browser_open=keep_browser_open, - headless=headless, - 
disable_security=disable_security, - window_w=window_w, - window_h=window_h, - save_recording_path=save_recording_path, - save_agent_history_path=save_agent_history_path, - save_trace_path=save_trace_path, - task=task, - add_infos=add_infos, - max_steps=max_steps, - use_vision=use_vision, - max_actions_per_step=max_actions_per_step, - tool_calling_method=tool_calling_method, - chrome_cdp=chrome_cdp, - max_input_tokens=max_input_tokens - ) - else: - raise ValueError(f"Invalid agent type: {agent_type}") - - # Get the list of videos after the agent runs (if recording is enabled) - # latest_video = None - # if save_recording_path: - # new_videos = set( - # glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) - # + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]")) - # ) - # if new_videos - existing_videos: - # latest_video = list(new_videos - existing_videos)[0] # Get the first new video - - gif_path = os.path.join(os.path.dirname(__file__), "agent_history.gif") - - return ( - final_result, - errors, - model_actions, - model_thoughts, - gif_path, - trace_file, - history_file, - gr.update(value="Stop", interactive=True), # Re-enable stop button - gr.update(interactive=True) # Re-enable run button - ) - - except MissingAPIKeyError as e: - logger.error(str(e)) - raise gr.Error(str(e), print_exception=False) - - except Exception as e: - import traceback - traceback.print_exc() - errors = str(e) + "\n" + traceback.format_exc() - return ( - '', # final_result - errors, # errors - '', # model_actions - '', # model_thoughts - None, # latest_video - None, # history_file - None, # trace_file - gr.update(value="Stop", interactive=True), # Re-enable stop button - gr.update(interactive=True) # Re-enable run button - ) - - -async def run_org_agent( - llm, - use_own_browser, - keep_browser_open, - headless, - disable_security, - window_w, - window_h, - save_recording_path, - save_agent_history_path, - save_trace_path, - task, - max_steps, - use_vision, - 
max_actions_per_step, - tool_calling_method, - chrome_cdp, - max_input_tokens -): - try: - global _global_browser, _global_browser_context, _global_agent - - extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"] - cdp_url = chrome_cdp - - if use_own_browser: - cdp_url = os.getenv("CHROME_CDP", chrome_cdp) - chrome_path = os.getenv("CHROME_PATH", None) - if chrome_path == "": - chrome_path = None - chrome_user_data = os.getenv("CHROME_USER_DATA", None) - if chrome_user_data: - extra_chromium_args += [f"--user-data-dir={chrome_user_data}"] - else: - chrome_path = None - - if _global_browser is None: - _global_browser = Browser( - config=BrowserConfig( - headless=headless, - cdp_url=cdp_url, - disable_security=disable_security, - chrome_instance_path=chrome_path, - extra_chromium_args=extra_chromium_args, - ) - ) - - if _global_browser_context is None: - _global_browser_context = await _global_browser.new_context( - config=BrowserContextConfig( - trace_path=save_trace_path if save_trace_path else None, - save_recording_path=save_recording_path if save_recording_path else None, - save_downloads_path="./tmp/downloads", - no_viewport=False, - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), - ) - ) - - if _global_agent is None: - _global_agent = Agent( - task=task, - llm=llm, - use_vision=use_vision, - browser=_global_browser, - browser_context=_global_browser_context, - max_actions_per_step=max_actions_per_step, - tool_calling_method=tool_calling_method, - max_input_tokens=max_input_tokens, - generate_gif=True - ) - history = await _global_agent.run(max_steps=max_steps) - - history_file = os.path.join(save_agent_history_path, f"{_global_agent.state.agent_id}.json") - _global_agent.save_history(history_file) - - final_result = history.final_result() - errors = history.errors() - model_actions = history.model_actions() - model_thoughts = history.model_thoughts() - - trace_file = 
get_latest_files(save_trace_path) - - return final_result, errors, model_actions, model_thoughts, trace_file.get('.zip'), history_file - except Exception as e: - import traceback - traceback.print_exc() - errors = str(e) + "\n" + traceback.format_exc() - return '', errors, '', '', None, None - finally: - _global_agent = None - # Handle cleanup based on persistence configuration - if not keep_browser_open: - if _global_browser_context: - await _global_browser_context.close() - _global_browser_context = None - - if _global_browser: - await _global_browser.close() - _global_browser = None - - -async def run_custom_agent( - llm, - use_own_browser, - keep_browser_open, - headless, - disable_security, - window_w, - window_h, - save_recording_path, - save_agent_history_path, - save_trace_path, - task, - add_infos, - max_steps, - use_vision, - max_actions_per_step, - tool_calling_method, - chrome_cdp, - max_input_tokens -): - try: - global _global_browser, _global_browser_context, _global_agent - - extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"] - cdp_url = chrome_cdp - if use_own_browser: - cdp_url = os.getenv("CHROME_CDP", chrome_cdp) - - chrome_path = os.getenv("CHROME_PATH", None) - if chrome_path == "": - chrome_path = None - chrome_user_data = os.getenv("CHROME_USER_DATA", None) - if chrome_user_data: - extra_chromium_args += [f"--user-data-dir={chrome_user_data}"] - else: - chrome_path = None - - controller = CustomController() - - # Initialize global browser if needed - # if chrome_cdp not empty string nor None - if (_global_browser is None) or (cdp_url and cdp_url != "" and cdp_url != None): - _global_browser = CustomBrowser( - config=BrowserConfig( - headless=headless, - disable_security=disable_security, - cdp_url=cdp_url, - chrome_instance_path=chrome_path, - extra_chromium_args=extra_chromium_args, - ) - ) - - if _global_browser_context is None or (chrome_cdp and cdp_url != "" and cdp_url != None): - 
_global_browser_context = await _global_browser.new_context( - config=BrowserContextConfig( - trace_path=save_trace_path if save_trace_path else None, - save_recording_path=save_recording_path if save_recording_path else None, - no_viewport=False, - save_downloads_path="./tmp/downloads", - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), - ) - ) - - # Create and run agent - if _global_agent is None: - _global_agent = CustomAgent( - task=task, - add_infos=add_infos, - use_vision=use_vision, - llm=llm, - browser=_global_browser, - browser_context=_global_browser_context, - controller=controller, - system_prompt_class=CustomSystemPrompt, - agent_prompt_class=CustomAgentMessagePrompt, - max_actions_per_step=max_actions_per_step, - tool_calling_method=tool_calling_method, - max_input_tokens=max_input_tokens, - generate_gif=True - ) - history = await _global_agent.run(max_steps=max_steps) - - history_file = os.path.join(save_agent_history_path, f"{_global_agent.state.agent_id}.json") - _global_agent.save_history(history_file) - - final_result = history.final_result() - errors = history.errors() - model_actions = history.model_actions() - model_thoughts = history.model_thoughts() - - trace_file = get_latest_files(save_trace_path) - - return final_result, errors, model_actions, model_thoughts, trace_file.get('.zip'), history_file - except Exception as e: - import traceback - traceback.print_exc() - errors = str(e) + "\n" + traceback.format_exc() - return '', errors, '', '', None, None - finally: - _global_agent = None - # Handle cleanup based on persistence configuration - if not keep_browser_open: - if _global_browser_context: - await _global_browser_context.close() - _global_browser_context = None - - if _global_browser: - await _global_browser.close() - _global_browser = None - - -async def run_with_stream( - agent_type, - llm_provider, - llm_model_name, - llm_num_ctx, - llm_temperature, - llm_base_url, - llm_api_key, - 
use_own_browser, - keep_browser_open, - headless, - disable_security, - window_w, - window_h, - save_recording_path, - save_agent_history_path, - save_trace_path, - enable_recording, - task, - add_infos, - max_steps, - use_vision, - max_actions_per_step, - tool_calling_method, - chrome_cdp, - max_input_tokens -): - global _global_agent - - stream_vw = 80 - stream_vh = int(80 * window_h // window_w) - if not headless: - result = await run_browser_agent( - agent_type=agent_type, - llm_provider=llm_provider, - llm_model_name=llm_model_name, - llm_num_ctx=llm_num_ctx, - llm_temperature=llm_temperature, - llm_base_url=llm_base_url, - llm_api_key=llm_api_key, - use_own_browser=use_own_browser, - keep_browser_open=keep_browser_open, - headless=headless, - disable_security=disable_security, - window_w=window_w, - window_h=window_h, - save_recording_path=save_recording_path, - save_agent_history_path=save_agent_history_path, - save_trace_path=save_trace_path, - enable_recording=enable_recording, - task=task, - add_infos=add_infos, - max_steps=max_steps, - use_vision=use_vision, - max_actions_per_step=max_actions_per_step, - tool_calling_method=tool_calling_method, - chrome_cdp=chrome_cdp, - max_input_tokens=max_input_tokens - ) - # Add HTML content at the start of the result array - yield [gr.update(visible=False)] + list(result) - else: - try: - # Run the browser agent in the background - agent_task = asyncio.create_task( - run_browser_agent( - agent_type=agent_type, - llm_provider=llm_provider, - llm_model_name=llm_model_name, - llm_num_ctx=llm_num_ctx, - llm_temperature=llm_temperature, - llm_base_url=llm_base_url, - llm_api_key=llm_api_key, - use_own_browser=use_own_browser, - keep_browser_open=keep_browser_open, - headless=headless, - disable_security=disable_security, - window_w=window_w, - window_h=window_h, - save_recording_path=save_recording_path, - save_agent_history_path=save_agent_history_path, - save_trace_path=save_trace_path, - 
enable_recording=enable_recording, - task=task, - add_infos=add_infos, - max_steps=max_steps, - use_vision=use_vision, - max_actions_per_step=max_actions_per_step, - tool_calling_method=tool_calling_method, - chrome_cdp=chrome_cdp, - max_input_tokens=max_input_tokens - ) - ) - - # Initialize values for streaming - html_content = f"

Using browser...

" - final_result = errors = model_actions = model_thoughts = "" - recording_gif = trace = history_file = None - - # Periodically update the stream while the agent task is running - while not agent_task.done(): - try: - encoded_screenshot = await capture_screenshot(_global_browser_context) - if encoded_screenshot is not None: - html_content = f'' - else: - html_content = f"

Waiting for browser session...

" - except Exception as e: - html_content = f"

Waiting for browser session...

" - - if _global_agent and _global_agent.state.stopped: - yield [ - gr.HTML(value=html_content, visible=True), - final_result, - errors, - model_actions, - model_thoughts, - recording_gif, - trace, - history_file, - gr.update(value="Stopping...", interactive=False), # stop_button - gr.update(interactive=False), # run_button - ] - break - else: - yield [ - gr.HTML(value=html_content, visible=True), - final_result, - errors, - model_actions, - model_thoughts, - recording_gif, - trace, - history_file, - gr.update(), # Re-enable stop button - gr.update() # Re-enable run button - ] - await asyncio.sleep(0.1) - - # Once the agent task completes, get the results - try: - result = await agent_task - final_result, errors, model_actions, model_thoughts, recording_gif, trace, history_file, stop_button, run_button = result - except gr.Error: - final_result = "" - model_actions = "" - model_thoughts = "" - recording_gif = trace = history_file = None - - except Exception as e: - errors = f"Agent error: {str(e)}" - - yield [ - gr.HTML(value=html_content, visible=True), - final_result, - errors, - model_actions, - model_thoughts, - recording_gif, - trace, - history_file, - stop_button, - run_button - ] - - except Exception as e: - import traceback - yield [ - gr.HTML( - value=f"

Waiting for browser session...

", - visible=True), - "", - f"Error: {str(e)}\n{traceback.format_exc()}", - "", - "", - None, - None, - None, - gr.update(value="Stop", interactive=True), # Re-enable stop button - gr.update(interactive=True) # Re-enable run button - ] - - -# Define the theme map globally -theme_map = { - "Default": Default(), - "Soft": Soft(), - "Monochrome": Monochrome(), - "Glass": Glass(), - "Origin": Origin(), - "Citrus": Citrus(), - "Ocean": Ocean(), - "Base": Base() -} - - -async def close_global_browser(): - global _global_browser, _global_browser_context - - if _global_browser_context: - await _global_browser_context.close() - _global_browser_context = None - - if _global_browser: - await _global_browser.close() - _global_browser = None - - -async def run_deep_search(research_task, max_search_iteration_input, max_query_per_iter_input, llm_provider, - llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, use_vision, - use_own_browser, headless, chrome_cdp): - from src.utils.deep_research import deep_research - global _global_agent_state - - # Clear any previous stop request - _global_agent_state.clear_stop() - - llm = utils.get_llm_model( - provider=llm_provider, - model_name=llm_model_name, - num_ctx=llm_num_ctx, - temperature=llm_temperature, - base_url=llm_base_url, - api_key=llm_api_key, - ) - markdown_content, file_path = await deep_research(research_task, llm, _global_agent_state, - max_search_iterations=max_search_iteration_input, - max_query_num=max_query_per_iter_input, - use_vision=use_vision, - headless=headless, - use_own_browser=use_own_browser, - chrome_cdp=chrome_cdp - ) - - return markdown_content, file_path, gr.update(value="Stop", interactive=True), gr.update(interactive=True) - - -def create_ui(theme_name="Ocean"): - css = """ - .gradio-container { - width: 60vw !important; - max-width: 60% !important; - margin-left: auto !important; - margin-right: auto !important; - padding-top: 20px !important; - } - .header-text { - text-align: 
center; - margin-bottom: 30px; - } - .theme-section { - margin-bottom: 20px; - padding: 15px; - border-radius: 10px; - } - """ - - with gr.Blocks( - title="Browser Use WebUI", theme=theme_map[theme_name], css=css - ) as demo: - with gr.Row(): - gr.Markdown( - """ - # 🌐 Browser Use WebUI - ### Control your browser with AI assistance - """, - elem_classes=["header-text"], - ) - - with gr.Tabs() as tabs: - with gr.TabItem("āš™ļø Agent Settings", id=1): - with gr.Group(): - agent_type = gr.Radio( - ["org", "custom"], - label="Agent Type", - value="custom", - info="Select the type of agent to use", - interactive=True - ) - with gr.Column(): - max_steps = gr.Slider( - minimum=1, - maximum=200, - value=100, - step=1, - label="Max Run Steps", - info="Maximum number of steps the agent will take", - interactive=True - ) - max_actions_per_step = gr.Slider( - minimum=1, - maximum=100, - value=10, - step=1, - label="Max Actions per Step", - info="Maximum number of actions the agent will take per step", - interactive=True - ) - with gr.Column(): - use_vision = gr.Checkbox( - label="Use Vision", - value=True, - info="Enable visual processing capabilities", - interactive=True - ) - max_input_tokens = gr.Number( - label="Max Input Tokens", - value=128000, - precision=0, - interactive=True - ) - tool_calling_method = gr.Dropdown( - label="Tool Calling Method", - value="auto", - interactive=True, - allow_custom_value=True, # Allow users to input custom model names - choices=["auto", "json_schema", "function_calling"], - info="Tool Calls Funtion Name", - visible=False - ) - - with gr.TabItem("šŸ”§ LLM Settings", id=2): - with gr.Group(): - llm_provider = gr.Dropdown( - choices=[provider for provider, model in utils.model_names.items()], - label="LLM Provider", - value="openai", - info="Select your preferred language model provider", - interactive=True - ) - llm_model_name = gr.Dropdown( - label="Model Name", - choices=utils.model_names['openai'], - value="gpt-4o", - interactive=True, 
- allow_custom_value=True, # Allow users to input custom model names - info="Select a model in the dropdown options or directly type a custom model name" - ) - ollama_num_ctx = gr.Slider( - minimum=2 ** 8, - maximum=2 ** 16, - value=16000, - step=1, - label="Ollama Context Length", - info="Controls max context length model needs to handle (less = faster)", - visible=False, - interactive=True - ) - llm_temperature = gr.Slider( - minimum=0.0, - maximum=2.0, - value=0.6, - step=0.1, - label="Temperature", - info="Controls randomness in model outputs", - interactive=True - ) - with gr.Row(): - llm_base_url = gr.Textbox( - label="Base URL", - value="", - info="API endpoint URL (if required)" - ) - llm_api_key = gr.Textbox( - label="API Key", - type="password", - value="", - info="Your API key (leave blank to use .env)" - ) - - # Change event to update context length slider - def update_llm_num_ctx_visibility(llm_provider): - return gr.update(visible=llm_provider == "ollama") - - # Bind the change event of llm_provider to update the visibility of context length slider - llm_provider.change( - fn=update_llm_num_ctx_visibility, - inputs=llm_provider, - outputs=ollama_num_ctx - ) - - with gr.TabItem("🌐 Browser Settings", id=3): - with gr.Group(): - with gr.Row(): - use_own_browser = gr.Checkbox( - label="Use Own Browser", - value=False, - info="Use your existing browser instance", - interactive=True - ) - keep_browser_open = gr.Checkbox( - label="Keep Browser Open", - value=False, - info="Keep Browser Open between Tasks", - interactive=True - ) - headless = gr.Checkbox( - label="Headless Mode", - value=False, - info="Run browser without GUI", - interactive=True - ) - disable_security = gr.Checkbox( - label="Disable Security", - value=True, - info="Disable browser security features", - interactive=True - ) - enable_recording = gr.Checkbox( - label="Enable Recording", - value=True, - info="Enable saving browser recordings", - interactive=True - ) - - with gr.Row(): - window_w 
= gr.Number( - label="Window Width", - value=1280, - info="Browser window width", - interactive=True - ) - window_h = gr.Number( - label="Window Height", - value=1100, - info="Browser window height", - interactive=True - ) - - chrome_cdp = gr.Textbox( - label="CDP URL", - placeholder="http://localhost:9222", - value="", - info="CDP for google remote debugging", - interactive=True, # Allow editing only if recording is enabled - ) - - save_recording_path = gr.Textbox( - label="Recording Path", - placeholder="e.g. ./tmp/record_videos", - value="./tmp/record_videos", - info="Path to save browser recordings", - interactive=True, # Allow editing only if recording is enabled - ) - - save_trace_path = gr.Textbox( - label="Trace Path", - placeholder="e.g. ./tmp/traces", - value="./tmp/traces", - info="Path to save Agent traces", - interactive=True, - ) - - save_agent_history_path = gr.Textbox( - label="Agent History Save Path", - placeholder="e.g., ./tmp/agent_history", - value="./tmp/agent_history", - info="Specify the directory where agent history should be saved.", - interactive=True, - ) - - with gr.TabItem("šŸ¤– Run Agent", id=4): - task = gr.Textbox( - label="Task Description", - lines=4, - placeholder="Enter your task here...", - value="go to google.com and type 'OpenAI' click search and give me the first url", - info="Describe what you want the agent to do", - interactive=True - ) - add_infos = gr.Textbox( - label="Additional Information", - lines=3, - placeholder="Add any helpful context or instructions...", - info="Optional hints to help the LLM complete the task", - value="", - interactive=True - ) - - with gr.Row(): - run_button = gr.Button("ā–¶ļø Run Agent", variant="primary", scale=2) - stop_button = gr.Button("ā¹ļø Stop", variant="stop", scale=1) - - with gr.Row(): - browser_view = gr.HTML( - value="

Waiting for browser session...

", - label="Live Browser View", - visible=False - ) - - gr.Markdown("### Results") - with gr.Row(): - with gr.Column(): - final_result_output = gr.Textbox( - label="Final Result", lines=3, show_label=True - ) - with gr.Column(): - errors_output = gr.Textbox( - label="Errors", lines=3, show_label=True - ) - with gr.Row(): - with gr.Column(): - model_actions_output = gr.Textbox( - label="Model Actions", lines=3, show_label=True, visible=False - ) - with gr.Column(): - model_thoughts_output = gr.Textbox( - label="Model Thoughts", lines=3, show_label=True, visible=False - ) - recording_gif = gr.Image(label="Result GIF", format="gif") - trace_file = gr.File(label="Trace File") - agent_history_file = gr.File(label="Agent History") - - with gr.TabItem("🧐 Deep Research", id=5): - research_task_input = gr.Textbox(label="Research Task", lines=5, - value="Compose a report on the use of Reinforcement Learning for training Large Language Models, encompassing its origins, current advancements, and future prospects, substantiated with examples of relevant models and techniques. 
The report should reflect original insights and analysis, moving beyond mere summarization of existing literature.", - interactive=True) - with gr.Row(): - max_search_iteration_input = gr.Number(label="Max Search Iteration", value=3, - precision=0, - interactive=True) # precision=0 ē”®äæę˜Æę•“ę•° - max_query_per_iter_input = gr.Number(label="Max Query per Iteration", value=1, - precision=0, - interactive=True) # precision=0 ē”®äæę˜Æę•“ę•° - with gr.Row(): - research_button = gr.Button("ā–¶ļø Run Deep Research", variant="primary", scale=2) - stop_research_button = gr.Button("ā¹ Stop", variant="stop", scale=1) - markdown_output_display = gr.Markdown(label="Research Report") - markdown_download = gr.File(label="Download Research Report") - - # Bind the stop button click event after errors_output is defined - stop_button.click( - fn=stop_agent, - inputs=[], - outputs=[stop_button, run_button], - ) - - # Run button click handler - run_button.click( - fn=run_with_stream, - inputs=[ - agent_type, llm_provider, llm_model_name, ollama_num_ctx, llm_temperature, llm_base_url, - llm_api_key, - use_own_browser, keep_browser_open, headless, disable_security, window_w, window_h, - save_recording_path, save_agent_history_path, save_trace_path, # Include the new path - enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step, - tool_calling_method, chrome_cdp, max_input_tokens - ], - outputs=[ - browser_view, # Browser view - final_result_output, # Final result - errors_output, # Errors - model_actions_output, # Model actions - model_thoughts_output, # Model thoughts - recording_gif, # Latest recording - trace_file, # Trace file - agent_history_file, # Agent history file - stop_button, # Stop button - run_button # Run button - ], - ) - - # Run Deep Research - research_button.click( - fn=run_deep_search, - inputs=[research_task_input, max_search_iteration_input, max_query_per_iter_input, llm_provider, - llm_model_name, ollama_num_ctx, llm_temperature, 
llm_base_url, llm_api_key, use_vision, - use_own_browser, headless, chrome_cdp], - outputs=[markdown_output_display, markdown_download, stop_research_button, research_button] - ) - # Bind the stop button click event after errors_output is defined - stop_research_button.click( - fn=stop_research_agent, - inputs=[], - outputs=[stop_research_button, research_button], - ) - - with gr.TabItem("šŸŽ„ Recordings", id=7, visible=True): - def list_recordings(save_recording_path): - if not os.path.exists(save_recording_path): - return [] - - # Get all video files - recordings = glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) + glob.glob( - os.path.join(save_recording_path, "*.[wW][eE][bB][mM]")) - - # Sort recordings by creation time (oldest first) - recordings.sort(key=os.path.getctime) - - # Add numbering to the recordings - numbered_recordings = [] - for idx, recording in enumerate(recordings, start=1): - filename = os.path.basename(recording) - numbered_recordings.append((recording, f"{idx}. 
{filename}")) - - return numbered_recordings - - recordings_gallery = gr.Gallery( - label="Recordings", - columns=3, - height="auto", - object_fit="contain" - ) - - refresh_button = gr.Button("šŸ”„ Refresh Recordings", variant="secondary") - refresh_button.click( - fn=list_recordings, - inputs=save_recording_path, - outputs=recordings_gallery - ) - - with gr.TabItem("šŸ“ UI Configuration", id=8): - config_file_input = gr.File( - label="Load UI Settings from Config File", - file_types=[".json"], - interactive=True - ) - with gr.Row(): - load_config_button = gr.Button("Load Config", variant="primary") - save_config_button = gr.Button("Save UI Settings", variant="primary") - - config_status = gr.Textbox( - label="Status", - lines=2, - interactive=False - ) - save_config_button.click( - fn=save_current_config, - inputs=[], # äøéœ€č¦č¾“å…„å‚ę•° - outputs=[config_status] - ) - - # Attach the callback to the LLM provider dropdown - llm_provider.change( - lambda provider, api_key, base_url: update_model_dropdown(provider, api_key, base_url), - inputs=[llm_provider, llm_api_key, llm_base_url], - outputs=llm_model_name - ) - - # Add this after defining the components - enable_recording.change( - lambda enabled: gr.update(interactive=enabled), - inputs=enable_recording, - outputs=save_recording_path - ) - - use_own_browser.change(fn=close_global_browser) - keep_browser_open.change(fn=close_global_browser) - - scan_and_register_components(demo) - global webui_config_manager - all_components = webui_config_manager.get_all_components() - - load_config_button.click( - fn=update_ui_from_config, - inputs=[config_file_input], - outputs=all_components + [config_status] - ) - return demo +from src.webui.interface import theme_map, create_ui def main(): - parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent") + parser = argparse.ArgumentParser(description="Gradio WebUI for Browser Agent") parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP 
address to bind to") parser.add_argument("--port", type=int, default=7788, help="Port to listen on") parser.add_argument("--theme", type=str, default="Ocean", choices=theme_map.keys(), help="Theme to use for the UI") args = parser.parse_args() demo = create_ui(theme_name=args.theme) - demo.launch(server_name=args.ip, server_port=args.port) + demo.queue().launch(server_name=args.ip, server_port=args.port) if __name__ == '__main__': diff --git a/webui2.py b/webui2.py new file mode 100644 index 0000000..33d7ece --- /dev/null +++ b/webui2.py @@ -0,0 +1,1202 @@ +import pdb +import logging + +from dotenv import load_dotenv + +load_dotenv() +import os +import glob +import asyncio +import argparse +import os + +logger = logging.getLogger(__name__) + +import gradio as gr +import inspect +from functools import wraps + +from browser_use.agent.service import Agent +from playwright.async_api import async_playwright +from browser_use.browser.browser import Browser, BrowserConfig +from browser_use.browser.context import ( + BrowserContextConfig, + BrowserContextWindowSize, +) +from langchain_ollama import ChatOllama +from playwright.async_api import async_playwright +from src.utils.agent_state import AgentState + +from src.utils import utils +from src.agent.custom_agent import CustomAgent +from src.browser.custom_browser import CustomBrowser +from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt +from src.browser.custom_context import BrowserContextConfig, CustomBrowserContext +from src.controller.custom_controller import CustomController +from gradio.themes import Citrus, Default, Glass, Monochrome, Ocean, Origin, Soft, Base +from src.utils.utils import update_model_dropdown, get_latest_files, capture_screenshot, MissingAPIKeyError +from src.utils import utils + +# Global variables for persistence +_global_browser = None +_global_browser_context = None +_global_agent = None + +# Create the global agent state instance +_global_agent_state = 
AgentState() + +# webui config +webui_config_manager = utils.ConfigManager() + + +def scan_and_register_components(blocks): + """ę‰«ęäø€äøŖ Blocks åÆ¹č±”å¹¶ę³Øå†Œå…¶äø­ēš„ę‰€ęœ‰äŗ¤äŗ’å¼ē»„ä»¶ļ¼Œä½†äøåŒ…ę‹¬ęŒ‰é’®""" + global webui_config_manager + + def traverse_blocks(block, prefix=""): + registered = 0 + + # 处理 Blocks č‡Ŗčŗ«ēš„ē»„ä»¶ + if hasattr(block, "children"): + for i, child in enumerate(block.children): + if isinstance(child, gr.components.Component): + # ęŽ’é™¤ęŒ‰é’® (Button) 组件 + if getattr(child, "interactive", False) and not isinstance(child, gr.Button): + name = f"{prefix}component_{i}" + if hasattr(child, "label") and child.label: + # ä½æē”Øę ‡ē­¾ä½œäøŗåē§°ēš„äø€éƒØåˆ† + label = child.label + name = f"{prefix}{label}" + logger.debug(f"Registering component: {name}") + webui_config_manager.register_component(name, child) + registered += 1 + elif hasattr(child, "children"): + # é€’å½’å¤„ē†åµŒå„—ēš„ Blocks + new_prefix = f"{prefix}block_{i}_" + registered += traverse_blocks(child, new_prefix) + + return registered + + total = traverse_blocks(blocks) + logger.info(f"Total registered components: {total}") + + +def save_current_config(): + return webui_config_manager.save_current_config() + + +def update_ui_from_config(config_file): + return webui_config_manager.update_ui_from_config(config_file) + + +def resolve_sensitive_env_variables(text): + """ + Replace environment variable placeholders ($SENSITIVE_*) with their values. + Only replaces variables that start with SENSITIVE_. 
+ """ + if not text: + return text + + import re + + # Find all $SENSITIVE_* patterns + env_vars = re.findall(r'\$SENSITIVE_[A-Za-z0-9_]*', text) + + result = text + for var in env_vars: + # Remove the $ prefix to get the actual environment variable name + env_name = var[1:] # removes the $ + env_value = os.getenv(env_name) + if env_value is not None: + # Replace $SENSITIVE_VAR_NAME with its value + result = result.replace(var, env_value) + + return result + + +async def stop_agent(): + """Request the agent to stop and update UI with enhanced feedback""" + global _global_agent + + try: + if _global_agent is not None: + # Request stop + _global_agent.stop() + # Update UI immediately + message = "Stop requested - the agent will halt at the next safe point" + logger.info(f"šŸ›‘ {message}") + + # Return UI updates + return ( + gr.update(value="Stopping...", interactive=False), # stop_button + gr.update(interactive=False), # run_button + ) + except Exception as e: + error_msg = f"Error during stop: {str(e)}" + logger.error(error_msg) + return ( + gr.update(value="Stop", interactive=True), + gr.update(interactive=True) + ) + + +async def stop_research_agent(): + """Request the agent to stop and update UI with enhanced feedback""" + global _global_agent_state + + try: + # Request stop + _global_agent_state.request_stop() + + # Update UI immediately + message = "Stop requested - the agent will halt at the next safe point" + logger.info(f"šŸ›‘ {message}") + + # Return UI updates + return ( # errors_output + gr.update(value="Stopping...", interactive=False), # stop_button + gr.update(interactive=False), # run_button + ) + except Exception as e: + error_msg = f"Error during stop: {str(e)}" + logger.error(error_msg) + return ( + gr.update(value="Stop", interactive=True), + gr.update(interactive=True) + ) + + +async def run_browser_agent( + agent_type, + llm_provider, + llm_model_name, + llm_num_ctx, + llm_temperature, + llm_base_url, + llm_api_key, + use_own_browser, + 
keep_browser_open, + headless, + disable_security, + window_w, + window_h, + save_recording_path, + save_agent_history_path, + save_trace_path, + enable_recording, + task, + add_infos, + max_steps, + use_vision, + max_actions_per_step, + tool_calling_method, + chrome_cdp, + max_input_tokens +): + try: + # Disable recording if the checkbox is unchecked + if not enable_recording: + save_recording_path = None + + # Ensure the recording directory exists if recording is enabled + if save_recording_path: + os.makedirs(save_recording_path, exist_ok=True) + + # Get the list of existing videos before the agent runs + existing_videos = set() + if save_recording_path: + existing_videos = set( + glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) + + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]")) + ) + + task = resolve_sensitive_env_variables(task) + + # Run the agent + llm = utils.get_llm_model( + provider=llm_provider, + model_name=llm_model_name, + num_ctx=llm_num_ctx, + temperature=llm_temperature, + base_url=llm_base_url, + api_key=llm_api_key, + ) + if agent_type == "org": + final_result, errors, model_actions, model_thoughts, trace_file, history_file = await run_org_agent( + llm=llm, + use_own_browser=use_own_browser, + keep_browser_open=keep_browser_open, + headless=headless, + disable_security=disable_security, + window_w=window_w, + window_h=window_h, + save_recording_path=save_recording_path, + save_agent_history_path=save_agent_history_path, + save_trace_path=save_trace_path, + task=task, + max_steps=max_steps, + use_vision=use_vision, + max_actions_per_step=max_actions_per_step, + tool_calling_method=tool_calling_method, + chrome_cdp=chrome_cdp, + max_input_tokens=max_input_tokens + ) + elif agent_type == "custom": + final_result, errors, model_actions, model_thoughts, trace_file, history_file = await run_custom_agent( + llm=llm, + use_own_browser=use_own_browser, + keep_browser_open=keep_browser_open, + headless=headless, + 
disable_security=disable_security, + window_w=window_w, + window_h=window_h, + save_recording_path=save_recording_path, + save_agent_history_path=save_agent_history_path, + save_trace_path=save_trace_path, + task=task, + add_infos=add_infos, + max_steps=max_steps, + use_vision=use_vision, + max_actions_per_step=max_actions_per_step, + tool_calling_method=tool_calling_method, + chrome_cdp=chrome_cdp, + max_input_tokens=max_input_tokens + ) + else: + raise ValueError(f"Invalid agent type: {agent_type}") + + # Get the list of videos after the agent runs (if recording is enabled) + # latest_video = None + # if save_recording_path: + # new_videos = set( + # glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) + # + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]")) + # ) + # if new_videos - existing_videos: + # latest_video = list(new_videos - existing_videos)[0] # Get the first new video + + gif_path = os.path.join(os.path.dirname(__file__), "agent_history.gif") + + return ( + final_result, + errors, + model_actions, + model_thoughts, + gif_path, + trace_file, + history_file, + gr.update(value="Stop", interactive=True), # Re-enable stop button + gr.update(interactive=True) # Re-enable run button + ) + + except MissingAPIKeyError as e: + logger.error(str(e)) + raise gr.Error(str(e), print_exception=False) + + except Exception as e: + import traceback + traceback.print_exc() + errors = str(e) + "\n" + traceback.format_exc() + return ( + '', # final_result + errors, # errors + '', # model_actions + '', # model_thoughts + None, # latest_video + None, # history_file + None, # trace_file + gr.update(value="Stop", interactive=True), # Re-enable stop button + gr.update(interactive=True) # Re-enable run button + ) + + +async def run_org_agent( + llm, + use_own_browser, + keep_browser_open, + headless, + disable_security, + window_w, + window_h, + save_recording_path, + save_agent_history_path, + save_trace_path, + task, + max_steps, + use_vision, + 
max_actions_per_step, + tool_calling_method, + chrome_cdp, + max_input_tokens +): + try: + global _global_browser, _global_browser_context, _global_agent + + extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"] + cdp_url = chrome_cdp + + if use_own_browser: + cdp_url = os.getenv("CHROME_CDP", chrome_cdp) + chrome_path = os.getenv("CHROME_PATH", None) + if chrome_path == "": + chrome_path = None + chrome_user_data = os.getenv("CHROME_USER_DATA", None) + if chrome_user_data: + extra_chromium_args += [f"--user-data-dir={chrome_user_data}"] + else: + chrome_path = None + + if _global_browser is None: + _global_browser = Browser( + config=BrowserConfig( + headless=headless, + cdp_url=cdp_url, + disable_security=disable_security, + chrome_instance_path=chrome_path, + extra_chromium_args=extra_chromium_args, + ) + ) + + if _global_browser_context is None: + _global_browser_context = await _global_browser.new_context( + config=BrowserContextConfig( + trace_path=save_trace_path if save_trace_path else None, + save_recording_path=save_recording_path if save_recording_path else None, + save_downloads_path="./tmp/downloads", + no_viewport=False, + browser_window_size=BrowserContextWindowSize( + width=window_w, height=window_h + ), + ) + ) + + if _global_agent is None: + _global_agent = Agent( + task=task, + llm=llm, + use_vision=use_vision, + browser=_global_browser, + browser_context=_global_browser_context, + max_actions_per_step=max_actions_per_step, + tool_calling_method=tool_calling_method, + max_input_tokens=max_input_tokens, + generate_gif=True + ) + history = await _global_agent.run(max_steps=max_steps) + + history_file = os.path.join(save_agent_history_path, f"{_global_agent.state.agent_id}.json") + _global_agent.save_history(history_file) + + final_result = history.final_result() + errors = history.errors() + model_actions = history.model_actions() + model_thoughts = history.model_thoughts() + + trace_file = 
get_latest_files(save_trace_path) + + return final_result, errors, model_actions, model_thoughts, trace_file.get('.zip'), history_file + except Exception as e: + import traceback + traceback.print_exc() + errors = str(e) + "\n" + traceback.format_exc() + return '', errors, '', '', None, None + finally: + _global_agent = None + # Handle cleanup based on persistence configuration + if not keep_browser_open: + if _global_browser_context: + await _global_browser_context.close() + _global_browser_context = None + + if _global_browser: + await _global_browser.close() + _global_browser = None + + +async def run_custom_agent( + llm, + use_own_browser, + keep_browser_open, + headless, + disable_security, + window_w, + window_h, + save_recording_path, + save_agent_history_path, + save_trace_path, + task, + add_infos, + max_steps, + use_vision, + max_actions_per_step, + tool_calling_method, + chrome_cdp, + max_input_tokens +): + try: + global _global_browser, _global_browser_context, _global_agent + + extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"] + cdp_url = chrome_cdp + if use_own_browser: + cdp_url = os.getenv("CHROME_CDP", chrome_cdp) + + chrome_path = os.getenv("CHROME_PATH", None) + if chrome_path == "": + chrome_path = None + chrome_user_data = os.getenv("CHROME_USER_DATA", None) + if chrome_user_data: + extra_chromium_args += [f"--user-data-dir={chrome_user_data}"] + else: + chrome_path = None + + controller = CustomController() + + # Initialize global browser if needed + # if chrome_cdp not empty string nor None + if (_global_browser is None) or (cdp_url and cdp_url != "" and cdp_url != None): + _global_browser = CustomBrowser( + config=BrowserConfig( + headless=headless, + disable_security=disable_security, + cdp_url=cdp_url, + chrome_instance_path=chrome_path, + extra_chromium_args=extra_chromium_args, + ) + ) + + if _global_browser_context is None or (chrome_cdp and cdp_url != "" and cdp_url != None): + 
_global_browser_context = await _global_browser.new_context( + config=BrowserContextConfig( + trace_path=save_trace_path if save_trace_path else None, + save_recording_path=save_recording_path if save_recording_path else None, + no_viewport=False, + save_downloads_path="./tmp/downloads", + browser_window_size=BrowserContextWindowSize( + width=window_w, height=window_h + ), + ) + ) + + # Create and run agent + if _global_agent is None: + _global_agent = CustomAgent( + task=task, + add_infos=add_infos, + use_vision=use_vision, + llm=llm, + browser=_global_browser, + browser_context=_global_browser_context, + controller=controller, + system_prompt_class=CustomSystemPrompt, + agent_prompt_class=CustomAgentMessagePrompt, + max_actions_per_step=max_actions_per_step, + tool_calling_method=tool_calling_method, + max_input_tokens=max_input_tokens, + generate_gif=True + ) + history = await _global_agent.run(max_steps=max_steps) + + history_file = os.path.join(save_agent_history_path, f"{_global_agent.state.agent_id}.json") + _global_agent.save_history(history_file) + + final_result = history.final_result() + errors = history.errors() + model_actions = history.model_actions() + model_thoughts = history.model_thoughts() + + trace_file = get_latest_files(save_trace_path) + + return final_result, errors, model_actions, model_thoughts, trace_file.get('.zip'), history_file + except Exception as e: + import traceback + traceback.print_exc() + errors = str(e) + "\n" + traceback.format_exc() + return '', errors, '', '', None, None + finally: + _global_agent = None + # Handle cleanup based on persistence configuration + if not keep_browser_open: + if _global_browser_context: + await _global_browser_context.close() + _global_browser_context = None + + if _global_browser: + await _global_browser.close() + _global_browser = None + + +async def run_with_stream( + agent_type, + llm_provider, + llm_model_name, + llm_num_ctx, + llm_temperature, + llm_base_url, + llm_api_key, + 
use_own_browser, + keep_browser_open, + headless, + disable_security, + window_w, + window_h, + save_recording_path, + save_agent_history_path, + save_trace_path, + enable_recording, + task, + add_infos, + max_steps, + use_vision, + max_actions_per_step, + tool_calling_method, + chrome_cdp, + max_input_tokens +): + global _global_agent + + stream_vw = 80 + stream_vh = int(80 * window_h // window_w) + if not headless: + result = await run_browser_agent( + agent_type=agent_type, + llm_provider=llm_provider, + llm_model_name=llm_model_name, + llm_num_ctx=llm_num_ctx, + llm_temperature=llm_temperature, + llm_base_url=llm_base_url, + llm_api_key=llm_api_key, + use_own_browser=use_own_browser, + keep_browser_open=keep_browser_open, + headless=headless, + disable_security=disable_security, + window_w=window_w, + window_h=window_h, + save_recording_path=save_recording_path, + save_agent_history_path=save_agent_history_path, + save_trace_path=save_trace_path, + enable_recording=enable_recording, + task=task, + add_infos=add_infos, + max_steps=max_steps, + use_vision=use_vision, + max_actions_per_step=max_actions_per_step, + tool_calling_method=tool_calling_method, + chrome_cdp=chrome_cdp, + max_input_tokens=max_input_tokens + ) + # Add HTML content at the start of the result array + yield [gr.update(visible=False)] + list(result) + else: + try: + # Run the browser agent in the background + agent_task = asyncio.create_task( + run_browser_agent( + agent_type=agent_type, + llm_provider=llm_provider, + llm_model_name=llm_model_name, + llm_num_ctx=llm_num_ctx, + llm_temperature=llm_temperature, + llm_base_url=llm_base_url, + llm_api_key=llm_api_key, + use_own_browser=use_own_browser, + keep_browser_open=keep_browser_open, + headless=headless, + disable_security=disable_security, + window_w=window_w, + window_h=window_h, + save_recording_path=save_recording_path, + save_agent_history_path=save_agent_history_path, + save_trace_path=save_trace_path, + 
enable_recording=enable_recording, + task=task, + add_infos=add_infos, + max_steps=max_steps, + use_vision=use_vision, + max_actions_per_step=max_actions_per_step, + tool_calling_method=tool_calling_method, + chrome_cdp=chrome_cdp, + max_input_tokens=max_input_tokens + ) + ) + + # Initialize values for streaming + html_content = f"

Using browser...

" + final_result = errors = model_actions = model_thoughts = "" + recording_gif = trace = history_file = None + + # Periodically update the stream while the agent task is running + while not agent_task.done(): + try: + encoded_screenshot = await capture_screenshot(_global_browser_context) + if encoded_screenshot is not None: + html_content = f'' + else: + html_content = f"

Waiting for browser session...

" + except Exception as e: + html_content = f"

Waiting for browser session...

" + + if _global_agent and _global_agent.state.stopped: + yield [ + gr.HTML(value=html_content, visible=True), + final_result, + errors, + model_actions, + model_thoughts, + recording_gif, + trace, + history_file, + gr.update(value="Stopping...", interactive=False), # stop_button + gr.update(interactive=False), # run_button + ] + break + else: + yield [ + gr.HTML(value=html_content, visible=True), + final_result, + errors, + model_actions, + model_thoughts, + recording_gif, + trace, + history_file, + gr.update(), # Re-enable stop button + gr.update() # Re-enable run button + ] + await asyncio.sleep(0.1) + + # Once the agent task completes, get the results + try: + result = await agent_task + final_result, errors, model_actions, model_thoughts, recording_gif, trace, history_file, stop_button, run_button = result + except gr.Error: + final_result = "" + model_actions = "" + model_thoughts = "" + recording_gif = trace = history_file = None + + except Exception as e: + errors = f"Agent error: {str(e)}" + + yield [ + gr.HTML(value=html_content, visible=True), + final_result, + errors, + model_actions, + model_thoughts, + recording_gif, + trace, + history_file, + stop_button, + run_button + ] + + except Exception as e: + import traceback + yield [ + gr.HTML( + value=f"

Waiting for browser session...

", + visible=True), + "", + f"Error: {str(e)}\n{traceback.format_exc()}", + "", + "", + None, + None, + None, + gr.update(value="Stop", interactive=True), # Re-enable stop button + gr.update(interactive=True) # Re-enable run button + ] + + +# Define the theme map globally +theme_map = { + "Default": Default(), + "Soft": Soft(), + "Monochrome": Monochrome(), + "Glass": Glass(), + "Origin": Origin(), + "Citrus": Citrus(), + "Ocean": Ocean(), + "Base": Base() +} + + +async def close_global_browser(): + global _global_browser, _global_browser_context + + if _global_browser_context: + await _global_browser_context.close() + _global_browser_context = None + + if _global_browser: + await _global_browser.close() + _global_browser = None + + +async def run_deep_search(research_task, max_search_iteration_input, max_query_per_iter_input, llm_provider, + llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, use_vision, + use_own_browser, headless, chrome_cdp): + from src.utils.deep_research import deep_research + global _global_agent_state + + # Clear any previous stop request + _global_agent_state.clear_stop() + + llm = utils.get_llm_model( + provider=llm_provider, + model_name=llm_model_name, + num_ctx=llm_num_ctx, + temperature=llm_temperature, + base_url=llm_base_url, + api_key=llm_api_key, + ) + markdown_content, file_path = await deep_research(research_task, llm, _global_agent_state, + max_search_iterations=max_search_iteration_input, + max_query_num=max_query_per_iter_input, + use_vision=use_vision, + headless=headless, + use_own_browser=use_own_browser, + chrome_cdp=chrome_cdp + ) + + return markdown_content, file_path, gr.update(value="Stop", interactive=True), gr.update(interactive=True) + + +def create_ui(theme_name="Ocean"): + css = """ + .gradio-container { + width: 60vw !important; + max-width: 60% !important; + margin-left: auto !important; + margin-right: auto !important; + padding-top: 20px !important; + } + .header-text { + text-align: 
center; + margin-bottom: 30px; + } + .theme-section { + margin-bottom: 20px; + padding: 15px; + border-radius: 10px; + } + """ + + with gr.Blocks( + title="Browser Use WebUI", theme=theme_map[theme_name], css=css + ) as demo: + with gr.Row(): + gr.Markdown( + """ + # 🌐 Browser Use WebUI + ### Control your browser with AI assistance + """, + elem_classes=["header-text"], + ) + + with gr.Tabs() as tabs: + with gr.TabItem("āš™ļø Agent Settings", id=1): + with gr.Group(): + agent_type = gr.Radio( + ["org", "custom"], + label="Agent Type", + value="custom", + info="Select the type of agent to use", + interactive=True + ) + with gr.Column(): + max_steps = gr.Slider( + minimum=1, + maximum=200, + value=100, + step=1, + label="Max Run Steps", + info="Maximum number of steps the agent will take", + interactive=True + ) + max_actions_per_step = gr.Slider( + minimum=1, + maximum=100, + value=10, + step=1, + label="Max Actions per Step", + info="Maximum number of actions the agent will take per step", + interactive=True + ) + with gr.Column(): + use_vision = gr.Checkbox( + label="Use Vision", + value=True, + info="Enable visual processing capabilities", + interactive=True + ) + max_input_tokens = gr.Number( + label="Max Input Tokens", + value=128000, + precision=0, + interactive=True + ) + tool_calling_method = gr.Dropdown( + label="Tool Calling Method", + value="auto", + interactive=True, + allow_custom_value=True, # Allow users to input custom model names + choices=["auto", "json_schema", "function_calling"], + info="Tool Calls Funtion Name", + visible=False + ) + + with gr.TabItem("šŸ”§ LLM Settings", id=2): + with gr.Group(): + llm_provider = gr.Dropdown( + choices=[provider for provider, model in utils.model_names.items()], + label="LLM Provider", + value="openai", + info="Select your preferred language model provider", + interactive=True + ) + llm_model_name = gr.Dropdown( + label="Model Name", + choices=utils.model_names['openai'], + value="gpt-4o", + interactive=True, 
+ allow_custom_value=True, # Allow users to input custom model names + info="Select a model in the dropdown options or directly type a custom model name" + ) + ollama_num_ctx = gr.Slider( + minimum=2 ** 8, + maximum=2 ** 16, + value=16000, + step=1, + label="Ollama Context Length", + info="Controls max context length model needs to handle (less = faster)", + visible=False, + interactive=True + ) + llm_temperature = gr.Slider( + minimum=0.0, + maximum=2.0, + value=0.6, + step=0.1, + label="Temperature", + info="Controls randomness in model outputs", + interactive=True + ) + with gr.Row(): + llm_base_url = gr.Textbox( + label="Base URL", + value="", + info="API endpoint URL (if required)" + ) + llm_api_key = gr.Textbox( + label="API Key", + type="password", + value="", + info="Your API key (leave blank to use .env)" + ) + + # Change event to update context length slider + def update_llm_num_ctx_visibility(llm_provider): + return gr.update(visible=llm_provider == "ollama") + + # Bind the change event of llm_provider to update the visibility of context length slider + llm_provider.change( + fn=update_llm_num_ctx_visibility, + inputs=llm_provider, + outputs=ollama_num_ctx + ) + + with gr.TabItem("🌐 Browser Settings", id=3): + with gr.Group(): + with gr.Row(): + use_own_browser = gr.Checkbox( + label="Use Own Browser", + value=False, + info="Use your existing browser instance", + interactive=True + ) + keep_browser_open = gr.Checkbox( + label="Keep Browser Open", + value=False, + info="Keep Browser Open between Tasks", + interactive=True + ) + headless = gr.Checkbox( + label="Headless Mode", + value=False, + info="Run browser without GUI", + interactive=True + ) + disable_security = gr.Checkbox( + label="Disable Security", + value=True, + info="Disable browser security features", + interactive=True + ) + enable_recording = gr.Checkbox( + label="Enable Recording", + value=True, + info="Enable saving browser recordings", + interactive=True + ) + + with gr.Row(): + window_w 
= gr.Number( + label="Window Width", + value=1280, + info="Browser window width", + interactive=True + ) + window_h = gr.Number( + label="Window Height", + value=1100, + info="Browser window height", + interactive=True + ) + + chrome_cdp = gr.Textbox( + label="CDP URL", + placeholder="http://localhost:9222", + value="", + info="CDP for google remote debugging", + interactive=True, # Allow editing only if recording is enabled + ) + + save_recording_path = gr.Textbox( + label="Recording Path", + placeholder="e.g. ./tmp/record_videos", + value="./tmp/record_videos", + info="Path to save browser recordings", + interactive=True, # Allow editing only if recording is enabled + ) + + save_trace_path = gr.Textbox( + label="Trace Path", + placeholder="e.g. ./tmp/traces", + value="./tmp/traces", + info="Path to save Agent traces", + interactive=True, + ) + + save_agent_history_path = gr.Textbox( + label="Agent History Save Path", + placeholder="e.g., ./tmp/agent_history", + value="./tmp/agent_history", + info="Specify the directory where agent history should be saved.", + interactive=True, + ) + + with gr.TabItem("šŸ¤– Run Agent", id=4): + task = gr.Textbox( + label="Task Description", + lines=4, + placeholder="Enter your task here...", + value="go to google.com and type 'OpenAI' click search and give me the first url", + info="Describe what you want the agent to do", + interactive=True + ) + add_infos = gr.Textbox( + label="Additional Information", + lines=3, + placeholder="Add any helpful context or instructions...", + info="Optional hints to help the LLM complete the task", + value="", + interactive=True + ) + + with gr.Row(): + run_button = gr.Button("ā–¶ļø Run Agent", variant="primary", scale=2) + stop_button = gr.Button("ā¹ļø Stop", variant="stop", scale=1) + + with gr.Row(): + browser_view = gr.HTML( + value="

Waiting for browser session...

", + label="Live Browser View", + visible=False + ) + + gr.Markdown("### Results") + with gr.Row(): + with gr.Column(): + final_result_output = gr.Textbox( + label="Final Result", lines=3, show_label=True + ) + with gr.Column(): + errors_output = gr.Textbox( + label="Errors", lines=3, show_label=True + ) + with gr.Row(): + with gr.Column(): + model_actions_output = gr.Textbox( + label="Model Actions", lines=3, show_label=True, visible=False + ) + with gr.Column(): + model_thoughts_output = gr.Textbox( + label="Model Thoughts", lines=3, show_label=True, visible=False + ) + recording_gif = gr.Image(label="Result GIF", format="gif") + trace_file = gr.File(label="Trace File") + agent_history_file = gr.File(label="Agent History") + + with gr.TabItem("🧐 Deep Research", id=5): + research_task_input = gr.Textbox(label="Research Task", lines=5, + value="Compose a report on the use of Reinforcement Learning for training Large Language Models, encompassing its origins, current advancements, and future prospects, substantiated with examples of relevant models and techniques. 
The report should reflect original insights and analysis, moving beyond mere summarization of existing literature.", + interactive=True) + with gr.Row(): + max_search_iteration_input = gr.Number(label="Max Search Iteration", value=3, + precision=0, + interactive=True) # precision=0 ē”®äæę˜Æę•“ę•° + max_query_per_iter_input = gr.Number(label="Max Query per Iteration", value=1, + precision=0, + interactive=True) # precision=0 ē”®äæę˜Æę•“ę•° + with gr.Row(): + research_button = gr.Button("ā–¶ļø Run Deep Research", variant="primary", scale=2) + stop_research_button = gr.Button("ā¹ Stop", variant="stop", scale=1) + markdown_output_display = gr.Markdown(label="Research Report") + markdown_download = gr.File(label="Download Research Report") + + # Bind the stop button click event after errors_output is defined + stop_button.click( + fn=stop_agent, + inputs=[], + outputs=[stop_button, run_button], + ) + + # Run button click handler + run_button.click( + fn=run_with_stream, + inputs=[ + agent_type, llm_provider, llm_model_name, ollama_num_ctx, llm_temperature, llm_base_url, + llm_api_key, + use_own_browser, keep_browser_open, headless, disable_security, window_w, window_h, + save_recording_path, save_agent_history_path, save_trace_path, # Include the new path + enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step, + tool_calling_method, chrome_cdp, max_input_tokens + ], + outputs=[ + browser_view, # Browser view + final_result_output, # Final result + errors_output, # Errors + model_actions_output, # Model actions + model_thoughts_output, # Model thoughts + recording_gif, # Latest recording + trace_file, # Trace file + agent_history_file, # Agent history file + stop_button, # Stop button + run_button # Run button + ], + ) + + # Run Deep Research + research_button.click( + fn=run_deep_search, + inputs=[research_task_input, max_search_iteration_input, max_query_per_iter_input, llm_provider, + llm_model_name, ollama_num_ctx, llm_temperature, 
llm_base_url, llm_api_key, use_vision, + use_own_browser, headless, chrome_cdp], + outputs=[markdown_output_display, markdown_download, stop_research_button, research_button] + ) + # Bind the stop button click event after errors_output is defined + stop_research_button.click( + fn=stop_research_agent, + inputs=[], + outputs=[stop_research_button, research_button], + ) + + with gr.TabItem("šŸŽ„ Recordings", id=7, visible=True): + def list_recordings(save_recording_path): + if not os.path.exists(save_recording_path): + return [] + + # Get all video files + recordings = glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) + glob.glob( + os.path.join(save_recording_path, "*.[wW][eE][bB][mM]")) + + # Sort recordings by creation time (oldest first) + recordings.sort(key=os.path.getctime) + + # Add numbering to the recordings + numbered_recordings = [] + for idx, recording in enumerate(recordings, start=1): + filename = os.path.basename(recording) + numbered_recordings.append((recording, f"{idx}. 
{filename}")) + + return numbered_recordings + + recordings_gallery = gr.Gallery( + label="Recordings", + columns=3, + height="auto", + object_fit="contain" + ) + + refresh_button = gr.Button("šŸ”„ Refresh Recordings", variant="secondary") + refresh_button.click( + fn=list_recordings, + inputs=save_recording_path, + outputs=recordings_gallery + ) + + with gr.TabItem("šŸ“ UI Configuration", id=8): + config_file_input = gr.File( + label="Load UI Settings from Config File", + file_types=[".json"], + interactive=True + ) + with gr.Row(): + load_config_button = gr.Button("Load Config", variant="primary") + save_config_button = gr.Button("Save UI Settings", variant="primary") + + config_status = gr.Textbox( + label="Status", + lines=2, + interactive=False + ) + save_config_button.click( + fn=save_current_config, + inputs=[], # äøéœ€č¦č¾“å…„å‚ę•° + outputs=[config_status] + ) + + # Attach the callback to the LLM provider dropdown + llm_provider.change( + lambda provider, api_key, base_url: update_model_dropdown(provider, api_key, base_url), + inputs=[llm_provider, llm_api_key, llm_base_url], + outputs=llm_model_name + ) + + # Add this after defining the components + enable_recording.change( + lambda enabled: gr.update(interactive=enabled), + inputs=enable_recording, + outputs=save_recording_path + ) + + use_own_browser.change(fn=close_global_browser) + keep_browser_open.change(fn=close_global_browser) + + scan_and_register_components(demo) + global webui_config_manager + all_components = webui_config_manager.get_all_components() + + load_config_button.click( + fn=update_ui_from_config, + inputs=[config_file_input], + outputs=all_components + [config_status] + ) + return demo + + +def main(): + parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent") + parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to") + parser.add_argument("--port", type=int, default=7788, help="Port to listen on") + 
parser.add_argument("--theme", type=str, default="Ocean", choices=theme_map.keys(), help="Theme to use for the UI") + args = parser.parse_args() + + demo = create_ui(theme_name=args.theme) + demo.launch(server_name=args.ip, server_port=args.port) + + +if __name__ == '__main__': + main() From 6ac9e268d31da5cca65dd3207123e171f253b877 Mon Sep 17 00:00:00 2001 From: vvincent1234 Date: Sun, 27 Apr 2025 23:28:47 +0800 Subject: [PATCH 08/35] add ui --- src/webui/components/agent_settings_tab.py | 36 ++++- src/webui/components/browser_settings_tab.py | 125 ++++++++++++++++++ src/webui/components/browser_use_agent_tab.py | 62 +++++++++ ...arch_tab.py => deep_research_agent_tab.py} | 0 src/webui/components/run_agent_tab.py | 4 - src/webui/interface.py | 6 +- tests/test_controller.py | 30 ++--- 7 files changed, 239 insertions(+), 24 deletions(-) create mode 100644 src/webui/components/browser_use_agent_tab.py rename src/webui/components/{run_deep_research_tab.py => deep_research_agent_tab.py} (100%) delete mode 100644 src/webui/components/run_agent_tab.py diff --git a/src/webui/components/agent_settings_tab.py b/src/webui/components/agent_settings_tab.py index 4f69ac1..764487c 100644 --- a/src/webui/components/agent_settings_tab.py +++ b/src/webui/components/agent_settings_tab.py @@ -1,8 +1,14 @@ +import json +import os + import gradio as gr from gradio.components import Component - +from typing import Any, Dict, Optional from src.webui.webui_manager import WebuiManager from src.utils import config +import logging + +logger = logging.getLogger(__name__) def update_model_dropdown(llm_provider): @@ -17,6 +23,20 @@ def update_model_dropdown(llm_provider): return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True) +def update_mcp_server(mcp_file: str): + """ + Update the MCP server. 
+ """ + if not mcp_file or not os.path.exists(mcp_file) or mcp_file.endswith('.json'): + logger.warning(f"{mcp_file} is not a valid MCP file.") + return gr.update() + + with open(mcp_file, 'r') as f: + mcp_server = json.load(f) + + return gr.update(value=json.dumps(mcp_server, indent=2), visible=True) + + def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Component]: """ Creates an agent settings tab. @@ -29,6 +49,10 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen override_system_prompt = gr.Textbox(label="Override system prompt", lines=4, interactive=True) extend_system_prompt = gr.Textbox(label="Extend system prompt", lines=4, interactive=True) + with gr.Group(): + mcp_json_file = gr.File(label="MCP server file", interactive=True, file_types=["json"]) + mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False) + with gr.Group(): with gr.Row(): llm_provider = gr.Dropdown( @@ -92,7 +116,6 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen with gr.Row(): planner_llm_provider = gr.Dropdown( choices=[provider for provider, model in config.model_names.items()], - value=None, label="Planner LLM Provider", info="Select LLM provider for LLM", interactive=True @@ -202,7 +225,8 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen max_actions=max_actions, max_input_tokens=max_input_tokens, tool_calling_method=tool_calling_method, - + mcp_json_file=mcp_json_file, + mcp_server_config=mcp_server_config, )) llm_provider.change( fn=lambda x: gr.update(visible=x == "ollama"), @@ -225,4 +249,10 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen outputs=planner_llm_model_name ) + mcp_json_file.change( + update_mcp_server, + inputs=mcp_json_file, + outputs=mcp_server_config + ) + return tab_components diff --git a/src/webui/components/browser_settings_tab.py 
b/src/webui/components/browser_settings_tab.py index e69de29..c2b3e56 100644 --- a/src/webui/components/browser_settings_tab.py +++ b/src/webui/components/browser_settings_tab.py @@ -0,0 +1,125 @@ +import gradio as gr +from gradio.components import Component + +from src.webui.webui_manager import WebuiManager +from src.utils import config + + +def create_browser_settings_tab(webui_manager: WebuiManager) -> dict[str, Component]: + """ + Creates a browser settings tab. + """ + input_components = set(webui_manager.get_components()) + tab_components = {} + + with gr.Group(): + with gr.Row(): + browser_binary_path = gr.Textbox( + label="Browser Binary Path", + lines=1, + interactive=True, + placeholder="e.g. '/Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome'" + ) + browser_user_data_dir = gr.Textbox( + label="Browser User Data Dir", + lines=1, + interactive=True, + placeholder="Leave it empty if you use your default user data", + ) + with gr.Row(): + use_own_browser = gr.Checkbox( + label="Use Own Browser", + value=False, + info="Use your existing browser instance", + interactive=True + ) + keep_browser_open = gr.Checkbox( + label="Keep Browser Open", + value=False, + info="Keep Browser Open between Tasks", + interactive=True + ) + headless = gr.Checkbox( + label="Headless Mode", + value=False, + info="Run browser without GUI", + interactive=True + ) + disable_security = gr.Checkbox( + label="Disable Security", + value=True, + info="Disable browser security features", + interactive=True + ) + + with gr.Row(): + window_w = gr.Number( + label="Window Width", + value=1280, + info="Browser window width", + interactive=True + ) + window_h = gr.Number( + label="Window Height", + value=1100, + info="Browser window height", + interactive=True + ) + + with gr.Row(): + cdp_url = gr.Textbox( + label="CDP URL", + info="CDP URL for browser remote debugging", + interactive=True, + ) + wss_url = gr.Textbox( + label="WSS URL", + info="WSS URL for browser remote 
debugging", + interactive=True, + ) + + with gr.Row(): + save_recording_path = gr.Textbox( + label="Recording Path", + placeholder="e.g. ./tmp/record_videos", + info="Path to save browser recordings", + interactive=True, + ) + + save_trace_path = gr.Textbox( + label="Trace Path", + placeholder="e.g. ./tmp/traces", + info="Path to save Agent traces", + interactive=True, + ) + + with gr.Row(): + save_agent_history_path = gr.Textbox( + label="Agent History Save Path", + value="./tmp/agent_history", + info="Specify the directory where agent history should be saved.", + interactive=True, + ) + save_download_path = gr.Textbox( + label="Save Directory for browser downloads", + value="./tmp/downloads", + info="Specify the directory where downloaded files should be saved.", + interactive=True, + ) + tab_components.update( + dict( + browser_binary_path=browser_binary_path, + browser_user_data_dir=browser_user_data_dir, + use_own_browser=use_own_browser, + keep_browser_open=keep_browser_open, + headless=headless, + disable_security=disable_security, + save_recording_path=save_recording_path, + save_trace_path=save_trace_path, + save_agent_history_path=save_agent_history_path, + save_download_path=save_download_path, + cdp_url=cdp_url, + wss_url=wss_url + ) + ) + return tab_components diff --git a/src/webui/components/browser_use_agent_tab.py b/src/webui/components/browser_use_agent_tab.py new file mode 100644 index 0000000..0534872 --- /dev/null +++ b/src/webui/components/browser_use_agent_tab.py @@ -0,0 +1,62 @@ +import gradio as gr +from gradio.components import Component + +from src.webui.webui_manager import WebuiManager +from src.utils import config + + +def create_browser_use_agent_tab(webui_manager: WebuiManager) -> dict[str, Component]: + """ + Create the run agent tab + """ + input_components = set(webui_manager.get_components()) + tab_components = {} + + chatbot = gr.Chatbot(type='messages', label="Chat History", height=600) + user_input = gr.Textbox( + label="User 
Input", + lines=3, + value="go to google.com and type 'OpenAI' click search and give me the first url", + interactive=True + ) + + with gr.Row(): + stop_button = gr.Button("ā¹ļø Stop", interactive=False, variant="stop", scale=2) + clear_button = gr.Button("🧹 Clear", interactive=False, variant="stop", scale=2) + run_button = gr.Button("ā–¶ļø Summit", variant="primary", scale=3) + + browser_view = gr.HTML( + value="

Waiting for browser session...

", + label="Browser Live View", + visible=False + ) + + with gr.Row(): + agent_final_result = gr.Textbox( + label="Final Result", lines=3, show_label=True, interactive=False + ) + agent_errors = gr.Textbox( + label="Errors", lines=3, show_label=True, interactive=False + ) + + with gr.Row(): + agent_trace_file = gr.File(label="Trace File", interactive=False) + agent_history_file = gr.File(label="Agent History", interactive=False) + + recording_gif = gr.Image(label="Result GIF", format="gif", interactive=False) + tab_components.update( + dict( + chatbot=chatbot, + user_input=user_input, + clear_button=clear_button, + run_button=run_button, + stop_button=stop_button, + agent_final_result=agent_final_result, + agent_errors=agent_errors, + agent_trace_file=agent_trace_file, + agent_history_file=agent_history_file, + recording_gif=recording_gif, + browser_view=browser_view + ) + ) + return tab_components diff --git a/src/webui/components/run_deep_research_tab.py b/src/webui/components/deep_research_agent_tab.py similarity index 100% rename from src/webui/components/run_deep_research_tab.py rename to src/webui/components/deep_research_agent_tab.py diff --git a/src/webui/components/run_agent_tab.py b/src/webui/components/run_agent_tab.py deleted file mode 100644 index a071a83..0000000 --- a/src/webui/components/run_agent_tab.py +++ /dev/null @@ -1,4 +0,0 @@ -import gradio as gr - -def creat_auto_agent_tab(): - pass \ No newline at end of file diff --git a/src/webui/interface.py b/src/webui/interface.py index e2690a9..a53d1f8 100644 --- a/src/webui/interface.py +++ b/src/webui/interface.py @@ -2,6 +2,8 @@ import gradio as gr from src.webui.webui_manager import WebuiManager from src.webui.components.agent_settings_tab import create_agent_settings_tab +from src.webui.components.browser_settings_tab import create_browser_settings_tab +from src.webui.components.browser_use_agent_tab import create_browser_use_agent_tab theme_map = { "Default": gr.themes.Default(), @@ -54,10 
+56,10 @@ def create_ui(theme_name="Ocean"): ui_manager.add_components("agent_settings", create_agent_settings_tab(ui_manager)) with gr.TabItem("🌐 Browser Settings"): - pass + ui_manager.add_components("browser_settings", create_browser_settings_tab(ui_manager)) with gr.TabItem("šŸ¤– Run Agent"): - pass + ui_manager.add_components("browser_use_agent", create_browser_use_agent_tab(ui_manager)) with gr.TabItem("🧐 Deep Research"): pass diff --git a/tests/test_controller.py b/tests/test_controller.py index ef859ed..6a10ebc 100644 --- a/tests/test_controller.py +++ b/tests/test_controller.py @@ -46,21 +46,21 @@ async def test_controller_with_mcp(): from browser_use.controller.registry.views import ActionModel test_server_config = { - # "playwright": { - # "command": "npx", - # "args": [ - # "@playwright/mcp@latest", - # ], - # "transport": "stdio", - # }, - # "filesystem": { - # "command": "npx", - # "args": [ - # "-y", - # "@modelcontextprotocol/server-filesystem", - # "/Users/xxx/ai_workspace", - # ] - # }, + "playwright": { + "command": "npx", + "args": [ + "@playwright/mcp@latest", + ], + "transport": "stdio", + }, + "filesystem": { + "command": "npx", + "args": [ + "-y", + "@modelcontextprotocol/server-filesystem", + "/Users/xxx/ai_workspace", + ] + }, "desktop-commander": { "command": "npx", "args": [ From 0d259efbebb5bfd818c19d5461d729eb85dee484 Mon Sep 17 00:00:00 2001 From: vvincent1234 Date: Mon, 28 Apr 2025 09:37:49 +0800 Subject: [PATCH 09/35] add load and save config tab --- src/webui/components/agent_settings_tab.py | 12 ++--- src/webui/components/browser_use_agent_tab.py | 2 +- .../components/deep_research_agent_tab.py | 41 ++++++++++++++++ src/webui/components/load_save_config_tab.py | 49 +++++++++++++++++++ src/webui/interface.py | 22 +++++++-- src/webui/webui_manager.py | 43 ++++++++++++++-- 6 files changed, 154 insertions(+), 15 deletions(-) diff --git a/src/webui/components/agent_settings_tab.py b/src/webui/components/agent_settings_tab.py index 
764487c..a2479b3 100644 --- a/src/webui/components/agent_settings_tab.py +++ b/src/webui/components/agent_settings_tab.py @@ -27,14 +27,14 @@ def update_mcp_server(mcp_file: str): """ Update the MCP server. """ - if not mcp_file or not os.path.exists(mcp_file) or mcp_file.endswith('.json'): + if not mcp_file or not os.path.exists(mcp_file) or not mcp_file.endswith('.json'): logger.warning(f"{mcp_file} is not a valid MCP file.") - return gr.update() + return None, gr.update(visible=False) with open(mcp_file, 'r') as f: mcp_server = json.load(f) - return gr.update(value=json.dumps(mcp_server, indent=2), visible=True) + return json.dumps(mcp_server, indent=2), gr.update(visible=True) def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Component]: @@ -50,7 +50,7 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen extend_system_prompt = gr.Textbox(label="Extend system prompt", lines=4, interactive=True) with gr.Group(): - mcp_json_file = gr.File(label="MCP server file", interactive=True, file_types=["json"]) + mcp_json_file = gr.File(label="MCP server file", interactive=True, file_types=[".json"]) mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False) with gr.Group(): @@ -202,7 +202,7 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen allow_custom_value=True, choices=["auto", "json_schema", "function_calling", "None"], info="Tool Calls Function Name", - visible=False + visible=True ) tab_components.update(dict( override_system_prompt=override_system_prompt, @@ -252,7 +252,7 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen mcp_json_file.change( update_mcp_server, inputs=mcp_json_file, - outputs=mcp_server_config + outputs=[mcp_server_config, mcp_server_config] ) return tab_components diff --git a/src/webui/components/browser_use_agent_tab.py b/src/webui/components/browser_use_agent_tab.py index 0534872..8f842af 
100644 --- a/src/webui/components/browser_use_agent_tab.py +++ b/src/webui/components/browser_use_agent_tab.py @@ -22,7 +22,7 @@ def create_browser_use_agent_tab(webui_manager: WebuiManager) -> dict[str, Compo with gr.Row(): stop_button = gr.Button("ā¹ļø Stop", interactive=False, variant="stop", scale=2) - clear_button = gr.Button("🧹 Clear", interactive=False, variant="stop", scale=2) + clear_button = gr.Button("🧹 Clear", interactive=True, variant="stop", scale=2) run_button = gr.Button("ā–¶ļø Summit", variant="primary", scale=3) browser_view = gr.HTML( diff --git a/src/webui/components/deep_research_agent_tab.py b/src/webui/components/deep_research_agent_tab.py index e69de29..d9dfc24 100644 --- a/src/webui/components/deep_research_agent_tab.py +++ b/src/webui/components/deep_research_agent_tab.py @@ -0,0 +1,41 @@ +import gradio as gr +from gradio.components import Component + +from src.webui.webui_manager import WebuiManager +from src.utils import config + + +def create_deep_research_agent_tab(webui_manager: WebuiManager) -> dict[str, Component]: + """ + Creates a deep research agent tab + """ + input_components = set(webui_manager.get_components()) + tab_components = {} + + research_task = gr.Textbox(label="Research Task", lines=5, + value="Give me a detailed plan for traveling to Switzerland on June 1st.", + interactive=True) + with gr.Row(): + max_iteration = gr.Number(label="Max Search Iteration", value=3, + precision=0, + interactive=True) # precision=0 ē”®äæę˜Æę•“ę•° + max_query = gr.Number(label="Max Query per Iteration", value=1, + precision=0, + interactive=True) # precision=0 ē”®äæę˜Æę•“ę•° + with gr.Row(): + stop_button = gr.Button("ā¹ļø Stop", variant="stop", scale=2) + start_button = gr.Button("ā–¶ļø Run", variant="primary", scale=3) + markdown_display = gr.Markdown(label="Research Report") + markdown_download = gr.File(label="Download Research Report", interactive=False) + tab_components.update( + dict( + research_task=research_task, + 
max_iteration=max_iteration, + max_query=max_query, + start_button=start_button, + stop_button=stop_button, + markdown_display=markdown_display, + markdown_download=markdown_download, + ) + ) + return tab_components diff --git a/src/webui/components/load_save_config_tab.py b/src/webui/components/load_save_config_tab.py index e69de29..91dcad7 100644 --- a/src/webui/components/load_save_config_tab.py +++ b/src/webui/components/load_save_config_tab.py @@ -0,0 +1,49 @@ +import gradio as gr +from gradio.components import Component + +from src.webui.webui_manager import WebuiManager +from src.utils import config + + +def create_load_save_config_tab(webui_manager: WebuiManager) -> dict[str, Component]: + """ + Creates a load and save config tab. + """ + input_components = set(webui_manager.get_components()) + tab_components = {} + + config_file = gr.File( + label="Load UI Settings from Config File", + file_types=[".json"], + interactive=True + ) + with gr.Row(): + load_config_button = gr.Button("Load Config", variant="primary") + save_config_button = gr.Button("Save UI Settings", variant="primary") + + config_status = gr.Textbox( + label="Status", + lines=2, + interactive=False + ) + + tab_components.update(dict( + load_config_button=load_config_button, + save_config_button=save_config_button, + config_status=config_status, + config_file=config_file, + )) + + save_config_button.click( + fn=webui_manager.save_current_config, + inputs=[], + outputs=[config_status] + ) + + load_config_button.click( + fn=webui_manager.load_config, + inputs=[config_file], + outputs=[config_status] + ) + + return tab_components diff --git a/src/webui/interface.py b/src/webui/interface.py index a53d1f8..266b079 100644 --- a/src/webui/interface.py +++ b/src/webui/interface.py @@ -4,6 +4,8 @@ from src.webui.webui_manager import WebuiManager from src.webui.components.agent_settings_tab import create_agent_settings_tab from src.webui.components.browser_settings_tab import create_browser_settings_tab 
from src.webui.components.browser_use_agent_tab import create_browser_use_agent_tab +from src.webui.components.deep_research_agent_tab import create_deep_research_agent_tab +from src.webui.components.load_save_config_tab import create_load_save_config_tab theme_map = { "Default": gr.themes.Default(), @@ -37,10 +39,22 @@ def create_ui(theme_name="Ocean"): } """ + # dark mode in default + js_func = """ + function refresh() { + const url = new URL(window.location); + + if (url.searchParams.get('__theme') !== 'dark') { + url.searchParams.set('__theme', 'dark'); + window.location.href = url.href; + } + } + """ + ui_manager = WebuiManager() with gr.Blocks( - title="Browser Use WebUI", theme=theme_map[theme_name], css=css + title="Browser Use WebUI", theme=theme_map[theme_name], css=css, js=js_func, ) as demo: with gr.Row(): gr.Markdown( @@ -62,9 +76,9 @@ def create_ui(theme_name="Ocean"): ui_manager.add_components("browser_use_agent", create_browser_use_agent_tab(ui_manager)) with gr.TabItem("🧐 Deep Research"): - pass + ui_manager.add_components("deep_research_agent", create_deep_research_agent_tab(ui_manager)) - with gr.TabItem("šŸ“ UI Configuration"): - pass + with gr.TabItem("šŸ“ Load & Save Config"): + ui_manager.add_components("load_save_config", create_load_save_config_tab(ui_manager)) return demo diff --git a/src/webui/webui_manager.py b/src/webui/webui_manager.py index ca5135f..033564a 100644 --- a/src/webui/webui_manager.py +++ b/src/webui/webui_manager.py @@ -1,19 +1,24 @@ +import json from collections.abc import Generator from typing import TYPE_CHECKING +import os +import gradio as gr +from datetime import datetime -if TYPE_CHECKING: - from gradio.components import Component - +from gradio.components import Component from browser_use.browser.browser import Browser from browser_use.browser.context import BrowserContext from browser_use.agent.service import Agent class WebuiManager: - def __init__(self): + def __init__(self, settings_save_dir: str = 
"./tmp/webui_settings"): self.id_to_component: dict[str, Component] = {} self.component_to_id: dict[Component, str] = {} + self.settings_save_dir = settings_save_dir + os.makedirs(self.settings_save_dir, exist_ok=True) + self.browser: Browser = None self.browser_context: BrowserContext = None self.bu_agent: Agent = None @@ -44,3 +49,33 @@ class WebuiManager: Get id by component """ return self.component_to_id[comp] + + def save_current_config(self): + """ + Save current config + """ + cur_settings = {} + for comp_id, comp in self.id_to_component.items(): + if not isinstance(comp, gr.Button) and not isinstance(comp, gr.File) and str( + getattr(comp, "interactive", True)).lower() != "false": + cur_settings[comp_id] = getattr(comp, "value", None) + + config_name = datetime.now().strftime("%Y%m%d-%H%M%S") + with open(os.path.join(self.settings_save_dir, f"{config_name}.json"), "w") as fw: + json.dump(cur_settings, fw, indent=4) + + return os.path.join(self.settings_save_dir, f"{config_name}.json") + + def load_config(self, config_path: str): + """ + Load config + """ + with open(config_path, "r") as fr: + ui_settings = json.load(fr) + + update_components = {} + for comp_id, comp_val in ui_settings.items(): + if comp_id in self.id_to_component: + update_components[self.id_to_component[comp_id]].value = comp_val + + return f"Successfully loaded config from {config_path}" From 4c87694cef50ba504a97c10d4ecaa135a1e57a34 Mon Sep 17 00:00:00 2001 From: vincent Date: Mon, 28 Apr 2025 22:11:56 +0800 Subject: [PATCH 10/35] add browser-use agent run --- .../deep_research_agent.py | 0 src/browser/custom_browser.py | 73 +- src/browser/custom_context.py | 98 +- src/controller/custom_controller.py | 44 +- src/utils/mcp_client.py | 6 + src/utils/utils.py | 124 --- src/webui/components/agent_settings_tab.py | 18 +- src/webui/components/browser_settings_tab.py | 8 +- src/webui/components/browser_use_agent_tab.py | 947 +++++++++++++++++- .../components/deep_research_agent_tab.py | 2 +- 
src/webui/components/load_save_config_tab.py | 9 +- src/webui/interface.py | 23 +- src/webui/webui_manager.py | 42 +- tests/test_agents.py | 306 +++--- tests/test_controller.py | 53 +- tests/test_llm_api.py | 4 +- webui.py | 2 + webui2.py | 107 -- 18 files changed, 1343 insertions(+), 523 deletions(-) rename src/agent/{ => deep_research}/deep_research_agent.py (100%) diff --git a/src/agent/deep_research_agent.py b/src/agent/deep_research/deep_research_agent.py similarity index 100% rename from src/agent/deep_research_agent.py rename to src/agent/deep_research/deep_research_agent.py diff --git a/src/browser/custom_browser.py b/src/browser/custom_browser.py index 4a2d1ab..a1c057b 100644 --- a/src/browser/custom_browser.py +++ b/src/browser/custom_browser.py @@ -9,11 +9,23 @@ from playwright.async_api import ( Playwright, async_playwright, ) -from browser_use.browser.browser import Browser +from browser_use.browser.browser import Browser, IN_DOCKER from browser_use.browser.context import BrowserContext, BrowserContextConfig from playwright.async_api import BrowserContext as PlaywrightBrowserContext import logging +from browser_use.browser.chrome import ( + CHROME_ARGS, + CHROME_DETERMINISTIC_RENDERING_ARGS, + CHROME_DISABLE_SECURITY_ARGS, + CHROME_DOCKER_ARGS, + CHROME_HEADLESS_ARGS, +) +from browser_use.browser.context import BrowserContext, BrowserContextConfig +from browser_use.browser.utils.screen_resolution import get_screen_resolution, get_window_adjustments +from browser_use.utils import time_execution_async +import socket + from .custom_context import CustomBrowserContext logger = logging.getLogger(__name__) @@ -26,3 +38,62 @@ class CustomBrowser(Browser): config: BrowserContextConfig = BrowserContextConfig() ) -> CustomBrowserContext: return CustomBrowserContext(config=config, browser=self) + + async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrowser: + """Sets up and returns a Playwright Browser instance with anti-detection 
measures.""" + assert self.config.browser_binary_path is None, 'browser_binary_path should be None if trying to use the builtin browsers' + + if self.config.headless: + screen_size = {'width': 1920, 'height': 1080} + offset_x, offset_y = 0, 0 + else: + screen_size = get_screen_resolution() + offset_x, offset_y = get_window_adjustments() + + chrome_args = { + *CHROME_ARGS, + *(CHROME_DOCKER_ARGS if IN_DOCKER else []), + *(CHROME_HEADLESS_ARGS if self.config.headless else []), + *(CHROME_DISABLE_SECURITY_ARGS if self.config.disable_security else []), + *(CHROME_DETERMINISTIC_RENDERING_ARGS if self.config.deterministic_rendering else []), + f'--window-position={offset_x},{offset_y}', + *self.config.extra_browser_args, + } + contain_window_size = False + for arg in self.config.extra_browser_args: + if "--window-size" in arg: + contain_window_size = True + break + if not contain_window_size: + chrome_args.add(f'--window-size={screen_size["width"]},{screen_size["height"]}') + + # check if port 9222 is already taken, if so remove the remote-debugging-port arg to prevent conflicts + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + if s.connect_ex(('localhost', 9222)) == 0: + chrome_args.remove('--remote-debugging-port=9222') + + browser_class = getattr(playwright, self.config.browser_class) + args = { + 'chromium': list(chrome_args), + 'firefox': [ + *{ + '-no-remote', + *self.config.extra_browser_args, + } + ], + 'webkit': [ + *{ + '--no-startup-window', + *self.config.extra_browser_args, + } + ], + } + + browser = await browser_class.launch( + headless=self.config.headless, + args=args[self.config.browser_class], + proxy=self.config.proxy.model_dump() if self.config.proxy else None, + handle_sigterm=False, + handle_sigint=False, + ) + return browser diff --git a/src/browser/custom_context.py b/src/browser/custom_context.py index fd0e2e5..4dc2423 100644 --- a/src/browser/custom_context.py +++ b/src/browser/custom_context.py @@ -2,7 +2,7 @@ import json import 
logging import os -from browser_use.browser.browser import Browser +from browser_use.browser.browser import Browser, IN_DOCKER from browser_use.browser.context import BrowserContext, BrowserContextConfig from playwright.async_api import Browser as PlaywrightBrowser from playwright.async_api import BrowserContext as PlaywrightBrowserContext @@ -10,10 +10,104 @@ from playwright.async_api import BrowserContext as PlaywrightBrowserContext logger = logging.getLogger(__name__) +class CustomBrowserContextConfig(BrowserContextConfig): + force_new_context: bool = False # force to create new context + + class CustomBrowserContext(BrowserContext): def __init__( self, browser: "Browser", - config: BrowserContextConfig = BrowserContextConfig() + config: CustomBrowserContextConfig = CustomBrowserContextConfig(), ): super(CustomBrowserContext, self).__init__(browser=browser, config=config) + + async def _create_context(self, browser: PlaywrightBrowser): + """Creates a new browser context with anti-detection measures and loads cookies if available.""" + if not self.config.force_new_context and self.browser.config.cdp_url and len(browser.contexts) > 0: + context = browser.contexts[0] + elif not self.config.force_new_context and self.browser.config.browser_binary_path and len( + browser.contexts) > 0: + # Connect to existing Chrome instance instead of creating new one + context = browser.contexts[0] + else: + # Original code for creating new context + context = await browser.new_context( + no_viewport=True, + user_agent=self.config.user_agent, + java_script_enabled=True, + bypass_csp=self.config.disable_security, + ignore_https_errors=self.config.disable_security, + record_video_dir=self.config.save_recording_path, + record_video_size=self.config.browser_window_size.model_dump(), + record_har_path=self.config.save_har_path, + locale=self.config.locale, + http_credentials=self.config.http_credentials, + is_mobile=self.config.is_mobile, + has_touch=self.config.has_touch, + 
geolocation=self.config.geolocation, + permissions=self.config.permissions, + timezone_id=self.config.timezone_id, + ) + + if self.config.trace_path: + await context.tracing.start(screenshots=True, snapshots=True, sources=True) + + # Load cookies if they exist + if self.config.cookies_file and os.path.exists(self.config.cookies_file): + with open(self.config.cookies_file, 'r') as f: + try: + cookies = json.load(f) + + valid_same_site_values = ['Strict', 'Lax', 'None'] + for cookie in cookies: + if 'sameSite' in cookie: + if cookie['sameSite'] not in valid_same_site_values: + logger.warning( + f"Fixed invalid sameSite value '{cookie['sameSite']}' to 'None' for cookie {cookie.get('name')}" + ) + cookie['sameSite'] = 'None' + logger.info(f'šŸŖ Loaded {len(cookies)} cookies from {self.config.cookies_file}') + await context.add_cookies(cookies) + + except json.JSONDecodeError as e: + logger.error(f'Failed to parse cookies file: {str(e)}') + + # Expose anti-detection scripts + await context.add_init_script( + """ + // Webdriver property + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + + // Languages + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US'] + }); + + // Plugins + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5] + }); + + // Chrome runtime + window.chrome = { runtime: {} }; + + // Permissions + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? 
+ Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters) + ); + (function () { + const originalAttachShadow = Element.prototype.attachShadow; + Element.prototype.attachShadow = function attachShadow(options) { + return originalAttachShadow.call(this, { ...options, mode: "open" }); + }; + })(); + """ + ) + + return context diff --git a/src/controller/custom_controller.py b/src/controller/custom_controller.py index 7209e97..d07c88b 100644 --- a/src/controller/custom_controller.py +++ b/src/controller/custom_controller.py @@ -48,28 +48,6 @@ class CustomController(Controller): self.mcp_client = None self.mcp_server_config = None - async def setup_mcp_client(self, mcp_server_config: Optional[Dict[str, Any]] = None): - self.mcp_server_config = mcp_server_config - if self.mcp_server_config: - self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config) - self.register_mcp_tools() - - def register_mcp_tools(self): - """ - Register the MCP tools used by this controller. - """ - if self.mcp_client: - for server_name in self.mcp_client.server_name_to_tools: - for tool in self.mcp_client.server_name_to_tools[server_name]: - tool_name = f"mcp.{server_name}.{tool.name}" - self.registry.registry.actions[tool_name] = RegisteredAction( - name=tool_name, - description=tool.description, - function=tool, - param_model=create_tool_param_model(tool), - ) - logger.info(f"Add mcp tool: {tool_name}") - def _register_custom_actions(self): """Register all custom browser actions""" @@ -173,6 +151,28 @@ class CustomController(Controller): except Exception as e: raise e + async def setup_mcp_client(self, mcp_server_config: Optional[Dict[str, Any]] = None): + self.mcp_server_config = mcp_server_config + if self.mcp_server_config: + self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config) + self.register_mcp_tools() + + def register_mcp_tools(self): + """ + Register the MCP tools used by this controller. 
+ """ + if self.mcp_client: + for server_name in self.mcp_client.server_name_to_tools: + for tool in self.mcp_client.server_name_to_tools[server_name]: + tool_name = f"mcp.{server_name}.{tool.name}" + self.registry.registry.actions[tool_name] = RegisteredAction( + name=tool_name, + description=tool.description, + function=tool, + param_model=create_tool_param_model(tool), + ) + logger.info(f"Add mcp tool: {tool_name}") + async def close_mcp_client(self): if self.mcp_client: await self.mcp_client.__aexit__(None, None, None) diff --git a/src/utils/mcp_client.py b/src/utils/mcp_client.py index a5d6fcd..b909d0d 100644 --- a/src/utils/mcp_client.py +++ b/src/utils/mcp_client.py @@ -40,7 +40,13 @@ async def setup_mcp_client_and_tools(mcp_server_config: Dict[str, Any]) -> Optio logger.info("Initializing MultiServerMCPClient...") + if not mcp_server_config: + logger.error("No MCP server configuration provided.") + return None + try: + if "mcpServers" in mcp_server_config: + mcp_server_config = mcp_server_config["mcpServers"] client = MultiServerMCPClient(mcp_server_config) await client.__aenter__() return client diff --git a/src/utils/utils.py b/src/utils/utils.py index 8703c46..f0f0b76 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -9,25 +9,6 @@ import gradio as gr import uuid -# Callback to update the model name dropdown based on the selected provider -def update_model_dropdown(llm_provider, api_key=None, base_url=None): - """ - Update the model name dropdown with predefined models for the selected provider. 
- """ - import gradio as gr - # Use API keys from .env if not provided - if not api_key: - api_key = os.getenv(f"{llm_provider.upper()}_API_KEY", "") - if not base_url: - base_url = os.getenv(f"{llm_provider.upper()}_BASE_URL", "") - - # Use predefined models for the selected provider - if llm_provider in model_names: - return gr.Dropdown(choices=model_names[llm_provider], value=model_names[llm_provider][0], interactive=True) - else: - return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True) - - def encode_image(img_path): if not img_path: return None @@ -56,108 +37,3 @@ def get_latest_files(directory: str, file_types: list = ['.webm', '.zip']) -> Di print(f"Error getting latest {file_type} file: {e}") return latest_files - - -async def capture_screenshot(browser_context): - """Capture and encode a screenshot""" - # Extract the Playwright browser instance - playwright_browser = browser_context.browser.playwright_browser # Ensure this is correct. - - # Check if the browser instance is valid and if an existing context can be reused - if playwright_browser and playwright_browser.contexts: - playwright_context = playwright_browser.contexts[0] - else: - return None - - # Access pages in the context - pages = None - if playwright_context: - pages = playwright_context.pages - - # Use an existing page or create a new one if none exist - if pages: - active_page = pages[0] - for page in pages: - if page.url != "about:blank": - active_page = page - else: - return None - - # Take screenshot - try: - screenshot = await active_page.screenshot( - type='jpeg', - quality=75, - scale="css" - ) - encoded = base64.b64encode(screenshot).decode('utf-8') - return encoded - except Exception as e: - return None - - -class ConfigManager: - def __init__(self): - self.components = {} - self.component_order = [] - - def register_component(self, name: str, component): - """Register a gradio component for config management.""" - self.components[name] = component - if 
name not in self.component_order: - self.component_order.append(name) - return component - - def save_current_config(self): - """Save the current configuration of all registered components.""" - current_config = {} - for name in self.component_order: - component = self.components[name] - # Get the current value from the component - current_config[name] = getattr(component, "value", None) - - return save_config_to_file(current_config) - - def update_ui_from_config(self, config_file): - """Update UI components from a loaded configuration file.""" - if config_file is None: - return [gr.update() for _ in self.component_order] + ["No file selected."] - - loaded_config = load_config_from_file(config_file.name) - - if not isinstance(loaded_config, dict): - return [gr.update() for _ in self.component_order] + ["Error: Invalid configuration file."] - - # Prepare updates for all components - updates = [] - for name in self.component_order: - if name in loaded_config: - updates.append(gr.update(value=loaded_config[name])) - else: - updates.append(gr.update()) - - updates.append("Configuration loaded successfully.") - return updates - - def get_all_components(self): - """Return all registered components in the order they were registered.""" - return [self.components[name] for name in self.component_order] - - -def load_config_from_file(config_file): - """Load settings from a config file (JSON format).""" - try: - with open(config_file, 'r') as f: - settings = json.load(f) - return settings - except Exception as e: - return f"Error loading configuration: {str(e)}" - - -def save_config_to_file(settings, save_dir="./tmp/webui_settings"): - """Save the current settings to a UUID.json file with a UUID name.""" - os.makedirs(save_dir, exist_ok=True) - config_file = os.path.join(save_dir, f"{uuid.uuid4()}.json") - with open(config_file, 'w') as f: - json.dump(settings, f, indent=2) - return f"Configuration saved to {config_file}" diff --git 
a/src/webui/components/agent_settings_tab.py b/src/webui/components/agent_settings_tab.py index a2479b3..85e7c0e 100644 --- a/src/webui/components/agent_settings_tab.py +++ b/src/webui/components/agent_settings_tab.py @@ -50,7 +50,7 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen extend_system_prompt = gr.Textbox(label="Extend system prompt", lines=4, interactive=True) with gr.Group(): - mcp_json_file = gr.File(label="MCP server file", interactive=True, file_types=[".json"]) + mcp_json_file = gr.File(label="MCP server json", interactive=True, file_types=[".json"]) mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False) with gr.Group(): @@ -118,6 +118,7 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen choices=[provider for provider, model in config.model_names.items()], label="Planner LLM Provider", info="Select LLM provider for LLM", + value=None, interactive=True ) planner_llm_model_name = gr.Dropdown( @@ -201,7 +202,6 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen interactive=True, allow_custom_value=True, choices=["auto", "json_schema", "function_calling", "None"], - info="Tool Calls Function Name", visible=True ) tab_components.update(dict( @@ -228,6 +228,8 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen mcp_json_file=mcp_json_file, mcp_server_config=mcp_server_config, )) + webui_manager.add_components("agent_settings", tab_components) + llm_provider.change( fn=lambda x: gr.update(visible=x == "ollama"), inputs=llm_provider, @@ -236,23 +238,21 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen llm_provider.change( lambda provider: update_model_dropdown(provider), inputs=[llm_provider], - outputs=llm_model_name + outputs=[llm_model_name] ) planner_llm_provider.change( fn=lambda x: gr.update(visible=x == "ollama"), - inputs=planner_llm_provider, - 
outputs=planner_ollama_num_ctx + inputs=[planner_llm_provider], + outputs=[planner_ollama_num_ctx] ) planner_llm_provider.change( lambda provider: update_model_dropdown(provider), inputs=[planner_llm_provider], - outputs=planner_llm_model_name + outputs=[planner_llm_model_name] ) mcp_json_file.change( update_mcp_server, - inputs=mcp_json_file, + inputs=[mcp_json_file], outputs=[mcp_server_config, mcp_server_config] ) - - return tab_components diff --git a/src/webui/components/browser_settings_tab.py b/src/webui/components/browser_settings_tab.py index c2b3e56..0d3bcbb 100644 --- a/src/webui/components/browser_settings_tab.py +++ b/src/webui/components/browser_settings_tab.py @@ -35,7 +35,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager) -> dict[str, Compon ) keep_browser_open = gr.Checkbox( label="Keep Browser Open", - value=False, + value=True, info="Keep Browser Open between Tasks", interactive=True ) @@ -119,7 +119,9 @@ def create_browser_settings_tab(webui_manager: WebuiManager) -> dict[str, Compon save_agent_history_path=save_agent_history_path, save_download_path=save_download_path, cdp_url=cdp_url, - wss_url=wss_url + wss_url=wss_url, + window_h=window_h, + window_w=window_w, ) ) - return tab_components + webui_manager.add_components("browser_settings", tab_components) diff --git a/src/webui/components/browser_use_agent_tab.py b/src/webui/components/browser_use_agent_tab.py index 8f842af..8a122b9 100644 --- a/src/webui/components/browser_use_agent_tab.py +++ b/src/webui/components/browser_use_agent_tab.py @@ -1,62 +1,921 @@ import gradio as gr from gradio.components import Component +import asyncio +import os +import json +import uuid +import logging +from datetime import datetime +from typing import List, Dict, Optional, Any, Set, Generator, AsyncGenerator, Union +from collections.abc import Awaitable +from langchain_core.language_models.chat_models import BaseChatModel +import base64 +from browser_use.browser.browser import Browser, 
BrowserConfig +from browser_use.browser.context import BrowserContext, BrowserContextConfig, BrowserContextWindowSize +from browser_use.agent.service import Agent +from browser_use.agent.views import AgentHistoryList +from browser_use.agent.views import ToolCallingMethod # Adjust import +from browser_use.agent.views import ( + REQUIRED_LLM_API_ENV_VARS, + ActionResult, + AgentError, + AgentHistory, + AgentHistoryList, + AgentOutput, + AgentSettings, + AgentState, + AgentStepInfo, + StepMetadata, + ToolCallingMethod, +) +from browser_use.browser.browser import Browser +from browser_use.browser.context import BrowserContext +from browser_use.browser.views import BrowserState, BrowserStateHistory from src.webui.webui_manager import WebuiManager -from src.utils import config +from src.controller.custom_controller import CustomController +from src.utils import llm_provider +from src.browser.custom_browser import CustomBrowser +from src.browser.custom_context import CustomBrowserContext, CustomBrowserContextConfig + +logger = logging.getLogger(__name__) -def create_browser_use_agent_tab(webui_manager: WebuiManager) -> dict[str, Component]: +# --- Helper Functions --- (Defined at module level) + +async def _initialize_llm(provider: Optional[str], model_name: Optional[str], temperature: float, + base_url: Optional[str], api_key: Optional[str], num_ctx: Optional[int] = None) -> Optional[ + BaseChatModel]: + """Initializes the LLM based on settings. 
Returns None if provider/model is missing.""" + if not provider or not model_name: + logger.info("LLM Provider or Model Name not specified, LLM will be None.") + return None + try: + # Use your actual LLM provider logic here + logger.info(f"Initializing LLM: Provider={provider}, Model={model_name}, Temp={temperature}") + # Example using a placeholder function + llm = llm_provider.get_llm_model( + provider=provider, + model_name=model_name, + temperature=temperature, + base_url=base_url or None, + api_key=api_key or None, + # Add other relevant params like num_ctx for ollama + num_ctx=num_ctx if provider == "ollama" else None + ) + return llm + except Exception as e: + logger.error(f"Failed to initialize LLM: {e}", exc_info=True) + gr.Warning( + f"Failed to initialize LLM '{model_name}' for provider '{provider}'. Please check settings. Error: {e}") + return None + + +def _get_config_value(webui_manager: WebuiManager, comp_dict: Dict[gr.components.Component, Any], comp_id_suffix: str, + default: Any = None) -> Any: + """Safely get value from component dictionary using its ID suffix relative to the tab.""" + # Assumes component ID format is "tab_name.comp_name" + tab_name = "browser_use_agent" # Hardcode or derive if needed + comp_id = f"{tab_name}.{comp_id_suffix}" + # Need to find the component object first using the ID from the manager + try: + comp = webui_manager.get_component_by_id(comp_id) + return comp_dict.get(comp, default) + except KeyError: + # Try accessing settings tabs as well + for prefix in ["agent_settings", "browser_settings"]: + try: + comp_id = f"{prefix}.{comp_id_suffix}" + comp = webui_manager.get_component_by_id(comp_id) + return comp_dict.get(comp, default) + except KeyError: + continue + logger.warning(f"Component with suffix '{comp_id_suffix}' not found in manager for value lookup.") + return default + + +def _format_agent_output(model_output: AgentOutput) -> str: + """Formats AgentOutput for display in the chatbot using JSON.""" + content = 
"" + if model_output: + try: + # Directly use model_dump if actions and current_state are Pydantic models + action_dump = [action.model_dump(exclude_none=True) for action in model_output.action] + + state_dump = model_output.current_state.model_dump(exclude_none=True) + model_output_dump = { + 'current_state': state_dump, + 'action': action_dump, + } + # Dump to JSON string with indentation + json_string = json.dumps(model_output_dump, indent=4, ensure_ascii=False) + # Wrap in
 for proper display in HTML
+            content = f"
{json_string}
" + + except AttributeError as ae: + logger.error( + f"AttributeError during model dump: {ae}. Check if 'action' or 'current_state' or their items support 'model_dump'.") + content = f"
Error: Could not format agent output (AttributeError: {ae}).\nRaw output: {str(model_output)}
" + except Exception as e: + logger.error(f"Error formatting agent output: {e}", exc_info=True) + # Fallback to simple string representation on error + content = f"
Error formatting agent output.\nRaw output:\n{str(model_output)}
" + + return content.strip() + + +# --- Updated Callback Implementation --- + +async def _handle_new_step(webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int): + """Callback for each step taken by the agent, including screenshot display.""" + + # Use the correct chat history attribute name from the user's code + if not hasattr(webui_manager, 'bu_chat_history'): + logger.error("Attribute 'bu_chat_history' not found in webui_manager! Cannot add chat message.") + # Initialize it maybe? Or raise an error? For now, log and potentially skip chat update. + webui_manager.bu_chat_history = [] # Initialize if missing (consider if this is the right place) + # return # Or stop if this is critical + step_num -= 1 + logger.info(f"Step {step_num} completed.") + + # --- Screenshot Handling --- + screenshot_html = "" + # Ensure state.screenshot exists and is not empty before proceeding + # Use getattr for safer access + screenshot_data = getattr(state, 'screenshot', None) + if screenshot_data: + try: + # Basic validation: check if it looks like base64 + if isinstance(screenshot_data, str) and len(screenshot_data) > 100: # Arbitrary length check + # *** UPDATED STYLE: Removed centering, adjusted width *** + img_tag = f'Step {step_num} Screenshot' + screenshot_html = img_tag + "
" # Use
for line break after inline-block image + else: + logger.warning( + f"Screenshot for step {step_num} seems invalid (type: {type(screenshot_data)}, len: {len(screenshot_data) if isinstance(screenshot_data, str) else 'N/A'}).") + screenshot_html = "**[Invalid screenshot data]**
" + + except Exception as e: + logger.error(f"Error processing or formatting screenshot for step {step_num}: {e}", exc_info=True) + screenshot_html = "**[Error displaying screenshot]**
" + else: + logger.debug(f"No screenshot available for step {step_num}.") + + # --- Format Agent Output --- + formatted_output = _format_agent_output(output) # Use the updated function + + # --- Combine and Append to Chat --- + step_header = f"--- **Step {step_num}** ---" + # Combine header, image (with line break), and JSON block + final_content = step_header + "
" + screenshot_html + formatted_output + + chat_message = { + "role": "assistant", + "content": final_content.strip() # Remove leading/trailing whitespace + } + + # Append to the correct chat history list + webui_manager.bu_chat_history.append(chat_message) + + await asyncio.sleep(0.05) + + +def _handle_done(webui_manager: WebuiManager, history: AgentHistoryList): + """Callback when the agent finishes the task (success or failure).""" + logger.info( + f"Agent task finished. Duration: {history.total_duration_seconds():.2f}s, Tokens: {history.total_input_tokens()}") + final_summary = f"**Task Completed**\n" + final_summary += f"- Duration: {history.total_duration_seconds():.2f} seconds\n" + final_summary += f"- Total Input Tokens: {history.total_input_tokens()}\n" # Or total tokens if available + + final_result = history.final_result() + if final_result: + final_summary += f"- Final Result: {final_result}\n" + + errors = history.errors() + if errors and any(errors): + final_summary += f"- **Errors:**\n```\n{errors}\n```\n" + else: + final_summary += "- Status: Success\n" + + webui_manager.bu_chat_history.append({"role": "assistant", "content": final_summary}) + + +async def _ask_assistant_callback(webui_manager: WebuiManager, query: str, browser_context: BrowserContext) -> Dict[ + str, Any]: + """Callback triggered by the agent's ask_for_assistant action.""" + logger.info("Agent requires assistance. 
Waiting for user input.") + + if not hasattr(webui_manager, '_chat_history'): + logger.error("Chat history not found in webui_manager during ask_assistant!") + return {"response": "Internal Error: Cannot display help request."} + + webui_manager.bu_chat_history.append({"role": "assistant", + "content": f"**Need Help:** {query}\nPlease provide information or perform the required action in the browser, then type your response/confirmation below and click 'Submit Response'."}) + + # Use state stored in webui_manager + webui_manager.bu_response_event = asyncio.Event() + webui_manager.bu_user_help_response = None # Reset previous response + + try: + logger.info("Waiting for user response event...") + await asyncio.wait_for(webui_manager.bu_response_event.wait(), timeout=3600.0) # Long timeout + logger.info("User response event received.") + except asyncio.TimeoutError: + logger.warning("Timeout waiting for user assistance.") + webui_manager.bu_chat_history.append( + {"role": "assistant", "content": "**Timeout:** No response received. Trying to proceed."}) + webui_manager.bu_response_event = None # Clear the event + return {"response": "Timeout: User did not respond."} # Inform the agent + + response = webui_manager.bu_user_help_response + webui_manager.bu_chat_history.append({"role": "user", "content": response}) # Show user response in chat + webui_manager.bu_response_event = None # Clear the event for the next potential request + return {"response": response} + + +async def capture_screenshot(browser_context): + """Capture and encode a screenshot""" + # Extract the Playwright browser instance + playwright_browser = browser_context.browser.playwright_browser # Ensure this is correct. 
+ + # Check if the browser instance is valid and if an existing context can be reused + if playwright_browser and playwright_browser.contexts: + playwright_context = playwright_browser.contexts[0] + else: + return None + + # Access pages in the context + pages = None + if playwright_context: + pages = playwright_context.pages + + # Use an existing page or create a new one if none exist + if pages: + active_page = pages[0] + for page in pages: + if page.url != "about:blank": + active_page = page + else: + return None + + # Take screenshot + try: + screenshot = await active_page.screenshot( + type='jpeg', + quality=75, + scale="css" + ) + encoded = base64.b64encode(screenshot).decode('utf-8') + return encoded + except Exception as e: + return None + + +# --- Core Agent Execution Logic --- (Needs access to webui_manager) + +async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]) -> AsyncGenerator[ + Dict[gr.components.Component, Any], None]: + """Handles the entire lifecycle of initializing and running the agent.""" + + # --- Get Components --- + # Need handles to specific UI components to update them + user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input") + run_button_comp = webui_manager.get_component_by_id("browser_use_agent.run_button") + stop_button_comp = webui_manager.get_component_by_id("browser_use_agent.stop_button") + pause_resume_button_comp = webui_manager.get_component_by_id("browser_use_agent.pause_resume_button") + clear_button_comp = webui_manager.get_component_by_id("browser_use_agent.clear_button") + chatbot_comp = webui_manager.get_component_by_id("browser_use_agent.chatbot") + history_file_comp = webui_manager.get_component_by_id("browser_use_agent.agent_history_file") + gif_comp = webui_manager.get_component_by_id("browser_use_agent.recording_gif") + browser_view_comp = webui_manager.get_component_by_id("browser_use_agent.browser_view") + + # --- 1. 
Get Task and Initial UI Update --- + task = components.get(user_input_comp, "").strip() + if not task: + gr.Warning("Please enter a task.") + yield {run_button_comp: gr.update(interactive=True)} + return + + # Set running state indirectly via _current_task + webui_manager.bu_chat_history.append({"role": "user", "content": task}) + + yield { + user_input_comp: gr.Textbox(value="", interactive=False, placeholder="Agent is running..."), + run_button_comp: gr.Button(value="ā³ Running...", interactive=False), + stop_button_comp: gr.Button(interactive=True), + pause_resume_button_comp: gr.Button(value="āøļø Pause", interactive=True), + clear_button_comp: gr.Button(interactive=False), + chatbot_comp: gr.update(value=webui_manager.bu_chat_history), + history_file_comp: gr.update(value=None), + gif_comp: gr.update(value=None), + } + + # --- Agent Settings --- + # Access settings values via components dict, getting IDs from webui_manager + def get_setting(key, default=None): + comp = webui_manager.id_to_component.get(f"agent_settings.{key}") + return components.get(comp, default) if comp else default + + override_system_prompt = get_setting("override_system_prompt") or None + extend_system_prompt = get_setting("extend_system_prompt") or None + llm_provider_name = get_setting("llm_provider", None) # Default to None if not found + llm_model_name = get_setting("llm_model_name", None) + llm_temperature = get_setting("llm_temperature", 0.6) + use_vision = get_setting("use_vision", True) + ollama_num_ctx = get_setting("ollama_num_ctx", 16000) + llm_base_url = get_setting("llm_base_url") or None + llm_api_key = get_setting("llm_api_key") or None + max_steps = get_setting("max_steps", 100) + max_actions = get_setting("max_actions", 10) + max_input_tokens = get_setting("max_input_tokens", 128000) + tool_calling_str = get_setting("tool_calling_method", "auto") + tool_calling_method = tool_calling_str if tool_calling_str != "None" else None + mcp_server_config_comp = 
webui_manager.id_to_component.get("agent_settings.mcp_server_config") + mcp_server_config_str = components.get(mcp_server_config_comp) if mcp_server_config_comp else None + mcp_server_config = json.loads(mcp_server_config_str) if mcp_server_config_str else None + + # Planner LLM Settings (Optional) + planner_llm_provider_name = get_setting("planner_llm_provider") or None + planner_llm = None + if planner_llm_provider_name: + planner_llm_model_name = get_setting("planner_llm_model_name") + planner_llm_temperature = get_setting("planner_llm_temperature", 0.6) + planner_ollama_num_ctx = get_setting("planner_ollama_num_ctx", 16000) + planner_llm_base_url = get_setting("planner_llm_base_url") or None + planner_llm_api_key = get_setting("planner_llm_api_key") or None + planner_use_vision = get_setting("planner_use_vision", False) + + planner_llm = await _initialize_llm( + planner_llm_provider_name, planner_llm_model_name, planner_llm_temperature, + planner_llm_base_url, planner_llm_api_key, + planner_ollama_num_ctx if planner_llm_provider_name == "ollama" else None + ) + + # --- Browser Settings --- + def get_browser_setting(key, default=None): + comp = webui_manager.id_to_component.get(f"browser_settings.{key}") + return components.get(comp, default) if comp else default + + browser_binary_path = get_browser_setting("browser_binary_path") or None + browser_user_data_dir = get_browser_setting("browser_user_data_dir") or None + use_own_browser = get_browser_setting("use_own_browser", False) # Logic handled by CDP/WSS presence + keep_browser_open = get_browser_setting("keep_browser_open", False) + headless = get_browser_setting("headless", False) + disable_security = get_browser_setting("disable_security", True) + window_w = int(get_browser_setting("window_w", 1280)) + window_h = int(get_browser_setting("window_h", 1100)) + cdp_url = get_browser_setting("cdp_url") or None + wss_url = get_browser_setting("wss_url") or None + save_recording_path = 
get_browser_setting("save_recording_path") or None + save_trace_path = get_browser_setting("save_trace_path") or None + save_agent_history_path = get_browser_setting("save_agent_history_path", "./tmp/agent_history") + save_download_path = get_browser_setting("save_download_path", "./tmp/downloads") + + stream_vw = 80 + stream_vh = int(80 * window_h // window_w) + + os.makedirs(save_agent_history_path, exist_ok=True) + if save_recording_path: os.makedirs(save_recording_path, exist_ok=True) + if save_trace_path: os.makedirs(save_trace_path, exist_ok=True) + if save_download_path: os.makedirs(save_download_path, exist_ok=True) + + # --- 2. Initialize LLM --- + main_llm = await _initialize_llm( + llm_provider_name, llm_model_name, llm_temperature, llm_base_url, llm_api_key, + ollama_num_ctx if llm_provider_name == "ollama" else None + ) + + # Pass the webui_manager instance to the callback when wrapping it + async def ask_callback_wrapper(query: str, browser_context: BrowserContext) -> Dict[str, Any]: + return await _ask_assistant_callback(webui_manager, query, browser_context) + + if not webui_manager.bu_controller: + webui_manager.bu_controller = CustomController(ask_assistant_callback=ask_callback_wrapper) + await webui_manager.bu_controller.setup_mcp_client(mcp_server_config) + + # --- 4. 
Initialize Browser and Context --- + should_close_browser_on_finish = not keep_browser_open + + try: + # Close existing resources if not keeping open + if not keep_browser_open: + if webui_manager.bu_browser_context: + logger.info("Closing previous browser context.") + await webui_manager.bu_browser_context.close() + webui_manager.bu_browser_context = None + if webui_manager.bu_browser: + logger.info("Closing previous browser.") + await webui_manager.bu_browser.close() + webui_manager.bu_browser = None + + # Create Browser if needed + if not webui_manager.bu_browser: + logger.info("Launching new browser instance.") + extra_args = [f"--window-size={window_w},{window_h}"] + if browser_user_data_dir: + extra_args.append(f"--user-data-dir={browser_user_data_dir}") + + if use_own_browser: + browser_binary_path = os.getenv("CHROME_PATH", None) or browser_binary_path + if browser_binary_path == "": + browser_binary_path = None + chrome_user_data = os.getenv("CHROME_USER_DATA", None) + if chrome_user_data: + extra_args += [f"--user-data-dir={chrome_user_data}"] + else: + browser_binary_path = None + + webui_manager.bu_browser = CustomBrowser( + config=BrowserConfig( + headless=headless, + disable_security=disable_security, + browser_binary_path=browser_binary_path, + extra_browser_args=extra_args, + wss_url=wss_url, + cdp_url=cdp_url, + ) + ) + + # Create Context if needed + if not webui_manager.bu_browser_context: + logger.info("Creating new browser context.") + context_config = CustomBrowserContextConfig( + trace_path=save_trace_path if save_trace_path else None, + save_recording_path=save_recording_path if save_recording_path else None, + save_downloads_path=save_download_path if save_download_path else None, + browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h) + ) + if not webui_manager.bu_browser: + raise ValueError("Browser not initialized, cannot create context.") + webui_manager.bu_browser_context = await 
webui_manager.bu_browser.new_context(config=context_config) + + # --- 5. Initialize or Update Agent --- + webui_manager.bu_agent_task_id = str(uuid.uuid4()) # New ID for this task run + os.makedirs(os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id), exist_ok=True) + history_file = os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id, + f"{webui_manager.bu_agent_task_id}.json") + gif_path = os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id, + f"{webui_manager.bu_agent_task_id}.gif") + + # Pass the webui_manager to callbacks when wrapping them + async def step_callback_wrapper(state: BrowserState, output: AgentOutput, step_num: int): + await _handle_new_step(webui_manager, state, output, step_num) + + def done_callback_wrapper(history: AgentHistoryList): + _handle_done(webui_manager, history) + + if not webui_manager.bu_agent: + logger.info(f"Initializing new agent for task: {task}") + if not webui_manager.bu_browser or not webui_manager.bu_browser_context: + raise ValueError("Browser or Context not initialized, cannot create agent.") + + webui_manager.bu_agent = Agent( + task=task, + llm=main_llm, + browser=webui_manager.bu_browser, + browser_context=webui_manager.bu_browser_context, + controller=webui_manager.bu_controller, + register_new_step_callback=step_callback_wrapper, + register_done_callback=done_callback_wrapper, + # Agent settings + use_vision=use_vision, + override_system_message=override_system_prompt, + extend_system_message=extend_system_prompt, + max_input_tokens=max_input_tokens, + max_actions_per_step=max_actions, + tool_calling_method=tool_calling_method, + planner_llm=planner_llm, + use_vision_for_planner=planner_use_vision if planner_llm else False, + save_conversation_path=history_file, + ) + webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id + webui_manager.bu_agent.settings.generate_gif = gif_path + else: + webui_manager.bu_agent.state.agent_id = 
webui_manager.bu_agent_task_id + webui_manager.bu_agent.add_new_task(task) + webui_manager.bu_agent.settings.generate_gif = gif_path + + # --- 6. Run Agent Task and Stream Updates --- + agent_run_coro = webui_manager.bu_agent.run(max_steps=max_steps) + agent_task = asyncio.create_task(agent_run_coro) + webui_manager.bu_current_task = agent_task # Store the task + + last_chat_len = len(webui_manager.bu_chat_history) + while not agent_task.done(): + is_paused = webui_manager.bu_agent.state.paused + is_stopped = webui_manager.bu_agent.state.stopped + + # Check for pause state + if is_paused: + yield { + pause_resume_button_comp: gr.update(value="ā–¶ļø Resume", interactive=True), + run_button_comp: gr.update(value="āøļø Paused", interactive=False), + stop_button_comp: gr.update(interactive=True), # Allow stop while paused + } + # Wait until pause is released or task is stopped/done + while is_paused and not agent_task.done(): + # Re-check agent state in loop + is_paused = webui_manager.bu_agent.state.paused + is_stopped = webui_manager.bu_agent.state.stopped + if is_stopped: # Stop signal received while paused + break + await asyncio.sleep(0.2) + + if agent_task.done() or is_stopped: # If stopped or task finished while paused + break + + # If resumed, yield UI update + yield { + pause_resume_button_comp: gr.update(value="āøļø Pause", interactive=True), + run_button_comp: gr.update(value="ā³ Running...", interactive=False), + } + + # Check if agent stopped itself or stop button was pressed (which sets agent.state.stopped) + if is_stopped: + logger.info("Agent has stopped (internally or via stop button).") + if not agent_task.done(): + # Ensure the task coroutine finishes if agent just set flag + try: + await asyncio.wait_for(agent_task, timeout=1.0) # Give it a moment to exit run() + except asyncio.TimeoutError: + logger.warning("Agent task did not finish quickly after stop signal, cancelling.") + agent_task.cancel() + except Exception: # Catch task exceptions if 
it errors on stop + pass + break # Exit the streaming loop + + # Check if agent is asking for help (via response_event) + update_dict = {} + if webui_manager.bu_response_event is not None: + update_dict = { + user_input_comp: gr.update(placeholder="Agent needs help. Enter response and submit.", + interactive=True), + run_button_comp: gr.update(value="āœ”ļø Submit Response", interactive=True), + pause_resume_button_comp: gr.update(interactive=False), + stop_button_comp: gr.update(interactive=False), + chatbot_comp: gr.update(value=webui_manager.bu_chat_history) + } + last_chat_len = len(webui_manager.bu_chat_history) + yield update_dict + # Wait until response is submitted or task finishes + while webui_manager.bu_response_event is not None and not agent_task.done(): + await asyncio.sleep(0.2) + # Restore UI after response submitted or if task ended unexpectedly + if not agent_task.done(): + yield { + user_input_comp: gr.update(placeholder="Agent is running...", interactive=False), + run_button_comp: gr.update(value="ā³ Running...", interactive=False), + pause_resume_button_comp: gr.update(interactive=True), + stop_button_comp: gr.update(interactive=True), + } + else: + break # Task finished while waiting for response + + # Update Chatbot if new messages arrived via callbacks + if len(webui_manager.bu_chat_history) > last_chat_len: + update_dict[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history) + last_chat_len = len(webui_manager.bu_chat_history) + + # Update Browser View + if headless and webui_manager.bu_browser_context: + try: + screenshot_b64 = await capture_screenshot(webui_manager.bu_browser_context) + if screenshot_b64: + html_content = f'' + update_dict[browser_view_comp] = gr.update(value=html_content, visible=True) + else: + html_content = f"

Waiting for browser session...

" + update_dict[browser_view_comp] = gr.update(value=html_content, + visible=True) + except Exception as e: + logger.debug(f"Failed to capture screenshot: {e}") + update_dict[browser_view_comp] = gr.update(value="
Error loading view...
", + visible=True) + else: + update_dict[browser_view_comp] = gr.update(visible=False) + + # Yield accumulated updates + if update_dict: + yield update_dict + + await asyncio.sleep(0.1) # Polling interval + + # --- 7. Task Finalization --- + webui_manager.bu_agent.state.paused = False + webui_manager.bu_agent.state.stopped = False + final_update = {} + try: + logger.info("Agent task completing...") + # Await the task ensure completion and catch exceptions if not already caught + if not agent_task.done(): + await agent_task # Retrieve result/exception + elif agent_task.exception(): # Check if task finished with exception + agent_task.result() # Raise the exception to be caught below + logger.info("Agent task completed processing.") + + logger.info(f"Explicitly saving agent history to: {history_file}") + webui_manager.bu_agent.save_history(history_file) + + if os.path.exists(history_file): + final_update[history_file_comp] = gr.File(value=history_file) + + if gif_path and os.path.exists(gif_path): + logger.info(f"GIF found at: {gif_path}") + final_update[gif_comp] = gr.Image(value=gif_path) + + except asyncio.CancelledError: + logger.info("Agent task was cancelled.") + if not any("Cancelled" in msg.get("content", "") for msg in webui_manager.bu_chat_history if + msg.get("role") == "assistant"): + webui_manager.bu_chat_history.append({"role": "assistant", "content": "**Task Cancelled**."}) + final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history) + except Exception as e: + logger.error(f"Error during agent execution: {e}", exc_info=True) + error_message = f"**Agent Execution Error:**\n```\n{type(e).__name__}: {e}\n```" + if not any(error_message in msg.get("content", "") for msg in webui_manager.bu_chat_history if + msg.get("role") == "assistant"): + webui_manager.bu_chat_history.append({"role": "assistant", "content": error_message}) + final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history) + gr.Error(f"Agent execution failed: 
{e}") + + finally: + webui_manager.bu_current_task = None # Clear the task reference + + # Close browser/context if requested + if should_close_browser_on_finish: + if webui_manager.bu_browser_context: + logger.info("Closing browser context after task.") + await webui_manager.bu_browser_context.close() + webui_manager.bu_browser_context = None + if webui_manager.bu_browser: + logger.info("Closing browser after task.") + await webui_manager.bu_browser.close() + webui_manager.bu_browser = None + + # --- 8. Final UI Update --- + final_update.update({ + user_input_comp: gr.update(value="", interactive=True, placeholder="Enter your next task..."), + run_button_comp: gr.update(value="ā–¶ļø Submit Task", interactive=True), + stop_button_comp: gr.update(interactive=False), + pause_resume_button_comp: gr.update(value="āøļø Pause", interactive=False), + clear_button_comp: gr.update(interactive=True), + # Ensure final chat history is shown + chatbot_comp: gr.update(value=webui_manager.bu_chat_history) + }) + yield final_update + + except Exception as e: + # Catch errors during setup (before agent run starts) + logger.error(f"Error setting up agent task: {e}", exc_info=True) + webui_manager.bu_current_task = None # Ensure state is reset + yield { + user_input_comp: gr.update(interactive=True, placeholder="Error during setup. 
Enter task..."), + run_button_comp: gr.update(value="ā–¶ļø Submit Task", interactive=True), + stop_button_comp: gr.update(interactive=False), + pause_resume_button_comp: gr.update(value="āøļø Pause", interactive=False), + clear_button_comp: gr.update(interactive=True), + chatbot_comp: gr.update( + value=webui_manager.bu_chat_history + [{"role": "assistant", "content": f"**Setup Error:** {e}"}]), + } + + +# --- Button Click Handlers --- (Need access to webui_manager) + +async def handle_submit(webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]): + """Handles clicks on the main 'Submit' button.""" + user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input") + user_input_value = components.get(user_input_comp, "").strip() + + # Check if waiting for user assistance + if webui_manager.bu_response_event and not webui_manager.bu_response_event.is_set(): + logger.info(f"User submitted assistance: {user_input_value}") + webui_manager.bu_user_help_response = user_input_value if user_input_value else "User provided no text response." + webui_manager.bu_response_event.set() + # UI updates handled by the main loop reacting to the event being set + yield { + user_input_comp: gr.update(value="", interactive=False, placeholder="Waiting for agent to continue..."), + webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(value="ā³ Running...", + interactive=False) + } + # Check if a task is currently running (using _current_task) + elif webui_manager.bu_current_task and not webui_manager.bu_current_task.done(): + logger.warning("Submit button clicked while agent is already running and not asking for help.") + gr.Info("Agent is currently running. 
Please wait or use Stop/Pause.") + yield {} # No change + else: + # Handle submission for a new task + logger.info("Submit button clicked for new task.") + # Use async generator to stream updates from run_agent_task + async for update in run_agent_task(webui_manager, components): + yield update + + +async def handle_stop(webui_manager: WebuiManager): + """Handles clicks on the 'Stop' button.""" + logger.info("Stop button clicked.") + agent = webui_manager.bu_agent + task = webui_manager.bu_current_task + + if agent and task and not task.done(): + # Signal the agent to stop by setting its internal flag + agent.state.stopped = True + agent.state.paused = False # Ensure not paused if stopped + return { + webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update(interactive=False, + value="ā¹ļø Stopping..."), + webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(interactive=False), + webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(interactive=False), + } + else: + logger.warning("Stop clicked but agent is not running or task is already done.") + # Reset UI just in case it's stuck + return { + webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(interactive=True), + webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update(interactive=False), + webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(interactive=False), + webui_manager.get_component_by_id("browser_use_agent.clear_button"): gr.update(interactive=True), + } + + +async def handle_pause_resume(webui_manager: WebuiManager): + """Handles clicks on the 'Pause/Resume' button.""" + agent = webui_manager.bu_agent + task = webui_manager.bu_current_task + + if agent and task and not task.done(): + if agent.state.paused: + logger.info("Resume button clicked.") + agent.resume() + # UI update happens in main loop + return { + 
webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(value="āøļø Pause", + interactive=True)} # Optimistic update + else: + logger.info("Pause button clicked.") + agent.pause() + return { + webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(value="ā–¶ļø Resume", + interactive=True)} # Optimistic update + else: + logger.warning("Pause/Resume clicked but agent is not running or doesn't support state.") + return {} # No change + + +async def handle_clear(webui_manager: WebuiManager): + """Handles clicks on the 'Clear' button.""" + logger.info("Clear button clicked.") + + # Stop any running task first + task = webui_manager.bu_current_task + if task and not task.done(): + logger.info("Clearing requires stopping the current task.") + webui_manager.bu_agent.stop() + try: + await asyncio.wait_for(task, timeout=2.0) # Wait briefly + except (asyncio.CancelledError, asyncio.TimeoutError): + pass + except Exception as e: + logger.warning(f"Error stopping task on clear: {e}") + webui_manager.bu_current_task.cancel() + webui_manager.bu_current_task = None + + if webui_manager.bu_controller: + await webui_manager.bu_controller.close_mcp_client() + webui_manager.bu_controller = None + webui_manager.bu_agent = None + + # Reset state stored in manager + webui_manager.bu_chat_history = [] + webui_manager.bu_response_event = None + webui_manager.bu_user_help_response = None + webui_manager.bu_agent_task_id = None + + logger.info("Agent state and browser resources cleared.") + + # Reset UI components + return { + webui_manager.get_component_by_id("browser_use_agent.chatbot"): gr.update(value=[]), + webui_manager.get_component_by_id("browser_use_agent.user_input"): gr.update(value="", + placeholder="Enter your task here..."), + webui_manager.get_component_by_id("browser_use_agent.agent_history_file"): gr.update(value=None), + webui_manager.get_component_by_id("browser_use_agent.recording_gif"): 
gr.update(value=None), + webui_manager.get_component_by_id("browser_use_agent.browser_view"): gr.update( + value="
Browser Cleared
"), + webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(value="ā–¶ļø Submit Task", + interactive=True), + webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update(interactive=False), + webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(value="āøļø Pause", + interactive=False), + webui_manager.get_component_by_id("browser_use_agent.clear_button"): gr.update(interactive=True), + } + + +# --- Tab Creation Function --- + +def create_browser_use_agent_tab(webui_manager: WebuiManager): """ - Create the run agent tab + Create the run agent tab, defining UI, state, and handlers. """ - input_components = set(webui_manager.get_components()) + webui_manager.init_browser_use_agent() + + # --- Define UI Components --- tab_components = {} - - chatbot = gr.Chatbot(type='messages', label="Chat History", height=600) - user_input = gr.Textbox( - label="User Input", - lines=3, - value="go to google.com and type 'OpenAI' click search and give me the first url", - interactive=True - ) - - with gr.Row(): - stop_button = gr.Button("ā¹ļø Stop", interactive=False, variant="stop", scale=2) - clear_button = gr.Button("🧹 Clear", interactive=True, variant="stop", scale=2) - run_button = gr.Button("ā–¶ļø Summit", variant="primary", scale=3) - - browser_view = gr.HTML( - value="

Waiting for browser session...

", - label="Browser Live View", - visible=False - ) - - with gr.Row(): - agent_final_result = gr.Textbox( - label="Final Result", lines=3, show_label=True, interactive=False + with gr.Column(): + chatbot = gr.Chatbot( + lambda: webui_manager.bu_chat_history, # Load history dynamically + elem_id="browser_use_chatbot", + label="Agent Interaction", + type="messages", + height=600, + show_copy_button=True, + bubble_full_width=False, ) - agent_errors = gr.Textbox( - label="Errors", lines=3, show_label=True, interactive=False + user_input = gr.Textbox( + label="Your Task or Response", + placeholder="Enter your task here or provide assistance when asked.", + lines=3, + interactive=True, + elem_id="user_input" ) + with gr.Row(): + stop_button = gr.Button("ā¹ļø Stop", interactive=False, variant="stop", scale=1) + pause_resume_button = gr.Button("āøļø Pause", interactive=False, variant="secondary", scale=1) + clear_button = gr.Button("šŸ—‘ļø Clear", interactive=True, variant="secondary", scale=1) + run_button = gr.Button("ā–¶ļø Submit Task", variant="primary", scale=2) - with gr.Row(): - agent_trace_file = gr.File(label="Trace File", interactive=False) - agent_history_file = gr.File(label="Agent History", interactive=False) + browser_view = gr.HTML( + value="

Browser View (Requires Headless=True)

", + label="Browser Live View", + elem_id="browser_view", + visible=False, + ) + with gr.Column(): + gr.Markdown("### Task Outputs") + agent_history_file = gr.File(label="Agent History JSON", interactive=False) + recording_gif = gr.Image(label="Task Recording GIF", format="gif", interactive=False, + type="filepath") - recording_gif = gr.Image(label="Result GIF", format="gif", interactive=False) + # --- Store Components in Manager --- tab_components.update( dict( - chatbot=chatbot, - user_input=user_input, - clear_button=clear_button, - run_button=run_button, - stop_button=stop_button, - agent_final_result=agent_final_result, - agent_errors=agent_errors, - agent_trace_file=agent_trace_file, - agent_history_file=agent_history_file, - recording_gif=recording_gif, + chatbot=chatbot, user_input=user_input, clear_button=clear_button, + run_button=run_button, stop_button=stop_button, pause_resume_button=pause_resume_button, + agent_history_file=agent_history_file, recording_gif=recording_gif, browser_view=browser_view ) ) - return tab_components + webui_manager.add_components("browser_use_agent", tab_components) # Use "browser_use_agent" as tab_name prefix + + all_managed_components = set(webui_manager.get_components()) # Get all components known to manager + run_tab_outputs = list(tab_components.values()) + + async def submit_wrapper(components_dict: Dict[Component, Any]) -> AsyncGenerator[Dict[Component, Any], None]: + """Wrapper for handle_submit that yields its results.""" + # handle_submit is an async generator, iterate and yield + async for update in handle_submit(webui_manager, components_dict): + yield update + + async def stop_wrapper() -> AsyncGenerator[Dict[Component, Any], None]: + """Wrapper for handle_stop.""" + # handle_stop is async def but returns a single dict. We yield it once. 
+ update_dict = await handle_stop(webui_manager) + yield update_dict # Yield the final dictionary + + async def pause_resume_wrapper() -> AsyncGenerator[Dict[Component, Any], None]: + """Wrapper for handle_pause_resume.""" + update_dict = await handle_pause_resume(webui_manager) + yield update_dict + + async def clear_wrapper() -> AsyncGenerator[Dict[Component, Any], None]: + """Wrapper for handle_clear.""" + update_dict = await handle_clear(webui_manager) + yield update_dict + + # --- Connect Event Handlers using the Wrappers -- + run_button.click( + fn=submit_wrapper, + inputs=all_managed_components, + outputs=run_tab_outputs + ) + user_input.submit( + fn=submit_wrapper, + inputs=all_managed_components, + outputs=run_tab_outputs + ) + stop_button.click( + fn=stop_wrapper, + inputs=None, + outputs=run_tab_outputs + ) + pause_resume_button.click( + fn=pause_resume_wrapper, + inputs=None, + outputs=run_tab_outputs + ) + clear_button.click( + fn=clear_wrapper, + inputs=None, + outputs=run_tab_outputs + ) + diff --git a/src/webui/components/deep_research_agent_tab.py b/src/webui/components/deep_research_agent_tab.py index d9dfc24..5ce8dd7 100644 --- a/src/webui/components/deep_research_agent_tab.py +++ b/src/webui/components/deep_research_agent_tab.py @@ -38,4 +38,4 @@ def create_deep_research_agent_tab(webui_manager: WebuiManager) -> dict[str, Com markdown_download=markdown_download, ) ) - return tab_components + webui_manager.add_components("deep_research_agent", tab_components) diff --git a/src/webui/components/load_save_config_tab.py b/src/webui/components/load_save_config_tab.py index 91dcad7..acc0f69 100644 --- a/src/webui/components/load_save_config_tab.py +++ b/src/webui/components/load_save_config_tab.py @@ -34,16 +34,17 @@ def create_load_save_config_tab(webui_manager: WebuiManager) -> dict[str, Compon config_file=config_file, )) + webui_manager.add_components("load_save_config", tab_components) + save_config_button.click( - 
fn=webui_manager.save_current_config, - inputs=[], + fn=webui_manager.save_config, + inputs=set(webui_manager.get_components()), outputs=[config_status] ) load_config_button.click( fn=webui_manager.load_config, inputs=[config_file], - outputs=[config_status] + outputs=webui_manager.get_components(), ) - return tab_components diff --git a/src/webui/interface.py b/src/webui/interface.py index 266b079..ba99245 100644 --- a/src/webui/interface.py +++ b/src/webui/interface.py @@ -32,6 +32,9 @@ def create_ui(theme_name="Ocean"): text-align: center; margin-bottom: 20px; } + .tab-header-text { + text-align: center; + } .theme-section { margin-bottom: 10px; padding: 15px; @@ -67,18 +70,26 @@ def create_ui(theme_name="Ocean"): with gr.Tabs() as tabs: with gr.TabItem("āš™ļø Agent Settings"): - ui_manager.add_components("agent_settings", create_agent_settings_tab(ui_manager)) + create_agent_settings_tab(ui_manager) with gr.TabItem("🌐 Browser Settings"): - ui_manager.add_components("browser_settings", create_browser_settings_tab(ui_manager)) + create_browser_settings_tab(ui_manager) with gr.TabItem("šŸ¤– Run Agent"): - ui_manager.add_components("browser_use_agent", create_browser_use_agent_tab(ui_manager)) + create_browser_use_agent_tab(ui_manager) - with gr.TabItem("🧐 Deep Research"): - ui_manager.add_components("deep_research_agent", create_deep_research_agent_tab(ui_manager)) + with gr.TabItem("šŸŽ Agent Collections"): + gr.Markdown( + """ + ### Agents built on Browser-Use + """, + elem_classes=["tab-header-text"], + ) + with gr.Tabs(): + with gr.TabItem("Deep Research"): + create_deep_research_agent_tab(ui_manager) with gr.TabItem("šŸ“ Load & Save Config"): - ui_manager.add_components("load_save_config", create_load_save_config_tab(ui_manager)) + create_load_save_config_tab(ui_manager) return demo diff --git a/src/webui/webui_manager.py b/src/webui/webui_manager.py index 033564a..5cbd31f 100644 --- a/src/webui/webui_manager.py +++ b/src/webui/webui_manager.py @@ -4,11 
+4,17 @@ from typing import TYPE_CHECKING import os import gradio as gr from datetime import datetime +from typing import Optional, Dict, List +import uuid +import asyncio from gradio.components import Component from browser_use.browser.browser import Browser from browser_use.browser.context import BrowserContext from browser_use.agent.service import Agent +from src.browser.custom_browser import CustomBrowser +from src.browser.custom_context import CustomBrowserContext +from src.controller.custom_controller import CustomController class WebuiManager: @@ -19,9 +25,19 @@ class WebuiManager: self.settings_save_dir = settings_save_dir os.makedirs(self.settings_save_dir, exist_ok=True) - self.browser: Browser = None - self.browser_context: BrowserContext = None - self.bu_agent: Agent = None + def init_browser_use_agent(self) -> None: + """ + init browser use agent + """ + self.bu_agent: Optional[Agent] = None + self.bu_browser: Optional[CustomBrowser] = None + self.bu_browser_context: Optional[CustomBrowserContext] = None + self.bu_controller: Optional[CustomController] = None + self.bu_chat_history: List[Dict[str, Optional[str]]] = [] + self.bu_response_event: Optional[asyncio.Event] = None + self.bu_user_help_response: Optional[str] = None + self.bu_current_task: Optional[asyncio.Task] = None + self.bu_agent_task_id: Optional[str] = None def add_components(self, tab_name: str, components_dict: dict[str, "Component"]) -> None: """ @@ -50,15 +66,16 @@ class WebuiManager: """ return self.component_to_id[comp] - def save_current_config(self): + def save_config(self, components: Dict["Component", str]) -> None: """ - Save current config + Save config """ cur_settings = {} - for comp_id, comp in self.id_to_component.items(): + for comp in components: if not isinstance(comp, gr.Button) and not isinstance(comp, gr.File) and str( getattr(comp, "interactive", True)).lower() != "false": - cur_settings[comp_id] = getattr(comp, "value", None) + comp_id = 
self.get_id_by_component(comp) + cur_settings[comp_id] = components[comp] config_name = datetime.now().strftime("%Y%m%d-%H%M%S") with open(os.path.join(self.settings_save_dir, f"{config_name}.json"), "w") as fw: @@ -76,6 +93,13 @@ class WebuiManager: update_components = {} for comp_id, comp_val in ui_settings.items(): if comp_id in self.id_to_component: - update_components[self.id_to_component[comp_id]].value = comp_val + comp = self.id_to_component[comp_id] + update_components[comp] = comp.__class__(value=comp_val) - return f"Successfully loaded config from {config_path}" + config_status = self.id_to_component["load_save_config.config_status"] + update_components.update( + { + config_status: config_status.__class__(value=f"Successfully loaded config: {config_path}") + } + ) + yield update_components diff --git a/tests/test_agents.py b/tests/test_agents.py index 27bb704..79e48d6 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -17,98 +17,18 @@ from browser_use.agent.views import AgentHistoryList from src.utils import utils -async def test_browser_use_org(): +async def test_browser_use_agent(): from browser_use.browser.browser import Browser, BrowserConfig from browser_use.browser.context import ( BrowserContextConfig, BrowserContextWindowSize, ) + from browser_use.agent.service import Agent - # llm = utils.get_llm_model( - # provider="azure_openai", - # model_name="gpt-4o", - # temperature=0.8, - # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), - # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), - # ) - - # llm = utils.get_llm_model( - # provider="deepseek", - # model_name="deepseek-chat", - # temperature=0.8 - # ) - - llm = utils.get_llm_model( - provider="ollama", model_name="deepseek-r1:14b", temperature=0.5 - ) - - window_w, window_h = 1920, 1080 - use_vision = False - use_own_browser = False - if use_own_browser: - chrome_path = os.getenv("CHROME_PATH", None) - if chrome_path == "": - chrome_path = None - else: - chrome_path = None - - 
tool_calling_method = "json_schema" # setting to json_schema when using ollma - - browser = Browser( - config=BrowserConfig( - headless=False, - disable_security=True, - chrome_instance_path=chrome_path, - extra_chromium_args=[f"--window-size={window_w},{window_h}"], - ) - ) - async with await browser.new_context( - config=BrowserContextConfig( - trace_path="./tmp/traces", - save_recording_path="./tmp/record_videos", - no_viewport=False, - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), - ) - ) as browser_context: - agent = Agent( - task="go to google.com and type 'OpenAI' click search and give me the first url", - llm=llm, - browser_context=browser_context, - use_vision=use_vision, - tool_calling_method=tool_calling_method - ) - history: AgentHistoryList = await agent.run(max_steps=10) - - print("Final Result:") - pprint(history.final_result(), indent=4) - - print("\nErrors:") - pprint(history.errors(), indent=4) - - # e.g. xPaths the model clicked on - print("\nModel Outputs:") - pprint(history.model_actions(), indent=4) - - print("\nThoughts:") - pprint(history.model_thoughts(), indent=4) - # close browser - await browser.close() - - -async def test_browser_use_custom(): - from browser_use.browser.context import BrowserContextWindowSize - from browser_use.browser.browser import BrowserConfig - from playwright.async_api import async_playwright - - from src.agent.custom_agent import CustomAgent - from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt from src.browser.custom_browser import CustomBrowser - from src.browser.custom_context import BrowserContextConfig + from src.browser.custom_context import CustomBrowserContextConfig from src.controller.custom_controller import CustomController - - window_w, window_h = 1280, 1100 + from src.utils import llm_provider # llm = utils.get_llm_model( # provider="openai", @@ -118,14 +38,6 @@ async def test_browser_use_custom(): # 
api_key=os.getenv("OPENAI_API_KEY", ""), # ) - llm = utils.get_llm_model( - provider="azure_openai", - model_name="gpt-4o", - temperature=0.5, - base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), - api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), - ) - # llm = utils.get_llm_model( # provider="google", # model_name="gemini-2.0-flash", @@ -153,13 +65,43 @@ async def test_browser_use_custom(): # provider="ollama", model_name="deepseek-r1:14b", temperature=0.5 # ) + window_w, window_h = 1280, 1100 + + llm = llm_provider.get_llm_model( + provider="azure_openai", + model_name="gpt-4o", + temperature=0.5, + base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), + api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + ) + + mcp_server_config = { + "mcpServers": { + "markitdown": { + "command": "docker", + "args": [ + "run", + "--rm", + "-i", + "markitdown-mcp:latest" + ] + }, + "desktop-commander": { + "command": "npx", + "args": [ + "-y", + "@wonderwhy-er/desktop-commander" + ] + }, + } + } controller = CustomController() - use_own_browser = True + await controller.setup_mcp_client(mcp_server_config) + use_own_browser = False disable_security = True use_vision = True # Set to False when using DeepSeek max_actions_per_step = 10 - playwright = None browser = None browser_context = None @@ -178,29 +120,27 @@ async def test_browser_use_custom(): config=BrowserConfig( headless=False, disable_security=disable_security, - chrome_instance_path=chrome_path, - extra_chromium_args=extra_chromium_args, + browser_binary_path=chrome_path, + extra_browser_args=extra_chromium_args, ) ) browser_context = await browser.new_context( - config=BrowserContextConfig( + config=CustomBrowserContextConfig( trace_path="./tmp/traces", save_recording_path="./tmp/record_videos", - no_viewport=False, + save_downloads_path="./tmp/downloads", browser_window_size=BrowserContextWindowSize( width=window_w, height=window_h ), + force_new_context=True ) ) - agent = CustomAgent( - task="open youtube in tab 1 , open google 
email in tab 2, open facebook in tab 3", - add_infos="", # some hints for llm to complete the task + agent = Agent( + task="download pdf from https://arxiv.org/abs/2504.10458 and rename this pdf to 'GUI-r1-test.pdf'", llm=llm, browser=browser, browser_context=browser_context, controller=controller, - system_prompt_class=CustomSystemPrompt, - agent_prompt_class=CustomAgentMessagePrompt, use_vision=use_vision, max_actions_per_step=max_actions_per_step, generate_gif=True @@ -213,28 +153,17 @@ async def test_browser_use_custom(): print("\nErrors:") pprint(history.errors(), indent=4) - # e.g. xPaths the model clicked on - print("\nModel Outputs:") - pprint(history.model_actions(), indent=4) - - print("\nThoughts:") - pprint(history.model_thoughts(), indent=4) - except Exception: import traceback - traceback.print_exc() finally: - # ę˜¾å¼å…³é—­ęŒä¹…åŒ–äøŠäø‹ę–‡ if browser_context: await browser_context.close() - - # 关闭 Playwright 对豔 - if playwright: - await playwright.stop() if browser: await browser.close() + if controller: + await controller.close_mcp_client() async def test_browser_use_parallel(): @@ -242,13 +171,20 @@ async def test_browser_use_parallel(): from browser_use.browser.browser import BrowserConfig from playwright.async_api import async_playwright from browser_use.browser.browser import Browser - from src.agent.custom_agent import CustomAgent - from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt - from src.browser.custom_browser import CustomBrowser from src.browser.custom_context import BrowserContextConfig from src.controller.custom_controller import CustomController - window_w, window_h = 1920, 1080 + from browser_use.browser.browser import Browser, BrowserConfig + from browser_use.browser.context import ( + BrowserContextConfig, + BrowserContextWindowSize, + ) + from browser_use.agent.service import Agent + + from src.browser.custom_browser import CustomBrowser + from src.browser.custom_context import 
CustomBrowserContextConfig + from src.controller.custom_controller import CustomController + from src.utils import llm_provider # llm = utils.get_llm_model( # provider="openai", @@ -258,20 +194,13 @@ async def test_browser_use_parallel(): # api_key=os.getenv("OPENAI_API_KEY", ""), # ) - # llm = utils.get_llm_model( - # provider="azure_openai", - # model_name="gpt-4o", - # temperature=0.8, - # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), - # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), - # ) - llm = utils.get_llm_model( - provider="gemini", - model_name="gemini-2.0-flash-exp", - temperature=1.0, - api_key=os.getenv("GOOGLE_API_KEY", "") - ) + # llm = utils.get_llm_model( + # provider="google", + # model_name="gemini-2.0-flash", + # temperature=0.6, + # api_key=os.getenv("GOOGLE_API_KEY", "") + # ) # llm = utils.get_llm_model( # provider="deepseek", @@ -293,72 +222,119 @@ async def test_browser_use_parallel(): # provider="ollama", model_name="deepseek-r1:14b", temperature=0.5 # ) + window_w, window_h = 1280, 1100 + + llm = llm_provider.get_llm_model( + provider="azure_openai", + model_name="gpt-4o", + temperature=0.5, + base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), + api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + ) + + mcp_server_config = { + "mcpServers": { + "markitdown": { + "command": "docker", + "args": [ + "run", + "--rm", + "-i", + "markitdown-mcp:latest" + ] + }, + "desktop-commander": { + "command": "npx", + "args": [ + "-y", + "@wonderwhy-er/desktop-commander" + ] + }, + # "filesystem": { + # "command": "npx", + # "args": [ + # "-y", + # "@modelcontextprotocol/server-filesystem", + # "/Users/xxx/ai_workspace", + # ] + # }, + } + } controller = CustomController() - use_own_browser = True + await controller.setup_mcp_client(mcp_server_config) + use_own_browser = False disable_security = True use_vision = True # Set to False when using DeepSeek - max_actions_per_step = 1 - playwright = None + max_actions_per_step = 10 browser = None browser_context 
= None - browser = Browser( - config=BrowserConfig( - disable_security=True, - headless=False, - new_context_config=BrowserContextConfig(save_recording_path='./tmp/recordings'), - ) - ) - try: + extra_chromium_args = [f"--window-size={window_w},{window_h}"] + if use_own_browser: + chrome_path = os.getenv("CHROME_PATH", None) + if chrome_path == "": + chrome_path = None + chrome_user_data = os.getenv("CHROME_USER_DATA", None) + if chrome_user_data: + extra_chromium_args += [f"--user-data-dir={chrome_user_data}"] + else: + chrome_path = None + browser = CustomBrowser( + config=BrowserConfig( + headless=False, + disable_security=disable_security, + browser_binary_path=chrome_path, + extra_browser_args=extra_chromium_args, + ) + ) + browser_context = await browser.new_context( + config=CustomBrowserContextConfig( + trace_path="./tmp/traces", + save_recording_path="./tmp/record_videos", + save_downloads_path="./tmp/downloads", + browser_window_size=BrowserContextWindowSize( + width=window_w, height=window_h + ), + force_new_context=True + ) + ) agents = [ - Agent(task=task, llm=llm, browser=browser) + Agent(task=task, llm=llm, browser=browser, controller=controller) for task in [ 'Search Google for weather in Tokyo', - 'Check Reddit front page title', - 'Find NASA image of the day', - 'Check top story on CNN', + # 'Check Reddit front page title', + # 'Find NASA image of the day', + # 'Check top story on CNN', # 'Search latest SpaceX launch date', # 'Look up population of Paris', - # 'Find current time in Sydney', - # 'Check who won last Super Bowl', + 'Find current time in Sydney', + 'Check who won last Super Bowl', # 'Search trending topics on Twitter', ] ] history = await asyncio.gather(*[agent.run() for agent in agents]) - pdb.set_trace() print("Final Result:") pprint(history.final_result(), indent=4) print("\nErrors:") pprint(history.errors(), indent=4) - # e.g. 
xPaths the model clicked on - print("\nModel Outputs:") - pprint(history.model_actions(), indent=4) + pdb.set_trace() - print("\nThoughts:") - pprint(history.model_thoughts(), indent=4) - # close browser except Exception: import traceback traceback.print_exc() finally: - # ę˜¾å¼å…³é—­ęŒä¹…åŒ–äøŠäø‹ę–‡ if browser_context: await browser_context.close() - - # 关闭 Playwright 对豔 - if playwright: - await playwright.stop() if browser: await browser.close() if __name__ == "__main__": - asyncio.run(test_browser_use_org()) - # asyncio.run(test_browser_use_parallel()) - # asyncio.run(test_browser_use_custom()) + # asyncio.run(test_browser_use_agent()) + asyncio.run(test_browser_use_parallel()) diff --git a/tests/test_controller.py b/tests/test_controller.py index 6a10ebc..1e1608e 100644 --- a/tests/test_controller.py +++ b/tests/test_controller.py @@ -45,33 +45,37 @@ async def test_controller_with_mcp(): from src.controller.custom_controller import CustomController from browser_use.controller.registry.views import ActionModel - test_server_config = { - "playwright": { - "command": "npx", - "args": [ - "@playwright/mcp@latest", - ], - "transport": "stdio", - }, - "filesystem": { - "command": "npx", - "args": [ - "-y", - "@modelcontextprotocol/server-filesystem", - "/Users/xxx/ai_workspace", - ] - }, - "desktop-commander": { - "command": "npx", - "args": [ - "-y", - "@wonderwhy-er/desktop-commander" - ] + mcp_server_config = { + "mcpServers": { + "markitdown": { + "command": "docker", + "args": [ + "run", + "--rm", + "-i", + "markitdown-mcp:latest" + ] + }, + "desktop-commander": { + "command": "npx", + "args": [ + "-y", + "@wonderwhy-er/desktop-commander" + ] + }, + # "filesystem": { + # "command": "npx", + # "args": [ + # "-y", + # "@modelcontextprotocol/server-filesystem", + # "/Users/xxx/ai_workspace", + # ] + # }, } } controller = CustomController() - await controller.setup_mcp_client(test_server_config) + await controller.setup_mcp_client(mcp_server_config) action_name = 
"mcp.desktop-commander.execute_command" action_info = controller.registry.registry.actions[action_name] param_model = action_info.param_model @@ -85,7 +89,8 @@ async def test_controller_with_mcp(): result = await controller.act(action_model) result = result.extracted_content print(result) - if result and "Command is still running. Use read_output to get more output." in result and "PID" in result.split("\n")[0]: + if result and "Command is still running. Use read_output to get more output." in result and "PID" in \ + result.split("\n")[0]: pid = int(result.split("\n")[0].split("PID")[-1].strip()) action_name = "mcp.desktop-commander.read_output" action_info = controller.registry.registry.actions[action_name] diff --git a/tests/test_llm_api.py b/tests/test_llm_api.py index bee1e6b..c0e9e16 100644 --- a/tests/test_llm_api.py +++ b/tests/test_llm_api.py @@ -144,10 +144,10 @@ def test_ibm_model(): if __name__ == "__main__": # test_openai_model() # test_google_model() - # test_azure_openai_model() + test_azure_openai_model() # test_deepseek_model() # test_ollama_model() # test_deepseek_r1_model() # test_deepseek_r1_ollama_model() # test_mistral_model() - test_ibm_model() + # test_ibm_model() diff --git a/webui.py b/webui.py index 3066ecb..34e93ab 100644 --- a/webui.py +++ b/webui.py @@ -1,3 +1,5 @@ +from dotenv import load_dotenv +load_dotenv() import argparse from src.webui.interface import theme_map, create_ui diff --git a/webui2.py b/webui2.py index 33d7ece..98a23b4 100644 --- a/webui2.py +++ b/webui2.py @@ -42,77 +42,6 @@ _global_browser = None _global_browser_context = None _global_agent = None -# Create the global agent state instance -_global_agent_state = AgentState() - -# webui config -webui_config_manager = utils.ConfigManager() - - -def scan_and_register_components(blocks): - """ę‰«ęäø€äøŖ Blocks åÆ¹č±”å¹¶ę³Øå†Œå…¶äø­ēš„ę‰€ęœ‰äŗ¤äŗ’å¼ē»„ä»¶ļ¼Œä½†äøåŒ…ę‹¬ęŒ‰é’®""" - global webui_config_manager - - def traverse_blocks(block, prefix=""): - registered = 0 - - 
# 处理 Blocks č‡Ŗčŗ«ēš„ē»„ä»¶ - if hasattr(block, "children"): - for i, child in enumerate(block.children): - if isinstance(child, gr.components.Component): - # ęŽ’é™¤ęŒ‰é’® (Button) 组件 - if getattr(child, "interactive", False) and not isinstance(child, gr.Button): - name = f"{prefix}component_{i}" - if hasattr(child, "label") and child.label: - # ä½æē”Øę ‡ē­¾ä½œäøŗåē§°ēš„äø€éƒØåˆ† - label = child.label - name = f"{prefix}{label}" - logger.debug(f"Registering component: {name}") - webui_config_manager.register_component(name, child) - registered += 1 - elif hasattr(child, "children"): - # é€’å½’å¤„ē†åµŒå„—ēš„ Blocks - new_prefix = f"{prefix}block_{i}_" - registered += traverse_blocks(child, new_prefix) - - return registered - - total = traverse_blocks(blocks) - logger.info(f"Total registered components: {total}") - - -def save_current_config(): - return webui_config_manager.save_current_config() - - -def update_ui_from_config(config_file): - return webui_config_manager.update_ui_from_config(config_file) - - -def resolve_sensitive_env_variables(text): - """ - Replace environment variable placeholders ($SENSITIVE_*) with their values. - Only replaces variables that start with SENSITIVE_. 
- """ - if not text: - return text - - import re - - # Find all $SENSITIVE_* patterns - env_vars = re.findall(r'\$SENSITIVE_[A-Za-z0-9_]*', text) - - result = text - for var in env_vars: - # Remove the $ prefix to get the actual environment variable name - env_name = var[1:] # removes the $ - env_value = os.getenv(env_name) - if env_value is not None: - # Replace $SENSITIVE_VAR_NAME with its value - result = result.replace(var, env_value) - - return result - async def stop_agent(): """Request the agent to stop and update UI with enhanced feedback""" @@ -140,32 +69,6 @@ async def stop_agent(): ) -async def stop_research_agent(): - """Request the agent to stop and update UI with enhanced feedback""" - global _global_agent_state - - try: - # Request stop - _global_agent_state.request_stop() - - # Update UI immediately - message = "Stop requested - the agent will halt at the next safe point" - logger.info(f"šŸ›‘ {message}") - - # Return UI updates - return ( # errors_output - gr.update(value="Stopping...", interactive=False), # stop_button - gr.update(interactive=False), # run_button - ) - except Exception as e: - error_msg = f"Error during stop: {str(e)}" - logger.error(error_msg) - return ( - gr.update(value="Stop", interactive=True), - gr.update(interactive=True) - ) - - async def run_browser_agent( agent_type, llm_provider, @@ -202,16 +105,6 @@ async def run_browser_agent( if save_recording_path: os.makedirs(save_recording_path, exist_ok=True) - # Get the list of existing videos before the agent runs - existing_videos = set() - if save_recording_path: - existing_videos = set( - glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) - + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]")) - ) - - task = resolve_sensitive_env_variables(task) - # Run the agent llm = utils.get_llm_model( provider=llm_provider, From 3f4a7d9f5de931a69bec8c844dabfec7c8723dec Mon Sep 17 00:00:00 2001 From: vvincent1234 Date: Tue, 29 Apr 2025 00:01:03 +0800 Subject: 
[PATCH 11/35] fix bu agent --- src/webui/components/browser_use_agent_tab.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/webui/components/browser_use_agent_tab.py b/src/webui/components/browser_use_agent_tab.py index 8a122b9..6f7d314 100644 --- a/src/webui/components/browser_use_agent_tab.py +++ b/src/webui/components/browser_use_agent_tab.py @@ -657,7 +657,7 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon final_update.update({ user_input_comp: gr.update(value="", interactive=True, placeholder="Enter your next task..."), run_button_comp: gr.update(value="ā–¶ļø Submit Task", interactive=True), - stop_button_comp: gr.update(interactive=False), + stop_button_comp: gr.update(value="ā¹ļø Stop", interactive=False), pause_resume_button_comp: gr.update(value="āøļø Pause", interactive=False), clear_button_comp: gr.update(interactive=True), # Ensure final chat history is shown @@ -672,7 +672,7 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon yield { user_input_comp: gr.update(interactive=True, placeholder="Error during setup. 
Enter task..."), run_button_comp: gr.update(value="ā–¶ļø Submit Task", interactive=True), - stop_button_comp: gr.update(interactive=False), + stop_button_comp: gr.update(value="ā¹ļø Stop", interactive=False), pause_resume_button_comp: gr.update(value="āøļø Pause", interactive=False), clear_button_comp: gr.update(interactive=True), chatbot_comp: gr.update( @@ -771,13 +771,13 @@ async def handle_clear(webui_manager: WebuiManager): if task and not task.done(): logger.info("Clearing requires stopping the current task.") webui_manager.bu_agent.stop() + task.cancel() try: await asyncio.wait_for(task, timeout=2.0) # Wait briefly except (asyncio.CancelledError, asyncio.TimeoutError): pass except Exception as e: logger.warning(f"Error stopping task on clear: {e}") - webui_manager.bu_current_task.cancel() webui_manager.bu_current_task = None if webui_manager.bu_controller: @@ -839,10 +839,10 @@ def create_browser_use_agent_tab(webui_manager: WebuiManager): elem_id="user_input" ) with gr.Row(): - stop_button = gr.Button("ā¹ļø Stop", interactive=False, variant="stop", scale=1) - pause_resume_button = gr.Button("āøļø Pause", interactive=False, variant="secondary", scale=1) - clear_button = gr.Button("šŸ—‘ļø Clear", interactive=True, variant="secondary", scale=1) - run_button = gr.Button("ā–¶ļø Submit Task", variant="primary", scale=2) + stop_button = gr.Button("ā¹ļø Stop", interactive=False, variant="stop", scale=2) + pause_resume_button = gr.Button("āøļø Pause", interactive=False, variant="secondary", scale=2, visible=False) + clear_button = gr.Button("šŸ—‘ļø Clear", interactive=True, variant="secondary", scale=2) + run_button = gr.Button("ā–¶ļø Submit Task", variant="primary", scale=3) browser_view = gr.HTML( value="

Browser View (Requires Headless=True)

", From 47b5b55b0d9164740b5109153f7c718c5cef4ee1 Mon Sep 17 00:00:00 2001 From: vvincent1234 Date: Tue, 29 Apr 2025 09:23:16 +0800 Subject: [PATCH 12/35] opt browser --- src/agent/browser_use/browser_use_agent.py | 178 ++++++++++++++++++ src/browser/custom_browser.py | 23 +-- src/browser/custom_context.py | 9 +- src/webui/components/browser_settings_tab.py | 28 ++- src/webui/components/browser_use_agent_tab.py | 61 +----- src/webui/interface.py | 2 +- 6 files changed, 234 insertions(+), 67 deletions(-) create mode 100644 src/agent/browser_use/browser_use_agent.py diff --git a/src/agent/browser_use/browser_use_agent.py b/src/agent/browser_use/browser_use_agent.py new file mode 100644 index 0000000..a38211e --- /dev/null +++ b/src/agent/browser_use/browser_use_agent.py @@ -0,0 +1,178 @@ +from __future__ import annotations + +import asyncio +import gc +import inspect +import json +import logging +import os +import re +import time +from pathlib import Path +from typing import Any, Awaitable, Callable, Dict, Generic, List, Optional, TypeVar, Union + +from dotenv import load_dotenv +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.messages import ( + BaseMessage, + HumanMessage, + SystemMessage, +) + +# from lmnr.sdk.decorators import observe +from pydantic import BaseModel, ValidationError + +from browser_use.agent.gif import create_history_gif +from browser_use.agent.memory.service import Memory, MemorySettings +from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings +from browser_use.agent.message_manager.utils import convert_input_messages, extract_json_from_model_output, save_conversation +from browser_use.agent.prompts import AgentMessagePrompt, PlannerPrompt, SystemPrompt +from browser_use.agent.views import ( + REQUIRED_LLM_API_ENV_VARS, + ActionResult, + AgentError, + AgentHistory, + AgentHistoryList, + AgentOutput, + AgentSettings, + AgentState, + AgentStepInfo, + StepMetadata, + 
ToolCallingMethod, +) +from browser_use.browser.browser import Browser +from browser_use.browser.context import BrowserContext +from browser_use.browser.views import BrowserState, BrowserStateHistory +from browser_use.controller.registry.views import ActionModel +from browser_use.controller.service import Controller +from browser_use.dom.history_tree_processor.service import ( + DOMHistoryElement, + HistoryTreeProcessor, +) +from browser_use.exceptions import LLMException +from browser_use.telemetry.service import ProductTelemetry +from browser_use.telemetry.views import ( + AgentEndTelemetryEvent, + AgentRunTelemetryEvent, + AgentStepTelemetryEvent, +) +from browser_use.utils import check_env_variables, time_execution_async, time_execution_sync +from browser_use.agent.service import Agent, AgentHookFunc + +load_dotenv() +logger = logging.getLogger(__name__) + +SKIP_LLM_API_KEY_VERIFICATION = os.environ.get('SKIP_LLM_API_KEY_VERIFICATION', 'false').lower()[0] in 'ty1' + + +class BrowserUseAgent(Agent): + @time_execution_async('--run (agent)') + async def run( + self, max_steps: int = 100, on_step_start: AgentHookFunc | None = None, + on_step_end: AgentHookFunc | None = None + ) -> AgentHistoryList: + """Execute the task with maximum number of steps""" + + loop = asyncio.get_event_loop() + + # Set up the Ctrl+C signal handler with callbacks specific to this agent + from browser_use.utils import SignalHandler + + signal_handler = SignalHandler( + loop=loop, + pause_callback=self.pause, + resume_callback=self.resume, + custom_exit_callback=None, # No special cleanup needed on forced exit + exit_on_second_int=True, + ) + signal_handler.register() + + # Wait for verification task to complete if it exists + if hasattr(self, '_verification_task') and not self._verification_task.done(): + try: + await self._verification_task + except Exception: + # Error already logged in the task + pass + + try: + self._log_agent_run() + + # Execute initial actions if provided + if 
self.initial_actions: + result = await self.multi_act(self.initial_actions, check_for_new_elements=False) + self.state.last_result = result + + for step in range(max_steps): + # Check if waiting for user input after Ctrl+C + while self.state.paused: + await asyncio.sleep(0.5) + if self.state.stopped: + break + + # Check if we should stop due to too many failures + if self.state.consecutive_failures >= self.settings.max_failures: + logger.error(f'āŒ Stopping due to {self.settings.max_failures} consecutive failures') + break + + # Check control flags before each step + if self.state.stopped: + logger.info('Agent stopped') + break + + while self.state.paused: + await asyncio.sleep(0.2) # Small delay to prevent CPU spinning + if self.state.stopped: # Allow stopping while paused + break + + if on_step_start is not None: + await on_step_start(self) + + step_info = AgentStepInfo(step_number=step, max_steps=max_steps) + await self.step(step_info) + + if on_step_end is not None: + await on_step_end(self) + + if self.state.history.is_done(): + if self.settings.validate_output and step < max_steps - 1: + if not await self._validate_output(): + continue + + await self.log_completion() + break + else: + logger.info('āŒ Failed to complete task in maximum steps') + + return self.state.history + + except KeyboardInterrupt: + # Already handled by our signal handler, but catch any direct KeyboardInterrupt as well + logger.info('Got KeyboardInterrupt during execution, returning current history') + return self.state.history + + finally: + # Unregister signal handlers before cleanup + signal_handler.unregister() + + self.telemetry.capture( + AgentEndTelemetryEvent( + agent_id=self.state.agent_id, + is_done=self.state.history.is_done(), + success=self.state.history.is_successful(), + steps=self.state.n_steps, + max_steps_reached=self.state.n_steps >= max_steps, + errors=self.state.history.errors(), + total_input_tokens=self.state.history.total_input_tokens(), + 
total_duration_seconds=self.state.history.total_duration_seconds(), + ) + ) + + await self.close() + + if self.settings.generate_gif: + output_path: str = 'agent_history.gif' + if isinstance(self.settings.generate_gif, str): + output_path = self.settings.generate_gif + + create_history_gif(task=self.task, history=self.state.history, output_path=output_path) \ No newline at end of file diff --git a/src/browser/custom_browser.py b/src/browser/custom_browser.py index a1c057b..6db980f 100644 --- a/src/browser/custom_browser.py +++ b/src/browser/custom_browser.py @@ -15,29 +15,30 @@ from playwright.async_api import BrowserContext as PlaywrightBrowserContext import logging from browser_use.browser.chrome import ( - CHROME_ARGS, - CHROME_DETERMINISTIC_RENDERING_ARGS, - CHROME_DISABLE_SECURITY_ARGS, - CHROME_DOCKER_ARGS, - CHROME_HEADLESS_ARGS, + CHROME_ARGS, + CHROME_DETERMINISTIC_RENDERING_ARGS, + CHROME_DISABLE_SECURITY_ARGS, + CHROME_DOCKER_ARGS, + CHROME_HEADLESS_ARGS, ) from browser_use.browser.context import BrowserContext, BrowserContextConfig from browser_use.browser.utils.screen_resolution import get_screen_resolution, get_window_adjustments from browser_use.utils import time_execution_async import socket -from .custom_context import CustomBrowserContext +from .custom_context import CustomBrowserContext, CustomBrowserContextConfig logger = logging.getLogger(__name__) class CustomBrowser(Browser): - async def new_context( - self, - config: BrowserContextConfig = BrowserContextConfig() - ) -> CustomBrowserContext: - return CustomBrowserContext(config=config, browser=self) + async def new_context(self, config: CustomBrowserContextConfig | None = None) -> CustomBrowserContext: + """Create a browser context""" + browser_config = self.config.model_dump() if self.config else {} + context_config = config.model_dump() if config else {} + merged_config = {**browser_config, **context_config} + return CustomBrowserContext(config=CustomBrowserContextConfig(**merged_config), 
browser=self) async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrowser: """Sets up and returns a Playwright Browser instance with anti-detection measures.""" diff --git a/src/browser/custom_context.py b/src/browser/custom_context.py index 4dc2423..43a67a8 100644 --- a/src/browser/custom_context.py +++ b/src/browser/custom_context.py @@ -6,6 +6,8 @@ from browser_use.browser.browser import Browser, IN_DOCKER from browser_use.browser.context import BrowserContext, BrowserContextConfig from playwright.async_api import Browser as PlaywrightBrowser from playwright.async_api import BrowserContext as PlaywrightBrowserContext +from typing import Optional +from browser_use.browser.context import BrowserContextState logger = logging.getLogger(__name__) @@ -17,10 +19,11 @@ class CustomBrowserContextConfig(BrowserContextConfig): class CustomBrowserContext(BrowserContext): def __init__( self, - browser: "Browser", - config: CustomBrowserContextConfig = CustomBrowserContextConfig(), + browser: 'Browser', + config: BrowserContextConfig | None = None, + state: Optional[BrowserContextState] = None, ): - super(CustomBrowserContext, self).__init__(browser=browser, config=config) + super(CustomBrowserContext, self).__init__(browser=browser, config=config, state=state) async def _create_context(self, browser: PlaywrightBrowser): """Creates a new browser context with anti-detection measures and loads cookies if available.""" diff --git a/src/webui/components/browser_settings_tab.py b/src/webui/components/browser_settings_tab.py index 0d3bcbb..90e6fa6 100644 --- a/src/webui/components/browser_settings_tab.py +++ b/src/webui/components/browser_settings_tab.py @@ -1,11 +1,28 @@ import gradio as gr +import logging from gradio.components import Component from src.webui.webui_manager import WebuiManager from src.utils import config +logger = logging.getLogger(__name__) -def create_browser_settings_tab(webui_manager: WebuiManager) -> dict[str, Component]: +async def 
close_browser(webui_manager: WebuiManager): + """ + Close browser + """ + if webui_manager.bu_current_task and not webui_manager.bu_current_task.done(): + webui_manager.bu_current_task.cancel() + webui_manager.bu_current_task = None + if webui_manager.bu_browser: + await webui_manager.bu_browser.close() + webui_manager.bu_browser = None + if webui_manager.bu_browser_context: + await webui_manager.bu_browser_context.close() + webui_manager.bu_browser_context = None + + +def create_browser_settings_tab(webui_manager: WebuiManager): """ Creates a browser settings tab. """ @@ -125,3 +142,12 @@ def create_browser_settings_tab(webui_manager: WebuiManager) -> dict[str, Compon ) ) webui_manager.add_components("browser_settings", tab_components) + + async def close_wrapper(): + """Wrapper for handle_clear.""" + await close_browser(webui_manager) + + headless.change(close_wrapper) + keep_browser_open.change(close_wrapper) + disable_security.change(close_wrapper) + use_own_browser.change(close_wrapper) diff --git a/src/webui/components/browser_use_agent_tab.py b/src/webui/components/browser_use_agent_tab.py index 6f7d314..88f571d 100644 --- a/src/webui/components/browser_use_agent_tab.py +++ b/src/webui/components/browser_use_agent_tab.py @@ -12,7 +12,7 @@ from langchain_core.language_models.chat_models import BaseChatModel import base64 from browser_use.browser.browser import Browser, BrowserConfig from browser_use.browser.context import BrowserContext, BrowserContextConfig, BrowserContextWindowSize -from browser_use.agent.service import Agent +# from browser_use.agent.service import Agent from browser_use.agent.views import AgentHistoryList from browser_use.agent.views import ToolCallingMethod # Adjust import from browser_use.agent.views import ( @@ -37,6 +37,7 @@ from src.controller.custom_controller import CustomController from src.utils import llm_provider from src.browser.custom_browser import CustomBrowser from src.browser.custom_context import CustomBrowserContext, 
CustomBrowserContextConfig +from src.agent.browser_use.browser_use_agent import BrowserUseAgent logger = logging.getLogger(__name__) @@ -148,7 +149,7 @@ async def _handle_new_step(webui_manager: WebuiManager, state: BrowserState, out # Basic validation: check if it looks like base64 if isinstance(screenshot_data, str) and len(screenshot_data) > 100: # Arbitrary length check # *** UPDATED STYLE: Removed centering, adjusted width *** - img_tag = f'Step {step_num} Screenshot' + img_tag = f'Step {step_num} Screenshot' screenshot_html = img_tag + "
" # Use
for line break after inline-block image else: logger.warning( @@ -234,44 +235,6 @@ async def _ask_assistant_callback(webui_manager: WebuiManager, query: str, brows return {"response": response} -async def capture_screenshot(browser_context): - """Capture and encode a screenshot""" - # Extract the Playwright browser instance - playwright_browser = browser_context.browser.playwright_browser # Ensure this is correct. - - # Check if the browser instance is valid and if an existing context can be reused - if playwright_browser and playwright_browser.contexts: - playwright_context = playwright_browser.contexts[0] - else: - return None - - # Access pages in the context - pages = None - if playwright_context: - pages = playwright_context.pages - - # Use an existing page or create a new one if none exist - if pages: - active_page = pages[0] - for page in pages: - if page.url != "about:blank": - active_page = page - else: - return None - - # Take screenshot - try: - screenshot = await active_page.screenshot( - type='jpeg', - quality=75, - scale="css" - ) - encoded = base64.b64encode(screenshot).decode('utf-8') - return encoded - except Exception as e: - return None - - # --- Core Agent Execution Logic --- (Needs access to webui_manager) async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]) -> AsyncGenerator[ @@ -372,8 +335,8 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon save_agent_history_path = get_browser_setting("save_agent_history_path", "./tmp/agent_history") save_download_path = get_browser_setting("save_download_path", "./tmp/downloads") - stream_vw = 80 - stream_vh = int(80 * window_h // window_w) + stream_vw = 70 + stream_vh = int(70 * window_h // window_w) os.makedirs(save_agent_history_path, exist_ok=True) if save_recording_path: os.makedirs(save_recording_path, exist_ok=True) @@ -470,7 +433,7 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon if 
not webui_manager.bu_browser or not webui_manager.bu_browser_context: raise ValueError("Browser or Context not initialized, cannot create agent.") - webui_manager.bu_agent = Agent( + webui_manager.bu_agent = BrowserUseAgent( task=task, llm=main_llm, browser=webui_manager.bu_browser, @@ -478,7 +441,6 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon controller=webui_manager.bu_controller, register_new_step_callback=step_callback_wrapper, register_done_callback=done_callback_wrapper, - # Agent settings use_vision=use_vision, override_system_message=override_system_prompt, extend_system_message=extend_system_prompt, @@ -486,8 +448,7 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon max_actions_per_step=max_actions, tool_calling_method=tool_calling_method, planner_llm=planner_llm, - use_vision_for_planner=planner_use_vision if planner_llm else False, - save_conversation_path=history_file, + use_vision_for_planner=planner_use_vision if planner_llm else False ) webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id webui_manager.bu_agent.settings.generate_gif = gif_path @@ -510,8 +471,7 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon if is_paused: yield { pause_resume_button_comp: gr.update(value="ā–¶ļø Resume", interactive=True), - run_button_comp: gr.update(value="āøļø Paused", interactive=False), - stop_button_comp: gr.update(interactive=True), # Allow stop while paused + stop_button_comp: gr.update(interactive=True), } # Wait until pause is released or task is stopped/done while is_paused and not agent_task.done(): @@ -580,7 +540,7 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon # Update Browser View if headless and webui_manager.bu_browser_context: try: - screenshot_b64 = await capture_screenshot(webui_manager.bu_browser_context) + screenshot_b64 = await webui_manager.bu_browser_context.take_screenshot() if 
screenshot_b64: html_content = f'' update_dict[browser_view_comp] = gr.update(value=html_content, visible=True) @@ -840,7 +800,7 @@ def create_browser_use_agent_tab(webui_manager: WebuiManager): ) with gr.Row(): stop_button = gr.Button("ā¹ļø Stop", interactive=False, variant="stop", scale=2) - pause_resume_button = gr.Button("āøļø Pause", interactive=False, variant="secondary", scale=2, visible=False) + pause_resume_button = gr.Button("āøļø Pause", interactive=False, variant="secondary", scale=2, visible=True) clear_button = gr.Button("šŸ—‘ļø Clear", interactive=True, variant="secondary", scale=2) run_button = gr.Button("ā–¶ļø Submit Task", variant="primary", scale=3) @@ -918,4 +878,3 @@ def create_browser_use_agent_tab(webui_manager: WebuiManager): inputs=None, outputs=run_tab_outputs ) - diff --git a/src/webui/interface.py b/src/webui/interface.py index ba99245..083649e 100644 --- a/src/webui/interface.py +++ b/src/webui/interface.py @@ -78,7 +78,7 @@ def create_ui(theme_name="Ocean"): with gr.TabItem("šŸ¤– Run Agent"): create_browser_use_agent_tab(ui_manager) - with gr.TabItem("šŸŽ Agent Collections"): + with gr.TabItem("šŸŽ Agent Marketplace"): gr.Markdown( """ ### Agents built on Browser-Use From dad8fc990a78e1bcd5f75307f81df276e92cfc7f Mon Sep 17 00:00:00 2001 From: vincent Date: Tue, 29 Apr 2025 22:02:51 +0800 Subject: [PATCH 13/35] add deep research agent --- requirements.txt | 4 +- .../deep_research/deep_research_agent.py | 1222 ++++++++++++----- src/webui/components/agent_settings_tab.py | 12 +- src/webui/components/browser_settings_tab.py | 19 +- src/webui/components/browser_use_agent_tab.py | 11 +- .../components/deep_research_agent_tab.py | 2 +- src/webui/components/load_save_config_tab.py | 4 +- tests/test_agents.py | 67 +- tests/test_controller.py | 1 + webui2.py | 1095 --------------- 10 files changed, 960 insertions(+), 1477 deletions(-) delete mode 100644 webui2.py diff --git a/requirements.txt b/requirements.txt index 462f010..6c44d12 
100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,6 @@ json-repair langchain-mistralai==0.2.4 MainContentExtractor==0.0.4 langchain-ibm==0.3.10 -langchain_mcp_adapters==0.0.9 \ No newline at end of file +langchain_mcp_adapters==0.0.9 +langgraph==0.3.34 +langchain-community==0.3.23 \ No newline at end of file diff --git a/src/agent/deep_research/deep_research_agent.py b/src/agent/deep_research/deep_research_agent.py index d96125b..6863f47 100644 --- a/src/agent/deep_research/deep_research_agent.py +++ b/src/agent/deep_research/deep_research_agent.py @@ -1,386 +1,886 @@ -import pdb - -from dotenv import load_dotenv - -load_dotenv() import asyncio -import os -import sys -import logging -from pprint import pprint -from uuid import uuid4 -from src.utils import utils import json -import re -from browser_use.agent.service import Agent -from browser_use.browser.browser import BrowserConfig, Browser -from browser_use.agent.views import ActionResult -from browser_use.browser.context import BrowserContext -from browser_use.controller.service import Controller, DoneAction -from main_content_extractor import MainContentExtractor -from langchain_core.messages import ( - AIMessage, - BaseMessage, - HumanMessage, - ToolMessage, - SystemMessage -) -from json_repair import repair_json +import logging +import os +import uuid +from pathlib import Path +from typing import List, Dict, Any, TypedDict, Optional, Sequence, Annotated +from concurrent.futures import ThreadPoolExecutor, as_completed +import threading + +# Langchain imports +from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, ToolMessage, SystemMessage +from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder +from langchain_core.tools import Tool, StructuredTool +from langchain.agents import AgentExecutor # We might use parts, but Langgraph is primary +from langchain_community.tools.file_management import WriteFileTool, ReadFileTool, CopyFileTool, ListDirectoryTool, \ + 
MoveFileTool, FileSearchTool +from langchain_openai import ChatOpenAI # Replace with your actual LLM import + +from browser_use.browser.browser import BrowserConfig +from browser_use.browser.context import BrowserContextWindowSize + +# Langgraph imports +from langgraph.graph import StateGraph, END from src.controller.custom_controller import CustomController +from src.utils import llm_provider from src.browser.custom_browser import CustomBrowser -from src.browser.custom_context import BrowserContextConfig, BrowserContext -from browser_use.browser.context import ( - BrowserContextConfig, - BrowserContextWindowSize, -) -from browser_use.agent.service import Agent +from src.browser.custom_context import CustomBrowserContext, CustomBrowserContextConfig +from src.agent.browser_use.browser_use_agent import BrowserUseAgent +from src.utils.mcp_client import setup_mcp_client_and_tools logger = logging.getLogger(__name__) +# Constants +TMP_DIR = Path("./tmp/deep_research") +os.makedirs(TMP_DIR, exist_ok=True) +REPORT_FILENAME = "report.md" +PLAN_FILENAME = "research_plan.md" +SEARCH_INFO_FILENAME = "search_info.json" +MAX_PARALLEL_BROWSERS = 2 -async def deep_research(task, llm, agent_state=None, **kwargs): - task_id = str(uuid4()) - save_dir = kwargs.get("save_dir", os.path.join(f"./tmp/deep_research/{task_id}")) - logger.info(f"Save Deep Research at: {save_dir}") - os.makedirs(save_dir, exist_ok=True) +_AGENT_STOP_FLAGS = {} +_BROWSER_AGENT_INSTANCES = {} # To store running browser agents for stopping - # max qyery num per iteration - max_query_num = kwargs.get("max_query_num", 3) - use_own_browser = kwargs.get("use_own_browser", False) - extra_chromium_args = [] +async def run_single_browser_task( + task_query: str, + task_id: str, + llm: Any, # Pass the main LLM + browser_config: Dict[str, Any], + stop_event: threading.Event, + use_vision: bool = False, +) -> Dict[str, Any]: + """ + Runs a single BrowserUseAgent task. 
+ Manages browser creation and closing for this specific task. + """ + if not BrowserUseAgent: + return {"query": task_query, "error": "BrowserUseAgent components not available."} - if use_own_browser: - cdp_url = os.getenv("CHROME_CDP", kwargs.get("chrome_cdp", None)) - # TODO: if use own browser, max query num must be 1 per iter, how to solve it? - max_query_num = 1 - chrome_path = os.getenv("CHROME_PATH", None) - if chrome_path == "": - chrome_path = None - chrome_user_data = os.getenv("CHROME_USER_DATA", None) - if chrome_user_data: - extra_chromium_args += [f"--user-data-dir={chrome_user_data}"] + # --- Browser Setup --- + # These should ideally come from the main agent's config + headless = browser_config.get("headless", False) + window_w = browser_config.get("window_width", 1280) + window_h = browser_config.get("window_height", 1100) + browser_user_data_dir = browser_config.get("user_data_dir", None) + use_own_browser = browser_config.get("use_own_browser", False) + browser_binary_path = browser_config.get("browser_binary_path", None) + wss_url = browser_config.get("wss_url", None) + cdp_url = browser_config.get("cdp_url", None) + disable_security = browser_config.get("disable_security", False) - browser = CustomBrowser( + bu_browser = None + bu_browser_context = None + try: + logger.info(f"Starting browser task for query: {task_query}") + extra_args = [f"--window-size={window_w},{window_h}"] + if browser_user_data_dir: + extra_args.append(f"--user-data-dir={browser_user_data_dir}") + if use_own_browser: + browser_binary_path = os.getenv("CHROME_PATH", None) or browser_binary_path + if browser_binary_path == "": browser_binary_path = None + chrome_user_data = os.getenv("CHROME_USER_DATA", None) + if chrome_user_data: extra_args += [f"--user-data-dir={chrome_user_data}"] + else: + browser_binary_path = None + + bu_browser = CustomBrowser( config=BrowserConfig( - headless=kwargs.get("headless", False), + headless=headless, + disable_security=disable_security, 
+ browser_binary_path=browser_binary_path, + extra_browser_args=extra_args, + wss_url=wss_url, cdp_url=cdp_url, - disable_security=kwargs.get("disable_security", True), - chrome_instance_path=chrome_path, - extra_chromium_args=extra_chromium_args, ) ) - browser_context = await browser.new_context() - else: - browser = None - browser_context = None - controller = CustomController() - - @controller.registry.action( - 'Extract page content to get the pure markdown.', - ) - async def extract_content(browser: BrowserContext): - page = await browser.get_current_page() - # use jina reader - url = page.url - - jina_url = f"https://r.jina.ai/{url}" - await page.goto(jina_url) - output_format = 'markdown' - content = MainContentExtractor.extract( # type: ignore - html=await page.content(), - output_format=output_format, + context_config = CustomBrowserContextConfig( + save_downloads_path="./tmp/downloads", + browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h), + force_new_context=True ) - # go back to org url - await page.go_back() - msg = f'Extracted page content:\n{content}\n' - logger.info(msg) - return ActionResult(extracted_content=msg) + bu_browser_context = await bu_browser.new_context(config=context_config) - search_system_prompt = f""" - You are a **Deep Researcher**, an AI agent specializing in in-depth information gathering and research using a web browser with **automated execution capabilities**. Your expertise lies in formulating comprehensive research plans and executing them meticulously to fulfill complex user requests. You will analyze user instructions, devise a detailed research plan, and determine the necessary search queries to gather the required information. + # Simple controller example, replace with your actual implementation if needed + bu_controller = CustomController() - **Your Task:** - - Given a user's research topic, you will: - - 1. 
**Develop a Research Plan:** Outline the key aspects and subtopics that need to be investigated to thoroughly address the user's request. This plan should be a high-level overview of the research direction. - 2. **Generate Search Queries:** Based on your research plan, generate a list of specific search queries to be executed in a web browser. These queries should be designed to efficiently gather relevant information for each aspect of your plan. - - **Output Format:** - - Your output will be a JSON object with the following structure: - - ```json - {{ - "plan": "A concise, high-level research plan outlining the key areas to investigate.", - "queries": [ - "search query 1", - "search query 2", - //... up to a maximum of {max_query_num} search queries - ] - }} - ``` - - **Important:** - - * Limit your output to a **maximum of {max_query_num}** search queries. - * Make the search queries to help the automated agent find the needed information. Consider what keywords are most likely to lead to useful results. - * If you have gathered for all the information you want and no further search queries are required, output queries with an empty list: `[]` - * Make sure output search queries are different from the history queries. - - **Inputs:** - - 1. **User Instruction:** The original instruction given by the user. - 2. **Previous Queries:** History Queries. - 3. **Previous Search Results:** Textual data gathered from prior search queries. If there are no previous search results this string will be empty. - """ - search_messages = [SystemMessage(content=search_system_prompt)] - - record_system_prompt = """ - You are an expert information recorder. Your role is to process user instructions, current search results, and previously recorded information to extract, summarize, and record new, useful information that helps fulfill the user's request. 
Your output will be a JSON formatted list, where each element represents a piece of extracted information and follows the structure: `{"url": "source_url", "title": "source_title", "summary_content": "concise_summary", "thinking": "reasoning"}`. - -**Important Considerations:** - -1. **Minimize Information Loss:** While concise, prioritize retaining important details and nuances from the sources. Aim for a summary that captures the essence of the information without over-simplification. **Crucially, ensure to preserve key data and figures within the `summary_content`. This is essential for later stages, such as generating tables and reports.** - -2. **Avoid Redundancy:** Do not record information that is already present in the Previous Recorded Information. Check for semantic similarity, not just exact matches. However, if the same information is expressed differently in a new source and this variation adds valuable context or clarity, it should be included. - -3. **Source Information:** Extract and include the source title and URL for each piece of information summarized. This is crucial for verification and context. **The Current Search Results are provided in a specific format, where each item starts with "Title:", followed by the title, then "URL Source:", followed by the URL, and finally "Markdown Content:", followed by the content. Please extract the title and URL from this structure.** If a piece of information cannot be attributed to a specific source from the provided search results, use `"url": "unknown"` and `"title": "unknown"`. - -4. **Thinking and Report Structure:** For each extracted piece of information, add a `"thinking"` key. This field should contain your assessment of how this information could be used in a report, which section it might belong to (e.g., introduction, background, analysis, conclusion, specific subtopics), and any other relevant thoughts about its significance or connection to other information. 
- -**Output Format:** - -Provide your output as a JSON formatted list. Each item in the list must adhere to the following format: - -```json -[ - { - "url": "source_url_1", - "title": "source_title_1", - "summary_content": "Concise summary of content. Remember to include key data and figures here.", - "thinking": "This could be used in the introduction to set the context. It also relates to the section on the history of the topic." - }, - // ... more entries - { - "url": "unknown", - "title": "unknown", - "summary_content": "concise_summary_of_content_without_clear_source", - "thinking": "This might be useful background information, but I need to verify its accuracy. Could be used in the methodology section to explain how data was collected." - } -] -``` - -**Inputs:** - -1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful and how to structure your thinking. -2. **Previous Recorded Information:** Textual data gathered and recorded from previous searches and processing, represented as a single text string. -3. **Current Search Plan:** Research plan for current search. -4. **Current Search Query:** The current search query. -5. **Current Search Results:** Textual data gathered from the most recent search query. 
- """ - record_messages = [SystemMessage(content=record_system_prompt)] - - search_iteration = 0 - max_search_iterations = kwargs.get("max_search_iterations", 10) # Limit search iterations to prevent infinite loop - use_vision = kwargs.get("use_vision", False) - - history_query = [] - history_infos = [] - try: - while search_iteration < max_search_iterations: - search_iteration += 1 - logger.info(f"Start {search_iteration}th Search...") - history_query_ = json.dumps(history_query, indent=4) - history_infos_ = json.dumps(history_infos, indent=4) - query_prompt = f"This is search {search_iteration} of {max_search_iterations} maximum searches allowed.\n User Instruction:{task} \n Previous Queries:\n {history_query_} \n Previous Search Results:\n {history_infos_}\n" - search_messages.append(HumanMessage(content=query_prompt)) - ai_query_msg = llm.invoke(search_messages[:1] + search_messages[1:][-1:]) - search_messages.append(ai_query_msg) - if hasattr(ai_query_msg, "reasoning_content"): - logger.info("🤯 Start Search Deep Thinking: ") - logger.info(ai_query_msg.reasoning_content) - logger.info("🤯 End Search Deep Thinking") - ai_query_content = ai_query_msg.content.replace("```json", "").replace("```", "") - ai_query_content = repair_json(ai_query_content) - ai_query_content = json.loads(ai_query_content) - query_plan = ai_query_content["plan"] - logger.info(f"Current Iteration {search_iteration} Planing:") - logger.info(query_plan) - query_tasks = ai_query_content["queries"] - if not query_tasks: - break - else: - query_tasks = query_tasks[:max_query_num] - history_query.extend(query_tasks) - logger.info("Query tasks:") - logger.info(query_tasks) - - # 2. Perform Web Search and Auto exec - # Parallel BU agents - add_infos = "1. Please click on the most relevant link to get information and go deeper, instead of just staying on the search page. \n" \ - "2. 
When opening a PDF file, please remember to extract the content using extract_content instead of simply opening it for the user to view.\n" - if use_own_browser: - agent = Agent( - task=query_tasks[0], - llm=llm, - add_infos=add_infos, - browser=browser, - browser_context=browser_context, - use_vision=use_vision, - system_prompt_class=CustomSystemPrompt, - agent_prompt_class=CustomAgentMessagePrompt, - max_actions_per_step=5, - controller=controller - ) - agent_result = await agent.run(max_steps=kwargs.get("max_steps", 10)) - query_results = [agent_result] - # Manually close all tab - session = await browser_context.get_session() - pages = session.context.pages - await browser_context.create_new_tab() - for page_id, page in enumerate(pages): - await page.close() - - else: - agents = [Agent( - task=task, - llm=llm, - add_infos=add_infos, - browser=browser, - browser_context=browser_context, - use_vision=use_vision, - system_prompt_class=CustomSystemPrompt, - agent_prompt_class=CustomAgentMessagePrompt, - max_actions_per_step=5, - controller=controller, - ) for task in query_tasks] - query_results = await asyncio.gather( - *[agent.run(max_steps=kwargs.get("max_steps", 10)) for agent in agents]) - - if agent_state and agent_state.is_stop_requested(): - # Stop - break - # 3. 
Summarize Search Result - query_result_dir = os.path.join(save_dir, "query_results") - os.makedirs(query_result_dir, exist_ok=True) - for i in range(len(query_tasks)): - query_result = query_results[i].final_result() - if not query_result: - continue - querr_save_path = os.path.join(query_result_dir, f"{search_iteration}-{i}.md") - logger.info(f"save query: {query_tasks[i]} at {querr_save_path}") - with open(querr_save_path, "w", encoding="utf-8") as fw: - fw.write(f"Query: {query_tasks[i]}\n") - fw.write(query_result) - # split query result in case the content is too long - query_results_split = query_result.split("Extracted page content:") - for qi, query_result_ in enumerate(query_results_split): - if not query_result_: - continue - else: - # TODO: limit content lenght: 128k tokens, ~3 chars per token - query_result_ = query_result_[:128000 * 3] - history_infos_ = json.dumps(history_infos, indent=4) - record_prompt = f"User Instruction:{task}. \nPrevious Recorded Information:\n {history_infos_}\n Current Search Iteration: {search_iteration}\n Current Search Plan:\n{query_plan}\n Current Search Query:\n {query_tasks[i]}\n Current Search Results: {query_result_}\n " - record_messages.append(HumanMessage(content=record_prompt)) - ai_record_msg = llm.invoke(record_messages[:1] + record_messages[-1:]) - record_messages.append(ai_record_msg) - if hasattr(ai_record_msg, "reasoning_content"): - logger.info("🤯 Start Record Deep Thinking: ") - logger.info(ai_record_msg.reasoning_content) - logger.info("🤯 End Record Deep Thinking") - record_content = ai_record_msg.content - record_content = repair_json(record_content) - new_record_infos = json.loads(record_content) - history_infos.extend(new_record_infos) - if agent_state and agent_state.is_stop_requested(): - # Stop - break - - logger.info("\nFinish Searching, Start Generating Report...") - - # 5. 
Report Generation in Markdown (or JSON if you prefer) - return await generate_final_report(task, history_infos, save_dir, llm) - - except Exception as e: - logger.error(f"Deep research Error: {e}") - return await generate_final_report(task, history_infos, save_dir, llm, str(e)) - finally: - if browser: - await browser.close() - if browser_context: - await browser_context.close() - logger.info("Browser closed.") - - -async def generate_final_report(task, history_infos, save_dir, llm, error_msg=None): - """Generate report from collected information with error handling""" - try: - logger.info("\nAttempting to generate final report from collected data...") - - writer_system_prompt = """ - You are a **Deep Researcher** and a professional report writer tasked with creating polished, high-quality reports that fully meet the user's needs, based on the user's instructions and the relevant information provided. You will write the report using Markdown format, ensuring it is both informative and visually appealing. - -**Specific Instructions:** - -* **Structure for Impact:** The report must have a clear, logical, and impactful structure. Begin with a compelling introduction that immediately grabs the reader's attention. Develop well-structured body paragraphs that flow smoothly and logically, and conclude with a concise and memorable conclusion that summarizes key takeaways and leaves a lasting impression. -* **Engaging and Vivid Language:** Employ precise, vivid, and descriptive language to make the report captivating and enjoyable to read. Use stylistic techniques to enhance engagement. Tailor your tone, vocabulary, and writing style to perfectly suit the subject matter and the intended audience to maximize impact and readability. -* **Accuracy, Credibility, and Citations:** Ensure that all information presented is meticulously accurate, rigorously truthful, and robustly supported by the available data. 
**Cite sources exclusively using bracketed sequential numbers within the text (e.g., [1], [2], etc.). If no references are used, omit citations entirely.** These numbers must correspond to a numbered list of references at the end of the report. -* **Publication-Ready Formatting:** Adhere strictly to Markdown formatting for excellent readability and a clean, highly professional visual appearance. Pay close attention to formatting details like headings, lists, emphasis, and spacing to optimize the visual presentation and reader experience. The report should be ready for immediate publication upon completion, requiring minimal to no further editing for style or format. -* **Conciseness and Clarity (Unless Specified Otherwise):** When the user does not provide a specific length, prioritize concise and to-the-point writing, maximizing information density while maintaining clarity. -* **Data-Driven Comparisons with Tables:** **When appropriate and beneficial for enhancing clarity and impact, present data comparisons in well-structured Markdown tables. This is especially encouraged when dealing with numerical data or when a visual comparison can significantly improve the reader's understanding.** -* **Length Adherence:** When the user specifies a length constraint, meticulously stay within reasonable bounds of that specification, ensuring the content is appropriately scaled without sacrificing quality or completeness. -* **Comprehensive Instruction Following:** Pay meticulous attention to all details and nuances provided in the user instructions. Strive to fulfill every aspect of the user's request with the highest degree of accuracy and attention to detail, creating a report that not only meets but exceeds expectations for quality and professionalism. 
-* **Reference List Formatting:** The reference list at the end must be formatted as follows: - `[1] Title (URL, if available)` - **Each reference must be separated by a blank line to ensure proper spacing.** For example: - - ``` - [1] Title 1 (URL1, if available) - - [2] Title 2 (URL2, if available) - ``` - **Furthermore, ensure that the reference list is free of duplicates. Each unique source should be listed only once, regardless of how many times it is cited in the text.** -* **ABSOLUTE FINAL OUTPUT RESTRICTION:** **Your output must contain ONLY the finished, publication-ready Markdown report. Do not include ANY extraneous text, phrases, preambles, meta-commentary, or markdown code indicators (e.g., "```markdown```"). The report should begin directly with the title and introductory paragraph, and end directly after the conclusion and the reference list (if applicable).** **Your response will be deemed a failure if this instruction is not followed precisely.** - -**Inputs:** - -1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful and how to structure your thinking. -2. **Search Information:** Information gathered from the search queries. + # Construct the task prompt for BrowserUseAgent + # Instruct it to find specific info and return title/URL + bu_task_prompt = f""" + Research Task: {task_query} + Objective: Find relevant information answering the query. + Output Requirements: For each relevant piece of information found, please provide: + 1. A concise summary of the information. + 2. The title of the source page or document. + 3. The URL of the source. + Focus on accuracy and relevance. Avoid irrelevant details. 
""" - history_infos_ = json.dumps(history_infos, indent=4) - record_json_path = os.path.join(save_dir, "record_infos.json") - logger.info(f"save All recorded information at {record_json_path}") - with open(record_json_path, "w") as fw: - json.dump(history_infos, fw, indent=4) - report_prompt = f"User Instruction:{task} \n Search Information:\n {history_infos_}" - report_messages = [SystemMessage(content=writer_system_prompt), - HumanMessage(content=report_prompt)] # New context for report generation - ai_report_msg = llm.invoke(report_messages) - if hasattr(ai_report_msg, "reasoning_content"): - logger.info("🤯 Start Report Deep Thinking: ") - logger.info(ai_report_msg.reasoning_content) - logger.info("🤯 End Report Deep Thinking") - report_content = ai_report_msg.content - report_content = re.sub(r"^```\s*markdown\s*|^\s*```|```\s*$", "", report_content, flags=re.MULTILINE) - report_content = report_content.strip() + bu_agent_instance = BrowserUseAgent( + task=bu_task_prompt, + llm=llm, # Use the passed LLM + browser=bu_browser, + browser_context=bu_browser_context, + controller=bu_controller, + use_vision=use_vision, + ) - # Add error notification to the report - if error_msg: - report_content = f"## āš ļø Research Incomplete - Partial Results\n" \ - f"**The research process was interrupted by an error:** {error_msg}\n\n" \ - f"{report_content}" + # Store instance for potential stop() call + task_key = f"{task_id}_{uuid.uuid4()}" # Unique key for this run + _BROWSER_AGENT_INSTANCES[task_key] = bu_agent_instance - report_file_path = os.path.join(save_dir, "final_report.md") - with open(report_file_path, "w", encoding="utf-8") as f: - f.write(report_content) - logger.info(f"Save Report at: {report_file_path}") - return report_content, report_file_path + # --- Run with Stop Check --- + # BrowserUseAgent needs to internally check a stop signal or have a stop method. 
+ # We simulate checking before starting and assume `run` might be interruptible + # or have its own stop mechanism we can trigger via bu_agent_instance.stop(). + if stop_event.is_set(): + logger.info(f"Browser task for '{task_query}' cancelled before start.") + return {"query": task_query, "result": None, "status": "cancelled"} - except Exception as report_error: - logger.error(f"Failed to generate partial report: {report_error}") - return f"Error generating report: {str(report_error)}", None + # The run needs to be awaitable and ideally accept a stop signal or have a .stop() method + # result = await bu_agent_instance.run(max_steps=max_steps) # Add max_steps if applicable + # Let's assume a simplified run for now + logger.info(f"Running BrowserUseAgent for: {task_query}") + result = await bu_agent_instance.run() # Assuming run is the main method + logger.info(f"BrowserUseAgent finished for: {task_query}") + + final_data = result.final_result() + + if stop_event.is_set(): + logger.info(f"Browser task for '{task_query}' stopped during execution.") + return {"query": task_query, "result": final_data, "status": "stopped"} + else: + logger.info(f"Browser result for '{task_query}': {final_data}") + return {"query": task_query, "result": final_data, "status": "completed"} + + except Exception as e: + logger.error(f"Error during browser task for query '{task_query}': {e}", exc_info=True) + return {"query": task_query, "error": str(e), "status": "failed"} + finally: + if task_key in _BROWSER_AGENT_INSTANCES: + del _BROWSER_AGENT_INSTANCES[task_key] + if bu_browser_context: + try: + await bu_browser_context.close() + logger.info("Closed browser context.") + except Exception as e: + logger.error(f"Error closing browser context: {e}") + if bu_browser: + try: + await bu_browser.close() + logger.info("Closed browser.") + except Exception as e: + logger.error(f"Error closing browser: {e}") + + +async def browser_search_tool_func(queries: List[str], task_id: str, llm: Any, 
browser_config: Dict[str, Any], + stop_event: threading.Event): + """ + Tool function to run multiple browser searches in parallel (up to MAX_PARALLEL_BROWSERS). + """ + if not BrowserUseAgent: + return [{"query": q, "error": "BrowserUseAgent components not available."} for q in queries] + + results = [] + # Use asyncio.Semaphore to limit concurrent browser instances + semaphore = asyncio.Semaphore(MAX_PARALLEL_BROWSERS) + + async def task_wrapper(query): + async with semaphore: + if stop_event.is_set(): + logger.info(f"Skipping browser task due to stop signal: {query}") + return {"query": query, "result": None, "status": "cancelled"} + # Pass necessary configs and the stop event + return await run_single_browser_task(query, task_id, llm, browser_config, stop_event) + + tasks = [task_wrapper(query) for query in queries] + # Use asyncio.gather to run tasks concurrently + search_results = await asyncio.gather(*tasks, return_exceptions=True) + + # Process results, handling potential exceptions returned by gather + for result in search_results: + if isinstance(result, Exception): + # Log the exception, but maybe return a specific error structure + logger.error(f"Browser task gather caught exception: {result}") + # Find which query failed if possible (difficult with gather exceptions directly) + results.append({"query": "unknown", "error": str(result), "status": "failed"}) + else: + results.append(result) + + return results + + +# --- Langgraph State Definition --- + +class ResearchPlanItem(TypedDict): + step: int + task: str + status: str # "pending", "completed", "failed" + queries: Optional[List[str]] # Queries generated for this task + result_summary: Optional[str] # Optional brief summary after execution + + +class DeepResearchState(TypedDict): + task_id: str + topic: str + research_plan: List[ResearchPlanItem] + search_results: List[Dict[str, Any]] # Stores results from browser_search_tool_func + # messages: Sequence[BaseMessage] # History for ReAct-like steps 
within nodes + llm: Any # The LLM instance + tools: List[Tool] + output_dir: Path + browser_config: Dict[str, Any] + final_report: Optional[str] + current_step_index: int # To track progress through the plan + stop_requested: bool # Flag to signal termination + # Add other state variables as needed + error_message: Optional[str] # To store errors + + +# --- Langgraph Nodes --- + +def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]: + """Loads state from files if they exist.""" + state_updates = {} + plan_file = os.path.join(output_dir, task_id, PLAN_FILENAME) + search_file = os.path.join(output_dir, task_id, SEARCH_INFO_FILENAME) + + if os.path.exists(plan_file): + try: + with open(plan_file, 'r', encoding='utf-8') as f: + # Basic parsing, assumes markdown checklist format + plan = [] + step = 1 + for line in f: + line = line.strip() + if line.startswith(("[x]", "[ ]")): + status = "completed" if line.startswith("[x]") else "pending" + task = line[4:].strip() + plan.append( + ResearchPlanItem(step=step, task=task, status=status, queries=None, result_summary=None)) + step += 1 + state_updates['research_plan'] = plan + # Determine next step index based on loaded plan + next_step = next((i for i, item in enumerate(plan) if item['status'] == 'pending'), len(plan)) + state_updates['current_step_index'] = next_step + logger.info(f"Loaded research plan from {plan_file}, next step index: {next_step}") + except Exception as e: + logger.error(f"Failed to load or parse research plan {plan_file}: {e}") + state_updates['error_message'] = f"Failed to load research plan: {e}" + + if os.path.exists(search_file): + try: + with open(search_file, 'r', encoding='utf-8') as f: + state_updates['search_results'] = json.load(f) + logger.info(f"Loaded search results from {search_file}") + except Exception as e: + logger.error(f"Failed to load search results {search_file}: {e}") + state_updates['error_message'] = f"Failed to load search results: {e}" + # Decide if this 
is fatal or if we can continue without old results + + return state_updates + + +def _save_plan_to_md(plan: List[ResearchPlanItem], output_dir: str): + """Saves the research plan to a markdown checklist file.""" + plan_file = os.path.join(output_dir, PLAN_FILENAME) + try: + with open(plan_file, 'w', encoding='utf-8') as f: + f.write("# Research Plan\n\n") + for item in plan: + marker = "[x]" if item['status'] == 'completed' else "[ ]" + f.write(f"{marker} {item['task']}\n") + logger.info(f"Research plan saved to {plan_file}") + except Exception as e: + logger.error(f"Failed to save research plan to {plan_file}: {e}") + + +def _save_search_results_to_json(results: List[Dict[str, Any]], output_dir: str): + """Appends or overwrites search results to a JSON file.""" + search_file = os.path.join(output_dir, SEARCH_INFO_FILENAME) + try: + # Simple overwrite for now, could be append + with open(search_file, 'w', encoding='utf-8') as f: + json.dump(results, f, indent=2, ensure_ascii=False) + logger.info(f"Search results saved to {search_file}") + except Exception as e: + logger.error(f"Failed to save search results to {search_file}: {e}") + + +def _save_report_to_md(report: str, output_dir: Path): + """Saves the final report to a markdown file.""" + report_file = os.path.join(output_dir, REPORT_FILENAME) + try: + with open(report_file, 'w', encoding='utf-8') as f: + f.write(report) + logger.info(f"Final report saved to {report_file}") + except Exception as e: + logger.error(f"Failed to save final report to {report_file}: {e}") + + +async def planning_node(state: DeepResearchState) -> Dict[str, Any]: + """Generates the initial research plan or refines it if resuming.""" + logger.info("--- Entering Planning Node ---") + if state.get('stop_requested'): + logger.info("Stop requested, skipping planning.") + return {"stop_requested": True} + + llm = state['llm'] + topic = state['topic'] + existing_plan = state.get('research_plan') + existing_results = state.get('search_results') 
+ output_dir = state['output_dir'] + + if existing_plan and state.get('current_step_index', 0) > 0: + logger.info("Resuming with existing plan.") + # Maybe add logic here to let LLM review and potentially adjust the plan + # based on existing_results, but for now, we just use the loaded plan. + _save_plan_to_md(existing_plan, output_dir) # Ensure it's saved initially + return {"research_plan": existing_plan} # Return the loaded plan + + logger.info(f"Generating new research plan for topic: {topic}") + + prompt = ChatPromptTemplate.from_messages([ + ("system", """You are a meticulous research assistant. Your goal is to create a step-by-step research plan to thoroughly investigate a given topic. + The plan should consist of clear, actionable research tasks or questions. Each step should logically build towards a comprehensive understanding. + Format the output as a numbered list. Each item should represent a distinct research step or question. + Example: + 1. Define the core concepts and terminology related to [Topic]. + 2. Identify the key historical developments of [Topic]. + 3. Analyze the current state-of-the-art and recent advancements in [Topic]. + 4. Investigate the major challenges and limitations associated with [Topic]. + 5. Explore the future trends and potential applications of [Topic]. + 6. Summarize the findings and draw conclusions. + + Keep the plan focused and manageable. Aim for 5-10 detailed steps. 
+ """), + ("human", f"Generate a research plan for the topic: {topic}") + ]) + + try: + response = await llm.ainvoke(prompt.format_prompt(topic=topic).to_messages()) + plan_text = response.content + + # Parse the numbered list into the plan structure + new_plan: List[ResearchPlanItem] = [] + for i, line in enumerate(plan_text.strip().split('\n')): + line = line.strip() + if line and (line[0].isdigit() or line.startswith(("*", "-"))): + # Simple parsing: remove number/bullet and space + task_text = line.split('.', 1)[-1].strip() if line[0].isdigit() else line[1:].strip() + if task_text: + new_plan.append(ResearchPlanItem( + step=i + 1, + task=task_text, + status="pending", + queries=None, + result_summary=None + )) + + if not new_plan: + logger.error("LLM failed to generate a valid plan structure.") + return {"error_message": "Failed to generate research plan structure."} + + logger.info(f"Generated research plan with {len(new_plan)} steps.") + _save_plan_to_md(new_plan, output_dir) + + return { + "research_plan": new_plan, + "current_step_index": 0, # Start from the beginning + "search_results": [], # Initialize search results + } + + except Exception as e: + logger.error(f"Error during planning: {e}", exc_info=True) + return {"error_message": f"LLM Error during planning: {e}"} + + +async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: + """Executes the next step in the research plan using the browser tool.""" + logger.info("--- Entering Research Execution Node ---") + if state.get('stop_requested'): + logger.info("Stop requested, skipping research execution.") + return {"stop_requested": True} + + plan = state['research_plan'] + current_index = state['current_step_index'] + llm = state['llm'] + browser_config = state['browser_config'] + output_dir = state['output_dir'] + task_id = state['task_id'] + stop_event = _AGENT_STOP_FLAGS.get(task_id) + + if not plan or current_index >= len(plan): + logger.info("Research plan complete or empty.") + 
return {} # Signal to move to synthesis or end + + current_step = plan[current_index] + if current_step['status'] == 'completed': + logger.info(f"Step {current_step['step']} already completed, skipping.") + return {"current_step_index": current_index + 1} # Move to next step + + logger.info(f"Executing research step {current_step['step']}: {current_step['task']}") + + # 1. Generate Search Queries for the current task using LLM + query_gen_prompt = ChatPromptTemplate.from_messages([ + ("system", + f"You are an expert search query formulator. Given a research task, generate {MAX_PARALLEL_BROWSERS} distinct, effective search engine queries to find relevant information. Focus on diversity and different angles of the task. Output ONLY the queries, each on a new line."), + ("human", f"Research Task: {current_step['task']}\n\nGenerate search queries:") + ]) + + try: + response = await llm.ainvoke(query_gen_prompt.format_prompt().to_messages()) + queries = [q.strip() for q in response.content.strip().split('\n') if q.strip()] + if not queries: + logger.warning( + f"LLM did not generate any search queries for task: {current_step['task']}. Using task itself as query.") + queries = [current_step['task']] + else: + queries = queries[:MAX_PARALLEL_BROWSERS] # Limit to max parallel + logger.info(f"Generated queries: {queries}") + current_step['queries'] = queries # Store generated queries in the plan item + + except Exception as e: + logger.error(f"Failed to generate search queries: {e}. Using task as query.", exc_info=True) + queries = [current_step['task']] + current_step['queries'] = queries + + # 2. 
Execute Searches using the Browser Tool + try: + search_results_list = await browser_search_tool_func( + queries=queries, + task_id=task_id, + llm=llm, + browser_config=browser_config, + stop_event=stop_event + ) + + # Check for stop signal *after* search execution attempt + if stop_event and stop_event.is_set(): + logger.info("Stop requested during or after search execution.") + # Update plan partially if needed, or just signal stop + current_step['status'] = 'pending' # Mark as not completed due to stop + _save_plan_to_md(plan, output_dir) + # Save any partial results gathered before stop + current_search_results = state.get('search_results', []) + current_search_results.extend([r for r in search_results_list if r.get('status') != 'cancelled']) + _save_search_results_to_json(current_search_results, output_dir) + return {"stop_requested": True, "search_results": current_search_results, "research_plan": plan} + + # 3. Process Results and Update State + successful_results = [r for r in search_results_list if r.get('status') == 'completed' and r.get('result')] + failed_queries = [r['query'] for r in search_results_list if r.get('status') == 'failed'] + # Combine results with existing ones + all_search_results = state.get('search_results', []) + all_search_results.extend(search_results_list) # Add all results (incl. errors) + + if failed_queries: + logger.warning(f"Some queries failed: {failed_queries}") + # Optionally add logic to retry failed queries + + if successful_results: + # Optionally, summarize the findings for this step (could be another LLM call) + # current_step['result_summary'] = "Summary of findings..." 
+ current_step['status'] = 'completed' + logger.info(f"Step {current_step['step']} completed successfully.") + else: + # Decide how to handle steps with no successful results + logger.warning(f"Step {current_step['step']} completed but yielded no successful results.") + current_step['status'] = 'failed' # Or 'completed_no_results' + + # Update the plan file on disk + _save_plan_to_md(plan, output_dir) + # Update the search results file on disk + _save_search_results_to_json(all_search_results, output_dir) + + return { + "research_plan": plan, + "search_results": all_search_results, + "current_step_index": current_index + 1, + "error_message": None if not failed_queries else f"Failed queries: {failed_queries}" + } + + except Exception as e: + logger.error(f"Error during research execution for step {current_step['step']}: {e}", exc_info=True) + current_step['status'] = 'failed' + _save_plan_to_md(plan, output_dir) + return { + "research_plan": plan, + "current_step_index": current_index + 1, # Move to next step even if failed? Or retry? Let's move on. + "error_message": f"Execution Error on step {current_step['step']}: {e}" + } + + +async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: + """Synthesizes the final report from the collected search results.""" + logger.info("--- Entering Synthesis Node ---") + if state.get('stop_requested'): + logger.info("Stop requested, skipping synthesis.") + return {"stop_requested": True} + + llm = state['llm'] + topic = state['topic'] + search_results = state.get('search_results', []) + output_dir = state['output_dir'] + plan = state['research_plan'] # Include plan for context + + if not search_results: + logger.warning("No search results found to synthesize report.") + report = f"# Research Report: {topic}\n\nNo information was gathered during the research process." 
+ _save_report_to_md(report, output_dir) + return {"final_report": report} + + logger.info(f"Synthesizing report from {len(search_results)} collected search result entries.") + + # Prepare context for the LLM + # Format search results nicely, maybe group by query or original plan step + formatted_results = "" + references = {} + ref_count = 1 + for i, result_entry in enumerate(search_results): + query = result_entry.get('query', 'Unknown Query') + status = result_entry.get('status', 'unknown') + result_data = result_entry.get('result') # This should be the dict with summary, title, url + error = result_entry.get('error') + + if status == 'completed' and result_data: + summary = result_data + formatted_results += f"### Finding from Query: \"{query}\"\n" + formatted_results += f"- **Summary:**\n{summary}\n" + formatted_results += "---\n" + + elif status == 'failed': + formatted_results += f"### Failed Query: \"{query}\"\n" + formatted_results += f"- **Error:** {error}\n" + formatted_results += "---\n" + # Ignore cancelled/other statuses for the report content + + # Prepare the research plan context + plan_summary = "\nResearch Plan Followed:\n" + for item in plan: + marker = "[x]" if item['status'] == 'completed' else "[?]" if item['status'] == 'failed' else "[ ]" + plan_summary += f"{marker} {item['task']}\n" + + synthesis_prompt = ChatPromptTemplate.from_messages([ + ("system", """You are a professional researcher tasked with writing a comprehensive and well-structured report based on collected findings. + The report should address the research topic thoroughly, synthesizing the information gathered from various sources. + Structure the report logically: + 1. **Introduction:** Briefly introduce the topic and the report's scope (mentioning the research plan followed is good). + 2. **Main Body:** Discuss the key findings, organizing them thematically or according to the research plan steps. 
Analyze, compare, and contrast information from different sources where applicable. **Crucially, cite your sources using bracketed numbers [X] corresponding to the reference list.** + 3. **Conclusion:** Summarize the main points and offer concluding thoughts or potential areas for further research. + + Ensure the tone is objective, professional, and analytical. Base the report **strictly** on the provided findings. Do not add external knowledge. If findings are contradictory or incomplete, acknowledge this. + """), + ("human", f""" + **Research Topic:** {topic} + + {plan_summary} + + **Collected Findings:** + ``` + {formatted_results} + ``` + + ``` + + Please generate the final research report in Markdown format based **only** on the information above. Ensure all claims derived from the findings are properly cited using the format [Reference_ID]. + """) + ]) + + try: + response = await llm.ainvoke(synthesis_prompt.format_prompt( + topic=topic, + plan_summary=plan_summary, + formatted_results=formatted_results, + references=references + ).to_messages()) + final_report_md = response.content + + # Append the reference list automatically to the end of the generated markdown + if references: + report_references_section = "\n\n## References\n\n" + # Sort refs by ID for consistent output + sorted_refs = sorted(references.values(), key=lambda x: x['id']) + for ref in sorted_refs: + report_references_section += f"[{ref['id']}] {ref['title']} - {ref['url']}\n" + final_report_md += report_references_section + + logger.info("Successfully synthesized the final report.") + _save_report_to_md(final_report_md, output_dir) + return {"final_report": final_report_md} + + except Exception as e: + logger.error(f"Error during report synthesis: {e}", exc_info=True) + return {"error_message": f"LLM Error during synthesis: {e}"} + + +# --- Langgraph Edges and Conditional Logic --- + +def should_continue(state: DeepResearchState) -> str: + """Determines the next step based on the current 
state.""" + logger.info("--- Evaluating Condition: Should Continue? ---") + if state.get('stop_requested'): + logger.info("Stop requested, routing to END.") + return "end_run" # Go to a dedicated end node for cleanup if needed + if state.get('error_message'): + logger.warning(f"Error detected: {state['error_message']}. Routing to END.") + # Decide if errors should halt execution or if it should try to synthesize anyway + return "end_run" # Stop on error for now + + plan = state.get('research_plan') + current_index = state.get('current_step_index', 0) + + if not plan: + logger.warning("No research plan found, cannot continue execution. Routing to END.") + return "end_run" # Should not happen if planning node ran correctly + + # Check if there are pending steps in the plan + if current_index < len(plan): + logger.info( + f"Plan has pending steps (current index {current_index}/{len(plan)}). Routing to Research Execution.") + return "execute_research" + else: + logger.info("All plan steps processed. Routing to Synthesis.") + return "synthesize_report" + + +# --- DeepSearchAgent Class --- + +class DeepSearchAgent: + def __init__(self, llm: Any, browser_config: Dict[str, Any], mcp_server_config: Optional[Dict[str, Any]] = None): + """ + Initializes the DeepSearchAgent. + + Args: + llm: The Langchain compatible language model instance. + browser_config: Configuration dictionary for the BrowserUseAgent tool. + Example: {"headless": True, "window_width": 1280, ...} + mcp_server_config: Optional configuration for the MCP client. 
+ """ + self.llm = llm + self.browser_config = browser_config + self.mcp_server_config = mcp_server_config + self.mcp_client = None + self.graph = self._compile_graph() + self.current_task_id: Optional[str] = None + self.stop_event: Optional[threading.Event] = None + self.runner: Optional[asyncio.Task] = None # To hold the asyncio task for run + + async def _setup_tools(self) -> List[Tool]: + """Sets up the basic tools (File I/O) and optional MCP tools.""" + tools = [WriteFileTool(), ReadFileTool(), ListDirectoryTool(), CopyFileTool(), + MoveFileTool()] # Basic file operations + + # Add MCP tools if config is provided + if self.mcp_server_config: + try: + logger.info("Setting up MCP client and tools...") + self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config) + mcp_tools = self.mcp_client.get_tools() + logger.info(f"Loaded {len(mcp_tools)} MCP tools.") + tools.extend(mcp_tools) + except Exception as e: + logger.error(f"Failed to set up MCP tools: {e}", exc_info=True) + elif self.mcp_server_config: + logger.warning("MCP server config provided, but setup function unavailable.") + + return tools + + def _compile_graph(self) -> StateGraph: + """Compiles the Langgraph state machine.""" + workflow = StateGraph(DeepResearchState) + + # Add nodes + workflow.add_node("plan_research", planning_node) + workflow.add_node("execute_research", research_execution_node) + workflow.add_node("synthesize_report", synthesis_node) + workflow.add_node("end_run", lambda state: logger.info("--- Reached End Run Node ---") or {}) # Simple end node + + # Define edges + workflow.set_entry_point("plan_research") + + workflow.add_edge("plan_research", "execute_research") # Always execute after planning + + # Conditional edge after execution + workflow.add_conditional_edges( + "execute_research", + should_continue, + { + "execute_research": "execute_research", # Loop back if more steps + "synthesize_report": "synthesize_report", # Move to synthesis if done + "end_run": 
"end_run" # End if stop requested or error + } + ) + + workflow.add_edge("synthesize_report", "end_run") # End after synthesis + + app = workflow.compile() + return app + + async def run(self, topic: str, task_id: Optional[str] = None) -> Dict[str, Any]: + """ + Starts the deep research process (Async Generator Version). + + Args: + topic: The research topic. + task_id: Optional existing task ID to resume. If None, a new ID is generated. + + Yields: + Intermediate state updates or messages during execution. + """ + if self.runner and not self.runner.done(): + logger.warning("Agent is already running. Please stop the current task first.") + # Return an error status instead of yielding + return {"status": "error", "message": "Agent already running.", "task_id": self.current_task_id} + + self.current_task_id = task_id if task_id else str(uuid.uuid4()) + output_dir = os.path.join(TMP_DIR, self.current_task_id) + os.makedirs(output_dir, exist_ok=True) + + logger.info(f"[AsyncGen] Starting research task ID: {self.current_task_id} for topic: '{topic}'") + logger.info(f"[AsyncGen] Output directory: {output_dir}") + + self.stop_event = threading.Event() + _AGENT_STOP_FLAGS[self.current_task_id] = self.stop_event + agent_tools = await self._setup_tools() + initial_state: DeepResearchState = { + "task_id": self.current_task_id, + "topic": topic, + "research_plan": [], + "search_results": [], + "llm": self.llm, + "tools": agent_tools, + "output_dir": output_dir, + "browser_config": self.browser_config, + "final_report": None, + "current_step_index": 0, + "stop_requested": False, + "error_message": None, + } + + loaded_state = {} + if task_id: + logger.info(f"Attempting to resume task {task_id}...") + loaded_state = _load_previous_state(task_id, output_dir) + initial_state.update(loaded_state) + if loaded_state.get("research_plan"): + logger.info( + f"Resuming with {len(loaded_state['research_plan'])} plan steps and {len(loaded_state.get('search_results', []))} existing 
results.") + initial_state[ + "topic"] = topic # Allow overriding topic even when resuming? Or use stored topic? Let's use new one. + else: + logger.warning(f"Resume requested for {task_id}, but no previous plan found. Starting fresh.") + initial_state["current_step_index"] = 0 + + # --- Execute Graph using ainvoke --- + final_state = None + status = "unknown" + message = None + try: + logger.info(f"Invoking graph execution for task {self.current_task_id}...") + self.runner = asyncio.create_task(self.graph.ainvoke(initial_state)) + final_state = await self.runner + logger.info(f"Graph execution finished for task {self.current_task_id}.") + + # Determine status based on final state + if self.stop_event and self.stop_event.is_set(): + status = "stopped" + message = "Research process was stopped by request." + logger.info(message) + elif final_state and final_state.get("error_message"): + status = "error" + message = final_state["error_message"] + logger.error(f"Graph execution completed with error: {message}") + elif final_state and final_state.get("final_report"): + status = "completed" + message = "Research process completed successfully." + logger.info(message) + else: + # If it ends without error/report (e.g., empty plan, stopped before synthesis) + status = "finished_incomplete" + message = "Research process finished, but may be incomplete (no final report generated)." + logger.warning(message) + + except asyncio.CancelledError: + status = "cancelled" + message = f"Agent run task cancelled for {self.current_task_id}." 
+ logger.info(message) + # final_state will remain None or the state before cancellation if checkpointing was used + except Exception as e: + status = "error" + message = f"Unhandled error during graph execution for {self.current_task_id}: {e}" + logger.error(message, exc_info=True) + # final_state will remain None or the state before the error + finally: + logger.info(f"Cleaning up resources for task {self.current_task_id}") + task_id_to_clean = self.current_task_id # Store before potentially clearing + if task_id_to_clean in _AGENT_STOP_FLAGS: + del _AGENT_STOP_FLAGS[task_id_to_clean] + # Stop any potentially lingering browser agents for this task + await self._stop_lingering_browsers(task_id_to_clean) + # Ensure the instance tracker is clean (should be handled by tool's finally block) + lingering_keys = [k for k in _BROWSER_AGENT_INSTANCES if k.startswith(f"{task_id_to_clean}_")] + if lingering_keys: + logger.warning( + f"{len(lingering_keys)} lingering browser instances found in tracker for task {task_id_to_clean} after cleanup attempt.") + # Force clear them from the tracker dict + for key in lingering_keys: + del _BROWSER_AGENT_INSTANCES[key] + + self.stop_event = None + self.current_task_id = None + self.runner = None # Mark runner as finished + if self.mcp_client: + await self.mcp_client.__aexit__(None, None, None) + + # Return a result dictionary including the status and the final state if available + return { + "status": status, + "message": message, + "task_id": task_id_to_clean, # Use the stored task_id + "final_state": final_state if final_state else {} # Return the final state dict + } + + async def _stop_lingering_browsers(self, task_id): + """Attempts to stop any BrowserUseAgent instances associated with the task_id.""" + keys_to_stop = [key for key in _BROWSER_AGENT_INSTANCES if key.startswith(f"{task_id}_")] + if not keys_to_stop: + return + + logger.warning( + f"Found {len(keys_to_stop)} potentially lingering browser agents for task {task_id}. 
Attempting stop...") + for key in keys_to_stop: + agent_instance = _BROWSER_AGENT_INSTANCES.get(key) + if agent_instance and hasattr(agent_instance, 'stop'): + try: + # Assuming BU agent has an async stop method + await agent_instance.stop() + logger.info(f"Called stop() on browser agent instance {key}") + except Exception as e: + logger.error(f"Error calling stop() on browser agent instance {key}: {e}") + # Instance should be removed by the finally block in run_single_browser_task + # but we ensure removal here too. + if key in _BROWSER_AGENT_INSTANCES: + del _BROWSER_AGENT_INSTANCES[key] + + def stop(self): + """Signals the currently running agent task to stop.""" + if not self.current_task_id or not self.stop_event: + logger.info("No agent task is currently running.") + return + + logger.info(f"Stop requested for task ID: {self.current_task_id}") + self.stop_event.set() # Signal the stop event + + # Additionally, try to stop the browser agents directly + # Need to run this async in the background or manage event loops carefully + async def do_stop_browsers(): + await self._stop_lingering_browsers(self.current_task_id) + + try: + loop = asyncio.get_running_loop() + loop.create_task(do_stop_browsers()) + except RuntimeError: # No running loop in current thread + asyncio.run(do_stop_browsers()) diff --git a/src/webui/components/agent_settings_tab.py b/src/webui/components/agent_settings_tab.py index 85e7c0e..6528a11 100644 --- a/src/webui/components/agent_settings_tab.py +++ b/src/webui/components/agent_settings_tab.py @@ -7,6 +7,7 @@ from typing import Any, Dict, Optional from src.webui.webui_manager import WebuiManager from src.utils import config import logging +from functools import partial logger = logging.getLogger(__name__) @@ -23,10 +24,15 @@ def update_model_dropdown(llm_provider): return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True) -def update_mcp_server(mcp_file: str): +def update_mcp_server(mcp_file: str, webui_manager: 
WebuiManager): """ Update the MCP server. """ + if hasattr(webui_manager, "bu_controller") and webui_manager.bu_controller: + logger.warning("āš ļø Close controller because mcp file has changed!") + webui_manager.bu_controller.close_mcp_client() + webui_manager.bu_controller = None + if not mcp_file or not os.path.exists(mcp_file) or not mcp_file.endswith('.json'): logger.warning(f"{mcp_file} is not a valid MCP file.") return None, gr.update(visible=False) @@ -37,7 +43,7 @@ def update_mcp_server(mcp_file: str): return json.dumps(mcp_server, indent=2), gr.update(visible=True) -def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Component]: +def create_agent_settings_tab(webui_manager: WebuiManager): """ Creates an agent settings tab. """ @@ -252,7 +258,7 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen ) mcp_json_file.change( - update_mcp_server, + partial(update_mcp_server, webui_manager=webui_manager), inputs=[mcp_json_file], outputs=[mcp_server_config, mcp_server_config] ) diff --git a/src/webui/components/browser_settings_tab.py b/src/webui/components/browser_settings_tab.py index 90e6fa6..40c104c 100644 --- a/src/webui/components/browser_settings_tab.py +++ b/src/webui/components/browser_settings_tab.py @@ -14,13 +14,16 @@ async def close_browser(webui_manager: WebuiManager): if webui_manager.bu_current_task and not webui_manager.bu_current_task.done(): webui_manager.bu_current_task.cancel() webui_manager.bu_current_task = None - if webui_manager.bu_browser: - await webui_manager.bu_browser.close() - webui_manager.bu_browser = None + if webui_manager.bu_browser_context: + logger.info("āš ļø Closing browser context when changing browser config.") await webui_manager.bu_browser_context.close() webui_manager.bu_browser_context = None + if webui_manager.bu_browser: + logger.info("āš ļø Closing browser when changing browser config.") + await webui_manager.bu_browser.close() + webui_manager.bu_browser = None 
def create_browser_settings_tab(webui_manager: WebuiManager): """ @@ -43,6 +46,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager): interactive=True, placeholder="Leave it empty if you use your default user data", ) + with gr.Group(): with gr.Row(): use_own_browser = gr.Checkbox( label="Use Own Browser", @@ -64,11 +68,12 @@ def create_browser_settings_tab(webui_manager: WebuiManager): ) disable_security = gr.Checkbox( label="Disable Security", - value=True, - info="Disable browser security features", + value=False, + info="Disable browser security", interactive=True ) + with gr.Group(): with gr.Row(): window_w = gr.Number( label="Window Width", @@ -82,7 +87,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager): info="Browser window height", interactive=True ) - + with gr.Group(): with gr.Row(): cdp_url = gr.Textbox( label="CDP URL", @@ -94,7 +99,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager): info="WSS URL for browser remote debugging", interactive=True, ) - + with gr.Group(): with gr.Row(): save_recording_path = gr.Textbox( label="Recording Path", diff --git a/src/webui/components/browser_use_agent_tab.py b/src/webui/components/browser_use_agent_tab.py index 88f571d..25f56bf 100644 --- a/src/webui/components/browser_use_agent_tab.py +++ b/src/webui/components/browser_use_agent_tab.py @@ -1,3 +1,5 @@ +import pdb + import gradio as gr from gradio.components import Component import asyncio @@ -388,7 +390,6 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon extra_args += [f"--user-data-dir={chrome_user_data}"] else: browser_binary_path = None - webui_manager.bu_browser = CustomBrowser( config=BrowserConfig( headless=headless, @@ -432,7 +433,6 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon logger.info(f"Initializing new agent for task: {task}") if not webui_manager.bu_browser or not webui_manager.bu_browser_context: raise ValueError("Browser or Context not 
initialized, cannot create agent.") - webui_manager.bu_agent = BrowserUseAgent( task=task, llm=main_llm, @@ -456,6 +456,9 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id webui_manager.bu_agent.add_new_task(task) webui_manager.bu_agent.settings.generate_gif = gif_path + webui_manager.bu_agent.browser = webui_manager.bu_browser + webui_manager.bu_agent.browser_context = webui_manager.bu_browser_context + webui_manager.bu_agent.controller = webui_manager.bu_controller # --- 6. Run Agent Task and Stream Updates --- agent_run_coro = webui_manager.bu_agent.run(max_steps=max_steps) @@ -832,15 +835,13 @@ def create_browser_use_agent_tab(webui_manager: WebuiManager): async def submit_wrapper(components_dict: Dict[Component, Any]) -> AsyncGenerator[Dict[Component, Any], None]: """Wrapper for handle_submit that yields its results.""" - # handle_submit is an async generator, iterate and yield async for update in handle_submit(webui_manager, components_dict): yield update async def stop_wrapper() -> AsyncGenerator[Dict[Component, Any], None]: """Wrapper for handle_stop.""" - # handle_stop is async def but returns a single dict. We yield it once. 
update_dict = await handle_stop(webui_manager) - yield update_dict # Yield the final dictionary + yield update_dict async def pause_resume_wrapper() -> AsyncGenerator[Dict[Component, Any], None]: """Wrapper for handle_pause_resume.""" diff --git a/src/webui/components/deep_research_agent_tab.py b/src/webui/components/deep_research_agent_tab.py index 5ce8dd7..eeaf58a 100644 --- a/src/webui/components/deep_research_agent_tab.py +++ b/src/webui/components/deep_research_agent_tab.py @@ -5,7 +5,7 @@ from src.webui.webui_manager import WebuiManager from src.utils import config -def create_deep_research_agent_tab(webui_manager: WebuiManager) -> dict[str, Component]: +def create_deep_research_agent_tab(webui_manager: WebuiManager): """ Creates a deep research agent tab """ diff --git a/src/webui/components/load_save_config_tab.py b/src/webui/components/load_save_config_tab.py index acc0f69..aaa1441 100644 --- a/src/webui/components/load_save_config_tab.py +++ b/src/webui/components/load_save_config_tab.py @@ -5,7 +5,7 @@ from src.webui.webui_manager import WebuiManager from src.utils import config -def create_load_save_config_tab(webui_manager: WebuiManager) -> dict[str, Component]: +def create_load_save_config_tab(webui_manager: WebuiManager): """ Creates a load and save config tab. 
""" @@ -13,7 +13,7 @@ def create_load_save_config_tab(webui_manager: WebuiManager) -> dict[str, Compon tab_components = {} config_file = gr.File( - label="Load UI Settings from Config File", + label="Load UI Settings from json", file_types=[".json"], interactive=True ) diff --git a/tests/test_agents.py b/tests/test_agents.py index 79e48d6..216541a 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -194,7 +194,6 @@ async def test_browser_use_parallel(): # api_key=os.getenv("OPENAI_API_KEY", ""), # ) - # llm = utils.get_llm_model( # provider="google", # model_name="gemini-2.0-flash", @@ -335,6 +334,70 @@ async def test_browser_use_parallel(): await browser.close() +async def test_deep_research_agent(): + from src.agent.deep_research.deep_research_agent import DeepSearchAgent + from src.utils import llm_provider + + llm = llm_provider.get_llm_model( + provider="azure_openai", + model_name="gpt-4o", + temperature=0.5, + base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), + api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + ) + + mcp_server_config = { + "mcpServers": { + "desktop-commander": { + "command": "npx", + "args": [ + "-y", + "@wonderwhy-er/desktop-commander" + ] + }, + } + } + + browser_config = {"headless": False, "window_width": 1280, "window_height": 1100, "use_own_browser": False} + agent = DeepSearchAgent(llm=llm, browser_config=browser_config, mcp_server_config=mcp_server_config) + + research_topic = "Impact of Microplastics on Marine Ecosystems" + task_id_to_resume = None # Set this to resume a previous task ID + + print(f"Starting research on: {research_topic}") + + try: + # Call run and wait for the final result dictionary + result = await agent.run(research_topic, task_id=task_id_to_resume) + + print("\n--- Research Process Ended ---") + print(f"Status: {result.get('status')}") + print(f"Message: {result.get('message')}") + print(f"Task ID: {result.get('task_id')}") + + # Check the final state for the report + final_state = 
result.get('final_state', {}) + if final_state: + print("\n--- Final State Summary ---") + print( + f" Plan Steps Completed: {sum(1 for item in final_state.get('research_plan', []) if item.get('status') == 'completed')}") + print(f" Total Search Results Logged: {len(final_state.get('search_results', []))}") + if final_state.get("final_report"): + print(" Final Report: Generated (content omitted). You can find it in the output directory.") + # print("\n--- Final Report ---") # Optionally print report + # print(final_state["final_report"]) + else: + print(" Final Report: Not generated.") + else: + print("Final state information not available.") + + + except Exception as e: + print(f"\n--- An unhandled error occurred outside the agent run ---") + print(e) + + if __name__ == "__main__": # asyncio.run(test_browser_use_agent()) - asyncio.run(test_browser_use_parallel()) + # asyncio.run(test_browser_use_parallel()) + asyncio.run(test_deep_research_agent()) diff --git a/tests/test_controller.py b/tests/test_controller.py index 1e1608e..5234c46 100644 --- a/tests/test_controller.py +++ b/tests/test_controller.py @@ -32,6 +32,7 @@ async def test_mcp_client(): } mcp_tools, mcp_client = await setup_mcp_client_and_tools(test_server_config) + for tool in mcp_tools: tool_param_model = create_tool_param_model(tool) print(tool.name) diff --git a/webui2.py b/webui2.py deleted file mode 100644 index 98a23b4..0000000 --- a/webui2.py +++ /dev/null @@ -1,1095 +0,0 @@ -import pdb -import logging - -from dotenv import load_dotenv - -load_dotenv() -import os -import glob -import asyncio -import argparse -import os - -logger = logging.getLogger(__name__) - -import gradio as gr -import inspect -from functools import wraps - -from browser_use.agent.service import Agent -from playwright.async_api import async_playwright -from browser_use.browser.browser import Browser, BrowserConfig -from browser_use.browser.context import ( - BrowserContextConfig, - BrowserContextWindowSize, -) -from 
langchain_ollama import ChatOllama -from playwright.async_api import async_playwright -from src.utils.agent_state import AgentState - -from src.utils import utils -from src.agent.custom_agent import CustomAgent -from src.browser.custom_browser import CustomBrowser -from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt -from src.browser.custom_context import BrowserContextConfig, CustomBrowserContext -from src.controller.custom_controller import CustomController -from gradio.themes import Citrus, Default, Glass, Monochrome, Ocean, Origin, Soft, Base -from src.utils.utils import update_model_dropdown, get_latest_files, capture_screenshot, MissingAPIKeyError -from src.utils import utils - -# Global variables for persistence -_global_browser = None -_global_browser_context = None -_global_agent = None - - -async def stop_agent(): - """Request the agent to stop and update UI with enhanced feedback""" - global _global_agent - - try: - if _global_agent is not None: - # Request stop - _global_agent.stop() - # Update UI immediately - message = "Stop requested - the agent will halt at the next safe point" - logger.info(f"šŸ›‘ {message}") - - # Return UI updates - return ( - gr.update(value="Stopping...", interactive=False), # stop_button - gr.update(interactive=False), # run_button - ) - except Exception as e: - error_msg = f"Error during stop: {str(e)}" - logger.error(error_msg) - return ( - gr.update(value="Stop", interactive=True), - gr.update(interactive=True) - ) - - -async def run_browser_agent( - agent_type, - llm_provider, - llm_model_name, - llm_num_ctx, - llm_temperature, - llm_base_url, - llm_api_key, - use_own_browser, - keep_browser_open, - headless, - disable_security, - window_w, - window_h, - save_recording_path, - save_agent_history_path, - save_trace_path, - enable_recording, - task, - add_infos, - max_steps, - use_vision, - max_actions_per_step, - tool_calling_method, - chrome_cdp, - max_input_tokens -): - try: - # Disable 
recording if the checkbox is unchecked - if not enable_recording: - save_recording_path = None - - # Ensure the recording directory exists if recording is enabled - if save_recording_path: - os.makedirs(save_recording_path, exist_ok=True) - - # Run the agent - llm = utils.get_llm_model( - provider=llm_provider, - model_name=llm_model_name, - num_ctx=llm_num_ctx, - temperature=llm_temperature, - base_url=llm_base_url, - api_key=llm_api_key, - ) - if agent_type == "org": - final_result, errors, model_actions, model_thoughts, trace_file, history_file = await run_org_agent( - llm=llm, - use_own_browser=use_own_browser, - keep_browser_open=keep_browser_open, - headless=headless, - disable_security=disable_security, - window_w=window_w, - window_h=window_h, - save_recording_path=save_recording_path, - save_agent_history_path=save_agent_history_path, - save_trace_path=save_trace_path, - task=task, - max_steps=max_steps, - use_vision=use_vision, - max_actions_per_step=max_actions_per_step, - tool_calling_method=tool_calling_method, - chrome_cdp=chrome_cdp, - max_input_tokens=max_input_tokens - ) - elif agent_type == "custom": - final_result, errors, model_actions, model_thoughts, trace_file, history_file = await run_custom_agent( - llm=llm, - use_own_browser=use_own_browser, - keep_browser_open=keep_browser_open, - headless=headless, - disable_security=disable_security, - window_w=window_w, - window_h=window_h, - save_recording_path=save_recording_path, - save_agent_history_path=save_agent_history_path, - save_trace_path=save_trace_path, - task=task, - add_infos=add_infos, - max_steps=max_steps, - use_vision=use_vision, - max_actions_per_step=max_actions_per_step, - tool_calling_method=tool_calling_method, - chrome_cdp=chrome_cdp, - max_input_tokens=max_input_tokens - ) - else: - raise ValueError(f"Invalid agent type: {agent_type}") - - # Get the list of videos after the agent runs (if recording is enabled) - # latest_video = None - # if save_recording_path: - # new_videos 
= set( - # glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) - # + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]")) - # ) - # if new_videos - existing_videos: - # latest_video = list(new_videos - existing_videos)[0] # Get the first new video - - gif_path = os.path.join(os.path.dirname(__file__), "agent_history.gif") - - return ( - final_result, - errors, - model_actions, - model_thoughts, - gif_path, - trace_file, - history_file, - gr.update(value="Stop", interactive=True), # Re-enable stop button - gr.update(interactive=True) # Re-enable run button - ) - - except MissingAPIKeyError as e: - logger.error(str(e)) - raise gr.Error(str(e), print_exception=False) - - except Exception as e: - import traceback - traceback.print_exc() - errors = str(e) + "\n" + traceback.format_exc() - return ( - '', # final_result - errors, # errors - '', # model_actions - '', # model_thoughts - None, # latest_video - None, # history_file - None, # trace_file - gr.update(value="Stop", interactive=True), # Re-enable stop button - gr.update(interactive=True) # Re-enable run button - ) - - -async def run_org_agent( - llm, - use_own_browser, - keep_browser_open, - headless, - disable_security, - window_w, - window_h, - save_recording_path, - save_agent_history_path, - save_trace_path, - task, - max_steps, - use_vision, - max_actions_per_step, - tool_calling_method, - chrome_cdp, - max_input_tokens -): - try: - global _global_browser, _global_browser_context, _global_agent - - extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"] - cdp_url = chrome_cdp - - if use_own_browser: - cdp_url = os.getenv("CHROME_CDP", chrome_cdp) - chrome_path = os.getenv("CHROME_PATH", None) - if chrome_path == "": - chrome_path = None - chrome_user_data = os.getenv("CHROME_USER_DATA", None) - if chrome_user_data: - extra_chromium_args += [f"--user-data-dir={chrome_user_data}"] - else: - chrome_path = None - - if _global_browser is None: - 
_global_browser = Browser( - config=BrowserConfig( - headless=headless, - cdp_url=cdp_url, - disable_security=disable_security, - chrome_instance_path=chrome_path, - extra_chromium_args=extra_chromium_args, - ) - ) - - if _global_browser_context is None: - _global_browser_context = await _global_browser.new_context( - config=BrowserContextConfig( - trace_path=save_trace_path if save_trace_path else None, - save_recording_path=save_recording_path if save_recording_path else None, - save_downloads_path="./tmp/downloads", - no_viewport=False, - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), - ) - ) - - if _global_agent is None: - _global_agent = Agent( - task=task, - llm=llm, - use_vision=use_vision, - browser=_global_browser, - browser_context=_global_browser_context, - max_actions_per_step=max_actions_per_step, - tool_calling_method=tool_calling_method, - max_input_tokens=max_input_tokens, - generate_gif=True - ) - history = await _global_agent.run(max_steps=max_steps) - - history_file = os.path.join(save_agent_history_path, f"{_global_agent.state.agent_id}.json") - _global_agent.save_history(history_file) - - final_result = history.final_result() - errors = history.errors() - model_actions = history.model_actions() - model_thoughts = history.model_thoughts() - - trace_file = get_latest_files(save_trace_path) - - return final_result, errors, model_actions, model_thoughts, trace_file.get('.zip'), history_file - except Exception as e: - import traceback - traceback.print_exc() - errors = str(e) + "\n" + traceback.format_exc() - return '', errors, '', '', None, None - finally: - _global_agent = None - # Handle cleanup based on persistence configuration - if not keep_browser_open: - if _global_browser_context: - await _global_browser_context.close() - _global_browser_context = None - - if _global_browser: - await _global_browser.close() - _global_browser = None - - -async def run_custom_agent( - llm, - use_own_browser, - 
keep_browser_open, - headless, - disable_security, - window_w, - window_h, - save_recording_path, - save_agent_history_path, - save_trace_path, - task, - add_infos, - max_steps, - use_vision, - max_actions_per_step, - tool_calling_method, - chrome_cdp, - max_input_tokens -): - try: - global _global_browser, _global_browser_context, _global_agent - - extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"] - cdp_url = chrome_cdp - if use_own_browser: - cdp_url = os.getenv("CHROME_CDP", chrome_cdp) - - chrome_path = os.getenv("CHROME_PATH", None) - if chrome_path == "": - chrome_path = None - chrome_user_data = os.getenv("CHROME_USER_DATA", None) - if chrome_user_data: - extra_chromium_args += [f"--user-data-dir={chrome_user_data}"] - else: - chrome_path = None - - controller = CustomController() - - # Initialize global browser if needed - # if chrome_cdp not empty string nor None - if (_global_browser is None) or (cdp_url and cdp_url != "" and cdp_url != None): - _global_browser = CustomBrowser( - config=BrowserConfig( - headless=headless, - disable_security=disable_security, - cdp_url=cdp_url, - chrome_instance_path=chrome_path, - extra_chromium_args=extra_chromium_args, - ) - ) - - if _global_browser_context is None or (chrome_cdp and cdp_url != "" and cdp_url != None): - _global_browser_context = await _global_browser.new_context( - config=BrowserContextConfig( - trace_path=save_trace_path if save_trace_path else None, - save_recording_path=save_recording_path if save_recording_path else None, - no_viewport=False, - save_downloads_path="./tmp/downloads", - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), - ) - ) - - # Create and run agent - if _global_agent is None: - _global_agent = CustomAgent( - task=task, - add_infos=add_infos, - use_vision=use_vision, - llm=llm, - browser=_global_browser, - browser_context=_global_browser_context, - controller=controller, - 
system_prompt_class=CustomSystemPrompt, - agent_prompt_class=CustomAgentMessagePrompt, - max_actions_per_step=max_actions_per_step, - tool_calling_method=tool_calling_method, - max_input_tokens=max_input_tokens, - generate_gif=True - ) - history = await _global_agent.run(max_steps=max_steps) - - history_file = os.path.join(save_agent_history_path, f"{_global_agent.state.agent_id}.json") - _global_agent.save_history(history_file) - - final_result = history.final_result() - errors = history.errors() - model_actions = history.model_actions() - model_thoughts = history.model_thoughts() - - trace_file = get_latest_files(save_trace_path) - - return final_result, errors, model_actions, model_thoughts, trace_file.get('.zip'), history_file - except Exception as e: - import traceback - traceback.print_exc() - errors = str(e) + "\n" + traceback.format_exc() - return '', errors, '', '', None, None - finally: - _global_agent = None - # Handle cleanup based on persistence configuration - if not keep_browser_open: - if _global_browser_context: - await _global_browser_context.close() - _global_browser_context = None - - if _global_browser: - await _global_browser.close() - _global_browser = None - - -async def run_with_stream( - agent_type, - llm_provider, - llm_model_name, - llm_num_ctx, - llm_temperature, - llm_base_url, - llm_api_key, - use_own_browser, - keep_browser_open, - headless, - disable_security, - window_w, - window_h, - save_recording_path, - save_agent_history_path, - save_trace_path, - enable_recording, - task, - add_infos, - max_steps, - use_vision, - max_actions_per_step, - tool_calling_method, - chrome_cdp, - max_input_tokens -): - global _global_agent - - stream_vw = 80 - stream_vh = int(80 * window_h // window_w) - if not headless: - result = await run_browser_agent( - agent_type=agent_type, - llm_provider=llm_provider, - llm_model_name=llm_model_name, - llm_num_ctx=llm_num_ctx, - llm_temperature=llm_temperature, - llm_base_url=llm_base_url, - 
llm_api_key=llm_api_key, - use_own_browser=use_own_browser, - keep_browser_open=keep_browser_open, - headless=headless, - disable_security=disable_security, - window_w=window_w, - window_h=window_h, - save_recording_path=save_recording_path, - save_agent_history_path=save_agent_history_path, - save_trace_path=save_trace_path, - enable_recording=enable_recording, - task=task, - add_infos=add_infos, - max_steps=max_steps, - use_vision=use_vision, - max_actions_per_step=max_actions_per_step, - tool_calling_method=tool_calling_method, - chrome_cdp=chrome_cdp, - max_input_tokens=max_input_tokens - ) - # Add HTML content at the start of the result array - yield [gr.update(visible=False)] + list(result) - else: - try: - # Run the browser agent in the background - agent_task = asyncio.create_task( - run_browser_agent( - agent_type=agent_type, - llm_provider=llm_provider, - llm_model_name=llm_model_name, - llm_num_ctx=llm_num_ctx, - llm_temperature=llm_temperature, - llm_base_url=llm_base_url, - llm_api_key=llm_api_key, - use_own_browser=use_own_browser, - keep_browser_open=keep_browser_open, - headless=headless, - disable_security=disable_security, - window_w=window_w, - window_h=window_h, - save_recording_path=save_recording_path, - save_agent_history_path=save_agent_history_path, - save_trace_path=save_trace_path, - enable_recording=enable_recording, - task=task, - add_infos=add_infos, - max_steps=max_steps, - use_vision=use_vision, - max_actions_per_step=max_actions_per_step, - tool_calling_method=tool_calling_method, - chrome_cdp=chrome_cdp, - max_input_tokens=max_input_tokens - ) - ) - - # Initialize values for streaming - html_content = f"

Using browser...

" - final_result = errors = model_actions = model_thoughts = "" - recording_gif = trace = history_file = None - - # Periodically update the stream while the agent task is running - while not agent_task.done(): - try: - encoded_screenshot = await capture_screenshot(_global_browser_context) - if encoded_screenshot is not None: - html_content = f'' - else: - html_content = f"

Waiting for browser session...

" - except Exception as e: - html_content = f"

Waiting for browser session...

" - - if _global_agent and _global_agent.state.stopped: - yield [ - gr.HTML(value=html_content, visible=True), - final_result, - errors, - model_actions, - model_thoughts, - recording_gif, - trace, - history_file, - gr.update(value="Stopping...", interactive=False), # stop_button - gr.update(interactive=False), # run_button - ] - break - else: - yield [ - gr.HTML(value=html_content, visible=True), - final_result, - errors, - model_actions, - model_thoughts, - recording_gif, - trace, - history_file, - gr.update(), # Re-enable stop button - gr.update() # Re-enable run button - ] - await asyncio.sleep(0.1) - - # Once the agent task completes, get the results - try: - result = await agent_task - final_result, errors, model_actions, model_thoughts, recording_gif, trace, history_file, stop_button, run_button = result - except gr.Error: - final_result = "" - model_actions = "" - model_thoughts = "" - recording_gif = trace = history_file = None - - except Exception as e: - errors = f"Agent error: {str(e)}" - - yield [ - gr.HTML(value=html_content, visible=True), - final_result, - errors, - model_actions, - model_thoughts, - recording_gif, - trace, - history_file, - stop_button, - run_button - ] - - except Exception as e: - import traceback - yield [ - gr.HTML( - value=f"

Waiting for browser session...

", - visible=True), - "", - f"Error: {str(e)}\n{traceback.format_exc()}", - "", - "", - None, - None, - None, - gr.update(value="Stop", interactive=True), # Re-enable stop button - gr.update(interactive=True) # Re-enable run button - ] - - -# Define the theme map globally -theme_map = { - "Default": Default(), - "Soft": Soft(), - "Monochrome": Monochrome(), - "Glass": Glass(), - "Origin": Origin(), - "Citrus": Citrus(), - "Ocean": Ocean(), - "Base": Base() -} - - -async def close_global_browser(): - global _global_browser, _global_browser_context - - if _global_browser_context: - await _global_browser_context.close() - _global_browser_context = None - - if _global_browser: - await _global_browser.close() - _global_browser = None - - -async def run_deep_search(research_task, max_search_iteration_input, max_query_per_iter_input, llm_provider, - llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, use_vision, - use_own_browser, headless, chrome_cdp): - from src.utils.deep_research import deep_research - global _global_agent_state - - # Clear any previous stop request - _global_agent_state.clear_stop() - - llm = utils.get_llm_model( - provider=llm_provider, - model_name=llm_model_name, - num_ctx=llm_num_ctx, - temperature=llm_temperature, - base_url=llm_base_url, - api_key=llm_api_key, - ) - markdown_content, file_path = await deep_research(research_task, llm, _global_agent_state, - max_search_iterations=max_search_iteration_input, - max_query_num=max_query_per_iter_input, - use_vision=use_vision, - headless=headless, - use_own_browser=use_own_browser, - chrome_cdp=chrome_cdp - ) - - return markdown_content, file_path, gr.update(value="Stop", interactive=True), gr.update(interactive=True) - - -def create_ui(theme_name="Ocean"): - css = """ - .gradio-container { - width: 60vw !important; - max-width: 60% !important; - margin-left: auto !important; - margin-right: auto !important; - padding-top: 20px !important; - } - .header-text { - text-align: 
center; - margin-bottom: 30px; - } - .theme-section { - margin-bottom: 20px; - padding: 15px; - border-radius: 10px; - } - """ - - with gr.Blocks( - title="Browser Use WebUI", theme=theme_map[theme_name], css=css - ) as demo: - with gr.Row(): - gr.Markdown( - """ - # 🌐 Browser Use WebUI - ### Control your browser with AI assistance - """, - elem_classes=["header-text"], - ) - - with gr.Tabs() as tabs: - with gr.TabItem("āš™ļø Agent Settings", id=1): - with gr.Group(): - agent_type = gr.Radio( - ["org", "custom"], - label="Agent Type", - value="custom", - info="Select the type of agent to use", - interactive=True - ) - with gr.Column(): - max_steps = gr.Slider( - minimum=1, - maximum=200, - value=100, - step=1, - label="Max Run Steps", - info="Maximum number of steps the agent will take", - interactive=True - ) - max_actions_per_step = gr.Slider( - minimum=1, - maximum=100, - value=10, - step=1, - label="Max Actions per Step", - info="Maximum number of actions the agent will take per step", - interactive=True - ) - with gr.Column(): - use_vision = gr.Checkbox( - label="Use Vision", - value=True, - info="Enable visual processing capabilities", - interactive=True - ) - max_input_tokens = gr.Number( - label="Max Input Tokens", - value=128000, - precision=0, - interactive=True - ) - tool_calling_method = gr.Dropdown( - label="Tool Calling Method", - value="auto", - interactive=True, - allow_custom_value=True, # Allow users to input custom model names - choices=["auto", "json_schema", "function_calling"], - info="Tool Calls Funtion Name", - visible=False - ) - - with gr.TabItem("šŸ”§ LLM Settings", id=2): - with gr.Group(): - llm_provider = gr.Dropdown( - choices=[provider for provider, model in utils.model_names.items()], - label="LLM Provider", - value="openai", - info="Select your preferred language model provider", - interactive=True - ) - llm_model_name = gr.Dropdown( - label="Model Name", - choices=utils.model_names['openai'], - value="gpt-4o", - interactive=True, 
- allow_custom_value=True, # Allow users to input custom model names - info="Select a model in the dropdown options or directly type a custom model name" - ) - ollama_num_ctx = gr.Slider( - minimum=2 ** 8, - maximum=2 ** 16, - value=16000, - step=1, - label="Ollama Context Length", - info="Controls max context length model needs to handle (less = faster)", - visible=False, - interactive=True - ) - llm_temperature = gr.Slider( - minimum=0.0, - maximum=2.0, - value=0.6, - step=0.1, - label="Temperature", - info="Controls randomness in model outputs", - interactive=True - ) - with gr.Row(): - llm_base_url = gr.Textbox( - label="Base URL", - value="", - info="API endpoint URL (if required)" - ) - llm_api_key = gr.Textbox( - label="API Key", - type="password", - value="", - info="Your API key (leave blank to use .env)" - ) - - # Change event to update context length slider - def update_llm_num_ctx_visibility(llm_provider): - return gr.update(visible=llm_provider == "ollama") - - # Bind the change event of llm_provider to update the visibility of context length slider - llm_provider.change( - fn=update_llm_num_ctx_visibility, - inputs=llm_provider, - outputs=ollama_num_ctx - ) - - with gr.TabItem("🌐 Browser Settings", id=3): - with gr.Group(): - with gr.Row(): - use_own_browser = gr.Checkbox( - label="Use Own Browser", - value=False, - info="Use your existing browser instance", - interactive=True - ) - keep_browser_open = gr.Checkbox( - label="Keep Browser Open", - value=False, - info="Keep Browser Open between Tasks", - interactive=True - ) - headless = gr.Checkbox( - label="Headless Mode", - value=False, - info="Run browser without GUI", - interactive=True - ) - disable_security = gr.Checkbox( - label="Disable Security", - value=True, - info="Disable browser security features", - interactive=True - ) - enable_recording = gr.Checkbox( - label="Enable Recording", - value=True, - info="Enable saving browser recordings", - interactive=True - ) - - with gr.Row(): - window_w 
= gr.Number( - label="Window Width", - value=1280, - info="Browser window width", - interactive=True - ) - window_h = gr.Number( - label="Window Height", - value=1100, - info="Browser window height", - interactive=True - ) - - chrome_cdp = gr.Textbox( - label="CDP URL", - placeholder="http://localhost:9222", - value="", - info="CDP for google remote debugging", - interactive=True, # Allow editing only if recording is enabled - ) - - save_recording_path = gr.Textbox( - label="Recording Path", - placeholder="e.g. ./tmp/record_videos", - value="./tmp/record_videos", - info="Path to save browser recordings", - interactive=True, # Allow editing only if recording is enabled - ) - - save_trace_path = gr.Textbox( - label="Trace Path", - placeholder="e.g. ./tmp/traces", - value="./tmp/traces", - info="Path to save Agent traces", - interactive=True, - ) - - save_agent_history_path = gr.Textbox( - label="Agent History Save Path", - placeholder="e.g., ./tmp/agent_history", - value="./tmp/agent_history", - info="Specify the directory where agent history should be saved.", - interactive=True, - ) - - with gr.TabItem("šŸ¤– Run Agent", id=4): - task = gr.Textbox( - label="Task Description", - lines=4, - placeholder="Enter your task here...", - value="go to google.com and type 'OpenAI' click search and give me the first url", - info="Describe what you want the agent to do", - interactive=True - ) - add_infos = gr.Textbox( - label="Additional Information", - lines=3, - placeholder="Add any helpful context or instructions...", - info="Optional hints to help the LLM complete the task", - value="", - interactive=True - ) - - with gr.Row(): - run_button = gr.Button("ā–¶ļø Run Agent", variant="primary", scale=2) - stop_button = gr.Button("ā¹ļø Stop", variant="stop", scale=1) - - with gr.Row(): - browser_view = gr.HTML( - value="

Waiting for browser session...

", - label="Live Browser View", - visible=False - ) - - gr.Markdown("### Results") - with gr.Row(): - with gr.Column(): - final_result_output = gr.Textbox( - label="Final Result", lines=3, show_label=True - ) - with gr.Column(): - errors_output = gr.Textbox( - label="Errors", lines=3, show_label=True - ) - with gr.Row(): - with gr.Column(): - model_actions_output = gr.Textbox( - label="Model Actions", lines=3, show_label=True, visible=False - ) - with gr.Column(): - model_thoughts_output = gr.Textbox( - label="Model Thoughts", lines=3, show_label=True, visible=False - ) - recording_gif = gr.Image(label="Result GIF", format="gif") - trace_file = gr.File(label="Trace File") - agent_history_file = gr.File(label="Agent History") - - with gr.TabItem("🧐 Deep Research", id=5): - research_task_input = gr.Textbox(label="Research Task", lines=5, - value="Compose a report on the use of Reinforcement Learning for training Large Language Models, encompassing its origins, current advancements, and future prospects, substantiated with examples of relevant models and techniques. 
The report should reflect original insights and analysis, moving beyond mere summarization of existing literature.", - interactive=True) - with gr.Row(): - max_search_iteration_input = gr.Number(label="Max Search Iteration", value=3, - precision=0, - interactive=True) # precision=0 ē”®äæę˜Æę•“ę•° - max_query_per_iter_input = gr.Number(label="Max Query per Iteration", value=1, - precision=0, - interactive=True) # precision=0 ē”®äæę˜Æę•“ę•° - with gr.Row(): - research_button = gr.Button("ā–¶ļø Run Deep Research", variant="primary", scale=2) - stop_research_button = gr.Button("ā¹ Stop", variant="stop", scale=1) - markdown_output_display = gr.Markdown(label="Research Report") - markdown_download = gr.File(label="Download Research Report") - - # Bind the stop button click event after errors_output is defined - stop_button.click( - fn=stop_agent, - inputs=[], - outputs=[stop_button, run_button], - ) - - # Run button click handler - run_button.click( - fn=run_with_stream, - inputs=[ - agent_type, llm_provider, llm_model_name, ollama_num_ctx, llm_temperature, llm_base_url, - llm_api_key, - use_own_browser, keep_browser_open, headless, disable_security, window_w, window_h, - save_recording_path, save_agent_history_path, save_trace_path, # Include the new path - enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step, - tool_calling_method, chrome_cdp, max_input_tokens - ], - outputs=[ - browser_view, # Browser view - final_result_output, # Final result - errors_output, # Errors - model_actions_output, # Model actions - model_thoughts_output, # Model thoughts - recording_gif, # Latest recording - trace_file, # Trace file - agent_history_file, # Agent history file - stop_button, # Stop button - run_button # Run button - ], - ) - - # Run Deep Research - research_button.click( - fn=run_deep_search, - inputs=[research_task_input, max_search_iteration_input, max_query_per_iter_input, llm_provider, - llm_model_name, ollama_num_ctx, llm_temperature, 
llm_base_url, llm_api_key, use_vision, - use_own_browser, headless, chrome_cdp], - outputs=[markdown_output_display, markdown_download, stop_research_button, research_button] - ) - # Bind the stop button click event after errors_output is defined - stop_research_button.click( - fn=stop_research_agent, - inputs=[], - outputs=[stop_research_button, research_button], - ) - - with gr.TabItem("šŸŽ„ Recordings", id=7, visible=True): - def list_recordings(save_recording_path): - if not os.path.exists(save_recording_path): - return [] - - # Get all video files - recordings = glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) + glob.glob( - os.path.join(save_recording_path, "*.[wW][eE][bB][mM]")) - - # Sort recordings by creation time (oldest first) - recordings.sort(key=os.path.getctime) - - # Add numbering to the recordings - numbered_recordings = [] - for idx, recording in enumerate(recordings, start=1): - filename = os.path.basename(recording) - numbered_recordings.append((recording, f"{idx}. 
{filename}")) - - return numbered_recordings - - recordings_gallery = gr.Gallery( - label="Recordings", - columns=3, - height="auto", - object_fit="contain" - ) - - refresh_button = gr.Button("šŸ”„ Refresh Recordings", variant="secondary") - refresh_button.click( - fn=list_recordings, - inputs=save_recording_path, - outputs=recordings_gallery - ) - - with gr.TabItem("šŸ“ UI Configuration", id=8): - config_file_input = gr.File( - label="Load UI Settings from Config File", - file_types=[".json"], - interactive=True - ) - with gr.Row(): - load_config_button = gr.Button("Load Config", variant="primary") - save_config_button = gr.Button("Save UI Settings", variant="primary") - - config_status = gr.Textbox( - label="Status", - lines=2, - interactive=False - ) - save_config_button.click( - fn=save_current_config, - inputs=[], # äøéœ€č¦č¾“å…„å‚ę•° - outputs=[config_status] - ) - - # Attach the callback to the LLM provider dropdown - llm_provider.change( - lambda provider, api_key, base_url: update_model_dropdown(provider, api_key, base_url), - inputs=[llm_provider, llm_api_key, llm_base_url], - outputs=llm_model_name - ) - - # Add this after defining the components - enable_recording.change( - lambda enabled: gr.update(interactive=enabled), - inputs=enable_recording, - outputs=save_recording_path - ) - - use_own_browser.change(fn=close_global_browser) - keep_browser_open.change(fn=close_global_browser) - - scan_and_register_components(demo) - global webui_config_manager - all_components = webui_config_manager.get_all_components() - - load_config_button.click( - fn=update_ui_from_config, - inputs=[config_file_input], - outputs=all_components + [config_status] - ) - return demo - - -def main(): - parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent") - parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to") - parser.add_argument("--port", type=int, default=7788, help="Port to listen on") - 
parser.add_argument("--theme", type=str, default="Ocean", choices=theme_map.keys(), help="Theme to use for the UI") - args = parser.parse_args() - - demo = create_ui(theme_name=args.theme) - demo.launch(server_name=args.ip, server_port=args.port) - - -if __name__ == '__main__': - main() From 09e3f21e05bad6f2e874632c79a2630fcbedfaba Mon Sep 17 00:00:00 2001 From: vvincent1234 Date: Wed, 30 Apr 2025 00:15:08 +0800 Subject: [PATCH 14/35] fix deep research agent --- requirements.txt | 2 +- .../deep_research/deep_research_agent.py | 312 ++++++++++++------ 2 files changed, 216 insertions(+), 98 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6c44d12..a9f6c87 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ MainContentExtractor==0.0.4 langchain-ibm==0.3.10 langchain_mcp_adapters==0.0.9 langgraph==0.3.34 -langchain-community==0.3.23 \ No newline at end of file +langchain-community \ No newline at end of file diff --git a/src/agent/deep_research/deep_research_agent.py b/src/agent/deep_research/deep_research_agent.py index 6863f47..b87eb7a 100644 --- a/src/agent/deep_research/deep_research_agent.py +++ b/src/agent/deep_research/deep_research_agent.py @@ -2,6 +2,7 @@ import asyncio import json import logging import os +import pdb import uuid from pathlib import Path from typing import List, Dict, Any, TypedDict, Optional, Sequence, Annotated @@ -16,6 +17,8 @@ from langchain.agents import AgentExecutor # We might use parts, but Langgraph from langchain_community.tools.file_management import WriteFileTool, ReadFileTool, CopyFileTool, ListDirectoryTool, \ MoveFileTool, FileSearchTool from langchain_openai import ChatOpenAI # Replace with your actual LLM import +from pydantic import BaseModel, Field +import operator from browser_use.browser.browser import BrowserConfig from browser_use.browser.context import BrowserContextWindowSize @@ -37,7 +40,7 @@ os.makedirs(TMP_DIR, exist_ok=True) REPORT_FILENAME = "report.md" PLAN_FILENAME = 
"research_plan.md" SEARCH_INFO_FILENAME = "search_info.json" -MAX_PARALLEL_BROWSERS = 2 +MAX_PARALLEL_BROWSERS = 1 _AGENT_STOP_FLAGS = {} _BROWSER_AGENT_INSTANCES = {} # To store running browser agents for stopping @@ -175,41 +178,90 @@ async def run_single_browser_task( logger.error(f"Error closing browser: {e}") -async def browser_search_tool_func(queries: List[str], task_id: str, llm: Any, browser_config: Dict[str, Any], - stop_event: threading.Event): +class BrowserSearchInput(BaseModel): + queries: List[str] = Field( + description=f"List of distinct search queries (max {MAX_PARALLEL_BROWSERS}) to find information relevant to the research task.") + + +async def _run_browser_search_tool( + queries: List[str], + task_id: str, # Injected dependency + llm: Any, # Injected dependency + browser_config: Dict[str, Any], # Injected dependency + stop_event: threading.Event # Injected dependency +) -> List[Dict[str, Any]]: """ - Tool function to run multiple browser searches in parallel (up to MAX_PARALLEL_BROWSERS). + Internal function to execute parallel browser searches based on LLM-provided queries. + Handles concurrency and stop signals. 
""" - if not BrowserUseAgent: - return [{"query": q, "error": "BrowserUseAgent components not available."} for q in queries] + + # Limit queries just in case LLM ignores the description + queries = queries[:MAX_PARALLEL_BROWSERS] + logger.info(f"[Browser Tool {task_id}] Running search for {len(queries)} queries: {queries}") results = [] - # Use asyncio.Semaphore to limit concurrent browser instances semaphore = asyncio.Semaphore(MAX_PARALLEL_BROWSERS) async def task_wrapper(query): async with semaphore: if stop_event.is_set(): - logger.info(f"Skipping browser task due to stop signal: {query}") + logger.info(f"[Browser Tool {task_id}] Skipping task due to stop signal: {query}") return {"query": query, "result": None, "status": "cancelled"} - # Pass necessary configs and the stop event - return await run_single_browser_task(query, task_id, llm, browser_config, stop_event) + # Pass necessary injected configs and the stop event + return await run_single_browser_task( + query, + task_id, + llm, # Pass the main LLM (or a dedicated one if needed) + browser_config, + stop_event + # use_vision could be added here if needed + ) tasks = [task_wrapper(query) for query in queries] - # Use asyncio.gather to run tasks concurrently search_results = await asyncio.gather(*tasks, return_exceptions=True) - # Process results, handling potential exceptions returned by gather - for result in search_results: - if isinstance(result, Exception): - # Log the exception, but maybe return a specific error structure - logger.error(f"Browser task gather caught exception: {result}") - # Find which query failed if possible (difficult with gather exceptions directly) - results.append({"query": "unknown", "error": str(result), "status": "failed"}) + processed_results = [] + for i, res in enumerate(search_results): + query = queries[i] # Get corresponding query + if isinstance(res, Exception): + logger.error(f"[Browser Tool {task_id}] Gather caught exception for query '{query}': {res}", exc_info=True) 
+ processed_results.append({"query": query, "error": str(res), "status": "failed"}) + elif isinstance(res, dict): + processed_results.append(res) else: - results.append(result) + logger.error(f"[Browser Tool {task_id}] Unexpected result type for query '{query}': {type(res)}") + processed_results.append({"query": query, "error": "Unexpected result type", "status": "failed"}) - return results + logger.info(f"[Browser Tool {task_id}] Finished search. Results count: {len(processed_results)}") + return processed_results + + +def create_browser_search_tool( + llm: Any, + browser_config: Dict[str, Any], + task_id: str, + stop_event: threading.Event +) -> StructuredTool: + """Factory function to create the browser search tool with necessary dependencies.""" + # Use partial to bind the dependencies that aren't part of the LLM call arguments + from functools import partial + bound_tool_func = partial( + _run_browser_search_tool, + task_id=task_id, + llm=llm, + browser_config=browser_config, + stop_event=stop_event, + ) + + return StructuredTool.from_function( + coroutine=bound_tool_func, + name="parallel_browser_search", + description=f"""Use this tool to actively search the web for information related to a specific research task or question. +It runs up to {MAX_PARALLEL_BROWSERS} searches in parallel using a browser agent for better results than simple scraping. +Provide a list of distinct search queries that are likely to yield relevant information. 
+The tool returns a list of results, each containing the original query, the status (completed, failed, stopped), and the summarized information found (or an error message).""", + args_schema=BrowserSearchInput, + ) # --- Langgraph State Definition --- @@ -238,6 +290,8 @@ class DeepResearchState(TypedDict): # Add other state variables as needed error_message: Optional[str] # To store errors + messages: List[BaseMessage] + # --- Langgraph Nodes --- @@ -398,23 +452,27 @@ async def planning_node(state: DeepResearchState) -> Dict[str, Any]: async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: - """Executes the next step in the research plan using the browser tool.""" + """ + Executes the next step in the research plan by invoking the LLM with tools. + The LLM decides which tool (e.g., browser search) to use and provides arguments. + """ logger.info("--- Entering Research Execution Node ---") if state.get('stop_requested'): logger.info("Stop requested, skipping research execution.") - return {"stop_requested": True} + return {"stop_requested": True, "current_step_index": state['current_step_index']} # Keep index same plan = state['research_plan'] current_index = state['current_step_index'] llm = state['llm'] - browser_config = state['browser_config'] - output_dir = state['output_dir'] + tools = state['tools'] # Tools are now passed in state + output_dir = str(state['output_dir']) task_id = state['task_id'] - stop_event = _AGENT_STOP_FLAGS.get(task_id) + # Stop event is bound inside the tool function, no need to pass directly here if not plan or current_index >= len(plan): logger.info("Research plan complete or empty.") - return {} # Signal to move to synthesis or end + # This condition should ideally be caught by `should_continue` before reaching here + return {} current_step = plan[current_index] if current_step['status'] == 'completed': @@ -423,93 +481,145 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: 
logger.info(f"Executing research step {current_step['step']}: {current_step['task']}") - # 1. Generate Search Queries for the current task using LLM - query_gen_prompt = ChatPromptTemplate.from_messages([ - ("system", - f"You are an expert search query formulator. Given a research task, generate {MAX_PARALLEL_BROWSERS} distinct, effective search engine queries to find relevant information. Focus on diversity and different angles of the task. Output ONLY the queries, each on a new line."), - ("human", f"Research Task: {current_step['task']}\n\nGenerate search queries:") - ]) + # Bind tools to the LLM for this call + llm_with_tools = llm.bind_tools(tools) + if state['messages']: + current_task_message = [HumanMessage( + content=f"Research Task (Step {current_step['step']}): {current_step['task']}")] + invocation_messages = state['messages'] + current_task_message + else: + current_task_message = [ + SystemMessage( + content="You are a research assistant executing one step of a research plan. Use the available tools, especially the 'parallel_browser_search' tool, to gather information needed for the current task. Be precise with your search queries if using the browser tool."), + HumanMessage( + content=f"Research Task (Step {current_step['step']}): {current_step['task']}") + ] + invocation_messages = current_task_message try: - response = await llm.ainvoke(query_gen_prompt.format_prompt().to_messages()) - queries = [q.strip() for q in response.content.strip().split('\n') if q.strip()] - if not queries: + # Invoke the LLM, expecting it to make a tool call + logger.info(f"Invoking LLM with tools for task: {current_step['task']}") + ai_response: BaseMessage = await llm_with_tools.ainvoke(invocation_messages) + logger.info("LLM invocation complete.") + + tool_results = [] + executed_tool_names = [] + + if not isinstance(ai_response, AIMessage) or not ai_response.tool_calls: + # LLM didn't call a tool. Maybe it answered directly? Or failed? 
logger.warning( - f"LLM did not generate any search queries for task: {current_step['task']}. Using task itself as query.") - queries = [current_step['task']] - else: - queries = queries[:MAX_PARALLEL_BROWSERS] # Limit to max parallel - logger.info(f"Generated queries: {queries}") - current_step['queries'] = queries # Store generated queries in the plan item - - except Exception as e: - logger.error(f"Failed to generate search queries: {e}. Using task as query.", exc_info=True) - queries = [current_step['task']] - current_step['queries'] = queries - - # 2. Execute Searches using the Browser Tool - try: - search_results_list = await browser_search_tool_func( - queries=queries, - task_id=task_id, - llm=llm, - browser_config=browser_config, - stop_event=stop_event - ) - - # Check for stop signal *after* search execution attempt - if stop_event and stop_event.is_set(): - logger.info("Stop requested during or after search execution.") - # Update plan partially if needed, or just signal stop - current_step['status'] = 'pending' # Mark as not completed due to stop + f"LLM did not call any tool for step {current_step['step']}. Response: {ai_response.content[:100]}...") + # How to handle this? Mark step as failed? Or store the content? + # Let's mark as failed for now, assuming a tool was expected. + current_step['status'] = 'failed' + current_step['result_summary'] = "LLM did not use a tool as expected." _save_plan_to_md(plan, output_dir) - # Save any partial results gathered before stop - current_search_results = state.get('search_results', []) - current_search_results.extend([r for r in search_results_list if r.get('status') != 'cancelled']) - _save_search_results_to_json(current_search_results, output_dir) - return {"stop_requested": True, "search_results": current_search_results, "research_plan": plan} + return { + "research_plan": plan, + "current_step_index": current_index + 1, + "error_message": f"LLM failed to call a tool for step {current_step['step']}." 
+ } - # 3. Process Results and Update State - successful_results = [r for r in search_results_list if r.get('status') == 'completed' and r.get('result')] - failed_queries = [r['query'] for r in search_results_list if r.get('status') == 'failed'] - # Combine results with existing ones - all_search_results = state.get('search_results', []) - all_search_results.extend(search_results_list) # Add all results (incl. errors) + # Process tool calls + for tool_call in ai_response.tool_calls: + tool_name = tool_call.get("name") + tool_args = tool_call.get("args", {}) + tool_call_id = tool_call.get("id") # Important for ToolMessage - if failed_queries: - logger.warning(f"Some queries failed: {failed_queries}") - # Optionally add logic to retry failed queries + logger.info(f"LLM requested tool call: {tool_name} with args: {tool_args}") + executed_tool_names.append(tool_name) - if successful_results: - # Optionally, summarize the findings for this step (could be another LLM call) - # current_step['result_summary'] = "Summary of findings..." - current_step['status'] = 'completed' - logger.info(f"Step {current_step['step']} completed successfully.") + # Find the corresponding tool instance + selected_tool = next((t for t in tools if t.name == tool_name), None) + + if not selected_tool: + logger.error(f"LLM called tool '{tool_name}' which is not available.") + # Create a ToolMessage indicating the error + tool_results.append(ToolMessage( + content=f"Error: Tool '{tool_name}' not found.", + tool_call_id=tool_call_id + )) + continue # Skip to next tool call if any + + # Execute the tool + try: + # Stop check before executing the tool (tool itself also checks) + stop_event = _AGENT_STOP_FLAGS.get(task_id) + if stop_event and stop_event.is_set(): + logger.info(f"Stop requested before executing tool: {tool_name}") + # How to report this back? Maybe skip execution, return special state? 
+ # Let's update state and return stop_requested = True + current_step['status'] = 'pending' # Not completed due to stop + _save_plan_to_md(plan, output_dir) + return {"stop_requested": True, "research_plan": plan} + + logger.info(f"Executing tool: {tool_name}") + # Assuming tool functions handle async correctly + tool_output = await selected_tool.ainvoke(tool_args) + logger.info(f"Tool '{tool_name}' executed successfully.") + browser_tool_called = "parallel_browser_search" in executed_tool_names + # Append result to overall search results + current_search_results = state.get('search_results', []) + if browser_tool_called: # Specific handling for browser tool output + current_search_results.extend(tool_output) + else: # Handle other tool outputs (e.g., file tools return strings) + # Store it associated with the step? Or a generic log? + # Let's just log it for now. Need better handling for diverse tool outputs. + logger.info(f"Result from tool '{tool_name}': {str(tool_output)[:200]}...") + + # Store result for potential next LLM call (if we were doing multi-turn) + tool_results.append(ToolMessage( + content=json.dumps(tool_output), + tool_call_id=tool_call_id + )) + + except Exception as e: + logger.error(f"Error executing tool '{tool_name}': {e}", exc_info=True) + tool_results.append(ToolMessage( + content=f"Error executing tool {tool_name}: {e}", + tool_call_id=tool_call_id + )) + # Also update overall state search_results with error? + current_search_results = state.get('search_results', []) + current_search_results.append( + {"tool_name": tool_name, "args": tool_args, "status": "failed", "error": str(e)}) + + # Basic check: Did the browser tool run at all? 
(More specific checks needed) + browser_tool_called = "parallel_browser_search" in executed_tool_names + # We might need a more nuanced status based on the *content* of tool_results + step_failed = any("Error:" in str(tr.content) for tr in tool_results) or not browser_tool_called + + if step_failed: + logger.warning(f"Step {current_step['step']} failed or did not yield results via browser search.") + current_step['status'] = 'failed' + current_step[ + 'result_summary'] = f"Tool execution failed or browser tool not used. Errors: {[tr.content for tr in tool_results if 'Error' in str(tr.content)]}" else: - # Decide how to handle steps with no successful results - logger.warning(f"Step {current_step['step']} completed but yielded no successful results.") - current_step['status'] = 'failed' # Or 'completed_no_results' + logger.info(f"Step {current_step['step']} completed using tool(s): {executed_tool_names}.") + current_step['status'] = 'completed' + + current_step['result_summary'] = f"Executed tool(s): {', '.join(executed_tool_names)}." 
- # Update the plan file on disk _save_plan_to_md(plan, output_dir) - # Update the search results file on disk - _save_search_results_to_json(all_search_results, output_dir) + _save_search_results_to_json(current_search_results, output_dir) return { "research_plan": plan, - "search_results": all_search_results, + "search_results": current_search_results, # Update with new results "current_step_index": current_index + 1, - "error_message": None if not failed_queries else f"Failed queries: {failed_queries}" + "messages": state["messages"] + current_task_message + [ai_response] + tool_results, + # Optionally return the tool_results messages if needed by downstream nodes } except Exception as e: - logger.error(f"Error during research execution for step {current_step['step']}: {e}", exc_info=True) + logger.error(f"Unhandled error during research execution node for step {current_step['step']}: {e}", + exc_info=True) current_step['status'] = 'failed' _save_plan_to_md(plan, output_dir) return { "research_plan": plan, - "current_step_index": current_index + 1, # Move to next step even if failed? Or retry? Let's move on. - "error_message": f"Execution Error on step {current_step['step']}: {e}" + "current_step_index": current_index + 1, # Move on even if error? 
+ "error_message": f"Core Execution Error on step {current_step['step']}: {e}" } @@ -668,15 +778,22 @@ class DeepSearchAgent: self.stop_event: Optional[threading.Event] = None self.runner: Optional[asyncio.Task] = None # To hold the asyncio task for run - async def _setup_tools(self) -> List[Tool]: + async def _setup_tools(self, task_id: str, stop_event: threading.Event) -> List[Tool]: """Sets up the basic tools (File I/O) and optional MCP tools.""" - tools = [WriteFileTool(), ReadFileTool(), ListDirectoryTool(), CopyFileTool(), - MoveFileTool()] # Basic file operations - + tools = [WriteFileTool(), ReadFileTool(), ListDirectoryTool()] # Basic file operations + browser_use_tool = create_browser_search_tool( + llm=self.llm, + browser_config=self.browser_config, + task_id=task_id, + stop_event=stop_event + ) + tools += [browser_use_tool] # Add MCP tools if config is provided if self.mcp_server_config: try: logger.info("Setting up MCP client and tools...") + if self.mcp_client: + await self.mcp_client.__aexit__(None, None, None) self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config) mcp_tools = self.mcp_client.get_tools() logger.info(f"Loaded {len(mcp_tools)} MCP tools.") @@ -744,12 +861,13 @@ class DeepSearchAgent: self.stop_event = threading.Event() _AGENT_STOP_FLAGS[self.current_task_id] = self.stop_event - agent_tools = await self._setup_tools() + agent_tools = await self._setup_tools(self.current_task_id, self.stop_event) initial_state: DeepResearchState = { "task_id": self.current_task_id, "topic": topic, "research_plan": [], "search_results": [], + "messages": [], "llm": self.llm, "tools": agent_tools, "output_dir": output_dir, From eba5788b154abad9b4c61a403c56d63b111fa03e Mon Sep 17 00:00:00 2001 From: vvincent1234 Date: Wed, 30 Apr 2025 09:32:58 +0800 Subject: [PATCH 15/35] add deep research tab --- .gitignore | 1 + .../deep_research/deep_research_agent.py | 112 ++--- src/utils/llm_provider.py | 21 +- 
.../components/deep_research_agent_tab.py | 440 +++++++++++++++++- src/webui/webui_manager.py | 10 + tests/test_agents.py | 23 +- 6 files changed, 512 insertions(+), 95 deletions(-) diff --git a/.gitignore b/.gitignore index a3f269d..548d48d 100644 --- a/.gitignore +++ b/.gitignore @@ -187,3 +187,4 @@ data/ # For Config Files (Current Settings) .config.pkl +*.pdf \ No newline at end of file diff --git a/src/agent/deep_research/deep_research_agent.py b/src/agent/deep_research/deep_research_agent.py index b87eb7a..db81895 100644 --- a/src/agent/deep_research/deep_research_agent.py +++ b/src/agent/deep_research/deep_research_agent.py @@ -35,15 +35,11 @@ from src.utils.mcp_client import setup_mcp_client_and_tools logger = logging.getLogger(__name__) # Constants -TMP_DIR = Path("./tmp/deep_research") -os.makedirs(TMP_DIR, exist_ok=True) REPORT_FILENAME = "report.md" PLAN_FILENAME = "research_plan.md" SEARCH_INFO_FILENAME = "search_info.json" -MAX_PARALLEL_BROWSERS = 1 _AGENT_STOP_FLAGS = {} -_BROWSER_AGENT_INSTANCES = {} # To store running browser agents for stopping async def run_single_browser_task( @@ -119,6 +115,7 @@ async def run_single_browser_task( 2. The title of the source page or document. 3. The URL of the source. Focus on accuracy and relevance. Avoid irrelevant details. + PDF cannot directly extract _content, please try to download first, then using read_file, if you can't save or read, please try other methods. """ bu_agent_instance = BrowserUseAgent( @@ -131,8 +128,7 @@ async def run_single_browser_task( ) # Store instance for potential stop() call - task_key = f"{task_id}_{uuid.uuid4()}" # Unique key for this run - _BROWSER_AGENT_INSTANCES[task_key] = bu_agent_instance + task_key = f"{task_id}_{uuid.uuid4()}" # --- Run with Stop Check --- # BrowserUseAgent needs to internally check a stop signal or have a stop method. 
@@ -162,17 +158,17 @@ async def run_single_browser_task( logger.error(f"Error during browser task for query '{task_query}': {e}", exc_info=True) return {"query": task_query, "error": str(e), "status": "failed"} finally: - if task_key in _BROWSER_AGENT_INSTANCES: - del _BROWSER_AGENT_INSTANCES[task_key] if bu_browser_context: try: await bu_browser_context.close() + bu_browser_context = None logger.info("Closed browser context.") except Exception as e: logger.error(f"Error closing browser context: {e}") if bu_browser: try: await bu_browser.close() + bu_browser = None logger.info("Closed browser.") except Exception as e: logger.error(f"Error closing browser: {e}") @@ -180,15 +176,16 @@ async def run_single_browser_task( class BrowserSearchInput(BaseModel): queries: List[str] = Field( - description=f"List of distinct search queries (max {MAX_PARALLEL_BROWSERS}) to find information relevant to the research task.") + description=f"List of distinct search queries to find information relevant to the research task.") async def _run_browser_search_tool( queries: List[str], task_id: str, # Injected dependency llm: Any, # Injected dependency - browser_config: Dict[str, Any], # Injected dependency - stop_event: threading.Event # Injected dependency + browser_config: Dict[str, Any], + stop_event: threading.Event, + max_parallel_browsers: int = 1 ) -> List[Dict[str, Any]]: """ Internal function to execute parallel browser searches based on LLM-provided queries. 
@@ -196,11 +193,11 @@ async def _run_browser_search_tool( """ # Limit queries just in case LLM ignores the description - queries = queries[:MAX_PARALLEL_BROWSERS] + queries = queries[:max_parallel_browsers] logger.info(f"[Browser Tool {task_id}] Running search for {len(queries)} queries: {queries}") results = [] - semaphore = asyncio.Semaphore(MAX_PARALLEL_BROWSERS) + semaphore = asyncio.Semaphore(max_parallel_browsers) async def task_wrapper(query): async with semaphore: @@ -240,7 +237,8 @@ def create_browser_search_tool( llm: Any, browser_config: Dict[str, Any], task_id: str, - stop_event: threading.Event + stop_event: threading.Event, + max_parallel_browsers: int = 1, ) -> StructuredTool: """Factory function to create the browser search tool with necessary dependencies.""" # Use partial to bind the dependencies that aren't part of the LLM call arguments @@ -251,15 +249,15 @@ def create_browser_search_tool( llm=llm, browser_config=browser_config, stop_event=stop_event, + max_parallel_browsers=max_parallel_browsers ) return StructuredTool.from_function( coroutine=bound_tool_func, name="parallel_browser_search", description=f"""Use this tool to actively search the web for information related to a specific research task or question. -It runs up to {MAX_PARALLEL_BROWSERS} searches in parallel using a browser agent for better results than simple scraping. -Provide a list of distinct search queries that are likely to yield relevant information. -The tool returns a list of results, each containing the original query, the status (completed, failed, stopped), and the summarized information found (or an error message).""", +It runs up to {max_parallel_browsers} searches in parallel using a browser agent for better results than simple scraping. 
+Provide a list of distinct search queries that are likely to yield relevant information.""", args_schema=BrowserSearchInput, ) @@ -747,7 +745,7 @@ def should_continue(state: DeepResearchState) -> str: return "end_run" # Should not happen if planning node ran correctly # Check if there are pending steps in the plan - if current_index < len(plan): + if current_index < 2: logger.info( f"Plan has pending steps (current index {current_index}/{len(plan)}). Routing to Research Execution.") return "execute_research" @@ -758,7 +756,7 @@ def should_continue(state: DeepResearchState) -> str: # --- DeepSearchAgent Class --- -class DeepSearchAgent: +class DeepResearchAgent: def __init__(self, llm: Any, browser_config: Dict[str, Any], mcp_server_config: Optional[Dict[str, Any]] = None): """ Initializes the DeepSearchAgent. @@ -773,28 +771,30 @@ class DeepSearchAgent: self.browser_config = browser_config self.mcp_server_config = mcp_server_config self.mcp_client = None + self.stopped = False self.graph = self._compile_graph() self.current_task_id: Optional[str] = None self.stop_event: Optional[threading.Event] = None self.runner: Optional[asyncio.Task] = None # To hold the asyncio task for run - async def _setup_tools(self, task_id: str, stop_event: threading.Event) -> List[Tool]: + async def _setup_tools(self, task_id: str, stop_event: threading.Event, max_parallel_browsers: int = 1) -> List[ + Tool]: """Sets up the basic tools (File I/O) and optional MCP tools.""" tools = [WriteFileTool(), ReadFileTool(), ListDirectoryTool()] # Basic file operations browser_use_tool = create_browser_search_tool( llm=self.llm, browser_config=self.browser_config, task_id=task_id, - stop_event=stop_event + stop_event=stop_event, + max_parallel_browsers=max_parallel_browsers ) tools += [browser_use_tool] # Add MCP tools if config is provided if self.mcp_server_config: try: logger.info("Setting up MCP client and tools...") - if self.mcp_client: - await self.mcp_client.__aexit__(None, None, None) - 
self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config) + if not self.mcp_client: + self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config) mcp_tools = self.mcp_client.get_tools() logger.info(f"Loaded {len(mcp_tools)} MCP tools.") tools.extend(mcp_tools) @@ -802,8 +802,13 @@ class DeepSearchAgent: logger.error(f"Failed to set up MCP tools: {e}", exc_info=True) elif self.mcp_server_config: logger.warning("MCP server config provided, but setup function unavailable.") + tools_map = {tool.name: tool for tool in tools} + return tools_map.values() - return tools + async def close_mcp_client(self): + if self.mcp_client: + await self.mcp_client.__aexit__(None, None, None) + self.mcp_client = None def _compile_graph(self) -> StateGraph: """Compiles the Langgraph state machine.""" @@ -836,7 +841,9 @@ class DeepSearchAgent: app = workflow.compile() return app - async def run(self, topic: str, task_id: Optional[str] = None) -> Dict[str, Any]: + async def run(self, topic: str, task_id: Optional[str] = None, save_dir: str = "./tmp/deep_research", + max_parallel_browsers: int = 1) -> Dict[ + str, Any]: """ Starts the deep research process (Async Generator Version). 
@@ -853,7 +860,7 @@ class DeepSearchAgent: return {"status": "error", "message": "Agent already running.", "task_id": self.current_task_id} self.current_task_id = task_id if task_id else str(uuid.uuid4()) - output_dir = os.path.join(TMP_DIR, self.current_task_id) + output_dir = os.path.join(save_dir, self.current_task_id) os.makedirs(output_dir, exist_ok=True) logger.info(f"[AsyncGen] Starting research task ID: {self.current_task_id} for topic: '{topic}'") @@ -861,7 +868,7 @@ class DeepSearchAgent: self.stop_event = threading.Event() _AGENT_STOP_FLAGS[self.current_task_id] = self.stop_event - agent_tools = await self._setup_tools(self.current_task_id, self.stop_event) + agent_tools = await self._setup_tools(self.current_task_id, self.stop_event, max_parallel_browsers) initial_state: DeepResearchState = { "task_id": self.current_task_id, "topic": topic, @@ -933,19 +940,7 @@ class DeepSearchAgent: # final_state will remain None or the state before the error finally: logger.info(f"Cleaning up resources for task {self.current_task_id}") - task_id_to_clean = self.current_task_id # Store before potentially clearing - if task_id_to_clean in _AGENT_STOP_FLAGS: - del _AGENT_STOP_FLAGS[task_id_to_clean] - # Stop any potentially lingering browser agents for this task - await self._stop_lingering_browsers(task_id_to_clean) - # Ensure the instance tracker is clean (should be handled by tool's finally block) - lingering_keys = [k for k in _BROWSER_AGENT_INSTANCES if k.startswith(f"{task_id_to_clean}_")] - if lingering_keys: - logger.warning( - f"{len(lingering_keys)} lingering browser instances found in tracker for task {task_id_to_clean} after cleanup attempt.") - # Force clear them from the tracker dict - for key in lingering_keys: - del _BROWSER_AGENT_INSTANCES[key] + task_id_to_clean = self.current_task_id self.stop_event = None self.current_task_id = None @@ -961,28 +956,6 @@ class DeepSearchAgent: "final_state": final_state if final_state else {} # Return the final state 
dict } - async def _stop_lingering_browsers(self, task_id): - """Attempts to stop any BrowserUseAgent instances associated with the task_id.""" - keys_to_stop = [key for key in _BROWSER_AGENT_INSTANCES if key.startswith(f"{task_id}_")] - if not keys_to_stop: - return - - logger.warning( - f"Found {len(keys_to_stop)} potentially lingering browser agents for task {task_id}. Attempting stop...") - for key in keys_to_stop: - agent_instance = _BROWSER_AGENT_INSTANCES.get(key) - if agent_instance and hasattr(agent_instance, 'stop'): - try: - # Assuming BU agent has an async stop method - await agent_instance.stop() - logger.info(f"Called stop() on browser agent instance {key}") - except Exception as e: - logger.error(f"Error calling stop() on browser agent instance {key}: {e}") - # Instance should be removed by the finally block in run_single_browser_task - # but we ensure removal here too. - if key in _BROWSER_AGENT_INSTANCES: - del _BROWSER_AGENT_INSTANCES[key] - def stop(self): """Signals the currently running agent task to stop.""" if not self.current_task_id or not self.stop_event: @@ -991,14 +964,7 @@ class DeepSearchAgent: logger.info(f"Stop requested for task ID: {self.current_task_id}") self.stop_event.set() # Signal the stop event + self.stopped = True - # Additionally, try to stop the browser agents directly - # Need to run this async in the background or manage event loops carefully - async def do_stop_browsers(): - await self._stop_lingering_browsers(self.current_task_id) - - try: - loop = asyncio.get_running_loop() - loop.create_task(do_stop_browsers()) - except RuntimeError: # No running loop in current thread - asyncio.run(do_stop_browsers()) + def close(self): + self.stopped = False diff --git a/src/utils/llm_provider.py b/src/utils/llm_provider.py index 33e9328..4858478 100644 --- a/src/utils/llm_provider.py +++ b/src/utils/llm_provider.py @@ -46,6 +46,8 @@ from langchain_google_genai import ChatGoogleGenerativeAI from langchain_ollama import ChatOllama 
from langchain_openai import AzureChatOpenAI, ChatOpenAI from langchain_ibm import ChatWatsonx +from langchain_aws import ChatBedrock +from pydantic import SecretStr from src.utils import config @@ -154,7 +156,7 @@ def get_llm_model(provider: str, **kwargs): :param kwargs: :return: """ - if provider not in ["ollama"]: + if provider not in ["ollama", "bedrock"]: env_var = f"{provider.upper()}_API_KEY" api_key = kwargs.get("api_key", "") or os.getenv(env_var, "") if not api_key: @@ -263,6 +265,23 @@ def get_llm_model(provider: str, **kwargs): azure_endpoint=base_url, api_key=api_key, ) + elif provider == "bedrock": + if not kwargs.get("base_url", ""): + access_key_id = os.getenv('AWS_ACCESS_KEY_ID', '') + else: + access_key_id = kwargs.get("base_url") + + if not kwargs.get("api_key", ""): + api_key = os.getenv('AWS_SECRET_ACCESS_KEY', '') + else: + api_key = kwargs.get("api_key") + return ChatBedrock( + model=kwargs.get("model_name", 'anthropic.claude-3-5-sonnet-20241022-v2:0'), + region=kwargs.get("bedrock_region", 'us-west-2'), # with higher quota + aws_access_key_id=SecretStr(access_key_id), + aws_secret_access_key=SecretStr(api_key), + temperature=kwargs.get("temperature", 0.0), + ) elif provider == "alibaba": if not kwargs.get("base_url", ""): base_url = os.getenv("ALIBABA_ENDPOINT", "https://dashscope.aliyuncs.com/compatible-mode/v1") diff --git a/src/webui/components/deep_research_agent_tab.py b/src/webui/components/deep_research_agent_tab.py index eeaf58a..66c745f 100644 --- a/src/webui/components/deep_research_agent_tab.py +++ b/src/webui/components/deep_research_agent_tab.py @@ -1,8 +1,382 @@ import gradio as gr from gradio.components import Component +from functools import partial from src.webui.webui_manager import WebuiManager from src.utils import config +import logging +import os +from typing import Any, Dict, AsyncGenerator, Optional, Tuple, Union +import asyncio +import json +from src.agent.deep_research.deep_research_agent import DeepResearchAgent 
+from src.utils import llm_provider + +logger = logging.getLogger(__name__) + + +async def _initialize_llm(provider: Optional[str], model_name: Optional[str], temperature: float, + base_url: Optional[str], api_key: Optional[str], num_ctx: Optional[int] = None): + """Initializes the LLM based on settings. Returns None if provider/model is missing.""" + if not provider or not model_name: + logger.info("LLM Provider or Model Name not specified, LLM will be None.") + return None + try: + logger.info(f"Initializing LLM: Provider={provider}, Model={model_name}, Temp={temperature}") + # Use your actual LLM provider logic here + llm = llm_provider.get_llm_model( + provider=provider, + model_name=model_name, + temperature=temperature, + base_url=base_url or None, + api_key=api_key or None, + num_ctx=num_ctx if provider == "ollama" else None + ) + return llm + except Exception as e: + logger.error(f"Failed to initialize LLM: {e}", exc_info=True) + gr.Warning( + f"Failed to initialize LLM '{model_name}' for provider '{provider}'. Please check settings. 
Error: {e}") + return None + + +def _read_file_safe(file_path: str) -> Optional[str]: + """Safely read a file, returning None if it doesn't exist or on error.""" + if not os.path.exists(file_path): + return None + try: + with open(file_path, 'r', encoding='utf-8') as f: + return f.read() + except Exception as e: + logger.error(f"Error reading file {file_path}: {e}") + return None + + +# --- Deep Research Agent Specific Logic --- + +async def run_deep_research(webui_manager: WebuiManager, components: Dict[Component, Any]) -> AsyncGenerator[ + Dict[Component, Any], None]: + """Handles initializing and running the DeepResearchAgent.""" + + # --- Get Components --- + research_task_comp = webui_manager.get_component_by_id("deep_research_agent.research_task") + resume_task_id_comp = webui_manager.get_component_by_id("deep_research_agent.resume_task_id") + parallel_num_comp = webui_manager.get_component_by_id("deep_research_agent.parallel_num") + save_dir_comp = webui_manager.get_component_by_id( + "deep_research_agent.max_query") # Note: component ID seems misnamed in original code + start_button_comp = webui_manager.get_component_by_id("deep_research_agent.start_button") + stop_button_comp = webui_manager.get_component_by_id("deep_research_agent.stop_button") + markdown_display_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_display") + markdown_download_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_download") + mcp_server_config_comp = webui_manager.get_component_by_id("deep_research_agent.mcp_server_config") + + # --- 1. 
Get Task and Settings --- + task_topic = components.get(research_task_comp, "").strip() + task_id_to_resume = components.get(resume_task_id_comp, "").strip() or None + max_parallel_agents = int(components.get(parallel_num_comp, 1)) + base_save_dir = components.get(save_dir_comp, "./tmp/deep_research") + mcp_server_config_str = components.get(mcp_server_config_comp) + mcp_config = json.loads(mcp_server_config_str) if mcp_server_config_str else None + + if not task_topic: + gr.Warning("Please enter a research task.") + yield {start_button_comp: gr.update(interactive=True)} # Re-enable start button + return + + # Store base save dir for stop handler + webui_manager._dr_save_dir = base_save_dir + os.makedirs(base_save_dir, exist_ok=True) + + # --- 2. Initial UI Update --- + yield { + start_button_comp: gr.update(value="ā³ Running...", interactive=False), + stop_button_comp: gr.update(interactive=True), + research_task_comp: gr.update(interactive=False), + resume_task_id_comp: gr.update(interactive=False), + parallel_num_comp: gr.update(interactive=False), + save_dir_comp: gr.update(interactive=False), + markdown_display_comp: gr.update(value="Starting research..."), + markdown_download_comp: gr.update(value=None, interactive=False) + } + + agent_task = None + running_task_id = None + plan_file_path = None + report_file_path = None + last_plan_content = None + last_plan_mtime = 0 + + try: + # --- 3. 
Get LLM and Browser Config from other tabs --- + # Access settings values via components dict, getting IDs from webui_manager + def get_setting(tab: str, key: str, default: Any = None): + comp = webui_manager.id_to_component.get(f"{tab}.{key}") + return components.get(comp, default) if comp else default + + # LLM Config (from agent_settings tab) + llm_provider_name = get_setting("agent_settings", "llm_provider") + llm_model_name = get_setting("agent_settings", "llm_model_name") + llm_temperature = get_setting("agent_settings", "llm_temperature", 0.5) # Default if not found + llm_base_url = get_setting("agent_settings", "llm_base_url") + llm_api_key = get_setting("agent_settings", "llm_api_key") + ollama_num_ctx = get_setting("agent_settings", "ollama_num_ctx") + + llm = await _initialize_llm( + llm_provider_name, llm_model_name, llm_temperature, llm_base_url, llm_api_key, + ollama_num_ctx if llm_provider_name == "ollama" else None + ) + if not llm: + raise ValueError("LLM Initialization failed. Please check Agent Settings.") + + # Browser Config (from browser_settings tab) + # Note: DeepResearchAgent constructor takes a dict, not full Browser/Context objects + browser_config_dict = { + "headless": get_setting("browser_settings", "headless", False), + "disable_security": get_setting("browser_settings", "disable_security", True), + "browser_binary_path": get_setting("browser_settings", "browser_binary_path"), + "user_data_dir": get_setting("browser_settings", "browser_user_data_dir"), + "window_width": int(get_setting("browser_settings", "window_w", 1280)), + "window_height": int(get_setting("browser_settings", "window_h", 1100)), + # Add other relevant fields if DeepResearchAgent accepts them + } + + # --- 4. Initialize or Get Agent --- + if not webui_manager._dr_agent: + webui_manager._dr_agent = DeepResearchAgent( + llm=llm, + browser_config=browser_config_dict, + mcp_server_config=mcp_config + ) + logger.info("DeepResearchAgent initialized.") + + # --- 5. 
Start Agent Run --- + agent_run_coro = await webui_manager._dr_agent.run( + topic=task_topic, + task_id=task_id_to_resume, + save_dir=base_save_dir, + max_parallel_browsers=max_parallel_agents + ) + agent_task = asyncio.create_task(agent_run_coro) + webui_manager._dr_current_task = agent_task + + # Wait briefly for the agent to start and potentially create the task ID/folder + await asyncio.sleep(1.0) + + # Determine the actual task ID being used (agent sets this) + running_task_id = webui_manager._dr_agent.current_task_id + if not running_task_id: + # Agent might not have set it yet, try to get from result later? Risky. + # Or derive from resume_task_id if provided? + running_task_id = task_id_to_resume + if not running_task_id: + logger.warning("Could not determine running task ID immediately.") + # We can still monitor, but might miss initial plan if ID needed for path + else: + logger.info(f"Assuming task ID based on resume ID: {running_task_id}") + else: + logger.info(f"Agent started with Task ID: {running_task_id}") + + webui_manager._dr_task_id = running_task_id # Store for stop handler + + # --- 6. 
Monitor Progress via research_plan.md --- + if running_task_id: + task_specific_dir = os.path.join(base_save_dir, str(running_task_id)) + plan_file_path = os.path.join(task_specific_dir, "research_plan.md") + report_file_path = os.path.join(task_specific_dir, "report.md") + logger.info(f"Monitoring plan file: {plan_file_path}") + else: + logger.warning("Cannot monitor plan file: Task ID unknown.") + plan_file_path = None + + while not agent_task.done(): + update_dict = {} + + # Check for stop signal (agent sets self.stopped) + agent_stopped = getattr(webui_manager._dr_agent, 'stopped', False) + if agent_stopped: + logger.info("Stop signal detected from agent state.") + break # Exit monitoring loop + + # Check and update research plan display + if plan_file_path: + try: + current_mtime = os.path.getmtime(plan_file_path) if os.path.exists(plan_file_path) else 0 + if current_mtime > last_plan_mtime: + logger.info(f"Detected change in {plan_file_path}") + plan_content = _read_file_safe(plan_file_path) + if plan_content is not None and plan_content != last_plan_content: + update_dict[markdown_display_comp] = gr.update(value=plan_content) + last_plan_content = plan_content + last_plan_mtime = current_mtime + elif plan_content is None: + # File might have been deleted or became unreadable + last_plan_mtime = 0 # Reset to force re-read attempt later + except Exception as e: + logger.warning(f"Error checking/reading plan file {plan_file_path}: {e}") + # Avoid continuous logging for the same error + await asyncio.sleep(2.0) + + # Yield updates if any + if update_dict: + yield update_dict + + await asyncio.sleep(1.0) # Check file changes every second + + # --- 7. Task Finalization --- + logger.info("Agent task processing finished. Awaiting final result...") + final_result_dict = await agent_task # Get result or raise exception + logger.info(f"Agent run completed. 
Result keys: {final_result_dict.keys() if final_result_dict else 'None'}") + + # Try to get task ID from result if not known before + if not running_task_id and final_result_dict and 'task_id' in final_result_dict: + running_task_id = final_result_dict['task_id'] + webui_manager._dr_task_id = running_task_id + task_specific_dir = os.path.join(base_save_dir, str(running_task_id)) + report_file_path = os.path.join(task_specific_dir, "report.md") + logger.info(f"Task ID confirmed from result: {running_task_id}") + + final_ui_update = {} + if report_file_path and os.path.exists(report_file_path): + logger.info(f"Loading final report from: {report_file_path}") + report_content = _read_file_safe(report_file_path) + if report_content: + final_ui_update[markdown_display_comp] = gr.update(value=report_content) + final_ui_update[markdown_download_comp] = gr.File(value=report_file_path, + label=f"Report ({running_task_id}.md)", + interactive=True) + else: + final_ui_update[markdown_display_comp] = gr.update( + value="# Research Complete\n\n*Error reading final report file.*") + elif final_result_dict and 'report' in final_result_dict: + logger.info("Using report content directly from agent result.") + # If agent directly returns report content + final_ui_update[markdown_display_comp] = gr.update(value=final_result_dict['report']) + # Cannot offer download if only content is available + final_ui_update[markdown_download_comp] = gr.update(value=None, label="Download Research Report", + interactive=False) + else: + logger.warning("Final report file not found and not in result dict.") + final_ui_update[markdown_display_comp] = gr.update(value="# Research Complete\n\n*Final report not found.*") + + yield final_ui_update + + + except Exception as e: + logger.error(f"Error during Deep Research Agent execution: {e}", exc_info=True) + gr.Error(f"Research failed: {e}") + yield {markdown_display_comp: gr.update(value=f"# Research Failed\n\n**Error:**\n```\n{e}\n```")} + + finally: + # 
--- 8. Final UI Reset --- + webui_manager._dr_current_task = None # Clear task reference + webui_manager._dr_task_id = None # Clear running task ID + # Optionally close agent resources if needed, e.g., browser pool + if webui_manager._dr_agent and hasattr(webui_manager._dr_agent, 'close'): + try: + await webui_manager._dr_agent.close() # Assuming an async close method + logger.info("Closed DeepResearchAgent resources.") + webui_manager._dr_agent = None + except Exception as e_close: + logger.error(f"Error closing DeepResearchAgent: {e_close}") + + yield { + start_button_comp: gr.update(value="ā–¶ļø Run", interactive=True), + stop_button_comp: gr.update(interactive=False), + research_task_comp: gr.update(interactive=True), + resume_task_id_comp: gr.update(interactive=True), + parallel_num_comp: gr.update(interactive=True), + save_dir_comp: gr.update(interactive=True), + # Keep download button enabled if file exists + markdown_download_comp: gr.update() if report_file_path and os.path.exists(report_file_path) else gr.update( + interactive=False) + } + + +async def stop_deep_research(webui_manager: WebuiManager) -> Dict[Component, Any]: + """Handles the Stop button click.""" + logger.info("Stop button clicked for Deep Research.") + agent = webui_manager._dr_agent + task = webui_manager._dr_current_task + task_id = webui_manager._dr_task_id + base_save_dir = webui_manager._dr_save_dir + + stop_button_comp = webui_manager.get_component_by_id("deep_research_agent.stop_button") + start_button_comp = webui_manager.get_component_by_id("deep_research_agent.start_button") + markdown_display_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_display") + markdown_download_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_download") + + final_update = { + stop_button_comp: gr.update(interactive=False, value="ā¹ļø Stopping...") + } + + if agent and task and not task.done(): + logger.info("Signalling DeepResearchAgent to stop.") + if 
hasattr(agent, 'stop'): + try: + # Assuming stop is synchronous or sets a flag quickly + agent.stop() + except Exception as e: + logger.error(f"Error calling agent.stop(): {e}") + else: + logger.warning("Agent has no 'stop' method. Task cancellation might not be graceful.") + # Task cancellation is handled by the run_deep_research finally block if needed + + # The run_deep_research loop should detect the stop and exit. + # We yield an intermediate "Stopping..." state. The final reset is done by run_deep_research. + + # Try to show the final report if available after stopping + await asyncio.sleep(1.5) # Give agent a moment to write final files potentially + report_file_path = None + if task_id and base_save_dir: + report_file_path = os.path.join(base_save_dir, str(task_id), "report.md") + + if report_file_path and os.path.exists(report_file_path): + report_content = _read_file_safe(report_file_path) + if report_content: + final_update[markdown_display_comp] = gr.update( + value=report_content + "\n\n---\n*Research stopped by user.*") + final_update[markdown_download_comp] = gr.File(value=report_file_path, label=f"Report ({task_id}.md)", + interactive=True) + else: + final_update[markdown_display_comp] = gr.update( + value="# Research Stopped\n\n*Error reading final report file after stop.*") + else: + final_update[markdown_display_comp] = gr.update(value="# Research Stopped by User") + + # Keep start button disabled, run_deep_research finally block will re-enable it. 
+ final_update[start_button_comp] = gr.update(interactive=False) + + else: + logger.warning("Stop clicked but no active research task found.") + # Reset UI state just in case + final_update = { + start_button_comp: gr.update(interactive=True), + stop_button_comp: gr.update(interactive=False), + webui_manager.get_component_by_id("deep_research_agent.research_task"): gr.update(interactive=True), + webui_manager.get_component_by_id("deep_research_agent.resume_task_id"): gr.update(interactive=True), + webui_manager.get_component_by_id("deep_research_agent.max_iteration"): gr.update(interactive=True), + webui_manager.get_component_by_id("deep_research_agent.max_query"): gr.update(interactive=True), + } + + return final_update + + +def update_mcp_server(mcp_file: str, webui_manager: WebuiManager): + """ + Update the MCP server. + """ + if hasattr(webui_manager, "dr_agent") and webui_manager.dr_agent: + logger.warning("āš ļø Close controller because mcp file has changed!") + webui_manager.dr_agent.close_mcp_client() + + if not mcp_file or not os.path.exists(mcp_file) or not mcp_file.endswith('.json'): + logger.warning(f"{mcp_file} is not a valid MCP file.") + return None, gr.update(visible=False) + + with open(mcp_file, 'r') as f: + mcp_server = json.load(f) + + return json.dumps(mcp_server, indent=2), gr.update(visible=True) def create_deep_research_agent_tab(webui_manager: WebuiManager): @@ -12,30 +386,70 @@ def create_deep_research_agent_tab(webui_manager: WebuiManager): input_components = set(webui_manager.get_components()) tab_components = {} - research_task = gr.Textbox(label="Research Task", lines=5, - value="Give me a detailed plan for traveling to Switzerland on June 1st.", - interactive=True) - with gr.Row(): - max_iteration = gr.Number(label="Max Search Iteration", value=3, - precision=0, - interactive=True) # precision=0 ē”®äæę˜Æę•“ę•° - max_query = gr.Number(label="Max Query per Iteration", value=1, - precision=0, - interactive=True) # precision=0 
ē”®äæę˜Æę•“ę•° + with gr.Group(): + with gr.Row(): + mcp_json_file = gr.File(label="MCP server json", interactive=True, file_types=[".json"]) + mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False) + + with gr.Group(): + research_task = gr.Textbox(label="Research Task", lines=5, + value="Give me a detailed plan for traveling to Switzerland on June 1st.", + interactive=True) + with gr.Row(): + resume_task_id = gr.Textbox(label="Resume Task ID", value="", + interactive=True) + parallel_num = gr.Number(label="Parallel Agent Num", value=1, + precision=0, + interactive=True) + max_query = gr.Textbox(label="Research Save Dir", value="./tmp/deep_research", + interactive=True) with gr.Row(): stop_button = gr.Button("ā¹ļø Stop", variant="stop", scale=2) start_button = gr.Button("ā–¶ļø Run", variant="primary", scale=3) - markdown_display = gr.Markdown(label="Research Report") - markdown_download = gr.File(label="Download Research Report", interactive=False) + with gr.Group(): + markdown_display = gr.Markdown(label="Research Report") + markdown_download = gr.File(label="Download Research Report", interactive=False) tab_components.update( dict( research_task=research_task, - max_iteration=max_iteration, + parallel_num=parallel_num, max_query=max_query, start_button=start_button, stop_button=stop_button, markdown_display=markdown_display, markdown_download=markdown_download, + resume_task_id=resume_task_id ) ) webui_manager.add_components("deep_research_agent", tab_components) + webui_manager.init_deep_research_agent() + mcp_json_file.change( + partial(update_mcp_server, webui_manager=webui_manager), + inputs=[mcp_json_file], + outputs=[mcp_server_config, mcp_server_config] + ) + + dr_tab_outputs = list(tab_components.values()) + all_managed_inputs = webui_manager.get_components() + + # --- Define Event Handler Wrappers --- + async def start_wrapper(comps: Dict[Component, Any]) -> AsyncGenerator[Dict[Component, Any], None]: + async for 
update in run_deep_research(webui_manager, comps): + yield update + + async def stop_wrapper() -> AsyncGenerator[Dict[Component, Any], None]: + update_dict = await stop_deep_research(webui_manager) + yield update_dict # Yield the single dict update + + # --- Connect Handlers --- + start_button.click( + fn=start_wrapper, + inputs=all_managed_inputs, + outputs=dr_tab_outputs # Update only components in this tab + ) + + stop_button.click( + fn=stop_wrapper, + inputs=None, + outputs=dr_tab_outputs # Update only components in this tab + ) diff --git a/src/webui/webui_manager.py b/src/webui/webui_manager.py index 5cbd31f..e4cf833 100644 --- a/src/webui/webui_manager.py +++ b/src/webui/webui_manager.py @@ -15,6 +15,7 @@ from browser_use.agent.service import Agent from src.browser.custom_browser import CustomBrowser from src.browser.custom_context import CustomBrowserContext from src.controller.custom_controller import CustomController +from src.agent.deep_research.deep_research_agent import DeepResearchAgent class WebuiManager: @@ -39,6 +40,15 @@ class WebuiManager: self.bu_current_task: Optional[asyncio.Task] = None self.bu_agent_task_id: Optional[str] = None + def init_deep_research_agent(self) -> None: + """ + init deep research agent + """ + self.dr_agent: Optional[DeepResearchAgent] = None + self._dr_current_task = None + self.dr_agent_task_id: Optional[str] = None + self._dr_save_dir: Optional[str] = None + def add_components(self, tab_name: str, components_dict: dict[str, "Component"]) -> None: """ Add tab components diff --git a/tests/test_agents.py b/tests/test_agents.py index 216541a..e71d2b1 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -335,15 +335,19 @@ async def test_browser_use_parallel(): async def test_deep_research_agent(): - from src.agent.deep_research.deep_research_agent import DeepSearchAgent + from src.agent.deep_research.deep_research_agent import DeepResearchAgent, PLAN_FILENAME, REPORT_FILENAME from src.utils import llm_provider 
+ # llm = llm_provider.get_llm_model( + # provider="azure_openai", + # model_name="gpt-4o", + # temperature=0.5, + # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), + # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + # ) + llm = llm_provider.get_llm_model( - provider="azure_openai", - model_name="gpt-4o", - temperature=0.5, - base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), - api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + provider="bedrock", ) mcp_server_config = { @@ -359,7 +363,7 @@ async def test_deep_research_agent(): } browser_config = {"headless": False, "window_width": 1280, "window_height": 1100, "use_own_browser": False} - agent = DeepSearchAgent(llm=llm, browser_config=browser_config, mcp_server_config=mcp_server_config) + agent = DeepResearchAgent(llm=llm, browser_config=browser_config, mcp_server_config=mcp_server_config) research_topic = "Impact of Microplastics on Marine Ecosystems" task_id_to_resume = None # Set this to resume a previous task ID @@ -368,7 +372,10 @@ async def test_deep_research_agent(): try: # Call run and wait for the final result dictionary - result = await agent.run(research_topic, task_id=task_id_to_resume) + result = await agent.run(research_topic, + task_id=task_id_to_resume, + save_dir="./tmp/downloads", + max_parallel_browsers=1) print("\n--- Research Process Ended ---") print(f"Status: {result.get('status')}") From f941819d2908191fbc8affc4f75a95b281bde578 Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 30 Apr 2025 20:38:41 +0800 Subject: [PATCH 16/35] opt deep research --- .../deep_research/deep_research_agent.py | 49 ++++++++---- src/utils/config.py | 5 +- src/utils/llm_provider.py | 17 ----- .../components/deep_research_agent_tab.py | 76 ++++++++----------- src/webui/webui_manager.py | 4 +- tests/test_agents.py | 24 +++--- tests/test_llm_api.py | 10 ++- 7 files changed, 92 insertions(+), 93 deletions(-) diff --git a/src/agent/deep_research/deep_research_agent.py b/src/agent/deep_research/deep_research_agent.py index 
db81895..c9ee3c1 100644 --- a/src/agent/deep_research/deep_research_agent.py +++ b/src/agent/deep_research/deep_research_agent.py @@ -40,6 +40,7 @@ PLAN_FILENAME = "research_plan.md" SEARCH_INFO_FILENAME = "search_info.json" _AGENT_STOP_FLAGS = {} +_BROWSER_AGENT_INSTANCES = {} async def run_single_browser_task( @@ -129,6 +130,7 @@ async def run_single_browser_task( # Store instance for potential stop() call task_key = f"{task_id}_{uuid.uuid4()}" + _BROWSER_AGENT_INSTANCES[task_key] = bu_agent_instance # --- Run with Stop Check --- # BrowserUseAgent needs to internally check a stop signal or have a stop method. @@ -173,6 +175,9 @@ async def run_single_browser_task( except Exception as e: logger.error(f"Error closing browser: {e}") + if task_key in _BROWSER_AGENT_INSTANCES: + del _BROWSER_AGENT_INSTANCES[task_key] + class BrowserSearchInput(BaseModel): queries: List[str] = Field( @@ -257,7 +262,7 @@ def create_browser_search_tool( name="parallel_browser_search", description=f"""Use this tool to actively search the web for information related to a specific research task or question. It runs up to {max_parallel_browsers} searches in parallel using a browser agent for better results than simple scraping. 
-Provide a list of distinct search queries that are likely to yield relevant information.""", +Provide a list of distinct search queries(up to {max_parallel_browsers}) that are likely to yield relevant information.""", args_schema=BrowserSearchInput, ) @@ -296,9 +301,8 @@ class DeepResearchState(TypedDict): def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]: """Loads state from files if they exist.""" state_updates = {} - plan_file = os.path.join(output_dir, task_id, PLAN_FILENAME) - search_file = os.path.join(output_dir, task_id, SEARCH_INFO_FILENAME) - + plan_file = os.path.join(output_dir, PLAN_FILENAME) + search_file = os.path.join(output_dir, SEARCH_INFO_FILENAME) if os.path.exists(plan_file): try: with open(plan_file, 'r', encoding='utf-8') as f: @@ -307,9 +311,9 @@ def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]: step = 1 for line in f: line = line.strip() - if line.startswith(("[x]", "[ ]")): - status = "completed" if line.startswith("[x]") else "pending" - task = line[4:].strip() + if line.startswith(("- [x]", "- [ ]")): + status = "completed" if line.startswith("- [x]") else "pending" + task = line[5:].strip() plan.append( ResearchPlanItem(step=step, task=task, status=status, queries=None, result_summary=None)) step += 1 @@ -321,7 +325,6 @@ def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]: except Exception as e: logger.error(f"Failed to load or parse research plan {plan_file}: {e}") state_updates['error_message'] = f"Failed to load research plan: {e}" - if os.path.exists(search_file): try: with open(search_file, 'r', encoding='utf-8') as f: @@ -342,7 +345,7 @@ def _save_plan_to_md(plan: List[ResearchPlanItem], output_dir: str): with open(plan_file, 'w', encoding='utf-8') as f: f.write("# Research Plan\n\n") for item in plan: - marker = "[x]" if item['status'] == 'completed' else "[ ]" + marker = "- [x]" if item['status'] == 'completed' else "- [ ]" f.write(f"{marker} 
{item['task']}\n") logger.info(f"Research plan saved to {plan_file}") except Exception as e: @@ -545,8 +548,6 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: stop_event = _AGENT_STOP_FLAGS.get(task_id) if stop_event and stop_event.is_set(): logger.info(f"Stop requested before executing tool: {tool_name}") - # How to report this back? Maybe skip execution, return special state? - # Let's update state and return stop_requested = True current_step['status'] = 'pending' # Not completed due to stop _save_plan_to_md(plan, output_dir) return {"stop_requested": True, "research_plan": plan} @@ -668,7 +669,8 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: # Prepare the research plan context plan_summary = "\nResearch Plan Followed:\n" for item in plan: - marker = "[x]" if item['status'] == 'completed' else "[?]" if item['status'] == 'failed' else "[ ]" + marker = "- [x]" if item['status'] == 'completed' else "- [ ] (Failed)" if item[ + 'status'] == 'failed' else "- [ ]" plan_summary += f"{marker} {item['task']}\n" synthesis_prompt = ChatPromptTemplate.from_messages([ @@ -745,7 +747,7 @@ def should_continue(state: DeepResearchState) -> str: return "end_run" # Should not happen if planning node ran correctly # Check if there are pending steps in the plan - if current_index < 2: + if current_index < len(plan): logger.info( f"Plan has pending steps (current index {current_index}/{len(plan)}). 
Routing to Research Execution.") return "execute_research" @@ -956,7 +958,25 @@ class DeepResearchAgent: "final_state": final_state if final_state else {} # Return the final state dict } - def stop(self): + async def _stop_lingering_browsers(self, task_id): + """Attempts to stop any BrowserUseAgent instances associated with the task_id.""" + keys_to_stop = [key for key in _BROWSER_AGENT_INSTANCES if key.startswith(f"{task_id}_")] + if not keys_to_stop: + return + + logger.warning( + f"Found {len(keys_to_stop)} potentially lingering browser agents for task {task_id}. Attempting stop...") + for key in keys_to_stop: + agent_instance = _BROWSER_AGENT_INSTANCES.get(key) + try: + if agent_instance: + # Assuming BU agent has an async stop method + await agent_instance.stop() + logger.info(f"Called stop() on browser agent instance {key}") + except Exception as e: + logger.error(f"Error calling stop() on browser agent instance {key}: {e}") + + async def stop(self): """Signals the currently running agent task to stop.""" if not self.current_task_id or not self.stop_event: logger.info("No agent task is currently running.") @@ -965,6 +985,7 @@ class DeepResearchAgent: logger.info(f"Stop requested for task ID: {self.current_task_id}") self.stop_event.set() # Signal the stop event self.stopped = True + await self._stop_lingering_browsers(self.current_task_id) def close(self): self.stopped = False diff --git a/src/utils/config.py b/src/utils/config.py index 0bfd028..b3d55fe 100644 --- a/src/utils/config.py +++ b/src/utils/config.py @@ -16,12 +16,13 @@ model_names = { "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo", "o3-mini"], "deepseek": ["deepseek-chat", "deepseek-reasoner"], "google": ["gemini-2.0-flash", "gemini-2.0-flash-thinking-exp", "gemini-1.5-flash-latest", - "gemini-1.5-flash-8b-latest", "gemini-2.0-flash-thinking-exp-01-21", "gemini-2.0-pro-exp-02-05"], + "gemini-1.5-flash-8b-latest", "gemini-2.0-flash-thinking-exp-01-21", "gemini-2.0-pro-exp-02-05", + 
"gemini-2.5-pro-preview-03-25", "gemini-2.5-flash-preview-04-17"], "ollama": ["qwen2.5:7b", "qwen2.5:14b", "qwen2.5:32b", "qwen2.5-coder:14b", "qwen2.5-coder:32b", "llama2:7b", "deepseek-r1:14b", "deepseek-r1:32b"], "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"], "mistral": ["pixtral-large-latest", "mistral-large-latest", "mistral-small-latest", "ministral-8b-latest"], - "alibaba": ["qwen-plus", "qwen-max", "qwen-turbo", "qwen-long"], + "alibaba": ["qwen-plus", "qwen-max", "qwen-vl-max", "qwen-vl-plus", "qwen-turbo", "qwen-long"], "moonshot": ["moonshot-v1-32k-vision-preview", "moonshot-v1-8k-vision-preview"], "unbound": ["gemini-2.0-flash", "gpt-4o-mini", "gpt-4o", "gpt-4.5-preview"], "siliconflow": [ diff --git a/src/utils/llm_provider.py b/src/utils/llm_provider.py index 4858478..c285e36 100644 --- a/src/utils/llm_provider.py +++ b/src/utils/llm_provider.py @@ -265,23 +265,6 @@ def get_llm_model(provider: str, **kwargs): azure_endpoint=base_url, api_key=api_key, ) - elif provider == "bedrock": - if not kwargs.get("base_url", ""): - access_key_id = os.getenv('AWS_ACCESS_KEY_ID', '') - else: - access_key_id = kwargs.get("base_url") - - if not kwargs.get("api_key", ""): - api_key = os.getenv('AWS_SECRET_ACCESS_KEY', '') - else: - api_key = kwargs.get("api_key") - return ChatBedrock( - model=kwargs.get("model_name", 'anthropic.claude-3-5-sonnet-20241022-v2:0'), - region=kwargs.get("bedrock_region", 'us-west-2'), # with higher quota - aws_access_key_id=SecretStr(access_key_id), - aws_secret_access_key=SecretStr(api_key), - temperature=kwargs.get("temperature", 0.0), - ) elif provider == "alibaba": if not kwargs.get("base_url", ""): base_url = os.getenv("ALIBABA_ENDPOINT", "https://dashscope.aliyuncs.com/compatible-mode/v1") diff --git a/src/webui/components/deep_research_agent_tab.py b/src/webui/components/deep_research_agent_tab.py index 66c745f..f245f1b 100644 --- a/src/webui/components/deep_research_agent_tab.py +++ 
b/src/webui/components/deep_research_agent_tab.py @@ -84,7 +84,7 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon return # Store base save dir for stop handler - webui_manager._dr_save_dir = base_save_dir + webui_manager.dr_save_dir = base_save_dir os.makedirs(base_save_dir, exist_ok=True) # --- 2. Initial UI Update --- @@ -141,8 +141,8 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon } # --- 4. Initialize or Get Agent --- - if not webui_manager._dr_agent: - webui_manager._dr_agent = DeepResearchAgent( + if not webui_manager.dr_agent: + webui_manager.dr_agent = DeepResearchAgent( llm=llm, browser_config=browser_config_dict, mcp_server_config=mcp_config @@ -150,20 +150,20 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon logger.info("DeepResearchAgent initialized.") # --- 5. Start Agent Run --- - agent_run_coro = await webui_manager._dr_agent.run( + agent_run_coro = webui_manager.dr_agent.run( topic=task_topic, task_id=task_id_to_resume, save_dir=base_save_dir, max_parallel_browsers=max_parallel_agents ) agent_task = asyncio.create_task(agent_run_coro) - webui_manager._dr_current_task = agent_task + webui_manager.dr_current_task = agent_task # Wait briefly for the agent to start and potentially create the task ID/folder await asyncio.sleep(1.0) # Determine the actual task ID being used (agent sets this) - running_task_id = webui_manager._dr_agent.current_task_id + running_task_id = webui_manager.dr_agent.current_task_id if not running_task_id: # Agent might not have set it yet, try to get from result later? Risky. # Or derive from resume_task_id if provided? 
@@ -176,7 +176,7 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon else: logger.info(f"Agent started with Task ID: {running_task_id}") - webui_manager._dr_task_id = running_task_id # Store for stop handler + webui_manager.dr_task_id = running_task_id # Store for stop handler # --- 6. Monitor Progress via research_plan.md --- if running_task_id: @@ -187,12 +187,11 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon else: logger.warning("Cannot monitor plan file: Task ID unknown.") plan_file_path = None - + last_plan_content = None while not agent_task.done(): update_dict = {} - - # Check for stop signal (agent sets self.stopped) - agent_stopped = getattr(webui_manager._dr_agent, 'stopped', False) + update_dict[resume_task_id_comp] = gr.update(value=running_task_id) + agent_stopped = getattr(webui_manager.dr_agent, 'stopped', False) if agent_stopped: logger.info("Stop signal detected from agent state.") break # Exit monitoring loop @@ -204,7 +203,8 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon if current_mtime > last_plan_mtime: logger.info(f"Detected change in {plan_file_path}") plan_content = _read_file_safe(plan_file_path) - if plan_content is not None and plan_content != last_plan_content: + if last_plan_content is None or ( + plan_content is not None and plan_content != last_plan_content): update_dict[markdown_display_comp] = gr.update(value=plan_content) last_plan_content = plan_content last_plan_mtime = current_mtime @@ -230,7 +230,7 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon # Try to get task ID from result if not known before if not running_task_id and final_result_dict and 'task_id' in final_result_dict: running_task_id = final_result_dict['task_id'] - webui_manager._dr_task_id = running_task_id + webui_manager.dr_task_id = running_task_id task_specific_dir = os.path.join(base_save_dir, str(running_task_id)) 
report_file_path = os.path.join(task_specific_dir, "report.md") logger.info(f"Task ID confirmed from result: {running_task_id}") @@ -268,22 +268,14 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon finally: # --- 8. Final UI Reset --- - webui_manager._dr_current_task = None # Clear task reference - webui_manager._dr_task_id = None # Clear running task ID - # Optionally close agent resources if needed, e.g., browser pool - if webui_manager._dr_agent and hasattr(webui_manager._dr_agent, 'close'): - try: - await webui_manager._dr_agent.close() # Assuming an async close method - logger.info("Closed DeepResearchAgent resources.") - webui_manager._dr_agent = None - except Exception as e_close: - logger.error(f"Error closing DeepResearchAgent: {e_close}") + webui_manager.dr_current_task = None # Clear task reference + webui_manager.dr_task_id = None # Clear running task ID yield { start_button_comp: gr.update(value="ā–¶ļø Run", interactive=True), stop_button_comp: gr.update(interactive=False), research_task_comp: gr.update(interactive=True), - resume_task_id_comp: gr.update(interactive=True), + resume_task_id_comp: gr.update(value="", interactive=True), parallel_num_comp: gr.update(interactive=True), save_dir_comp: gr.update(interactive=True), # Keep download button enabled if file exists @@ -295,10 +287,10 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon async def stop_deep_research(webui_manager: WebuiManager) -> Dict[Component, Any]: """Handles the Stop button click.""" logger.info("Stop button clicked for Deep Research.") - agent = webui_manager._dr_agent - task = webui_manager._dr_current_task - task_id = webui_manager._dr_task_id - base_save_dir = webui_manager._dr_save_dir + agent = webui_manager.dr_agent + task = webui_manager.dr_current_task + task_id = webui_manager.dr_task_id + base_save_dir = webui_manager.dr_save_dir stop_button_comp = 
webui_manager.get_component_by_id("deep_research_agent.stop_button") start_button_comp = webui_manager.get_component_by_id("deep_research_agent.start_button") @@ -311,15 +303,11 @@ async def stop_deep_research(webui_manager: WebuiManager) -> Dict[Component, Any if agent and task and not task.done(): logger.info("Signalling DeepResearchAgent to stop.") - if hasattr(agent, 'stop'): - try: - # Assuming stop is synchronous or sets a flag quickly - agent.stop() - except Exception as e: - logger.error(f"Error calling agent.stop(): {e}") - else: - logger.warning("Agent has no 'stop' method. Task cancellation might not be graceful.") - # Task cancellation is handled by the run_deep_research finally block if needed + try: + # Assuming stop is synchronous or sets a flag quickly + await agent.stop() + except Exception as e: + logger.error(f"Error calling agent.stop(): {e}") # The run_deep_research loop should detect the stop and exit. # We yield an intermediate "Stopping..." state. The final reset is done by run_deep_research. 
@@ -393,7 +381,7 @@ def create_deep_research_agent_tab(webui_manager: WebuiManager): with gr.Group(): research_task = gr.Textbox(label="Research Task", lines=5, - value="Give me a detailed plan for traveling to Switzerland on June 1st.", + value="Give me a detailed travel plan to Switzerland from June 1st to 10th.", interactive=True) with gr.Row(): resume_task_id = gr.Textbox(label="Resume Task ID", value="", @@ -418,7 +406,9 @@ def create_deep_research_agent_tab(webui_manager: WebuiManager): stop_button=stop_button, markdown_display=markdown_display, markdown_download=markdown_download, - resume_task_id=resume_task_id + resume_task_id=resume_task_id, + mcp_json_file=mcp_json_file, + mcp_server_config=mcp_server_config, ) ) webui_manager.add_components("deep_research_agent", tab_components) @@ -430,7 +420,7 @@ def create_deep_research_agent_tab(webui_manager: WebuiManager): ) dr_tab_outputs = list(tab_components.values()) - all_managed_inputs = webui_manager.get_components() + all_managed_inputs = set(webui_manager.get_components()) # --- Define Event Handler Wrappers --- async def start_wrapper(comps: Dict[Component, Any]) -> AsyncGenerator[Dict[Component, Any], None]: @@ -439,17 +429,17 @@ def create_deep_research_agent_tab(webui_manager: WebuiManager): async def stop_wrapper() -> AsyncGenerator[Dict[Component, Any], None]: update_dict = await stop_deep_research(webui_manager) - yield update_dict # Yield the single dict update + yield update_dict # --- Connect Handlers --- start_button.click( fn=start_wrapper, inputs=all_managed_inputs, - outputs=dr_tab_outputs # Update only components in this tab + outputs=dr_tab_outputs ) stop_button.click( fn=stop_wrapper, inputs=None, - outputs=dr_tab_outputs # Update only components in this tab + outputs=dr_tab_outputs ) diff --git a/src/webui/webui_manager.py b/src/webui/webui_manager.py index e4cf833..b64e8d1 100644 --- a/src/webui/webui_manager.py +++ b/src/webui/webui_manager.py @@ -45,9 +45,9 @@ class WebuiManager: init 
deep research agent """ self.dr_agent: Optional[DeepResearchAgent] = None - self._dr_current_task = None + self.dr_current_task = None self.dr_agent_task_id: Optional[str] = None - self._dr_save_dir: Optional[str] = None + self.dr_save_dir: Optional[str] = None def add_components(self, tab_name: str, components_dict: dict[str, "Component"]) -> None: """ diff --git a/tests/test_agents.py b/tests/test_agents.py index e71d2b1..23a6fb0 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -338,18 +338,16 @@ async def test_deep_research_agent(): from src.agent.deep_research.deep_research_agent import DeepResearchAgent, PLAN_FILENAME, REPORT_FILENAME from src.utils import llm_provider - # llm = llm_provider.get_llm_model( - # provider="azure_openai", - # model_name="gpt-4o", - # temperature=0.5, - # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), - # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), - # ) - llm = llm_provider.get_llm_model( - provider="bedrock", + provider="openai", + model_name="gpt-4o", + temperature=0.5 ) + # llm = llm_provider.get_llm_model( + # provider="bedrock", + # ) + mcp_server_config = { "mcpServers": { "desktop-commander": { @@ -364,9 +362,8 @@ async def test_deep_research_agent(): browser_config = {"headless": False, "window_width": 1280, "window_height": 1100, "use_own_browser": False} agent = DeepResearchAgent(llm=llm, browser_config=browser_config, mcp_server_config=mcp_server_config) - research_topic = "Impact of Microplastics on Marine Ecosystems" - task_id_to_resume = None # Set this to resume a previous task ID + task_id_to_resume = "815460fb-337a-4850-8fa4-a5f2db301a89" # Set this to resume a previous task ID print(f"Starting research on: {research_topic}") @@ -374,8 +371,9 @@ async def test_deep_research_agent(): # Call run and wait for the final result dictionary result = await agent.run(research_topic, task_id=task_id_to_resume, - save_dir="./tmp/downloads", - max_parallel_browsers=1) + save_dir="./tmp/deep_research", + 
max_parallel_browsers=1, + ) print("\n--- Research Process Ended ---") print(f"Status: {result.get('status')}") diff --git a/tests/test_llm_api.py b/tests/test_llm_api.py index c0e9e16..e98569b 100644 --- a/tests/test_llm_api.py +++ b/tests/test_llm_api.py @@ -141,13 +141,19 @@ def test_ibm_model(): test_llm(config, "Describe this image", "assets/examples/test.png") +def test_qwen_model(): + config = LLMConfig(provider="alibaba", model_name="qwen3-30b-a3b") + test_llm(config, "How many 'r's are in the word 'strawberry'?") + + if __name__ == "__main__": # test_openai_model() # test_google_model() - test_azure_openai_model() + # test_azure_openai_model() # test_deepseek_model() # test_ollama_model() - # test_deepseek_r1_model() + test_deepseek_r1_model() # test_deepseek_r1_ollama_model() # test_mistral_model() # test_ibm_model() + # test_qwen_model() From cf2422c364c6a1eb86fc122d80ae2a933f7a88e2 Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 30 Apr 2025 20:59:31 +0800 Subject: [PATCH 17/35] fix async close --- src/webui/components/agent_settings_tab.py | 11 ++++++++--- src/webui/components/deep_research_agent_tab.py | 12 +++++++++--- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/webui/components/agent_settings_tab.py b/src/webui/components/agent_settings_tab.py index 6528a11..0aef05f 100644 --- a/src/webui/components/agent_settings_tab.py +++ b/src/webui/components/agent_settings_tab.py @@ -24,13 +24,13 @@ def update_model_dropdown(llm_provider): return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True) -def update_mcp_server(mcp_file: str, webui_manager: WebuiManager): +async def update_mcp_server(mcp_file: str, webui_manager: WebuiManager): """ Update the MCP server. 
""" if hasattr(webui_manager, "bu_controller") and webui_manager.bu_controller: logger.warning("āš ļø Close controller because mcp file has changed!") - webui_manager.bu_controller.close_mcp_client() + await webui_manager.bu_controller.close_mcp_client() webui_manager.bu_controller = None if not mcp_file or not os.path.exists(mcp_file) or not mcp_file.endswith('.json'): @@ -257,8 +257,13 @@ def create_agent_settings_tab(webui_manager: WebuiManager): outputs=[planner_llm_model_name] ) + async def update_wrapper(mcp_file): + """Wrapper for handle_pause_resume.""" + update_dict = await update_mcp_server(mcp_file, webui_manager) + yield update_dict + mcp_json_file.change( - partial(update_mcp_server, webui_manager=webui_manager), + update_wrapper, inputs=[mcp_json_file], outputs=[mcp_server_config, mcp_server_config] ) diff --git a/src/webui/components/deep_research_agent_tab.py b/src/webui/components/deep_research_agent_tab.py index f245f1b..430b4e0 100644 --- a/src/webui/components/deep_research_agent_tab.py +++ b/src/webui/components/deep_research_agent_tab.py @@ -349,13 +349,13 @@ async def stop_deep_research(webui_manager: WebuiManager) -> Dict[Component, Any return final_update -def update_mcp_server(mcp_file: str, webui_manager: WebuiManager): +async def update_mcp_server(mcp_file: str, webui_manager: WebuiManager): """ Update the MCP server. 
""" if hasattr(webui_manager, "dr_agent") and webui_manager.dr_agent: logger.warning("āš ļø Close controller because mcp file has changed!") - webui_manager.dr_agent.close_mcp_client() + await webui_manager.dr_agent.close_mcp_client() if not mcp_file or not os.path.exists(mcp_file) or not mcp_file.endswith('.json'): logger.warning(f"{mcp_file} is not a valid MCP file.") @@ -413,8 +413,14 @@ def create_deep_research_agent_tab(webui_manager: WebuiManager): ) webui_manager.add_components("deep_research_agent", tab_components) webui_manager.init_deep_research_agent() + + async def update_wrapper(mcp_file): + """Wrapper for handle_pause_resume.""" + update_dict = await update_mcp_server(mcp_file, webui_manager) + yield update_dict + mcp_json_file.change( - partial(update_mcp_server, webui_manager=webui_manager), + update_wrapper, inputs=[mcp_json_file], outputs=[mcp_server_config, mcp_server_config] ) From a1ec7ad012ee285a5eecde31fa433b4099fd9cdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 2 May 2025 13:21:39 +0800 Subject: [PATCH 18/35] Update browser-use package to version 0.1.42 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a9f6c87..01fe29a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -browser-use==0.1.41 +browser-use==0.1.42 pyperclip==1.9.0 gradio==5.27.0 json-repair From 74bea17eb1f48213f5c0d99cd5a18326bd747372 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 2 May 2025 13:21:47 +0800 Subject: [PATCH 19/35] Refactor browser agent and update dependencies - Updated import statements to use 'patchright' instead of 'playwright'. - Cleaned up the BrowserUseAgent class for better readability. - Modified README instructions for browser installation. - Added new entries to .gitignore for PDF files and workflow. 
--- .gitignore | 4 +- README.md | 7 +- src/agent/browser_use/browser_use_agent.py | 96 ++++++++-------------- src/browser/custom_browser.py | 8 +- src/browser/custom_context.py | 4 +- tests/test_agents.py | 2 +- tests/test_playwright.py | 2 +- 7 files changed, 45 insertions(+), 78 deletions(-) diff --git a/.gitignore b/.gitignore index 548d48d..a7a55cd 100644 --- a/.gitignore +++ b/.gitignore @@ -187,4 +187,6 @@ data/ # For Config Files (Current Settings) .config.pkl -*.pdf \ No newline at end of file +*.pdf + +workflow \ No newline at end of file diff --git a/README.md b/README.md index 355ff76..91fb7fa 100644 --- a/README.md +++ b/README.md @@ -68,12 +68,7 @@ uv pip install -r requirements.txt Install Browsers in Playwright: You can install specific browsers by running: ```bash -playwright install --with-deps chromium -``` - -To install all browsers: -```bash -playwright install +patchright install chromium ``` #### Step 4: Configure Environment diff --git a/src/agent/browser_use/browser_use_agent.py b/src/agent/browser_use/browser_use_agent.py index a38211e..9234bca 100644 --- a/src/agent/browser_use/browser_use_agent.py +++ b/src/agent/browser_use/browser_use_agent.py @@ -1,75 +1,37 @@ from __future__ import annotations import asyncio -import gc -import inspect -import json import logging import os -import re -import time -from pathlib import Path -from typing import Any, Awaitable, Callable, Dict, Generic, List, Optional, TypeVar, Union - -from dotenv import load_dotenv -from langchain_core.language_models.chat_models import BaseChatModel -from langchain_core.messages import ( - BaseMessage, - HumanMessage, - SystemMessage, -) # from lmnr.sdk.decorators import observe -from pydantic import BaseModel, ValidationError - from browser_use.agent.gif import create_history_gif -from browser_use.agent.memory.service import Memory, MemorySettings -from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings -from 
browser_use.agent.message_manager.utils import convert_input_messages, extract_json_from_model_output, save_conversation -from browser_use.agent.prompts import AgentMessagePrompt, PlannerPrompt, SystemPrompt -from browser_use.agent.views import ( - REQUIRED_LLM_API_ENV_VARS, - ActionResult, - AgentError, - AgentHistory, - AgentHistoryList, - AgentOutput, - AgentSettings, - AgentState, - AgentStepInfo, - StepMetadata, - ToolCallingMethod, -) -from browser_use.browser.browser import Browser -from browser_use.browser.context import BrowserContext -from browser_use.browser.views import BrowserState, BrowserStateHistory -from browser_use.controller.registry.views import ActionModel -from browser_use.controller.service import Controller -from browser_use.dom.history_tree_processor.service import ( - DOMHistoryElement, - HistoryTreeProcessor, -) -from browser_use.exceptions import LLMException -from browser_use.telemetry.service import ProductTelemetry -from browser_use.telemetry.views import ( - AgentEndTelemetryEvent, - AgentRunTelemetryEvent, - AgentStepTelemetryEvent, -) -from browser_use.utils import check_env_variables, time_execution_async, time_execution_sync from browser_use.agent.service import Agent, AgentHookFunc +from browser_use.agent.views import ( + AgentHistoryList, + AgentStepInfo, +) +from browser_use.telemetry.views import ( + AgentEndTelemetryEvent, +) +from browser_use.utils import time_execution_async +from dotenv import load_dotenv load_dotenv() logger = logging.getLogger(__name__) -SKIP_LLM_API_KEY_VERIFICATION = os.environ.get('SKIP_LLM_API_KEY_VERIFICATION', 'false').lower()[0] in 'ty1' +SKIP_LLM_API_KEY_VERIFICATION = ( + os.environ.get("SKIP_LLM_API_KEY_VERIFICATION", "false").lower()[0] in "ty1" +) class BrowserUseAgent(Agent): - @time_execution_async('--run (agent)') + @time_execution_async("--run (agent)") async def run( - self, max_steps: int = 100, on_step_start: AgentHookFunc | None = None, - on_step_end: AgentHookFunc | None = None + 
self, + max_steps: int = 100, + on_step_start: AgentHookFunc | None = None, + on_step_end: AgentHookFunc | None = None, ) -> AgentHistoryList: """Execute the task with maximum number of steps""" @@ -88,7 +50,7 @@ class BrowserUseAgent(Agent): signal_handler.register() # Wait for verification task to complete if it exists - if hasattr(self, '_verification_task') and not self._verification_task.done(): + if hasattr(self, "_verification_task") and not self._verification_task.done(): try: await self._verification_task except Exception: @@ -100,7 +62,9 @@ class BrowserUseAgent(Agent): # Execute initial actions if provided if self.initial_actions: - result = await self.multi_act(self.initial_actions, check_for_new_elements=False) + result = await self.multi_act( + self.initial_actions, check_for_new_elements=False + ) self.state.last_result = result for step in range(max_steps): @@ -112,12 +76,14 @@ class BrowserUseAgent(Agent): # Check if we should stop due to too many failures if self.state.consecutive_failures >= self.settings.max_failures: - logger.error(f'āŒ Stopping due to {self.settings.max_failures} consecutive failures') + logger.error( + f"āŒ Stopping due to {self.settings.max_failures} consecutive failures" + ) break # Check control flags before each step if self.state.stopped: - logger.info('Agent stopped') + logger.info("Agent stopped") break while self.state.paused: @@ -142,13 +108,15 @@ class BrowserUseAgent(Agent): await self.log_completion() break else: - logger.info('āŒ Failed to complete task in maximum steps') + logger.info("āŒ Failed to complete task in maximum steps") return self.state.history except KeyboardInterrupt: # Already handled by our signal handler, but catch any direct KeyboardInterrupt as well - logger.info('Got KeyboardInterrupt during execution, returning current history') + logger.info( + "Got KeyboardInterrupt during execution, returning current history" + ) return self.state.history finally: @@ -171,8 +139,10 @@ class 
BrowserUseAgent(Agent): await self.close() if self.settings.generate_gif: - output_path: str = 'agent_history.gif' + output_path: str = "agent_history.gif" if isinstance(self.settings.generate_gif, str): output_path = self.settings.generate_gif - create_history_gif(task=self.task, history=self.state.history, output_path=output_path) \ No newline at end of file + create_history_gif( + task=self.task, history=self.state.history, output_path=output_path + ) diff --git a/src/browser/custom_browser.py b/src/browser/custom_browser.py index 6db980f..02875e3 100644 --- a/src/browser/custom_browser.py +++ b/src/browser/custom_browser.py @@ -1,17 +1,17 @@ import asyncio import pdb -from playwright.async_api import Browser as PlaywrightBrowser -from playwright.async_api import ( +from patchright.async_api import Browser as PlaywrightBrowser +from patchright.async_api import ( BrowserContext as PlaywrightBrowserContext, ) -from playwright.async_api import ( +from patchright.async_api import ( Playwright, async_playwright, ) from browser_use.browser.browser import Browser, IN_DOCKER from browser_use.browser.context import BrowserContext, BrowserContextConfig -from playwright.async_api import BrowserContext as PlaywrightBrowserContext +from patchright.async_api import BrowserContext as PlaywrightBrowserContext import logging from browser_use.browser.chrome import ( diff --git a/src/browser/custom_context.py b/src/browser/custom_context.py index 43a67a8..753b4c5 100644 --- a/src/browser/custom_context.py +++ b/src/browser/custom_context.py @@ -4,8 +4,8 @@ import os from browser_use.browser.browser import Browser, IN_DOCKER from browser_use.browser.context import BrowserContext, BrowserContextConfig -from playwright.async_api import Browser as PlaywrightBrowser -from playwright.async_api import BrowserContext as PlaywrightBrowserContext +from patchright.async_api import Browser as PlaywrightBrowser +from patchright.async_api import BrowserContext as PlaywrightBrowserContext from 
typing import Optional from browser_use.browser.context import BrowserContextState diff --git a/tests/test_agents.py b/tests/test_agents.py index 23a6fb0..ffa743f 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -169,7 +169,7 @@ async def test_browser_use_agent(): async def test_browser_use_parallel(): from browser_use.browser.context import BrowserContextWindowSize from browser_use.browser.browser import BrowserConfig - from playwright.async_api import async_playwright + from patchright.async_api import async_playwright from browser_use.browser.browser import Browser from src.browser.custom_context import BrowserContextConfig from src.controller.custom_controller import CustomController diff --git a/tests/test_playwright.py b/tests/test_playwright.py index 6704a02..5a522fd 100644 --- a/tests/test_playwright.py +++ b/tests/test_playwright.py @@ -6,7 +6,7 @@ load_dotenv() def test_connect_browser(): import os - from playwright.sync_api import sync_playwright + from patchright.sync_api import sync_playwright chrome_exe = os.getenv("CHROME_PATH", "") chrome_use_data = os.getenv("CHROME_USER_DATA", "") From 40a61fa216aeed578cb86339bc790ca1c286a8ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 2 May 2025 13:25:59 +0800 Subject: [PATCH 20/35] Added source = webui --- .../deep_research/deep_research_agent.py | 566 ++++++++++------ src/webui/components/browser_use_agent_tab.py | 610 ++++++++++++------ 2 files changed, 772 insertions(+), 404 deletions(-) diff --git a/src/agent/deep_research/deep_research_agent.py b/src/agent/deep_research/deep_research_agent.py index c9ee3c1..2f6c672 100644 --- a/src/agent/deep_research/deep_research_agent.py +++ b/src/agent/deep_research/deep_research_agent.py @@ -2,34 +2,38 @@ import asyncio import json import logging import os -import pdb +import threading import uuid from pathlib import Path -from typing import List, Dict, Any, TypedDict, 
Optional, Sequence, Annotated -from concurrent.futures import ThreadPoolExecutor, as_completed -import threading - -# Langchain imports -from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, ToolMessage, SystemMessage -from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder -from langchain_core.tools import Tool, StructuredTool -from langchain.agents import AgentExecutor # We might use parts, but Langgraph is primary -from langchain_community.tools.file_management import WriteFileTool, ReadFileTool, CopyFileTool, ListDirectoryTool, \ - MoveFileTool, FileSearchTool -from langchain_openai import ChatOpenAI # Replace with your actual LLM import -from pydantic import BaseModel, Field -import operator +from typing import Any, Dict, List, Optional, TypedDict from browser_use.browser.browser import BrowserConfig from browser_use.browser.context import BrowserContextWindowSize +from langchain_community.tools.file_management import ( + ListDirectoryTool, + ReadFileTool, + WriteFileTool, +) + +# Langchain imports +from langchain_core.messages import ( + AIMessage, + BaseMessage, + HumanMessage, + SystemMessage, + ToolMessage, +) +from langchain_core.prompts import ChatPromptTemplate +from langchain_core.tools import StructuredTool, Tool # Langgraph imports -from langgraph.graph import StateGraph, END -from src.controller.custom_controller import CustomController -from src.utils import llm_provider -from src.browser.custom_browser import CustomBrowser -from src.browser.custom_context import CustomBrowserContext, CustomBrowserContextConfig +from langgraph.graph import StateGraph +from pydantic import BaseModel, Field + from src.agent.browser_use.browser_use_agent import BrowserUseAgent +from src.browser.custom_browser import CustomBrowser +from src.browser.custom_context import CustomBrowserContextConfig +from src.controller.custom_controller import CustomController from src.utils.mcp_client import setup_mcp_client_and_tools logger = 
logging.getLogger(__name__) @@ -44,19 +48,22 @@ _BROWSER_AGENT_INSTANCES = {} async def run_single_browser_task( - task_query: str, - task_id: str, - llm: Any, # Pass the main LLM - browser_config: Dict[str, Any], - stop_event: threading.Event, - use_vision: bool = False, + task_query: str, + task_id: str, + llm: Any, # Pass the main LLM + browser_config: Dict[str, Any], + stop_event: threading.Event, + use_vision: bool = False, ) -> Dict[str, Any]: """ Runs a single BrowserUseAgent task. Manages browser creation and closing for this specific task. """ if not BrowserUseAgent: - return {"query": task_query, "error": "BrowserUseAgent components not available."} + return { + "query": task_query, + "error": "BrowserUseAgent components not available.", + } # --- Browser Setup --- # These should ideally come from the main agent's config @@ -79,9 +86,11 @@ async def run_single_browser_task( extra_args.append(f"--user-data-dir={browser_user_data_dir}") if use_own_browser: browser_binary_path = os.getenv("CHROME_PATH", None) or browser_binary_path - if browser_binary_path == "": browser_binary_path = None + if browser_binary_path == "": + browser_binary_path = None chrome_user_data = os.getenv("CHROME_USER_DATA", None) - if chrome_user_data: extra_args += [f"--user-data-dir={chrome_user_data}"] + if chrome_user_data: + extra_args += [f"--user-data-dir={chrome_user_data}"] else: browser_binary_path = None @@ -98,8 +107,10 @@ async def run_single_browser_task( context_config = CustomBrowserContextConfig( save_downloads_path="./tmp/downloads", - browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h), - force_new_context=True + browser_window_size=BrowserContextWindowSize( + width=window_w, height=window_h + ), + force_new_context=True, ) bu_browser_context = await bu_browser.new_context(config=context_config) @@ -126,6 +137,7 @@ async def run_single_browser_task( browser_context=bu_browser_context, controller=bu_controller, use_vision=use_vision, + 
source="webui", ) # Store instance for potential stop() call @@ -157,7 +169,9 @@ async def run_single_browser_task( return {"query": task_query, "result": final_data, "status": "completed"} except Exception as e: - logger.error(f"Error during browser task for query '{task_query}': {e}", exc_info=True) + logger.error( + f"Error during browser task for query '{task_query}': {e}", exc_info=True + ) return {"query": task_query, "error": str(e), "status": "failed"} finally: if bu_browser_context: @@ -181,16 +195,17 @@ async def run_single_browser_task( class BrowserSearchInput(BaseModel): queries: List[str] = Field( - description=f"List of distinct search queries to find information relevant to the research task.") + description="List of distinct search queries to find information relevant to the research task." + ) async def _run_browser_search_tool( - queries: List[str], - task_id: str, # Injected dependency - llm: Any, # Injected dependency - browser_config: Dict[str, Any], - stop_event: threading.Event, - max_parallel_browsers: int = 1 + queries: List[str], + task_id: str, # Injected dependency + llm: Any, # Injected dependency + browser_config: Dict[str, Any], + stop_event: threading.Event, + max_parallel_browsers: int = 1, ) -> List[Dict[str, Any]]: """ Internal function to execute parallel browser searches based on LLM-provided queries. 
@@ -199,7 +214,9 @@ async def _run_browser_search_tool( # Limit queries just in case LLM ignores the description queries = queries[:max_parallel_browsers] - logger.info(f"[Browser Tool {task_id}] Running search for {len(queries)} queries: {queries}") + logger.info( + f"[Browser Tool {task_id}] Running search for {len(queries)} queries: {queries}" + ) results = [] semaphore = asyncio.Semaphore(max_parallel_browsers) @@ -207,7 +224,9 @@ async def _run_browser_search_tool( async def task_wrapper(query): async with semaphore: if stop_event.is_set(): - logger.info(f"[Browser Tool {task_id}] Skipping task due to stop signal: {query}") + logger.info( + f"[Browser Tool {task_id}] Skipping task due to stop signal: {query}" + ) return {"query": query, "result": None, "status": "cancelled"} # Pass necessary injected configs and the stop event return await run_single_browser_task( @@ -215,7 +234,7 @@ async def _run_browser_search_tool( task_id, llm, # Pass the main LLM (or a dedicated one if needed) browser_config, - stop_event + stop_event, # use_vision could be added here if needed ) @@ -226,35 +245,47 @@ async def _run_browser_search_tool( for i, res in enumerate(search_results): query = queries[i] # Get corresponding query if isinstance(res, Exception): - logger.error(f"[Browser Tool {task_id}] Gather caught exception for query '{query}': {res}", exc_info=True) - processed_results.append({"query": query, "error": str(res), "status": "failed"}) + logger.error( + f"[Browser Tool {task_id}] Gather caught exception for query '{query}': {res}", + exc_info=True, + ) + processed_results.append( + {"query": query, "error": str(res), "status": "failed"} + ) elif isinstance(res, dict): processed_results.append(res) else: - logger.error(f"[Browser Tool {task_id}] Unexpected result type for query '{query}': {type(res)}") - processed_results.append({"query": query, "error": "Unexpected result type", "status": "failed"}) + logger.error( + f"[Browser Tool {task_id}] Unexpected result 
type for query '{query}': {type(res)}" + ) + processed_results.append( + {"query": query, "error": "Unexpected result type", "status": "failed"} + ) - logger.info(f"[Browser Tool {task_id}] Finished search. Results count: {len(processed_results)}") + logger.info( + f"[Browser Tool {task_id}] Finished search. Results count: {len(processed_results)}" + ) return processed_results def create_browser_search_tool( - llm: Any, - browser_config: Dict[str, Any], - task_id: str, - stop_event: threading.Event, - max_parallel_browsers: int = 1, + llm: Any, + browser_config: Dict[str, Any], + task_id: str, + stop_event: threading.Event, + max_parallel_browsers: int = 1, ) -> StructuredTool: """Factory function to create the browser search tool with necessary dependencies.""" # Use partial to bind the dependencies that aren't part of the LLM call arguments from functools import partial + bound_tool_func = partial( _run_browser_search_tool, task_id=task_id, llm=llm, browser_config=browser_config, stop_event=stop_event, - max_parallel_browsers=max_parallel_browsers + max_parallel_browsers=max_parallel_browsers, ) return StructuredTool.from_function( @@ -269,6 +300,7 @@ Provide a list of distinct search queries(up to {max_parallel_browsers}) that ar # --- Langgraph State Definition --- + class ResearchPlanItem(TypedDict): step: int task: str @@ -298,6 +330,7 @@ class DeepResearchState(TypedDict): # --- Langgraph Nodes --- + def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]: """Loads state from files if they exist.""" state_updates = {} @@ -305,7 +338,7 @@ def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]: search_file = os.path.join(output_dir, SEARCH_INFO_FILENAME) if os.path.exists(plan_file): try: - with open(plan_file, 'r', encoding='utf-8') as f: + with open(plan_file, "r", encoding="utf-8") as f: # Basic parsing, assumes markdown checklist format plan = [] step = 1 @@ -315,24 +348,36 @@ def _load_previous_state(task_id: str, 
output_dir: str) -> Dict[str, Any]: status = "completed" if line.startswith("- [x]") else "pending" task = line[5:].strip() plan.append( - ResearchPlanItem(step=step, task=task, status=status, queries=None, result_summary=None)) + ResearchPlanItem( + step=step, + task=task, + status=status, + queries=None, + result_summary=None, + ) + ) step += 1 - state_updates['research_plan'] = plan + state_updates["research_plan"] = plan # Determine next step index based on loaded plan - next_step = next((i for i, item in enumerate(plan) if item['status'] == 'pending'), len(plan)) - state_updates['current_step_index'] = next_step - logger.info(f"Loaded research plan from {plan_file}, next step index: {next_step}") + next_step = next( + (i for i, item in enumerate(plan) if item["status"] == "pending"), + len(plan), + ) + state_updates["current_step_index"] = next_step + logger.info( + f"Loaded research plan from {plan_file}, next step index: {next_step}" + ) except Exception as e: logger.error(f"Failed to load or parse research plan {plan_file}: {e}") - state_updates['error_message'] = f"Failed to load research plan: {e}" + state_updates["error_message"] = f"Failed to load research plan: {e}" if os.path.exists(search_file): try: - with open(search_file, 'r', encoding='utf-8') as f: - state_updates['search_results'] = json.load(f) + with open(search_file, "r", encoding="utf-8") as f: + state_updates["search_results"] = json.load(f) logger.info(f"Loaded search results from {search_file}") except Exception as e: logger.error(f"Failed to load search results {search_file}: {e}") - state_updates['error_message'] = f"Failed to load search results: {e}" + state_updates["error_message"] = f"Failed to load search results: {e}" # Decide if this is fatal or if we can continue without old results return state_updates @@ -342,10 +387,10 @@ def _save_plan_to_md(plan: List[ResearchPlanItem], output_dir: str): """Saves the research plan to a markdown checklist file.""" plan_file = 
os.path.join(output_dir, PLAN_FILENAME) try: - with open(plan_file, 'w', encoding='utf-8') as f: + with open(plan_file, "w", encoding="utf-8") as f: f.write("# Research Plan\n\n") for item in plan: - marker = "- [x]" if item['status'] == 'completed' else "- [ ]" + marker = "- [x]" if item["status"] == "completed" else "- [ ]" f.write(f"{marker} {item['task']}\n") logger.info(f"Research plan saved to {plan_file}") except Exception as e: @@ -357,7 +402,7 @@ def _save_search_results_to_json(results: List[Dict[str, Any]], output_dir: str) search_file = os.path.join(output_dir, SEARCH_INFO_FILENAME) try: # Simple overwrite for now, could be append - with open(search_file, 'w', encoding='utf-8') as f: + with open(search_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=2, ensure_ascii=False) logger.info(f"Search results saved to {search_file}") except Exception as e: @@ -368,7 +413,7 @@ def _save_report_to_md(report: str, output_dir: Path): """Saves the final report to a markdown file.""" report_file = os.path.join(output_dir, REPORT_FILENAME) try: - with open(report_file, 'w', encoding='utf-8') as f: + with open(report_file, "w", encoding="utf-8") as f: f.write(report) logger.info(f"Final report saved to {report_file}") except Exception as e: @@ -378,17 +423,17 @@ def _save_report_to_md(report: str, output_dir: Path): async def planning_node(state: DeepResearchState) -> Dict[str, Any]: """Generates the initial research plan or refines it if resuming.""" logger.info("--- Entering Planning Node ---") - if state.get('stop_requested'): + if state.get("stop_requested"): logger.info("Stop requested, skipping planning.") return {"stop_requested": True} - llm = state['llm'] - topic = state['topic'] - existing_plan = state.get('research_plan') - existing_results = state.get('search_results') - output_dir = state['output_dir'] + llm = state["llm"] + topic = state["topic"] + existing_plan = state.get("research_plan") + existing_results = state.get("search_results") + 
output_dir = state["output_dir"] - if existing_plan and state.get('current_step_index', 0) > 0: + if existing_plan and state.get("current_step_index", 0) > 0: logger.info("Resuming with existing plan.") # Maybe add logic here to let LLM review and potentially adjust the plan # based on existing_results, but for now, we just use the loaded plan. @@ -397,8 +442,11 @@ async def planning_node(state: DeepResearchState) -> Dict[str, Any]: logger.info(f"Generating new research plan for topic: {topic}") - prompt = ChatPromptTemplate.from_messages([ - ("system", """You are a meticulous research assistant. Your goal is to create a step-by-step research plan to thoroughly investigate a given topic. + prompt = ChatPromptTemplate.from_messages( + [ + ( + "system", + """You are a meticulous research assistant. Your goal is to create a step-by-step research plan to thoroughly investigate a given topic. The plan should consist of clear, actionable research tasks or questions. Each step should logically build towards a comprehensive understanding. Format the output as a numbered list. Each item should represent a distinct research step or question. Example: @@ -410,9 +458,11 @@ async def planning_node(state: DeepResearchState) -> Dict[str, Any]: 6. Summarize the findings and draw conclusions. Keep the plan focused and manageable. Aim for 5-10 detailed steps. 
- """), - ("human", f"Generate a research plan for the topic: {topic}") - ]) + """, + ), + ("human", f"Generate a research plan for the topic: {topic}"), + ] + ) try: response = await llm.ainvoke(prompt.format_prompt(topic=topic).to_messages()) @@ -420,19 +470,25 @@ async def planning_node(state: DeepResearchState) -> Dict[str, Any]: # Parse the numbered list into the plan structure new_plan: List[ResearchPlanItem] = [] - for i, line in enumerate(plan_text.strip().split('\n')): + for i, line in enumerate(plan_text.strip().split("\n")): line = line.strip() if line and (line[0].isdigit() or line.startswith(("*", "-"))): # Simple parsing: remove number/bullet and space - task_text = line.split('.', 1)[-1].strip() if line[0].isdigit() else line[1:].strip() + task_text = ( + line.split(".", 1)[-1].strip() + if line[0].isdigit() + else line[1:].strip() + ) if task_text: - new_plan.append(ResearchPlanItem( - step=i + 1, - task=task_text, - status="pending", - queries=None, - result_summary=None - )) + new_plan.append( + ResearchPlanItem( + step=i + 1, + task=task_text, + status="pending", + queries=None, + result_summary=None, + ) + ) if not new_plan: logger.error("LLM failed to generate a valid plan structure.") @@ -458,16 +514,19 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: The LLM decides which tool (e.g., browser search) to use and provides arguments. 
""" logger.info("--- Entering Research Execution Node ---") - if state.get('stop_requested'): + if state.get("stop_requested"): logger.info("Stop requested, skipping research execution.") - return {"stop_requested": True, "current_step_index": state['current_step_index']} # Keep index same + return { + "stop_requested": True, + "current_step_index": state["current_step_index"], + } # Keep index same - plan = state['research_plan'] - current_index = state['current_step_index'] - llm = state['llm'] - tools = state['tools'] # Tools are now passed in state - output_dir = str(state['output_dir']) - task_id = state['task_id'] + plan = state["research_plan"] + current_index = state["current_step_index"] + llm = state["llm"] + tools = state["tools"] # Tools are now passed in state + output_dir = str(state["output_dir"]) + task_id = state["task_id"] # Stop event is bound inside the tool function, no need to pass directly here if not plan or current_index >= len(plan): @@ -476,24 +535,31 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: return {} current_step = plan[current_index] - if current_step['status'] == 'completed': + if current_step["status"] == "completed": logger.info(f"Step {current_step['step']} already completed, skipping.") return {"current_step_index": current_index + 1} # Move to next step - logger.info(f"Executing research step {current_step['step']}: {current_step['task']}") + logger.info( + f"Executing research step {current_step['step']}: {current_step['task']}" + ) # Bind tools to the LLM for this call llm_with_tools = llm.bind_tools(tools) - if state['messages']: - current_task_message = [HumanMessage( - content=f"Research Task (Step {current_step['step']}): {current_step['task']}")] - invocation_messages = state['messages'] + current_task_message + if state["messages"]: + current_task_message = [ + HumanMessage( + content=f"Research Task (Step {current_step['step']}): {current_step['task']}" + ) + ] + 
invocation_messages = state["messages"] + current_task_message else: current_task_message = [ SystemMessage( - content="You are a research assistant executing one step of a research plan. Use the available tools, especially the 'parallel_browser_search' tool, to gather information needed for the current task. Be precise with your search queries if using the browser tool."), + content="You are a research assistant executing one step of a research plan. Use the available tools, especially the 'parallel_browser_search' tool, to gather information needed for the current task. Be precise with your search queries if using the browser tool." + ), HumanMessage( - content=f"Research Task (Step {current_step['step']}): {current_step['task']}") + content=f"Research Task (Step {current_step['step']}): {current_step['task']}" + ), ] invocation_messages = current_task_message @@ -509,16 +575,17 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: if not isinstance(ai_response, AIMessage) or not ai_response.tool_calls: # LLM didn't call a tool. Maybe it answered directly? Or failed? logger.warning( - f"LLM did not call any tool for step {current_step['step']}. Response: {ai_response.content[:100]}...") + f"LLM did not call any tool for step {current_step['step']}. Response: {ai_response.content[:100]}..." + ) # How to handle this? Mark step as failed? Or store the content? # Let's mark as failed for now, assuming a tool was expected. - current_step['status'] = 'failed' - current_step['result_summary'] = "LLM did not use a tool as expected." + current_step["status"] = "failed" + current_step["result_summary"] = "LLM did not use a tool as expected." _save_plan_to_md(plan, output_dir) return { "research_plan": plan, "current_step_index": current_index + 1, - "error_message": f"LLM failed to call a tool for step {current_step['step']}." 
+ "error_message": f"LLM failed to call a tool for step {current_step['step']}.", } # Process tool calls @@ -536,10 +603,12 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: if not selected_tool: logger.error(f"LLM called tool '{tool_name}' which is not available.") # Create a ToolMessage indicating the error - tool_results.append(ToolMessage( - content=f"Error: Tool '{tool_name}' not found.", - tool_call_id=tool_call_id - )) + tool_results.append( + ToolMessage( + content=f"Error: Tool '{tool_name}' not found.", + tool_call_id=tool_call_id, + ) + ) continue # Skip to next tool call if any # Execute the tool @@ -548,7 +617,7 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: stop_event = _AGENT_STOP_FLAGS.get(task_id) if stop_event and stop_event.is_set(): logger.info(f"Stop requested before executing tool: {tool_name}") - current_step['status'] = 'pending' # Not completed due to stop + current_step["status"] = "pending" # Not completed due to stop _save_plan_to_md(plan, output_dir) return {"stop_requested": True, "research_plan": plan} @@ -558,46 +627,67 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: logger.info(f"Tool '{tool_name}' executed successfully.") browser_tool_called = "parallel_browser_search" in executed_tool_names # Append result to overall search results - current_search_results = state.get('search_results', []) + current_search_results = state.get("search_results", []) if browser_tool_called: # Specific handling for browser tool output current_search_results.extend(tool_output) else: # Handle other tool outputs (e.g., file tools return strings) # Store it associated with the step? Or a generic log? # Let's just log it for now. Need better handling for diverse tool outputs. - logger.info(f"Result from tool '{tool_name}': {str(tool_output)[:200]}...") + logger.info( + f"Result from tool '{tool_name}': {str(tool_output)[:200]}..." 
+ ) # Store result for potential next LLM call (if we were doing multi-turn) - tool_results.append(ToolMessage( - content=json.dumps(tool_output), - tool_call_id=tool_call_id - )) + tool_results.append( + ToolMessage( + content=json.dumps(tool_output), tool_call_id=tool_call_id + ) + ) except Exception as e: logger.error(f"Error executing tool '{tool_name}': {e}", exc_info=True) - tool_results.append(ToolMessage( - content=f"Error executing tool {tool_name}: {e}", - tool_call_id=tool_call_id - )) + tool_results.append( + ToolMessage( + content=f"Error executing tool {tool_name}: {e}", + tool_call_id=tool_call_id, + ) + ) # Also update overall state search_results with error? - current_search_results = state.get('search_results', []) + current_search_results = state.get("search_results", []) current_search_results.append( - {"tool_name": tool_name, "args": tool_args, "status": "failed", "error": str(e)}) + { + "tool_name": tool_name, + "args": tool_args, + "status": "failed", + "error": str(e), + } + ) # Basic check: Did the browser tool run at all? (More specific checks needed) browser_tool_called = "parallel_browser_search" in executed_tool_names # We might need a more nuanced status based on the *content* of tool_results - step_failed = any("Error:" in str(tr.content) for tr in tool_results) or not browser_tool_called + step_failed = ( + any("Error:" in str(tr.content) for tr in tool_results) + or not browser_tool_called + ) if step_failed: - logger.warning(f"Step {current_step['step']} failed or did not yield results via browser search.") - current_step['status'] = 'failed' - current_step[ - 'result_summary'] = f"Tool execution failed or browser tool not used. Errors: {[tr.content for tr in tool_results if 'Error' in str(tr.content)]}" + logger.warning( + f"Step {current_step['step']} failed or did not yield results via browser search." 
+ ) + current_step["status"] = "failed" + current_step["result_summary"] = ( + f"Tool execution failed or browser tool not used. Errors: {[tr.content for tr in tool_results if 'Error' in str(tr.content)]}" + ) else: - logger.info(f"Step {current_step['step']} completed using tool(s): {executed_tool_names}.") - current_step['status'] = 'completed' + logger.info( + f"Step {current_step['step']} completed using tool(s): {executed_tool_names}." + ) + current_step["status"] = "completed" - current_step['result_summary'] = f"Executed tool(s): {', '.join(executed_tool_names)}." + current_step["result_summary"] = ( + f"Executed tool(s): {', '.join(executed_tool_names)}." + ) _save_plan_to_md(plan, output_dir) _save_search_results_to_json(current_search_results, output_dir) @@ -606,34 +696,39 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: "research_plan": plan, "search_results": current_search_results, # Update with new results "current_step_index": current_index + 1, - "messages": state["messages"] + current_task_message + [ai_response] + tool_results, + "messages": state["messages"] + + current_task_message + + [ai_response] + + tool_results, # Optionally return the tool_results messages if needed by downstream nodes } except Exception as e: - logger.error(f"Unhandled error during research execution node for step {current_step['step']}: {e}", - exc_info=True) - current_step['status'] = 'failed' + logger.error( + f"Unhandled error during research execution node for step {current_step['step']}: {e}", + exc_info=True, + ) + current_step["status"] = "failed" _save_plan_to_md(plan, output_dir) return { "research_plan": plan, "current_step_index": current_index + 1, # Move on even if error? 
- "error_message": f"Core Execution Error on step {current_step['step']}: {e}" + "error_message": f"Core Execution Error on step {current_step['step']}: {e}", } async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: """Synthesizes the final report from the collected search results.""" logger.info("--- Entering Synthesis Node ---") - if state.get('stop_requested'): + if state.get("stop_requested"): logger.info("Stop requested, skipping synthesis.") return {"stop_requested": True} - llm = state['llm'] - topic = state['topic'] - search_results = state.get('search_results', []) - output_dir = state['output_dir'] - plan = state['research_plan'] # Include plan for context + llm = state["llm"] + topic = state["topic"] + search_results = state.get("search_results", []) + output_dir = state["output_dir"] + plan = state["research_plan"] # Include plan for context if not search_results: logger.warning("No search results found to synthesize report.") @@ -641,7 +736,9 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: _save_report_to_md(report, output_dir) return {"final_report": report} - logger.info(f"Synthesizing report from {len(search_results)} collected search result entries.") + logger.info( + f"Synthesizing report from {len(search_results)} collected search result entries." 
+ ) # Prepare context for the LLM # Format search results nicely, maybe group by query or original plan step @@ -649,19 +746,21 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: references = {} ref_count = 1 for i, result_entry in enumerate(search_results): - query = result_entry.get('query', 'Unknown Query') - status = result_entry.get('status', 'unknown') - result_data = result_entry.get('result') # This should be the dict with summary, title, url - error = result_entry.get('error') + query = result_entry.get("query", "Unknown Query") + status = result_entry.get("status", "unknown") + result_data = result_entry.get( + "result" + ) # This should be the dict with summary, title, url + error = result_entry.get("error") - if status == 'completed' and result_data: + if status == "completed" and result_data: summary = result_data - formatted_results += f"### Finding from Query: \"{query}\"\n" + formatted_results += f'### Finding from Query: "{query}"\n' formatted_results += f"- **Summary:**\n{summary}\n" formatted_results += "---\n" - elif status == 'failed': - formatted_results += f"### Failed Query: \"{query}\"\n" + elif status == "failed": + formatted_results += f'### Failed Query: "{query}"\n' formatted_results += f"- **Error:** {error}\n" formatted_results += "---\n" # Ignore cancelled/other statuses for the report content @@ -669,12 +768,20 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: # Prepare the research plan context plan_summary = "\nResearch Plan Followed:\n" for item in plan: - marker = "- [x]" if item['status'] == 'completed' else "- [ ] (Failed)" if item[ - 'status'] == 'failed' else "- [ ]" + marker = ( + "- [x]" + if item["status"] == "completed" + else "- [ ] (Failed)" + if item["status"] == "failed" + else "- [ ]" + ) plan_summary += f"{marker} {item['task']}\n" - synthesis_prompt = ChatPromptTemplate.from_messages([ - ("system", """You are a professional researcher tasked with writing a comprehensive 
and well-structured report based on collected findings. + synthesis_prompt = ChatPromptTemplate.from_messages( + [ + ( + "system", + """You are a professional researcher tasked with writing a comprehensive and well-structured report based on collected findings. The report should address the research topic thoroughly, synthesizing the information gathered from various sources. Structure the report logically: 1. **Introduction:** Briefly introduce the topic and the report's scope (mentioning the research plan followed is good). @@ -682,8 +789,11 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: 3. **Conclusion:** Summarize the main points and offer concluding thoughts or potential areas for further research. Ensure the tone is objective, professional, and analytical. Base the report **strictly** on the provided findings. Do not add external knowledge. If findings are contradictory or incomplete, acknowledge this. - """), - ("human", f""" + """, + ), + ( + "human", + f""" **Research Topic:** {topic} {plan_summary} @@ -696,25 +806,31 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: ``` Please generate the final research report in Markdown format based **only** on the information above. Ensure all claims derived from the findings are properly cited using the format [Reference_ID]. 
- """) - ]) + """, + ), + ] + ) try: - response = await llm.ainvoke(synthesis_prompt.format_prompt( - topic=topic, - plan_summary=plan_summary, - formatted_results=formatted_results, - references=references - ).to_messages()) + response = await llm.ainvoke( + synthesis_prompt.format_prompt( + topic=topic, + plan_summary=plan_summary, + formatted_results=formatted_results, + references=references, + ).to_messages() + ) final_report_md = response.content # Append the reference list automatically to the end of the generated markdown if references: report_references_section = "\n\n## References\n\n" # Sort refs by ID for consistent output - sorted_refs = sorted(references.values(), key=lambda x: x['id']) + sorted_refs = sorted(references.values(), key=lambda x: x["id"]) for ref in sorted_refs: - report_references_section += f"[{ref['id']}] {ref['title']} - {ref['url']}\n" + report_references_section += ( + f"[{ref['id']}] {ref['title']} - {ref['url']}\n" + ) final_report_md += report_references_section logger.info("Successfully synthesized the final report.") @@ -728,28 +844,32 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: # --- Langgraph Edges and Conditional Logic --- + def should_continue(state: DeepResearchState) -> str: """Determines the next step based on the current state.""" logger.info("--- Evaluating Condition: Should Continue? ---") - if state.get('stop_requested'): + if state.get("stop_requested"): logger.info("Stop requested, routing to END.") return "end_run" # Go to a dedicated end node for cleanup if needed - if state.get('error_message'): + if state.get("error_message"): logger.warning(f"Error detected: {state['error_message']}. 
Routing to END.") # Decide if errors should halt execution or if it should try to synthesize anyway return "end_run" # Stop on error for now - plan = state.get('research_plan') - current_index = state.get('current_step_index', 0) + plan = state.get("research_plan") + current_index = state.get("current_step_index", 0) if not plan: - logger.warning("No research plan found, cannot continue execution. Routing to END.") + logger.warning( + "No research plan found, cannot continue execution. Routing to END." + ) return "end_run" # Should not happen if planning node ran correctly # Check if there are pending steps in the plan if current_index < len(plan): logger.info( - f"Plan has pending steps (current index {current_index}/{len(plan)}). Routing to Research Execution.") + f"Plan has pending steps (current index {current_index}/{len(plan)}). Routing to Research Execution." + ) return "execute_research" else: logger.info("All plan steps processed. Routing to Synthesis.") @@ -758,8 +878,14 @@ def should_continue(state: DeepResearchState) -> str: # --- DeepSearchAgent Class --- + class DeepResearchAgent: - def __init__(self, llm: Any, browser_config: Dict[str, Any], mcp_server_config: Optional[Dict[str, Any]] = None): + def __init__( + self, + llm: Any, + browser_config: Dict[str, Any], + mcp_server_config: Optional[Dict[str, Any]] = None, + ): """ Initializes the DeepSearchAgent. 
@@ -779,16 +905,21 @@ class DeepResearchAgent: self.stop_event: Optional[threading.Event] = None self.runner: Optional[asyncio.Task] = None # To hold the asyncio task for run - async def _setup_tools(self, task_id: str, stop_event: threading.Event, max_parallel_browsers: int = 1) -> List[ - Tool]: + async def _setup_tools( + self, task_id: str, stop_event: threading.Event, max_parallel_browsers: int = 1 + ) -> List[Tool]: """Sets up the basic tools (File I/O) and optional MCP tools.""" - tools = [WriteFileTool(), ReadFileTool(), ListDirectoryTool()] # Basic file operations + tools = [ + WriteFileTool(), + ReadFileTool(), + ListDirectoryTool(), + ] # Basic file operations browser_use_tool = create_browser_search_tool( llm=self.llm, browser_config=self.browser_config, task_id=task_id, stop_event=stop_event, - max_parallel_browsers=max_parallel_browsers + max_parallel_browsers=max_parallel_browsers, ) tools += [browser_use_tool] # Add MCP tools if config is provided @@ -796,14 +927,18 @@ class DeepResearchAgent: try: logger.info("Setting up MCP client and tools...") if not self.mcp_client: - self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config) + self.mcp_client = await setup_mcp_client_and_tools( + self.mcp_server_config + ) mcp_tools = self.mcp_client.get_tools() logger.info(f"Loaded {len(mcp_tools)} MCP tools.") tools.extend(mcp_tools) except Exception as e: logger.error(f"Failed to set up MCP tools: {e}", exc_info=True) elif self.mcp_server_config: - logger.warning("MCP server config provided, but setup function unavailable.") + logger.warning( + "MCP server config provided, but setup function unavailable." 
+ ) tools_map = {tool.name: tool for tool in tools} return tools_map.values() @@ -820,12 +955,16 @@ class DeepResearchAgent: workflow.add_node("plan_research", planning_node) workflow.add_node("execute_research", research_execution_node) workflow.add_node("synthesize_report", synthesis_node) - workflow.add_node("end_run", lambda state: logger.info("--- Reached End Run Node ---") or {}) # Simple end node + workflow.add_node( + "end_run", lambda state: logger.info("--- Reached End Run Node ---") or {} + ) # Simple end node # Define edges workflow.set_entry_point("plan_research") - workflow.add_edge("plan_research", "execute_research") # Always execute after planning + workflow.add_edge( + "plan_research", "execute_research" + ) # Always execute after planning # Conditional edge after execution workflow.add_conditional_edges( @@ -834,8 +973,8 @@ class DeepResearchAgent: { "execute_research": "execute_research", # Loop back if more steps "synthesize_report": "synthesize_report", # Move to synthesis if done - "end_run": "end_run" # End if stop requested or error - } + "end_run": "end_run", # End if stop requested or error + }, ) workflow.add_edge("synthesize_report", "end_run") # End after synthesis @@ -843,9 +982,13 @@ class DeepResearchAgent: app = workflow.compile() return app - async def run(self, topic: str, task_id: Optional[str] = None, save_dir: str = "./tmp/deep_research", - max_parallel_browsers: int = 1) -> Dict[ - str, Any]: + async def run( + self, + topic: str, + task_id: Optional[str] = None, + save_dir: str = "./tmp/deep_research", + max_parallel_browsers: int = 1, + ) -> Dict[str, Any]: """ Starts the deep research process (Async Generator Version). @@ -857,20 +1000,30 @@ class DeepResearchAgent: Intermediate state updates or messages during execution. """ if self.runner and not self.runner.done(): - logger.warning("Agent is already running. Please stop the current task first.") + logger.warning( + "Agent is already running. 
Please stop the current task first." + ) # Return an error status instead of yielding - return {"status": "error", "message": "Agent already running.", "task_id": self.current_task_id} + return { + "status": "error", + "message": "Agent already running.", + "task_id": self.current_task_id, + } self.current_task_id = task_id if task_id else str(uuid.uuid4()) output_dir = os.path.join(save_dir, self.current_task_id) os.makedirs(output_dir, exist_ok=True) - logger.info(f"[AsyncGen] Starting research task ID: {self.current_task_id} for topic: '{topic}'") + logger.info( + f"[AsyncGen] Starting research task ID: {self.current_task_id} for topic: '{topic}'" + ) logger.info(f"[AsyncGen] Output directory: {output_dir}") self.stop_event = threading.Event() _AGENT_STOP_FLAGS[self.current_task_id] = self.stop_event - agent_tools = await self._setup_tools(self.current_task_id, self.stop_event, max_parallel_browsers) + agent_tools = await self._setup_tools( + self.current_task_id, self.stop_event, max_parallel_browsers + ) initial_state: DeepResearchState = { "task_id": self.current_task_id, "topic": topic, @@ -894,11 +1047,15 @@ class DeepResearchAgent: initial_state.update(loaded_state) if loaded_state.get("research_plan"): logger.info( - f"Resuming with {len(loaded_state['research_plan'])} plan steps and {len(loaded_state.get('search_results', []))} existing results.") - initial_state[ - "topic"] = topic # Allow overriding topic even when resuming? Or use stored topic? Let's use new one. + f"Resuming with {len(loaded_state['research_plan'])} plan steps and {len(loaded_state.get('search_results', []))} existing results." + ) + initial_state["topic"] = ( + topic # Allow overriding topic even when resuming? Or use stored topic? Let's use new one. + ) else: - logger.warning(f"Resume requested for {task_id}, but no previous plan found. Starting fresh.") + logger.warning( + f"Resume requested for {task_id}, but no previous plan found. Starting fresh." 
+ ) initial_state["current_step_index"] = 0 # --- Execute Graph using ainvoke --- @@ -955,17 +1112,22 @@ class DeepResearchAgent: "status": status, "message": message, "task_id": task_id_to_clean, # Use the stored task_id - "final_state": final_state if final_state else {} # Return the final state dict + "final_state": final_state + if final_state + else {}, # Return the final state dict } async def _stop_lingering_browsers(self, task_id): """Attempts to stop any BrowserUseAgent instances associated with the task_id.""" - keys_to_stop = [key for key in _BROWSER_AGENT_INSTANCES if key.startswith(f"{task_id}_")] + keys_to_stop = [ + key for key in _BROWSER_AGENT_INSTANCES if key.startswith(f"{task_id}_") + ] if not keys_to_stop: return logger.warning( - f"Found {len(keys_to_stop)} potentially lingering browser agents for task {task_id}. Attempting stop...") + f"Found {len(keys_to_stop)} potentially lingering browser agents for task {task_id}. Attempting stop..." + ) for key in keys_to_stop: agent_instance = _BROWSER_AGENT_INSTANCES.get(key) try: @@ -974,7 +1136,9 @@ class DeepResearchAgent: await agent_instance.stop() logger.info(f"Called stop() on browser agent instance {key}") except Exception as e: - logger.error(f"Error calling stop() on browser agent instance {key}: {e}") + logger.error( + f"Error calling stop() on browser agent instance {key}: {e}" + ) async def stop(self): """Signals the currently running agent task to stop.""" diff --git a/src/webui/components/browser_use_agent_tab.py b/src/webui/components/browser_use_agent_tab.py index 25f56bf..1657086 100644 --- a/src/webui/components/browser_use_agent_tab.py +++ b/src/webui/components/browser_use_agent_tab.py @@ -1,61 +1,53 @@ -import pdb +import asyncio +import json +import logging +import os +import uuid +from typing import Any, AsyncGenerator, Dict, Optional import gradio as gr -from gradio.components import Component -import asyncio -import os -import json -import uuid -import logging -from datetime 
import datetime -from typing import List, Dict, Optional, Any, Set, Generator, AsyncGenerator, Union -from collections.abc import Awaitable -from langchain_core.language_models.chat_models import BaseChatModel -import base64 -from browser_use.browser.browser import Browser, BrowserConfig -from browser_use.browser.context import BrowserContext, BrowserContextConfig, BrowserContextWindowSize + # from browser_use.agent.service import Agent -from browser_use.agent.views import AgentHistoryList -from browser_use.agent.views import ToolCallingMethod # Adjust import from browser_use.agent.views import ( - REQUIRED_LLM_API_ENV_VARS, - ActionResult, - AgentError, - AgentHistory, AgentHistoryList, AgentOutput, - AgentSettings, - AgentState, - AgentStepInfo, - StepMetadata, - ToolCallingMethod, ) -from browser_use.browser.browser import Browser -from browser_use.browser.context import BrowserContext -from browser_use.browser.views import BrowserState, BrowserStateHistory +from browser_use.browser.browser import BrowserConfig +from browser_use.browser.context import BrowserContext, BrowserContextWindowSize +from browser_use.browser.views import BrowserState +from gradio.components import Component +from langchain_core.language_models.chat_models import BaseChatModel -from src.webui.webui_manager import WebuiManager +from src.agent.browser_use.browser_use_agent import BrowserUseAgent +from src.browser.custom_browser import CustomBrowser +from src.browser.custom_context import CustomBrowserContextConfig from src.controller.custom_controller import CustomController from src.utils import llm_provider -from src.browser.custom_browser import CustomBrowser -from src.browser.custom_context import CustomBrowserContext, CustomBrowserContextConfig -from src.agent.browser_use.browser_use_agent import BrowserUseAgent +from src.webui.webui_manager import WebuiManager logger = logging.getLogger(__name__) # --- Helper Functions --- (Defined at module level) -async def 
_initialize_llm(provider: Optional[str], model_name: Optional[str], temperature: float, - base_url: Optional[str], api_key: Optional[str], num_ctx: Optional[int] = None) -> Optional[ - BaseChatModel]: + +async def _initialize_llm( + provider: Optional[str], + model_name: Optional[str], + temperature: float, + base_url: Optional[str], + api_key: Optional[str], + num_ctx: Optional[int] = None, +) -> Optional[BaseChatModel]: """Initializes the LLM based on settings. Returns None if provider/model is missing.""" if not provider or not model_name: logger.info("LLM Provider or Model Name not specified, LLM will be None.") return None try: # Use your actual LLM provider logic here - logger.info(f"Initializing LLM: Provider={provider}, Model={model_name}, Temp={temperature}") + logger.info( + f"Initializing LLM: Provider={provider}, Model={model_name}, Temp={temperature}" + ) # Example using a placeholder function llm = llm_provider.get_llm_model( provider=provider, @@ -64,18 +56,23 @@ async def _initialize_llm(provider: Optional[str], model_name: Optional[str], te base_url=base_url or None, api_key=api_key or None, # Add other relevant params like num_ctx for ollama - num_ctx=num_ctx if provider == "ollama" else None + num_ctx=num_ctx if provider == "ollama" else None, ) return llm except Exception as e: logger.error(f"Failed to initialize LLM: {e}", exc_info=True) gr.Warning( - f"Failed to initialize LLM '{model_name}' for provider '{provider}'. Please check settings. Error: {e}") + f"Failed to initialize LLM '{model_name}' for provider '{provider}'. Please check settings. 
Error: {e}" + ) return None -def _get_config_value(webui_manager: WebuiManager, comp_dict: Dict[gr.components.Component, Any], comp_id_suffix: str, - default: Any = None) -> Any: +def _get_config_value( + webui_manager: WebuiManager, + comp_dict: Dict[gr.components.Component, Any], + comp_id_suffix: str, + default: Any = None, +) -> Any: """Safely get value from component dictionary using its ID suffix relative to the tab.""" # Assumes component ID format is "tab_name.comp_name" tab_name = "browser_use_agent" # Hardcode or derive if needed @@ -93,7 +90,9 @@ def _get_config_value(webui_manager: WebuiManager, comp_dict: Dict[gr.components return comp_dict.get(comp, default) except KeyError: continue - logger.warning(f"Component with suffix '{comp_id_suffix}' not found in manager for value lookup.") + logger.warning( + f"Component with suffix '{comp_id_suffix}' not found in manager for value lookup." + ) return default @@ -103,12 +102,14 @@ def _format_agent_output(model_output: AgentOutput) -> str: if model_output: try: # Directly use model_dump if actions and current_state are Pydantic models - action_dump = [action.model_dump(exclude_none=True) for action in model_output.action] + action_dump = [ + action.model_dump(exclude_none=True) for action in model_output.action + ] state_dump = model_output.current_state.model_dump(exclude_none=True) model_output_dump = { - 'current_state': state_dump, - 'action': action_dump, + "current_state": state_dump, + "action": action_dump, } # Dump to JSON string with indentation json_string = json.dumps(model_output_dump, indent=4, ensure_ascii=False) @@ -117,7 +118,8 @@ def _format_agent_output(model_output: AgentOutput) -> str: except AttributeError as ae: logger.error( - f"AttributeError during model dump: {ae}. Check if 'action' or 'current_state' or their items support 'model_dump'.") + f"AttributeError during model dump: {ae}. Check if 'action' or 'current_state' or their items support 'model_dump'." + ) content = f"
Error: Could not format agent output (AttributeError: {ae}).\nRaw output: {str(model_output)}
" except Exception as e: logger.error(f"Error formatting agent output: {e}", exc_info=True) @@ -129,12 +131,17 @@ def _format_agent_output(model_output: AgentOutput) -> str: # --- Updated Callback Implementation --- -async def _handle_new_step(webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int): + +async def _handle_new_step( + webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int +): """Callback for each step taken by the agent, including screenshot display.""" # Use the correct chat history attribute name from the user's code - if not hasattr(webui_manager, 'bu_chat_history'): - logger.error("Attribute 'bu_chat_history' not found in webui_manager! Cannot add chat message.") + if not hasattr(webui_manager, "bu_chat_history"): + logger.error( + "Attribute 'bu_chat_history' not found in webui_manager! Cannot add chat message." + ) # Initialize it maybe? Or raise an error? For now, log and potentially skip chat update. webui_manager.bu_chat_history = [] # Initialize if missing (consider if this is the right place) # return # Or stop if this is critical @@ -145,21 +152,29 @@ async def _handle_new_step(webui_manager: WebuiManager, state: BrowserState, out screenshot_html = "" # Ensure state.screenshot exists and is not empty before proceeding # Use getattr for safer access - screenshot_data = getattr(state, 'screenshot', None) + screenshot_data = getattr(state, "screenshot", None) if screenshot_data: try: # Basic validation: check if it looks like base64 - if isinstance(screenshot_data, str) and len(screenshot_data) > 100: # Arbitrary length check + if ( + isinstance(screenshot_data, str) and len(screenshot_data) > 100 + ): # Arbitrary length check # *** UPDATED STYLE: Removed centering, adjusted width *** img_tag = f'Step {step_num} Screenshot' - screenshot_html = img_tag + "
" # Use
for line break after inline-block image + screenshot_html = ( + img_tag + "
" + ) # Use
for line break after inline-block image else: logger.warning( - f"Screenshot for step {step_num} seems invalid (type: {type(screenshot_data)}, len: {len(screenshot_data) if isinstance(screenshot_data, str) else 'N/A'}).") + f"Screenshot for step {step_num} seems invalid (type: {type(screenshot_data)}, len: {len(screenshot_data) if isinstance(screenshot_data, str) else 'N/A'})." + ) screenshot_html = "**[Invalid screenshot data]**
" except Exception as e: - logger.error(f"Error processing or formatting screenshot for step {step_num}: {e}", exc_info=True) + logger.error( + f"Error processing or formatting screenshot for step {step_num}: {e}", + exc_info=True, + ) screenshot_html = "**[Error displaying screenshot]**
" else: logger.debug(f"No screenshot available for step {step_num}.") @@ -174,7 +189,7 @@ async def _handle_new_step(webui_manager: WebuiManager, state: BrowserState, out chat_message = { "role": "assistant", - "content": final_content.strip() # Remove leading/trailing whitespace + "content": final_content.strip(), # Remove leading/trailing whitespace } # Append to the correct chat history list @@ -186,8 +201,9 @@ async def _handle_new_step(webui_manager: WebuiManager, state: BrowserState, out def _handle_done(webui_manager: WebuiManager, history: AgentHistoryList): """Callback when the agent finishes the task (success or failure).""" logger.info( - f"Agent task finished. Duration: {history.total_duration_seconds():.2f}s, Tokens: {history.total_input_tokens()}") - final_summary = f"**Task Completed**\n" + f"Agent task finished. Duration: {history.total_duration_seconds():.2f}s, Tokens: {history.total_input_tokens()}" + ) + final_summary = "**Task Completed**\n" final_summary += f"- Duration: {history.total_duration_seconds():.2f} seconds\n" final_summary += f"- Total Input Tokens: {history.total_input_tokens()}\n" # Or total tokens if available @@ -201,20 +217,27 @@ def _handle_done(webui_manager: WebuiManager, history: AgentHistoryList): else: final_summary += "- Status: Success\n" - webui_manager.bu_chat_history.append({"role": "assistant", "content": final_summary}) + webui_manager.bu_chat_history.append( + {"role": "assistant", "content": final_summary} + ) -async def _ask_assistant_callback(webui_manager: WebuiManager, query: str, browser_context: BrowserContext) -> Dict[ - str, Any]: +async def _ask_assistant_callback( + webui_manager: WebuiManager, query: str, browser_context: BrowserContext +) -> Dict[str, Any]: """Callback triggered by the agent's ask_for_assistant action.""" logger.info("Agent requires assistance. 
Waiting for user input.") - if not hasattr(webui_manager, '_chat_history'): + if not hasattr(webui_manager, "_chat_history"): logger.error("Chat history not found in webui_manager during ask_assistant!") return {"response": "Internal Error: Cannot display help request."} - webui_manager.bu_chat_history.append({"role": "assistant", - "content": f"**Need Help:** {query}\nPlease provide information or perform the required action in the browser, then type your response/confirmation below and click 'Submit Response'."}) + webui_manager.bu_chat_history.append( + { + "role": "assistant", + "content": f"**Need Help:** {query}\nPlease provide information or perform the required action in the browser, then type your response/confirmation below and click 'Submit Response'.", + } + ) # Use state stored in webui_manager webui_manager.bu_response_event = asyncio.Event() @@ -222,38 +245,60 @@ async def _ask_assistant_callback(webui_manager: WebuiManager, query: str, brows try: logger.info("Waiting for user response event...") - await asyncio.wait_for(webui_manager.bu_response_event.wait(), timeout=3600.0) # Long timeout + await asyncio.wait_for( + webui_manager.bu_response_event.wait(), timeout=3600.0 + ) # Long timeout logger.info("User response event received.") except asyncio.TimeoutError: logger.warning("Timeout waiting for user assistance.") webui_manager.bu_chat_history.append( - {"role": "assistant", "content": "**Timeout:** No response received. Trying to proceed."}) + { + "role": "assistant", + "content": "**Timeout:** No response received. 
Trying to proceed.", + } + ) webui_manager.bu_response_event = None # Clear the event return {"response": "Timeout: User did not respond."} # Inform the agent response = webui_manager.bu_user_help_response - webui_manager.bu_chat_history.append({"role": "user", "content": response}) # Show user response in chat - webui_manager.bu_response_event = None # Clear the event for the next potential request + webui_manager.bu_chat_history.append( + {"role": "user", "content": response} + ) # Show user response in chat + webui_manager.bu_response_event = ( + None # Clear the event for the next potential request + ) return {"response": response} # --- Core Agent Execution Logic --- (Needs access to webui_manager) -async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]) -> AsyncGenerator[ - Dict[gr.components.Component, Any], None]: + +async def run_agent_task( + webui_manager: WebuiManager, components: Dict[gr.components.Component, Any] +) -> AsyncGenerator[Dict[gr.components.Component, Any], None]: """Handles the entire lifecycle of initializing and running the agent.""" # --- Get Components --- # Need handles to specific UI components to update them user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input") run_button_comp = webui_manager.get_component_by_id("browser_use_agent.run_button") - stop_button_comp = webui_manager.get_component_by_id("browser_use_agent.stop_button") - pause_resume_button_comp = webui_manager.get_component_by_id("browser_use_agent.pause_resume_button") - clear_button_comp = webui_manager.get_component_by_id("browser_use_agent.clear_button") + stop_button_comp = webui_manager.get_component_by_id( + "browser_use_agent.stop_button" + ) + pause_resume_button_comp = webui_manager.get_component_by_id( + "browser_use_agent.pause_resume_button" + ) + clear_button_comp = webui_manager.get_component_by_id( + "browser_use_agent.clear_button" + ) chatbot_comp = 
webui_manager.get_component_by_id("browser_use_agent.chatbot") - history_file_comp = webui_manager.get_component_by_id("browser_use_agent.agent_history_file") + history_file_comp = webui_manager.get_component_by_id( + "browser_use_agent.agent_history_file" + ) gif_comp = webui_manager.get_component_by_id("browser_use_agent.recording_gif") - browser_view_comp = webui_manager.get_component_by_id("browser_use_agent.browser_view") + browser_view_comp = webui_manager.get_component_by_id( + "browser_use_agent.browser_view" + ) # --- 1. Get Task and Initial UI Update --- task = components.get(user_input_comp, "").strip() @@ -266,7 +311,9 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon webui_manager.bu_chat_history.append({"role": "user", "content": task}) yield { - user_input_comp: gr.Textbox(value="", interactive=False, placeholder="Agent is running..."), + user_input_comp: gr.Textbox( + value="", interactive=False, placeholder="Agent is running..." + ), run_button_comp: gr.Button(value="ā³ Running...", interactive=False), stop_button_comp: gr.Button(interactive=True), pause_resume_button_comp: gr.Button(value="āøļø Pause", interactive=True), @@ -284,7 +331,9 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon override_system_prompt = get_setting("override_system_prompt") or None extend_system_prompt = get_setting("extend_system_prompt") or None - llm_provider_name = get_setting("llm_provider", None) # Default to None if not found + llm_provider_name = get_setting( + "llm_provider", None + ) # Default to None if not found llm_model_name = get_setting("llm_model_name", None) llm_temperature = get_setting("llm_temperature", 0.6) use_vision = get_setting("use_vision", True) @@ -296,9 +345,15 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon max_input_tokens = get_setting("max_input_tokens", 128000) tool_calling_str = get_setting("tool_calling_method", "auto") 
tool_calling_method = tool_calling_str if tool_calling_str != "None" else None - mcp_server_config_comp = webui_manager.id_to_component.get("agent_settings.mcp_server_config") - mcp_server_config_str = components.get(mcp_server_config_comp) if mcp_server_config_comp else None - mcp_server_config = json.loads(mcp_server_config_str) if mcp_server_config_str else None + mcp_server_config_comp = webui_manager.id_to_component.get( + "agent_settings.mcp_server_config" + ) + mcp_server_config_str = ( + components.get(mcp_server_config_comp) if mcp_server_config_comp else None + ) + mcp_server_config = ( + json.loads(mcp_server_config_str) if mcp_server_config_str else None + ) # Planner LLM Settings (Optional) planner_llm_provider_name = get_setting("planner_llm_provider") or None @@ -312,9 +367,12 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon planner_use_vision = get_setting("planner_use_vision", False) planner_llm = await _initialize_llm( - planner_llm_provider_name, planner_llm_model_name, planner_llm_temperature, - planner_llm_base_url, planner_llm_api_key, - planner_ollama_num_ctx if planner_llm_provider_name == "ollama" else None + planner_llm_provider_name, + planner_llm_model_name, + planner_llm_temperature, + planner_llm_base_url, + planner_llm_api_key, + planner_ollama_num_ctx if planner_llm_provider_name == "ollama" else None, ) # --- Browser Settings --- @@ -324,7 +382,9 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon browser_binary_path = get_browser_setting("browser_binary_path") or None browser_user_data_dir = get_browser_setting("browser_user_data_dir") or None - use_own_browser = get_browser_setting("use_own_browser", False) # Logic handled by CDP/WSS presence + use_own_browser = get_browser_setting( + "use_own_browser", False + ) # Logic handled by CDP/WSS presence keep_browser_open = get_browser_setting("keep_browser_open", False) headless = get_browser_setting("headless", False) 
disable_security = get_browser_setting("disable_security", True) @@ -334,29 +394,42 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon wss_url = get_browser_setting("wss_url") or None save_recording_path = get_browser_setting("save_recording_path") or None save_trace_path = get_browser_setting("save_trace_path") or None - save_agent_history_path = get_browser_setting("save_agent_history_path", "./tmp/agent_history") + save_agent_history_path = get_browser_setting( + "save_agent_history_path", "./tmp/agent_history" + ) save_download_path = get_browser_setting("save_download_path", "./tmp/downloads") stream_vw = 70 stream_vh = int(70 * window_h // window_w) os.makedirs(save_agent_history_path, exist_ok=True) - if save_recording_path: os.makedirs(save_recording_path, exist_ok=True) - if save_trace_path: os.makedirs(save_trace_path, exist_ok=True) - if save_download_path: os.makedirs(save_download_path, exist_ok=True) + if save_recording_path: + os.makedirs(save_recording_path, exist_ok=True) + if save_trace_path: + os.makedirs(save_trace_path, exist_ok=True) + if save_download_path: + os.makedirs(save_download_path, exist_ok=True) # --- 2. 
Initialize LLM --- main_llm = await _initialize_llm( - llm_provider_name, llm_model_name, llm_temperature, llm_base_url, llm_api_key, - ollama_num_ctx if llm_provider_name == "ollama" else None + llm_provider_name, + llm_model_name, + llm_temperature, + llm_base_url, + llm_api_key, + ollama_num_ctx if llm_provider_name == "ollama" else None, ) # Pass the webui_manager instance to the callback when wrapping it - async def ask_callback_wrapper(query: str, browser_context: BrowserContext) -> Dict[str, Any]: + async def ask_callback_wrapper( + query: str, browser_context: BrowserContext + ) -> Dict[str, Any]: return await _ask_assistant_callback(webui_manager, query, browser_context) if not webui_manager.bu_controller: - webui_manager.bu_controller = CustomController(ask_assistant_callback=ask_callback_wrapper) + webui_manager.bu_controller = CustomController( + ask_assistant_callback=ask_callback_wrapper + ) await webui_manager.bu_controller.setup_mcp_client(mcp_server_config) # --- 4. Initialize Browser and Context --- @@ -382,7 +455,9 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon extra_args.append(f"--user-data-dir={browser_user_data_dir}") if use_own_browser: - browser_binary_path = os.getenv("CHROME_PATH", None) or browser_binary_path + browser_binary_path = ( + os.getenv("CHROME_PATH", None) or browser_binary_path + ) if browser_binary_path == "": browser_binary_path = None chrome_user_data = os.getenv("CHROME_USER_DATA", None) @@ -406,24 +481,41 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon logger.info("Creating new browser context.") context_config = CustomBrowserContextConfig( trace_path=save_trace_path if save_trace_path else None, - save_recording_path=save_recording_path if save_recording_path else None, + save_recording_path=save_recording_path + if save_recording_path + else None, save_downloads_path=save_download_path if save_download_path else None, - 
browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h) + browser_window_size=BrowserContextWindowSize( + width=window_w, height=window_h + ), ) if not webui_manager.bu_browser: raise ValueError("Browser not initialized, cannot create context.") - webui_manager.bu_browser_context = await webui_manager.bu_browser.new_context(config=context_config) + webui_manager.bu_browser_context = ( + await webui_manager.bu_browser.new_context(config=context_config) + ) # --- 5. Initialize or Update Agent --- webui_manager.bu_agent_task_id = str(uuid.uuid4()) # New ID for this task run - os.makedirs(os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id), exist_ok=True) - history_file = os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id, - f"{webui_manager.bu_agent_task_id}.json") - gif_path = os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id, - f"{webui_manager.bu_agent_task_id}.gif") + os.makedirs( + os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id), + exist_ok=True, + ) + history_file = os.path.join( + save_agent_history_path, + webui_manager.bu_agent_task_id, + f"{webui_manager.bu_agent_task_id}.json", + ) + gif_path = os.path.join( + save_agent_history_path, + webui_manager.bu_agent_task_id, + f"{webui_manager.bu_agent_task_id}.gif", + ) # Pass the webui_manager to callbacks when wrapping them - async def step_callback_wrapper(state: BrowserState, output: AgentOutput, step_num: int): + async def step_callback_wrapper( + state: BrowserState, output: AgentOutput, step_num: int + ): await _handle_new_step(webui_manager, state, output, step_num) def done_callback_wrapper(history: AgentHistoryList): @@ -432,7 +524,9 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon if not webui_manager.bu_agent: logger.info(f"Initializing new agent for task: {task}") if not webui_manager.bu_browser or not webui_manager.bu_browser_context: - raise ValueError("Browser or 
Context not initialized, cannot create agent.") + raise ValueError( + "Browser or Context not initialized, cannot create agent." + ) webui_manager.bu_agent = BrowserUseAgent( task=task, llm=main_llm, @@ -448,7 +542,8 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon max_actions_per_step=max_actions, tool_calling_method=tool_calling_method, planner_llm=planner_llm, - use_vision_for_planner=planner_use_vision if planner_llm else False + use_vision_for_planner=planner_use_vision if planner_llm else False, + source="webui", ) webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id webui_manager.bu_agent.settings.generate_gif = gif_path @@ -473,7 +568,9 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon # Check for pause state if is_paused: yield { - pause_resume_button_comp: gr.update(value="ā–¶ļø Resume", interactive=True), + pause_resume_button_comp: gr.update( + value="ā–¶ļø Resume", interactive=True + ), stop_button_comp: gr.update(interactive=True), } # Wait until pause is released or task is stopped/done @@ -485,13 +582,19 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon break await asyncio.sleep(0.2) - if agent_task.done() or is_stopped: # If stopped or task finished while paused + if ( + agent_task.done() or is_stopped + ): # If stopped or task finished while paused break # If resumed, yield UI update yield { - pause_resume_button_comp: gr.update(value="āøļø Pause", interactive=True), - run_button_comp: gr.update(value="ā³ Running...", interactive=False), + pause_resume_button_comp: gr.update( + value="āøļø Pause", interactive=True + ), + run_button_comp: gr.update( + value="ā³ Running...", interactive=False + ), } # Check if agent stopped itself or stop button was pressed (which sets agent.state.stopped) @@ -500,9 +603,13 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon if not agent_task.done(): # 
Ensure the task coroutine finishes if agent just set flag try: - await asyncio.wait_for(agent_task, timeout=1.0) # Give it a moment to exit run() + await asyncio.wait_for( + agent_task, timeout=1.0 + ) # Give it a moment to exit run() except asyncio.TimeoutError: - logger.warning("Agent task did not finish quickly after stop signal, cancelling.") + logger.warning( + "Agent task did not finish quickly after stop signal, cancelling." + ) agent_task.cancel() except Exception: # Catch task exceptions if it errors on stop pass @@ -512,23 +619,34 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon update_dict = {} if webui_manager.bu_response_event is not None: update_dict = { - user_input_comp: gr.update(placeholder="Agent needs help. Enter response and submit.", - interactive=True), - run_button_comp: gr.update(value="āœ”ļø Submit Response", interactive=True), + user_input_comp: gr.update( + placeholder="Agent needs help. Enter response and submit.", + interactive=True, + ), + run_button_comp: gr.update( + value="āœ”ļø Submit Response", interactive=True + ), pause_resume_button_comp: gr.update(interactive=False), stop_button_comp: gr.update(interactive=False), - chatbot_comp: gr.update(value=webui_manager.bu_chat_history) + chatbot_comp: gr.update(value=webui_manager.bu_chat_history), } last_chat_len = len(webui_manager.bu_chat_history) yield update_dict # Wait until response is submitted or task finishes - while webui_manager.bu_response_event is not None and not agent_task.done(): + while ( + webui_manager.bu_response_event is not None + and not agent_task.done() + ): await asyncio.sleep(0.2) # Restore UI after response submitted or if task ended unexpectedly if not agent_task.done(): yield { - user_input_comp: gr.update(placeholder="Agent is running...", interactive=False), - run_button_comp: gr.update(value="ā³ Running...", interactive=False), + user_input_comp: gr.update( + placeholder="Agent is running...", interactive=False + 
), + run_button_comp: gr.update( + value="ā³ Running...", interactive=False + ), pause_resume_button_comp: gr.update(interactive=True), stop_button_comp: gr.update(interactive=True), } @@ -537,24 +655,33 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon # Update Chatbot if new messages arrived via callbacks if len(webui_manager.bu_chat_history) > last_chat_len: - update_dict[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history) + update_dict[chatbot_comp] = gr.update( + value=webui_manager.bu_chat_history + ) last_chat_len = len(webui_manager.bu_chat_history) # Update Browser View if headless and webui_manager.bu_browser_context: try: - screenshot_b64 = await webui_manager.bu_browser_context.take_screenshot() + screenshot_b64 = ( + await webui_manager.bu_browser_context.take_screenshot() + ) if screenshot_b64: html_content = f'' - update_dict[browser_view_comp] = gr.update(value=html_content, visible=True) + update_dict[browser_view_comp] = gr.update( + value=html_content, visible=True + ) else: html_content = f"

Waiting for browser session...

" - update_dict[browser_view_comp] = gr.update(value=html_content, - visible=True) + update_dict[browser_view_comp] = gr.update( + value=html_content, visible=True + ) except Exception as e: logger.debug(f"Failed to capture screenshot: {e}") - update_dict[browser_view_comp] = gr.update(value="
Error loading view...
", - visible=True) + update_dict[browser_view_comp] = gr.update( + value="
Error loading view...
", + visible=True, + ) else: update_dict[browser_view_comp] = gr.update(visible=False) @@ -589,16 +716,28 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon except asyncio.CancelledError: logger.info("Agent task was cancelled.") - if not any("Cancelled" in msg.get("content", "") for msg in webui_manager.bu_chat_history if - msg.get("role") == "assistant"): - webui_manager.bu_chat_history.append({"role": "assistant", "content": "**Task Cancelled**."}) + if not any( + "Cancelled" in msg.get("content", "") + for msg in webui_manager.bu_chat_history + if msg.get("role") == "assistant" + ): + webui_manager.bu_chat_history.append( + {"role": "assistant", "content": "**Task Cancelled**."} + ) final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history) except Exception as e: logger.error(f"Error during agent execution: {e}", exc_info=True) - error_message = f"**Agent Execution Error:**\n```\n{type(e).__name__}: {e}\n```" - if not any(error_message in msg.get("content", "") for msg in webui_manager.bu_chat_history if - msg.get("role") == "assistant"): - webui_manager.bu_chat_history.append({"role": "assistant", "content": error_message}) + error_message = ( + f"**Agent Execution Error:**\n```\n{type(e).__name__}: {e}\n```" + ) + if not any( + error_message in msg.get("content", "") + for msg in webui_manager.bu_chat_history + if msg.get("role") == "assistant" + ): + webui_manager.bu_chat_history.append( + {"role": "assistant", "content": error_message} + ) final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history) gr.Error(f"Agent execution failed: {e}") @@ -617,15 +756,23 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon webui_manager.bu_browser = None # --- 8. 
Final UI Update --- - final_update.update({ - user_input_comp: gr.update(value="", interactive=True, placeholder="Enter your next task..."), - run_button_comp: gr.update(value="ā–¶ļø Submit Task", interactive=True), - stop_button_comp: gr.update(value="ā¹ļø Stop", interactive=False), - pause_resume_button_comp: gr.update(value="āøļø Pause", interactive=False), - clear_button_comp: gr.update(interactive=True), - # Ensure final chat history is shown - chatbot_comp: gr.update(value=webui_manager.bu_chat_history) - }) + final_update.update( + { + user_input_comp: gr.update( + value="", + interactive=True, + placeholder="Enter your next task...", + ), + run_button_comp: gr.update(value="ā–¶ļø Submit Task", interactive=True), + stop_button_comp: gr.update(value="ā¹ļø Stop", interactive=False), + pause_resume_button_comp: gr.update( + value="āøļø Pause", interactive=False + ), + clear_button_comp: gr.update(interactive=True), + # Ensure final chat history is shown + chatbot_comp: gr.update(value=webui_manager.bu_chat_history), + } + ) yield final_update except Exception as e: @@ -633,19 +780,26 @@ async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.compon logger.error(f"Error setting up agent task: {e}", exc_info=True) webui_manager.bu_current_task = None # Ensure state is reset yield { - user_input_comp: gr.update(interactive=True, placeholder="Error during setup. Enter task..."), + user_input_comp: gr.update( + interactive=True, placeholder="Error during setup. Enter task..." 
+ ), run_button_comp: gr.update(value="ā–¶ļø Submit Task", interactive=True), stop_button_comp: gr.update(value="ā¹ļø Stop", interactive=False), pause_resume_button_comp: gr.update(value="āøļø Pause", interactive=False), clear_button_comp: gr.update(interactive=True), chatbot_comp: gr.update( - value=webui_manager.bu_chat_history + [{"role": "assistant", "content": f"**Setup Error:** {e}"}]), + value=webui_manager.bu_chat_history + + [{"role": "assistant", "content": f"**Setup Error:** {e}"}] + ), } # --- Button Click Handlers --- (Need access to webui_manager) -async def handle_submit(webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]): + +async def handle_submit( + webui_manager: WebuiManager, components: Dict[gr.components.Component, Any] +): """Handles clicks on the main 'Submit' button.""" user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input") user_input_value = components.get(user_input_comp, "").strip() @@ -653,17 +807,26 @@ async def handle_submit(webui_manager: WebuiManager, components: Dict[gr.compone # Check if waiting for user assistance if webui_manager.bu_response_event and not webui_manager.bu_response_event.is_set(): logger.info(f"User submitted assistance: {user_input_value}") - webui_manager.bu_user_help_response = user_input_value if user_input_value else "User provided no text response." + webui_manager.bu_user_help_response = ( + user_input_value if user_input_value else "User provided no text response." 
+ ) webui_manager.bu_response_event.set() # UI updates handled by the main loop reacting to the event being set yield { - user_input_comp: gr.update(value="", interactive=False, placeholder="Waiting for agent to continue..."), - webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(value="ā³ Running...", - interactive=False) + user_input_comp: gr.update( + value="", + interactive=False, + placeholder="Waiting for agent to continue...", + ), + webui_manager.get_component_by_id( + "browser_use_agent.run_button" + ): gr.update(value="ā³ Running...", interactive=False), } # Check if a task is currently running (using _current_task) elif webui_manager.bu_current_task and not webui_manager.bu_current_task.done(): - logger.warning("Submit button clicked while agent is already running and not asking for help.") + logger.warning( + "Submit button clicked while agent is already running and not asking for help." + ) gr.Info("Agent is currently running. Please wait or use Stop/Pause.") yield {} # No change else: @@ -685,19 +848,32 @@ async def handle_stop(webui_manager: WebuiManager): agent.state.stopped = True agent.state.paused = False # Ensure not paused if stopped return { - webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update(interactive=False, - value="ā¹ļø Stopping..."), - webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(interactive=False), - webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(interactive=False), + webui_manager.get_component_by_id( + "browser_use_agent.stop_button" + ): gr.update(interactive=False, value="ā¹ļø Stopping..."), + webui_manager.get_component_by_id( + "browser_use_agent.pause_resume_button" + ): gr.update(interactive=False), + webui_manager.get_component_by_id( + "browser_use_agent.run_button" + ): gr.update(interactive=False), } else: logger.warning("Stop clicked but agent is not running or task is already done.") # Reset UI 
just in case it's stuck return { - webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(interactive=True), - webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update(interactive=False), - webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(interactive=False), - webui_manager.get_component_by_id("browser_use_agent.clear_button"): gr.update(interactive=True), + webui_manager.get_component_by_id( + "browser_use_agent.run_button" + ): gr.update(interactive=True), + webui_manager.get_component_by_id( + "browser_use_agent.stop_button" + ): gr.update(interactive=False), + webui_manager.get_component_by_id( + "browser_use_agent.pause_resume_button" + ): gr.update(interactive=False), + webui_manager.get_component_by_id( + "browser_use_agent.clear_button" + ): gr.update(interactive=True), } @@ -712,16 +888,22 @@ async def handle_pause_resume(webui_manager: WebuiManager): agent.resume() # UI update happens in main loop return { - webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(value="āøļø Pause", - interactive=True)} # Optimistic update + webui_manager.get_component_by_id( + "browser_use_agent.pause_resume_button" + ): gr.update(value="āøļø Pause", interactive=True) + } # Optimistic update else: logger.info("Pause button clicked.") agent.pause() return { - webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(value="ā–¶ļø Resume", - interactive=True)} # Optimistic update + webui_manager.get_component_by_id( + "browser_use_agent.pause_resume_button" + ): gr.update(value="ā–¶ļø Resume", interactive=True) + } # Optimistic update else: - logger.warning("Pause/Resume clicked but agent is not running or doesn't support state.") + logger.warning( + "Pause/Resume clicked but agent is not running or doesn't support state." 
+ ) return {} # No change @@ -758,24 +940,39 @@ async def handle_clear(webui_manager: WebuiManager): # Reset UI components return { - webui_manager.get_component_by_id("browser_use_agent.chatbot"): gr.update(value=[]), - webui_manager.get_component_by_id("browser_use_agent.user_input"): gr.update(value="", - placeholder="Enter your task here..."), - webui_manager.get_component_by_id("browser_use_agent.agent_history_file"): gr.update(value=None), - webui_manager.get_component_by_id("browser_use_agent.recording_gif"): gr.update(value=None), + webui_manager.get_component_by_id("browser_use_agent.chatbot"): gr.update( + value=[] + ), + webui_manager.get_component_by_id("browser_use_agent.user_input"): gr.update( + value="", placeholder="Enter your task here..." + ), + webui_manager.get_component_by_id( + "browser_use_agent.agent_history_file" + ): gr.update(value=None), + webui_manager.get_component_by_id("browser_use_agent.recording_gif"): gr.update( + value=None + ), webui_manager.get_component_by_id("browser_use_agent.browser_view"): gr.update( - value="
Browser Cleared
"), - webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(value="ā–¶ļø Submit Task", - interactive=True), - webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update(interactive=False), - webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(value="āøļø Pause", - interactive=False), - webui_manager.get_component_by_id("browser_use_agent.clear_button"): gr.update(interactive=True), + value="
Browser Cleared
" + ), + webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update( + value="ā–¶ļø Submit Task", interactive=True + ), + webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update( + interactive=False + ), + webui_manager.get_component_by_id( + "browser_use_agent.pause_resume_button" + ): gr.update(value="āøļø Pause", interactive=False), + webui_manager.get_component_by_id("browser_use_agent.clear_button"): gr.update( + interactive=True + ), } # --- Tab Creation Function --- + def create_browser_use_agent_tab(webui_manager: WebuiManager): """ Create the run agent tab, defining UI, state, and handlers. @@ -799,12 +996,18 @@ def create_browser_use_agent_tab(webui_manager: WebuiManager): placeholder="Enter your task here or provide assistance when asked.", lines=3, interactive=True, - elem_id="user_input" + elem_id="user_input", ) with gr.Row(): - stop_button = gr.Button("ā¹ļø Stop", interactive=False, variant="stop", scale=2) - pause_resume_button = gr.Button("āøļø Pause", interactive=False, variant="secondary", scale=2, visible=True) - clear_button = gr.Button("šŸ—‘ļø Clear", interactive=True, variant="secondary", scale=2) + stop_button = gr.Button( + "ā¹ļø Stop", interactive=False, variant="stop", scale=2 + ) + pause_resume_button = gr.Button( + "āøļø Pause", interactive=False, variant="secondary", scale=2, visible=True + ) + clear_button = gr.Button( + "šŸ—‘ļø Clear", interactive=True, variant="secondary", scale=2 + ) run_button = gr.Button("ā–¶ļø Submit Task", variant="primary", scale=3) browser_view = gr.HTML( @@ -816,24 +1019,39 @@ def create_browser_use_agent_tab(webui_manager: WebuiManager): with gr.Column(): gr.Markdown("### Task Outputs") agent_history_file = gr.File(label="Agent History JSON", interactive=False) - recording_gif = gr.Image(label="Task Recording GIF", format="gif", interactive=False, - type="filepath") + recording_gif = gr.Image( + label="Task Recording GIF", + format="gif", + 
interactive=False, + type="filepath", + ) # --- Store Components in Manager --- tab_components.update( dict( - chatbot=chatbot, user_input=user_input, clear_button=clear_button, - run_button=run_button, stop_button=stop_button, pause_resume_button=pause_resume_button, - agent_history_file=agent_history_file, recording_gif=recording_gif, - browser_view=browser_view + chatbot=chatbot, + user_input=user_input, + clear_button=clear_button, + run_button=run_button, + stop_button=stop_button, + pause_resume_button=pause_resume_button, + agent_history_file=agent_history_file, + recording_gif=recording_gif, + browser_view=browser_view, ) ) - webui_manager.add_components("browser_use_agent", tab_components) # Use "browser_use_agent" as tab_name prefix + webui_manager.add_components( + "browser_use_agent", tab_components + ) # Use "browser_use_agent" as tab_name prefix - all_managed_components = set(webui_manager.get_components()) # Get all components known to manager + all_managed_components = set( + webui_manager.get_components() + ) # Get all components known to manager run_tab_outputs = list(tab_components.values()) - async def submit_wrapper(components_dict: Dict[Component, Any]) -> AsyncGenerator[Dict[Component, Any], None]: + async def submit_wrapper( + components_dict: Dict[Component, Any], + ) -> AsyncGenerator[Dict[Component, Any], None]: """Wrapper for handle_submit that yields its results.""" async for update in handle_submit(webui_manager, components_dict): yield update @@ -855,27 +1073,13 @@ def create_browser_use_agent_tab(webui_manager: WebuiManager): # --- Connect Event Handlers using the Wrappers -- run_button.click( - fn=submit_wrapper, - inputs=all_managed_components, - outputs=run_tab_outputs + fn=submit_wrapper, inputs=all_managed_components, outputs=run_tab_outputs ) user_input.submit( - fn=submit_wrapper, - inputs=all_managed_components, - outputs=run_tab_outputs - ) - stop_button.click( - fn=stop_wrapper, - inputs=None, - outputs=run_tab_outputs + 
fn=submit_wrapper, inputs=all_managed_components, outputs=run_tab_outputs ) + stop_button.click(fn=stop_wrapper, inputs=None, outputs=run_tab_outputs) pause_resume_button.click( - fn=pause_resume_wrapper, - inputs=None, - outputs=run_tab_outputs - ) - clear_button.click( - fn=clear_wrapper, - inputs=None, - outputs=run_tab_outputs + fn=pause_resume_wrapper, inputs=None, outputs=run_tab_outputs ) + clear_button.click(fn=clear_wrapper, inputs=None, outputs=run_tab_outputs) From c67bb6a0cc471fe391c87afa4e6bf876aa5f1e0d Mon Sep 17 00:00:00 2001 From: marginal23326 <58261815+marginal23326@users.noreply.github.com> Date: Sat, 3 May 2025 06:31:08 +0600 Subject: [PATCH 21/35] chore: remove duplicate imports --- src/utils/mcp_client.py | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/src/utils/mcp_client.py b/src/utils/mcp_client.py index b909d0d..126d49d 100644 --- a/src/utils/mcp_client.py +++ b/src/utils/mcp_client.py @@ -1,28 +1,15 @@ -import os -import asyncio -import base64 -import pdb -from typing import List, Tuple, Optional -from langchain_core.tools import BaseTool -from langchain_mcp_adapters.client import MultiServerMCPClient -import base64 -import json +import inspect import logging -from typing import Optional, Dict, Any, Type -from langchain_core.tools import BaseTool -from pydantic.v1 import BaseModel, Field -from langchain_core.runnables import RunnableConfig -from pydantic import BaseModel, Field, create_model -from typing import Type, Dict, Any, Optional, get_type_hints, List, Union, Annotated, Set -from pydantic import BaseModel, ConfigDict, create_model, Field -from langchain.tools import BaseTool -import inspect -from datetime import datetime, date, time import uuid +from datetime import date, datetime, time from enum import Enum -import inspect +from typing import Any, Dict, List, Optional, Set, Type, Union, get_type_hints + from browser_use.controller.registry.views import ActionModel -from typing 
import Type, Dict, Any, Optional, get_type_hints +from langchain.tools import BaseTool +from langchain_mcp_adapters.client import MultiServerMCPClient +from pydantic import BaseModel, Field, create_model +from pydantic.v1 import BaseModel, Field logger = logging.getLogger(__name__) From db4bffb526451a0bb78050c00093887645febeaa Mon Sep 17 00:00:00 2001 From: marginal23326 <58261815+marginal23326@users.noreply.github.com> Date: Sat, 3 May 2025 06:53:39 +0600 Subject: [PATCH 22/35] fix: address gradio deprecation warnings --- src/webui/components/browser_use_agent_tab.py | 1 - src/webui/webui_manager.py | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/webui/components/browser_use_agent_tab.py b/src/webui/components/browser_use_agent_tab.py index 1657086..1b38629 100644 --- a/src/webui/components/browser_use_agent_tab.py +++ b/src/webui/components/browser_use_agent_tab.py @@ -989,7 +989,6 @@ def create_browser_use_agent_tab(webui_manager: WebuiManager): type="messages", height=600, show_copy_button=True, - bubble_full_width=False, ) user_input = gr.Textbox( label="Your Task or Response", diff --git a/src/webui/webui_manager.py b/src/webui/webui_manager.py index b64e8d1..542d387 100644 --- a/src/webui/webui_manager.py +++ b/src/webui/webui_manager.py @@ -104,7 +104,10 @@ class WebuiManager: for comp_id, comp_val in ui_settings.items(): if comp_id in self.id_to_component: comp = self.id_to_component[comp_id] - update_components[comp] = comp.__class__(value=comp_val) + if comp.__class__.__name__ == "Chatbot": + update_components[comp] = comp.__class__(value=comp_val, type="messages") + else: + update_components[comp] = comp.__class__(value=comp_val) config_status = self.id_to_component["load_save_config.config_status"] update_components.update( From dc1bcf9d200e5130c6c99011ed3a88bd5f60c6e7 Mon Sep 17 00:00:00 2001 From: Tayyab Akmal <62791376+tayyabakmal1@users.noreply.github.com> Date: Tue, 6 May 2025 14:08:08 +0500 Subject: [PATCH 23/35] 
Update browser-use version requirements.txt Update browser-use version requirements.txt --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 01fe29a..4762a7e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -browser-use==0.1.42 +browser-use==0.1.43 pyperclip==1.9.0 gradio==5.27.0 json-repair @@ -7,4 +7,4 @@ MainContentExtractor==0.0.4 langchain-ibm==0.3.10 langchain_mcp_adapters==0.0.9 langgraph==0.3.34 -langchain-community \ No newline at end of file +langchain-community From 3c7ba914fb0c948ebff29f6f5efd03cb07e62dbd Mon Sep 17 00:00:00 2001 From: Tayyab Akmal <62791376+tayyabakmal1@users.noreply.github.com> Date: Tue, 6 May 2025 14:11:43 +0500 Subject: [PATCH 24/35] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4762a7e..bc8de8c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -browser-use==0.1.43 +browser-use==0.1.45 pyperclip==1.9.0 gradio==5.27.0 json-repair From 2f0b2cef43f5fadcb2053fc67f9cca3358ec6da7 Mon Sep 17 00:00:00 2001 From: Tayyab Akmal <62791376+tayyabakmal1@users.noreply.github.com> Date: Tue, 6 May 2025 14:16:47 +0500 Subject: [PATCH 25/35] Update custom_context.py --- src/browser/custom_context.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/browser/custom_context.py b/src/browser/custom_context.py index 753b4c5..8a59f8c 100644 --- a/src/browser/custom_context.py +++ b/src/browser/custom_context.py @@ -42,7 +42,10 @@ class CustomBrowserContext(BrowserContext): bypass_csp=self.config.disable_security, ignore_https_errors=self.config.disable_security, record_video_dir=self.config.save_recording_path, - record_video_size=self.config.browser_window_size.model_dump(), + record_video_size={ + "width": self.config.window_width, + "height": self.config.window_height + }, 
record_har_path=self.config.save_har_path, locale=self.config.locale, http_credentials=self.config.http_credentials, From 6f80bf60286c354e3b3ee2a1b0f46b7f9971782e Mon Sep 17 00:00:00 2001 From: Tayyab Akmal <62791376+tayyabakmal1@users.noreply.github.com> Date: Tue, 6 May 2025 14:20:52 +0500 Subject: [PATCH 26/35] Update browser_use_agent_tab.py --- src/webui/components/browser_use_agent_tab.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/webui/components/browser_use_agent_tab.py b/src/webui/components/browser_use_agent_tab.py index 1b38629..a3b0ca8 100644 --- a/src/webui/components/browser_use_agent_tab.py +++ b/src/webui/components/browser_use_agent_tab.py @@ -13,7 +13,7 @@ from browser_use.agent.views import ( AgentOutput, ) from browser_use.browser.browser import BrowserConfig -from browser_use.browser.context import BrowserContext, BrowserContextWindowSize +from browser_use.browser.context import BrowserContext from browser_use.browser.views import BrowserState from gradio.components import Component from langchain_core.language_models.chat_models import BaseChatModel @@ -485,9 +485,8 @@ async def run_agent_task( if save_recording_path else None, save_downloads_path=save_download_path if save_download_path else None, - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), + window_width=window_w, + window_height=window_h, ) if not webui_manager.bu_browser: raise ValueError("Browser not initialized, cannot create context.") From d938b39fe54a59b480593b18b9830297f4e67c1d Mon Sep 17 00:00:00 2001 From: Tayyab Akmal <62791376+tayyabakmal1@users.noreply.github.com> Date: Tue, 6 May 2025 14:23:00 +0500 Subject: [PATCH 27/35] Update deep_research_agent.py --- src/agent/deep_research/deep_research_agent.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/agent/deep_research/deep_research_agent.py b/src/agent/deep_research/deep_research_agent.py index 2f6c672..80b41e7 100644 --- 
a/src/agent/deep_research/deep_research_agent.py +++ b/src/agent/deep_research/deep_research_agent.py @@ -8,7 +8,6 @@ from pathlib import Path from typing import Any, Dict, List, Optional, TypedDict from browser_use.browser.browser import BrowserConfig -from browser_use.browser.context import BrowserContextWindowSize from langchain_community.tools.file_management import ( ListDirectoryTool, ReadFileTool, @@ -107,9 +106,8 @@ async def run_single_browser_task( context_config = CustomBrowserContextConfig( save_downloads_path="./tmp/downloads", - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), + window_width=window_w, + window_height=window_h, force_new_context=True, ) bu_browser_context = await bu_browser.new_context(config=context_config) From eb91cb64ec0f675350687062c7ad4f0d0bad6b71 Mon Sep 17 00:00:00 2001 From: vincent Date: Fri, 9 May 2025 09:27:12 +0800 Subject: [PATCH 28/35] update to bu==0.1.43 and fix deep research --- requirements.txt | 2 +- src/agent/browser_use/browser_use_agent.py | 84 ++++++++++------ .../deep_research/deep_research_agent.py | 81 ++++++++-------- src/browser/custom_browser.py | 21 ++-- src/browser/custom_context.py | 97 ------------------- src/controller/custom_controller.py | 4 + src/webui/components/browser_use_agent_tab.py | 71 +++++++------- .../components/deep_research_agent_tab.py | 4 +- tests/test_agents.py | 64 ++++++------ tests/test_controller.py | 57 ++++++----- 10 files changed, 218 insertions(+), 267 deletions(-) diff --git a/requirements.txt b/requirements.txt index bc8de8c..4762a7e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -browser-use==0.1.45 +browser-use==0.1.43 pyperclip==1.9.0 gradio==5.27.0 json-repair diff --git a/src/agent/browser_use/browser_use_agent.py b/src/agent/browser_use/browser_use_agent.py index 9234bca..49d671f 100644 --- a/src/agent/browser_use/browser_use_agent.py +++ b/src/agent/browser_use/browser_use_agent.py @@ -8,9 +8,13 @@ import os 
from browser_use.agent.gif import create_history_gif from browser_use.agent.service import Agent, AgentHookFunc from browser_use.agent.views import ( + ActionResult, + AgentHistory, AgentHistoryList, AgentStepInfo, + ToolCallingMethod, ) +from browser_use.browser.views import BrowserStateHistory from browser_use.telemetry.views import ( AgentEndTelemetryEvent, ) @@ -21,17 +25,15 @@ load_dotenv() logger = logging.getLogger(__name__) SKIP_LLM_API_KEY_VERIFICATION = ( - os.environ.get("SKIP_LLM_API_KEY_VERIFICATION", "false").lower()[0] in "ty1" + os.environ.get("SKIP_LLM_API_KEY_VERIFICATION", "false").lower()[0] in "ty1" ) class BrowserUseAgent(Agent): @time_execution_async("--run (agent)") async def run( - self, - max_steps: int = 100, - on_step_start: AgentHookFunc | None = None, - on_step_end: AgentHookFunc | None = None, + self, max_steps: int = 100, on_step_start: AgentHookFunc | None = None, + on_step_end: AgentHookFunc | None = None ) -> AgentHistoryList: """Execute the task with maximum number of steps""" @@ -49,41 +51,28 @@ class BrowserUseAgent(Agent): ) signal_handler.register() - # Wait for verification task to complete if it exists - if hasattr(self, "_verification_task") and not self._verification_task.done(): - try: - await self._verification_task - except Exception: - # Error already logged in the task - pass - try: self._log_agent_run() # Execute initial actions if provided if self.initial_actions: - result = await self.multi_act( - self.initial_actions, check_for_new_elements=False - ) + result = await self.multi_act(self.initial_actions, check_for_new_elements=False) self.state.last_result = result for step in range(max_steps): # Check if waiting for user input after Ctrl+C - while self.state.paused: - await asyncio.sleep(0.5) - if self.state.stopped: - break + if self.state.paused: + signal_handler.wait_for_resume() + signal_handler.reset() # Check if we should stop due to too many failures if self.state.consecutive_failures >= 
self.settings.max_failures: - logger.error( - f"āŒ Stopping due to {self.settings.max_failures} consecutive failures" - ) + logger.error(f'āŒ Stopping due to {self.settings.max_failures} consecutive failures') break # Check control flags before each step if self.state.stopped: - logger.info("Agent stopped") + logger.info('Agent stopped') break while self.state.paused: @@ -108,15 +97,30 @@ class BrowserUseAgent(Agent): await self.log_completion() break else: - logger.info("āŒ Failed to complete task in maximum steps") + error_message = 'Failed to complete task in maximum steps' + + self.state.history.history.append( + AgentHistory( + model_output=None, + result=[ActionResult(error=error_message, include_in_memory=True)], + state=BrowserStateHistory( + url='', + title='', + tabs=[], + interacted_element=[], + screenshot=None, + ), + metadata=None, + ) + ) + + logger.info(f'āŒ {error_message}') return self.state.history except KeyboardInterrupt: # Already handled by our signal handler, but catch any direct KeyboardInterrupt as well - logger.info( - "Got KeyboardInterrupt during execution, returning current history" - ) + logger.info('Got KeyboardInterrupt during execution, returning current history') return self.state.history finally: @@ -136,13 +140,29 @@ class BrowserUseAgent(Agent): ) ) + if self.settings.save_playwright_script_path: + logger.info( + f'Agent run finished. 
Attempting to save Playwright script to: {self.settings.save_playwright_script_path}' + ) + try: + # Extract sensitive data keys if sensitive_data is provided + keys = list(self.sensitive_data.keys()) if self.sensitive_data else None + # Pass browser and context config to the saving method + self.state.history.save_as_playwright_script( + self.settings.save_playwright_script_path, + sensitive_data_keys=keys, + browser_config=self.browser.config, + context_config=self.browser_context.config, + ) + except Exception as script_gen_err: + # Log any error during script generation/saving + logger.error(f'Failed to save Playwright script: {script_gen_err}', exc_info=True) + await self.close() if self.settings.generate_gif: - output_path: str = "agent_history.gif" + output_path: str = 'agent_history.gif' if isinstance(self.settings.generate_gif, str): output_path = self.settings.generate_gif - create_history_gif( - task=self.task, history=self.state.history, output_path=output_path - ) + create_history_gif(task=self.task, history=self.state.history, output_path=output_path) diff --git a/src/agent/deep_research/deep_research_agent.py b/src/agent/deep_research/deep_research_agent.py index 80b41e7..278d251 100644 --- a/src/agent/deep_research/deep_research_agent.py +++ b/src/agent/deep_research/deep_research_agent.py @@ -29,9 +29,10 @@ from langchain_core.tools import StructuredTool, Tool from langgraph.graph import StateGraph from pydantic import BaseModel, Field +from browser_use.browser.context import BrowserContextWindowSize, BrowserContextConfig + from src.agent.browser_use.browser_use_agent import BrowserUseAgent from src.browser.custom_browser import CustomBrowser -from src.browser.custom_context import CustomBrowserContextConfig from src.controller.custom_controller import CustomController from src.utils.mcp_client import setup_mcp_client_and_tools @@ -47,12 +48,12 @@ _BROWSER_AGENT_INSTANCES = {} async def run_single_browser_task( - task_query: str, - task_id: str, - 
llm: Any, # Pass the main LLM - browser_config: Dict[str, Any], - stop_event: threading.Event, - use_vision: bool = False, + task_query: str, + task_id: str, + llm: Any, # Pass the main LLM + browser_config: Dict[str, Any], + stop_event: threading.Event, + use_vision: bool = False, ) -> Dict[str, Any]: """ Runs a single BrowserUseAgent task. @@ -104,10 +105,9 @@ async def run_single_browser_task( ) ) - context_config = CustomBrowserContextConfig( + context_config = BrowserContextConfig( save_downloads_path="./tmp/downloads", - window_width=window_w, - window_height=window_h, + browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h), force_new_context=True, ) bu_browser_context = await bu_browser.new_context(config=context_config) @@ -198,12 +198,12 @@ class BrowserSearchInput(BaseModel): async def _run_browser_search_tool( - queries: List[str], - task_id: str, # Injected dependency - llm: Any, # Injected dependency - browser_config: Dict[str, Any], - stop_event: threading.Event, - max_parallel_browsers: int = 1, + queries: List[str], + task_id: str, # Injected dependency + llm: Any, # Injected dependency + browser_config: Dict[str, Any], + stop_event: threading.Event, + max_parallel_browsers: int = 1, ) -> List[Dict[str, Any]]: """ Internal function to execute parallel browser searches based on LLM-provided queries. 
@@ -267,11 +267,11 @@ async def _run_browser_search_tool( def create_browser_search_tool( - llm: Any, - browser_config: Dict[str, Any], - task_id: str, - stop_event: threading.Event, - max_parallel_browsers: int = 1, + llm: Any, + browser_config: Dict[str, Any], + task_id: str, + stop_event: threading.Event, + max_parallel_browsers: int = 1, ) -> StructuredTool: """Factory function to create the browser search tool with necessary dependencies.""" # Use partial to bind the dependencies that aren't part of the LLM call arguments @@ -553,7 +553,7 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: else: current_task_message = [ SystemMessage( - content="You are a research assistant executing one step of a research plan. Use the available tools, especially the 'parallel_browser_search' tool, to gather information needed for the current task. Be precise with your search queries if using the browser tool." + content="You are a research assistant executing one step of a research plan. Use the available tools, especially the 'parallel_browser_search' tool, to gather information needed for the current task. Be precise with your search queries if using the browser tool. Please output at least one tool." ), HumanMessage( content=f"Research Task (Step {current_step['step']}): {current_step['task']}" @@ -582,8 +582,11 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: _save_plan_to_md(plan, output_dir) return { "research_plan": plan, - "current_step_index": current_index + 1, - "error_message": f"LLM failed to call a tool for step {current_step['step']}.", + "status": "pending", + "current_step_index": current_index, + "messages": [ + f"LLM failed to call a tool for step {current_step['step']}. Response: {ai_response.content}" + f". 
Please use tool to do research unless you are thinking or summary"], } # Process tool calls @@ -665,8 +668,8 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: browser_tool_called = "parallel_browser_search" in executed_tool_names # We might need a more nuanced status based on the *content* of tool_results step_failed = ( - any("Error:" in str(tr.content) for tr in tool_results) - or not browser_tool_called + any("Error:" in str(tr.content) for tr in tool_results) + or not browser_tool_called ) if step_failed: @@ -695,9 +698,9 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: "search_results": current_search_results, # Update with new results "current_step_index": current_index + 1, "messages": state["messages"] - + current_task_message - + [ai_response] - + tool_results, + + current_task_message + + [ai_response] + + tool_results, # Optionally return the tool_results messages if needed by downstream nodes } @@ -879,10 +882,10 @@ def should_continue(state: DeepResearchState) -> str: class DeepResearchAgent: def __init__( - self, - llm: Any, - browser_config: Dict[str, Any], - mcp_server_config: Optional[Dict[str, Any]] = None, + self, + llm: Any, + browser_config: Dict[str, Any], + mcp_server_config: Optional[Dict[str, Any]] = None, ): """ Initializes the DeepSearchAgent. 
@@ -904,7 +907,7 @@ class DeepResearchAgent: self.runner: Optional[asyncio.Task] = None # To hold the asyncio task for run async def _setup_tools( - self, task_id: str, stop_event: threading.Event, max_parallel_browsers: int = 1 + self, task_id: str, stop_event: threading.Event, max_parallel_browsers: int = 1 ) -> List[Tool]: """Sets up the basic tools (File I/O) and optional MCP tools.""" tools = [ @@ -981,11 +984,11 @@ class DeepResearchAgent: return app async def run( - self, - topic: str, - task_id: Optional[str] = None, - save_dir: str = "./tmp/deep_research", - max_parallel_browsers: int = 1, + self, + topic: str, + task_id: Optional[str] = None, + save_dir: str = "./tmp/deep_research", + max_parallel_browsers: int = 1, ) -> Dict[str, Any]: """ Starts the deep research process (Async Generator Version). diff --git a/src/browser/custom_browser.py b/src/browser/custom_browser.py index 02875e3..676ec49 100644 --- a/src/browser/custom_browser.py +++ b/src/browser/custom_browser.py @@ -26,25 +26,33 @@ from browser_use.browser.utils.screen_resolution import get_screen_resolution, g from browser_use.utils import time_execution_async import socket -from .custom_context import CustomBrowserContext, CustomBrowserContextConfig +from .custom_context import CustomBrowserContext logger = logging.getLogger(__name__) class CustomBrowser(Browser): - async def new_context(self, config: CustomBrowserContextConfig | None = None) -> CustomBrowserContext: + async def new_context(self, config: BrowserContextConfig | None = None) -> CustomBrowserContext: """Create a browser context""" browser_config = self.config.model_dump() if self.config else {} context_config = config.model_dump() if config else {} merged_config = {**browser_config, **context_config} - return CustomBrowserContext(config=CustomBrowserContextConfig(**merged_config), browser=self) + return CustomBrowserContext(config=BrowserContextConfig(**merged_config), browser=self) async def _setup_builtin_browser(self, 
playwright: Playwright) -> PlaywrightBrowser: """Sets up and returns a Playwright Browser instance with anti-detection measures.""" assert self.config.browser_binary_path is None, 'browser_binary_path should be None if trying to use the builtin browsers' - if self.config.headless: + # Use the configured window size from new_context_config if available + if ( + not self.config.headless + and hasattr(self.config, 'new_context_config') + and hasattr(self.config.new_context_config, 'browser_window_size') + ): + screen_size = self.config.new_context_config.browser_window_size.model_dump() + offset_x, offset_y = get_window_adjustments() + elif self.config.headless: screen_size = {'width': 1920, 'height': 1080} offset_x, offset_y = 0, 0 else: @@ -52,6 +60,7 @@ class CustomBrowser(Browser): offset_x, offset_y = get_window_adjustments() chrome_args = { + f'--remote-debugging-port={self.config.chrome_remote_debugging_port}', *CHROME_ARGS, *(CHROME_DOCKER_ARGS if IN_DOCKER else []), *(CHROME_HEADLESS_ARGS if self.config.headless else []), @@ -70,8 +79,8 @@ class CustomBrowser(Browser): # check if port 9222 is already taken, if so remove the remote-debugging-port arg to prevent conflicts with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - if s.connect_ex(('localhost', 9222)) == 0: - chrome_args.remove('--remote-debugging-port=9222') + if s.connect_ex(('localhost', self.config.chrome_remote_debugging_port)) == 0: + chrome_args.remove(f'--remote-debugging-port={self.config.chrome_remote_debugging_port}') browser_class = getattr(playwright, self.config.browser_class) args = { diff --git a/src/browser/custom_context.py b/src/browser/custom_context.py index 8a59f8c..c146d34 100644 --- a/src/browser/custom_context.py +++ b/src/browser/custom_context.py @@ -12,10 +12,6 @@ from browser_use.browser.context import BrowserContextState logger = logging.getLogger(__name__) -class CustomBrowserContextConfig(BrowserContextConfig): - force_new_context: bool = False # force to 
create new context - - class CustomBrowserContext(BrowserContext): def __init__( self, @@ -24,96 +20,3 @@ class CustomBrowserContext(BrowserContext): state: Optional[BrowserContextState] = None, ): super(CustomBrowserContext, self).__init__(browser=browser, config=config, state=state) - - async def _create_context(self, browser: PlaywrightBrowser): - """Creates a new browser context with anti-detection measures and loads cookies if available.""" - if not self.config.force_new_context and self.browser.config.cdp_url and len(browser.contexts) > 0: - context = browser.contexts[0] - elif not self.config.force_new_context and self.browser.config.browser_binary_path and len( - browser.contexts) > 0: - # Connect to existing Chrome instance instead of creating new one - context = browser.contexts[0] - else: - # Original code for creating new context - context = await browser.new_context( - no_viewport=True, - user_agent=self.config.user_agent, - java_script_enabled=True, - bypass_csp=self.config.disable_security, - ignore_https_errors=self.config.disable_security, - record_video_dir=self.config.save_recording_path, - record_video_size={ - "width": self.config.window_width, - "height": self.config.window_height - }, - record_har_path=self.config.save_har_path, - locale=self.config.locale, - http_credentials=self.config.http_credentials, - is_mobile=self.config.is_mobile, - has_touch=self.config.has_touch, - geolocation=self.config.geolocation, - permissions=self.config.permissions, - timezone_id=self.config.timezone_id, - ) - - if self.config.trace_path: - await context.tracing.start(screenshots=True, snapshots=True, sources=True) - - # Load cookies if they exist - if self.config.cookies_file and os.path.exists(self.config.cookies_file): - with open(self.config.cookies_file, 'r') as f: - try: - cookies = json.load(f) - - valid_same_site_values = ['Strict', 'Lax', 'None'] - for cookie in cookies: - if 'sameSite' in cookie: - if cookie['sameSite'] not in 
valid_same_site_values: - logger.warning( - f"Fixed invalid sameSite value '{cookie['sameSite']}' to 'None' for cookie {cookie.get('name')}" - ) - cookie['sameSite'] = 'None' - logger.info(f'šŸŖ Loaded {len(cookies)} cookies from {self.config.cookies_file}') - await context.add_cookies(cookies) - - except json.JSONDecodeError as e: - logger.error(f'Failed to parse cookies file: {str(e)}') - - # Expose anti-detection scripts - await context.add_init_script( - """ - // Webdriver property - Object.defineProperty(navigator, 'webdriver', { - get: () => undefined - }); - - // Languages - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US'] - }); - - // Plugins - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5] - }); - - // Chrome runtime - window.chrome = { runtime: {} }; - - // Permissions - const originalQuery = window.navigator.permissions.query; - window.navigator.permissions.query = (parameters) => ( - parameters.name === 'notifications' ? - Promise.resolve({ state: Notification.permission }) : - originalQuery(parameters) - ); - (function () { - const originalAttachShadow = Element.prototype.attachShadow; - Element.prototype.attachShadow = function attachShadow(options) { - return originalAttachShadow.call(this, { ...options, mode: "open" }); - }; - })(); - """ - ) - - return context diff --git a/src/controller/custom_controller.py b/src/controller/custom_controller.py index d07c88b..00e050c 100644 --- a/src/controller/custom_controller.py +++ b/src/controller/custom_controller.py @@ -172,6 +172,10 @@ class CustomController(Controller): param_model=create_tool_param_model(tool), ) logger.info(f"Add mcp tool: {tool_name}") + logger.debug( + f"Registered {len(self.mcp_client.server_name_to_tools[server_name])} mcp tools for {server_name}") + else: + logger.warning(f"MCP client not started.") async def close_mcp_client(self): if self.mcp_client: diff --git a/src/webui/components/browser_use_agent_tab.py 
b/src/webui/components/browser_use_agent_tab.py index a3b0ca8..b3c00a0 100644 --- a/src/webui/components/browser_use_agent_tab.py +++ b/src/webui/components/browser_use_agent_tab.py @@ -13,14 +13,13 @@ from browser_use.agent.views import ( AgentOutput, ) from browser_use.browser.browser import BrowserConfig -from browser_use.browser.context import BrowserContext +from browser_use.browser.context import BrowserContext, BrowserContextWindowSize, BrowserContextConfig from browser_use.browser.views import BrowserState from gradio.components import Component from langchain_core.language_models.chat_models import BaseChatModel from src.agent.browser_use.browser_use_agent import BrowserUseAgent from src.browser.custom_browser import CustomBrowser -from src.browser.custom_context import CustomBrowserContextConfig from src.controller.custom_controller import CustomController from src.utils import llm_provider from src.webui.webui_manager import WebuiManager @@ -32,12 +31,12 @@ logger = logging.getLogger(__name__) async def _initialize_llm( - provider: Optional[str], - model_name: Optional[str], - temperature: float, - base_url: Optional[str], - api_key: Optional[str], - num_ctx: Optional[int] = None, + provider: Optional[str], + model_name: Optional[str], + temperature: float, + base_url: Optional[str], + api_key: Optional[str], + num_ctx: Optional[int] = None, ) -> Optional[BaseChatModel]: """Initializes the LLM based on settings. 
Returns None if provider/model is missing.""" if not provider or not model_name: @@ -68,10 +67,10 @@ async def _initialize_llm( def _get_config_value( - webui_manager: WebuiManager, - comp_dict: Dict[gr.components.Component, Any], - comp_id_suffix: str, - default: Any = None, + webui_manager: WebuiManager, + comp_dict: Dict[gr.components.Component, Any], + comp_id_suffix: str, + default: Any = None, ) -> Any: """Safely get value from component dictionary using its ID suffix relative to the tab.""" # Assumes component ID format is "tab_name.comp_name" @@ -133,7 +132,7 @@ def _format_agent_output(model_output: AgentOutput) -> str: async def _handle_new_step( - webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int + webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int ): """Callback for each step taken by the agent, including screenshot display.""" @@ -157,12 +156,12 @@ async def _handle_new_step( try: # Basic validation: check if it looks like base64 if ( - isinstance(screenshot_data, str) and len(screenshot_data) > 100 + isinstance(screenshot_data, str) and len(screenshot_data) > 100 ): # Arbitrary length check # *** UPDATED STYLE: Removed centering, adjusted width *** img_tag = f'Step {step_num} Screenshot' screenshot_html = ( - img_tag + "
<br/>" + img_tag + "<br/>" ) # Use <br/>
for line break after inline-block image else: logger.warning( @@ -223,7 +222,7 @@ def _handle_done(webui_manager: WebuiManager, history: AgentHistoryList): async def _ask_assistant_callback( - webui_manager: WebuiManager, query: str, browser_context: BrowserContext + webui_manager: WebuiManager, query: str, browser_context: BrowserContext ) -> Dict[str, Any]: """Callback triggered by the agent's ask_for_assistant action.""" logger.info("Agent requires assistance. Waiting for user input.") @@ -274,7 +273,7 @@ async def _ask_assistant_callback( async def run_agent_task( - webui_manager: WebuiManager, components: Dict[gr.components.Component, Any] + webui_manager: WebuiManager, components: Dict[gr.components.Component, Any] ) -> AsyncGenerator[Dict[gr.components.Component, Any], None]: """Handles the entire lifecycle of initializing and running the agent.""" @@ -358,6 +357,7 @@ async def run_agent_task( # Planner LLM Settings (Optional) planner_llm_provider_name = get_setting("planner_llm_provider") or None planner_llm = None + planner_use_vision = False if planner_llm_provider_name: planner_llm_model_name = get_setting("planner_llm_model_name") planner_llm_temperature = get_setting("planner_llm_temperature", 0.6) @@ -387,7 +387,7 @@ async def run_agent_task( ) # Logic handled by CDP/WSS presence keep_browser_open = get_browser_setting("keep_browser_open", False) headless = get_browser_setting("headless", False) - disable_security = get_browser_setting("disable_security", True) + disable_security = get_browser_setting("disable_security", False) window_w = int(get_browser_setting("window_w", 1280)) window_h = int(get_browser_setting("window_h", 1100)) cdp_url = get_browser_setting("cdp_url") or None @@ -422,7 +422,7 @@ async def run_agent_task( # Pass the webui_manager instance to the callback when wrapping it async def ask_callback_wrapper( - query: str, browser_context: BrowserContext + query: str, browser_context: BrowserContext ) -> Dict[str, Any]: return await 
_ask_assistant_callback(webui_manager, query, browser_context) @@ -456,7 +456,7 @@ async def run_agent_task( if use_own_browser: browser_binary_path = ( - os.getenv("CHROME_PATH", None) or browser_binary_path + os.getenv("CHROME_PATH", None) or browser_binary_path ) if browser_binary_path == "": browser_binary_path = None @@ -479,14 +479,13 @@ async def run_agent_task( # Create Context if needed if not webui_manager.bu_browser_context: logger.info("Creating new browser context.") - context_config = CustomBrowserContextConfig( + context_config = BrowserContextConfig( trace_path=save_trace_path if save_trace_path else None, save_recording_path=save_recording_path if save_recording_path else None, save_downloads_path=save_download_path if save_download_path else None, - window_width=window_w, - window_height=window_h, + browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h), ) if not webui_manager.bu_browser: raise ValueError("Browser not initialized, cannot create context.") @@ -513,7 +512,7 @@ async def run_agent_task( # Pass the webui_manager to callbacks when wrapping them async def step_callback_wrapper( - state: BrowserState, output: AgentOutput, step_num: int + state: BrowserState, output: AgentOutput, step_num: int ): await _handle_new_step(webui_manager, state, output, step_num) @@ -582,7 +581,7 @@ async def run_agent_task( await asyncio.sleep(0.2) if ( - agent_task.done() or is_stopped + agent_task.done() or is_stopped ): # If stopped or task finished while paused break @@ -633,8 +632,8 @@ async def run_agent_task( yield update_dict # Wait until response is submitted or task finishes while ( - webui_manager.bu_response_event is not None - and not agent_task.done() + webui_manager.bu_response_event is not None + and not agent_task.done() ): await asyncio.sleep(0.2) # Restore UI after response submitted or if task ended unexpectedly @@ -716,9 +715,9 @@ async def run_agent_task( except asyncio.CancelledError: logger.info("Agent task was 
cancelled.") if not any( - "Cancelled" in msg.get("content", "") - for msg in webui_manager.bu_chat_history - if msg.get("role") == "assistant" + "Cancelled" in msg.get("content", "") + for msg in webui_manager.bu_chat_history + if msg.get("role") == "assistant" ): webui_manager.bu_chat_history.append( {"role": "assistant", "content": "**Task Cancelled**."} @@ -730,9 +729,9 @@ async def run_agent_task( f"**Agent Execution Error:**\n```\n{type(e).__name__}: {e}\n```" ) if not any( - error_message in msg.get("content", "") - for msg in webui_manager.bu_chat_history - if msg.get("role") == "assistant" + error_message in msg.get("content", "") + for msg in webui_manager.bu_chat_history + if msg.get("role") == "assistant" ): webui_manager.bu_chat_history.append( {"role": "assistant", "content": error_message} @@ -788,7 +787,7 @@ async def run_agent_task( clear_button_comp: gr.update(interactive=True), chatbot_comp: gr.update( value=webui_manager.bu_chat_history - + [{"role": "assistant", "content": f"**Setup Error:** {e}"}] + + [{"role": "assistant", "content": f"**Setup Error:** {e}"}] ), } @@ -797,7 +796,7 @@ async def run_agent_task( async def handle_submit( - webui_manager: WebuiManager, components: Dict[gr.components.Component, Any] + webui_manager: WebuiManager, components: Dict[gr.components.Component, Any] ): """Handles clicks on the main 'Submit' button.""" user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input") @@ -1048,7 +1047,7 @@ def create_browser_use_agent_tab(webui_manager: WebuiManager): run_tab_outputs = list(tab_components.values()) async def submit_wrapper( - components_dict: Dict[Component, Any], + components_dict: Dict[Component, Any], ) -> AsyncGenerator[Dict[Component, Any], None]: """Wrapper for handle_submit that yields its results.""" async for update in handle_submit(webui_manager, components_dict): diff --git a/src/webui/components/deep_research_agent_tab.py b/src/webui/components/deep_research_agent_tab.py index 
430b4e0..ff455b5 100644 --- a/src/webui/components/deep_research_agent_tab.py +++ b/src/webui/components/deep_research_agent_tab.py @@ -116,7 +116,7 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon # LLM Config (from agent_settings tab) llm_provider_name = get_setting("agent_settings", "llm_provider") llm_model_name = get_setting("agent_settings", "llm_model_name") - llm_temperature = get_setting("agent_settings", "llm_temperature", 0.5) # Default if not found + llm_temperature = max(get_setting("agent_settings", "llm_temperature", 0.5), 0.5) llm_base_url = get_setting("agent_settings", "llm_base_url") llm_api_key = get_setting("agent_settings", "llm_api_key") ollama_num_ctx = get_setting("agent_settings", "ollama_num_ctx") @@ -132,7 +132,7 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon # Note: DeepResearchAgent constructor takes a dict, not full Browser/Context objects browser_config_dict = { "headless": get_setting("browser_settings", "headless", False), - "disable_security": get_setting("browser_settings", "disable_security", True), + "disable_security": get_setting("browser_settings", "disable_security", False), "browser_binary_path": get_setting("browser_settings", "browser_binary_path"), "user_data_dir": get_setting("browser_settings", "browser_user_data_dir"), "window_width": int(get_setting("browser_settings", "window_w", 1280)), diff --git a/tests/test_agents.py b/tests/test_agents.py index ffa743f..d485c70 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -26,9 +26,9 @@ async def test_browser_use_agent(): from browser_use.agent.service import Agent from src.browser.custom_browser import CustomBrowser - from src.browser.custom_context import CustomBrowserContextConfig from src.controller.custom_controller import CustomController from src.utils import llm_provider + from src.agent.browser_use.browser_use_agent import BrowserUseAgent # llm = utils.get_llm_model( # 
provider="openai", @@ -77,15 +77,15 @@ async def test_browser_use_agent(): mcp_server_config = { "mcpServers": { - "markitdown": { - "command": "docker", - "args": [ - "run", - "--rm", - "-i", - "markitdown-mcp:latest" - ] - }, + # "markitdown": { + # "command": "docker", + # "args": [ + # "run", + # "--rm", + # "-i", + # "markitdown-mcp:latest" + # ] + # }, "desktop-commander": { "command": "npx", "args": [ @@ -97,8 +97,8 @@ async def test_browser_use_agent(): } controller = CustomController() await controller.setup_mcp_client(mcp_server_config) - use_own_browser = False - disable_security = True + use_own_browser = True + disable_security = False use_vision = True # Set to False when using DeepSeek max_actions_per_step = 10 @@ -125,7 +125,7 @@ async def test_browser_use_agent(): ) ) browser_context = await browser.new_context( - config=CustomBrowserContextConfig( + config=BrowserContextConfig( trace_path="./tmp/traces", save_recording_path="./tmp/record_videos", save_downloads_path="./tmp/downloads", @@ -135,8 +135,9 @@ async def test_browser_use_agent(): force_new_context=True ) ) - agent = Agent( - task="download pdf from https://arxiv.org/abs/2504.10458 and rename this pdf to 'GUI-r1-test.pdf'", + agent = BrowserUseAgent( + # task="download pdf from https://arxiv.org/pdf/2311.16498 and rename this pdf to 'mcp-test.pdf'", + task="give me nvidia stock price", llm=llm, browser=browser, browser_context=browser_context, @@ -153,7 +154,6 @@ async def test_browser_use_agent(): print("\nErrors:") pprint(history.errors(), indent=4) - except Exception: import traceback traceback.print_exc() @@ -182,9 +182,9 @@ async def test_browser_use_parallel(): from browser_use.agent.service import Agent from src.browser.custom_browser import CustomBrowser - from src.browser.custom_context import CustomBrowserContextConfig from src.controller.custom_controller import CustomController from src.utils import llm_provider + from src.agent.browser_use.browser_use_agent import 
BrowserUseAgent # llm = utils.get_llm_model( # provider="openai", @@ -233,15 +233,15 @@ async def test_browser_use_parallel(): mcp_server_config = { "mcpServers": { - "markitdown": { - "command": "docker", - "args": [ - "run", - "--rm", - "-i", - "markitdown-mcp:latest" - ] - }, + # "markitdown": { + # "command": "docker", + # "args": [ + # "run", + # "--rm", + # "-i", + # "markitdown-mcp:latest" + # ] + # }, "desktop-commander": { "command": "npx", "args": [ @@ -262,7 +262,7 @@ async def test_browser_use_parallel(): controller = CustomController() await controller.setup_mcp_client(mcp_server_config) use_own_browser = False - disable_security = True + disable_security = False use_vision = True # Set to False when using DeepSeek max_actions_per_step = 10 @@ -289,7 +289,7 @@ async def test_browser_use_parallel(): ) ) browser_context = await browser.new_context( - config=CustomBrowserContextConfig( + config=BrowserContextConfig( trace_path="./tmp/traces", save_recording_path="./tmp/record_videos", save_downloads_path="./tmp/downloads", @@ -300,7 +300,7 @@ async def test_browser_use_parallel(): ) ) agents = [ - Agent(task=task, llm=llm, browser=browser, controller=controller) + BrowserUseAgent(task=task, llm=llm, browser=browser, controller=controller) for task in [ 'Search Google for weather in Tokyo', # 'Check Reddit front page title', @@ -332,6 +332,8 @@ async def test_browser_use_parallel(): await browser_context.close() if browser: await browser.close() + if controller: + await controller.close_mcp_client() async def test_deep_research_agent(): @@ -362,8 +364,8 @@ async def test_deep_research_agent(): browser_config = {"headless": False, "window_width": 1280, "window_height": 1100, "use_own_browser": False} agent = DeepResearchAgent(llm=llm, browser_config=browser_config, mcp_server_config=mcp_server_config) - research_topic = "Impact of Microplastics on Marine Ecosystems" - task_id_to_resume = "815460fb-337a-4850-8fa4-a5f2db301a89" # Set this to resume a previous 
task ID + research_topic = "Give me a detailed travel plan to Switzerland from June 1st to 10th." + task_id_to_resume = "" # Set this to resume a previous task ID print(f"Starting research on: {research_topic}") diff --git a/tests/test_controller.py b/tests/test_controller.py index 5234c46..173bae4 100644 --- a/tests/test_controller.py +++ b/tests/test_controller.py @@ -14,20 +14,31 @@ async def test_mcp_client(): from src.utils.mcp_client import setup_mcp_client_and_tools, create_tool_param_model test_server_config = { - "playwright": { - "command": "npx", - "args": [ - "@playwright/mcp@latest", - ], - "transport": "stdio", - }, - "filesystem": { - "command": "npx", - "args": [ - "-y", - "@modelcontextprotocol/server-filesystem", - "/Users/warmshao/ai_workspace", - ] + "mcpServers": { + # "markitdown": { + # "command": "docker", + # "args": [ + # "run", + # "--rm", + # "-i", + # "markitdown-mcp:latest" + # ] + # }, + "desktop-commander": { + "command": "npx", + "args": [ + "-y", + "@wonderwhy-er/desktop-commander" + ] + }, + # "filesystem": { + # "command": "npx", + # "args": [ + # "-y", + # "@modelcontextprotocol/server-filesystem", + # "/Users/xxx/ai_workspace", + # ] + # }, } } @@ -48,15 +59,15 @@ async def test_controller_with_mcp(): mcp_server_config = { "mcpServers": { - "markitdown": { - "command": "docker", - "args": [ - "run", - "--rm", - "-i", - "markitdown-mcp:latest" - ] - }, + # "markitdown": { + # "command": "docker", + # "args": [ + # "run", + # "--rm", + # "-i", + # "markitdown-mcp:latest" + # ] + # }, "desktop-commander": { "command": "npx", "args": [ From 81c0f4777526e2ae55e8e5b170bb0cdfbe9bcf21 Mon Sep 17 00:00:00 2001 From: vincent Date: Fri, 9 May 2025 09:34:41 +0800 Subject: [PATCH 29/35] set deafult browser security --- src/agent/deep_research/deep_research_agent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agent/deep_research/deep_research_agent.py b/src/agent/deep_research/deep_research_agent.py index 
278d251..8f95eec 100644 --- a/src/agent/deep_research/deep_research_agent.py +++ b/src/agent/deep_research/deep_research_agent.py @@ -97,7 +97,7 @@ async def run_single_browser_task( bu_browser = CustomBrowser( config=BrowserConfig( headless=headless, - disable_security=disable_security, + disable_security=False, browser_binary_path=browser_binary_path, extra_browser_args=extra_args, wss_url=wss_url, @@ -553,7 +553,7 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: else: current_task_message = [ SystemMessage( - content="You are a research assistant executing one step of a research plan. Use the available tools, especially the 'parallel_browser_search' tool, to gather information needed for the current task. Be precise with your search queries if using the browser tool. Please output at least one tool." + content="You are a research assistant executing one step of a research plan. Use the available tools, especially the 'parallel_browser_search' tool, to gather information needed for the current task. Be precise with your search queries if using the browser tool." 
), HumanMessage( content=f"Research Task (Step {current_step['step']}): {current_step['task']}" From a04773266c9cb171cfb1501d82d906e2d6e5c0fa Mon Sep 17 00:00:00 2001 From: alexwarm Date: Fri, 9 May 2025 20:34:30 +0800 Subject: [PATCH 30/35] merge dockerfile --- .env.example | 14 +- Dockerfile | 44 +- Dockerfile.arm64 | 85 -- README.md | 132 +-- docker-compose.yml | 58 +- entrypoint.sh | 4 - requirements.txt | 2 +- src/agent/browser_use/browser_use_agent.py | 17 + .../deep_research/deep_research_agent.py | 768 ++++++++++-------- src/webui/components/browser_settings_tab.py | 5 +- src/webui/components/browser_use_agent_tab.py | 19 +- supervisord.conf | 6 +- tests/test_agents.py | 108 ++- 13 files changed, 625 insertions(+), 637 deletions(-) delete mode 100644 Dockerfile.arm64 delete mode 100644 entrypoint.sh diff --git a/.env.example b/.env.example index ad0bc6a..2e007b2 100644 --- a/.env.example +++ b/.env.example @@ -40,14 +40,14 @@ ANONYMIZED_TELEMETRY=false # LogLevel: Set to debug to enable verbose logging, set to result to get results only. 
Available: result | debug | info BROWSER_USE_LOGGING_LEVEL=info -# Chrome settings -CHROME_PATH= -CHROME_USER_DATA= -CHROME_DEBUGGING_PORT=9222 -CHROME_DEBUGGING_HOST=localhost +# Browser settings +BROWSER_PATH= +BROWSER_USER_DATA= +BROWSER_DEBUGGING_PORT=9222 +BROWSER_DEBUGGING_HOST=localhost # Set to true to keep browser open between AI tasks -CHROME_PERSISTENT_SESSION=false -CHROME_CDP= +KEEP_BROWSER_OPEN=true +BROWSER_CDP= # Display settings # Format: WIDTHxHEIGHTxDEPTH RESOLUTION=1920x1080x24 diff --git a/Dockerfile b/Dockerfile index 7b6d39f..b4d6fa1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,8 @@ FROM python:3.11-slim +# Set platform for multi-arch builds (Docker Buildx will set this) +ARG TARGETPLATFORM + # Install system dependencies RUN apt-get update && apt-get install -y \ wget \ @@ -28,7 +31,6 @@ RUN apt-get update && apt-get install -y \ fonts-liberation \ dbus \ xauth \ - xvfb \ x11vnc \ tigervnc-tools \ supervisor \ @@ -47,33 +49,45 @@ RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \ && git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \ && ln -s /opt/novnc/vnc.html /opt/novnc/index.html -# Set platform for ARM64 compatibility -ARG TARGETPLATFORM=linux/amd64 - # Set up working directory WORKDIR /app # Copy requirements and install Python dependencies COPY requirements.txt . 
-RUN pip install --no-cache-dir -r requirements.txt +# Ensure 'patchright' is in your requirements.txt or install it directly +# RUN pip install --no-cache-dir -r requirements.txt patchright # If not in requirements +RUN pip install --no-cache-dir -r requirements.txt # Assuming patchright is in requirements.txt +RUN pip install --no-cache-dir patchright # Or install it explicitly -# Install Playwright and browsers with system dependencies -ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright -RUN playwright install --with-deps chromium -RUN playwright install-deps +# Install Patchright browsers and dependencies +# Patchright documentation suggests PLAYWRIGHT_BROWSERS_PATH is still relevant +# or that Patchright installs to a similar default location that Playwright would. +# Let's assume Patchright respects PLAYWRIGHT_BROWSERS_PATH or its default install location is findable. +ENV PLAYWRIGHT_BROWSERS_PATH=/ms-browsers +RUN mkdir -p $PLAYWRIGHT_BROWSERS_PATH + +# Install recommended: Google Chrome (instead of just Chromium for better undetectability) +# The 'patchright install chrome' command might download and place it. +# The '--with-deps' equivalent for patchright install is to run 'patchright install-deps chrome' after. +RUN patchright install chrome +RUN patchright install-deps chrome + +# Alternative: Install Chromium if Google Chrome is problematic in certain environments +RUN patchright install chromium +RUN patchright install-deps chromium # Copy the application code COPY . . -# Set environment variables +# Set environment variables (Updated Names) ENV PYTHONUNBUFFERED=1 ENV BROWSER_USE_LOGGING_LEVEL=info -ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome +# BROWSER_PATH will be determined by Patchright installation, supervisord will find it. 
ENV ANONYMIZED_TELEMETRY=false ENV DISPLAY=:99 ENV RESOLUTION=1920x1080x24 -ENV VNC_PASSWORD=vncpassword -ENV CHROME_PERSISTENT_SESSION=true +ENV VNC_PASSWORD=youvncpassword +ENV KEEP_BROWSER_OPEN=true ENV RESOLUTION_WIDTH=1920 ENV RESOLUTION_HEIGHT=1080 @@ -81,6 +95,6 @@ ENV RESOLUTION_HEIGHT=1080 RUN mkdir -p /var/log/supervisor COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf -EXPOSE 7788 6080 5901 +EXPOSE 7788 6080 5901 9222 -CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] +CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] \ No newline at end of file diff --git a/Dockerfile.arm64 b/Dockerfile.arm64 deleted file mode 100644 index 6d7a3ff..0000000 --- a/Dockerfile.arm64 +++ /dev/null @@ -1,85 +0,0 @@ -FROM python:3.11-slim - -# Install system dependencies -RUN apt-get update && apt-get install -y \ - wget \ - gnupg \ - curl \ - unzip \ - xvfb \ - libgconf-2-4 \ - libxss1 \ - libnss3 \ - libnspr4 \ - libasound2 \ - libatk1.0-0 \ - libatk-bridge2.0-0 \ - libcups2 \ - libdbus-1-3 \ - libdrm2 \ - libgbm1 \ - libgtk-3-0 \ - libxcomposite1 \ - libxdamage1 \ - libxfixes3 \ - libxrandr2 \ - xdg-utils \ - fonts-liberation \ - dbus \ - xauth \ - xvfb \ - x11vnc \ - tigervnc-tools \ - supervisor \ - net-tools \ - procps \ - git \ - python3-numpy \ - fontconfig \ - fonts-dejavu \ - fonts-dejavu-core \ - fonts-dejavu-extra \ - && rm -rf /var/lib/apt/lists/* - -# Install noVNC -RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \ - && git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \ - && ln -s /opt/novnc/vnc.html /opt/novnc/index.html - -# Set platform explicitly for ARM64 -ARG TARGETPLATFORM=linux/arm64 - -# Set up working directory -WORKDIR /app - -# Copy requirements and install Python dependencies -COPY requirements.txt . 
-RUN pip install --no-cache-dir -r requirements.txt - -# Install Playwright and browsers with system dependencies optimized for ARM64 -ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright -RUN PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 pip install playwright && \ - playwright install --with-deps chromium - -# Copy the application code -COPY . . - -# Set environment variables -ENV PYTHONUNBUFFERED=1 -ENV BROWSER_USE_LOGGING_LEVEL=info -ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome -ENV ANONYMIZED_TELEMETRY=false -ENV DISPLAY=:99 -ENV RESOLUTION=1920x1080x24 -ENV VNC_PASSWORD=vncpassword -ENV CHROME_PERSISTENT_SESSION=true -ENV RESOLUTION_WIDTH=1920 -ENV RESOLUTION_HEIGHT=1080 - -# Set up supervisor configuration -RUN mkdir -p /var/log/supervisor -COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf - -EXPOSE 7788 6080 5901 - -CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] \ No newline at end of file diff --git a/README.md b/README.md index 91fb7fa..238dd40 100644 --- a/README.md +++ b/README.md @@ -23,10 +23,6 @@ We would like to officially thank [WarmShao](https://github.com/warmshao) for hi ## Installation Guide -### Prerequisites -- Python 3.11 or higher -- Git (for cloning the repository) - ### Option 1: Local Installation Read the [quickstart guide](https://docs.browser-use.com/quickstart#prepare-the-environment) or follow the steps below to get started. @@ -65,10 +61,13 @@ Install Python packages: uv pip install -r requirements.txt ``` -Install Browsers in Playwright: -You can install specific browsers by running: +Install Browsers in Patchright. ```bash -patchright install chromium +patchright install +``` +Or you can install specific browsers by running: +```bash +patchright install chromium --with-deps --no-shell ``` #### Step 4: Configure Environment @@ -83,6 +82,42 @@ cp .env.example .env ``` 2. Open `.env` in your preferred text editor and add your API keys and other settings +#### Local Setup +1. 
**Run the WebUI:** + After completing the installation steps above, start the application: + ```bash + python webui.py --ip 127.0.0.1 --port 7788 + ``` +2. WebUI options: + - `--ip`: The IP address to bind the WebUI to. Default is `127.0.0.1`. + - `--port`: The port to bind the WebUI to. Default is `7788`. + - `--theme`: The theme for the user interface. Default is `Ocean`. + - **Default**: The standard theme with a balanced design. + - **Soft**: A gentle, muted color scheme for a relaxed viewing experience. + - **Monochrome**: A grayscale theme with minimal color for simplicity and focus. + - **Glass**: A sleek, semi-transparent design for a modern appearance. + - **Origin**: A classic, retro-inspired theme for a nostalgic feel. + - **Citrus**: A vibrant, citrus-inspired palette with bright and fresh colors. + - **Ocean** (default): A blue, ocean-inspired theme providing a calming effect. + - `--dark-mode`: Enables dark mode for the user interface. +3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`. +4. **Using Your Own Browser(Optional):** + - Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. Leave `CHROME_USER_DATA` empty if you want to use local user data. + - Windows + ```env + CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe" + CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data" + ``` + > Note: Replace `YourUsername` with your actual Windows username for Windows systems. + - Mac + ```env + CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" + CHROME_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome" + ``` + - Close all Chrome windows + - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent. 
+ - Check the "Use Own Browser" option within the Browser Settings. + ### Option 2: Docker Installation #### Prerequisites @@ -118,95 +153,12 @@ docker compose up --build CHROME_PERSISTENT_SESSION=true docker compose up --build ``` - 4. Access the Application: - Web Interface: Open `http://localhost:7788` in your browser - VNC Viewer (for watching browser interactions): Open `http://localhost:6080/vnc.html` - Default VNC password: "youvncpassword" - Can be changed by setting `VNC_PASSWORD` in your `.env` file -## Usage - -### Local Setup -1. **Run the WebUI:** - After completing the installation steps above, start the application: - ```bash - python webui.py --ip 127.0.0.1 --port 7788 - ``` -2. WebUI options: - - `--ip`: The IP address to bind the WebUI to. Default is `127.0.0.1`. - - `--port`: The port to bind the WebUI to. Default is `7788`. - - `--theme`: The theme for the user interface. Default is `Ocean`. - - **Default**: The standard theme with a balanced design. - - **Soft**: A gentle, muted color scheme for a relaxed viewing experience. - - **Monochrome**: A grayscale theme with minimal color for simplicity and focus. - - **Glass**: A sleek, semi-transparent design for a modern appearance. - - **Origin**: A classic, retro-inspired theme for a nostalgic feel. - - **Citrus**: A vibrant, citrus-inspired palette with bright and fresh colors. - - **Ocean** (default): A blue, ocean-inspired theme providing a calming effect. - - `--dark-mode`: Enables dark mode for the user interface. -3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`. -4. **Using Your Own Browser(Optional):** - - Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. Leave `CHROME_USER_DATA` empty if you want to use local user data. 
- - Windows - ```env - CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe" - CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data" - ``` - > Note: Replace `YourUsername` with your actual Windows username for Windows systems. - - Mac - ```env - CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" - CHROME_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome" - ``` - - Close all Chrome windows - - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent. - - Check the "Use Own Browser" option within the Browser Settings. -5. **Keep Browser Open(Optional):** - - Set `CHROME_PERSISTENT_SESSION=true` in the `.env` file. - -### Docker Setup -1. **Environment Variables:** - - All configuration is done through the `.env` file - - Available environment variables: - ``` - # LLM API Keys - OPENAI_API_KEY=your_key_here - ANTHROPIC_API_KEY=your_key_here - GOOGLE_API_KEY=your_key_here - - # Browser Settings - CHROME_PERSISTENT_SESSION=true # Set to true to keep browser open between AI tasks - RESOLUTION=1920x1080x24 # Custom resolution format: WIDTHxHEIGHTxDEPTH - RESOLUTION_WIDTH=1920 # Custom width in pixels - RESOLUTION_HEIGHT=1080 # Custom height in pixels - - # VNC Settings - VNC_PASSWORD=your_vnc_password # Optional, defaults to "vncpassword" - ``` - -2. **Platform Support:** - - Supports both AMD64 and ARM64 architectures - - For ARM64 systems (e.g., Apple Silicon Macs), the container will automatically use the appropriate image - -3. 
**Browser Persistence Modes:** - - **Default Mode (CHROME_PERSISTENT_SESSION=false):** - - Browser opens and closes with each AI task - - Clean state for each interaction - - Lower resource usage - - - **Persistent Mode (CHROME_PERSISTENT_SESSION=true):** - - Browser stays open between AI tasks - - Maintains history and state - - Allows viewing previous AI interactions - - Set in `.env` file or via environment variable when starting container - -4. **Viewing Browser Interactions:** - - Access the noVNC viewer at `http://localhost:6080/vnc.html` - - Enter the VNC password (default: "vncpassword" or what you set in VNC_PASSWORD) - - Direct VNC access available on port 5900 (mapped to container port 5901) - - You can now see all browser interactions in real-time - 5. **Container Management:** ```bash # Start with persistent browser diff --git a/docker-compose.yml b/docker-compose.yml index 75b0fd0..780d2a9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,62 +1,76 @@ services: browser-use-webui: - platform: linux/amd64 build: context: . 
- dockerfile: ${DOCKERFILE:-Dockerfile} + dockerfile: Dockerfile args: TARGETPLATFORM: ${TARGETPLATFORM:-linux/amd64} ports: - - "7788:7788" # Gradio default port - - "6080:6080" # noVNC web interface - - "5901:5901" # VNC port - - "9222:9222" # Chrome remote debugging port + - "7788:7788" + - "6080:6080" + - "5901:5901" + - "9222:9222" environment: + # LLM API Keys & Endpoints (Your existing list) - OPENAI_ENDPOINT=${OPENAI_ENDPOINT:-https://api.openai.com/v1} - OPENAI_API_KEY=${OPENAI_API_KEY:-} - - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} - ANTHROPIC_ENDPOINT=${ANTHROPIC_ENDPOINT:-https://api.anthropic.com} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} - GOOGLE_API_KEY=${GOOGLE_API_KEY:-} - AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_ENDPOINT:-} - AZURE_OPENAI_API_KEY=${AZURE_OPENAI_API_KEY:-} + - AZURE_OPENAI_API_VERSION=${AZURE_OPENAI_API_VERSION:-2025-01-01-preview} - DEEPSEEK_ENDPOINT=${DEEPSEEK_ENDPOINT:-https://api.deepseek.com} - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-} - OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-http://localhost:11434} - - MISTRAL_API_KEY=${MISTRAL_API_KEY:-} - MISTRAL_ENDPOINT=${MISTRAL_ENDPOINT:-https://api.mistral.ai/v1} + - MISTRAL_API_KEY=${MISTRAL_API_KEY:-} - ALIBABA_ENDPOINT=${ALIBABA_ENDPOINT:-https://dashscope.aliyuncs.com/compatible-mode/v1} - ALIBABA_API_KEY=${ALIBABA_API_KEY:-} - MOONSHOT_ENDPOINT=${MOONSHOT_ENDPOINT:-https://api.moonshot.cn/v1} - MOONSHOT_API_KEY=${MOONSHOT_API_KEY:-} - - IBM_API_KEY=${IBM_API_KEY:-} + - UNBOUND_ENDPOINT=${UNBOUND_ENDPOINT:-https://api.getunbound.ai} + - UNBOUND_API_KEY=${UNBOUND_API_KEY:-} + - SiliconFLOW_ENDPOINT=${SiliconFLOW_ENDPOINT:-https://api.siliconflow.cn/v1/} + - SiliconFLOW_API_KEY=${SiliconFLOW_API_KEY:-} - IBM_ENDPOINT=${IBM_ENDPOINT:-https://us-south.ml.cloud.ibm.com} + - IBM_API_KEY=${IBM_API_KEY:-} - IBM_PROJECT_ID=${IBM_PROJECT_ID:-} - - BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info} + + # Application Settings - ANONYMIZED_TELEMETRY=${ANONYMIZED_TELEMETRY:-false} - - 
CHROME_PATH=/usr/bin/google-chrome - - CHROME_USER_DATA=/app/data/chrome_data - - CHROME_PERSISTENT_SESSION=${CHROME_PERSISTENT_SESSION:-false} - - CHROME_CDP=${CHROME_CDP:-http://localhost:9222} + - BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info} + + # Browser Settings + - BROWSER_USER_DATA=${BROWSER_USER_DATA:-/app/data/chrome_data} + - BROWSER_DEBUGGING_PORT=${BROWSER_DEBUGGING_PORT:-9222} + - BROWSER_DEBUGGING_HOST=${BROWSER_DEBUGGING_HOST:-0.0.0.0} + - KEEP_BROWSER_OPEN=${KEEP_BROWSER_OPEN:-true} + - BROWSER_CDP=${BROWSER_CDP:-} # e.g., http://localhost:9222 + + # Display Settings - DISPLAY=:99 - - PLAYWRIGHT_BROWSERS_PATH=/ms-playwright + # This ENV is used by the Dockerfile during build time if Patchright respects it. + # It's not strictly needed at runtime by docker-compose unless your app or scripts also read it. + - PLAYWRIGHT_BROWSERS_PATH=/ms-browsers # Matches Dockerfile ENV - RESOLUTION=${RESOLUTION:-1920x1080x24} - RESOLUTION_WIDTH=${RESOLUTION_WIDTH:-1920} - RESOLUTION_HEIGHT=${RESOLUTION_HEIGHT:-1080} - - VNC_PASSWORD=${VNC_PASSWORD:-vncpassword} - - CHROME_DEBUGGING_PORT=9222 - - CHROME_DEBUGGING_HOST=localhost + + # VNC Settings + - VNC_PASSWORD=${VNC_PASSWORD:-youvncpassword} + volumes: - /tmp/.X11-unix:/tmp/.X11-unix + # - ./my_chrome_data:/app/data/chrome_data # Optional: persist browser data restart: unless-stopped shm_size: '2gb' cap_add: - SYS_ADMIN - security_opt: - - seccomp=unconfined tmpfs: - /tmp healthcheck: - test: ["CMD", "nc", "-z", "localhost", "5901"] + test: ["CMD", "nc", "-z", "localhost", "5901"] # VNC port interval: 10s timeout: 5s - retries: 3 + retries: 3 \ No newline at end of file diff --git a/entrypoint.sh b/entrypoint.sh deleted file mode 100644 index 9ab9240..0000000 --- a/entrypoint.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -# Start supervisord in the foreground to properly manage child processes -exec /usr/bin/supervisord -n -c /etc/supervisor/conf.d/supervisord.conf \ No newline at end of file 
diff --git a/requirements.txt b/requirements.txt index 4762a7e..bc8de8c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -browser-use==0.1.43 +browser-use==0.1.45 pyperclip==1.9.0 gradio==5.27.0 json-repair diff --git a/src/agent/browser_use/browser_use_agent.py b/src/agent/browser_use/browser_use_agent.py index 49d671f..d5cba0f 100644 --- a/src/agent/browser_use/browser_use_agent.py +++ b/src/agent/browser_use/browser_use_agent.py @@ -20,6 +20,7 @@ from browser_use.telemetry.views import ( ) from browser_use.utils import time_execution_async from dotenv import load_dotenv +from browser_use.agent.message_manager.utils import is_model_without_tool_support load_dotenv() logger = logging.getLogger(__name__) @@ -30,6 +31,22 @@ SKIP_LLM_API_KEY_VERIFICATION = ( class BrowserUseAgent(Agent): + def _set_tool_calling_method(self) -> ToolCallingMethod | None: + tool_calling_method = self.settings.tool_calling_method + if tool_calling_method == 'auto': + if is_model_without_tool_support(self.model_name): + return 'raw' + elif self.chat_model_library == 'ChatGoogleGenerativeAI': + return None + elif self.chat_model_library == 'ChatOpenAI': + return 'function_calling' + elif self.chat_model_library == 'AzureChatOpenAI': + return 'function_calling' + else: + return None + else: + return tool_calling_method + @time_execution_async("--run (agent)") async def run( self, max_steps: int = 100, on_step_start: AgentHookFunc | None = None, diff --git a/src/agent/deep_research/deep_research_agent.py b/src/agent/deep_research/deep_research_agent.py index 8f95eec..b7a7a56 100644 --- a/src/agent/deep_research/deep_research_agent.py +++ b/src/agent/deep_research/deep_research_agent.py @@ -29,7 +29,7 @@ from langchain_core.tools import StructuredTool, Tool from langgraph.graph import StateGraph from pydantic import BaseModel, Field -from browser_use.browser.context import BrowserContextWindowSize, BrowserContextConfig +from browser_use.browser.context import 
BrowserContextConfig from src.agent.browser_use.browser_use_agent import BrowserUseAgent from src.browser.custom_browser import CustomBrowser @@ -82,22 +82,19 @@ async def run_single_browser_task( try: logger.info(f"Starting browser task for query: {task_query}") extra_args = [f"--window-size={window_w},{window_h}"] - if browser_user_data_dir: - extra_args.append(f"--user-data-dir={browser_user_data_dir}") if use_own_browser: - browser_binary_path = os.getenv("CHROME_PATH", None) or browser_binary_path + browser_binary_path = os.getenv("BROWSER_PATH", None) or browser_binary_path if browser_binary_path == "": browser_binary_path = None - chrome_user_data = os.getenv("CHROME_USER_DATA", None) - if chrome_user_data: - extra_args += [f"--user-data-dir={chrome_user_data}"] + browser_user_data = browser_user_data_dir or os.getenv("BROWSER_USER_DATA", None) + if browser_user_data: + extra_args += [f"--user-data-dir={browser_user_data}"] else: browser_binary_path = None bu_browser = CustomBrowser( config=BrowserConfig( headless=headless, - disable_security=False, browser_binary_path=browser_binary_path, extra_browser_args=extra_args, wss_url=wss_url, @@ -107,7 +104,8 @@ async def run_single_browser_task( context_config = BrowserContextConfig( save_downloads_path="./tmp/downloads", - browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h), + window_height=window_h, + window_width=window_w, force_new_context=True, ) bu_browser_context = await bu_browser.new_context(config=context_config) @@ -299,30 +297,34 @@ Provide a list of distinct search queries(up to {max_parallel_browsers}) that ar # --- Langgraph State Definition --- -class ResearchPlanItem(TypedDict): - step: int - task: str +class ResearchTaskItem(TypedDict): + # step: int # Maybe step within category, or just implicit by order + task_description: str status: str # "pending", "completed", "failed" - queries: Optional[List[str]] # Queries generated for this task - result_summary: Optional[str] 
# Optional brief summary after execution + queries: Optional[List[str]] + result_summary: Optional[str] + + +class ResearchCategoryItem(TypedDict): + category_name: str + tasks: List[ResearchTaskItem] + # Optional: category_status: str # Could be "pending", "in_progress", "completed" class DeepResearchState(TypedDict): task_id: str topic: str - research_plan: List[ResearchPlanItem] - search_results: List[Dict[str, Any]] # Stores results from browser_search_tool_func - # messages: Sequence[BaseMessage] # History for ReAct-like steps within nodes - llm: Any # The LLM instance + research_plan: List[ResearchCategoryItem] # CHANGED + search_results: List[Dict[str, Any]] + llm: Any tools: List[Tool] output_dir: Path browser_config: Dict[str, Any] final_report: Optional[str] - current_step_index: int # To track progress through the plan - stop_requested: bool # Flag to signal termination - # Add other state variables as needed - error_message: Optional[str] # To store errors - + current_category_index: int + current_task_index_in_category: int + stop_requested: bool + error_message: Optional[str] messages: List[BaseMessage] @@ -330,44 +332,75 @@ class DeepResearchState(TypedDict): def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]: - """Loads state from files if they exist.""" state_updates = {} plan_file = os.path.join(output_dir, PLAN_FILENAME) search_file = os.path.join(output_dir, SEARCH_INFO_FILENAME) + + loaded_plan: List[ResearchCategoryItem] = [] + next_cat_idx, next_task_idx = 0, 0 + found_pending = False + if os.path.exists(plan_file): try: with open(plan_file, "r", encoding="utf-8") as f: - # Basic parsing, assumes markdown checklist format - plan = [] - step = 1 - for line in f: - line = line.strip() - if line.startswith(("- [x]", "- [ ]")): - status = "completed" if line.startswith("- [x]") else "pending" - task = line[5:].strip() - plan.append( - ResearchPlanItem( - step=step, - task=task, - status=status, - queries=None, - 
result_summary=None, - ) + current_category: Optional[ResearchCategoryItem] = None + lines = f.readlines() + cat_counter = 0 + task_counter_in_cat = 0 + + for line_num, line_content in enumerate(lines): + line = line_content.strip() + if line.startswith("## "): # Category + if current_category: # Save previous category + loaded_plan.append(current_category) + if not found_pending: # If previous category was all done, advance cat counter + cat_counter += 1 + task_counter_in_cat = 0 + category_name = line[line.find(" "):].strip() # Get text after "## X. " + current_category = ResearchCategoryItem(category_name=category_name, tasks=[]) + elif (line.startswith("- [ ]") or line.startswith("- [x]") or line.startswith( + "- [-]")) and current_category: # Task + status = "pending" + if line.startswith("- [x]"): + status = "completed" + elif line.startswith("- [-]"): + status = "failed" + + task_desc = line[5:].strip() + current_category["tasks"].append( + ResearchTaskItem(task_description=task_desc, status=status, queries=None, + result_summary=None) ) - step += 1 - state_updates["research_plan"] = plan - # Determine next step index based on loaded plan - next_step = next( - (i for i, item in enumerate(plan) if item["status"] == "pending"), - len(plan), - ) - state_updates["current_step_index"] = next_step + if status == "pending" and not found_pending: + next_cat_idx = cat_counter + next_task_idx = task_counter_in_cat + found_pending = True + if not found_pending: # only increment if previous tasks were completed/failed + task_counter_in_cat += 1 + + if current_category: # Append last category + loaded_plan.append(current_category) + + if loaded_plan: + state_updates["research_plan"] = loaded_plan + if not found_pending and loaded_plan: # All tasks were completed or failed + next_cat_idx = len(loaded_plan) # Points beyond the last category + next_task_idx = 0 + state_updates["current_category_index"] = next_cat_idx + state_updates["current_task_index_in_category"] = 
next_task_idx logger.info( - f"Loaded research plan from {plan_file}, next step index: {next_step}" + f"Loaded hierarchical research plan from {plan_file}. " + f"Next task: Category {next_cat_idx}, Task {next_task_idx} in category." ) + else: + logger.warning(f"Plan file {plan_file} was empty or malformed.") + except Exception as e: - logger.error(f"Failed to load or parse research plan {plan_file}: {e}") + logger.error(f"Failed to load or parse research plan {plan_file}: {e}", exc_info=True) state_updates["error_message"] = f"Failed to load research plan: {e}" + else: + logger.info(f"Plan file {plan_file} not found. Will start fresh.") + if os.path.exists(search_file): try: with open(search_file, "r", encoding="utf-8") as f: @@ -375,22 +408,25 @@ def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]: logger.info(f"Loaded search results from {search_file}") except Exception as e: logger.error(f"Failed to load search results {search_file}: {e}") - state_updates["error_message"] = f"Failed to load search results: {e}" - # Decide if this is fatal or if we can continue without old results + state_updates["error_message"] = ( + state_updates.get("error_message", "") + f" Failed to load search results: {e}").strip() return state_updates -def _save_plan_to_md(plan: List[ResearchPlanItem], output_dir: str): - """Saves the research plan to a markdown checklist file.""" +def _save_plan_to_md(plan: List[ResearchCategoryItem], output_dir: str): plan_file = os.path.join(output_dir, PLAN_FILENAME) try: with open(plan_file, "w", encoding="utf-8") as f: - f.write("# Research Plan\n\n") - for item in plan: - marker = "- [x]" if item["status"] == "completed" else "- [ ]" - f.write(f"{marker} {item['task']}\n") - logger.info(f"Research plan saved to {plan_file}") + f.write(f"# Research Plan\n\n") + for cat_idx, category in enumerate(plan): + f.write(f"## {cat_idx + 1}. 
{category['category_name']}\n\n") + for task_idx, task in enumerate(category['tasks']): + marker = "- [x]" if task["status"] == "completed" else "- [ ]" if task[ + "status"] == "pending" else "- [-]" # [-] for failed + f.write(f" {marker} {task['task_description']}\n") + f.write("\n") + logger.info(f"Hierarchical research plan saved to {plan_file}") except Exception as e: logger.error(f"Failed to save research plan to {plan_file}: {e}") @@ -419,7 +455,6 @@ def _save_report_to_md(report: str, output_dir: Path): async def planning_node(state: DeepResearchState) -> Dict[str, Any]: - """Generates the initial research plan or refines it if resuming.""" logger.info("--- Entering Planning Node ---") if state.get("stop_requested"): logger.info("Stop requested, skipping planning.") @@ -428,293 +463,344 @@ async def planning_node(state: DeepResearchState) -> Dict[str, Any]: llm = state["llm"] topic = state["topic"] existing_plan = state.get("research_plan") - existing_results = state.get("search_results") output_dir = state["output_dir"] - if existing_plan and state.get("current_step_index", 0) > 0: + if existing_plan and ( + state.get("current_category_index", 0) > 0 or state.get("current_task_index_in_category", 0) > 0): logger.info("Resuming with existing plan.") - # Maybe add logic here to let LLM review and potentially adjust the plan - # based on existing_results, but for now, we just use the loaded plan. _save_plan_to_md(existing_plan, output_dir) # Ensure it's saved initially - return {"research_plan": existing_plan} # Return the loaded plan + # current_category_index and current_task_index_in_category should be set by _load_previous_state + return {"research_plan": existing_plan} logger.info(f"Generating new research plan for topic: {topic}") - prompt = ChatPromptTemplate.from_messages( - [ - ( - "system", - """You are a meticulous research assistant. Your goal is to create a step-by-step research plan to thoroughly investigate a given topic. 
- The plan should consist of clear, actionable research tasks or questions. Each step should logically build towards a comprehensive understanding. - Format the output as a numbered list. Each item should represent a distinct research step or question. - Example: - 1. Define the core concepts and terminology related to [Topic]. - 2. Identify the key historical developments of [Topic]. - 3. Analyze the current state-of-the-art and recent advancements in [Topic]. - 4. Investigate the major challenges and limitations associated with [Topic]. - 5. Explore the future trends and potential applications of [Topic]. - 6. Summarize the findings and draw conclusions. + prompt_text = f"""You are a meticulous research assistant. Your goal is to create a hierarchical research plan to thoroughly investigate the topic: "{topic}". +The plan should be structured into several main research categories. Each category should contain a list of specific, actionable research tasks or questions. +Format the output as a JSON list of objects. Each object represents a research category and should have: +1. "category_name": A string for the name of the research category. +2. "tasks": A list of strings, where each string is a specific research task for that category. - Keep the plan focused and manageable. Aim for 5-10 detailed steps. - """, - ), - ("human", f"Generate a research plan for the topic: {topic}"), - ] - ) +Example JSON Output: +[ + {{ + "category_name": "Understanding Core Concepts and Definitions", + "tasks": [ + "Define the primary terminology associated with '{topic}'.", + "Identify the fundamental principles and theories underpinning '{topic}'." + ] + }}, + {{ + "category_name": "Historical Development and Key Milestones", + "tasks": [ + "Trace the historical evolution of '{topic}'.", + "Identify key figures, events, or breakthroughs in the development of '{topic}'." 
+ ] + }}, + {{ + "category_name": "Current State-of-the-Art and Applications", + "tasks": [ + "Analyze the current advancements and prominent applications of '{topic}'.", + "Investigate ongoing research and active areas of development related to '{topic}'." + ] + }}, + {{ + "category_name": "Challenges, Limitations, and Future Outlook", + "tasks": [ + "Identify the major challenges and limitations currently facing '{topic}'.", + "Explore potential future trends, ethical considerations, and societal impacts of '{topic}'." + ] + }} +] + +Generate a plan with 3-10 categories, and 2-6 tasks per category for the topic: "{topic}" according to the complexity of the topic. +Ensure the output is a valid JSON array. +""" + messages = [ + SystemMessage(content="You are a research planning assistant outputting JSON."), + HumanMessage(content=prompt_text) + ] try: - response = await llm.ainvoke(prompt.format_prompt(topic=topic).to_messages()) - plan_text = response.content + response = await llm.ainvoke(messages) + raw_content = response.content + # The LLM might wrap the JSON in backticks + if raw_content.strip().startswith("```json"): + raw_content = raw_content.strip()[7:-3].strip() + elif raw_content.strip().startswith("```"): + raw_content = raw_content.strip()[3:-3].strip() - # Parse the numbered list into the plan structure - new_plan: List[ResearchPlanItem] = [] - for i, line in enumerate(plan_text.strip().split("\n")): - line = line.strip() - if line and (line[0].isdigit() or line.startswith(("*", "-"))): - # Simple parsing: remove number/bullet and space - task_text = ( - line.split(".", 1)[-1].strip() - if line[0].isdigit() - else line[1:].strip() - ) - if task_text: - new_plan.append( - ResearchPlanItem( - step=i + 1, - task=task_text, + logger.debug(f"LLM response for plan: {raw_content}") + parsed_plan_from_llm = json.loads(raw_content) + + new_plan: List[ResearchCategoryItem] = [] + for cat_idx, category_data in enumerate(parsed_plan_from_llm): + if not 
isinstance(category_data, + dict) or "category_name" not in category_data or "tasks" not in category_data: + logger.warning(f"Skipping invalid category data: {category_data}") + continue + + tasks: List[ResearchTaskItem] = [] + for task_idx, task_desc in enumerate(category_data["tasks"]): + if isinstance(task_desc, str): + tasks.append( + ResearchTaskItem( + task_description=task_desc, status="pending", queries=None, result_summary=None, ) ) + else: # Sometimes LLM puts tasks as {"task": "description"} + if isinstance(task_desc, dict) and "task_description" in task_desc: + tasks.append( + ResearchTaskItem( + task_description=task_desc["task_description"], + status="pending", + queries=None, + result_summary=None, + ) + ) + elif isinstance(task_desc, dict) and "task" in task_desc: # common LLM mistake + tasks.append( + ResearchTaskItem( + task_description=task_desc["task"], + status="pending", + queries=None, + result_summary=None, + ) + ) + else: + logger.warning( + f"Skipping invalid task data: {task_desc} in category {category_data['category_name']}") + + new_plan.append( + ResearchCategoryItem( + category_name=category_data["category_name"], + tasks=tasks, + ) + ) if not new_plan: - logger.error("LLM failed to generate a valid plan structure.") + logger.error("LLM failed to generate a valid plan structure from JSON.") return {"error_message": "Failed to generate research plan structure."} - logger.info(f"Generated research plan with {len(new_plan)} steps.") - _save_plan_to_md(new_plan, output_dir) + logger.info(f"Generated research plan with {len(new_plan)} categories.") + _save_plan_to_md(new_plan, output_dir) # Save the hierarchical plan return { "research_plan": new_plan, - "current_step_index": 0, # Start from the beginning - "search_results": [], # Initialize search results + "current_category_index": 0, + "current_task_index_in_category": 0, + "search_results": [], } + except json.JSONDecodeError as e: + logger.error(f"Failed to parse JSON from LLM for 
plan: {e}. Response was: {raw_content}", exc_info=True) + return {"error_message": f"LLM generated invalid JSON for research plan: {e}"} except Exception as e: logger.error(f"Error during planning: {e}", exc_info=True) return {"error_message": f"LLM Error during planning: {e}"} async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: - """ - Executes the next step in the research plan by invoking the LLM with tools. - The LLM decides which tool (e.g., browser search) to use and provides arguments. - """ logger.info("--- Entering Research Execution Node ---") if state.get("stop_requested"): logger.info("Stop requested, skipping research execution.") return { "stop_requested": True, - "current_step_index": state["current_step_index"], - } # Keep index same + "current_category_index": state["current_category_index"], + "current_task_index_in_category": state["current_task_index_in_category"], + } plan = state["research_plan"] - current_index = state["current_step_index"] + cat_idx = state["current_category_index"] + task_idx = state["current_task_index_in_category"] llm = state["llm"] - tools = state["tools"] # Tools are now passed in state + tools = state["tools"] output_dir = str(state["output_dir"]) - task_id = state["task_id"] - # Stop event is bound inside the tool function, no need to pass directly here + task_id = state["task_id"] # For _AGENT_STOP_FLAGS - if not plan or current_index >= len(plan): - logger.info("Research plan complete or empty.") - # This condition should ideally be caught by `should_continue` before reaching here - return {} + # This check should ideally be handled by `should_continue` + if not plan or cat_idx >= len(plan): + logger.info("Research plan complete or categories exhausted.") + return {} # should route to synthesis - current_step = plan[current_index] - if current_step["status"] == "completed": - logger.info(f"Step {current_step['step']} already completed, skipping.") - return {"current_step_index": 
current_index + 1} # Move to next step + current_category = plan[cat_idx] + if task_idx >= len(current_category["tasks"]): + logger.info(f"All tasks in category '{current_category['category_name']}' completed. Moving to next category.") + # This logic is now effectively handled by should_continue and the index updates below + # The next iteration will be caught by should_continue or this node with updated indices + return { + "current_category_index": cat_idx + 1, + "current_task_index_in_category": 0, + "messages": state["messages"] # Pass messages along + } + + current_task = current_category["tasks"][task_idx] + + if current_task["status"] == "completed": + logger.info( + f"Task '{current_task['task_description']}' in category '{current_category['category_name']}' already completed. Skipping.") + # Logic to find next task + next_task_idx = task_idx + 1 + next_cat_idx = cat_idx + if next_task_idx >= len(current_category["tasks"]): + next_cat_idx += 1 + next_task_idx = 0 + return { + "current_category_index": next_cat_idx, + "current_task_index_in_category": next_task_idx, + "messages": state["messages"] # Pass messages along + } logger.info( - f"Executing research step {current_step['step']}: {current_step['task']}" + f"Executing research task: '{current_task['task_description']}' (Category: '{current_category['category_name']}')" ) - # Bind tools to the LLM for this call llm_with_tools = llm.bind_tools(tools) - if state["messages"]: - current_task_message = [ - HumanMessage( - content=f"Research Task (Step {current_step['step']}): {current_step['task']}" - ) - ] - invocation_messages = state["messages"] + current_task_message + + # Construct messages for LLM invocation + task_prompt_content = ( + f"Current Research Category: {current_category['category_name']}\n" + f"Specific Task: {current_task['task_description']}\n\n" + "Please use the available tools, especially 'parallel_browser_search', to gather information for this specific task. 
" + "Provide focused search queries relevant ONLY to this task. " + "If you believe you have sufficient information from previous steps for this specific task, you can indicate that you are ready to summarize or that no further search is needed." + ) + current_task_message_history = [ + HumanMessage(content=task_prompt_content) + ] + if not state["messages"]: # First actual execution message + invocation_messages = [ + SystemMessage( + content="You are a research assistant executing one task of a research plan. Focus on the current task only."), + ] + current_task_message_history else: - current_task_message = [ - SystemMessage( - content="You are a research assistant executing one step of a research plan. Use the available tools, especially the 'parallel_browser_search' tool, to gather information needed for the current task. Be precise with your search queries if using the browser tool." - ), - HumanMessage( - content=f"Research Task (Step {current_step['step']}): {current_step['task']}" - ), - ] - invocation_messages = current_task_message + invocation_messages = state["messages"] + current_task_message_history try: - # Invoke the LLM, expecting it to make a tool call - logger.info(f"Invoking LLM with tools for task: {current_step['task']}") + logger.info(f"Invoking LLM with tools for task: {current_task['task_description']}") ai_response: BaseMessage = await llm_with_tools.ainvoke(invocation_messages) logger.info("LLM invocation complete.") tool_results = [] executed_tool_names = [] + current_search_results = state.get("search_results", []) # Get existing search results if not isinstance(ai_response, AIMessage) or not ai_response.tool_calls: - # LLM didn't call a tool. Maybe it answered directly? Or failed? logger.warning( - f"LLM did not call any tool for step {current_step['step']}. Response: {ai_response.content[:100]}..." - ) - # How to handle this? Mark step as failed? Or store the content? - # Let's mark as failed for now, assuming a tool was expected. 
- current_step["status"] = "failed" - current_step["result_summary"] = "LLM did not use a tool as expected." - _save_plan_to_md(plan, output_dir) - return { - "research_plan": plan, - "status": "pending", - "current_step_index": current_index, - "messages": [ - f"LLM failed to call a tool for step {current_step['step']}. Response: {ai_response.content}" - f". Please use tool to do research unless you are thinking or summary"], - } - - # Process tool calls - for tool_call in ai_response.tool_calls: - tool_name = tool_call.get("name") - tool_args = tool_call.get("args", {}) - tool_call_id = tool_call.get("id") # Important for ToolMessage - - logger.info(f"LLM requested tool call: {tool_name} with args: {tool_args}") - executed_tool_names.append(tool_name) - - # Find the corresponding tool instance - selected_tool = next((t for t in tools if t.name == tool_name), None) - - if not selected_tool: - logger.error(f"LLM called tool '{tool_name}' which is not available.") - # Create a ToolMessage indicating the error - tool_results.append( - ToolMessage( - content=f"Error: Tool '{tool_name}' not found.", - tool_call_id=tool_call_id, - ) - ) - continue # Skip to next tool call if any - - # Execute the tool - try: - # Stop check before executing the tool (tool itself also checks) - stop_event = _AGENT_STOP_FLAGS.get(task_id) - if stop_event and stop_event.is_set(): - logger.info(f"Stop requested before executing tool: {tool_name}") - current_step["status"] = "pending" # Not completed due to stop - _save_plan_to_md(plan, output_dir) - return {"stop_requested": True, "research_plan": plan} - - logger.info(f"Executing tool: {tool_name}") - # Assuming tool functions handle async correctly - tool_output = await selected_tool.ainvoke(tool_args) - logger.info(f"Tool '{tool_name}' executed successfully.") - browser_tool_called = "parallel_browser_search" in executed_tool_names - # Append result to overall search results - current_search_results = state.get("search_results", []) - if 
browser_tool_called: # Specific handling for browser tool output - current_search_results.extend(tool_output) - else: # Handle other tool outputs (e.g., file tools return strings) - # Store it associated with the step? Or a generic log? - # Let's just log it for now. Need better handling for diverse tool outputs. - logger.info( - f"Result from tool '{tool_name}': {str(tool_output)[:200]}..." - ) - - # Store result for potential next LLM call (if we were doing multi-turn) - tool_results.append( - ToolMessage( - content=json.dumps(tool_output), tool_call_id=tool_call_id - ) - ) - - except Exception as e: - logger.error(f"Error executing tool '{tool_name}': {e}", exc_info=True) - tool_results.append( - ToolMessage( - content=f"Error executing tool {tool_name}: {e}", - tool_call_id=tool_call_id, - ) - ) - # Also update overall state search_results with error? - current_search_results = state.get("search_results", []) - current_search_results.append( - { - "tool_name": tool_name, - "args": tool_args, - "status": "failed", - "error": str(e), - } - ) - - # Basic check: Did the browser tool run at all? (More specific checks needed) - browser_tool_called = "parallel_browser_search" in executed_tool_names - # We might need a more nuanced status based on the *content* of tool_results - step_failed = ( - any("Error:" in str(tr.content) for tr in tool_results) - or not browser_tool_called - ) - - if step_failed: - logger.warning( - f"Step {current_step['step']} failed or did not yield results via browser search." - ) - current_step["status"] = "failed" - current_step["result_summary"] = ( - f"Tool execution failed or browser tool not used. Errors: {[tr.content for tr in tool_results if 'Error' in str(tr.content)]}" + f"LLM did not call any tool for task '{current_task['task_description']}'. Response: {ai_response.content[:100]}..." 
) + current_task["status"] = "pending" # Or "completed_no_tool" if LLM explains it's done + current_task["result_summary"] = f"LLM did not use a tool. Response: {ai_response.content}" + current_task["current_category_index"] = cat_idx + current_task["current_task_index_in_category"] = task_idx + return current_task + # We still save the plan and advance. else: - logger.info( - f"Step {current_step['step']} completed using tool(s): {executed_tool_names}." - ) - current_step["status"] = "completed" + # Process tool calls + for tool_call in ai_response.tool_calls: + tool_name = tool_call.get("name") + tool_args = tool_call.get("args", {}) + tool_call_id = tool_call.get("id") - current_step["result_summary"] = ( - f"Executed tool(s): {', '.join(executed_tool_names)}." - ) + logger.info(f"LLM requested tool call: {tool_name} with args: {tool_args}") + executed_tool_names.append(tool_name) + selected_tool = next((t for t in tools if t.name == tool_name), None) + if not selected_tool: + logger.error(f"LLM called tool '{tool_name}' which is not available.") + tool_results.append( + ToolMessage(content=f"Error: Tool '{tool_name}' not found.", tool_call_id=tool_call_id)) + continue + + try: + stop_event = _AGENT_STOP_FLAGS.get(task_id) + if stop_event and stop_event.is_set(): + logger.info(f"Stop requested before executing tool: {tool_name}") + current_task["status"] = "pending" # Or a new "stopped" status + _save_plan_to_md(plan, output_dir) + return {"stop_requested": True, "research_plan": plan, "current_category_index": cat_idx, + "current_task_index_in_category": task_idx} + + logger.info(f"Executing tool: {tool_name}") + tool_output = await selected_tool.ainvoke(tool_args) + logger.info(f"Tool '{tool_name}' executed successfully.") + + if tool_name == "parallel_browser_search": + current_search_results.extend(tool_output) # tool_output is List[Dict] + else: # For other tools, we might need specific handling or just log + logger.info(f"Result from tool '{tool_name}': 
{str(tool_output)[:200]}...") + # Storing non-browser results might need a different structure or key in search_results + current_search_results.append( + {"tool_name": tool_name, "args": tool_args, "output": str(tool_output), + "status": "completed"}) + + tool_results.append(ToolMessage(content=json.dumps(tool_output), tool_call_id=tool_call_id)) + + except Exception as e: + logger.error(f"Error executing tool '{tool_name}': {e}", exc_info=True) + tool_results.append( + ToolMessage(content=f"Error executing tool {tool_name}: {e}", tool_call_id=tool_call_id)) + current_search_results.append( + {"tool_name": tool_name, "args": tool_args, "status": "failed", "error": str(e)}) + + # After processing all tool calls for this task + step_failed_tool_execution = any("Error:" in str(tr.content) for tr in tool_results) + # Consider a task successful if a browser search was attempted and didn't immediately error out during call + # The browser search itself returns status for each query. + browser_tool_attempted_successfully = "parallel_browser_search" in executed_tool_names and not step_failed_tool_execution + + if step_failed_tool_execution: + current_task["status"] = "failed" + current_task[ + "result_summary"] = f"Tool execution failed. Errors: {[tr.content for tr in tool_results if 'Error' in str(tr.content)]}" + elif executed_tool_names: # If any tool was called + current_task["status"] = "completed" + current_task["result_summary"] = f"Executed tool(s): {', '.join(executed_tool_names)}." + # TODO: Could ask LLM to summarize the tool_results for this task if needed, rather than just listing tools. + else: # No tool calls but AI response had .tool_calls structure (empty) + current_task["status"] = "failed" # Or a more specific status + current_task["result_summary"] = "LLM prepared for tool call but provided no tools." 
+ + # Save progress _save_plan_to_md(plan, output_dir) _save_search_results_to_json(current_search_results, output_dir) + # Determine next indices + next_task_idx = task_idx + 1 + next_cat_idx = cat_idx + if next_task_idx >= len(current_category["tasks"]): + next_cat_idx += 1 + next_task_idx = 0 + + updated_messages = state["messages"] + current_task_message_history + [ai_response] + tool_results + return { "research_plan": plan, - "search_results": current_search_results, # Update with new results - "current_step_index": current_index + 1, - "messages": state["messages"] - + current_task_message - + [ai_response] - + tool_results, - # Optionally return the tool_results messages if needed by downstream nodes + "search_results": current_search_results, + "current_category_index": next_cat_idx, + "current_task_index_in_category": next_task_idx, + "messages": updated_messages, } except Exception as e: - logger.error( - f"Unhandled error during research execution node for step {current_step['step']}: {e}", - exc_info=True, - ) - current_step["status"] = "failed" + logger.error(f"Unhandled error during research execution for task '{current_task['task_description']}': {e}", + exc_info=True) + current_task["status"] = "failed" _save_plan_to_md(plan, output_dir) + # Determine next indices even on error to attempt to move on + next_task_idx = task_idx + 1 + next_cat_idx = cat_idx + if next_task_idx >= len(current_category["tasks"]): + next_cat_idx += 1 + next_task_idx = 0 return { "research_plan": plan, - "current_step_index": current_index + 1, # Move on even if error? 
- "error_message": f"Core Execution Error on step {current_step['step']}: {e}", + "current_category_index": next_cat_idx, + "current_task_index_in_category": next_task_idx, + "error_message": f"Core Execution Error on task '{current_task['task_description']}': {e}", + "messages": state["messages"] + current_task_message_history # Preserve messages up to error } @@ -747,36 +833,37 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: references = {} ref_count = 1 for i, result_entry in enumerate(search_results): - query = result_entry.get("query", "Unknown Query") + query = result_entry.get("query", "Unknown Query") # From parallel_browser_search + tool_name = result_entry.get("tool_name") # From other tools status = result_entry.get("status", "unknown") - result_data = result_entry.get( - "result" - ) # This should be the dict with summary, title, url - error = result_entry.get("error") + result_data = result_entry.get("result") # From BrowserUseAgent's final_result + tool_output_str = result_entry.get("output") # From other tools - if status == "completed" and result_data: - summary = result_data - formatted_results += f'### Finding from Query: "{query}"\n' - formatted_results += f"- **Summary:**\n{summary}\n" + if tool_name == "parallel_browser_search" and status == "completed" and result_data: + # result_data is the summary from BrowserUseAgent + formatted_results += f'### Finding from Web Search Query: "{query}"\n' + formatted_results += f"- **Summary:**\n{result_data}\n" # result_data is already a summary string here + # If result_data contained title/URL, you'd format them here. 
+ # The current BrowserUseAgent returns a string summary directly as 'final_data' in run_single_browser_task + formatted_results += "---\n" + elif tool_name != "parallel_browser_search" and status == "completed" and tool_output_str: + formatted_results += f'### Finding from Tool: "{tool_name}" (Args: {result_entry.get("args")})\n' + formatted_results += f"- **Output:**\n{tool_output_str}\n" formatted_results += "---\n" - elif status == "failed": - formatted_results += f'### Failed Query: "{query}"\n' + error = result_entry.get("error") + q_or_t = f"Query: \"{query}\"" if query != "Unknown Query" else f"Tool: \"{tool_name}\"" + formatted_results += f'### Failed {q_or_t}\n' formatted_results += f"- **Error:** {error}\n" formatted_results += "---\n" - # Ignore cancelled/other statuses for the report content # Prepare the research plan context plan_summary = "\nResearch Plan Followed:\n" - for item in plan: - marker = ( - "- [x]" - if item["status"] == "completed" - else "- [ ] (Failed)" - if item["status"] == "failed" - else "- [ ]" - ) - plan_summary += f"{marker} {item['task']}\n" + for cat_idx, category in enumerate(plan): + plan_summary += f"\n#### Category {cat_idx + 1}: {category['category_name']}\n" + for task_idx, task in enumerate(category['tasks']): + marker = "[x]" if task["status"] == "completed" else "[ ]" if task["status"] == "pending" else "[-]" + plan_summary += f" - {marker} {task['task_description']}\n" synthesis_prompt = ChatPromptTemplate.from_messages( [ @@ -785,29 +872,28 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: """You are a professional researcher tasked with writing a comprehensive and well-structured report based on collected findings. The report should address the research topic thoroughly, synthesizing the information gathered from various sources. Structure the report logically: - 1. **Introduction:** Briefly introduce the topic and the report's scope (mentioning the research plan followed is good). - 2. 
**Main Body:** Discuss the key findings, organizing them thematically or according to the research plan steps. Analyze, compare, and contrast information from different sources where applicable. **Crucially, cite your sources using bracketed numbers [X] corresponding to the reference list.** - 3. **Conclusion:** Summarize the main points and offer concluding thoughts or potential areas for further research. + 1. Briefly introduce the topic and the report's scope (mentioning the research plan followed, including categories and tasks, is good). + 2. Discuss the key findings, organizing them thematically, possibly aligning with the research categories. Analyze, compare, and contrast information. + 3. Summarize the main points and offer concluding thoughts. - Ensure the tone is objective, professional, and analytical. Base the report **strictly** on the provided findings. Do not add external knowledge. If findings are contradictory or incomplete, acknowledge this. - """, + Ensure the tone is objective and professional. + If findings are contradictory or incomplete, acknowledge this. + """, # Removed citation part for simplicity for now, as browser agent returns summaries. ), ( "human", f""" - **Research Topic:** {topic} + **Research Topic:** {topic} - {plan_summary} + {plan_summary} - **Collected Findings:** - ``` - {formatted_results} - ``` + **Collected Findings:** + ``` + {formatted_results} + ``` - ``` - - Please generate the final research report in Markdown format based **only** on the information above. Ensure all claims derived from the findings are properly cited using the format [Reference_ID]. - """, + Please generate the final research report in Markdown format based **only** on the information above. 
+ """, ), ] ) @@ -818,7 +904,6 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: topic=topic, plan_summary=plan_summary, formatted_results=formatted_results, - references=references, ).to_messages() ) final_report_md = response.content @@ -847,34 +932,44 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: def should_continue(state: DeepResearchState) -> str: - """Determines the next step based on the current state.""" logger.info("--- Evaluating Condition: Should Continue? ---") if state.get("stop_requested"): logger.info("Stop requested, routing to END.") - return "end_run" # Go to a dedicated end node for cleanup if needed - if state.get("error_message"): - logger.warning(f"Error detected: {state['error_message']}. Routing to END.") - # Decide if errors should halt execution or if it should try to synthesize anyway - return "end_run" # Stop on error for now + return "end_run" + if state.get("error_message") and "Core Execution Error" in state["error_message"]: # Critical error in node + logger.warning(f"Critical error detected: {state['error_message']}. Routing to END.") + return "end_run" plan = state.get("research_plan") - current_index = state.get("current_step_index", 0) + cat_idx = state.get("current_category_index", 0) + task_idx = state.get("current_task_index_in_category", 0) # This is the *next* task to check if not plan: - logger.warning( - "No research plan found, cannot continue execution. Routing to END." - ) - return "end_run" # Should not happen if planning node ran correctly + logger.warning("No research plan found. Routing to END.") + return "end_run" - # Check if there are pending steps in the plan - if current_index < len(plan): - logger.info( - f"Plan has pending steps (current index {current_index}/{len(plan)}). Routing to Research Execution." - ) - return "execute_research" - else: - logger.info("All plan steps processed. 
Routing to Synthesis.") - return "synthesize_report" + # Check if the current indices point to a valid pending task + if cat_idx < len(plan): + current_category = plan[cat_idx] + if task_idx < len(current_category["tasks"]): + # We are trying to execute the task at plan[cat_idx]["tasks"][task_idx] + # The research_execution_node will handle if it's already completed. + logger.info( + f"Plan has potential pending tasks (next up: Category {cat_idx}, Task {task_idx}). Routing to Research Execution." + ) + return "execute_research" + else: # task_idx is out of bounds for current category, means we need to check next category + if cat_idx + 1 < len(plan): # If there is a next category + logger.info( + f"Finished tasks in category {cat_idx}. Moving to category {cat_idx + 1}. Routing to Research Execution." + ) + # research_execution_node will update state to {current_category_index: cat_idx + 1, current_task_index_in_category: 0} + # Or rather, the previous execution node already set these indices to the start of the next category. + return "execute_research" + + # If we've gone through all categories and tasks (cat_idx >= len(plan)) + logger.info("All plan categories and tasks processed or current indices are out of bounds. 
Routing to Synthesis.") + return "synthesize_report" # --- DeepSearchAgent Class --- @@ -1033,22 +1128,24 @@ class DeepResearchAgent: "messages": [], "llm": self.llm, "tools": agent_tools, - "output_dir": output_dir, + "output_dir": Path(output_dir), "browser_config": self.browser_config, "final_report": None, - "current_step_index": 0, + "current_category_index": 0, + "current_task_index_in_category": 0, "stop_requested": False, "error_message": None, } - loaded_state = {} if task_id: logger.info(f"Attempting to resume task {task_id}...") loaded_state = _load_previous_state(task_id, output_dir) initial_state.update(loaded_state) if loaded_state.get("research_plan"): logger.info( - f"Resuming with {len(loaded_state['research_plan'])} plan steps and {len(loaded_state.get('search_results', []))} existing results." + f"Resuming with {len(loaded_state['research_plan'])} plan categories " + f"and {len(loaded_state.get('search_results', []))} existing results. " + f"Next task: Cat {initial_state['current_category_index']}, Task {initial_state['current_task_index_in_category']}" ) initial_state["topic"] = ( topic # Allow overriding topic even when resuming? Or use stored topic? Let's use new one. @@ -1057,7 +1154,6 @@ class DeepResearchAgent: logger.warning( f"Resume requested for {task_id}, but no previous plan found. Starting fresh." 
) - initial_state["current_step_index"] = 0 # --- Execute Graph using ainvoke --- final_state = None diff --git a/src/webui/components/browser_settings_tab.py b/src/webui/components/browser_settings_tab.py index 40c104c..e502d9e 100644 --- a/src/webui/components/browser_settings_tab.py +++ b/src/webui/components/browser_settings_tab.py @@ -1,3 +1,5 @@ +import os + import gradio as gr import logging from gradio.components import Component @@ -56,7 +58,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager): ) keep_browser_open = gr.Checkbox( label="Keep Browser Open", - value=True, + value=os.getenv("KEEP_BROWSER_OPEN", True), info="Keep Browser Open between Tasks", interactive=True ) @@ -91,6 +93,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager): with gr.Row(): cdp_url = gr.Textbox( label="CDP URL", + value=os.getenv("BROWSER_CDP", None), info="CDP URL for browser remote debugging", interactive=True, ) diff --git a/src/webui/components/browser_use_agent_tab.py b/src/webui/components/browser_use_agent_tab.py index b3c00a0..1a292dd 100644 --- a/src/webui/components/browser_use_agent_tab.py +++ b/src/webui/components/browser_use_agent_tab.py @@ -13,7 +13,7 @@ from browser_use.agent.views import ( AgentOutput, ) from browser_use.browser.browser import BrowserConfig -from browser_use.browser.context import BrowserContext, BrowserContextWindowSize, BrowserContextConfig +from browser_use.browser.context import BrowserContext, BrowserContextConfig from browser_use.browser.views import BrowserState from gradio.components import Component from langchain_core.language_models.chat_models import BaseChatModel @@ -451,20 +451,16 @@ async def run_agent_task( if not webui_manager.bu_browser: logger.info("Launching new browser instance.") extra_args = [f"--window-size={window_w},{window_h}"] - if browser_user_data_dir: - extra_args.append(f"--user-data-dir={browser_user_data_dir}") - if use_own_browser: - browser_binary_path = ( - os.getenv("CHROME_PATH", 
None) or browser_binary_path - ) + browser_binary_path = os.getenv("BROWSER_PATH", None) or browser_binary_path if browser_binary_path == "": browser_binary_path = None - chrome_user_data = os.getenv("CHROME_USER_DATA", None) - if chrome_user_data: - extra_args += [f"--user-data-dir={chrome_user_data}"] + browser_user_data = browser_user_data_dir or os.getenv("BROWSER_USER_DATA", None) + if browser_user_data: + extra_args += [f"--user-data-dir={browser_user_data}"] else: browser_binary_path = None + webui_manager.bu_browser = CustomBrowser( config=BrowserConfig( headless=headless, @@ -485,7 +481,8 @@ async def run_agent_task( if save_recording_path else None, save_downloads_path=save_download_path if save_download_path else None, - browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h), + window_height=window_h, + window_width=window_w, ) if not webui_manager.bu_browser: raise ValueError("Browser not initialized, cannot create context.") diff --git a/supervisord.conf b/supervisord.conf index 3410b91..2c5d8b4 100644 --- a/supervisord.conf +++ b/supervisord.conf @@ -66,8 +66,8 @@ startsecs=3 depends_on=x11vnc [program:persistent_browser] -environment=START_URL="data:text/html,

Browser Ready

" -command=bash -c "mkdir -p /app/data/chrome_data && sleep 8 && $(find /ms-playwright/chromium-*/chrome-linux -name chrome) --user-data-dir=/app/data/chrome_data --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=9222 --remote-debugging-address=0.0.0.0 \"$START_URL\"" +environment=START_URL="data:text/html,

Browser Ready

",BROWSER_USER_DATA="/app/data/chrome_data",BROWSER_DEBUGGING_PORT="%(ENV_BROWSER_DEBUGGING_PORT)s",BROWSER_DEBUGGING_HOST="%(ENV_BROWSER_DEBUGGING_HOST)s" +command=bash -c "mkdir -p %(ENV_BROWSER_USER_DATA)s && sleep 8 && $(find $PLAYWRIGHT_BROWSERS_PATH/chrome-*/chrome-linux -name chrome || find /root/.cache/ms-playwright/chrome-*/chrome-linux -name chrome || find /opt/google/chrome -name chrome || echo \"/usr/bin/google-chrome-stable\") --user-data-dir=%(ENV_BROWSER_USER_DATA)s --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=%(ENV_BROWSER_DEBUGGING_PORT)s --remote-debugging-address=%(ENV_BROWSER_DEBUGGING_HOST)s --enable-features=NetworkService,NetworkServiceInProcess --disable-features=ImprovedCookieControls \"$START_URL\"" autorestart=true stdout_logfile=/dev/stdout stdout_logfile_maxbytes=0 @@ -93,4 +93,4 @@ startretries=3 startsecs=3 stopsignal=TERM stopwaitsecs=10 -depends_on=persistent_browser +depends_on=persistent_browser \ No newline at end of file diff --git a/tests/test_agents.py b/tests/test_agents.py index d485c70..1285167 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -20,8 +20,7 @@ from src.utils import utils async def test_browser_use_agent(): from browser_use.browser.browser import Browser, BrowserConfig from browser_use.browser.context import ( - BrowserContextConfig, - BrowserContextWindowSize, + BrowserContextConfig ) from browser_use.agent.service import Agent @@ -38,12 +37,12 @@ async def test_browser_use_agent(): # api_key=os.getenv("OPENAI_API_KEY", ""), # ) - # llm = utils.get_llm_model( - # provider="google", - # model_name="gemini-2.0-flash", - # temperature=0.6, - # api_key=os.getenv("GOOGLE_API_KEY", "") - # ) + llm = llm_provider.get_llm_model( + 
provider="google", + model_name="gemini-2.0-flash", + temperature=0.6, + api_key=os.getenv("GOOGLE_API_KEY", "") + ) # llm = utils.get_llm_model( # provider="deepseek", @@ -67,13 +66,13 @@ async def test_browser_use_agent(): window_w, window_h = 1280, 1100 - llm = llm_provider.get_llm_model( - provider="azure_openai", - model_name="gpt-4o", - temperature=0.5, - base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), - api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), - ) + # llm = llm_provider.get_llm_model( + # provider="azure_openai", + # model_name="gpt-4o", + # temperature=0.5, + # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), + # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + # ) mcp_server_config = { "mcpServers": { @@ -98,7 +97,6 @@ async def test_browser_use_agent(): controller = CustomController() await controller.setup_mcp_client(mcp_server_config) use_own_browser = True - disable_security = False use_vision = True # Set to False when using DeepSeek max_actions_per_step = 10 @@ -106,33 +104,30 @@ async def test_browser_use_agent(): browser_context = None try: - extra_chromium_args = [f"--window-size={window_w},{window_h}"] + extra_browser_args = [f"--window-size={window_w},{window_h}"] if use_own_browser: - chrome_path = os.getenv("CHROME_PATH", None) - if chrome_path == "": - chrome_path = None - chrome_user_data = os.getenv("CHROME_USER_DATA", None) - if chrome_user_data: - extra_chromium_args += [f"--user-data-dir={chrome_user_data}"] + browser_binary_path = os.getenv("BROWSER_PATH", None) + if browser_binary_path == "": + browser_binary_path = None + browser_user_data = os.getenv("BROWSER_USER_DATA", None) + if browser_user_data: + extra_browser_args += [f"--user-data-dir={browser_user_data}"] else: - chrome_path = None + browser_binary_path = None browser = CustomBrowser( config=BrowserConfig( headless=False, - disable_security=disable_security, - browser_binary_path=chrome_path, - extra_browser_args=extra_chromium_args, + 
browser_binary_path=browser_binary_path, + extra_browser_args=extra_browser_args, ) ) browser_context = await browser.new_context( config=BrowserContextConfig( - trace_path="./tmp/traces", - save_recording_path="./tmp/record_videos", + trace_path=None, + save_recording_path=None, save_downloads_path="./tmp/downloads", - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), - force_new_context=True + window_height=window_h, + window_width=window_w, ) ) agent = BrowserUseAgent( @@ -167,17 +162,9 @@ async def test_browser_use_agent(): async def test_browser_use_parallel(): - from browser_use.browser.context import BrowserContextWindowSize - from browser_use.browser.browser import BrowserConfig - from patchright.async_api import async_playwright - from browser_use.browser.browser import Browser - from src.browser.custom_context import BrowserContextConfig - from src.controller.custom_controller import CustomController - from browser_use.browser.browser import Browser, BrowserConfig from browser_use.browser.context import ( BrowserContextConfig, - BrowserContextWindowSize, ) from browser_use.agent.service import Agent @@ -261,8 +248,7 @@ async def test_browser_use_parallel(): } controller = CustomController() await controller.setup_mcp_client(mcp_server_config) - use_own_browser = False - disable_security = False + use_own_browser = True use_vision = True # Set to False when using DeepSeek max_actions_per_step = 10 @@ -270,32 +256,30 @@ async def test_browser_use_parallel(): browser_context = None try: - extra_chromium_args = [f"--window-size={window_w},{window_h}"] + extra_browser_args = [f"--window-size={window_w},{window_h}"] if use_own_browser: - chrome_path = os.getenv("CHROME_PATH", None) - if chrome_path == "": - chrome_path = None - chrome_user_data = os.getenv("CHROME_USER_DATA", None) - if chrome_user_data: - extra_chromium_args += [f"--user-data-dir={chrome_user_data}"] + browser_binary_path = os.getenv("BROWSER_PATH", None) + 
if browser_binary_path == "": + browser_binary_path = None + browser_user_data = os.getenv("BROWSER_USER_DATA", None) + if browser_user_data: + extra_browser_args += [f"--user-data-dir={browser_user_data}"] else: - chrome_path = None + browser_binary_path = None browser = CustomBrowser( config=BrowserConfig( headless=False, - disable_security=disable_security, - browser_binary_path=chrome_path, - extra_browser_args=extra_chromium_args, + browser_binary_path=browser_binary_path, + extra_browser_args=extra_browser_args, ) ) browser_context = await browser.new_context( config=BrowserContextConfig( - trace_path="./tmp/traces", - save_recording_path="./tmp/record_videos", + trace_path=None, + save_recording_path=None, save_downloads_path="./tmp/downloads", - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), + window_height=window_h, + window_width=window_w, force_new_context=True ) ) @@ -364,7 +348,7 @@ async def test_deep_research_agent(): browser_config = {"headless": False, "window_width": 1280, "window_height": 1100, "use_own_browser": False} agent = DeepResearchAgent(llm=llm, browser_config=browser_config, mcp_server_config=mcp_server_config) - research_topic = "Give me a detailed travel plan to Switzerland from June 1st to 10th." + research_topic = "Give me investment advices of nvidia and tesla." 
task_id_to_resume = "" # Set this to resume a previous task ID print(f"Starting research on: {research_topic}") @@ -405,6 +389,6 @@ async def test_deep_research_agent(): if __name__ == "__main__": - # asyncio.run(test_browser_use_agent()) + asyncio.run(test_browser_use_agent()) # asyncio.run(test_browser_use_parallel()) - asyncio.run(test_deep_research_agent()) + # asyncio.run(test_deep_research_agent()) From 483d20a3ec9cd83baa78bd08adec7165c9e757ea Mon Sep 17 00:00:00 2001 From: vincent Date: Fri, 9 May 2025 20:39:05 +0800 Subject: [PATCH 31/35] update readme --- README.md | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 238dd40..faa9638 100644 --- a/README.md +++ b/README.md @@ -88,20 +88,8 @@ cp .env.example .env ```bash python webui.py --ip 127.0.0.1 --port 7788 ``` -2. WebUI options: - - `--ip`: The IP address to bind the WebUI to. Default is `127.0.0.1`. - - `--port`: The port to bind the WebUI to. Default is `7788`. - - `--theme`: The theme for the user interface. Default is `Ocean`. - - **Default**: The standard theme with a balanced design. - - **Soft**: A gentle, muted color scheme for a relaxed viewing experience. - - **Monochrome**: A grayscale theme with minimal color for simplicity and focus. - - **Glass**: A sleek, semi-transparent design for a modern appearance. - - **Origin**: A classic, retro-inspired theme for a nostalgic feel. - - **Citrus**: A vibrant, citrus-inspired palette with bright and fresh colors. - - **Ocean** (default): A blue, ocean-inspired theme providing a calming effect. - - `--dark-mode`: Enables dark mode for the user interface. -3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`. -4. **Using Your Own Browser(Optional):** +2. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`. +3. 
**Using Your Own Browser(Optional):** - Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. Leave `CHROME_USER_DATA` empty if you want to use local user data. - Windows ```env @@ -159,20 +147,6 @@ CHROME_PERSISTENT_SESSION=true docker compose up --build - Default VNC password: "youvncpassword" - Can be changed by setting `VNC_PASSWORD` in your `.env` file -5. **Container Management:** - ```bash - # Start with persistent browser - CHROME_PERSISTENT_SESSION=true docker compose up -d - - # Start with default mode (browser closes after tasks) - docker compose up -d - - # View logs - docker compose logs -f - - # Stop the container - docker compose down - ``` ## Changelog - [x] **2025/01/26:** Thanks to @vvincent1234. Now browser-use-webui can combine with DeepSeek-r1 to engage in deep thinking! From 50a25d5ac8306d38831c6c3754be6bc4873d8f77 Mon Sep 17 00:00:00 2001 From: Gorden Chen Date: Fri, 9 May 2025 23:30:43 +0800 Subject: [PATCH 32/35] Update patchright in Dockerfile - Set ENV PLAYWRIGHT_BROWSERS_PATH=/ms-patchright to define custom browser install path for Patchright. - Replaced Playwright installation commands with Patchright equivalents: RUN patchright install --with-deps chromium RUN patchright install-deps --- Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7b6d39f..6a7b616 100644 --- a/Dockerfile +++ b/Dockerfile @@ -57,10 +57,10 @@ WORKDIR /app COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt -# Install Playwright and browsers with system dependencies -ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright -RUN playwright install --with-deps chromium -RUN playwright install-deps +# Install patchright and browsers with system dependencies +ENV PLAYWRIGHT_BROWSERS_PATH=/ms-patchright +RUN patchright install --with-deps chromium +RUN patchright install-deps # Copy the application code COPY . . 
From b7c8fe1f0446c36c57623cfb0e62307dd4665a2f Mon Sep 17 00:00:00 2001 From: vincent Date: Sat, 10 May 2025 00:41:41 +0800 Subject: [PATCH 33/35] fix dockerfile --- .dockerignore | 5 ++++- Dockerfile | 37 ++++++++++++++++++------------------- README.md | 37 +++++++++++++++++-------------------- docker-compose.yml | 6 ++++-- supervisord.conf | 2 +- tests/test_llm_api.py | 6 +++--- 6 files changed, 47 insertions(+), 46 deletions(-) diff --git a/.dockerignore b/.dockerignore index 9635889..140fab3 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,2 +1,5 @@ data -tmp \ No newline at end of file +tmp +results + +.env \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index b4d6fa1..ffdf721 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,7 @@ FROM python:3.11-slim # Set platform for multi-arch builds (Docker Buildx will set this) ARG TARGETPLATFORM +ARG NODE_MAJOR=20 # Install system dependencies RUN apt-get update && apt-get install -y \ @@ -42,6 +43,7 @@ RUN apt-get update && apt-get install -y \ fonts-dejavu \ fonts-dejavu-core \ fonts-dejavu-extra \ + vim \ && rm -rf /var/lib/apt/lists/* # Install noVNC @@ -49,6 +51,17 @@ RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \ && git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \ && ln -s /opt/novnc/vnc.html /opt/novnc/index.html +# Install Node.js using NodeSource PPA +RUN mkdir -p /etc/apt/keyrings \ + && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \ + && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list \ + && apt-get update \ + && apt-get install nodejs -y \ + && rm -rf /var/lib/apt/lists/* + +# Verify Node.js and npm installation (optional, but good for debugging) +RUN node -v && npm -v && npx -v + # Set up working directory WORKDIR /app @@ -56,8 +69,7 @@ WORKDIR /app COPY 
requirements.txt . # Ensure 'patchright' is in your requirements.txt or install it directly # RUN pip install --no-cache-dir -r requirements.txt patchright # If not in requirements -RUN pip install --no-cache-dir -r requirements.txt # Assuming patchright is in requirements.txt -RUN pip install --no-cache-dir patchright # Or install it explicitly +RUN pip install --no-cache-dir -r requirements.txt # Install Patchright browsers and dependencies # Patchright documentation suggests PLAYWRIGHT_BROWSERS_PATH is still relevant @@ -69,32 +81,19 @@ RUN mkdir -p $PLAYWRIGHT_BROWSERS_PATH # Install recommended: Google Chrome (instead of just Chromium for better undetectability) # The 'patchright install chrome' command might download and place it. # The '--with-deps' equivalent for patchright install is to run 'patchright install-deps chrome' after. -RUN patchright install chrome -RUN patchright install-deps chrome +RUN patchright install chrome --with-deps # Alternative: Install Chromium if Google Chrome is problematic in certain environments -RUN patchright install chromium -RUN patchright install-deps chromium +RUN patchright install chromium --with-deps # Copy the application code COPY . . -# Set environment variables (Updated Names) -ENV PYTHONUNBUFFERED=1 -ENV BROWSER_USE_LOGGING_LEVEL=info -# BROWSER_PATH will be determined by Patchright installation, supervisord will find it. 
-ENV ANONYMIZED_TELEMETRY=false -ENV DISPLAY=:99 -ENV RESOLUTION=1920x1080x24 -ENV VNC_PASSWORD=youvncpassword -ENV KEEP_BROWSER_OPEN=true -ENV RESOLUTION_WIDTH=1920 -ENV RESOLUTION_HEIGHT=1080 - # Set up supervisor configuration RUN mkdir -p /var/log/supervisor COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf EXPOSE 7788 6080 5901 9222 -CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] \ No newline at end of file +CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] +#CMD ["/bin/bash"] \ No newline at end of file diff --git a/README.md b/README.md index faa9638..b67a2ed 100644 --- a/README.md +++ b/README.md @@ -63,11 +63,11 @@ uv pip install -r requirements.txt Install Browsers in Patchright. ```bash -patchright install +patchright install --with-deps ``` Or you can install specific browsers by running: ```bash -patchright install chromium --with-deps --no-shell +patchright install chromium --with-deps ``` #### Step 4: Configure Environment @@ -82,25 +82,24 @@ cp .env.example .env ``` 2. Open `.env` in your preferred text editor and add your API keys and other settings -#### Local Setup +#### Step 5: Enjoy the web-ui 1. **Run the WebUI:** - After completing the installation steps above, start the application: ```bash python webui.py --ip 127.0.0.1 --port 7788 ``` 2. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`. 3. **Using Your Own Browser(Optional):** - - Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. Leave `CHROME_USER_DATA` empty if you want to use local user data. + - Set `BROWSER_PATH` to the executable path of your browser and `BROWSER_USER_DATA` to the user data directory of your browser. Leave `BROWSER_USER_DATA` empty if you want to use local user data. 
- Windows ```env - CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe" - CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data" + BROWSER_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe" + BROWSER_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data" ``` > Note: Replace `YourUsername` with your actual Windows username for Windows systems. - Mac ```env - CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" - CHROME_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome" + BROWSER_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" + BROWSER_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome" ``` - Close all Chrome windows - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent. @@ -113,14 +112,14 @@ cp .env.example .env - [Docker Desktop](https://www.docker.com/products/docker-desktop/) (For Windows/macOS) - [Docker Engine](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/) (For Linux) -#### Installation Steps -1. Clone the repository: +#### Step 1: Clone the Repository ```bash git clone https://github.com/browser-use/web-ui.git cd web-ui ``` -2. Create and configure environment file: +#### Step 2: Configure Environment +1. Create a copy of the example environment file: - Windows (Command Prompt): ```bash copy .env.example .env @@ -129,25 +128,23 @@ copy .env.example .env ```bash cp .env.example .env ``` -Edit `.env` with your preferred text editor and add your API keys +2. Open `.env` in your preferred text editor and add your API keys and other settings -3. 
Run with Docker: +#### Step 3: Docker Build and Run ```bash -# Build and start the container with default settings (browser closes after AI tasks) docker compose up --build ``` +For ARM64 systems (e.g., Apple Silicon Macs), please run the following command: ```bash -# Or run with persistent browser (browser stays open between AI tasks) -CHROME_PERSISTENT_SESSION=true docker compose up --build +TARGETPLATFORM=linux/arm64 docker compose up --build ``` -4. Access the Application: -- Web Interface: Open `http://localhost:7788` in your browser +#### Step 4: Enjoy the web-ui and vnc +- Web-UI: Open `http://localhost:7788` in your browser - VNC Viewer (for watching browser interactions): Open `http://localhost:6080/vnc.html` - Default VNC password: "youvncpassword" - Can be changed by setting `VNC_PASSWORD` in your `.env` file - ## Changelog - [x] **2025/01/26:** Thanks to @vvincent1234. Now browser-use-webui can combine with DeepSeek-r1 to engage in deep thinking! - [x] **2025/01/10:** Thanks to @casistack. Now we have Docker Setup option and also Support keep browser open between tasks.[Video tutorial demo](https://github.com/browser-use/web-ui/issues/1#issuecomment-2582511750). diff --git a/docker-compose.yml b/docker-compose.yml index 780d2a9..c16fd92 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,4 +1,5 @@ services: + # debug: docker compose run --rm -it browser-use-webui bash browser-use-webui: build: context: . 
@@ -11,7 +12,7 @@ services: - "5901:5901" - "9222:9222" environment: - # LLM API Keys & Endpoints (Your existing list) + # LLM API Keys & Endpoints - OPENAI_ENDPOINT=${OPENAI_ENDPOINT:-https://api.openai.com/v1} - OPENAI_API_KEY=${OPENAI_API_KEY:-} - ANTHROPIC_ENDPOINT=${ANTHROPIC_ENDPOINT:-https://api.anthropic.com} @@ -42,7 +43,8 @@ services: - BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info} # Browser Settings - - BROWSER_USER_DATA=${BROWSER_USER_DATA:-/app/data/chrome_data} + - BROWSER_PATH=/usr/bin/google-chrome + - BROWSER_USER_DATA=/app/data/chrome_data - BROWSER_DEBUGGING_PORT=${BROWSER_DEBUGGING_PORT:-9222} - BROWSER_DEBUGGING_HOST=${BROWSER_DEBUGGING_HOST:-0.0.0.0} - KEEP_BROWSER_OPEN=${KEEP_BROWSER_OPEN:-true} diff --git a/supervisord.conf b/supervisord.conf index 2c5d8b4..5135d09 100644 --- a/supervisord.conf +++ b/supervisord.conf @@ -3,7 +3,7 @@ user=root nodaemon=true logfile=/dev/stdout logfile_maxbytes=0 -loglevel=debug +loglevel=error [program:xvfb] command=Xvfb :99 -screen 0 %(ENV_RESOLUTION)s -ac +extension GLX +render -noreset diff --git a/tests/test_llm_api.py b/tests/test_llm_api.py index e98569b..938f825 100644 --- a/tests/test_llm_api.py +++ b/tests/test_llm_api.py @@ -142,17 +142,17 @@ def test_ibm_model(): def test_qwen_model(): - config = LLMConfig(provider="alibaba", model_name="qwen3-30b-a3b") + config = LLMConfig(provider="alibaba", model_name="qwen-vl-max") test_llm(config, "How many 'r's are in the word 'strawberry'?") if __name__ == "__main__": # test_openai_model() # test_google_model() - # test_azure_openai_model() + test_azure_openai_model() # test_deepseek_model() # test_ollama_model() - test_deepseek_r1_model() + # test_deepseek_r1_model() # test_deepseek_r1_ollama_model() # test_mistral_model() # test_ibm_model() From 30f12195b7238a1c28e0a0340c6e035eb94a8b3e Mon Sep 17 00:00:00 2001 From: vincent Date: Sat, 10 May 2025 09:40:54 +0800 Subject: [PATCH 34/35] fix docker file --- docker-compose.yml | 5 +++-- 
src/webui/components/browser_settings_tab.py | 2 +- supervisord.conf | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index c16fd92..9b850e9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -46,8 +46,9 @@ services: - BROWSER_PATH=/usr/bin/google-chrome - BROWSER_USER_DATA=/app/data/chrome_data - BROWSER_DEBUGGING_PORT=${BROWSER_DEBUGGING_PORT:-9222} - - BROWSER_DEBUGGING_HOST=${BROWSER_DEBUGGING_HOST:-0.0.0.0} - - KEEP_BROWSER_OPEN=${KEEP_BROWSER_OPEN:-true} + - BROWSER_DEBUGGING_HOST=0.0.0.0 + - USE_OWN_BROWSER=true + - KEEP_BROWSER_OPEN=true - BROWSER_CDP=${BROWSER_CDP:-} # e.g., http://localhost:9222 # Display Settings diff --git a/src/webui/components/browser_settings_tab.py b/src/webui/components/browser_settings_tab.py index e502d9e..f949357 100644 --- a/src/webui/components/browser_settings_tab.py +++ b/src/webui/components/browser_settings_tab.py @@ -52,7 +52,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager): with gr.Row(): use_own_browser = gr.Checkbox( label="Use Own Browser", - value=False, + value=os.getenv("USE_OWN_BROWSER", False), info="Use your existing browser instance", interactive=True ) diff --git a/supervisord.conf b/supervisord.conf index 5135d09..f6cd33b 100644 --- a/supervisord.conf +++ b/supervisord.conf @@ -67,7 +67,7 @@ depends_on=x11vnc [program:persistent_browser] environment=START_URL="data:text/html,

Browser Ready

",BROWSER_USER_DATA="/app/data/chrome_data",BROWSER_DEBUGGING_PORT="%(ENV_BROWSER_DEBUGGING_PORT)s",BROWSER_DEBUGGING_HOST="%(ENV_BROWSER_DEBUGGING_HOST)s" -command=bash -c "mkdir -p %(ENV_BROWSER_USER_DATA)s && sleep 8 && $(find $PLAYWRIGHT_BROWSERS_PATH/chrome-*/chrome-linux -name chrome || find /root/.cache/ms-playwright/chrome-*/chrome-linux -name chrome || find /opt/google/chrome -name chrome || echo \"/usr/bin/google-chrome-stable\") --user-data-dir=%(ENV_BROWSER_USER_DATA)s --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=%(ENV_BROWSER_DEBUGGING_PORT)s --remote-debugging-address=%(ENV_BROWSER_DEBUGGING_HOST)s --enable-features=NetworkService,NetworkServiceInProcess --disable-features=ImprovedCookieControls \"$START_URL\"" +command=bash -c "mkdir -p %(ENV_BROWSER_USER_DATA)s && sleep 8 && $(/usr/bin/google-chrome || find $PLAYWRIGHT_BROWSERS_PATH/chromium-*/chrome-linux -name chrome) --user-data-dir=%(ENV_BROWSER_USER_DATA)s --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=%(ENV_BROWSER_DEBUGGING_PORT)s --remote-debugging-address=%(ENV_BROWSER_DEBUGGING_HOST)s --enable-features=NetworkService,NetworkServiceInProcess --disable-features=ImprovedCookieControls \"$START_URL\"" autorestart=true stdout_logfile=/dev/stdout stdout_logfile_maxbytes=0 From 27c7caa44daa0504ad1fa3693dd9a44863377d3b Mon Sep 17 00:00:00 2001 From: vincent Date: Sat, 10 May 2025 20:46:37 +0800 Subject: [PATCH 35/35] simplify docker installation --- Dockerfile | 2 +- docker-compose.yml | 8 ++++---- 
src/webui/components/browser_settings_tab.py | 2 +- supervisord.conf | 18 +----------------- 4 files changed, 7 insertions(+), 23 deletions(-) diff --git a/Dockerfile b/Dockerfile index ffdf721..6d06c5e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -81,7 +81,7 @@ RUN mkdir -p $PLAYWRIGHT_BROWSERS_PATH # Install recommended: Google Chrome (instead of just Chromium for better undetectability) # The 'patchright install chrome' command might download and place it. # The '--with-deps' equivalent for patchright install is to run 'patchright install-deps chrome' after. -RUN patchright install chrome --with-deps +# RUN patchright install chrome --with-deps # Alternative: Install Chromium if Google Chrome is problematic in certain environments RUN patchright install chromium --with-deps diff --git a/docker-compose.yml b/docker-compose.yml index 9b850e9..b7d9820 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -43,11 +43,11 @@ services: - BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info} # Browser Settings - - BROWSER_PATH=/usr/bin/google-chrome - - BROWSER_USER_DATA=/app/data/chrome_data + - BROWSER_PATH= + - BROWSER_USER_DATA= - BROWSER_DEBUGGING_PORT=${BROWSER_DEBUGGING_PORT:-9222} - - BROWSER_DEBUGGING_HOST=0.0.0.0 - - USE_OWN_BROWSER=true + - BROWSER_DEBUGGING_HOST=localhost + - USE_OWN_BROWSER=false - KEEP_BROWSER_OPEN=true - BROWSER_CDP=${BROWSER_CDP:-} # e.g., http://localhost:9222 diff --git a/src/webui/components/browser_settings_tab.py b/src/webui/components/browser_settings_tab.py index f949357..e502d9e 100644 --- a/src/webui/components/browser_settings_tab.py +++ b/src/webui/components/browser_settings_tab.py @@ -52,7 +52,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager): with gr.Row(): use_own_browser = gr.Checkbox( label="Use Own Browser", - value=os.getenv("USE_OWN_BROWSER", False), + value=False, info="Use your existing browser instance", interactive=True ) diff --git a/supervisord.conf b/supervisord.conf index 
f6cd33b..6010766 100644 --- a/supervisord.conf +++ b/supervisord.conf @@ -65,21 +65,6 @@ startretries=5 startsecs=3 depends_on=x11vnc -[program:persistent_browser] -environment=START_URL="data:text/html,

Browser Ready

",BROWSER_USER_DATA="/app/data/chrome_data",BROWSER_DEBUGGING_PORT="%(ENV_BROWSER_DEBUGGING_PORT)s",BROWSER_DEBUGGING_HOST="%(ENV_BROWSER_DEBUGGING_HOST)s" -command=bash -c "mkdir -p %(ENV_BROWSER_USER_DATA)s && sleep 8 && $(/usr/bin/google-chrome || find $PLAYWRIGHT_BROWSERS_PATH/chromium-*/chrome-linux -name chrome) --user-data-dir=%(ENV_BROWSER_USER_DATA)s --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=%(ENV_BROWSER_DEBUGGING_PORT)s --remote-debugging-address=%(ENV_BROWSER_DEBUGGING_HOST)s --enable-features=NetworkService,NetworkServiceInProcess --disable-features=ImprovedCookieControls \"$START_URL\"" -autorestart=true -stdout_logfile=/dev/stdout -stdout_logfile_maxbytes=0 -stderr_logfile=/dev/stderr -stderr_logfile_maxbytes=0 -priority=350 -startretries=5 -startsecs=10 -stopsignal=TERM -stopwaitsecs=15 -depends_on=novnc - [program:webui] command=python webui.py --ip 0.0.0.0 --port 7788 directory=/app @@ -92,5 +77,4 @@ priority=400 startretries=3 startsecs=3 stopsignal=TERM -stopwaitsecs=10 -depends_on=persistent_browser \ No newline at end of file +stopwaitsecs=10 \ No newline at end of file