diff --git a/src/agent/deep_research_agent.py b/src/agent/deep_research/deep_research_agent.py similarity index 100% rename from src/agent/deep_research_agent.py rename to src/agent/deep_research/deep_research_agent.py diff --git a/src/browser/custom_browser.py b/src/browser/custom_browser.py index 4a2d1ab..a1c057b 100644 --- a/src/browser/custom_browser.py +++ b/src/browser/custom_browser.py @@ -9,11 +9,23 @@ from playwright.async_api import ( Playwright, async_playwright, ) -from browser_use.browser.browser import Browser +from browser_use.browser.browser import Browser, IN_DOCKER from browser_use.browser.context import BrowserContext, BrowserContextConfig from playwright.async_api import BrowserContext as PlaywrightBrowserContext import logging +from browser_use.browser.chrome import ( + CHROME_ARGS, + CHROME_DETERMINISTIC_RENDERING_ARGS, + CHROME_DISABLE_SECURITY_ARGS, + CHROME_DOCKER_ARGS, + CHROME_HEADLESS_ARGS, +) +from browser_use.browser.context import BrowserContext, BrowserContextConfig +from browser_use.browser.utils.screen_resolution import get_screen_resolution, get_window_adjustments +from browser_use.utils import time_execution_async +import socket + from .custom_context import CustomBrowserContext logger = logging.getLogger(__name__) @@ -26,3 +38,62 @@ class CustomBrowser(Browser): config: BrowserContextConfig = BrowserContextConfig() ) -> CustomBrowserContext: return CustomBrowserContext(config=config, browser=self) + + async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrowser: + """Sets up and returns a Playwright Browser instance with anti-detection measures.""" + assert self.config.browser_binary_path is None, 'browser_binary_path should be None if trying to use the builtin browsers' + + if self.config.headless: + screen_size = {'width': 1920, 'height': 1080} + offset_x, offset_y = 0, 0 + else: + screen_size = get_screen_resolution() + offset_x, offset_y = get_window_adjustments() + + chrome_args = { + *CHROME_ARGS, + 
*(CHROME_DOCKER_ARGS if IN_DOCKER else []), + *(CHROME_HEADLESS_ARGS if self.config.headless else []), + *(CHROME_DISABLE_SECURITY_ARGS if self.config.disable_security else []), + *(CHROME_DETERMINISTIC_RENDERING_ARGS if self.config.deterministic_rendering else []), + f'--window-position={offset_x},{offset_y}', + *self.config.extra_browser_args, + } + contain_window_size = False + for arg in self.config.extra_browser_args: + if "--window-size" in arg: + contain_window_size = True + break + if not contain_window_size: + chrome_args.add(f'--window-size={screen_size["width"]},{screen_size["height"]}') + + # check if port 9222 is already taken, if so remove the remote-debugging-port arg to prevent conflicts + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + if s.connect_ex(('localhost', 9222)) == 0: + chrome_args.remove('--remote-debugging-port=9222') + + browser_class = getattr(playwright, self.config.browser_class) + args = { + 'chromium': list(chrome_args), + 'firefox': [ + *{ + '-no-remote', + *self.config.extra_browser_args, + } + ], + 'webkit': [ + *{ + '--no-startup-window', + *self.config.extra_browser_args, + } + ], + } + + browser = await browser_class.launch( + headless=self.config.headless, + args=args[self.config.browser_class], + proxy=self.config.proxy.model_dump() if self.config.proxy else None, + handle_sigterm=False, + handle_sigint=False, + ) + return browser diff --git a/src/browser/custom_context.py b/src/browser/custom_context.py index fd0e2e5..4dc2423 100644 --- a/src/browser/custom_context.py +++ b/src/browser/custom_context.py @@ -2,7 +2,7 @@ import json import logging import os -from browser_use.browser.browser import Browser +from browser_use.browser.browser import Browser, IN_DOCKER from browser_use.browser.context import BrowserContext, BrowserContextConfig from playwright.async_api import Browser as PlaywrightBrowser from playwright.async_api import BrowserContext as PlaywrightBrowserContext @@ -10,10 +10,104 @@ from 
playwright.async_api import BrowserContext as PlaywrightBrowserContext logger = logging.getLogger(__name__) +class CustomBrowserContextConfig(BrowserContextConfig): + force_new_context: bool = False # force to create new context + + class CustomBrowserContext(BrowserContext): def __init__( self, browser: "Browser", - config: BrowserContextConfig = BrowserContextConfig() + config: CustomBrowserContextConfig = CustomBrowserContextConfig(), ): super(CustomBrowserContext, self).__init__(browser=browser, config=config) + + async def _create_context(self, browser: PlaywrightBrowser): + """Creates a new browser context with anti-detection measures and loads cookies if available.""" + if not self.config.force_new_context and self.browser.config.cdp_url and len(browser.contexts) > 0: + context = browser.contexts[0] + elif not self.config.force_new_context and self.browser.config.browser_binary_path and len( + browser.contexts) > 0: + # Connect to existing Chrome instance instead of creating new one + context = browser.contexts[0] + else: + # Original code for creating new context + context = await browser.new_context( + no_viewport=True, + user_agent=self.config.user_agent, + java_script_enabled=True, + bypass_csp=self.config.disable_security, + ignore_https_errors=self.config.disable_security, + record_video_dir=self.config.save_recording_path, + record_video_size=self.config.browser_window_size.model_dump(), + record_har_path=self.config.save_har_path, + locale=self.config.locale, + http_credentials=self.config.http_credentials, + is_mobile=self.config.is_mobile, + has_touch=self.config.has_touch, + geolocation=self.config.geolocation, + permissions=self.config.permissions, + timezone_id=self.config.timezone_id, + ) + + if self.config.trace_path: + await context.tracing.start(screenshots=True, snapshots=True, sources=True) + + # Load cookies if they exist + if self.config.cookies_file and os.path.exists(self.config.cookies_file): + with open(self.config.cookies_file, 'r') 
as f: + try: + cookies = json.load(f) + + valid_same_site_values = ['Strict', 'Lax', 'None'] + for cookie in cookies: + if 'sameSite' in cookie: + if cookie['sameSite'] not in valid_same_site_values: + logger.warning( + f"Fixed invalid sameSite value '{cookie['sameSite']}' to 'None' for cookie {cookie.get('name')}" + ) + cookie['sameSite'] = 'None' + logger.info(f'🍪 Loaded {len(cookies)} cookies from {self.config.cookies_file}') + await context.add_cookies(cookies) + + except json.JSONDecodeError as e: + logger.error(f'Failed to parse cookies file: {str(e)}') + + # Expose anti-detection scripts + await context.add_init_script( + """ + // Webdriver property + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + + // Languages + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US'] + }); + + // Plugins + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5] + }); + + // Chrome runtime + window.chrome = { runtime: {} }; + + // Permissions + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? 
+ Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters) + ); + (function () { + const originalAttachShadow = Element.prototype.attachShadow; + Element.prototype.attachShadow = function attachShadow(options) { + return originalAttachShadow.call(this, { ...options, mode: "open" }); + }; + })(); + """ + ) + + return context diff --git a/src/controller/custom_controller.py b/src/controller/custom_controller.py index 7209e97..d07c88b 100644 --- a/src/controller/custom_controller.py +++ b/src/controller/custom_controller.py @@ -48,28 +48,6 @@ class CustomController(Controller): self.mcp_client = None self.mcp_server_config = None - async def setup_mcp_client(self, mcp_server_config: Optional[Dict[str, Any]] = None): - self.mcp_server_config = mcp_server_config - if self.mcp_server_config: - self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config) - self.register_mcp_tools() - - def register_mcp_tools(self): - """ - Register the MCP tools used by this controller. - """ - if self.mcp_client: - for server_name in self.mcp_client.server_name_to_tools: - for tool in self.mcp_client.server_name_to_tools[server_name]: - tool_name = f"mcp.{server_name}.{tool.name}" - self.registry.registry.actions[tool_name] = RegisteredAction( - name=tool_name, - description=tool.description, - function=tool, - param_model=create_tool_param_model(tool), - ) - logger.info(f"Add mcp tool: {tool_name}") - def _register_custom_actions(self): """Register all custom browser actions""" @@ -173,6 +151,28 @@ class CustomController(Controller): except Exception as e: raise e + async def setup_mcp_client(self, mcp_server_config: Optional[Dict[str, Any]] = None): + self.mcp_server_config = mcp_server_config + if self.mcp_server_config: + self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config) + self.register_mcp_tools() + + def register_mcp_tools(self): + """ + Register the MCP tools used by this controller. 
+ """ + if self.mcp_client: + for server_name in self.mcp_client.server_name_to_tools: + for tool in self.mcp_client.server_name_to_tools[server_name]: + tool_name = f"mcp.{server_name}.{tool.name}" + self.registry.registry.actions[tool_name] = RegisteredAction( + name=tool_name, + description=tool.description, + function=tool, + param_model=create_tool_param_model(tool), + ) + logger.info(f"Add mcp tool: {tool_name}") + async def close_mcp_client(self): if self.mcp_client: await self.mcp_client.__aexit__(None, None, None) diff --git a/src/utils/mcp_client.py b/src/utils/mcp_client.py index a5d6fcd..b909d0d 100644 --- a/src/utils/mcp_client.py +++ b/src/utils/mcp_client.py @@ -40,7 +40,13 @@ async def setup_mcp_client_and_tools(mcp_server_config: Dict[str, Any]) -> Optio logger.info("Initializing MultiServerMCPClient...") + if not mcp_server_config: + logger.error("No MCP server configuration provided.") + return None + try: + if "mcpServers" in mcp_server_config: + mcp_server_config = mcp_server_config["mcpServers"] client = MultiServerMCPClient(mcp_server_config) await client.__aenter__() return client diff --git a/src/utils/utils.py b/src/utils/utils.py index 8703c46..f0f0b76 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -9,25 +9,6 @@ import gradio as gr import uuid -# Callback to update the model name dropdown based on the selected provider -def update_model_dropdown(llm_provider, api_key=None, base_url=None): - """ - Update the model name dropdown with predefined models for the selected provider. 
- """ - import gradio as gr - # Use API keys from .env if not provided - if not api_key: - api_key = os.getenv(f"{llm_provider.upper()}_API_KEY", "") - if not base_url: - base_url = os.getenv(f"{llm_provider.upper()}_BASE_URL", "") - - # Use predefined models for the selected provider - if llm_provider in model_names: - return gr.Dropdown(choices=model_names[llm_provider], value=model_names[llm_provider][0], interactive=True) - else: - return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True) - - def encode_image(img_path): if not img_path: return None @@ -56,108 +37,3 @@ def get_latest_files(directory: str, file_types: list = ['.webm', '.zip']) -> Di print(f"Error getting latest {file_type} file: {e}") return latest_files - - -async def capture_screenshot(browser_context): - """Capture and encode a screenshot""" - # Extract the Playwright browser instance - playwright_browser = browser_context.browser.playwright_browser # Ensure this is correct. - - # Check if the browser instance is valid and if an existing context can be reused - if playwright_browser and playwright_browser.contexts: - playwright_context = playwright_browser.contexts[0] - else: - return None - - # Access pages in the context - pages = None - if playwright_context: - pages = playwright_context.pages - - # Use an existing page or create a new one if none exist - if pages: - active_page = pages[0] - for page in pages: - if page.url != "about:blank": - active_page = page - else: - return None - - # Take screenshot - try: - screenshot = await active_page.screenshot( - type='jpeg', - quality=75, - scale="css" - ) - encoded = base64.b64encode(screenshot).decode('utf-8') - return encoded - except Exception as e: - return None - - -class ConfigManager: - def __init__(self): - self.components = {} - self.component_order = [] - - def register_component(self, name: str, component): - """Register a gradio component for config management.""" - self.components[name] = component - if 
name not in self.component_order: - self.component_order.append(name) - return component - - def save_current_config(self): - """Save the current configuration of all registered components.""" - current_config = {} - for name in self.component_order: - component = self.components[name] - # Get the current value from the component - current_config[name] = getattr(component, "value", None) - - return save_config_to_file(current_config) - - def update_ui_from_config(self, config_file): - """Update UI components from a loaded configuration file.""" - if config_file is None: - return [gr.update() for _ in self.component_order] + ["No file selected."] - - loaded_config = load_config_from_file(config_file.name) - - if not isinstance(loaded_config, dict): - return [gr.update() for _ in self.component_order] + ["Error: Invalid configuration file."] - - # Prepare updates for all components - updates = [] - for name in self.component_order: - if name in loaded_config: - updates.append(gr.update(value=loaded_config[name])) - else: - updates.append(gr.update()) - - updates.append("Configuration loaded successfully.") - return updates - - def get_all_components(self): - """Return all registered components in the order they were registered.""" - return [self.components[name] for name in self.component_order] - - -def load_config_from_file(config_file): - """Load settings from a config file (JSON format).""" - try: - with open(config_file, 'r') as f: - settings = json.load(f) - return settings - except Exception as e: - return f"Error loading configuration: {str(e)}" - - -def save_config_to_file(settings, save_dir="./tmp/webui_settings"): - """Save the current settings to a UUID.json file with a UUID name.""" - os.makedirs(save_dir, exist_ok=True) - config_file = os.path.join(save_dir, f"{uuid.uuid4()}.json") - with open(config_file, 'w') as f: - json.dump(settings, f, indent=2) - return f"Configuration saved to {config_file}" diff --git 
a/src/webui/components/agent_settings_tab.py b/src/webui/components/agent_settings_tab.py index a2479b3..85e7c0e 100644 --- a/src/webui/components/agent_settings_tab.py +++ b/src/webui/components/agent_settings_tab.py @@ -50,7 +50,7 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen extend_system_prompt = gr.Textbox(label="Extend system prompt", lines=4, interactive=True) with gr.Group(): - mcp_json_file = gr.File(label="MCP server file", interactive=True, file_types=[".json"]) + mcp_json_file = gr.File(label="MCP server json", interactive=True, file_types=[".json"]) mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False) with gr.Group(): @@ -118,6 +118,7 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen choices=[provider for provider, model in config.model_names.items()], label="Planner LLM Provider", info="Select LLM provider for LLM", + value=None, interactive=True ) planner_llm_model_name = gr.Dropdown( @@ -201,7 +202,6 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen interactive=True, allow_custom_value=True, choices=["auto", "json_schema", "function_calling", "None"], - info="Tool Calls Function Name", visible=True ) tab_components.update(dict( @@ -228,6 +228,8 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen mcp_json_file=mcp_json_file, mcp_server_config=mcp_server_config, )) + webui_manager.add_components("agent_settings", tab_components) + llm_provider.change( fn=lambda x: gr.update(visible=x == "ollama"), inputs=llm_provider, @@ -236,23 +238,21 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen llm_provider.change( lambda provider: update_model_dropdown(provider), inputs=[llm_provider], - outputs=llm_model_name + outputs=[llm_model_name] ) planner_llm_provider.change( fn=lambda x: gr.update(visible=x == "ollama"), - inputs=planner_llm_provider, - 
outputs=planner_ollama_num_ctx + inputs=[planner_llm_provider], + outputs=[planner_ollama_num_ctx] ) planner_llm_provider.change( lambda provider: update_model_dropdown(provider), inputs=[planner_llm_provider], - outputs=planner_llm_model_name + outputs=[planner_llm_model_name] ) mcp_json_file.change( update_mcp_server, - inputs=mcp_json_file, + inputs=[mcp_json_file], outputs=[mcp_server_config, mcp_server_config] ) - - return tab_components diff --git a/src/webui/components/browser_settings_tab.py b/src/webui/components/browser_settings_tab.py index c2b3e56..0d3bcbb 100644 --- a/src/webui/components/browser_settings_tab.py +++ b/src/webui/components/browser_settings_tab.py @@ -35,7 +35,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager) -> dict[str, Compon ) keep_browser_open = gr.Checkbox( label="Keep Browser Open", - value=False, + value=True, info="Keep Browser Open between Tasks", interactive=True ) @@ -119,7 +119,9 @@ def create_browser_settings_tab(webui_manager: WebuiManager) -> dict[str, Compon save_agent_history_path=save_agent_history_path, save_download_path=save_download_path, cdp_url=cdp_url, - wss_url=wss_url + wss_url=wss_url, + window_h=window_h, + window_w=window_w, ) ) - return tab_components + webui_manager.add_components("browser_settings", tab_components) diff --git a/src/webui/components/browser_use_agent_tab.py b/src/webui/components/browser_use_agent_tab.py index 8f842af..8a122b9 100644 --- a/src/webui/components/browser_use_agent_tab.py +++ b/src/webui/components/browser_use_agent_tab.py @@ -1,62 +1,921 @@ import gradio as gr from gradio.components import Component +import asyncio +import os +import json +import uuid +import logging +from datetime import datetime +from typing import List, Dict, Optional, Any, Set, Generator, AsyncGenerator, Union +from collections.abc import Awaitable +from langchain_core.language_models.chat_models import BaseChatModel +import base64 +from browser_use.browser.browser import Browser, 
BrowserConfig +from browser_use.browser.context import BrowserContext, BrowserContextConfig, BrowserContextWindowSize +from browser_use.agent.service import Agent +from browser_use.agent.views import AgentHistoryList +from browser_use.agent.views import ToolCallingMethod # Adjust import +from browser_use.agent.views import ( + REQUIRED_LLM_API_ENV_VARS, + ActionResult, + AgentError, + AgentHistory, + AgentHistoryList, + AgentOutput, + AgentSettings, + AgentState, + AgentStepInfo, + StepMetadata, + ToolCallingMethod, +) +from browser_use.browser.browser import Browser +from browser_use.browser.context import BrowserContext +from browser_use.browser.views import BrowserState, BrowserStateHistory from src.webui.webui_manager import WebuiManager -from src.utils import config +from src.controller.custom_controller import CustomController +from src.utils import llm_provider +from src.browser.custom_browser import CustomBrowser +from src.browser.custom_context import CustomBrowserContext, CustomBrowserContextConfig + +logger = logging.getLogger(__name__) -def create_browser_use_agent_tab(webui_manager: WebuiManager) -> dict[str, Component]: +# --- Helper Functions --- (Defined at module level) + +async def _initialize_llm(provider: Optional[str], model_name: Optional[str], temperature: float, + base_url: Optional[str], api_key: Optional[str], num_ctx: Optional[int] = None) -> Optional[ + BaseChatModel]: + """Initializes the LLM based on settings. 
Returns None if provider/model is missing.""" + if not provider or not model_name: + logger.info("LLM Provider or Model Name not specified, LLM will be None.") + return None + try: + # Use your actual LLM provider logic here + logger.info(f"Initializing LLM: Provider={provider}, Model={model_name}, Temp={temperature}") + # Example using a placeholder function + llm = llm_provider.get_llm_model( + provider=provider, + model_name=model_name, + temperature=temperature, + base_url=base_url or None, + api_key=api_key or None, + # Add other relevant params like num_ctx for ollama + num_ctx=num_ctx if provider == "ollama" else None + ) + return llm + except Exception as e: + logger.error(f"Failed to initialize LLM: {e}", exc_info=True) + gr.Warning( + f"Failed to initialize LLM '{model_name}' for provider '{provider}'. Please check settings. Error: {e}") + return None + + +def _get_config_value(webui_manager: WebuiManager, comp_dict: Dict[gr.components.Component, Any], comp_id_suffix: str, + default: Any = None) -> Any: + """Safely get value from component dictionary using its ID suffix relative to the tab.""" + # Assumes component ID format is "tab_name.comp_name" + tab_name = "browser_use_agent" # Hardcode or derive if needed + comp_id = f"{tab_name}.{comp_id_suffix}" + # Need to find the component object first using the ID from the manager + try: + comp = webui_manager.get_component_by_id(comp_id) + return comp_dict.get(comp, default) + except KeyError: + # Try accessing settings tabs as well + for prefix in ["agent_settings", "browser_settings"]: + try: + comp_id = f"{prefix}.{comp_id_suffix}" + comp = webui_manager.get_component_by_id(comp_id) + return comp_dict.get(comp, default) + except KeyError: + continue + logger.warning(f"Component with suffix '{comp_id_suffix}' not found in manager for value lookup.") + return default + + +def _format_agent_output(model_output: AgentOutput) -> str: + """Formats AgentOutput for display in the chatbot using JSON.""" + content = 
"" + if model_output: + try: + # Directly use model_dump if actions and current_state are Pydantic models + action_dump = [action.model_dump(exclude_none=True) for action in model_output.action] + + state_dump = model_output.current_state.model_dump(exclude_none=True) + model_output_dump = { + 'current_state': state_dump, + 'action': action_dump, + } + # Dump to JSON string with indentation + json_string = json.dumps(model_output_dump, indent=4, ensure_ascii=False) + # Wrap in
<pre><code> for proper display in HTML
+ content = f"<pre><code class='language-json'>{json_string}</code></pre>"
+
+ except AttributeError as ae:
+ logger.error(
+ f"AttributeError during model dump: {ae}. Check if 'action' or 'current_state' or their items support 'model_dump'.")
+ content = f"<pre><code>Error: Could not format agent output (AttributeError: {ae}).\nRaw output: {str(model_output)}</code></pre>"
+ except Exception as e:
+ logger.error(f"Error formatting agent output: {e}", exc_info=True)
+ # Fallback to simple string representation on error
+ content = f"<pre><code>Error formatting agent output.\nRaw output:\n{str(model_output)}</code></pre>"
+
+ return content.strip()
+
+
+# --- Updated Callback Implementation ---
+
+async def _handle_new_step(webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int):
+ """Callback for each step taken by the agent, including screenshot display."""
+
+ # Use the correct chat history attribute name from the user's code
+ if not hasattr(webui_manager, 'bu_chat_history'):
+ logger.error("Attribute 'bu_chat_history' not found in webui_manager! Cannot add chat message.")
+ # Initialize it maybe? Or raise an error? For now, log and potentially skip chat update.
+ webui_manager.bu_chat_history = [] # Initialize if missing (consider if this is the right place)
+ # return # Or stop if this is critical
+ step_num -= 1
+ logger.info(f"Step {step_num} completed.")
+
+ # --- Screenshot Handling ---
+ screenshot_html = ""
+ # Ensure state.screenshot exists and is not empty before proceeding
+ # Use getattr for safer access
+ screenshot_data = getattr(state, 'screenshot', None)
+ if screenshot_data:
+ try:
+ # Basic validation: check if it looks like base64
+ if isinstance(screenshot_data, str) and len(screenshot_data) > 100: # Arbitrary length check
+ # *** UPDATED STYLE: Removed centering, adjusted width ***
+ img_tag = f'<img src="data:image/jpeg;base64,{screenshot_data}" alt="Step {step_num} Screenshot" style="max-width: 800px; max-height: 600px; object-fit:contain;" />'
+ screenshot_html = img_tag + "<br/>" # Use <br/> for line break after inline-block image
+ else:
+ logger.warning(
+ f"Screenshot for step {step_num} seems invalid (type: {type(screenshot_data)}, len: {len(screenshot_data) if isinstance(screenshot_data, str) else 'N/A'}).")
+ screenshot_html = "**[Invalid screenshot data]**<br/>"
+
+ except Exception as e:
+ logger.error(f"Error processing or formatting screenshot for step {step_num}: {e}", exc_info=True)
+ screenshot_html = "**[Error displaying screenshot]**<br/>"
+ else:
+ logger.debug(f"No screenshot available for step {step_num}.")
+
+ # --- Format Agent Output ---
+ formatted_output = _format_agent_output(output) # Use the updated function
+
+ # --- Combine and Append to Chat ---
+ step_header = f"--- **Step {step_num}** ---"
+ # Combine header, image (with line break), and JSON block
+ final_content = step_header + "<br/>" + screenshot_html + formatted_output
+
+ chat_message = {
+ "role": "assistant",
+ "content": final_content.strip() # Remove leading/trailing whitespace
+ }
+
+ # Append to the correct chat history list
+ webui_manager.bu_chat_history.append(chat_message)
+
+ await asyncio.sleep(0.05)
+
+
+def _handle_done(webui_manager: WebuiManager, history: AgentHistoryList):
+ """Callback when the agent finishes the task (success or failure)."""
+ logger.info(
+ f"Agent task finished. Duration: {history.total_duration_seconds():.2f}s, Tokens: {history.total_input_tokens()}")
+ final_summary = f"**Task Completed**\n"
+ final_summary += f"- Duration: {history.total_duration_seconds():.2f} seconds\n"
+ final_summary += f"- Total Input Tokens: {history.total_input_tokens()}\n" # Or total tokens if available
+
+ final_result = history.final_result()
+ if final_result:
+ final_summary += f"- Final Result: {final_result}\n"
+
+ errors = history.errors()
+ if errors and any(errors):
+ final_summary += f"- **Errors:**\n```\n{errors}\n```\n"
+ else:
+ final_summary += "- Status: Success\n"
+
+ webui_manager.bu_chat_history.append({"role": "assistant", "content": final_summary})
+
+
+async def _ask_assistant_callback(webui_manager: WebuiManager, query: str, browser_context: BrowserContext) -> Dict[
+ str, Any]:
+ """Callback triggered by the agent's ask_for_assistant action."""
+ logger.info("Agent requires assistance. Waiting for user input.")
+
+ if not hasattr(webui_manager, '_chat_history'):
+ logger.error("Chat history not found in webui_manager during ask_assistant!")
+ return {"response": "Internal Error: Cannot display help request."}
+
+ webui_manager.bu_chat_history.append({"role": "assistant",
+ "content": f"**Need Help:** {query}\nPlease provide information or perform the required action in the browser, then type your response/confirmation below and click 'Submit Response'."})
+
+ # Use state stored in webui_manager
+ webui_manager.bu_response_event = asyncio.Event()
+ webui_manager.bu_user_help_response = None # Reset previous response
+
+ try:
+ logger.info("Waiting for user response event...")
+ await asyncio.wait_for(webui_manager.bu_response_event.wait(), timeout=3600.0) # Long timeout
+ logger.info("User response event received.")
+ except asyncio.TimeoutError:
+ logger.warning("Timeout waiting for user assistance.")
+ webui_manager.bu_chat_history.append(
+ {"role": "assistant", "content": "**Timeout:** No response received. Trying to proceed."})
+ webui_manager.bu_response_event = None # Clear the event
+ return {"response": "Timeout: User did not respond."} # Inform the agent
+
+ response = webui_manager.bu_user_help_response
+ webui_manager.bu_chat_history.append({"role": "user", "content": response}) # Show user response in chat
+ webui_manager.bu_response_event = None # Clear the event for the next potential request
+ return {"response": response}
+
+
+async def capture_screenshot(browser_context):
+ """Capture and encode a screenshot"""
+ # Extract the Playwright browser instance
+ playwright_browser = browser_context.browser.playwright_browser # Ensure this is correct.
+
+ # Check if the browser instance is valid and if an existing context can be reused
+ if playwright_browser and playwright_browser.contexts:
+ playwright_context = playwright_browser.contexts[0]
+ else:
+ return None
+
+ # Access pages in the context
+ pages = None
+ if playwright_context:
+ pages = playwright_context.pages
+
+ # Use an existing page or create a new one if none exist
+ if pages:
+ active_page = pages[0]
+ for page in pages:
+ if page.url != "about:blank":
+ active_page = page
+ else:
+ return None
+
+ # Take screenshot
+ try:
+ screenshot = await active_page.screenshot(
+ type='jpeg',
+ quality=75,
+ scale="css"
+ )
+ encoded = base64.b64encode(screenshot).decode('utf-8')
+ return encoded
+ except Exception as e:
+ return None
+
+
+# --- Core Agent Execution Logic --- (Needs access to webui_manager)
+
+async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]) -> AsyncGenerator[
+ Dict[gr.components.Component, Any], None]:
+ """Handles the entire lifecycle of initializing and running the agent."""
+
+ # --- Get Components ---
+ # Need handles to specific UI components to update them
+ user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input")
+ run_button_comp = webui_manager.get_component_by_id("browser_use_agent.run_button")
+ stop_button_comp = webui_manager.get_component_by_id("browser_use_agent.stop_button")
+ pause_resume_button_comp = webui_manager.get_component_by_id("browser_use_agent.pause_resume_button")
+ clear_button_comp = webui_manager.get_component_by_id("browser_use_agent.clear_button")
+ chatbot_comp = webui_manager.get_component_by_id("browser_use_agent.chatbot")
+ history_file_comp = webui_manager.get_component_by_id("browser_use_agent.agent_history_file")
+ gif_comp = webui_manager.get_component_by_id("browser_use_agent.recording_gif")
+ browser_view_comp = webui_manager.get_component_by_id("browser_use_agent.browser_view")
+
+ # --- 1. Get Task and Initial UI Update ---
+ task = components.get(user_input_comp, "").strip()
+ if not task:
+ gr.Warning("Please enter a task.")
+ yield {run_button_comp: gr.update(interactive=True)}
+ return
+
+ # Set running state indirectly via _current_task
+ webui_manager.bu_chat_history.append({"role": "user", "content": task})
+
+ yield {
+ user_input_comp: gr.Textbox(value="", interactive=False, placeholder="Agent is running..."),
+ run_button_comp: gr.Button(value="⏳ Running...", interactive=False),
+ stop_button_comp: gr.Button(interactive=True),
+ pause_resume_button_comp: gr.Button(value="⏸️ Pause", interactive=True),
+ clear_button_comp: gr.Button(interactive=False),
+ chatbot_comp: gr.update(value=webui_manager.bu_chat_history),
+ history_file_comp: gr.update(value=None),
+ gif_comp: gr.update(value=None),
+ }
+
+ # --- Agent Settings ---
+ # Access settings values via components dict, getting IDs from webui_manager
+ def get_setting(key, default=None):
+ comp = webui_manager.id_to_component.get(f"agent_settings.{key}")
+ return components.get(comp, default) if comp else default
+
+ override_system_prompt = get_setting("override_system_prompt") or None
+ extend_system_prompt = get_setting("extend_system_prompt") or None
+ llm_provider_name = get_setting("llm_provider", None) # Default to None if not found
+ llm_model_name = get_setting("llm_model_name", None)
+ llm_temperature = get_setting("llm_temperature", 0.6)
+ use_vision = get_setting("use_vision", True)
+ ollama_num_ctx = get_setting("ollama_num_ctx", 16000)
+ llm_base_url = get_setting("llm_base_url") or None
+ llm_api_key = get_setting("llm_api_key") or None
+ max_steps = get_setting("max_steps", 100)
+ max_actions = get_setting("max_actions", 10)
+ max_input_tokens = get_setting("max_input_tokens", 128000)
+ tool_calling_str = get_setting("tool_calling_method", "auto")
+ tool_calling_method = tool_calling_str if tool_calling_str != "None" else None
+ mcp_server_config_comp = webui_manager.id_to_component.get("agent_settings.mcp_server_config")
+ mcp_server_config_str = components.get(mcp_server_config_comp) if mcp_server_config_comp else None
+ mcp_server_config = json.loads(mcp_server_config_str) if mcp_server_config_str else None
+
+ # Planner LLM Settings (Optional)
+ planner_llm_provider_name = get_setting("planner_llm_provider") or None
+ planner_llm = None
+ if planner_llm_provider_name:
+ planner_llm_model_name = get_setting("planner_llm_model_name")
+ planner_llm_temperature = get_setting("planner_llm_temperature", 0.6)
+ planner_ollama_num_ctx = get_setting("planner_ollama_num_ctx", 16000)
+ planner_llm_base_url = get_setting("planner_llm_base_url") or None
+ planner_llm_api_key = get_setting("planner_llm_api_key") or None
+ planner_use_vision = get_setting("planner_use_vision", False)
+
+ planner_llm = await _initialize_llm(
+ planner_llm_provider_name, planner_llm_model_name, planner_llm_temperature,
+ planner_llm_base_url, planner_llm_api_key,
+ planner_ollama_num_ctx if planner_llm_provider_name == "ollama" else None
+ )
+
+ # --- Browser Settings ---
+ def get_browser_setting(key, default=None):
+ comp = webui_manager.id_to_component.get(f"browser_settings.{key}")
+ return components.get(comp, default) if comp else default
+
+ browser_binary_path = get_browser_setting("browser_binary_path") or None
+ browser_user_data_dir = get_browser_setting("browser_user_data_dir") or None
+ use_own_browser = get_browser_setting("use_own_browser", False) # Logic handled by CDP/WSS presence
+ keep_browser_open = get_browser_setting("keep_browser_open", False)
+ headless = get_browser_setting("headless", False)
+ disable_security = get_browser_setting("disable_security", True)
+ window_w = int(get_browser_setting("window_w", 1280))
+ window_h = int(get_browser_setting("window_h", 1100))
+ cdp_url = get_browser_setting("cdp_url") or None
+ wss_url = get_browser_setting("wss_url") or None
+ save_recording_path = get_browser_setting("save_recording_path") or None
+ save_trace_path = get_browser_setting("save_trace_path") or None
+ save_agent_history_path = get_browser_setting("save_agent_history_path", "./tmp/agent_history")
+ save_download_path = get_browser_setting("save_download_path", "./tmp/downloads")
+
+ stream_vw = 80
+ stream_vh = int(80 * window_h // window_w)
+
+ os.makedirs(save_agent_history_path, exist_ok=True)
+ if save_recording_path: os.makedirs(save_recording_path, exist_ok=True)
+ if save_trace_path: os.makedirs(save_trace_path, exist_ok=True)
+ if save_download_path: os.makedirs(save_download_path, exist_ok=True)
+
+ # --- 2. Initialize LLM ---
+ main_llm = await _initialize_llm(
+ llm_provider_name, llm_model_name, llm_temperature, llm_base_url, llm_api_key,
+ ollama_num_ctx if llm_provider_name == "ollama" else None
+ )
+
+ # Pass the webui_manager instance to the callback when wrapping it
+ async def ask_callback_wrapper(query: str, browser_context: BrowserContext) -> Dict[str, Any]:
+ return await _ask_assistant_callback(webui_manager, query, browser_context)
+
+ if not webui_manager.bu_controller:
+ webui_manager.bu_controller = CustomController(ask_assistant_callback=ask_callback_wrapper)
+ await webui_manager.bu_controller.setup_mcp_client(mcp_server_config)
+
+ # --- 4. Initialize Browser and Context ---
+ should_close_browser_on_finish = not keep_browser_open
+
+ try:
+ # Close existing resources if not keeping open
+ if not keep_browser_open:
+ if webui_manager.bu_browser_context:
+ logger.info("Closing previous browser context.")
+ await webui_manager.bu_browser_context.close()
+ webui_manager.bu_browser_context = None
+ if webui_manager.bu_browser:
+ logger.info("Closing previous browser.")
+ await webui_manager.bu_browser.close()
+ webui_manager.bu_browser = None
+
+ # Create Browser if needed
+ if not webui_manager.bu_browser:
+ logger.info("Launching new browser instance.")
+ extra_args = [f"--window-size={window_w},{window_h}"]
+ if browser_user_data_dir:
+ extra_args.append(f"--user-data-dir={browser_user_data_dir}")
+
+ if use_own_browser:
+ browser_binary_path = os.getenv("CHROME_PATH", None) or browser_binary_path
+ if browser_binary_path == "":
+ browser_binary_path = None
+ chrome_user_data = os.getenv("CHROME_USER_DATA", None)
+ if chrome_user_data:
+ extra_args += [f"--user-data-dir={chrome_user_data}"]
+ else:
+ browser_binary_path = None
+
+ webui_manager.bu_browser = CustomBrowser(
+ config=BrowserConfig(
+ headless=headless,
+ disable_security=disable_security,
+ browser_binary_path=browser_binary_path,
+ extra_browser_args=extra_args,
+ wss_url=wss_url,
+ cdp_url=cdp_url,
+ )
+ )
+
+ # Create Context if needed
+ if not webui_manager.bu_browser_context:
+ logger.info("Creating new browser context.")
+ context_config = CustomBrowserContextConfig(
+ trace_path=save_trace_path if save_trace_path else None,
+ save_recording_path=save_recording_path if save_recording_path else None,
+ save_downloads_path=save_download_path if save_download_path else None,
+ browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h)
+ )
+ if not webui_manager.bu_browser:
+ raise ValueError("Browser not initialized, cannot create context.")
+ webui_manager.bu_browser_context = await webui_manager.bu_browser.new_context(config=context_config)
+
+ # --- 5. Initialize or Update Agent ---
+ webui_manager.bu_agent_task_id = str(uuid.uuid4()) # New ID for this task run
+ os.makedirs(os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id), exist_ok=True)
+ history_file = os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id,
+ f"{webui_manager.bu_agent_task_id}.json")
+ gif_path = os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id,
+ f"{webui_manager.bu_agent_task_id}.gif")
+
+ # Pass the webui_manager to callbacks when wrapping them
+ async def step_callback_wrapper(state: BrowserState, output: AgentOutput, step_num: int):
+ await _handle_new_step(webui_manager, state, output, step_num)
+
+ def done_callback_wrapper(history: AgentHistoryList):
+ _handle_done(webui_manager, history)
+
+ if not webui_manager.bu_agent:
+ logger.info(f"Initializing new agent for task: {task}")
+ if not webui_manager.bu_browser or not webui_manager.bu_browser_context:
+ raise ValueError("Browser or Context not initialized, cannot create agent.")
+
+ webui_manager.bu_agent = Agent(
+ task=task,
+ llm=main_llm,
+ browser=webui_manager.bu_browser,
+ browser_context=webui_manager.bu_browser_context,
+ controller=webui_manager.bu_controller,
+ register_new_step_callback=step_callback_wrapper,
+ register_done_callback=done_callback_wrapper,
+ # Agent settings
+ use_vision=use_vision,
+ override_system_message=override_system_prompt,
+ extend_system_message=extend_system_prompt,
+ max_input_tokens=max_input_tokens,
+ max_actions_per_step=max_actions,
+ tool_calling_method=tool_calling_method,
+ planner_llm=planner_llm,
+ use_vision_for_planner=planner_use_vision if planner_llm else False,
+ save_conversation_path=history_file,
+ )
+ webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
+ webui_manager.bu_agent.settings.generate_gif = gif_path
+ else:
+ webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
+ webui_manager.bu_agent.add_new_task(task)
+ webui_manager.bu_agent.settings.generate_gif = gif_path
+
+ # --- 6. Run Agent Task and Stream Updates ---
+ agent_run_coro = webui_manager.bu_agent.run(max_steps=max_steps)
+ agent_task = asyncio.create_task(agent_run_coro)
+ webui_manager.bu_current_task = agent_task # Store the task
+
+ last_chat_len = len(webui_manager.bu_chat_history)
+ while not agent_task.done():
+ is_paused = webui_manager.bu_agent.state.paused
+ is_stopped = webui_manager.bu_agent.state.stopped
+
+ # Check for pause state
+ if is_paused:
+ yield {
+ pause_resume_button_comp: gr.update(value="▶️ Resume", interactive=True),
+ run_button_comp: gr.update(value="⏸️ Paused", interactive=False),
+ stop_button_comp: gr.update(interactive=True), # Allow stop while paused
+ }
+ # Wait until pause is released or task is stopped/done
+ while is_paused and not agent_task.done():
+ # Re-check agent state in loop
+ is_paused = webui_manager.bu_agent.state.paused
+ is_stopped = webui_manager.bu_agent.state.stopped
+ if is_stopped: # Stop signal received while paused
+ break
+ await asyncio.sleep(0.2)
+
+ if agent_task.done() or is_stopped: # If stopped or task finished while paused
+ break
+
+ # If resumed, yield UI update
+ yield {
+ pause_resume_button_comp: gr.update(value="⏸️ Pause", interactive=True),
+ run_button_comp: gr.update(value="⏳ Running...", interactive=False),
+ }
+
+ # Check if agent stopped itself or stop button was pressed (which sets agent.state.stopped)
+ if is_stopped:
+ logger.info("Agent has stopped (internally or via stop button).")
+ if not agent_task.done():
+ # Ensure the task coroutine finishes if agent just set flag
+ try:
+ await asyncio.wait_for(agent_task, timeout=1.0) # Give it a moment to exit run()
+ except asyncio.TimeoutError:
+ logger.warning("Agent task did not finish quickly after stop signal, cancelling.")
+ agent_task.cancel()
+ except Exception: # Catch task exceptions if it errors on stop
+ pass
+ break # Exit the streaming loop
+
+ # Check if agent is asking for help (via response_event)
+ update_dict = {}
+ if webui_manager.bu_response_event is not None:
+ update_dict = {
+ user_input_comp: gr.update(placeholder="Agent needs help. Enter response and submit.",
+ interactive=True),
+ run_button_comp: gr.update(value="✔️ Submit Response", interactive=True),
+ pause_resume_button_comp: gr.update(interactive=False),
+ stop_button_comp: gr.update(interactive=False),
+ chatbot_comp: gr.update(value=webui_manager.bu_chat_history)
+ }
+ last_chat_len = len(webui_manager.bu_chat_history)
+ yield update_dict
+ # Wait until response is submitted or task finishes
+ while webui_manager.bu_response_event is not None and not agent_task.done():
+ await asyncio.sleep(0.2)
+ # Restore UI after response submitted or if task ended unexpectedly
+ if not agent_task.done():
+ yield {
+ user_input_comp: gr.update(placeholder="Agent is running...", interactive=False),
+ run_button_comp: gr.update(value="⏳ Running...", interactive=False),
+ pause_resume_button_comp: gr.update(interactive=True),
+ stop_button_comp: gr.update(interactive=True),
+ }
+ else:
+ break # Task finished while waiting for response
+
+ # Update Chatbot if new messages arrived via callbacks
+ if len(webui_manager.bu_chat_history) > last_chat_len:
+ update_dict[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history)
+ last_chat_len = len(webui_manager.bu_chat_history)
+
+ # Update Browser View
+ if headless and webui_manager.bu_browser_context:
+ try:
+ screenshot_b64 = await capture_screenshot(webui_manager.bu_browser_context)
+ if screenshot_b64:
+ html_content = f'<img src="data:image/jpeg;base64,{screenshot_b64}" style="width:{stream_vw}vw; height:{stream_vh}vh; border:1px solid #ccc;">'
+ update_dict[browser_view_comp] = gr.update(value=html_content, visible=True)
+ else:
+ html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Waiting for browser session...</h1>"
+ update_dict[browser_view_comp] = gr.update(value=html_content,
+ visible=True)
+ except Exception as e:
+ logger.debug(f"Failed to capture screenshot: {e}")
+ update_dict[browser_view_comp] = gr.update(value="Error loading view...",
+ visible=True)
+ else:
+ update_dict[browser_view_comp] = gr.update(visible=False)
+
+ # Yield accumulated updates
+ if update_dict:
+ yield update_dict
+
+ await asyncio.sleep(0.1) # Polling interval
+
+ # --- 7. Task Finalization ---
+ webui_manager.bu_agent.state.paused = False
+ webui_manager.bu_agent.state.stopped = False
+ final_update = {}
+ try:
+ logger.info("Agent task completing...")
+ # Await the task ensure completion and catch exceptions if not already caught
+ if not agent_task.done():
+ await agent_task # Retrieve result/exception
+ elif agent_task.exception(): # Check if task finished with exception
+ agent_task.result() # Raise the exception to be caught below
+ logger.info("Agent task completed processing.")
+
+ logger.info(f"Explicitly saving agent history to: {history_file}")
+ webui_manager.bu_agent.save_history(history_file)
+
+ if os.path.exists(history_file):
+ final_update[history_file_comp] = gr.File(value=history_file)
+
+ if gif_path and os.path.exists(gif_path):
+ logger.info(f"GIF found at: {gif_path}")
+ final_update[gif_comp] = gr.Image(value=gif_path)
+
+ except asyncio.CancelledError:
+ logger.info("Agent task was cancelled.")
+ if not any("Cancelled" in msg.get("content", "") for msg in webui_manager.bu_chat_history if
+ msg.get("role") == "assistant"):
+ webui_manager.bu_chat_history.append({"role": "assistant", "content": "**Task Cancelled**."})
+ final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history)
+ except Exception as e:
+ logger.error(f"Error during agent execution: {e}", exc_info=True)
+ error_message = f"**Agent Execution Error:**\n```\n{type(e).__name__}: {e}\n```"
+ if not any(error_message in msg.get("content", "") for msg in webui_manager.bu_chat_history if
+ msg.get("role") == "assistant"):
+ webui_manager.bu_chat_history.append({"role": "assistant", "content": error_message})
+ final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history)
+ gr.Error(f"Agent execution failed: {e}")
+
+ finally:
+ webui_manager.bu_current_task = None # Clear the task reference
+
+ # Close browser/context if requested
+ if should_close_browser_on_finish:
+ if webui_manager.bu_browser_context:
+ logger.info("Closing browser context after task.")
+ await webui_manager.bu_browser_context.close()
+ webui_manager.bu_browser_context = None
+ if webui_manager.bu_browser:
+ logger.info("Closing browser after task.")
+ await webui_manager.bu_browser.close()
+ webui_manager.bu_browser = None
+
+ # --- 8. Final UI Update ---
+ final_update.update({
+ user_input_comp: gr.update(value="", interactive=True, placeholder="Enter your next task..."),
+ run_button_comp: gr.update(value="▶️ Submit Task", interactive=True),
+ stop_button_comp: gr.update(interactive=False),
+ pause_resume_button_comp: gr.update(value="⏸️ Pause", interactive=False),
+ clear_button_comp: gr.update(interactive=True),
+ # Ensure final chat history is shown
+ chatbot_comp: gr.update(value=webui_manager.bu_chat_history)
+ })
+ yield final_update
+
+ except Exception as e:
+ # Catch errors during setup (before agent run starts)
+ logger.error(f"Error setting up agent task: {e}", exc_info=True)
+ webui_manager.bu_current_task = None # Ensure state is reset
+ yield {
+ user_input_comp: gr.update(interactive=True, placeholder="Error during setup. Enter task..."),
+ run_button_comp: gr.update(value="▶️ Submit Task", interactive=True),
+ stop_button_comp: gr.update(interactive=False),
+ pause_resume_button_comp: gr.update(value="⏸️ Pause", interactive=False),
+ clear_button_comp: gr.update(interactive=True),
+ chatbot_comp: gr.update(
+ value=webui_manager.bu_chat_history + [{"role": "assistant", "content": f"**Setup Error:** {e}"}]),
+ }
+
+
+# --- Button Click Handlers --- (Need access to webui_manager)
+
+async def handle_submit(webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]):
+ """Handles clicks on the main 'Submit' button."""
+ user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input")
+ user_input_value = components.get(user_input_comp, "").strip()
+
+ # Check if waiting for user assistance
+ if webui_manager.bu_response_event and not webui_manager.bu_response_event.is_set():
+ logger.info(f"User submitted assistance: {user_input_value}")
+ webui_manager.bu_user_help_response = user_input_value if user_input_value else "User provided no text response."
+ webui_manager.bu_response_event.set()
+ # UI updates handled by the main loop reacting to the event being set
+ yield {
+ user_input_comp: gr.update(value="", interactive=False, placeholder="Waiting for agent to continue..."),
+ webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(value="⏳ Running...",
+ interactive=False)
+ }
+ # Check if a task is currently running (using _current_task)
+ elif webui_manager.bu_current_task and not webui_manager.bu_current_task.done():
+ logger.warning("Submit button clicked while agent is already running and not asking for help.")
+ gr.Info("Agent is currently running. Please wait or use Stop/Pause.")
+ yield {} # No change
+ else:
+ # Handle submission for a new task
+ logger.info("Submit button clicked for new task.")
+ # Use async generator to stream updates from run_agent_task
+ async for update in run_agent_task(webui_manager, components):
+ yield update
+
+
+async def handle_stop(webui_manager: WebuiManager):
+ """Handles clicks on the 'Stop' button."""
+ logger.info("Stop button clicked.")
+ agent = webui_manager.bu_agent
+ task = webui_manager.bu_current_task
+
+ if agent and task and not task.done():
+ # Signal the agent to stop by setting its internal flag
+ agent.state.stopped = True
+ agent.state.paused = False # Ensure not paused if stopped
+ return {
+ webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update(interactive=False,
+ value="⏹️ Stopping..."),
+ webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(interactive=False),
+ webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(interactive=False),
+ }
+ else:
+ logger.warning("Stop clicked but agent is not running or task is already done.")
+ # Reset UI just in case it's stuck
+ return {
+ webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(interactive=True),
+ webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update(interactive=False),
+ webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(interactive=False),
+ webui_manager.get_component_by_id("browser_use_agent.clear_button"): gr.update(interactive=True),
+ }
+
+
+async def handle_pause_resume(webui_manager: WebuiManager):
+ """Handles clicks on the 'Pause/Resume' button."""
+ agent = webui_manager.bu_agent
+ task = webui_manager.bu_current_task
+
+ if agent and task and not task.done():
+ if agent.state.paused:
+ logger.info("Resume button clicked.")
+ agent.resume()
+ # UI update happens in main loop
+ return {
+ webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(value="⏸️ Pause",
+ interactive=True)} # Optimistic update
+ else:
+ logger.info("Pause button clicked.")
+ agent.pause()
+ return {
+ webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(value="▶️ Resume",
+ interactive=True)} # Optimistic update
+ else:
+ logger.warning("Pause/Resume clicked but agent is not running or doesn't support state.")
+ return {} # No change
+
+
+async def handle_clear(webui_manager: WebuiManager):
+ """Handles clicks on the 'Clear' button."""
+ logger.info("Clear button clicked.")
+
+ # Stop any running task first
+ task = webui_manager.bu_current_task
+ if task and not task.done():
+ logger.info("Clearing requires stopping the current task.")
+ webui_manager.bu_agent.stop()
+ try:
+ await asyncio.wait_for(task, timeout=2.0) # Wait briefly
+ except (asyncio.CancelledError, asyncio.TimeoutError):
+ pass
+ except Exception as e:
+ logger.warning(f"Error stopping task on clear: {e}")
+ webui_manager.bu_current_task.cancel()
+ webui_manager.bu_current_task = None
+
+ if webui_manager.bu_controller:
+ await webui_manager.bu_controller.close_mcp_client()
+ webui_manager.bu_controller = None
+ webui_manager.bu_agent = None
+
+ # Reset state stored in manager
+ webui_manager.bu_chat_history = []
+ webui_manager.bu_response_event = None
+ webui_manager.bu_user_help_response = None
+ webui_manager.bu_agent_task_id = None
+
+ logger.info("Agent state and browser resources cleared.")
+
+ # Reset UI components
+ return {
+ webui_manager.get_component_by_id("browser_use_agent.chatbot"): gr.update(value=[]),
+ webui_manager.get_component_by_id("browser_use_agent.user_input"): gr.update(value="",
+ placeholder="Enter your task here..."),
+ webui_manager.get_component_by_id("browser_use_agent.agent_history_file"): gr.update(value=None),
+ webui_manager.get_component_by_id("browser_use_agent.recording_gif"): gr.update(value=None),
+ webui_manager.get_component_by_id("browser_use_agent.browser_view"): gr.update(
+ value="Browser Cleared"),
+ webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(value="▶️ Submit Task",
+ interactive=True),
+ webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update(interactive=False),
+ webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(value="⏸️ Pause",
+ interactive=False),
+ webui_manager.get_component_by_id("browser_use_agent.clear_button"): gr.update(interactive=True),
+ }
+
+
+# --- Tab Creation Function ---
+
+def create_browser_use_agent_tab(webui_manager: WebuiManager):
"""
- Create the run agent tab
+ Create the run agent tab, defining UI, state, and handlers.
"""
- input_components = set(webui_manager.get_components())
+ webui_manager.init_browser_use_agent()
+
+ # --- Define UI Components ---
tab_components = {}
-
- chatbot = gr.Chatbot(type='messages', label="Chat History", height=600)
- user_input = gr.Textbox(
- label="User Input",
- lines=3,
- value="go to google.com and type 'OpenAI' click search and give me the first url",
- interactive=True
- )
-
- with gr.Row():
- stop_button = gr.Button("⏹️ Stop", interactive=False, variant="stop", scale=2)
- clear_button = gr.Button("🧹 Clear", interactive=True, variant="stop", scale=2)
- run_button = gr.Button("▶️ Summit", variant="primary", scale=3)
-
- browser_view = gr.HTML(
- value="Waiting for browser session...",
- label="Browser Live View",
- visible=False
- )
-
- with gr.Row():
- agent_final_result = gr.Textbox(
- label="Final Result", lines=3, show_label=True, interactive=False
+ with gr.Column():
+ chatbot = gr.Chatbot(
+ lambda: webui_manager.bu_chat_history, # Load history dynamically
+ elem_id="browser_use_chatbot",
+ label="Agent Interaction",
+ type="messages",
+ height=600,
+ show_copy_button=True,
+ bubble_full_width=False,
)
- agent_errors = gr.Textbox(
- label="Errors", lines=3, show_label=True, interactive=False
+ user_input = gr.Textbox(
+ label="Your Task or Response",
+ placeholder="Enter your task here or provide assistance when asked.",
+ lines=3,
+ interactive=True,
+ elem_id="user_input"
)
+ with gr.Row():
+ stop_button = gr.Button("⏹️ Stop", interactive=False, variant="stop", scale=1)
+ pause_resume_button = gr.Button("⏸️ Pause", interactive=False, variant="secondary", scale=1)
+ clear_button = gr.Button("🗑️ Clear", interactive=True, variant="secondary", scale=1)
+ run_button = gr.Button("▶️ Submit Task", variant="primary", scale=2)
- with gr.Row():
- agent_trace_file = gr.File(label="Trace File", interactive=False)
- agent_history_file = gr.File(label="Agent History", interactive=False)
+ browser_view = gr.HTML(
+ value="Browser View (Requires Headless=True)",
+ label="Browser Live View",
+ elem_id="browser_view",
+ visible=False,
+ )
+ with gr.Column():
+ gr.Markdown("### Task Outputs")
+ agent_history_file = gr.File(label="Agent History JSON", interactive=False)
+ recording_gif = gr.Image(label="Task Recording GIF", format="gif", interactive=False,
+ type="filepath")
- recording_gif = gr.Image(label="Result GIF", format="gif", interactive=False)
+ # --- Store Components in Manager ---
tab_components.update(
dict(
- chatbot=chatbot,
- user_input=user_input,
- clear_button=clear_button,
- run_button=run_button,
- stop_button=stop_button,
- agent_final_result=agent_final_result,
- agent_errors=agent_errors,
- agent_trace_file=agent_trace_file,
- agent_history_file=agent_history_file,
- recording_gif=recording_gif,
+ chatbot=chatbot, user_input=user_input, clear_button=clear_button,
+ run_button=run_button, stop_button=stop_button, pause_resume_button=pause_resume_button,
+ agent_history_file=agent_history_file, recording_gif=recording_gif,
browser_view=browser_view
)
)
- return tab_components
+ webui_manager.add_components("browser_use_agent", tab_components) # Use "browser_use_agent" as tab_name prefix
+
+ all_managed_components = set(webui_manager.get_components()) # Get all components known to manager
+ run_tab_outputs = list(tab_components.values())
+
+ async def submit_wrapper(components_dict: Dict[Component, Any]) -> AsyncGenerator[Dict[Component, Any], None]:
+ """Wrapper for handle_submit that yields its results."""
+ # handle_submit is an async generator, iterate and yield
+ async for update in handle_submit(webui_manager, components_dict):
+ yield update
+
+ async def stop_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
+ """Wrapper for handle_stop."""
+ # handle_stop is async def but returns a single dict. We yield it once.
+ update_dict = await handle_stop(webui_manager)
+ yield update_dict # Yield the final dictionary
+
+ async def pause_resume_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
+ """Wrapper for handle_pause_resume."""
+ update_dict = await handle_pause_resume(webui_manager)
+ yield update_dict
+
+ async def clear_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
+ """Wrapper for handle_clear."""
+ update_dict = await handle_clear(webui_manager)
+ yield update_dict
+
+ # --- Connect Event Handlers using the Wrappers ---
+ run_button.click(
+ fn=submit_wrapper,
+ inputs=all_managed_components,
+ outputs=run_tab_outputs
+ )
+ user_input.submit(
+ fn=submit_wrapper,
+ inputs=all_managed_components,
+ outputs=run_tab_outputs
+ )
+ stop_button.click(
+ fn=stop_wrapper,
+ inputs=None,
+ outputs=run_tab_outputs
+ )
+ pause_resume_button.click(
+ fn=pause_resume_wrapper,
+ inputs=None,
+ outputs=run_tab_outputs
+ )
+ clear_button.click(
+ fn=clear_wrapper,
+ inputs=None,
+ outputs=run_tab_outputs
+ )
+
diff --git a/src/webui/components/deep_research_agent_tab.py b/src/webui/components/deep_research_agent_tab.py
index d9dfc24..5ce8dd7 100644
--- a/src/webui/components/deep_research_agent_tab.py
+++ b/src/webui/components/deep_research_agent_tab.py
@@ -38,4 +38,4 @@ def create_deep_research_agent_tab(webui_manager: WebuiManager) -> dict[str, Com
markdown_download=markdown_download,
)
)
- return tab_components
+ webui_manager.add_components("deep_research_agent", tab_components)
diff --git a/src/webui/components/load_save_config_tab.py b/src/webui/components/load_save_config_tab.py
index 91dcad7..acc0f69 100644
--- a/src/webui/components/load_save_config_tab.py
+++ b/src/webui/components/load_save_config_tab.py
@@ -34,16 +34,17 @@ def create_load_save_config_tab(webui_manager: WebuiManager) -> dict[str, Compon
config_file=config_file,
))
+ webui_manager.add_components("load_save_config", tab_components)
+
save_config_button.click(
- fn=webui_manager.save_current_config,
- inputs=[],
+ fn=webui_manager.save_config,
+ inputs=set(webui_manager.get_components()),
outputs=[config_status]
)
load_config_button.click(
fn=webui_manager.load_config,
inputs=[config_file],
- outputs=[config_status]
+ outputs=webui_manager.get_components(),
)
- return tab_components
diff --git a/src/webui/interface.py b/src/webui/interface.py
index 266b079..ba99245 100644
--- a/src/webui/interface.py
+++ b/src/webui/interface.py
@@ -32,6 +32,9 @@ def create_ui(theme_name="Ocean"):
text-align: center;
margin-bottom: 20px;
}
+ .tab-header-text {
+ text-align: center;
+ }
.theme-section {
margin-bottom: 10px;
padding: 15px;
@@ -67,18 +70,26 @@ def create_ui(theme_name="Ocean"):
with gr.Tabs() as tabs:
with gr.TabItem("⚙️ Agent Settings"):
- ui_manager.add_components("agent_settings", create_agent_settings_tab(ui_manager))
+ create_agent_settings_tab(ui_manager)
with gr.TabItem("🌐 Browser Settings"):
- ui_manager.add_components("browser_settings", create_browser_settings_tab(ui_manager))
+ create_browser_settings_tab(ui_manager)
with gr.TabItem("🤖 Run Agent"):
- ui_manager.add_components("browser_use_agent", create_browser_use_agent_tab(ui_manager))
+ create_browser_use_agent_tab(ui_manager)
- with gr.TabItem("🧐 Deep Research"):
- ui_manager.add_components("deep_research_agent", create_deep_research_agent_tab(ui_manager))
+ with gr.TabItem("🎁 Agent Collections"):
+ gr.Markdown(
+ """
+ ### Agents built on Browser-Use
+ """,
+ elem_classes=["tab-header-text"],
+ )
+ with gr.Tabs():
+ with gr.TabItem("Deep Research"):
+ create_deep_research_agent_tab(ui_manager)
with gr.TabItem("📁 Load & Save Config"):
- ui_manager.add_components("load_save_config", create_load_save_config_tab(ui_manager))
+ create_load_save_config_tab(ui_manager)
return demo
diff --git a/src/webui/webui_manager.py b/src/webui/webui_manager.py
index 033564a..5cbd31f 100644
--- a/src/webui/webui_manager.py
+++ b/src/webui/webui_manager.py
@@ -4,11 +4,17 @@ from typing import TYPE_CHECKING
import os
import gradio as gr
from datetime import datetime
+from typing import Optional, Dict, List
+import uuid
+import asyncio
from gradio.components import Component
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext
from browser_use.agent.service import Agent
+from src.browser.custom_browser import CustomBrowser
+from src.browser.custom_context import CustomBrowserContext
+from src.controller.custom_controller import CustomController
class WebuiManager:
@@ -19,9 +25,19 @@ class WebuiManager:
self.settings_save_dir = settings_save_dir
os.makedirs(self.settings_save_dir, exist_ok=True)
- self.browser: Browser = None
- self.browser_context: BrowserContext = None
- self.bu_agent: Agent = None
+ def init_browser_use_agent(self) -> None:
+ """
+ init browser use agent
+ """
+ self.bu_agent: Optional[Agent] = None
+ self.bu_browser: Optional[CustomBrowser] = None
+ self.bu_browser_context: Optional[CustomBrowserContext] = None
+ self.bu_controller: Optional[CustomController] = None
+ self.bu_chat_history: List[Dict[str, Optional[str]]] = []
+ self.bu_response_event: Optional[asyncio.Event] = None
+ self.bu_user_help_response: Optional[str] = None
+ self.bu_current_task: Optional[asyncio.Task] = None
+ self.bu_agent_task_id: Optional[str] = None
def add_components(self, tab_name: str, components_dict: dict[str, "Component"]) -> None:
"""
@@ -50,15 +66,16 @@ class WebuiManager:
"""
return self.component_to_id[comp]
- def save_current_config(self):
+ def save_config(self, components: Dict["Component", str]) -> None:
"""
- Save current config
+ Save config
"""
cur_settings = {}
- for comp_id, comp in self.id_to_component.items():
+ for comp in components:
if not isinstance(comp, gr.Button) and not isinstance(comp, gr.File) and str(
getattr(comp, "interactive", True)).lower() != "false":
- cur_settings[comp_id] = getattr(comp, "value", None)
+ comp_id = self.get_id_by_component(comp)
+ cur_settings[comp_id] = components[comp]
config_name = datetime.now().strftime("%Y%m%d-%H%M%S")
with open(os.path.join(self.settings_save_dir, f"{config_name}.json"), "w") as fw:
@@ -76,6 +93,13 @@ class WebuiManager:
update_components = {}
for comp_id, comp_val in ui_settings.items():
if comp_id in self.id_to_component:
- update_components[self.id_to_component[comp_id]].value = comp_val
+ comp = self.id_to_component[comp_id]
+ update_components[comp] = comp.__class__(value=comp_val)
- return f"Successfully loaded config from {config_path}"
+ config_status = self.id_to_component["load_save_config.config_status"]
+ update_components.update(
+ {
+ config_status: config_status.__class__(value=f"Successfully loaded config: {config_path}")
+ }
+ )
+ yield update_components
diff --git a/tests/test_agents.py b/tests/test_agents.py
index 27bb704..79e48d6 100644
--- a/tests/test_agents.py
+++ b/tests/test_agents.py
@@ -17,98 +17,18 @@ from browser_use.agent.views import AgentHistoryList
from src.utils import utils
-async def test_browser_use_org():
+async def test_browser_use_agent():
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import (
BrowserContextConfig,
BrowserContextWindowSize,
)
+ from browser_use.agent.service import Agent
- # llm = utils.get_llm_model(
- # provider="azure_openai",
- # model_name="gpt-4o",
- # temperature=0.8,
- # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
- # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
- # )
-
- # llm = utils.get_llm_model(
- # provider="deepseek",
- # model_name="deepseek-chat",
- # temperature=0.8
- # )
-
- llm = utils.get_llm_model(
- provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
- )
-
- window_w, window_h = 1920, 1080
- use_vision = False
- use_own_browser = False
- if use_own_browser:
- chrome_path = os.getenv("CHROME_PATH", None)
- if chrome_path == "":
- chrome_path = None
- else:
- chrome_path = None
-
- tool_calling_method = "json_schema" # setting to json_schema when using ollma
-
- browser = Browser(
- config=BrowserConfig(
- headless=False,
- disable_security=True,
- chrome_instance_path=chrome_path,
- extra_chromium_args=[f"--window-size={window_w},{window_h}"],
- )
- )
- async with await browser.new_context(
- config=BrowserContextConfig(
- trace_path="./tmp/traces",
- save_recording_path="./tmp/record_videos",
- no_viewport=False,
- browser_window_size=BrowserContextWindowSize(
- width=window_w, height=window_h
- ),
- )
- ) as browser_context:
- agent = Agent(
- task="go to google.com and type 'OpenAI' click search and give me the first url",
- llm=llm,
- browser_context=browser_context,
- use_vision=use_vision,
- tool_calling_method=tool_calling_method
- )
- history: AgentHistoryList = await agent.run(max_steps=10)
-
- print("Final Result:")
- pprint(history.final_result(), indent=4)
-
- print("\nErrors:")
- pprint(history.errors(), indent=4)
-
- # e.g. xPaths the model clicked on
- print("\nModel Outputs:")
- pprint(history.model_actions(), indent=4)
-
- print("\nThoughts:")
- pprint(history.model_thoughts(), indent=4)
- # close browser
- await browser.close()
-
-
-async def test_browser_use_custom():
- from browser_use.browser.context import BrowserContextWindowSize
- from browser_use.browser.browser import BrowserConfig
- from playwright.async_api import async_playwright
-
- from src.agent.custom_agent import CustomAgent
- from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
from src.browser.custom_browser import CustomBrowser
- from src.browser.custom_context import BrowserContextConfig
+ from src.browser.custom_context import CustomBrowserContextConfig
from src.controller.custom_controller import CustomController
-
- window_w, window_h = 1280, 1100
+ from src.utils import llm_provider
# llm = utils.get_llm_model(
# provider="openai",
@@ -118,14 +38,6 @@ async def test_browser_use_custom():
# api_key=os.getenv("OPENAI_API_KEY", ""),
# )
- llm = utils.get_llm_model(
- provider="azure_openai",
- model_name="gpt-4o",
- temperature=0.5,
- base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
- api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
- )
-
# llm = utils.get_llm_model(
# provider="google",
# model_name="gemini-2.0-flash",
@@ -153,13 +65,43 @@ async def test_browser_use_custom():
# provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
# )
+ window_w, window_h = 1280, 1100
+
+ llm = llm_provider.get_llm_model(
+ provider="azure_openai",
+ model_name="gpt-4o",
+ temperature=0.5,
+ base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
+ api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
+ )
+
+ mcp_server_config = {
+ "mcpServers": {
+ "markitdown": {
+ "command": "docker",
+ "args": [
+ "run",
+ "--rm",
+ "-i",
+ "markitdown-mcp:latest"
+ ]
+ },
+ "desktop-commander": {
+ "command": "npx",
+ "args": [
+ "-y",
+ "@wonderwhy-er/desktop-commander"
+ ]
+ },
+ }
+ }
controller = CustomController()
- use_own_browser = True
+ await controller.setup_mcp_client(mcp_server_config)
+ use_own_browser = False
disable_security = True
use_vision = True # Set to False when using DeepSeek
max_actions_per_step = 10
- playwright = None
browser = None
browser_context = None
@@ -178,29 +120,27 @@ async def test_browser_use_custom():
config=BrowserConfig(
headless=False,
disable_security=disable_security,
- chrome_instance_path=chrome_path,
- extra_chromium_args=extra_chromium_args,
+ browser_binary_path=chrome_path,
+ extra_browser_args=extra_chromium_args,
)
)
browser_context = await browser.new_context(
- config=BrowserContextConfig(
+ config=CustomBrowserContextConfig(
trace_path="./tmp/traces",
save_recording_path="./tmp/record_videos",
- no_viewport=False,
+ save_downloads_path="./tmp/downloads",
browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h
),
+ force_new_context=True
)
)
- agent = CustomAgent(
- task="open youtube in tab 1 , open google email in tab 2, open facebook in tab 3",
- add_infos="", # some hints for llm to complete the task
+ agent = Agent(
+ task="download pdf from https://arxiv.org/abs/2504.10458 and rename this pdf to 'GUI-r1-test.pdf'",
llm=llm,
browser=browser,
browser_context=browser_context,
controller=controller,
- system_prompt_class=CustomSystemPrompt,
- agent_prompt_class=CustomAgentMessagePrompt,
use_vision=use_vision,
max_actions_per_step=max_actions_per_step,
generate_gif=True
@@ -213,28 +153,17 @@ async def test_browser_use_custom():
print("\nErrors:")
pprint(history.errors(), indent=4)
- # e.g. xPaths the model clicked on
- print("\nModel Outputs:")
- pprint(history.model_actions(), indent=4)
-
- print("\nThoughts:")
- pprint(history.model_thoughts(), indent=4)
-
except Exception:
import traceback
-
traceback.print_exc()
finally:
- # 显式关闭持久化上下文
if browser_context:
await browser_context.close()
-
- # 关闭 Playwright 对象
- if playwright:
- await playwright.stop()
if browser:
await browser.close()
+ if controller:
+ await controller.close_mcp_client()
async def test_browser_use_parallel():
@@ -242,13 +171,20 @@ async def test_browser_use_parallel():
from browser_use.browser.browser import BrowserConfig
from playwright.async_api import async_playwright
from browser_use.browser.browser import Browser
- from src.agent.custom_agent import CustomAgent
- from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
- from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import BrowserContextConfig
from src.controller.custom_controller import CustomController
- window_w, window_h = 1920, 1080
+ from browser_use.browser.browser import Browser, BrowserConfig
+ from browser_use.browser.context import (
+ BrowserContextConfig,
+ BrowserContextWindowSize,
+ )
+ from browser_use.agent.service import Agent
+
+ from src.browser.custom_browser import CustomBrowser
+ from src.browser.custom_context import CustomBrowserContextConfig
+ from src.controller.custom_controller import CustomController
+ from src.utils import llm_provider
# llm = utils.get_llm_model(
# provider="openai",
@@ -258,20 +194,13 @@ async def test_browser_use_parallel():
# api_key=os.getenv("OPENAI_API_KEY", ""),
# )
- # llm = utils.get_llm_model(
- # provider="azure_openai",
- # model_name="gpt-4o",
- # temperature=0.8,
- # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
- # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
- # )
- llm = utils.get_llm_model(
- provider="gemini",
- model_name="gemini-2.0-flash-exp",
- temperature=1.0,
- api_key=os.getenv("GOOGLE_API_KEY", "")
- )
+ # llm = utils.get_llm_model(
+ # provider="google",
+ # model_name="gemini-2.0-flash",
+ # temperature=0.6,
+ # api_key=os.getenv("GOOGLE_API_KEY", "")
+ # )
# llm = utils.get_llm_model(
# provider="deepseek",
@@ -293,72 +222,119 @@ async def test_browser_use_parallel():
# provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
# )
+ window_w, window_h = 1280, 1100
+
+ llm = llm_provider.get_llm_model(
+ provider="azure_openai",
+ model_name="gpt-4o",
+ temperature=0.5,
+ base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
+ api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
+ )
+
+ mcp_server_config = {
+ "mcpServers": {
+ "markitdown": {
+ "command": "docker",
+ "args": [
+ "run",
+ "--rm",
+ "-i",
+ "markitdown-mcp:latest"
+ ]
+ },
+ "desktop-commander": {
+ "command": "npx",
+ "args": [
+ "-y",
+ "@wonderwhy-er/desktop-commander"
+ ]
+ },
+ # "filesystem": {
+ # "command": "npx",
+ # "args": [
+ # "-y",
+ # "@modelcontextprotocol/server-filesystem",
+ # "/Users/xxx/ai_workspace",
+ # ]
+ # },
+ }
+ }
controller = CustomController()
- use_own_browser = True
+ await controller.setup_mcp_client(mcp_server_config)
+ use_own_browser = False
disable_security = True
use_vision = True # Set to False when using DeepSeek
- max_actions_per_step = 1
- playwright = None
+ max_actions_per_step = 10
browser = None
browser_context = None
- browser = Browser(
- config=BrowserConfig(
- disable_security=True,
- headless=False,
- new_context_config=BrowserContextConfig(save_recording_path='./tmp/recordings'),
- )
- )
-
try:
+ extra_chromium_args = [f"--window-size={window_w},{window_h}"]
+ if use_own_browser:
+ chrome_path = os.getenv("CHROME_PATH", None)
+ if chrome_path == "":
+ chrome_path = None
+ chrome_user_data = os.getenv("CHROME_USER_DATA", None)
+ if chrome_user_data:
+ extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
+ else:
+ chrome_path = None
+ browser = CustomBrowser(
+ config=BrowserConfig(
+ headless=False,
+ disable_security=disable_security,
+ browser_binary_path=chrome_path,
+ extra_browser_args=extra_chromium_args,
+ )
+ )
+ browser_context = await browser.new_context(
+ config=CustomBrowserContextConfig(
+ trace_path="./tmp/traces",
+ save_recording_path="./tmp/record_videos",
+ save_downloads_path="./tmp/downloads",
+ browser_window_size=BrowserContextWindowSize(
+ width=window_w, height=window_h
+ ),
+ force_new_context=True
+ )
+ )
agents = [
- Agent(task=task, llm=llm, browser=browser)
+ Agent(task=task, llm=llm, browser=browser, controller=controller)
for task in [
'Search Google for weather in Tokyo',
- 'Check Reddit front page title',
- 'Find NASA image of the day',
- 'Check top story on CNN',
+ # 'Check Reddit front page title',
+ # 'Find NASA image of the day',
+ # 'Check top story on CNN',
# 'Search latest SpaceX launch date',
# 'Look up population of Paris',
- # 'Find current time in Sydney',
- # 'Check who won last Super Bowl',
+ 'Find current time in Sydney',
+ 'Check who won last Super Bowl',
# 'Search trending topics on Twitter',
]
]
history = await asyncio.gather(*[agent.run() for agent in agents])
- pdb.set_trace()
print("Final Result:")
pprint(history.final_result(), indent=4)
print("\nErrors:")
pprint(history.errors(), indent=4)
- # e.g. xPaths the model clicked on
- print("\nModel Outputs:")
- pprint(history.model_actions(), indent=4)
+ pdb.set_trace()
- print("\nThoughts:")
- pprint(history.model_thoughts(), indent=4)
- # close browser
except Exception:
import traceback
traceback.print_exc()
finally:
- # 显式关闭持久化上下文
if browser_context:
await browser_context.close()
-
- # 关闭 Playwright 对象
- if playwright:
- await playwright.stop()
if browser:
await browser.close()
if __name__ == "__main__":
- asyncio.run(test_browser_use_org())
- # asyncio.run(test_browser_use_parallel())
- # asyncio.run(test_browser_use_custom())
+ # asyncio.run(test_browser_use_agent())
+ asyncio.run(test_browser_use_parallel())
diff --git a/tests/test_controller.py b/tests/test_controller.py
index 6a10ebc..1e1608e 100644
--- a/tests/test_controller.py
+++ b/tests/test_controller.py
@@ -45,33 +45,37 @@ async def test_controller_with_mcp():
from src.controller.custom_controller import CustomController
from browser_use.controller.registry.views import ActionModel
- test_server_config = {
- "playwright": {
- "command": "npx",
- "args": [
- "@playwright/mcp@latest",
- ],
- "transport": "stdio",
- },
- "filesystem": {
- "command": "npx",
- "args": [
- "-y",
- "@modelcontextprotocol/server-filesystem",
- "/Users/xxx/ai_workspace",
- ]
- },
- "desktop-commander": {
- "command": "npx",
- "args": [
- "-y",
- "@wonderwhy-er/desktop-commander"
- ]
+ mcp_server_config = {
+ "mcpServers": {
+ "markitdown": {
+ "command": "docker",
+ "args": [
+ "run",
+ "--rm",
+ "-i",
+ "markitdown-mcp:latest"
+ ]
+ },
+ "desktop-commander": {
+ "command": "npx",
+ "args": [
+ "-y",
+ "@wonderwhy-er/desktop-commander"
+ ]
+ },
+ # "filesystem": {
+ # "command": "npx",
+ # "args": [
+ # "-y",
+ # "@modelcontextprotocol/server-filesystem",
+ # "/Users/xxx/ai_workspace",
+ # ]
+ # },
}
}
controller = CustomController()
- await controller.setup_mcp_client(test_server_config)
+ await controller.setup_mcp_client(mcp_server_config)
action_name = "mcp.desktop-commander.execute_command"
action_info = controller.registry.registry.actions[action_name]
param_model = action_info.param_model
@@ -85,7 +89,8 @@ async def test_controller_with_mcp():
result = await controller.act(action_model)
result = result.extracted_content
print(result)
- if result and "Command is still running. Use read_output to get more output." in result and "PID" in result.split("\n")[0]:
+ if result and "Command is still running. Use read_output to get more output." in result and "PID" in \
+ result.split("\n")[0]:
pid = int(result.split("\n")[0].split("PID")[-1].strip())
action_name = "mcp.desktop-commander.read_output"
action_info = controller.registry.registry.actions[action_name]
diff --git a/tests/test_llm_api.py b/tests/test_llm_api.py
index bee1e6b..c0e9e16 100644
--- a/tests/test_llm_api.py
+++ b/tests/test_llm_api.py
@@ -144,10 +144,10 @@ def test_ibm_model():
if __name__ == "__main__":
# test_openai_model()
# test_google_model()
- # test_azure_openai_model()
+ test_azure_openai_model()
# test_deepseek_model()
# test_ollama_model()
# test_deepseek_r1_model()
# test_deepseek_r1_ollama_model()
# test_mistral_model()
- test_ibm_model()
+ # test_ibm_model()
diff --git a/webui.py b/webui.py
index 3066ecb..34e93ab 100644
--- a/webui.py
+++ b/webui.py
@@ -1,3 +1,5 @@
+from dotenv import load_dotenv
+load_dotenv()
import argparse
from src.webui.interface import theme_map, create_ui
diff --git a/webui2.py b/webui2.py
index 33d7ece..98a23b4 100644
--- a/webui2.py
+++ b/webui2.py
@@ -42,77 +42,6 @@ _global_browser = None
_global_browser_context = None
_global_agent = None
-# Create the global agent state instance
-_global_agent_state = AgentState()
-
-# webui config
-webui_config_manager = utils.ConfigManager()
-
-
-def scan_and_register_components(blocks):
- """扫描一个 Blocks 对象并注册其中的所有交互式组件,但不包括按钮"""
- global webui_config_manager
-
- def traverse_blocks(block, prefix=""):
- registered = 0
-
- # 处理 Blocks 自身的组件
- if hasattr(block, "children"):
- for i, child in enumerate(block.children):
- if isinstance(child, gr.components.Component):
- # 排除按钮 (Button) 组件
- if getattr(child, "interactive", False) and not isinstance(child, gr.Button):
- name = f"{prefix}component_{i}"
- if hasattr(child, "label") and child.label:
- # 使用标签作为名称的一部分
- label = child.label
- name = f"{prefix}{label}"
- logger.debug(f"Registering component: {name}")
- webui_config_manager.register_component(name, child)
- registered += 1
- elif hasattr(child, "children"):
- # 递归处理嵌套的 Blocks
- new_prefix = f"{prefix}block_{i}_"
- registered += traverse_blocks(child, new_prefix)
-
- return registered
-
- total = traverse_blocks(blocks)
- logger.info(f"Total registered components: {total}")
-
-
-def save_current_config():
- return webui_config_manager.save_current_config()
-
-
-def update_ui_from_config(config_file):
- return webui_config_manager.update_ui_from_config(config_file)
-
-
-def resolve_sensitive_env_variables(text):
- """
- Replace environment variable placeholders ($SENSITIVE_*) with their values.
- Only replaces variables that start with SENSITIVE_.
- """
- if not text:
- return text
-
- import re
-
- # Find all $SENSITIVE_* patterns
- env_vars = re.findall(r'\$SENSITIVE_[A-Za-z0-9_]*', text)
-
- result = text
- for var in env_vars:
- # Remove the $ prefix to get the actual environment variable name
- env_name = var[1:] # removes the $
- env_value = os.getenv(env_name)
- if env_value is not None:
- # Replace $SENSITIVE_VAR_NAME with its value
- result = result.replace(var, env_value)
-
- return result
-
async def stop_agent():
"""Request the agent to stop and update UI with enhanced feedback"""
@@ -140,32 +69,6 @@ async def stop_agent():
)
-async def stop_research_agent():
- """Request the agent to stop and update UI with enhanced feedback"""
- global _global_agent_state
-
- try:
- # Request stop
- _global_agent_state.request_stop()
-
- # Update UI immediately
- message = "Stop requested - the agent will halt at the next safe point"
- logger.info(f"🛑 {message}")
-
- # Return UI updates
- return ( # errors_output
- gr.update(value="Stopping...", interactive=False), # stop_button
- gr.update(interactive=False), # run_button
- )
- except Exception as e:
- error_msg = f"Error during stop: {str(e)}"
- logger.error(error_msg)
- return (
- gr.update(value="Stop", interactive=True),
- gr.update(interactive=True)
- )
-
-
async def run_browser_agent(
agent_type,
llm_provider,
@@ -202,16 +105,6 @@ async def run_browser_agent(
if save_recording_path:
os.makedirs(save_recording_path, exist_ok=True)
- # Get the list of existing videos before the agent runs
- existing_videos = set()
- if save_recording_path:
- existing_videos = set(
- glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4"))
- + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]"))
- )
-
- task = resolve_sensitive_env_variables(task)
-
# Run the agent
llm = utils.get_llm_model(
provider=llm_provider,