From 4c87694cef50ba504a97c10d4ecaa135a1e57a34 Mon Sep 17 00:00:00 2001 From: vincent Date: Mon, 28 Apr 2025 22:11:56 +0800 Subject: [PATCH] add browser-use agent run --- .../deep_research_agent.py | 0 src/browser/custom_browser.py | 73 +- src/browser/custom_context.py | 98 +- src/controller/custom_controller.py | 44 +- src/utils/mcp_client.py | 6 + src/utils/utils.py | 124 --- src/webui/components/agent_settings_tab.py | 18 +- src/webui/components/browser_settings_tab.py | 8 +- src/webui/components/browser_use_agent_tab.py | 947 +++++++++++++++++- .../components/deep_research_agent_tab.py | 2 +- src/webui/components/load_save_config_tab.py | 9 +- src/webui/interface.py | 23 +- src/webui/webui_manager.py | 42 +- tests/test_agents.py | 306 +++--- tests/test_controller.py | 53 +- tests/test_llm_api.py | 4 +- webui.py | 2 + webui2.py | 107 -- 18 files changed, 1343 insertions(+), 523 deletions(-) rename src/agent/{ => deep_research}/deep_research_agent.py (100%) diff --git a/src/agent/deep_research_agent.py b/src/agent/deep_research/deep_research_agent.py similarity index 100% rename from src/agent/deep_research_agent.py rename to src/agent/deep_research/deep_research_agent.py diff --git a/src/browser/custom_browser.py b/src/browser/custom_browser.py index 4a2d1ab..a1c057b 100644 --- a/src/browser/custom_browser.py +++ b/src/browser/custom_browser.py @@ -9,11 +9,23 @@ from playwright.async_api import ( Playwright, async_playwright, ) -from browser_use.browser.browser import Browser +from browser_use.browser.browser import Browser, IN_DOCKER from browser_use.browser.context import BrowserContext, BrowserContextConfig from playwright.async_api import BrowserContext as PlaywrightBrowserContext import logging +from browser_use.browser.chrome import ( + CHROME_ARGS, + CHROME_DETERMINISTIC_RENDERING_ARGS, + CHROME_DISABLE_SECURITY_ARGS, + CHROME_DOCKER_ARGS, + CHROME_HEADLESS_ARGS, +) +from browser_use.browser.context import BrowserContext, BrowserContextConfig +from browser_use.browser.utils.screen_resolution import get_screen_resolution, get_window_adjustments +from browser_use.utils import time_execution_async +import socket + from .custom_context import CustomBrowserContext logger = logging.getLogger(__name__) @@ -26,3 +38,62 @@ class CustomBrowser(Browser): config: BrowserContextConfig = BrowserContextConfig() ) -> CustomBrowserContext: return CustomBrowserContext(config=config, browser=self) + + async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrowser: + """Sets up and returns a Playwright Browser instance with anti-detection measures.""" + assert self.config.browser_binary_path is None, 'browser_binary_path should be None if trying to use the builtin browsers' + + if self.config.headless: + screen_size = {'width': 1920, 'height': 1080} + offset_x, offset_y = 0, 0 + else: + screen_size = get_screen_resolution() + offset_x, offset_y = get_window_adjustments() + + chrome_args = { + *CHROME_ARGS, + *(CHROME_DOCKER_ARGS if IN_DOCKER else []), + *(CHROME_HEADLESS_ARGS if self.config.headless else []), + *(CHROME_DISABLE_SECURITY_ARGS if self.config.disable_security else []), + *(CHROME_DETERMINISTIC_RENDERING_ARGS if self.config.deterministic_rendering else []), + f'--window-position={offset_x},{offset_y}', + *self.config.extra_browser_args, + } + contain_window_size = False + for arg in self.config.extra_browser_args: + if "--window-size" in arg: + contain_window_size = True + break + if not contain_window_size: + chrome_args.add(f'--window-size={screen_size["width"]},{screen_size["height"]}') + + # check if port 9222 is already taken, if so remove the remote-debugging-port arg to prevent conflicts + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + if s.connect_ex(('localhost', 9222)) == 0: + chrome_args.remove('--remote-debugging-port=9222') + + browser_class = getattr(playwright, self.config.browser_class) + args = { + 'chromium': list(chrome_args), + 'firefox': [ + *{ + '-no-remote', + *self.config.extra_browser_args, + } + ], + 'webkit': [ + *{ + '--no-startup-window', + *self.config.extra_browser_args, + } + ], + } + + browser = await browser_class.launch( + headless=self.config.headless, + args=args[self.config.browser_class], + proxy=self.config.proxy.model_dump() if self.config.proxy else None, + handle_sigterm=False, + handle_sigint=False, + ) + return browser diff --git a/src/browser/custom_context.py b/src/browser/custom_context.py index fd0e2e5..4dc2423 100644 --- a/src/browser/custom_context.py +++ b/src/browser/custom_context.py @@ -2,7 +2,7 @@ import json import logging import os -from browser_use.browser.browser import Browser +from browser_use.browser.browser import Browser, IN_DOCKER from browser_use.browser.context import BrowserContext, BrowserContextConfig from playwright.async_api import Browser as PlaywrightBrowser from playwright.async_api import BrowserContext as PlaywrightBrowserContext @@ -10,10 +10,104 @@ from playwright.async_api import BrowserContext as PlaywrightBrowserContext logger = logging.getLogger(__name__) +class CustomBrowserContextConfig(BrowserContextConfig): + force_new_context: bool = False # force to create new context + + class CustomBrowserContext(BrowserContext): def __init__( self, browser: "Browser", - config: BrowserContextConfig = BrowserContextConfig() + config: CustomBrowserContextConfig = CustomBrowserContextConfig(), ): super(CustomBrowserContext, self).__init__(browser=browser, config=config) + + async def _create_context(self, browser: PlaywrightBrowser): + """Creates a new browser context with anti-detection measures and loads cookies if available.""" + if not self.config.force_new_context and self.browser.config.cdp_url and len(browser.contexts) > 0: + context = browser.contexts[0] + elif not self.config.force_new_context and self.browser.config.browser_binary_path and len( + browser.contexts) > 0: + # Connect to existing Chrome instance instead of creating new one + context = browser.contexts[0] + else: + # Original code for creating new context + context = await browser.new_context( + no_viewport=True, + user_agent=self.config.user_agent, + java_script_enabled=True, + bypass_csp=self.config.disable_security, + ignore_https_errors=self.config.disable_security, + record_video_dir=self.config.save_recording_path, + record_video_size=self.config.browser_window_size.model_dump(), + record_har_path=self.config.save_har_path, + locale=self.config.locale, + http_credentials=self.config.http_credentials, + is_mobile=self.config.is_mobile, + has_touch=self.config.has_touch, + geolocation=self.config.geolocation, + permissions=self.config.permissions, + timezone_id=self.config.timezone_id, + ) + + if self.config.trace_path: + await context.tracing.start(screenshots=True, snapshots=True, sources=True) + + # Load cookies if they exist + if self.config.cookies_file and os.path.exists(self.config.cookies_file): + with open(self.config.cookies_file, 'r') as f: + try: + cookies = json.load(f) + + valid_same_site_values = ['Strict', 'Lax', 'None'] + for cookie in cookies: + if 'sameSite' in cookie: + if cookie['sameSite'] not in valid_same_site_values: + logger.warning( + f"Fixed invalid sameSite value '{cookie['sameSite']}' to 'None' for cookie {cookie.get('name')}" + ) + cookie['sameSite'] = 'None' + logger.info(f'🍪 Loaded {len(cookies)} cookies from {self.config.cookies_file}') + await context.add_cookies(cookies) + + except json.JSONDecodeError as e: + logger.error(f'Failed to parse cookies file: {str(e)}') + + # Expose anti-detection scripts + await context.add_init_script( + """ + // Webdriver property + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + + // Languages + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US'] + }); + + // Plugins + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5] + }); + + // Chrome runtime + window.chrome = { runtime: {} }; + + // Permissions + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? + Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters) + ); + (function () { + const originalAttachShadow = Element.prototype.attachShadow; + Element.prototype.attachShadow = function attachShadow(options) { + return originalAttachShadow.call(this, { ...options, mode: "open" }); + }; + })(); + """ + ) + + return context diff --git a/src/controller/custom_controller.py b/src/controller/custom_controller.py index 7209e97..d07c88b 100644 --- a/src/controller/custom_controller.py +++ b/src/controller/custom_controller.py @@ -48,28 +48,6 @@ class CustomController(Controller): self.mcp_client = None self.mcp_server_config = None - async def setup_mcp_client(self, mcp_server_config: Optional[Dict[str, Any]] = None): - self.mcp_server_config = mcp_server_config - if self.mcp_server_config: - self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config) - self.register_mcp_tools() - - def register_mcp_tools(self): - """ - Register the MCP tools used by this controller. - """ - if self.mcp_client: - for server_name in self.mcp_client.server_name_to_tools: - for tool in self.mcp_client.server_name_to_tools[server_name]: - tool_name = f"mcp.{server_name}.{tool.name}" - self.registry.registry.actions[tool_name] = RegisteredAction( - name=tool_name, - description=tool.description, - function=tool, - param_model=create_tool_param_model(tool), - ) - logger.info(f"Add mcp tool: {tool_name}") - def _register_custom_actions(self): """Register all custom browser actions""" @@ -173,6 +151,28 @@ class CustomController(Controller): except Exception as e: raise e + async def setup_mcp_client(self, mcp_server_config: Optional[Dict[str, Any]] = None): + self.mcp_server_config = mcp_server_config + if self.mcp_server_config: + self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config) + self.register_mcp_tools() + + def register_mcp_tools(self): + """ + Register the MCP tools used by this controller. + """ + if self.mcp_client: + for server_name in self.mcp_client.server_name_to_tools: + for tool in self.mcp_client.server_name_to_tools[server_name]: + tool_name = f"mcp.{server_name}.{tool.name}" + self.registry.registry.actions[tool_name] = RegisteredAction( + name=tool_name, + description=tool.description, + function=tool, + param_model=create_tool_param_model(tool), + ) + logger.info(f"Add mcp tool: {tool_name}") + async def close_mcp_client(self): if self.mcp_client: await self.mcp_client.__aexit__(None, None, None) diff --git a/src/utils/mcp_client.py b/src/utils/mcp_client.py index a5d6fcd..b909d0d 100644 --- a/src/utils/mcp_client.py +++ b/src/utils/mcp_client.py @@ -40,7 +40,13 @@ async def setup_mcp_client_and_tools(mcp_server_config: Dict[str, Any]) -> Optio logger.info("Initializing MultiServerMCPClient...") + if not mcp_server_config: + logger.error("No MCP server configuration provided.") + return None + try: + if "mcpServers" in mcp_server_config: + mcp_server_config = mcp_server_config["mcpServers"] client = MultiServerMCPClient(mcp_server_config) await client.__aenter__() return client diff --git a/src/utils/utils.py b/src/utils/utils.py index 8703c46..f0f0b76 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -9,25 +9,6 @@ import gradio as gr import uuid -# Callback to update the model name dropdown based on the selected provider -def update_model_dropdown(llm_provider, api_key=None, base_url=None): - """ - Update the model name dropdown with predefined models for the selected provider. - """ - import gradio as gr - # Use API keys from .env if not provided - if not api_key: - api_key = os.getenv(f"{llm_provider.upper()}_API_KEY", "") - if not base_url: - base_url = os.getenv(f"{llm_provider.upper()}_BASE_URL", "") - - # Use predefined models for the selected provider - if llm_provider in model_names: - return gr.Dropdown(choices=model_names[llm_provider], value=model_names[llm_provider][0], interactive=True) - else: - return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True) - - def encode_image(img_path): if not img_path: return None @@ -56,108 +37,3 @@ def get_latest_files(directory: str, file_types: list = ['.webm', '.zip']) -> Di print(f"Error getting latest {file_type} file: {e}") return latest_files - - -async def capture_screenshot(browser_context): - """Capture and encode a screenshot""" - # Extract the Playwright browser instance - playwright_browser = browser_context.browser.playwright_browser # Ensure this is correct. - - # Check if the browser instance is valid and if an existing context can be reused - if playwright_browser and playwright_browser.contexts: - playwright_context = playwright_browser.contexts[0] - else: - return None - - # Access pages in the context - pages = None - if playwright_context: - pages = playwright_context.pages - - # Use an existing page or create a new one if none exist - if pages: - active_page = pages[0] - for page in pages: - if page.url != "about:blank": - active_page = page - else: - return None - - # Take screenshot - try: - screenshot = await active_page.screenshot( - type='jpeg', - quality=75, - scale="css" - ) - encoded = base64.b64encode(screenshot).decode('utf-8') - return encoded - except Exception as e: - return None - - -class ConfigManager: - def __init__(self): - self.components = {} - self.component_order = [] - - def register_component(self, name: str, component): - """Register a gradio component for config management.""" - self.components[name] = component - if name not in self.component_order: - self.component_order.append(name) - return component - - def save_current_config(self): - """Save the current configuration of all registered components.""" - current_config = {} - for name in self.component_order: - component = self.components[name] - # Get the current value from the component - current_config[name] = getattr(component, "value", None) - - return save_config_to_file(current_config) - - def update_ui_from_config(self, config_file): - """Update UI components from a loaded configuration file.""" - if config_file is None: - return [gr.update() for _ in self.component_order] + ["No file selected."] - - loaded_config = load_config_from_file(config_file.name) - - if not isinstance(loaded_config, dict): - return [gr.update() for _ in self.component_order] + ["Error: Invalid configuration file."] - - # Prepare updates for all components - updates = [] - for name in self.component_order: - if name in loaded_config: - updates.append(gr.update(value=loaded_config[name])) - else: - updates.append(gr.update()) - - updates.append("Configuration loaded successfully.") - return updates - - def get_all_components(self): - """Return all registered components in the order they were registered.""" - return [self.components[name] for name in self.component_order] - - -def load_config_from_file(config_file): - """Load settings from a config file (JSON format).""" - try: - with open(config_file, 'r') as f: - settings = json.load(f) - return settings - except Exception as e: - return f"Error loading configuration: {str(e)}" - - -def save_config_to_file(settings, save_dir="./tmp/webui_settings"): - """Save the current settings to a UUID.json file with a UUID name.""" - os.makedirs(save_dir, exist_ok=True) - config_file = os.path.join(save_dir, f"{uuid.uuid4()}.json") - with open(config_file, 'w') as f: - json.dump(settings, f, indent=2) - return f"Configuration saved to {config_file}" diff --git a/src/webui/components/agent_settings_tab.py b/src/webui/components/agent_settings_tab.py index a2479b3..85e7c0e 100644 --- a/src/webui/components/agent_settings_tab.py +++ b/src/webui/components/agent_settings_tab.py @@ -50,7 +50,7 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen extend_system_prompt = gr.Textbox(label="Extend system prompt", lines=4, interactive=True) with gr.Group(): - mcp_json_file = gr.File(label="MCP server file", interactive=True, file_types=[".json"]) + mcp_json_file = gr.File(label="MCP server json", interactive=True, file_types=[".json"]) mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False) with gr.Group(): @@ -118,6 +118,7 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen choices=[provider for provider, model in config.model_names.items()], label="Planner LLM Provider", info="Select LLM provider for LLM", + value=None, interactive=True ) planner_llm_model_name = gr.Dropdown( @@ -201,7 +202,6 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen interactive=True, allow_custom_value=True, choices=["auto", "json_schema", "function_calling", "None"], - info="Tool Calls Function Name", visible=True ) tab_components.update(dict( @@ -228,6 +228,8 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen mcp_json_file=mcp_json_file, mcp_server_config=mcp_server_config, )) + webui_manager.add_components("agent_settings", tab_components) + llm_provider.change( fn=lambda x: gr.update(visible=x == "ollama"), inputs=llm_provider, @@ -236,23 +238,21 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen llm_provider.change( lambda provider: update_model_dropdown(provider), inputs=[llm_provider], - outputs=llm_model_name + outputs=[llm_model_name] ) planner_llm_provider.change( fn=lambda x: gr.update(visible=x == "ollama"), - inputs=planner_llm_provider, - outputs=planner_ollama_num_ctx + inputs=[planner_llm_provider], + outputs=[planner_ollama_num_ctx] ) planner_llm_provider.change( lambda provider: update_model_dropdown(provider), inputs=[planner_llm_provider], - outputs=planner_llm_model_name + outputs=[planner_llm_model_name] ) mcp_json_file.change( update_mcp_server, - inputs=mcp_json_file, + inputs=[mcp_json_file], outputs=[mcp_server_config, mcp_server_config] ) - - return tab_components diff --git a/src/webui/components/browser_settings_tab.py b/src/webui/components/browser_settings_tab.py index c2b3e56..0d3bcbb 100644 --- a/src/webui/components/browser_settings_tab.py +++ b/src/webui/components/browser_settings_tab.py @@ -35,7 +35,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager) -> dict[str, Compon ) keep_browser_open = gr.Checkbox( label="Keep Browser Open", - value=False, + value=True, info="Keep Browser Open between Tasks", interactive=True ) @@ -119,7 +119,9 @@ def create_browser_settings_tab(webui_manager: WebuiManager) -> dict[str, Compon save_agent_history_path=save_agent_history_path, save_download_path=save_download_path, cdp_url=cdp_url, - wss_url=wss_url + wss_url=wss_url, + window_h=window_h, + window_w=window_w, ) ) - return tab_components + webui_manager.add_components("browser_settings", tab_components) diff --git a/src/webui/components/browser_use_agent_tab.py b/src/webui/components/browser_use_agent_tab.py index 8f842af..8a122b9 100644 --- a/src/webui/components/browser_use_agent_tab.py +++ b/src/webui/components/browser_use_agent_tab.py @@ -1,62 +1,921 @@ import gradio as gr from gradio.components import Component +import asyncio +import os +import json +import uuid +import logging +from datetime import datetime +from typing import List, Dict, Optional, Any, Set, Generator, AsyncGenerator, Union +from collections.abc import Awaitable +from langchain_core.language_models.chat_models import BaseChatModel +import base64 +from browser_use.browser.browser import Browser, BrowserConfig +from browser_use.browser.context import BrowserContext, BrowserContextConfig, BrowserContextWindowSize +from browser_use.agent.service import Agent +from browser_use.agent.views import AgentHistoryList +from browser_use.agent.views import ToolCallingMethod # Adjust import +from browser_use.agent.views import ( + REQUIRED_LLM_API_ENV_VARS, + ActionResult, + AgentError, + AgentHistory, + AgentHistoryList, + AgentOutput, + AgentSettings, + AgentState, + AgentStepInfo, + StepMetadata, + ToolCallingMethod, +) +from browser_use.browser.browser import Browser +from browser_use.browser.context import BrowserContext +from browser_use.browser.views import BrowserState, BrowserStateHistory from src.webui.webui_manager import WebuiManager -from src.utils import config +from src.controller.custom_controller import CustomController +from src.utils import llm_provider +from src.browser.custom_browser import CustomBrowser +from src.browser.custom_context import CustomBrowserContext, CustomBrowserContextConfig + +logger = logging.getLogger(__name__) -def create_browser_use_agent_tab(webui_manager: WebuiManager) -> dict[str, Component]: +# --- Helper Functions --- (Defined at module level) + +async def _initialize_llm(provider: Optional[str], model_name: Optional[str], temperature: float, + base_url: Optional[str], api_key: Optional[str], num_ctx: Optional[int] = None) -> Optional[ + BaseChatModel]: + """Initializes the LLM based on settings. Returns None if provider/model is missing.""" + if not provider or not model_name: + logger.info("LLM Provider or Model Name not specified, LLM will be None.") + return None + try: + # Use your actual LLM provider logic here + logger.info(f"Initializing LLM: Provider={provider}, Model={model_name}, Temp={temperature}") + # Example using a placeholder function + llm = llm_provider.get_llm_model( + provider=provider, + model_name=model_name, + temperature=temperature, + base_url=base_url or None, + api_key=api_key or None, + # Add other relevant params like num_ctx for ollama + num_ctx=num_ctx if provider == "ollama" else None + ) + return llm + except Exception as e: + logger.error(f"Failed to initialize LLM: {e}", exc_info=True) + gr.Warning( + f"Failed to initialize LLM '{model_name}' for provider '{provider}'. Please check settings. Error: {e}") + return None + + +def _get_config_value(webui_manager: WebuiManager, comp_dict: Dict[gr.components.Component, Any], comp_id_suffix: str, + default: Any = None) -> Any: + """Safely get value from component dictionary using its ID suffix relative to the tab.""" + # Assumes component ID format is "tab_name.comp_name" + tab_name = "browser_use_agent" # Hardcode or derive if needed + comp_id = f"{tab_name}.{comp_id_suffix}" + # Need to find the component object first using the ID from the manager + try: + comp = webui_manager.get_component_by_id(comp_id) + return comp_dict.get(comp, default) + except KeyError: + # Try accessing settings tabs as well + for prefix in ["agent_settings", "browser_settings"]: + try: + comp_id = f"{prefix}.{comp_id_suffix}" + comp = webui_manager.get_component_by_id(comp_id) + return comp_dict.get(comp, default) + except KeyError: + continue + logger.warning(f"Component with suffix '{comp_id_suffix}' not found in manager for value lookup.") + return default + + +def _format_agent_output(model_output: AgentOutput) -> str: + """Formats AgentOutput for display in the chatbot using JSON.""" + content = "" + if model_output: + try: + # Directly use model_dump if actions and current_state are Pydantic models + action_dump = [action.model_dump(exclude_none=True) for action in model_output.action] + + state_dump = model_output.current_state.model_dump(exclude_none=True) + model_output_dump = { + 'current_state': state_dump, + 'action': action_dump, + } + # Dump to JSON string with indentation + json_string = json.dumps(model_output_dump, indent=4, ensure_ascii=False) + # Wrap in
 for proper display in HTML
+            content = f"
{json_string}
" + + except AttributeError as ae: + logger.error( + f"AttributeError during model dump: {ae}. Check if 'action' or 'current_state' or their items support 'model_dump'.") + content = f"
Error: Could not format agent output (AttributeError: {ae}).\nRaw output: {str(model_output)}
" + except Exception as e: + logger.error(f"Error formatting agent output: {e}", exc_info=True) + # Fallback to simple string representation on error + content = f"
Error formatting agent output.\nRaw output:\n{str(model_output)}
" + + return content.strip() + + +# --- Updated Callback Implementation --- + +async def _handle_new_step(webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int): + """Callback for each step taken by the agent, including screenshot display.""" + + # Use the correct chat history attribute name from the user's code + if not hasattr(webui_manager, 'bu_chat_history'): + logger.error("Attribute 'bu_chat_history' not found in webui_manager! Cannot add chat message.") + # Initialize it maybe? Or raise an error? For now, log and potentially skip chat update. + webui_manager.bu_chat_history = [] # Initialize if missing (consider if this is the right place) + # return # Or stop if this is critical + step_num -= 1 + logger.info(f"Step {step_num} completed.") + + # --- Screenshot Handling --- + screenshot_html = "" + # Ensure state.screenshot exists and is not empty before proceeding + # Use getattr for safer access + screenshot_data = getattr(state, 'screenshot', None) + if screenshot_data: + try: + # Basic validation: check if it looks like base64 + if isinstance(screenshot_data, str) and len(screenshot_data) > 100: # Arbitrary length check + # *** UPDATED STYLE: Removed centering, adjusted width *** + img_tag = f'Step {step_num} Screenshot' + screenshot_html = img_tag + "
" # Use
for line break after inline-block image + else: + logger.warning( + f"Screenshot for step {step_num} seems invalid (type: {type(screenshot_data)}, len: {len(screenshot_data) if isinstance(screenshot_data, str) else 'N/A'}).") + screenshot_html = "**[Invalid screenshot data]**
" + + except Exception as e: + logger.error(f"Error processing or formatting screenshot for step {step_num}: {e}", exc_info=True) + screenshot_html = "**[Error displaying screenshot]**
" + else: + logger.debug(f"No screenshot available for step {step_num}.") + + # --- Format Agent Output --- + formatted_output = _format_agent_output(output) # Use the updated function + + # --- Combine and Append to Chat --- + step_header = f"--- **Step {step_num}** ---" + # Combine header, image (with line break), and JSON block + final_content = step_header + "
" + screenshot_html + formatted_output + + chat_message = { + "role": "assistant", + "content": final_content.strip() # Remove leading/trailing whitespace + } + + # Append to the correct chat history list + webui_manager.bu_chat_history.append(chat_message) + + await asyncio.sleep(0.05) + + +def _handle_done(webui_manager: WebuiManager, history: AgentHistoryList): + """Callback when the agent finishes the task (success or failure).""" + logger.info( + f"Agent task finished. Duration: {history.total_duration_seconds():.2f}s, Tokens: {history.total_input_tokens()}") + final_summary = f"**Task Completed**\n" + final_summary += f"- Duration: {history.total_duration_seconds():.2f} seconds\n" + final_summary += f"- Total Input Tokens: {history.total_input_tokens()}\n" # Or total tokens if available + + final_result = history.final_result() + if final_result: + final_summary += f"- Final Result: {final_result}\n" + + errors = history.errors() + if errors and any(errors): + final_summary += f"- **Errors:**\n```\n{errors}\n```\n" + else: + final_summary += "- Status: Success\n" + + webui_manager.bu_chat_history.append({"role": "assistant", "content": final_summary}) + + +async def _ask_assistant_callback(webui_manager: WebuiManager, query: str, browser_context: BrowserContext) -> Dict[ + str, Any]: + """Callback triggered by the agent's ask_for_assistant action.""" + logger.info("Agent requires assistance. Waiting for user input.") + + if not hasattr(webui_manager, '_chat_history'): + logger.error("Chat history not found in webui_manager during ask_assistant!") + return {"response": "Internal Error: Cannot display help request."} + + webui_manager.bu_chat_history.append({"role": "assistant", + "content": f"**Need Help:** {query}\nPlease provide information or perform the required action in the browser, then type your response/confirmation below and click 'Submit Response'."}) + + # Use state stored in webui_manager + webui_manager.bu_response_event = asyncio.Event() + webui_manager.bu_user_help_response = None # Reset previous response + + try: + logger.info("Waiting for user response event...") + await asyncio.wait_for(webui_manager.bu_response_event.wait(), timeout=3600.0) # Long timeout + logger.info("User response event received.") + except asyncio.TimeoutError: + logger.warning("Timeout waiting for user assistance.") + webui_manager.bu_chat_history.append( + {"role": "assistant", "content": "**Timeout:** No response received. Trying to proceed."}) + webui_manager.bu_response_event = None # Clear the event + return {"response": "Timeout: User did not respond."} # Inform the agent + + response = webui_manager.bu_user_help_response + webui_manager.bu_chat_history.append({"role": "user", "content": response}) # Show user response in chat + webui_manager.bu_response_event = None # Clear the event for the next potential request + return {"response": response} + + +async def capture_screenshot(browser_context): + """Capture and encode a screenshot""" + # Extract the Playwright browser instance + playwright_browser = browser_context.browser.playwright_browser # Ensure this is correct. + + # Check if the browser instance is valid and if an existing context can be reused + if playwright_browser and playwright_browser.contexts: + playwright_context = playwright_browser.contexts[0] + else: + return None + + # Access pages in the context + pages = None + if playwright_context: + pages = playwright_context.pages + + # Use an existing page or create a new one if none exist + if pages: + active_page = pages[0] + for page in pages: + if page.url != "about:blank": + active_page = page + else: + return None + + # Take screenshot + try: + screenshot = await active_page.screenshot( + type='jpeg', + quality=75, + scale="css" + ) + encoded = base64.b64encode(screenshot).decode('utf-8') + return encoded + except Exception as e: + return None + + +# --- Core Agent Execution Logic --- (Needs access to webui_manager) + +async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]) -> AsyncGenerator[ + Dict[gr.components.Component, Any], None]: + """Handles the entire lifecycle of initializing and running the agent.""" + + # --- Get Components --- + # Need handles to specific UI components to update them + user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input") + run_button_comp = webui_manager.get_component_by_id("browser_use_agent.run_button") + stop_button_comp = webui_manager.get_component_by_id("browser_use_agent.stop_button") + pause_resume_button_comp = webui_manager.get_component_by_id("browser_use_agent.pause_resume_button") + clear_button_comp = webui_manager.get_component_by_id("browser_use_agent.clear_button") + chatbot_comp = webui_manager.get_component_by_id("browser_use_agent.chatbot") + history_file_comp = webui_manager.get_component_by_id("browser_use_agent.agent_history_file") + gif_comp = webui_manager.get_component_by_id("browser_use_agent.recording_gif") + browser_view_comp = webui_manager.get_component_by_id("browser_use_agent.browser_view") + + # --- 1. Get Task and Initial UI Update --- + task = components.get(user_input_comp, "").strip() + if not task: + gr.Warning("Please enter a task.") + yield {run_button_comp: gr.update(interactive=True)} + return + + # Set running state indirectly via _current_task + webui_manager.bu_chat_history.append({"role": "user", "content": task}) + + yield { + user_input_comp: gr.Textbox(value="", interactive=False, placeholder="Agent is running..."), + run_button_comp: gr.Button(value="⏳ Running...", interactive=False), + stop_button_comp: gr.Button(interactive=True), + pause_resume_button_comp: gr.Button(value="⏸️ Pause", interactive=True), + clear_button_comp: gr.Button(interactive=False), + chatbot_comp: gr.update(value=webui_manager.bu_chat_history), + history_file_comp: gr.update(value=None), + gif_comp: gr.update(value=None), + } + + # --- Agent Settings --- + # Access settings values via components dict, getting IDs from webui_manager + def get_setting(key, default=None): + comp = webui_manager.id_to_component.get(f"agent_settings.{key}") + return components.get(comp, default) if comp else default + + override_system_prompt = get_setting("override_system_prompt") or None + extend_system_prompt = get_setting("extend_system_prompt") or None + llm_provider_name = get_setting("llm_provider", None) # Default to None if not found + llm_model_name = get_setting("llm_model_name", None) + llm_temperature = get_setting("llm_temperature", 0.6) + use_vision = get_setting("use_vision", True) + ollama_num_ctx = get_setting("ollama_num_ctx", 16000) + llm_base_url = get_setting("llm_base_url") or None + llm_api_key = get_setting("llm_api_key") or None + max_steps = get_setting("max_steps", 100) + max_actions = get_setting("max_actions", 10) + max_input_tokens = get_setting("max_input_tokens", 128000) + tool_calling_str = get_setting("tool_calling_method", "auto") + tool_calling_method = tool_calling_str if tool_calling_str != "None" else None + mcp_server_config_comp = webui_manager.id_to_component.get("agent_settings.mcp_server_config") + mcp_server_config_str = components.get(mcp_server_config_comp) if mcp_server_config_comp else None + mcp_server_config = json.loads(mcp_server_config_str) if mcp_server_config_str else None + + # Planner LLM Settings (Optional) + planner_llm_provider_name = get_setting("planner_llm_provider") or None + planner_llm = None + if planner_llm_provider_name: + planner_llm_model_name = get_setting("planner_llm_model_name") + planner_llm_temperature = get_setting("planner_llm_temperature", 0.6) + planner_ollama_num_ctx = get_setting("planner_ollama_num_ctx", 16000) + planner_llm_base_url = get_setting("planner_llm_base_url") or None + planner_llm_api_key = get_setting("planner_llm_api_key") or None + planner_use_vision = get_setting("planner_use_vision", False) + + planner_llm = await _initialize_llm( + planner_llm_provider_name, planner_llm_model_name, planner_llm_temperature, + planner_llm_base_url, planner_llm_api_key, + planner_ollama_num_ctx if planner_llm_provider_name == "ollama" else None + ) + + # --- Browser Settings --- + def get_browser_setting(key, default=None): + comp = webui_manager.id_to_component.get(f"browser_settings.{key}") + return components.get(comp, default) if comp else default + + browser_binary_path = get_browser_setting("browser_binary_path") or None + browser_user_data_dir = get_browser_setting("browser_user_data_dir") or None + use_own_browser = get_browser_setting("use_own_browser", False) # Logic handled by CDP/WSS presence + keep_browser_open = get_browser_setting("keep_browser_open", False) + headless = get_browser_setting("headless", False) + disable_security = get_browser_setting("disable_security", True) + window_w = int(get_browser_setting("window_w", 1280)) + window_h = int(get_browser_setting("window_h", 1100)) + cdp_url = get_browser_setting("cdp_url") or None + wss_url = get_browser_setting("wss_url") or None + save_recording_path = get_browser_setting("save_recording_path") or None + save_trace_path = get_browser_setting("save_trace_path") or None + save_agent_history_path = get_browser_setting("save_agent_history_path", "./tmp/agent_history") + save_download_path = get_browser_setting("save_download_path", "./tmp/downloads") + + stream_vw = 80 + stream_vh = int(80 * window_h // window_w) + + os.makedirs(save_agent_history_path, exist_ok=True) + if save_recording_path: os.makedirs(save_recording_path, exist_ok=True) + if save_trace_path: os.makedirs(save_trace_path, exist_ok=True) + if save_download_path: os.makedirs(save_download_path, exist_ok=True) + + # --- 2. Initialize LLM --- + main_llm = await _initialize_llm( + llm_provider_name, llm_model_name, llm_temperature, llm_base_url, llm_api_key, + ollama_num_ctx if llm_provider_name == "ollama" else None + ) + + # Pass the webui_manager instance to the callback when wrapping it + async def ask_callback_wrapper(query: str, browser_context: BrowserContext) -> Dict[str, Any]: + return await _ask_assistant_callback(webui_manager, query, browser_context) + + if not webui_manager.bu_controller: + webui_manager.bu_controller = CustomController(ask_assistant_callback=ask_callback_wrapper) + await webui_manager.bu_controller.setup_mcp_client(mcp_server_config) + + # --- 4. Initialize Browser and Context --- + should_close_browser_on_finish = not keep_browser_open + + try: + # Close existing resources if not keeping open + if not keep_browser_open: + if webui_manager.bu_browser_context: + logger.info("Closing previous browser context.") + await webui_manager.bu_browser_context.close() + webui_manager.bu_browser_context = None + if webui_manager.bu_browser: + logger.info("Closing previous browser.") + await webui_manager.bu_browser.close() + webui_manager.bu_browser = None + + # Create Browser if needed + if not webui_manager.bu_browser: + logger.info("Launching new browser instance.") + extra_args = [f"--window-size={window_w},{window_h}"] + if browser_user_data_dir: + extra_args.append(f"--user-data-dir={browser_user_data_dir}") + + if use_own_browser: + browser_binary_path = os.getenv("CHROME_PATH", None) or browser_binary_path + if browser_binary_path == "": + browser_binary_path = None + chrome_user_data = os.getenv("CHROME_USER_DATA", None) + if chrome_user_data: + extra_args += [f"--user-data-dir={chrome_user_data}"] + else: + browser_binary_path = None + + webui_manager.bu_browser = CustomBrowser( + config=BrowserConfig( + headless=headless, + disable_security=disable_security, + browser_binary_path=browser_binary_path, + extra_browser_args=extra_args, + wss_url=wss_url, + cdp_url=cdp_url, + ) + ) + + # Create Context if needed + if not webui_manager.bu_browser_context: + logger.info("Creating new browser context.") + context_config = CustomBrowserContextConfig( + trace_path=save_trace_path if save_trace_path else None, + save_recording_path=save_recording_path if save_recording_path else None, + save_downloads_path=save_download_path if save_download_path else None, + browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h) + ) + if not webui_manager.bu_browser: + raise ValueError("Browser not initialized, cannot create context.") + webui_manager.bu_browser_context = await webui_manager.bu_browser.new_context(config=context_config) + + # --- 5. Initialize or Update Agent --- + webui_manager.bu_agent_task_id = str(uuid.uuid4()) # New ID for this task run + os.makedirs(os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id), exist_ok=True) + history_file = os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id, + f"{webui_manager.bu_agent_task_id}.json") + gif_path = os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id, + f"{webui_manager.bu_agent_task_id}.gif") + + # Pass the webui_manager to callbacks when wrapping them + async def step_callback_wrapper(state: BrowserState, output: AgentOutput, step_num: int): + await _handle_new_step(webui_manager, state, output, step_num) + + def done_callback_wrapper(history: AgentHistoryList): + _handle_done(webui_manager, history) + + if not webui_manager.bu_agent: + logger.info(f"Initializing new agent for task: {task}") + if not webui_manager.bu_browser or not webui_manager.bu_browser_context: + raise ValueError("Browser or Context not initialized, cannot create agent.") + + webui_manager.bu_agent = Agent( + task=task, + llm=main_llm, + browser=webui_manager.bu_browser, + browser_context=webui_manager.bu_browser_context, + controller=webui_manager.bu_controller, + register_new_step_callback=step_callback_wrapper, + register_done_callback=done_callback_wrapper, + # Agent settings + use_vision=use_vision, + override_system_message=override_system_prompt, + extend_system_message=extend_system_prompt, + max_input_tokens=max_input_tokens, + max_actions_per_step=max_actions, + tool_calling_method=tool_calling_method, + planner_llm=planner_llm, + use_vision_for_planner=planner_use_vision if planner_llm else False, + save_conversation_path=history_file, + ) + webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id + webui_manager.bu_agent.settings.generate_gif = gif_path + else: + webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id + webui_manager.bu_agent.add_new_task(task) + webui_manager.bu_agent.settings.generate_gif = gif_path + + # --- 6. Run Agent Task and Stream Updates --- + agent_run_coro = webui_manager.bu_agent.run(max_steps=max_steps) + agent_task = asyncio.create_task(agent_run_coro) + webui_manager.bu_current_task = agent_task # Store the task + + last_chat_len = len(webui_manager.bu_chat_history) + while not agent_task.done(): + is_paused = webui_manager.bu_agent.state.paused + is_stopped = webui_manager.bu_agent.state.stopped + + # Check for pause state + if is_paused: + yield { + pause_resume_button_comp: gr.update(value="▶️ Resume", interactive=True), + run_button_comp: gr.update(value="⏸️ Paused", interactive=False), + stop_button_comp: gr.update(interactive=True), # Allow stop while paused + } + # Wait until pause is released or task is stopped/done + while is_paused and not agent_task.done(): + # Re-check agent state in loop + is_paused = webui_manager.bu_agent.state.paused + is_stopped = webui_manager.bu_agent.state.stopped + if is_stopped: # Stop signal received while paused + break + await asyncio.sleep(0.2) + + if agent_task.done() or is_stopped: # If stopped or task finished while paused + break + + # If resumed, yield UI update + yield { + pause_resume_button_comp: gr.update(value="⏸️ Pause", interactive=True), + run_button_comp: gr.update(value="⏳ Running...", interactive=False), + } + + # Check if agent stopped itself or stop button was pressed (which sets agent.state.stopped) + if is_stopped: + logger.info("Agent has stopped (internally or via stop button).") + if not agent_task.done(): + # Ensure the task coroutine finishes if agent just set flag + try: + await asyncio.wait_for(agent_task, timeout=1.0) # Give it a moment to exit run() + except asyncio.TimeoutError: + logger.warning("Agent task did not finish quickly after stop signal, cancelling.") + agent_task.cancel() + except Exception: # Catch task exceptions if it errors on stop + pass + break # Exit the streaming loop + + # Check if agent is asking for help (via response_event) + update_dict = {} + if webui_manager.bu_response_event is not None: + update_dict = { + user_input_comp: gr.update(placeholder="Agent needs help. Enter response and submit.", + interactive=True), + run_button_comp: gr.update(value="✔️ Submit Response", interactive=True), + pause_resume_button_comp: gr.update(interactive=False), + stop_button_comp: gr.update(interactive=False), + chatbot_comp: gr.update(value=webui_manager.bu_chat_history) + } + last_chat_len = len(webui_manager.bu_chat_history) + yield update_dict + # Wait until response is submitted or task finishes + while webui_manager.bu_response_event is not None and not agent_task.done(): + await asyncio.sleep(0.2) + # Restore UI after response submitted or if task ended unexpectedly + if not agent_task.done(): + yield { + user_input_comp: gr.update(placeholder="Agent is running...", interactive=False), + run_button_comp: gr.update(value="⏳ Running...", interactive=False), + pause_resume_button_comp: gr.update(interactive=True), + stop_button_comp: gr.update(interactive=True), + } + else: + break # Task finished while waiting for response + + # Update Chatbot if new messages arrived via callbacks + if len(webui_manager.bu_chat_history) > last_chat_len: + update_dict[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history) + last_chat_len = len(webui_manager.bu_chat_history) + + # Update Browser View + if headless and webui_manager.bu_browser_context: + try: + screenshot_b64 = await capture_screenshot(webui_manager.bu_browser_context) + if screenshot_b64: + html_content = f'' + update_dict[browser_view_comp] = gr.update(value=html_content, visible=True) + else: + html_content = f"

Waiting for browser session...

" + update_dict[browser_view_comp] = gr.update(value=html_content, + visible=True) + except Exception as e: + logger.debug(f"Failed to capture screenshot: {e}") + update_dict[browser_view_comp] = gr.update(value="
Error loading view...
", + visible=True) + else: + update_dict[browser_view_comp] = gr.update(visible=False) + + # Yield accumulated updates + if update_dict: + yield update_dict + + await asyncio.sleep(0.1) # Polling interval + + # --- 7. Task Finalization --- + webui_manager.bu_agent.state.paused = False + webui_manager.bu_agent.state.stopped = False + final_update = {} + try: + logger.info("Agent task completing...") + # Await the task ensure completion and catch exceptions if not already caught + if not agent_task.done(): + await agent_task # Retrieve result/exception + elif agent_task.exception(): # Check if task finished with exception + agent_task.result() # Raise the exception to be caught below + logger.info("Agent task completed processing.") + + logger.info(f"Explicitly saving agent history to: {history_file}") + webui_manager.bu_agent.save_history(history_file) + + if os.path.exists(history_file): + final_update[history_file_comp] = gr.File(value=history_file) + + if gif_path and os.path.exists(gif_path): + logger.info(f"GIF found at: {gif_path}") + final_update[gif_comp] = gr.Image(value=gif_path) + + except asyncio.CancelledError: + logger.info("Agent task was cancelled.") + if not any("Cancelled" in msg.get("content", "") for msg in webui_manager.bu_chat_history if + msg.get("role") == "assistant"): + webui_manager.bu_chat_history.append({"role": "assistant", "content": "**Task Cancelled**."}) + final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history) + except Exception as e: + logger.error(f"Error during agent execution: {e}", exc_info=True) + error_message = f"**Agent Execution Error:**\n```\n{type(e).__name__}: {e}\n```" + if not any(error_message in msg.get("content", "") for msg in webui_manager.bu_chat_history if + msg.get("role") == "assistant"): + webui_manager.bu_chat_history.append({"role": "assistant", "content": error_message}) + final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history) + gr.Error(f"Agent execution failed: {e}") + + finally: + webui_manager.bu_current_task = None # Clear the task reference + + # Close browser/context if requested + if should_close_browser_on_finish: + if webui_manager.bu_browser_context: + logger.info("Closing browser context after task.") + await webui_manager.bu_browser_context.close() + webui_manager.bu_browser_context = None + if webui_manager.bu_browser: + logger.info("Closing browser after task.") + await webui_manager.bu_browser.close() + webui_manager.bu_browser = None + + # --- 8. Final UI Update --- + final_update.update({ + user_input_comp: gr.update(value="", interactive=True, placeholder="Enter your next task..."), + run_button_comp: gr.update(value="▶️ Submit Task", interactive=True), + stop_button_comp: gr.update(interactive=False), + pause_resume_button_comp: gr.update(value="⏸️ Pause", interactive=False), + clear_button_comp: gr.update(interactive=True), + # Ensure final chat history is shown + chatbot_comp: gr.update(value=webui_manager.bu_chat_history) + }) + yield final_update + + except Exception as e: + # Catch errors during setup (before agent run starts) + logger.error(f"Error setting up agent task: {e}", exc_info=True) + webui_manager.bu_current_task = None # Ensure state is reset + yield { + user_input_comp: gr.update(interactive=True, placeholder="Error during setup. Enter task..."), + run_button_comp: gr.update(value="▶️ Submit Task", interactive=True), + stop_button_comp: gr.update(interactive=False), + pause_resume_button_comp: gr.update(value="⏸️ Pause", interactive=False), + clear_button_comp: gr.update(interactive=True), + chatbot_comp: gr.update( + value=webui_manager.bu_chat_history + [{"role": "assistant", "content": f"**Setup Error:** {e}"}]), + } + + +# --- Button Click Handlers --- (Need access to webui_manager) + +async def handle_submit(webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]): + """Handles clicks on the main 'Submit' button.""" + user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input") + user_input_value = components.get(user_input_comp, "").strip() + + # Check if waiting for user assistance + if webui_manager.bu_response_event and not webui_manager.bu_response_event.is_set(): + logger.info(f"User submitted assistance: {user_input_value}") + webui_manager.bu_user_help_response = user_input_value if user_input_value else "User provided no text response." + webui_manager.bu_response_event.set() + # UI updates handled by the main loop reacting to the event being set + yield { + user_input_comp: gr.update(value="", interactive=False, placeholder="Waiting for agent to continue..."), + webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(value="⏳ Running...", + interactive=False) + } + # Check if a task is currently running (using _current_task) + elif webui_manager.bu_current_task and not webui_manager.bu_current_task.done(): + logger.warning("Submit button clicked while agent is already running and not asking for help.") + gr.Info("Agent is currently running. Please wait or use Stop/Pause.") + yield {} # No change + else: + # Handle submission for a new task + logger.info("Submit button clicked for new task.") + # Use async generator to stream updates from run_agent_task + async for update in run_agent_task(webui_manager, components): + yield update + + +async def handle_stop(webui_manager: WebuiManager): + """Handles clicks on the 'Stop' button.""" + logger.info("Stop button clicked.") + agent = webui_manager.bu_agent + task = webui_manager.bu_current_task + + if agent and task and not task.done(): + # Signal the agent to stop by setting its internal flag + agent.state.stopped = True + agent.state.paused = False # Ensure not paused if stopped + return { + webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update(interactive=False, + value="⏹️ Stopping..."), + webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(interactive=False), + webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(interactive=False), + } + else: + logger.warning("Stop clicked but agent is not running or task is already done.") + # Reset UI just in case it's stuck + return { + webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(interactive=True), + webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update(interactive=False), + webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(interactive=False), + webui_manager.get_component_by_id("browser_use_agent.clear_button"): gr.update(interactive=True), + } + + +async def handle_pause_resume(webui_manager: WebuiManager): + """Handles clicks on the 'Pause/Resume' button.""" + agent = webui_manager.bu_agent + task = webui_manager.bu_current_task + + if agent and task and not task.done(): + if agent.state.paused: + logger.info("Resume button clicked.") + agent.resume() + # UI update happens in main loop + return { + webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(value="⏸️ Pause", + interactive=True)} # Optimistic update + else: + logger.info("Pause button clicked.") + agent.pause() + return { + webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(value="▶️ Resume", + interactive=True)} # Optimistic update + else: + logger.warning("Pause/Resume clicked but agent is not running or doesn't support state.") + return {} # No change + + +async def handle_clear(webui_manager: WebuiManager): + """Handles clicks on the 'Clear' button.""" + logger.info("Clear button clicked.") + + # Stop any running task first + task = webui_manager.bu_current_task + if task and not task.done(): + logger.info("Clearing requires stopping the current task.") + webui_manager.bu_agent.stop() + try: + await asyncio.wait_for(task, timeout=2.0) # Wait briefly + except (asyncio.CancelledError, asyncio.TimeoutError): + pass + except Exception as e: + logger.warning(f"Error stopping task on clear: {e}") + webui_manager.bu_current_task.cancel() + webui_manager.bu_current_task = None + + if webui_manager.bu_controller: + await webui_manager.bu_controller.close_mcp_client() + webui_manager.bu_controller = None + webui_manager.bu_agent = None + + # Reset state stored in manager + webui_manager.bu_chat_history = [] + webui_manager.bu_response_event = None + webui_manager.bu_user_help_response = None + webui_manager.bu_agent_task_id = None + + logger.info("Agent state and browser resources cleared.") + + # Reset UI components + return { + webui_manager.get_component_by_id("browser_use_agent.chatbot"): gr.update(value=[]), + webui_manager.get_component_by_id("browser_use_agent.user_input"): gr.update(value="", + placeholder="Enter your task here..."), + webui_manager.get_component_by_id("browser_use_agent.agent_history_file"): gr.update(value=None), + webui_manager.get_component_by_id("browser_use_agent.recording_gif"): gr.update(value=None), + webui_manager.get_component_by_id("browser_use_agent.browser_view"): gr.update( + value="
Browser Cleared
"), + webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(value="▶️ Submit Task", + interactive=True), + webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update(interactive=False), + webui_manager.get_component_by_id("browser_use_agent.pause_resume_button"): gr.update(value="⏸️ Pause", + interactive=False), + webui_manager.get_component_by_id("browser_use_agent.clear_button"): gr.update(interactive=True), + } + + +# --- Tab Creation Function --- + +def create_browser_use_agent_tab(webui_manager: WebuiManager): """ - Create the run agent tab + Create the run agent tab, defining UI, state, and handlers. """ - input_components = set(webui_manager.get_components()) + webui_manager.init_browser_use_agent() + + # --- Define UI Components --- tab_components = {} - - chatbot = gr.Chatbot(type='messages', label="Chat History", height=600) - user_input = gr.Textbox( - label="User Input", - lines=3, - value="go to google.com and type 'OpenAI' click search and give me the first url", - interactive=True - ) - - with gr.Row(): - stop_button = gr.Button("⏹️ Stop", interactive=False, variant="stop", scale=2) - clear_button = gr.Button("🧹 Clear", interactive=True, variant="stop", scale=2) - run_button = gr.Button("▶️ Summit", variant="primary", scale=3) - - browser_view = gr.HTML( - value="

Waiting for browser session...

", - label="Browser Live View", - visible=False - ) - - with gr.Row(): - agent_final_result = gr.Textbox( - label="Final Result", lines=3, show_label=True, interactive=False + with gr.Column(): + chatbot = gr.Chatbot( + lambda: webui_manager.bu_chat_history, # Load history dynamically + elem_id="browser_use_chatbot", + label="Agent Interaction", + type="messages", + height=600, + show_copy_button=True, + bubble_full_width=False, ) - agent_errors = gr.Textbox( - label="Errors", lines=3, show_label=True, interactive=False + user_input = gr.Textbox( + label="Your Task or Response", + placeholder="Enter your task here or provide assistance when asked.", + lines=3, + interactive=True, + elem_id="user_input" ) + with gr.Row(): + stop_button = gr.Button("⏹️ Stop", interactive=False, variant="stop", scale=1) + pause_resume_button = gr.Button("⏸️ Pause", interactive=False, variant="secondary", scale=1) + clear_button = gr.Button("🗑️ Clear", interactive=True, variant="secondary", scale=1) + run_button = gr.Button("▶️ Submit Task", variant="primary", scale=2) - with gr.Row(): - agent_trace_file = gr.File(label="Trace File", interactive=False) - agent_history_file = gr.File(label="Agent History", interactive=False) + browser_view = gr.HTML( + value="

Browser View (Requires Headless=True)

", + label="Browser Live View", + elem_id="browser_view", + visible=False, + ) + with gr.Column(): + gr.Markdown("### Task Outputs") + agent_history_file = gr.File(label="Agent History JSON", interactive=False) + recording_gif = gr.Image(label="Task Recording GIF", format="gif", interactive=False, + type="filepath") - recording_gif = gr.Image(label="Result GIF", format="gif", interactive=False) + # --- Store Components in Manager --- tab_components.update( dict( - chatbot=chatbot, - user_input=user_input, - clear_button=clear_button, - run_button=run_button, - stop_button=stop_button, - agent_final_result=agent_final_result, - agent_errors=agent_errors, - agent_trace_file=agent_trace_file, - agent_history_file=agent_history_file, - recording_gif=recording_gif, + chatbot=chatbot, user_input=user_input, clear_button=clear_button, + run_button=run_button, stop_button=stop_button, pause_resume_button=pause_resume_button, + agent_history_file=agent_history_file, recording_gif=recording_gif, browser_view=browser_view ) ) - return tab_components + webui_manager.add_components("browser_use_agent", tab_components) # Use "browser_use_agent" as tab_name prefix + + all_managed_components = set(webui_manager.get_components()) # Get all components known to manager + run_tab_outputs = list(tab_components.values()) + + async def submit_wrapper(components_dict: Dict[Component, Any]) -> AsyncGenerator[Dict[Component, Any], None]: + """Wrapper for handle_submit that yields its results.""" + # handle_submit is an async generator, iterate and yield + async for update in handle_submit(webui_manager, components_dict): + yield update + + async def stop_wrapper() -> AsyncGenerator[Dict[Component, Any], None]: + """Wrapper for handle_stop.""" + # handle_stop is async def but returns a single dict. We yield it once. + update_dict = await handle_stop(webui_manager) + yield update_dict # Yield the final dictionary + + async def pause_resume_wrapper() -> AsyncGenerator[Dict[Component, Any], None]: + """Wrapper for handle_pause_resume.""" + update_dict = await handle_pause_resume(webui_manager) + yield update_dict + + async def clear_wrapper() -> AsyncGenerator[Dict[Component, Any], None]: + """Wrapper for handle_clear.""" + update_dict = await handle_clear(webui_manager) + yield update_dict + + # --- Connect Event Handlers using the Wrappers -- + run_button.click( + fn=submit_wrapper, + inputs=all_managed_components, + outputs=run_tab_outputs + ) + user_input.submit( + fn=submit_wrapper, + inputs=all_managed_components, + outputs=run_tab_outputs + ) + stop_button.click( + fn=stop_wrapper, + inputs=None, + outputs=run_tab_outputs + ) + pause_resume_button.click( + fn=pause_resume_wrapper, + inputs=None, + outputs=run_tab_outputs + ) + clear_button.click( + fn=clear_wrapper, + inputs=None, + outputs=run_tab_outputs + ) + diff --git a/src/webui/components/deep_research_agent_tab.py b/src/webui/components/deep_research_agent_tab.py index d9dfc24..5ce8dd7 100644 --- a/src/webui/components/deep_research_agent_tab.py +++ b/src/webui/components/deep_research_agent_tab.py @@ -38,4 +38,4 @@ def create_deep_research_agent_tab(webui_manager: WebuiManager) -> dict[str, Com markdown_download=markdown_download, ) ) - return tab_components + webui_manager.add_components("deep_research_agent", tab_components) diff --git a/src/webui/components/load_save_config_tab.py b/src/webui/components/load_save_config_tab.py index 91dcad7..acc0f69 100644 --- a/src/webui/components/load_save_config_tab.py +++ b/src/webui/components/load_save_config_tab.py @@ -34,16 +34,17 @@ def create_load_save_config_tab(webui_manager: WebuiManager) -> dict[str, Compon config_file=config_file, )) + webui_manager.add_components("load_save_config", tab_components) + save_config_button.click( - fn=webui_manager.save_current_config, - inputs=[], + fn=webui_manager.save_config, + inputs=set(webui_manager.get_components()), outputs=[config_status] ) load_config_button.click( fn=webui_manager.load_config, inputs=[config_file], - outputs=[config_status] + outputs=webui_manager.get_components(), ) - return tab_components diff --git a/src/webui/interface.py b/src/webui/interface.py index 266b079..ba99245 100644 --- a/src/webui/interface.py +++ b/src/webui/interface.py @@ -32,6 +32,9 @@ def create_ui(theme_name="Ocean"): text-align: center; margin-bottom: 20px; } + .tab-header-text { + text-align: center; + } .theme-section { margin-bottom: 10px; padding: 15px; @@ -67,18 +70,26 @@ def create_ui(theme_name="Ocean"): with gr.Tabs() as tabs: with gr.TabItem("⚙️ Agent Settings"): - ui_manager.add_components("agent_settings", create_agent_settings_tab(ui_manager)) + create_agent_settings_tab(ui_manager) with gr.TabItem("🌐 Browser Settings"): - ui_manager.add_components("browser_settings", create_browser_settings_tab(ui_manager)) + create_browser_settings_tab(ui_manager) with gr.TabItem("🤖 Run Agent"): - ui_manager.add_components("browser_use_agent", create_browser_use_agent_tab(ui_manager)) + create_browser_use_agent_tab(ui_manager) - with gr.TabItem("🧐 Deep Research"): - ui_manager.add_components("deep_research_agent", create_deep_research_agent_tab(ui_manager)) + with gr.TabItem("🎁 Agent Collections"): + gr.Markdown( + """ + ### Agents built on Browser-Use + """, + elem_classes=["tab-header-text"], + ) + with gr.Tabs(): + with gr.TabItem("Deep Research"): + create_deep_research_agent_tab(ui_manager) with gr.TabItem("📁 Load & Save Config"): - ui_manager.add_components("load_save_config", create_load_save_config_tab(ui_manager)) + create_load_save_config_tab(ui_manager) return demo diff --git a/src/webui/webui_manager.py b/src/webui/webui_manager.py index 033564a..5cbd31f 100644 --- a/src/webui/webui_manager.py +++ b/src/webui/webui_manager.py @@ -4,11 +4,17 @@ from typing import TYPE_CHECKING import os import gradio as gr from datetime import datetime +from typing import Optional, Dict, List +import uuid +import asyncio from gradio.components import Component from browser_use.browser.browser import Browser from browser_use.browser.context import BrowserContext from browser_use.agent.service import Agent +from src.browser.custom_browser import CustomBrowser +from src.browser.custom_context import CustomBrowserContext +from src.controller.custom_controller import CustomController class WebuiManager: @@ -19,9 +25,19 @@ class WebuiManager: self.settings_save_dir = settings_save_dir os.makedirs(self.settings_save_dir, exist_ok=True) - self.browser: Browser = None - self.browser_context: BrowserContext = None - self.bu_agent: Agent = None + def init_browser_use_agent(self) -> None: + """ + init browser use agent + """ + self.bu_agent: Optional[Agent] = None + self.bu_browser: Optional[CustomBrowser] = None + self.bu_browser_context: Optional[CustomBrowserContext] = None + self.bu_controller: Optional[CustomController] = None + self.bu_chat_history: List[Dict[str, Optional[str]]] = [] + self.bu_response_event: Optional[asyncio.Event] = None + self.bu_user_help_response: Optional[str] = None + self.bu_current_task: Optional[asyncio.Task] = None + self.bu_agent_task_id: Optional[str] = None def add_components(self, tab_name: str, components_dict: dict[str, "Component"]) -> None: """ @@ -50,15 +66,16 @@ class WebuiManager: """ return self.component_to_id[comp] - def save_current_config(self): + def save_config(self, components: Dict["Component", str]) -> None: """ - Save current config + Save config """ cur_settings = {} - for comp_id, comp in self.id_to_component.items(): + for comp in components: if not isinstance(comp, gr.Button) and not isinstance(comp, gr.File) and str( getattr(comp, "interactive", True)).lower() != "false": - cur_settings[comp_id] = getattr(comp, "value", None) + comp_id = self.get_id_by_component(comp) + cur_settings[comp_id] = components[comp] config_name = datetime.now().strftime("%Y%m%d-%H%M%S") with open(os.path.join(self.settings_save_dir, f"{config_name}.json"), "w") as fw: @@ -76,6 +93,13 @@ class WebuiManager: update_components = {} for comp_id, comp_val in ui_settings.items(): if comp_id in self.id_to_component: - update_components[self.id_to_component[comp_id]].value = comp_val + comp = self.id_to_component[comp_id] + update_components[comp] = comp.__class__(value=comp_val) - return f"Successfully loaded config from {config_path}" + config_status = self.id_to_component["load_save_config.config_status"] + update_components.update( + { + config_status: config_status.__class__(value=f"Successfully loaded config: {config_path}") + } + ) + yield update_components diff --git a/tests/test_agents.py b/tests/test_agents.py index 27bb704..79e48d6 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -17,98 +17,18 @@ from browser_use.agent.views import AgentHistoryList from src.utils import utils -async def test_browser_use_org(): +async def test_browser_use_agent(): from browser_use.browser.browser import Browser, BrowserConfig from browser_use.browser.context import ( BrowserContextConfig, BrowserContextWindowSize, ) + from browser_use.agent.service import Agent - # llm = utils.get_llm_model( - # provider="azure_openai", - # model_name="gpt-4o", - # temperature=0.8, - # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), - # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), - # ) - - # llm = utils.get_llm_model( - # provider="deepseek", - # model_name="deepseek-chat", - # temperature=0.8 - # ) - - llm = utils.get_llm_model( - provider="ollama", model_name="deepseek-r1:14b", temperature=0.5 - ) - - window_w, window_h = 1920, 1080 - use_vision = False - use_own_browser = False - if use_own_browser: - chrome_path = os.getenv("CHROME_PATH", None) - if chrome_path == "": - chrome_path = None - else: - chrome_path = None - - tool_calling_method = "json_schema" # setting to json_schema when using ollma - - browser = Browser( - config=BrowserConfig( - headless=False, - disable_security=True, - chrome_instance_path=chrome_path, - extra_chromium_args=[f"--window-size={window_w},{window_h}"], - ) - ) - async with await browser.new_context( - config=BrowserContextConfig( - trace_path="./tmp/traces", - save_recording_path="./tmp/record_videos", - no_viewport=False, - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), - ) - ) as browser_context: - agent = Agent( - task="go to google.com and type 'OpenAI' click search and give me the first url", - llm=llm, - browser_context=browser_context, - use_vision=use_vision, - tool_calling_method=tool_calling_method - ) - history: AgentHistoryList = await agent.run(max_steps=10) - - print("Final Result:") - pprint(history.final_result(), indent=4) - - print("\nErrors:") - pprint(history.errors(), indent=4) - - # e.g. xPaths the model clicked on - print("\nModel Outputs:") - pprint(history.model_actions(), indent=4) - - print("\nThoughts:") - pprint(history.model_thoughts(), indent=4) - # close browser - await browser.close() - - -async def test_browser_use_custom(): - from browser_use.browser.context import BrowserContextWindowSize - from browser_use.browser.browser import BrowserConfig - from playwright.async_api import async_playwright - - from src.agent.custom_agent import CustomAgent - from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt from src.browser.custom_browser import CustomBrowser - from src.browser.custom_context import BrowserContextConfig + from src.browser.custom_context import CustomBrowserContextConfig from src.controller.custom_controller import CustomController - - window_w, window_h = 1280, 1100 + from src.utils import llm_provider # llm = utils.get_llm_model( # provider="openai", @@ -118,14 +38,6 @@ async def test_browser_use_custom(): # api_key=os.getenv("OPENAI_API_KEY", ""), # ) - llm = utils.get_llm_model( - provider="azure_openai", - model_name="gpt-4o", - temperature=0.5, - base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), - api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), - ) - # llm = utils.get_llm_model( # provider="google", # model_name="gemini-2.0-flash", @@ -153,13 +65,43 @@ async def test_browser_use_custom(): # provider="ollama", model_name="deepseek-r1:14b", temperature=0.5 # ) + window_w, window_h = 1280, 1100 + + llm = llm_provider.get_llm_model( + provider="azure_openai", + model_name="gpt-4o", + temperature=0.5, + base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), + api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + ) + + mcp_server_config = { + "mcpServers": { + "markitdown": { + "command": "docker", + "args": [ + "run", + "--rm", + "-i", + "markitdown-mcp:latest" + ] + }, + "desktop-commander": { + "command": "npx", + "args": [ + "-y", + "@wonderwhy-er/desktop-commander" + ] + }, + } + } controller = CustomController() - use_own_browser = True + await controller.setup_mcp_client(mcp_server_config) + use_own_browser = False disable_security = True use_vision = True # Set to False when using DeepSeek max_actions_per_step = 10 - playwright = None browser = None browser_context = None @@ -178,29 +120,27 @@ async def test_browser_use_custom(): config=BrowserConfig( headless=False, disable_security=disable_security, - chrome_instance_path=chrome_path, - extra_chromium_args=extra_chromium_args, + browser_binary_path=chrome_path, + extra_browser_args=extra_chromium_args, ) ) browser_context = await browser.new_context( - config=BrowserContextConfig( + config=CustomBrowserContextConfig( trace_path="./tmp/traces", save_recording_path="./tmp/record_videos", - no_viewport=False, + save_downloads_path="./tmp/downloads", browser_window_size=BrowserContextWindowSize( width=window_w, height=window_h ), + force_new_context=True ) ) - agent = CustomAgent( - task="open youtube in tab 1 , open google email in tab 2, open facebook in tab 3", - add_infos="", # some hints for llm to complete the task + agent = Agent( + task="download pdf from https://arxiv.org/abs/2504.10458 and rename this pdf to 'GUI-r1-test.pdf'", llm=llm, browser=browser, browser_context=browser_context, controller=controller, - system_prompt_class=CustomSystemPrompt, - agent_prompt_class=CustomAgentMessagePrompt, use_vision=use_vision, max_actions_per_step=max_actions_per_step, generate_gif=True @@ -213,28 +153,17 @@ async def test_browser_use_custom(): print("\nErrors:") pprint(history.errors(), indent=4) - # e.g. xPaths the model clicked on - print("\nModel Outputs:") - pprint(history.model_actions(), indent=4) - - print("\nThoughts:") - pprint(history.model_thoughts(), indent=4) - except Exception: import traceback - traceback.print_exc() finally: - # 显式关闭持久化上下文 if browser_context: await browser_context.close() - - # 关闭 Playwright 对象 - if playwright: - await playwright.stop() if browser: await browser.close() + if controller: + await controller.close_mcp_client() async def test_browser_use_parallel(): @@ -242,13 +171,20 @@ async def test_browser_use_parallel(): from browser_use.browser.browser import BrowserConfig from playwright.async_api import async_playwright from browser_use.browser.browser import Browser - from src.agent.custom_agent import CustomAgent - from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt - from src.browser.custom_browser import CustomBrowser from src.browser.custom_context import BrowserContextConfig from src.controller.custom_controller import CustomController - window_w, window_h = 1920, 1080 + from browser_use.browser.browser import Browser, BrowserConfig + from browser_use.browser.context import ( + BrowserContextConfig, + BrowserContextWindowSize, + ) + from browser_use.agent.service import Agent + + from src.browser.custom_browser import CustomBrowser + from src.browser.custom_context import CustomBrowserContextConfig + from src.controller.custom_controller import CustomController + from src.utils import llm_provider # llm = utils.get_llm_model( # provider="openai", @@ -258,20 +194,13 @@ async def test_browser_use_parallel(): # api_key=os.getenv("OPENAI_API_KEY", ""), # ) - # llm = utils.get_llm_model( - # provider="azure_openai", - # model_name="gpt-4o", - # temperature=0.8, - # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), - # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), - # ) - llm = utils.get_llm_model( - provider="gemini", - model_name="gemini-2.0-flash-exp", - temperature=1.0, - api_key=os.getenv("GOOGLE_API_KEY", "") - ) + # llm = utils.get_llm_model( + # provider="google", + # model_name="gemini-2.0-flash", + # temperature=0.6, + # api_key=os.getenv("GOOGLE_API_KEY", "") + # ) # llm = utils.get_llm_model( # provider="deepseek", @@ -293,72 +222,119 @@ async def test_browser_use_parallel(): # provider="ollama", model_name="deepseek-r1:14b", temperature=0.5 # ) + window_w, window_h = 1280, 1100 + + llm = llm_provider.get_llm_model( + provider="azure_openai", + model_name="gpt-4o", + temperature=0.5, + base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), + api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + ) + + mcp_server_config = { + "mcpServers": { + "markitdown": { + "command": "docker", + "args": [ + "run", + "--rm", + "-i", + "markitdown-mcp:latest" + ] + }, + "desktop-commander": { + "command": "npx", + "args": [ + "-y", + "@wonderwhy-er/desktop-commander" + ] + }, + # "filesystem": { + # "command": "npx", + # "args": [ + # "-y", + # "@modelcontextprotocol/server-filesystem", + # "/Users/xxx/ai_workspace", + # ] + # }, + } + } controller = CustomController() - use_own_browser = True + await controller.setup_mcp_client(mcp_server_config) + use_own_browser = False disable_security = True use_vision = True # Set to False when using DeepSeek - max_actions_per_step = 1 - playwright = None + max_actions_per_step = 10 browser = None browser_context = None - browser = Browser( - config=BrowserConfig( - disable_security=True, - headless=False, - new_context_config=BrowserContextConfig(save_recording_path='./tmp/recordings'), - ) - ) - try: + extra_chromium_args = [f"--window-size={window_w},{window_h}"] + if use_own_browser: + chrome_path = os.getenv("CHROME_PATH", None) + if chrome_path == "": + chrome_path = None + chrome_user_data = os.getenv("CHROME_USER_DATA", None) + if chrome_user_data: + extra_chromium_args += [f"--user-data-dir={chrome_user_data}"] + else: + chrome_path = None + browser = CustomBrowser( + config=BrowserConfig( + headless=False, + disable_security=disable_security, + browser_binary_path=chrome_path, + extra_browser_args=extra_chromium_args, + ) + ) + browser_context = await browser.new_context( + config=CustomBrowserContextConfig( + trace_path="./tmp/traces", + save_recording_path="./tmp/record_videos", + save_downloads_path="./tmp/downloads", + browser_window_size=BrowserContextWindowSize( + width=window_w, height=window_h + ), + force_new_context=True + ) + ) agents = [ - Agent(task=task, llm=llm, browser=browser) + Agent(task=task, llm=llm, browser=browser, controller=controller) for task in [ 'Search Google for weather in Tokyo', - 'Check Reddit front page title', - 'Find NASA image of the day', - 'Check top story on CNN', + # 'Check Reddit front page title', + # 'Find NASA image of the day', + # 'Check top story on CNN', # 'Search latest SpaceX launch date', # 'Look up population of Paris', - # 'Find current time in Sydney', - # 'Check who won last Super Bowl', + 'Find current time in Sydney', + 'Check who won last Super Bowl', # 'Search trending topics on Twitter', ] ] history = await asyncio.gather(*[agent.run() for agent in agents]) - pdb.set_trace() print("Final Result:") pprint(history.final_result(), indent=4) print("\nErrors:") pprint(history.errors(), indent=4) - # e.g. xPaths the model clicked on - print("\nModel Outputs:") - pprint(history.model_actions(), indent=4) + pdb.set_trace() - print("\nThoughts:") - pprint(history.model_thoughts(), indent=4) - # close browser except Exception: import traceback traceback.print_exc() finally: - # 显式关闭持久化上下文 if browser_context: await browser_context.close() - - # 关闭 Playwright 对象 - if playwright: - await playwright.stop() if browser: await browser.close() if __name__ == "__main__": - asyncio.run(test_browser_use_org()) - # asyncio.run(test_browser_use_parallel()) - # asyncio.run(test_browser_use_custom()) + # asyncio.run(test_browser_use_agent()) + asyncio.run(test_browser_use_parallel()) diff --git a/tests/test_controller.py b/tests/test_controller.py index 6a10ebc..1e1608e 100644 --- a/tests/test_controller.py +++ b/tests/test_controller.py @@ -45,33 +45,37 @@ async def test_controller_with_mcp(): from src.controller.custom_controller import CustomController from browser_use.controller.registry.views import ActionModel - test_server_config = { - "playwright": { - "command": "npx", - "args": [ - "@playwright/mcp@latest", - ], - "transport": "stdio", - }, - "filesystem": { - "command": "npx", - "args": [ - "-y", - "@modelcontextprotocol/server-filesystem", - "/Users/xxx/ai_workspace", - ] - }, - "desktop-commander": { - "command": "npx", - "args": [ - "-y", - "@wonderwhy-er/desktop-commander" - ] + mcp_server_config = { + "mcpServers": { + "markitdown": { + "command": "docker", + "args": [ + "run", + "--rm", + "-i", + "markitdown-mcp:latest" + ] + }, + "desktop-commander": { + "command": "npx", + "args": [ + "-y", + "@wonderwhy-er/desktop-commander" + ] + }, + # "filesystem": { + # "command": "npx", + # "args": [ + # "-y", + # "@modelcontextprotocol/server-filesystem", + # "/Users/xxx/ai_workspace", + # ] + # }, } } controller = CustomController() - await controller.setup_mcp_client(test_server_config) + await controller.setup_mcp_client(mcp_server_config) action_name = "mcp.desktop-commander.execute_command" action_info = controller.registry.registry.actions[action_name] param_model = action_info.param_model @@ -85,7 +89,8 @@ async def test_controller_with_mcp(): result = await controller.act(action_model) result = result.extracted_content print(result) - if result and "Command is still running. Use read_output to get more output." in result and "PID" in result.split("\n")[0]: + if result and "Command is still running. Use read_output to get more output." in result and "PID" in \ + result.split("\n")[0]: pid = int(result.split("\n")[0].split("PID")[-1].strip()) action_name = "mcp.desktop-commander.read_output" action_info = controller.registry.registry.actions[action_name] diff --git a/tests/test_llm_api.py b/tests/test_llm_api.py index bee1e6b..c0e9e16 100644 --- a/tests/test_llm_api.py +++ b/tests/test_llm_api.py @@ -144,10 +144,10 @@ def test_ibm_model(): if __name__ == "__main__": # test_openai_model() # test_google_model() - # test_azure_openai_model() + test_azure_openai_model() # test_deepseek_model() # test_ollama_model() # test_deepseek_r1_model() # test_deepseek_r1_ollama_model() # test_mistral_model() - test_ibm_model() + # test_ibm_model() diff --git a/webui.py b/webui.py index 3066ecb..34e93ab 100644 --- a/webui.py +++ b/webui.py @@ -1,3 +1,5 @@ +from dotenv import load_dotenv +load_dotenv() import argparse from src.webui.interface import theme_map, create_ui diff --git a/webui2.py b/webui2.py index 33d7ece..98a23b4 100644 --- a/webui2.py +++ b/webui2.py @@ -42,77 +42,6 @@ _global_browser = None _global_browser_context = None _global_agent = None -# Create the global agent state instance -_global_agent_state = AgentState() - -# webui config -webui_config_manager = utils.ConfigManager() - - -def scan_and_register_components(blocks): - """扫描一个 Blocks 对象并注册其中的所有交互式组件,但不包括按钮""" - global webui_config_manager - - def traverse_blocks(block, prefix=""): - registered = 0 - - # 处理 Blocks 自身的组件 - if hasattr(block, "children"): - for i, child in enumerate(block.children): - if isinstance(child, gr.components.Component): - # 排除按钮 (Button) 组件 - if getattr(child, "interactive", False) and not isinstance(child, gr.Button): - name = f"{prefix}component_{i}" - if hasattr(child, "label") and child.label: - # 使用标签作为名称的一部分 - label = child.label - name = f"{prefix}{label}" - logger.debug(f"Registering component: {name}") - webui_config_manager.register_component(name, child) - registered += 1 - elif hasattr(child, "children"): - # 递归处理嵌套的 Blocks - new_prefix = f"{prefix}block_{i}_" - registered += traverse_blocks(child, new_prefix) - - return registered - - total = traverse_blocks(blocks) - logger.info(f"Total registered components: {total}") - - -def save_current_config(): - return webui_config_manager.save_current_config() - - -def update_ui_from_config(config_file): - return webui_config_manager.update_ui_from_config(config_file) - - -def resolve_sensitive_env_variables(text): - """ - Replace environment variable placeholders ($SENSITIVE_*) with their values. - Only replaces variables that start with SENSITIVE_. - """ - if not text: - return text - - import re - - # Find all $SENSITIVE_* patterns - env_vars = re.findall(r'\$SENSITIVE_[A-Za-z0-9_]*', text) - - result = text - for var in env_vars: - # Remove the $ prefix to get the actual environment variable name - env_name = var[1:] # removes the $ - env_value = os.getenv(env_name) - if env_value is not None: - # Replace $SENSITIVE_VAR_NAME with its value - result = result.replace(var, env_value) - - return result - async def stop_agent(): """Request the agent to stop and update UI with enhanced feedback""" @@ -140,32 +69,6 @@ async def stop_agent(): ) -async def stop_research_agent(): - """Request the agent to stop and update UI with enhanced feedback""" - global _global_agent_state - - try: - # Request stop - _global_agent_state.request_stop() - - # Update UI immediately - message = "Stop requested - the agent will halt at the next safe point" - logger.info(f"🛑 {message}") - - # Return UI updates - return ( # errors_output - gr.update(value="Stopping...", interactive=False), # stop_button - gr.update(interactive=False), # run_button - ) - except Exception as e: - error_msg = f"Error during stop: {str(e)}" - logger.error(error_msg) - return ( - gr.update(value="Stop", interactive=True), - gr.update(interactive=True) - ) - - async def run_browser_agent( agent_type, llm_provider, @@ -202,16 +105,6 @@ async def run_browser_agent( if save_recording_path: os.makedirs(save_recording_path, exist_ok=True) - # Get the list of existing videos before the agent runs - existing_videos = set() - if save_recording_path: - existing_videos = set( - glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) - + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]")) - ) - - task = resolve_sensitive_env_variables(task) - # Run the agent llm = utils.get_llm_model( provider=llm_provider,