add browser-use agent run

This commit is contained in:
vincent
2025-04-28 22:11:56 +08:00
parent 0d259efbeb
commit 4c87694cef
18 changed files with 1343 additions and 523 deletions

View File

@@ -9,11 +9,23 @@ from playwright.async_api import (
Playwright, Playwright,
async_playwright, async_playwright,
) )
from browser_use.browser.browser import Browser from browser_use.browser.browser import Browser, IN_DOCKER
from browser_use.browser.context import BrowserContext, BrowserContextConfig from browser_use.browser.context import BrowserContext, BrowserContextConfig
from playwright.async_api import BrowserContext as PlaywrightBrowserContext from playwright.async_api import BrowserContext as PlaywrightBrowserContext
import logging import logging
from browser_use.browser.chrome import (
CHROME_ARGS,
CHROME_DETERMINISTIC_RENDERING_ARGS,
CHROME_DISABLE_SECURITY_ARGS,
CHROME_DOCKER_ARGS,
CHROME_HEADLESS_ARGS,
)
from browser_use.browser.context import BrowserContext, BrowserContextConfig
from browser_use.browser.utils.screen_resolution import get_screen_resolution, get_window_adjustments
from browser_use.utils import time_execution_async
import socket
from .custom_context import CustomBrowserContext from .custom_context import CustomBrowserContext
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -26,3 +38,62 @@ class CustomBrowser(Browser):
config: BrowserContextConfig = BrowserContextConfig() config: BrowserContextConfig = BrowserContextConfig()
) -> CustomBrowserContext: ) -> CustomBrowserContext:
return CustomBrowserContext(config=config, browser=self) return CustomBrowserContext(config=config, browser=self)
async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrowser:
    """Launch one of Playwright's bundled browsers with the configured argument set.

    The Chromium argument set is assembled from the shared flag groups plus
    ``self.config.extra_browser_args``. A ``--window-size`` flag is added only
    when the caller did not supply one in the extra arguments.

    Args:
        playwright: Playwright driver; the attribute named by
            ``self.config.browser_class`` is used to launch the browser.

    Returns:
        The launched Playwright browser.

    Raises:
        AssertionError: If ``self.config.browser_binary_path`` is set.
    """
    assert self.config.browser_binary_path is None, 'browser_binary_path should be None if trying to use the builtin browsers'

    if self.config.headless:
        screen_size = {'width': 1920, 'height': 1080}
        offset_x, offset_y = 0, 0
    else:
        screen_size = get_screen_resolution()
        offset_x, offset_y = get_window_adjustments()

    # A set is used so that flags repeated across groups collapse to one entry
    chrome_args = {
        *CHROME_ARGS,
        *(CHROME_DOCKER_ARGS if IN_DOCKER else []),
        *(CHROME_HEADLESS_ARGS if self.config.headless else []),
        *(CHROME_DISABLE_SECURITY_ARGS if self.config.disable_security else []),
        *(CHROME_DETERMINISTIC_RENDERING_ARGS if self.config.deterministic_rendering else []),
        f'--window-position={offset_x},{offset_y}',
        *self.config.extra_browser_args,
    }

    # Respect a user-supplied window size; otherwise fall back to the detected one
    if not any("--window-size" in arg for arg in self.config.extra_browser_args):
        chrome_args.add(f'--window-size={screen_size["width"]},{screen_size["height"]}')

    # check if port 9222 is already taken, if so remove the remote-debugging-port arg to prevent conflicts
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        if s.connect_ex(('localhost', 9222)) == 0:
            # discard (not remove): the flag may not be part of the assembled set
            chrome_args.discard('--remote-debugging-port=9222')

    browser_class = getattr(playwright, self.config.browser_class)
    args = {
        'chromium': list(chrome_args),
        'firefox': list({'-no-remote', *self.config.extra_browser_args}),
        'webkit': list({'--no-startup-window', *self.config.extra_browser_args}),
    }

    browser = await browser_class.launch(
        headless=self.config.headless,
        args=args[self.config.browser_class],
        proxy=self.config.proxy.model_dump() if self.config.proxy else None,
        handle_sigterm=False,
        handle_sigint=False,
    )
    return browser

View File

@@ -2,7 +2,7 @@ import json
import logging import logging
import os import os
from browser_use.browser.browser import Browser from browser_use.browser.browser import Browser, IN_DOCKER
from browser_use.browser.context import BrowserContext, BrowserContextConfig from browser_use.browser.context import BrowserContext, BrowserContextConfig
from playwright.async_api import Browser as PlaywrightBrowser from playwright.async_api import Browser as PlaywrightBrowser
from playwright.async_api import BrowserContext as PlaywrightBrowserContext from playwright.async_api import BrowserContext as PlaywrightBrowserContext
@@ -10,10 +10,104 @@ from playwright.async_api import BrowserContext as PlaywrightBrowserContext
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class CustomBrowserContextConfig(BrowserContextConfig):
    """Browser context configuration extended with a flag that forces a new context."""

    # force to create new context
    force_new_context: bool = False
class CustomBrowserContext(BrowserContext): class CustomBrowserContext(BrowserContext):
def __init__( def __init__(
self, self,
browser: "Browser", browser: "Browser",
config: BrowserContextConfig = BrowserContextConfig() config: CustomBrowserContextConfig = CustomBrowserContextConfig(),
): ):
super(CustomBrowserContext, self).__init__(browser=browser, config=config) super(CustomBrowserContext, self).__init__(browser=browser, config=config)
async def _create_context(self, browser: PlaywrightBrowser):
    """Return a Playwright context for *browser*, reusing an existing one when allowed.

    The first existing context is reused when ``force_new_context`` is off, the
    browser already has a context, and the browser was configured with either a
    ``cdp_url`` or a ``browser_binary_path``. Otherwise a new context is created
    from the context configuration. Tracing is started when ``trace_path`` is
    set, cookies are loaded from ``cookies_file`` when that file exists, and an
    init script is installed on the context.

    Args:
        browser: The Playwright browser to take or create the context from.

    Returns:
        The reused or newly created Playwright browser context.
    """
    can_reuse = not self.config.force_new_context and len(browser.contexts) > 0
    attached = self.browser.config.cdp_url or self.browser.config.browser_binary_path
    if can_reuse and attached:
        # Connect to the existing context instead of creating a new one
        context = browser.contexts[0]
    else:
        context = await browser.new_context(
            no_viewport=True,
            user_agent=self.config.user_agent,
            java_script_enabled=True,
            bypass_csp=self.config.disable_security,
            ignore_https_errors=self.config.disable_security,
            record_video_dir=self.config.save_recording_path,
            record_video_size=self.config.browser_window_size.model_dump(),
            record_har_path=self.config.save_har_path,
            locale=self.config.locale,
            http_credentials=self.config.http_credentials,
            is_mobile=self.config.is_mobile,
            has_touch=self.config.has_touch,
            geolocation=self.config.geolocation,
            permissions=self.config.permissions,
            timezone_id=self.config.timezone_id,
        )

    if self.config.trace_path:
        await context.tracing.start(screenshots=True, snapshots=True, sources=True)

    # Load cookies if they exist
    cookies_file = self.config.cookies_file
    if cookies_file and os.path.exists(cookies_file):
        try:
            # Read as UTF-8 explicitly and close the file before awaiting anything
            with open(cookies_file, 'r', encoding='utf-8') as f:
                cookies = json.load(f)
        except json.JSONDecodeError as e:
            logger.error(f'Failed to parse cookies file: {str(e)}')
        else:
            valid_same_site_values = ['Strict', 'Lax', 'None']
            for cookie in cookies:
                # Unrecognized sameSite values are rewritten to 'None'
                if 'sameSite' in cookie and cookie['sameSite'] not in valid_same_site_values:
                    logger.warning(
                        f"Fixed invalid sameSite value '{cookie['sameSite']}' to 'None' for cookie {cookie.get('name')}"
                    )
                    cookie['sameSite'] = 'None'
            logger.info(f'🍪 Loaded {len(cookies)} cookies from {self.config.cookies_file}')
            await context.add_cookies(cookies)

    # Expose anti-detection scripts
    await context.add_init_script(
        """
// Webdriver property
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
});
// Languages
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US']
});
// Plugins
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5]
});
// Chrome runtime
window.chrome = { runtime: {} };
// Permissions
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.query = (parameters) => (
parameters.name === 'notifications' ?
Promise.resolve({ state: Notification.permission }) :
originalQuery(parameters)
);
(function () {
const originalAttachShadow = Element.prototype.attachShadow;
Element.prototype.attachShadow = function attachShadow(options) {
return originalAttachShadow.call(this, { ...options, mode: "open" });
};
})();
"""
    )
    return context

View File

@@ -48,28 +48,6 @@ class CustomController(Controller):
self.mcp_client = None self.mcp_client = None
self.mcp_server_config = None self.mcp_server_config = None
async def setup_mcp_client(self, mcp_server_config: Optional[Dict[str, Any]] = None):
    """Store the MCP server configuration and, when one is given, connect and register its tools."""
    self.mcp_server_config = mcp_server_config
    if not self.mcp_server_config:
        # Nothing to connect to; the existing client (if any) is left untouched
        return
    self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config)
    self.register_mcp_tools()
def register_mcp_tools(self):
    """
    Register every tool of the connected MCP client as an action in the registry.

    Actions are named ``mcp.<server_name>.<tool.name>``. Does nothing when no
    MCP client is set.
    """
    if not self.mcp_client:
        return
    for server_name in self.mcp_client.server_name_to_tools:
        server_tools = self.mcp_client.server_name_to_tools[server_name]
        for tool in server_tools:
            tool_name = f"mcp.{server_name}.{tool.name}"
            registered = RegisteredAction(
                name=tool_name,
                description=tool.description,
                function=tool,
                param_model=create_tool_param_model(tool),
            )
            self.registry.registry.actions[tool_name] = registered
            logger.info(f"Add mcp tool: {tool_name}")
def _register_custom_actions(self): def _register_custom_actions(self):
"""Register all custom browser actions""" """Register all custom browser actions"""
@@ -173,6 +151,28 @@ class CustomController(Controller):
except Exception as e: except Exception as e:
raise e raise e
async def setup_mcp_client(self, mcp_server_config: Optional[Dict[str, Any]] = None):
    """Store the MCP server configuration and, when one is given, connect and register its tools."""
    self.mcp_server_config = mcp_server_config
    if not self.mcp_server_config:
        # Nothing to connect to; the existing client (if any) is left untouched
        return
    self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config)
    self.register_mcp_tools()
def register_mcp_tools(self):
    """
    Register every tool of the connected MCP client as an action in the registry.

    Actions are named ``mcp.<server_name>.<tool.name>``. Does nothing when no
    MCP client is set.
    """
    if not self.mcp_client:
        return
    for server_name in self.mcp_client.server_name_to_tools:
        server_tools = self.mcp_client.server_name_to_tools[server_name]
        for tool in server_tools:
            tool_name = f"mcp.{server_name}.{tool.name}"
            registered = RegisteredAction(
                name=tool_name,
                description=tool.description,
                function=tool,
                param_model=create_tool_param_model(tool),
            )
            self.registry.registry.actions[tool_name] = registered
            logger.info(f"Add mcp tool: {tool_name}")
async def close_mcp_client(self): async def close_mcp_client(self):
if self.mcp_client: if self.mcp_client:
await self.mcp_client.__aexit__(None, None, None) await self.mcp_client.__aexit__(None, None, None)

View File

@@ -40,7 +40,13 @@ async def setup_mcp_client_and_tools(mcp_server_config: Dict[str, Any]) -> Optio
logger.info("Initializing MultiServerMCPClient...") logger.info("Initializing MultiServerMCPClient...")
if not mcp_server_config:
logger.error("No MCP server configuration provided.")
return None
try: try:
if "mcpServers" in mcp_server_config:
mcp_server_config = mcp_server_config["mcpServers"]
client = MultiServerMCPClient(mcp_server_config) client = MultiServerMCPClient(mcp_server_config)
await client.__aenter__() await client.__aenter__()
return client return client

View File

@@ -9,25 +9,6 @@ import gradio as gr
import uuid import uuid
# Callback to update the model name dropdown based on the selected provider
def update_model_dropdown(llm_provider, api_key=None, base_url=None):
    """
    Build the model-name dropdown for the selected provider.

    Providers listed in ``model_names`` get their predefined models with the
    first one selected; any other provider gets an empty dropdown that accepts
    custom values.
    """
    import gradio as gr

    # Fall back to the provider's environment variables when values are not passed in
    api_key = api_key or os.getenv(f"{llm_provider.upper()}_API_KEY", "")
    base_url = base_url or os.getenv(f"{llm_provider.upper()}_BASE_URL", "")

    if llm_provider not in model_names:
        return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True)

    # Use predefined models for the selected provider
    return gr.Dropdown(choices=model_names[llm_provider], value=model_names[llm_provider][0], interactive=True)
def encode_image(img_path): def encode_image(img_path):
if not img_path: if not img_path:
return None return None
@@ -56,108 +37,3 @@ def get_latest_files(directory: str, file_types: list = ['.webm', '.zip']) -> Di
print(f"Error getting latest {file_type} file: {e}") print(f"Error getting latest {file_type} file: {e}")
return latest_files return latest_files
async def capture_screenshot(browser_context):
    """Capture a JPEG screenshot of the active page and return it base64-encoded.

    Returns None when the browser has no context, the context has no pages, or
    taking the screenshot fails.
    """
    playwright_browser = browser_context.browser.playwright_browser
    if not playwright_browser or not playwright_browser.contexts:
        return None

    playwright_context = playwright_browser.contexts[0]
    pages = playwright_context.pages if playwright_context else None
    if not pages:
        return None

    # Prefer the last page that is not about:blank; otherwise use the first page
    non_blank_pages = [page for page in pages if page.url != "about:blank"]
    active_page = non_blank_pages[-1] if non_blank_pages else pages[0]

    try:
        raw_image = await active_page.screenshot(type='jpeg', quality=75, scale="css")
        return base64.b64encode(raw_image).decode('utf-8')
    except Exception:
        return None
class ConfigManager:
    """Tracks named gradio components so their values can be saved and restored."""

    def __init__(self):
        # name -> component, plus the order in which names were first registered
        self.components = {}
        self.component_order = []

    def register_component(self, name: str, component):
        """Register a gradio component under *name* and return it unchanged."""
        self.components[name] = component
        # Re-registering a name replaces the component but keeps its position
        if name not in self.component_order:
            self.component_order.append(name)
        return component

    def save_current_config(self):
        """Save the ``value`` of every registered component to a config file."""
        current_config = {
            name: getattr(self.components[name], "value", None)
            for name in self.component_order
        }
        return save_config_to_file(current_config)

    def update_ui_from_config(self, config_file):
        """Return one update per registered component followed by a status message."""
        if config_file is None:
            return [gr.update() for _ in self.component_order] + ["No file selected."]

        loaded_config = load_config_from_file(config_file.name)
        if not isinstance(loaded_config, dict):
            return [gr.update() for _ in self.component_order] + ["Error: Invalid configuration file."]

        # Components missing from the file get a no-op update
        updates = [
            gr.update(value=loaded_config[name]) if name in loaded_config else gr.update()
            for name in self.component_order
        ]
        updates.append("Configuration loaded successfully.")
        return updates

    def get_all_components(self):
        """Return all registered components in the order they were registered."""
        return [self.components[name] for name in self.component_order]
def load_config_from_file(config_file):
    """Load settings from a config file (JSON format).

    Args:
        config_file: Path of the JSON file to read.

    Returns:
        The parsed JSON value on success. On any failure, a string starting
        with ``"Error loading configuration: "`` is returned instead of
        raising, so callers must check the type of the result.
    """
    try:
        # Decode as UTF-8 explicitly rather than with the platform default encoding
        with open(config_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        return f"Error loading configuration: {str(e)}"
def save_config_to_file(settings, save_dir="./tmp/webui_settings"):
    """Write *settings* as indented JSON to a new ``<uuid4>.json`` file in *save_dir*.

    The directory is created if needed. Returns a message containing the path
    of the file that was written.
    """
    os.makedirs(save_dir, exist_ok=True)
    file_name = f"{uuid.uuid4()}.json"
    config_file = os.path.join(save_dir, file_name)
    with open(config_file, 'w') as out:
        json.dump(settings, out, indent=2)
    return f"Configuration saved to {config_file}"

View File

@@ -50,7 +50,7 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen
extend_system_prompt = gr.Textbox(label="Extend system prompt", lines=4, interactive=True) extend_system_prompt = gr.Textbox(label="Extend system prompt", lines=4, interactive=True)
with gr.Group(): with gr.Group():
mcp_json_file = gr.File(label="MCP server file", interactive=True, file_types=[".json"]) mcp_json_file = gr.File(label="MCP server json", interactive=True, file_types=[".json"])
mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False) mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False)
with gr.Group(): with gr.Group():
@@ -118,6 +118,7 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen
choices=[provider for provider, model in config.model_names.items()], choices=[provider for provider, model in config.model_names.items()],
label="Planner LLM Provider", label="Planner LLM Provider",
info="Select LLM provider for LLM", info="Select LLM provider for LLM",
value=None,
interactive=True interactive=True
) )
planner_llm_model_name = gr.Dropdown( planner_llm_model_name = gr.Dropdown(
@@ -201,7 +202,6 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen
interactive=True, interactive=True,
allow_custom_value=True, allow_custom_value=True,
choices=["auto", "json_schema", "function_calling", "None"], choices=["auto", "json_schema", "function_calling", "None"],
info="Tool Calls Function Name",
visible=True visible=True
) )
tab_components.update(dict( tab_components.update(dict(
@@ -228,6 +228,8 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen
mcp_json_file=mcp_json_file, mcp_json_file=mcp_json_file,
mcp_server_config=mcp_server_config, mcp_server_config=mcp_server_config,
)) ))
webui_manager.add_components("agent_settings", tab_components)
llm_provider.change( llm_provider.change(
fn=lambda x: gr.update(visible=x == "ollama"), fn=lambda x: gr.update(visible=x == "ollama"),
inputs=llm_provider, inputs=llm_provider,
@@ -236,23 +238,21 @@ def create_agent_settings_tab(webui_manager: WebuiManager) -> dict[str, Componen
llm_provider.change( llm_provider.change(
lambda provider: update_model_dropdown(provider), lambda provider: update_model_dropdown(provider),
inputs=[llm_provider], inputs=[llm_provider],
outputs=llm_model_name outputs=[llm_model_name]
) )
planner_llm_provider.change( planner_llm_provider.change(
fn=lambda x: gr.update(visible=x == "ollama"), fn=lambda x: gr.update(visible=x == "ollama"),
inputs=planner_llm_provider, inputs=[planner_llm_provider],
outputs=planner_ollama_num_ctx outputs=[planner_ollama_num_ctx]
) )
planner_llm_provider.change( planner_llm_provider.change(
lambda provider: update_model_dropdown(provider), lambda provider: update_model_dropdown(provider),
inputs=[planner_llm_provider], inputs=[planner_llm_provider],
outputs=planner_llm_model_name outputs=[planner_llm_model_name]
) )
mcp_json_file.change( mcp_json_file.change(
update_mcp_server, update_mcp_server,
inputs=mcp_json_file, inputs=[mcp_json_file],
outputs=[mcp_server_config, mcp_server_config] outputs=[mcp_server_config, mcp_server_config]
) )
return tab_components

View File

@@ -35,7 +35,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager) -> dict[str, Compon
) )
keep_browser_open = gr.Checkbox( keep_browser_open = gr.Checkbox(
label="Keep Browser Open", label="Keep Browser Open",
value=False, value=True,
info="Keep Browser Open between Tasks", info="Keep Browser Open between Tasks",
interactive=True interactive=True
) )
@@ -119,7 +119,9 @@ def create_browser_settings_tab(webui_manager: WebuiManager) -> dict[str, Compon
save_agent_history_path=save_agent_history_path, save_agent_history_path=save_agent_history_path,
save_download_path=save_download_path, save_download_path=save_download_path,
cdp_url=cdp_url, cdp_url=cdp_url,
wss_url=wss_url wss_url=wss_url,
window_h=window_h,
window_w=window_w,
) )
) )
return tab_components webui_manager.add_components("browser_settings", tab_components)

View File

@@ -1,62 +1,921 @@
import gradio as gr import gradio as gr
from gradio.components import Component from gradio.components import Component
import asyncio
import os
import json
import uuid
import logging
from datetime import datetime
from typing import List, Dict, Optional, Any, Set, Generator, AsyncGenerator, Union
from collections.abc import Awaitable
from langchain_core.language_models.chat_models import BaseChatModel
import base64
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext, BrowserContextConfig, BrowserContextWindowSize
from browser_use.agent.service import Agent
from browser_use.agent.views import AgentHistoryList
from browser_use.agent.views import ToolCallingMethod # Adjust import
from browser_use.agent.views import (
REQUIRED_LLM_API_ENV_VARS,
ActionResult,
AgentError,
AgentHistory,
AgentHistoryList,
AgentOutput,
AgentSettings,
AgentState,
AgentStepInfo,
StepMetadata,
ToolCallingMethod,
)
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext
from browser_use.browser.views import BrowserState, BrowserStateHistory
from src.webui.webui_manager import WebuiManager from src.webui.webui_manager import WebuiManager
from src.utils import config from src.controller.custom_controller import CustomController
from src.utils import llm_provider
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import CustomBrowserContext, CustomBrowserContextConfig
logger = logging.getLogger(__name__)
def create_browser_use_agent_tab(webui_manager: WebuiManager) -> dict[str, Component]: # --- Helper Functions --- (Defined at module level)
async def _initialize_llm(provider: Optional[str], model_name: Optional[str], temperature: float,
                          base_url: Optional[str], api_key: Optional[str], num_ctx: Optional[int] = None) -> Optional[
    BaseChatModel]:
    """Create the chat model for the given provider settings.

    Returns None when the provider or model name is missing, or when model
    creation raises (in which case the error is logged and a UI warning is
    shown).
    """
    if not (provider and model_name):
        logger.info("LLM Provider or Model Name not specified, LLM will be None.")
        return None

    try:
        logger.info(f"Initializing LLM: Provider={provider}, Model={model_name}, Temp={temperature}")
        # Empty strings are normalized to None; num_ctx is only forwarded for ollama
        return llm_provider.get_llm_model(
            provider=provider,
            model_name=model_name,
            temperature=temperature,
            base_url=base_url or None,
            api_key=api_key or None,
            num_ctx=num_ctx if provider == "ollama" else None
        )
    except Exception as e:
        logger.error(f"Failed to initialize LLM: {e}", exc_info=True)
        gr.Warning(
            f"Failed to initialize LLM '{model_name}' for provider '{provider}'. Please check settings. Error: {e}")
        return None
def _get_config_value(webui_manager: WebuiManager, comp_dict: Dict[gr.components.Component, Any], comp_id_suffix: str,
                      default: Any = None) -> Any:
    """Look up a component's value by ID suffix, trying the agent tab before the settings tabs.

    Component IDs have the form ``"tab_name.comp_name"``. The first tab whose
    ID resolves in the manager wins; its value is read from *comp_dict*, with
    *default* used when the component is not a key there. When no tab
    resolves, a warning is logged and *default* is returned.
    """
    for tab_name in ("browser_use_agent", "agent_settings", "browser_settings"):
        try:
            comp = webui_manager.get_component_by_id(f"{tab_name}.{comp_id_suffix}")
            return comp_dict.get(comp, default)
        except KeyError:
            # Not registered under this tab; try the next one
            continue

    logger.warning(f"Component with suffix '{comp_id_suffix}' not found in manager for value lookup.")
    return default
def _format_agent_output(model_output: AgentOutput) -> str:
"""Formats AgentOutput for display in the chatbot using JSON."""
content = ""
if model_output:
try:
# Directly use model_dump if actions and current_state are Pydantic models
action_dump = [action.model_dump(exclude_none=True) for action in model_output.action]
state_dump = model_output.current_state.model_dump(exclude_none=True)
model_output_dump = {
'current_state': state_dump,
'action': action_dump,
}
# Dump to JSON string with indentation
json_string = json.dumps(model_output_dump, indent=4, ensure_ascii=False)
# Wrap in <pre><code> for proper display in HTML
content = f"<pre><code class='language-json'>{json_string}</code></pre>"
except AttributeError as ae:
logger.error(
f"AttributeError during model dump: {ae}. Check if 'action' or 'current_state' or their items support 'model_dump'.")
content = f"<pre><code>Error: Could not format agent output (AttributeError: {ae}).\nRaw output: {str(model_output)}</code></pre>"
except Exception as e:
logger.error(f"Error formatting agent output: {e}", exc_info=True)
# Fallback to simple string representation on error
content = f"<pre><code>Error formatting agent output.\nRaw output:\n{str(model_output)}</code></pre>"
return content.strip()
# --- Updated Callback Implementation ---
async def _handle_new_step(webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int):
    """Callback for each step taken by the agent, including screenshot display.

    Appends one assistant message to ``webui_manager.bu_chat_history`` made of
    a step header, the step's screenshot (when ``state.screenshot`` is
    present) and the JSON-formatted agent output.
    """
    if not hasattr(webui_manager, 'bu_chat_history'):
        logger.error("Attribute 'bu_chat_history' not found in webui_manager! Cannot add chat message.")
        # Create the list so the message below can still be recorded
        webui_manager.bu_chat_history = []

    # The step number shown to the user is one less than the number passed in
    step_num -= 1
    logger.info(f"Step {step_num} completed.")

    # --- Screenshot Handling ---
    screenshot_html = ""
    screenshot_data = getattr(state, 'screenshot', None)
    if not screenshot_data:
        logger.debug(f"No screenshot available for step {step_num}.")
    else:
        try:
            # Basic validation: a string longer than 100 characters is treated as base64 image data
            looks_valid = isinstance(screenshot_data, str) and len(screenshot_data) > 100
            if looks_valid:
                img_tag = f'<img src="data:image/jpeg;base64,{screenshot_data}" alt="Step {step_num} Screenshot" style="max-width: 600px; max-height: 300px; object-fit:contain; margin-bottom: 10px;" />'
                screenshot_html = img_tag + "<br/>"
            else:
                logger.warning(
                    f"Screenshot for step {step_num} seems invalid (type: {type(screenshot_data)}, len: {len(screenshot_data) if isinstance(screenshot_data, str) else 'N/A'}).")
                screenshot_html = "**[Invalid screenshot data]**<br/>"
        except Exception as e:
            logger.error(f"Error processing or formatting screenshot for step {step_num}: {e}", exc_info=True)
            screenshot_html = "**[Error displaying screenshot]**<br/>"

    # --- Format Agent Output ---
    formatted_output = _format_agent_output(output)

    # --- Combine and Append to Chat ---
    step_header = f"--- **Step {step_num}** ---"
    final_content = f"{step_header}<br/>{screenshot_html}{formatted_output}"
    webui_manager.bu_chat_history.append({
        "role": "assistant",
        "content": final_content.strip()
    })

    await asyncio.sleep(0.05)
def _handle_done(webui_manager: WebuiManager, history: AgentHistoryList):
    """Callback when the agent finishes the task (success or failure).

    Logs the duration and input-token count, then appends a summary message
    (duration, tokens, final result if any, and either the errors or a success
    status) to ``webui_manager.bu_chat_history``.
    """
    logger.info(
        f"Agent task finished. Duration: {history.total_duration_seconds():.2f}s, Tokens: {history.total_input_tokens()}")

    summary_parts = ["**Task Completed**\n"]
    summary_parts.append(f"- Duration: {history.total_duration_seconds():.2f} seconds\n")
    summary_parts.append(f"- Total Input Tokens: {history.total_input_tokens()}\n")

    final_result = history.final_result()
    if final_result:
        summary_parts.append(f"- Final Result: {final_result}\n")

    # Errors count only when at least one entry in the list is truthy
    errors = history.errors()
    has_errors = bool(errors) and any(errors)
    if has_errors:
        summary_parts.append(f"- **Errors:**\n```\n{errors}\n```\n")
    else:
        summary_parts.append("- Status: Success\n")

    final_summary = "".join(summary_parts)
    webui_manager.bu_chat_history.append({"role": "assistant", "content": final_summary})
async def _ask_assistant_callback(webui_manager: WebuiManager, query: str, browser_context: BrowserContext) -> Dict[
    str, Any]:
    """Callback triggered by the agent's ask_for_assistant action.

    Posts the help request to the chat history, then waits (up to one hour)
    for ``webui_manager.bu_response_event`` to be set. The user's reply is
    read from ``webui_manager.bu_user_help_response``.

    Args:
        webui_manager: Holder of the chat history and of the response
            event/reply attributes used for the hand-off.
        query: The question or request the agent wants the user to handle.
        browser_context: Unused here; part of the callback signature.

    Returns:
        A dict with a single ``"response"`` key holding the user's reply, a
        timeout notice, or an internal-error notice.
    """
    logger.info("Agent requires assistance. Waiting for user input.")

    # The chat history attribute used throughout this module is `bu_chat_history`
    if not hasattr(webui_manager, 'bu_chat_history'):
        logger.error("Chat history not found in webui_manager during ask_assistant!")
        return {"response": "Internal Error: Cannot display help request."}

    webui_manager.bu_chat_history.append({"role": "assistant",
                                          "content": f"**Need Help:** {query}\nPlease provide information or perform the required action in the browser, then type your response/confirmation below and click 'Submit Response'."})

    # Use state stored in webui_manager
    webui_manager.bu_response_event = asyncio.Event()
    webui_manager.bu_user_help_response = None  # Reset previous response

    try:
        logger.info("Waiting for user response event...")
        await asyncio.wait_for(webui_manager.bu_response_event.wait(), timeout=3600.0)  # Long timeout
        logger.info("User response event received.")
    except asyncio.TimeoutError:
        logger.warning("Timeout waiting for user assistance.")
        webui_manager.bu_chat_history.append(
            {"role": "assistant", "content": "**Timeout:** No response received. Trying to proceed."})
        return {"response": "Timeout: User did not respond."}  # Inform the agent
    finally:
        # Always clear the event (also on cancellation) so the next request starts fresh
        webui_manager.bu_response_event = None

    response = webui_manager.bu_user_help_response
    webui_manager.bu_chat_history.append({"role": "user", "content": response})  # Show user response in chat
    return {"response": response}
async def capture_screenshot(browser_context):
    """Capture and encode a screenshot of the active page.

    The first Playwright context of the browser is used. Within it, the last
    page whose URL is not ``about:blank`` is captured; if every page is blank,
    the first page is used.

    Args:
        browser_context: Object exposing ``browser.playwright_browser``.

    Returns:
        The JPEG screenshot as a base64 string, or None when there is no
        context, no page, or the screenshot call fails.
    """
    playwright_browser = browser_context.browser.playwright_browser
    if not playwright_browser or not playwright_browser.contexts:
        return None

    playwright_context = playwright_browser.contexts[0]
    pages = playwright_context.pages if playwright_context else None
    if not pages:
        return None

    active_page = pages[0]
    for page in pages:
        if page.url != "about:blank":
            active_page = page

    try:
        screenshot = await active_page.screenshot(
            type='jpeg',
            quality=75,
            scale="css"
        )
    except Exception:
        # Best effort: report "no screenshot" to the caller, but leave a trace of why
        logger.debug("Failed to capture screenshot", exc_info=True)
        return None
    return base64.b64encode(screenshot).decode('utf-8')
# --- Core Agent Execution Logic --- (Needs access to webui_manager)
async def run_agent_task(
        webui_manager: WebuiManager,
        components: Dict[gr.components.Component, Any],
) -> AsyncGenerator[Dict[gr.components.Component, Any], None]:
    """Handle the entire lifecycle of initializing and running the agent.

    Reads the task text and all agent/browser settings from ``components``,
    creates (or reuses) the browser, browser context, controller and agent
    stored on ``webui_manager``, starts the agent run as an asyncio task and
    polls it, yielding UI updates until it finishes.

    Args:
        webui_manager: Manager holding the UI component registry and the
            ``bu_*`` agent/browser state shared with the button handlers.
        components: Mapping of Gradio components to their current values.

    Yields:
        Dicts mapping Gradio components to the updates to apply to them.
    """
    # --- Get Components ---
    # Need handles to specific UI components to update them
    user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input")
    run_button_comp = webui_manager.get_component_by_id("browser_use_agent.run_button")
    stop_button_comp = webui_manager.get_component_by_id("browser_use_agent.stop_button")
    pause_resume_button_comp = webui_manager.get_component_by_id("browser_use_agent.pause_resume_button")
    clear_button_comp = webui_manager.get_component_by_id("browser_use_agent.clear_button")
    chatbot_comp = webui_manager.get_component_by_id("browser_use_agent.chatbot")
    history_file_comp = webui_manager.get_component_by_id("browser_use_agent.agent_history_file")
    gif_comp = webui_manager.get_component_by_id("browser_use_agent.recording_gif")
    browser_view_comp = webui_manager.get_component_by_id("browser_use_agent.browser_view")

    # --- 1. Get Task and Initial UI Update ---
    task = components.get(user_input_comp, "").strip()
    if not task:
        gr.Warning("Please enter a task.")
        yield {run_button_comp: gr.update(interactive=True)}
        return

    webui_manager.bu_chat_history.append({"role": "user", "content": task})

    yield {
        user_input_comp: gr.Textbox(value="", interactive=False, placeholder="Agent is running..."),
        run_button_comp: gr.Button(value="⏳ Running...", interactive=False),
        stop_button_comp: gr.Button(interactive=True),
        pause_resume_button_comp: gr.Button(value="⏸️ Pause", interactive=True),
        clear_button_comp: gr.Button(interactive=False),
        chatbot_comp: gr.update(value=webui_manager.bu_chat_history),
        history_file_comp: gr.update(value=None),
        gif_comp: gr.update(value=None),
    }

    # Everything from here on runs inside the try: the UI was just locked into
    # the "running" state, so any setup failure (bad MCP JSON, LLM init error,
    # non-numeric window size, ...) must reach the handler at the bottom that
    # unlocks it again.
    try:
        # --- Agent Settings ---
        # Access settings values via components dict, getting IDs from webui_manager
        def get_setting(key, default=None):
            comp = webui_manager.id_to_component.get(f"agent_settings.{key}")
            return components.get(comp, default) if comp else default

        override_system_prompt = get_setting("override_system_prompt") or None
        extend_system_prompt = get_setting("extend_system_prompt") or None
        llm_provider_name = get_setting("llm_provider", None)  # None if not found
        llm_model_name = get_setting("llm_model_name", None)
        llm_temperature = get_setting("llm_temperature", 0.6)
        use_vision = get_setting("use_vision", True)
        ollama_num_ctx = get_setting("ollama_num_ctx", 16000)
        llm_base_url = get_setting("llm_base_url") or None
        llm_api_key = get_setting("llm_api_key") or None
        max_steps = get_setting("max_steps", 100)
        max_actions = get_setting("max_actions", 10)
        max_input_tokens = get_setting("max_input_tokens", 128000)
        tool_calling_str = get_setting("tool_calling_method", "auto")
        # The UI encodes "no tool calling method" as the literal string "None".
        tool_calling_method = tool_calling_str if tool_calling_str != "None" else None

        mcp_server_config_comp = webui_manager.id_to_component.get("agent_settings.mcp_server_config")
        mcp_server_config_str = components.get(mcp_server_config_comp) if mcp_server_config_comp else None
        mcp_server_config = json.loads(mcp_server_config_str) if mcp_server_config_str else None

        # Planner LLM Settings (Optional)
        planner_llm_provider_name = get_setting("planner_llm_provider") or None
        planner_llm = None
        planner_use_vision = False
        if planner_llm_provider_name:
            planner_llm_model_name = get_setting("planner_llm_model_name")
            planner_llm_temperature = get_setting("planner_llm_temperature", 0.6)
            planner_ollama_num_ctx = get_setting("planner_ollama_num_ctx", 16000)
            planner_llm_base_url = get_setting("planner_llm_base_url") or None
            planner_llm_api_key = get_setting("planner_llm_api_key") or None
            planner_use_vision = get_setting("planner_use_vision", False)

            planner_llm = await _initialize_llm(
                planner_llm_provider_name, planner_llm_model_name, planner_llm_temperature,
                planner_llm_base_url, planner_llm_api_key,
                planner_ollama_num_ctx if planner_llm_provider_name == "ollama" else None
            )

        # --- Browser Settings ---
        def get_browser_setting(key, default=None):
            comp = webui_manager.id_to_component.get(f"browser_settings.{key}")
            return components.get(comp, default) if comp else default

        browser_binary_path = get_browser_setting("browser_binary_path") or None
        browser_user_data_dir = get_browser_setting("browser_user_data_dir") or None
        use_own_browser = get_browser_setting("use_own_browser", False)
        keep_browser_open = get_browser_setting("keep_browser_open", False)
        headless = get_browser_setting("headless", False)
        disable_security = get_browser_setting("disable_security", True)
        window_w = int(get_browser_setting("window_w", 1280))
        window_h = int(get_browser_setting("window_h", 1100))
        cdp_url = get_browser_setting("cdp_url") or None
        wss_url = get_browser_setting("wss_url") or None
        save_recording_path = get_browser_setting("save_recording_path") or None
        save_trace_path = get_browser_setting("save_trace_path") or None
        # An emptied textbox yields "", which os.makedirs rejects; fall back to
        # the default so the history directory is always usable.
        save_agent_history_path = get_browser_setting("save_agent_history_path") or "./tmp/agent_history"
        save_download_path = get_browser_setting("save_download_path", "./tmp/downloads")

        # Live-view size in viewport units: fixed width, height scaled to the
        # configured window's aspect ratio.
        stream_vw = 80
        stream_vh = int(80 * window_h // window_w)

        os.makedirs(save_agent_history_path, exist_ok=True)
        if save_recording_path:
            os.makedirs(save_recording_path, exist_ok=True)
        if save_trace_path:
            os.makedirs(save_trace_path, exist_ok=True)
        if save_download_path:
            os.makedirs(save_download_path, exist_ok=True)

        # --- 2. Initialize LLM ---
        main_llm = await _initialize_llm(
            llm_provider_name, llm_model_name, llm_temperature, llm_base_url, llm_api_key,
            ollama_num_ctx if llm_provider_name == "ollama" else None
        )

        # Pass the webui_manager instance to the callback when wrapping it
        async def ask_callback_wrapper(query: str, browser_context: BrowserContext) -> Dict[str, Any]:
            return await _ask_assistant_callback(webui_manager, query, browser_context)

        if not webui_manager.bu_controller:
            webui_manager.bu_controller = CustomController(ask_assistant_callback=ask_callback_wrapper)
            await webui_manager.bu_controller.setup_mcp_client(mcp_server_config)

        # --- 4. Initialize Browser and Context ---
        should_close_browser_on_finish = not keep_browser_open

        # Close existing resources if not keeping open
        if not keep_browser_open:
            if webui_manager.bu_browser_context:
                logger.info("Closing previous browser context.")
                await webui_manager.bu_browser_context.close()
                webui_manager.bu_browser_context = None
            if webui_manager.bu_browser:
                logger.info("Closing previous browser.")
                await webui_manager.bu_browser.close()
                webui_manager.bu_browser = None

        # Create Browser if needed
        if not webui_manager.bu_browser:
            logger.info("Launching new browser instance.")
            extra_args = [f"--window-size={window_w},{window_h}"]
            if browser_user_data_dir:
                extra_args.append(f"--user-data-dir={browser_user_data_dir}")

            if use_own_browser:
                # The CHROME_PATH environment variable wins over the UI field.
                browser_binary_path = os.getenv("CHROME_PATH", None) or browser_binary_path
                if browser_binary_path == "":
                    browser_binary_path = None
                chrome_user_data = os.getenv("CHROME_USER_DATA", None)
                if chrome_user_data:
                    extra_args += [f"--user-data-dir={chrome_user_data}"]
            else:
                browser_binary_path = None

            webui_manager.bu_browser = CustomBrowser(
                config=BrowserConfig(
                    headless=headless,
                    disable_security=disable_security,
                    browser_binary_path=browser_binary_path,
                    extra_browser_args=extra_args,
                    wss_url=wss_url,
                    cdp_url=cdp_url,
                )
            )

        # Create Context if needed
        if not webui_manager.bu_browser_context:
            logger.info("Creating new browser context.")
            context_config = CustomBrowserContextConfig(
                trace_path=save_trace_path or None,
                save_recording_path=save_recording_path or None,
                save_downloads_path=save_download_path or None,
                browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h)
            )
            if not webui_manager.bu_browser:
                raise ValueError("Browser not initialized, cannot create context.")
            webui_manager.bu_browser_context = await webui_manager.bu_browser.new_context(config=context_config)

        # --- 5. Initialize or Update Agent ---
        webui_manager.bu_agent_task_id = str(uuid.uuid4())  # New ID for this task run
        task_dir = os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id)
        os.makedirs(task_dir, exist_ok=True)
        history_file = os.path.join(task_dir, f"{webui_manager.bu_agent_task_id}.json")
        gif_path = os.path.join(task_dir, f"{webui_manager.bu_agent_task_id}.gif")

        # Pass the webui_manager to callbacks when wrapping them
        async def step_callback_wrapper(state: BrowserState, output: AgentOutput, step_num: int):
            await _handle_new_step(webui_manager, state, output, step_num)

        def done_callback_wrapper(history: AgentHistoryList):
            _handle_done(webui_manager, history)

        if not webui_manager.bu_agent:
            logger.info(f"Initializing new agent for task: {task}")
            if not webui_manager.bu_browser or not webui_manager.bu_browser_context:
                raise ValueError("Browser or Context not initialized, cannot create agent.")

            webui_manager.bu_agent = Agent(
                task=task,
                llm=main_llm,
                browser=webui_manager.bu_browser,
                browser_context=webui_manager.bu_browser_context,
                controller=webui_manager.bu_controller,
                register_new_step_callback=step_callback_wrapper,
                register_done_callback=done_callback_wrapper,
                # Agent settings
                use_vision=use_vision,
                override_system_message=override_system_prompt,
                extend_system_message=extend_system_prompt,
                max_input_tokens=max_input_tokens,
                max_actions_per_step=max_actions,
                tool_calling_method=tool_calling_method,
                planner_llm=planner_llm,
                use_vision_for_planner=planner_use_vision if planner_llm else False,
                save_conversation_path=history_file,
            )
            webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
            webui_manager.bu_agent.settings.generate_gif = gif_path
        else:
            # Reuse the existing agent: give it the new task and output paths.
            webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
            webui_manager.bu_agent.add_new_task(task)
            webui_manager.bu_agent.settings.generate_gif = gif_path

        # --- 6. Run Agent Task and Stream Updates ---
        agent_run_coro = webui_manager.bu_agent.run(max_steps=max_steps)
        agent_task = asyncio.create_task(agent_run_coro)
        webui_manager.bu_current_task = agent_task  # Store the task

        last_chat_len = len(webui_manager.bu_chat_history)
        while not agent_task.done():
            is_paused = webui_manager.bu_agent.state.paused
            is_stopped = webui_manager.bu_agent.state.stopped

            # Check for pause state
            if is_paused:
                yield {
                    pause_resume_button_comp: gr.update(value="▶️ Resume", interactive=True),
                    run_button_comp: gr.update(value="⏸️ Paused", interactive=False),
                    stop_button_comp: gr.update(interactive=True),  # Allow stop while paused
                }
                # Wait until pause is released or task is stopped/done
                while is_paused and not agent_task.done():
                    # Re-check agent state in loop
                    is_paused = webui_manager.bu_agent.state.paused
                    is_stopped = webui_manager.bu_agent.state.stopped
                    if is_stopped:  # Stop signal received while paused
                        break
                    await asyncio.sleep(0.2)

                if agent_task.done() or is_stopped:  # If stopped or task finished while paused
                    break

                # If resumed, yield UI update
                yield {
                    pause_resume_button_comp: gr.update(value="⏸️ Pause", interactive=True),
                    run_button_comp: gr.update(value="⏳ Running...", interactive=False),
                }

            # Check if agent stopped itself or stop button was pressed (which sets agent.state.stopped)
            if is_stopped:
                logger.info("Agent has stopped (internally or via stop button).")
                if not agent_task.done():
                    # Ensure the task coroutine finishes if agent just set flag
                    try:
                        await asyncio.wait_for(agent_task, timeout=1.0)  # Give it a moment to exit run()
                    except asyncio.TimeoutError:
                        logger.warning("Agent task did not finish quickly after stop signal, cancelling.")
                        agent_task.cancel()
                    except Exception:
                        # Deliberately ignored here: a task that failed while
                        # stopping is inspected again in the finalization step
                        # below, where its exception is reported.
                        pass
                break  # Exit the streaming loop

            # Check if agent is asking for help (via response_event)
            update_dict = {}
            if webui_manager.bu_response_event is not None:
                update_dict = {
                    user_input_comp: gr.update(placeholder="Agent needs help. Enter response and submit.",
                                               interactive=True),
                    run_button_comp: gr.update(value="✔️ Submit Response", interactive=True),
                    pause_resume_button_comp: gr.update(interactive=False),
                    stop_button_comp: gr.update(interactive=False),
                    chatbot_comp: gr.update(value=webui_manager.bu_chat_history)
                }
                last_chat_len = len(webui_manager.bu_chat_history)
                yield update_dict
                # Wait until response is submitted or task finishes
                while webui_manager.bu_response_event is not None and not agent_task.done():
                    await asyncio.sleep(0.2)
                # Restore UI after response submitted or if task ended unexpectedly
                if not agent_task.done():
                    yield {
                        user_input_comp: gr.update(placeholder="Agent is running...", interactive=False),
                        run_button_comp: gr.update(value="⏳ Running...", interactive=False),
                        pause_resume_button_comp: gr.update(interactive=True),
                        stop_button_comp: gr.update(interactive=True),
                    }
                    # The help-mode updates were already sent; drop them so the
                    # end-of-iteration yield does not re-apply them and undo
                    # the restore above.
                    update_dict = {}
                else:
                    break  # Task finished while waiting for response

            # Update Chatbot if new messages arrived via callbacks
            if len(webui_manager.bu_chat_history) > last_chat_len:
                update_dict[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history)
                last_chat_len = len(webui_manager.bu_chat_history)

            # Update Browser View (only streamed when running headless)
            if headless and webui_manager.bu_browser_context:
                try:
                    screenshot_b64 = await capture_screenshot(webui_manager.bu_browser_context)
                    if screenshot_b64:
                        html_content = f'<img src="data:image/jpeg;base64,{screenshot_b64}" style="width:{stream_vw}vw; height:{stream_vh}vh ; border:1px solid #ccc;">'
                    else:
                        html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Waiting for browser session...</h1>"
                    update_dict[browser_view_comp] = gr.update(value=html_content, visible=True)
                except Exception as e:
                    logger.debug(f"Failed to capture screenshot: {e}")
                    update_dict[browser_view_comp] = gr.update(value="<div style='...'>Error loading view...</div>",
                                                               visible=True)
            else:
                update_dict[browser_view_comp] = gr.update(visible=False)

            # Yield accumulated updates
            if update_dict:
                yield update_dict

            await asyncio.sleep(0.1)  # Polling interval

        # --- 7. Task Finalization ---
        webui_manager.bu_agent.state.paused = False
        webui_manager.bu_agent.state.stopped = False
        final_update = {}
        try:
            logger.info("Agent task completing...")
            # Await the task to ensure completion and surface its exception, if any
            if not agent_task.done():
                await agent_task  # Retrieve result/exception
            elif agent_task.exception():  # Check if task finished with exception
                agent_task.result()  # Raise the exception to be caught below
            logger.info("Agent task completed processing.")

            logger.info(f"Explicitly saving agent history to: {history_file}")
            webui_manager.bu_agent.save_history(history_file)

            if os.path.exists(history_file):
                final_update[history_file_comp] = gr.File(value=history_file)

            if gif_path and os.path.exists(gif_path):
                logger.info(f"GIF found at: {gif_path}")
                final_update[gif_comp] = gr.Image(value=gif_path)

        except asyncio.CancelledError:
            logger.info("Agent task was cancelled.")
            # Avoid stacking duplicate cancellation notices in the chat.
            if not any("Cancelled" in msg.get("content", "") for msg in webui_manager.bu_chat_history if
                       msg.get("role") == "assistant"):
                webui_manager.bu_chat_history.append({"role": "assistant", "content": "**Task Cancelled**."})
            final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history)
        except Exception as e:
            logger.error(f"Error during agent execution: {e}", exc_info=True)
            error_message = f"**Agent Execution Error:**\n```\n{type(e).__name__}: {e}\n```"
            if not any(error_message in msg.get("content", "") for msg in webui_manager.bu_chat_history if
                       msg.get("role") == "assistant"):
                webui_manager.bu_chat_history.append({"role": "assistant", "content": error_message})
            final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history)
            # gr.Error only shows when raised, which would abort the final UI
            # reset below; gr.Warning displays the alert when called.
            gr.Warning(f"Agent execution failed: {e}")

        finally:
            webui_manager.bu_current_task = None  # Clear the task reference

            # Close browser/context if requested
            if should_close_browser_on_finish:
                if webui_manager.bu_browser_context:
                    logger.info("Closing browser context after task.")
                    await webui_manager.bu_browser_context.close()
                    webui_manager.bu_browser_context = None
                if webui_manager.bu_browser:
                    logger.info("Closing browser after task.")
                    await webui_manager.bu_browser.close()
                    webui_manager.bu_browser = None

            # --- 8. Final UI Update ---
            final_update.update({
                user_input_comp: gr.update(value="", interactive=True, placeholder="Enter your next task..."),
                run_button_comp: gr.update(value="▶️ Submit Task", interactive=True),
                stop_button_comp: gr.update(interactive=False),
                pause_resume_button_comp: gr.update(value="⏸️ Pause", interactive=False),
                clear_button_comp: gr.update(interactive=True),
                # Ensure final chat history is shown
                chatbot_comp: gr.update(value=webui_manager.bu_chat_history)
            })
            yield final_update

    except Exception as e:
        # Catch errors during setup (before agent run starts)
        logger.error(f"Error setting up agent task: {e}", exc_info=True)
        webui_manager.bu_current_task = None  # Ensure state is reset
        yield {
            user_input_comp: gr.update(interactive=True, placeholder="Error during setup. Enter task..."),
            run_button_comp: gr.update(value="▶️ Submit Task", interactive=True),
            stop_button_comp: gr.update(interactive=False),
            pause_resume_button_comp: gr.update(value="⏸️ Pause", interactive=False),
            clear_button_comp: gr.update(interactive=True),
            chatbot_comp: gr.update(
                value=webui_manager.bu_chat_history + [{"role": "assistant", "content": f"**Setup Error:** {e}"}]),
        }
# --- Button Click Handlers --- (Need access to webui_manager)
async def handle_submit(webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]):
    """Dispatch a click on the main 'Submit' button (or Enter in the textbox).

    Three cases, checked in order:
      1. The agent is waiting for user assistance: store the text as the
         help response and set the pending response event.
      2. A task is already running: show an info notice and change nothing.
      3. Otherwise: start a new task and relay every update that
         ``run_agent_task`` yields.

    Yields:
        Dicts mapping Gradio components to the updates to apply to them.
    """
    user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input")
    user_input_value = components.get(user_input_comp, "").strip()

    # Case 1: the agent asked for help and has not been answered yet.
    awaiting_help = webui_manager.bu_response_event and not webui_manager.bu_response_event.is_set()
    if awaiting_help:
        logger.info(f"User submitted assistance: {user_input_value}")
        if user_input_value:
            webui_manager.bu_user_help_response = user_input_value
        else:
            webui_manager.bu_user_help_response = "User provided no text response."
        webui_manager.bu_response_event.set()
        # Further UI updates come from the main loop reacting to the event.
        yield {
            user_input_comp: gr.update(value="", interactive=False, placeholder="Waiting for agent to continue..."),
            webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(value="⏳ Running...",
                                                                                        interactive=False)
        }
        return

    # Case 2: a task is in flight and is not asking for help.
    if webui_manager.bu_current_task and not webui_manager.bu_current_task.done():
        logger.warning("Submit button clicked while agent is already running and not asking for help.")
        gr.Info("Agent is currently running. Please wait or use Stop/Pause.")
        yield {}  # No change
        return

    # Case 3: nothing is running, so treat the input as a new task.
    logger.info("Submit button clicked for new task.")
    async for task_update in run_agent_task(webui_manager, components):
        yield task_update
async def handle_stop(webui_manager: WebuiManager):
    """React to a click on the 'Stop' button.

    When an agent task is in flight, flags the agent as stopped (and not
    paused) and returns updates that disable the control buttons. Otherwise
    returns updates that reset the buttons to their idle state.

    Returns:
        Dict mapping Gradio components to the updates to apply to them.
    """
    logger.info("Stop button clicked.")

    current_agent = webui_manager.bu_agent
    running_task = webui_manager.bu_current_task
    component = webui_manager.get_component_by_id

    if not (current_agent and running_task and not running_task.done()):
        logger.warning("Stop clicked but agent is not running or task is already done.")
        # Reset UI just in case it's stuck
        return {
            component("browser_use_agent.run_button"): gr.update(interactive=True),
            component("browser_use_agent.stop_button"): gr.update(interactive=False),
            component("browser_use_agent.pause_resume_button"): gr.update(interactive=False),
            component("browser_use_agent.clear_button"): gr.update(interactive=True),
        }

    # Signal the agent through its state flags; a stopped agent must not
    # also remain paused.
    current_agent.state.stopped = True
    current_agent.state.paused = False
    return {
        component("browser_use_agent.stop_button"): gr.update(interactive=False, value="⏹️ Stopping..."),
        component("browser_use_agent.pause_resume_button"): gr.update(interactive=False),
        component("browser_use_agent.run_button"): gr.update(interactive=False),
    }
async def handle_pause_resume(webui_manager: WebuiManager):
    """Toggle the running agent between paused and resumed.

    Returns:
        A dict with an optimistic label update for the pause/resume button,
        or an empty dict when no agent task is currently running.
    """
    current_agent = webui_manager.bu_agent
    running_task = webui_manager.bu_current_task

    if not (current_agent and running_task and not running_task.done()):
        logger.warning("Pause/Resume clicked but agent is not running or doesn't support state.")
        return {}  # No change

    if current_agent.state.paused:
        logger.info("Resume button clicked.")
        current_agent.resume()
        next_label = "⏸️ Pause"
    else:
        logger.info("Pause button clicked.")
        current_agent.pause()
        next_label = "▶️ Resume"

    # Optimistic update: the label is switched immediately from here.
    toggle_button = webui_manager.get_component_by_id("browser_use_agent.pause_resume_button")
    return {toggle_button: gr.update(value=next_label, interactive=True)}
async def handle_clear(webui_manager: WebuiManager):
    """React to a click on the 'Clear' button.

    Stops any running agent task, closes the controller's MCP client, drops
    the agent and resets the chat/assistance state held on ``webui_manager``.
    The browser and browser context stored on the manager are not touched
    here.

    Returns:
        Dict mapping Gradio components to the updates that reset the tab.
    """
    logger.info("Clear button clicked.")

    # Stop any running task first
    task = webui_manager.bu_current_task
    if task and not task.done():
        logger.info("Clearing requires stopping the current task.")
        if webui_manager.bu_agent:
            webui_manager.bu_agent.stop()
        try:
            await asyncio.wait_for(task, timeout=2.0)  # Wait briefly
        except (asyncio.CancelledError, asyncio.TimeoutError):
            pass
        except Exception as e:
            logger.warning(f"Error stopping task on clear: {e}")

    # Cancel through the local reference: run_agent_task resets
    # webui_manager.bu_current_task to None when the task finishes, which can
    # happen during the wait above, and it is also None when no task was ever
    # started.
    if task and not task.done():
        task.cancel()
    webui_manager.bu_current_task = None

    if webui_manager.bu_controller:
        await webui_manager.bu_controller.close_mcp_client()
        webui_manager.bu_controller = None
    webui_manager.bu_agent = None

    # Reset state stored in manager
    webui_manager.bu_chat_history = []
    webui_manager.bu_response_event = None
    webui_manager.bu_user_help_response = None
    webui_manager.bu_agent_task_id = None

    logger.info("Agent state and browser resources cleared.")

    # Reset UI components
    component = webui_manager.get_component_by_id
    return {
        component("browser_use_agent.chatbot"): gr.update(value=[]),
        component("browser_use_agent.user_input"): gr.update(value="", placeholder="Enter your task here..."),
        component("browser_use_agent.agent_history_file"): gr.update(value=None),
        component("browser_use_agent.recording_gif"): gr.update(value=None),
        component("browser_use_agent.browser_view"): gr.update(
            value="<div style='...'>Browser Cleared</div>"),
        component("browser_use_agent.run_button"): gr.update(value="▶️ Submit Task", interactive=True),
        component("browser_use_agent.stop_button"): gr.update(interactive=False),
        component("browser_use_agent.pause_resume_button"): gr.update(value="⏸️ Pause", interactive=False),
        component("browser_use_agent.clear_button"): gr.update(interactive=True),
    }
# --- Tab Creation Function ---
def create_browser_use_agent_tab(webui_manager: WebuiManager):
""" """
Create the run agent tab Create the run agent tab, defining UI, state, and handlers.
""" """
input_components = set(webui_manager.get_components()) webui_manager.init_browser_use_agent()
# --- Define UI Components ---
tab_components = {} tab_components = {}
with gr.Column():
chatbot = gr.Chatbot(type='messages', label="Chat History", height=600) chatbot = gr.Chatbot(
user_input = gr.Textbox( lambda: webui_manager.bu_chat_history, # Load history dynamically
label="User Input", elem_id="browser_use_chatbot",
lines=3, label="Agent Interaction",
value="go to google.com and type 'OpenAI' click search and give me the first url", type="messages",
interactive=True height=600,
) show_copy_button=True,
bubble_full_width=False,
with gr.Row():
stop_button = gr.Button("⏹️ Stop", interactive=False, variant="stop", scale=2)
clear_button = gr.Button("🧹 Clear", interactive=True, variant="stop", scale=2)
run_button = gr.Button("▶️ Summit", variant="primary", scale=3)
browser_view = gr.HTML(
value="<h1 style='width:80vw; height:50vh'>Waiting for browser session...</h1>",
label="Browser Live View",
visible=False
)
with gr.Row():
agent_final_result = gr.Textbox(
label="Final Result", lines=3, show_label=True, interactive=False
) )
agent_errors = gr.Textbox( user_input = gr.Textbox(
label="Errors", lines=3, show_label=True, interactive=False label="Your Task or Response",
placeholder="Enter your task here or provide assistance when asked.",
lines=3,
interactive=True,
elem_id="user_input"
) )
with gr.Row():
stop_button = gr.Button("⏹️ Stop", interactive=False, variant="stop", scale=1)
pause_resume_button = gr.Button("⏸️ Pause", interactive=False, variant="secondary", scale=1)
clear_button = gr.Button("🗑️ Clear", interactive=True, variant="secondary", scale=1)
run_button = gr.Button("▶️ Submit Task", variant="primary", scale=2)
with gr.Row(): browser_view = gr.HTML(
agent_trace_file = gr.File(label="Trace File", interactive=False) value="<div style='width:100%; height:50vh; display:flex; justify-content:center; align-items:center; border:1px solid #ccc; background-color:#f0f0f0;'><p>Browser View (Requires Headless=True)</p></div>",
agent_history_file = gr.File(label="Agent History", interactive=False) label="Browser Live View",
elem_id="browser_view",
visible=False,
)
with gr.Column():
gr.Markdown("### Task Outputs")
agent_history_file = gr.File(label="Agent History JSON", interactive=False)
recording_gif = gr.Image(label="Task Recording GIF", format="gif", interactive=False,
type="filepath")
recording_gif = gr.Image(label="Result GIF", format="gif", interactive=False) # --- Store Components in Manager ---
tab_components.update( tab_components.update(
dict( dict(
chatbot=chatbot, chatbot=chatbot, user_input=user_input, clear_button=clear_button,
user_input=user_input, run_button=run_button, stop_button=stop_button, pause_resume_button=pause_resume_button,
clear_button=clear_button, agent_history_file=agent_history_file, recording_gif=recording_gif,
run_button=run_button,
stop_button=stop_button,
agent_final_result=agent_final_result,
agent_errors=agent_errors,
agent_trace_file=agent_trace_file,
agent_history_file=agent_history_file,
recording_gif=recording_gif,
browser_view=browser_view browser_view=browser_view
) )
) )
return tab_components webui_manager.add_components("browser_use_agent", tab_components) # Use "browser_use_agent" as tab_name prefix
all_managed_components = set(webui_manager.get_components()) # Get all components known to manager
run_tab_outputs = list(tab_components.values())
async def submit_wrapper(components_dict: Dict[Component, Any]) -> AsyncGenerator[Dict[Component, Any], None]:
"""Wrapper for handle_submit that yields its results."""
# handle_submit is an async generator, iterate and yield
async for update in handle_submit(webui_manager, components_dict):
yield update
async def stop_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
"""Wrapper for handle_stop."""
# handle_stop is async def but returns a single dict. We yield it once.
update_dict = await handle_stop(webui_manager)
yield update_dict # Yield the final dictionary
async def pause_resume_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
"""Wrapper for handle_pause_resume."""
update_dict = await handle_pause_resume(webui_manager)
yield update_dict
async def clear_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
"""Wrapper for handle_clear."""
update_dict = await handle_clear(webui_manager)
yield update_dict
# --- Connect Event Handlers using the Wrappers --
run_button.click(
fn=submit_wrapper,
inputs=all_managed_components,
outputs=run_tab_outputs
)
user_input.submit(
fn=submit_wrapper,
inputs=all_managed_components,
outputs=run_tab_outputs
)
stop_button.click(
fn=stop_wrapper,
inputs=None,
outputs=run_tab_outputs
)
pause_resume_button.click(
fn=pause_resume_wrapper,
inputs=None,
outputs=run_tab_outputs
)
clear_button.click(
fn=clear_wrapper,
inputs=None,
outputs=run_tab_outputs
)

View File

@@ -38,4 +38,4 @@ def create_deep_research_agent_tab(webui_manager: WebuiManager) -> dict[str, Com
markdown_download=markdown_download, markdown_download=markdown_download,
) )
) )
return tab_components webui_manager.add_components("deep_research_agent", tab_components)

View File

@@ -34,16 +34,17 @@ def create_load_save_config_tab(webui_manager: WebuiManager) -> dict[str, Compon
config_file=config_file, config_file=config_file,
)) ))
webui_manager.add_components("load_save_config", tab_components)
save_config_button.click( save_config_button.click(
fn=webui_manager.save_current_config, fn=webui_manager.save_config,
inputs=[], inputs=set(webui_manager.get_components()),
outputs=[config_status] outputs=[config_status]
) )
load_config_button.click( load_config_button.click(
fn=webui_manager.load_config, fn=webui_manager.load_config,
inputs=[config_file], inputs=[config_file],
outputs=[config_status] outputs=webui_manager.get_components(),
) )
return tab_components

View File

@@ -32,6 +32,9 @@ def create_ui(theme_name="Ocean"):
text-align: center; text-align: center;
margin-bottom: 20px; margin-bottom: 20px;
} }
.tab-header-text {
text-align: center;
}
.theme-section { .theme-section {
margin-bottom: 10px; margin-bottom: 10px;
padding: 15px; padding: 15px;
@@ -67,18 +70,26 @@ def create_ui(theme_name="Ocean"):
with gr.Tabs() as tabs: with gr.Tabs() as tabs:
with gr.TabItem("⚙️ Agent Settings"): with gr.TabItem("⚙️ Agent Settings"):
ui_manager.add_components("agent_settings", create_agent_settings_tab(ui_manager)) create_agent_settings_tab(ui_manager)
with gr.TabItem("🌐 Browser Settings"): with gr.TabItem("🌐 Browser Settings"):
ui_manager.add_components("browser_settings", create_browser_settings_tab(ui_manager)) create_browser_settings_tab(ui_manager)
with gr.TabItem("🤖 Run Agent"): with gr.TabItem("🤖 Run Agent"):
ui_manager.add_components("browser_use_agent", create_browser_use_agent_tab(ui_manager)) create_browser_use_agent_tab(ui_manager)
with gr.TabItem("🧐 Deep Research"): with gr.TabItem("🎁 Agent Collections"):
ui_manager.add_components("deep_research_agent", create_deep_research_agent_tab(ui_manager)) gr.Markdown(
"""
### Agents built on Browser-Use
""",
elem_classes=["tab-header-text"],
)
with gr.Tabs():
with gr.TabItem("Deep Research"):
create_deep_research_agent_tab(ui_manager)
with gr.TabItem("📁 Load & Save Config"): with gr.TabItem("📁 Load & Save Config"):
ui_manager.add_components("load_save_config", create_load_save_config_tab(ui_manager)) create_load_save_config_tab(ui_manager)
return demo return demo

View File

@@ -4,11 +4,17 @@ from typing import TYPE_CHECKING
import os import os
import gradio as gr import gradio as gr
from datetime import datetime from datetime import datetime
from typing import Optional, Dict, List
import uuid
import asyncio
from gradio.components import Component from gradio.components import Component
from browser_use.browser.browser import Browser from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext from browser_use.browser.context import BrowserContext
from browser_use.agent.service import Agent from browser_use.agent.service import Agent
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import CustomBrowserContext
from src.controller.custom_controller import CustomController
class WebuiManager: class WebuiManager:
@@ -19,9 +25,19 @@ class WebuiManager:
self.settings_save_dir = settings_save_dir self.settings_save_dir = settings_save_dir
os.makedirs(self.settings_save_dir, exist_ok=True) os.makedirs(self.settings_save_dir, exist_ok=True)
self.browser: Browser = None def init_browser_use_agent(self) -> None:
self.browser_context: BrowserContext = None """
self.bu_agent: Agent = None init browser use agent
"""
self.bu_agent: Optional[Agent] = None
self.bu_browser: Optional[CustomBrowser] = None
self.bu_browser_context: Optional[CustomBrowserContext] = None
self.bu_controller: Optional[CustomController] = None
self.bu_chat_history: List[Dict[str, Optional[str]]] = []
self.bu_response_event: Optional[asyncio.Event] = None
self.bu_user_help_response: Optional[str] = None
self.bu_current_task: Optional[asyncio.Task] = None
self.bu_agent_task_id: Optional[str] = None
def add_components(self, tab_name: str, components_dict: dict[str, "Component"]) -> None: def add_components(self, tab_name: str, components_dict: dict[str, "Component"]) -> None:
""" """
@@ -50,15 +66,16 @@ class WebuiManager:
""" """
return self.component_to_id[comp] return self.component_to_id[comp]
def save_current_config(self): def save_config(self, components: Dict["Component", str]) -> None:
""" """
Save current config Save config
""" """
cur_settings = {} cur_settings = {}
for comp_id, comp in self.id_to_component.items(): for comp in components:
if not isinstance(comp, gr.Button) and not isinstance(comp, gr.File) and str( if not isinstance(comp, gr.Button) and not isinstance(comp, gr.File) and str(
getattr(comp, "interactive", True)).lower() != "false": getattr(comp, "interactive", True)).lower() != "false":
cur_settings[comp_id] = getattr(comp, "value", None) comp_id = self.get_id_by_component(comp)
cur_settings[comp_id] = components[comp]
config_name = datetime.now().strftime("%Y%m%d-%H%M%S") config_name = datetime.now().strftime("%Y%m%d-%H%M%S")
with open(os.path.join(self.settings_save_dir, f"{config_name}.json"), "w") as fw: with open(os.path.join(self.settings_save_dir, f"{config_name}.json"), "w") as fw:
@@ -76,6 +93,13 @@ class WebuiManager:
update_components = {} update_components = {}
for comp_id, comp_val in ui_settings.items(): for comp_id, comp_val in ui_settings.items():
if comp_id in self.id_to_component: if comp_id in self.id_to_component:
update_components[self.id_to_component[comp_id]].value = comp_val comp = self.id_to_component[comp_id]
update_components[comp] = comp.__class__(value=comp_val)
return f"Successfully loaded config from {config_path}" config_status = self.id_to_component["load_save_config.config_status"]
update_components.update(
{
config_status: config_status.__class__(value=f"Successfully loaded config: {config_path}")
}
)
yield update_components

View File

@@ -17,98 +17,18 @@ from browser_use.agent.views import AgentHistoryList
from src.utils import utils from src.utils import utils
async def test_browser_use_org(): async def test_browser_use_agent():
from browser_use.browser.browser import Browser, BrowserConfig from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import ( from browser_use.browser.context import (
BrowserContextConfig, BrowserContextConfig,
BrowserContextWindowSize, BrowserContextWindowSize,
) )
from browser_use.agent.service import Agent
# llm = utils.get_llm_model(
# provider="azure_openai",
# model_name="gpt-4o",
# temperature=0.8,
# base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
# api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
# )
# llm = utils.get_llm_model(
# provider="deepseek",
# model_name="deepseek-chat",
# temperature=0.8
# )
llm = utils.get_llm_model(
provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
)
window_w, window_h = 1920, 1080
use_vision = False
use_own_browser = False
if use_own_browser:
chrome_path = os.getenv("CHROME_PATH", None)
if chrome_path == "":
chrome_path = None
else:
chrome_path = None
tool_calling_method = "json_schema" # setting to json_schema when using ollma
browser = Browser(
config=BrowserConfig(
headless=False,
disable_security=True,
chrome_instance_path=chrome_path,
extra_chromium_args=[f"--window-size={window_w},{window_h}"],
)
)
async with await browser.new_context(
config=BrowserContextConfig(
trace_path="./tmp/traces",
save_recording_path="./tmp/record_videos",
no_viewport=False,
browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h
),
)
) as browser_context:
agent = Agent(
task="go to google.com and type 'OpenAI' click search and give me the first url",
llm=llm,
browser_context=browser_context,
use_vision=use_vision,
tool_calling_method=tool_calling_method
)
history: AgentHistoryList = await agent.run(max_steps=10)
print("Final Result:")
pprint(history.final_result(), indent=4)
print("\nErrors:")
pprint(history.errors(), indent=4)
# e.g. xPaths the model clicked on
print("\nModel Outputs:")
pprint(history.model_actions(), indent=4)
print("\nThoughts:")
pprint(history.model_thoughts(), indent=4)
# close browser
await browser.close()
async def test_browser_use_custom():
from browser_use.browser.context import BrowserContextWindowSize
from browser_use.browser.browser import BrowserConfig
from playwright.async_api import async_playwright
from src.agent.custom_agent import CustomAgent
from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
from src.browser.custom_browser import CustomBrowser from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import BrowserContextConfig from src.browser.custom_context import CustomBrowserContextConfig
from src.controller.custom_controller import CustomController from src.controller.custom_controller import CustomController
from src.utils import llm_provider
window_w, window_h = 1280, 1100
# llm = utils.get_llm_model( # llm = utils.get_llm_model(
# provider="openai", # provider="openai",
@@ -118,14 +38,6 @@ async def test_browser_use_custom():
# api_key=os.getenv("OPENAI_API_KEY", ""), # api_key=os.getenv("OPENAI_API_KEY", ""),
# ) # )
llm = utils.get_llm_model(
provider="azure_openai",
model_name="gpt-4o",
temperature=0.5,
base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
)
# llm = utils.get_llm_model( # llm = utils.get_llm_model(
# provider="google", # provider="google",
# model_name="gemini-2.0-flash", # model_name="gemini-2.0-flash",
@@ -153,13 +65,43 @@ async def test_browser_use_custom():
# provider="ollama", model_name="deepseek-r1:14b", temperature=0.5 # provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
# ) # )
window_w, window_h = 1280, 1100
llm = llm_provider.get_llm_model(
provider="azure_openai",
model_name="gpt-4o",
temperature=0.5,
base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
)
mcp_server_config = {
"mcpServers": {
"markitdown": {
"command": "docker",
"args": [
"run",
"--rm",
"-i",
"markitdown-mcp:latest"
]
},
"desktop-commander": {
"command": "npx",
"args": [
"-y",
"@wonderwhy-er/desktop-commander"
]
},
}
}
controller = CustomController() controller = CustomController()
use_own_browser = True await controller.setup_mcp_client(mcp_server_config)
use_own_browser = False
disable_security = True disable_security = True
use_vision = True # Set to False when using DeepSeek use_vision = True # Set to False when using DeepSeek
max_actions_per_step = 10 max_actions_per_step = 10
playwright = None
browser = None browser = None
browser_context = None browser_context = None
@@ -178,29 +120,27 @@ async def test_browser_use_custom():
config=BrowserConfig( config=BrowserConfig(
headless=False, headless=False,
disable_security=disable_security, disable_security=disable_security,
chrome_instance_path=chrome_path, browser_binary_path=chrome_path,
extra_chromium_args=extra_chromium_args, extra_browser_args=extra_chromium_args,
) )
) )
browser_context = await browser.new_context( browser_context = await browser.new_context(
config=BrowserContextConfig( config=CustomBrowserContextConfig(
trace_path="./tmp/traces", trace_path="./tmp/traces",
save_recording_path="./tmp/record_videos", save_recording_path="./tmp/record_videos",
no_viewport=False, save_downloads_path="./tmp/downloads",
browser_window_size=BrowserContextWindowSize( browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h width=window_w, height=window_h
), ),
force_new_context=True
) )
) )
agent = CustomAgent( agent = Agent(
task="open youtube in tab 1 , open google email in tab 2, open facebook in tab 3", task="download pdf from https://arxiv.org/abs/2504.10458 and rename this pdf to 'GUI-r1-test.pdf'",
add_infos="", # some hints for llm to complete the task
llm=llm, llm=llm,
browser=browser, browser=browser,
browser_context=browser_context, browser_context=browser_context,
controller=controller, controller=controller,
system_prompt_class=CustomSystemPrompt,
agent_prompt_class=CustomAgentMessagePrompt,
use_vision=use_vision, use_vision=use_vision,
max_actions_per_step=max_actions_per_step, max_actions_per_step=max_actions_per_step,
generate_gif=True generate_gif=True
@@ -213,28 +153,17 @@ async def test_browser_use_custom():
print("\nErrors:") print("\nErrors:")
pprint(history.errors(), indent=4) pprint(history.errors(), indent=4)
# e.g. xPaths the model clicked on
print("\nModel Outputs:")
pprint(history.model_actions(), indent=4)
print("\nThoughts:")
pprint(history.model_thoughts(), indent=4)
except Exception: except Exception:
import traceback import traceback
traceback.print_exc() traceback.print_exc()
finally: finally:
# 显式关闭持久化上下文
if browser_context: if browser_context:
await browser_context.close() await browser_context.close()
# 关闭 Playwright 对象
if playwright:
await playwright.stop()
if browser: if browser:
await browser.close() await browser.close()
if controller:
await controller.close_mcp_client()
async def test_browser_use_parallel(): async def test_browser_use_parallel():
@@ -242,13 +171,20 @@ async def test_browser_use_parallel():
from browser_use.browser.browser import BrowserConfig from browser_use.browser.browser import BrowserConfig
from playwright.async_api import async_playwright from playwright.async_api import async_playwright
from browser_use.browser.browser import Browser from browser_use.browser.browser import Browser
from src.agent.custom_agent import CustomAgent
from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import BrowserContextConfig from src.browser.custom_context import BrowserContextConfig
from src.controller.custom_controller import CustomController from src.controller.custom_controller import CustomController
window_w, window_h = 1920, 1080 from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import (
BrowserContextConfig,
BrowserContextWindowSize,
)
from browser_use.agent.service import Agent
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import CustomBrowserContextConfig
from src.controller.custom_controller import CustomController
from src.utils import llm_provider
# llm = utils.get_llm_model( # llm = utils.get_llm_model(
# provider="openai", # provider="openai",
@@ -258,20 +194,13 @@ async def test_browser_use_parallel():
# api_key=os.getenv("OPENAI_API_KEY", ""), # api_key=os.getenv("OPENAI_API_KEY", ""),
# ) # )
# llm = utils.get_llm_model(
# provider="azure_openai",
# model_name="gpt-4o",
# temperature=0.8,
# base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
# api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
# )
llm = utils.get_llm_model( # llm = utils.get_llm_model(
provider="gemini", # provider="google",
model_name="gemini-2.0-flash-exp", # model_name="gemini-2.0-flash",
temperature=1.0, # temperature=0.6,
api_key=os.getenv("GOOGLE_API_KEY", "") # api_key=os.getenv("GOOGLE_API_KEY", "")
) # )
# llm = utils.get_llm_model( # llm = utils.get_llm_model(
# provider="deepseek", # provider="deepseek",
@@ -293,72 +222,119 @@ async def test_browser_use_parallel():
# provider="ollama", model_name="deepseek-r1:14b", temperature=0.5 # provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
# ) # )
window_w, window_h = 1280, 1100
llm = llm_provider.get_llm_model(
provider="azure_openai",
model_name="gpt-4o",
temperature=0.5,
base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
)
mcp_server_config = {
"mcpServers": {
"markitdown": {
"command": "docker",
"args": [
"run",
"--rm",
"-i",
"markitdown-mcp:latest"
]
},
"desktop-commander": {
"command": "npx",
"args": [
"-y",
"@wonderwhy-er/desktop-commander"
]
},
# "filesystem": {
# "command": "npx",
# "args": [
# "-y",
# "@modelcontextprotocol/server-filesystem",
# "/Users/xxx/ai_workspace",
# ]
# },
}
}
controller = CustomController() controller = CustomController()
use_own_browser = True await controller.setup_mcp_client(mcp_server_config)
use_own_browser = False
disable_security = True disable_security = True
use_vision = True # Set to False when using DeepSeek use_vision = True # Set to False when using DeepSeek
max_actions_per_step = 1 max_actions_per_step = 10
playwright = None
browser = None browser = None
browser_context = None browser_context = None
browser = Browser(
config=BrowserConfig(
disable_security=True,
headless=False,
new_context_config=BrowserContextConfig(save_recording_path='./tmp/recordings'),
)
)
try: try:
extra_chromium_args = [f"--window-size={window_w},{window_h}"]
if use_own_browser:
chrome_path = os.getenv("CHROME_PATH", None)
if chrome_path == "":
chrome_path = None
chrome_user_data = os.getenv("CHROME_USER_DATA", None)
if chrome_user_data:
extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
else:
chrome_path = None
browser = CustomBrowser(
config=BrowserConfig(
headless=False,
disable_security=disable_security,
browser_binary_path=chrome_path,
extra_browser_args=extra_chromium_args,
)
)
browser_context = await browser.new_context(
config=CustomBrowserContextConfig(
trace_path="./tmp/traces",
save_recording_path="./tmp/record_videos",
save_downloads_path="./tmp/downloads",
browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h
),
force_new_context=True
)
)
agents = [ agents = [
Agent(task=task, llm=llm, browser=browser) Agent(task=task, llm=llm, browser=browser, controller=controller)
for task in [ for task in [
'Search Google for weather in Tokyo', 'Search Google for weather in Tokyo',
'Check Reddit front page title', # 'Check Reddit front page title',
'Find NASA image of the day', # 'Find NASA image of the day',
'Check top story on CNN', # 'Check top story on CNN',
# 'Search latest SpaceX launch date', # 'Search latest SpaceX launch date',
# 'Look up population of Paris', # 'Look up population of Paris',
# 'Find current time in Sydney', 'Find current time in Sydney',
# 'Check who won last Super Bowl', 'Check who won last Super Bowl',
# 'Search trending topics on Twitter', # 'Search trending topics on Twitter',
] ]
] ]
history = await asyncio.gather(*[agent.run() for agent in agents]) history = await asyncio.gather(*[agent.run() for agent in agents])
pdb.set_trace()
print("Final Result:") print("Final Result:")
pprint(history.final_result(), indent=4) pprint(history.final_result(), indent=4)
print("\nErrors:") print("\nErrors:")
pprint(history.errors(), indent=4) pprint(history.errors(), indent=4)
# e.g. xPaths the model clicked on pdb.set_trace()
print("\nModel Outputs:")
pprint(history.model_actions(), indent=4)
print("\nThoughts:")
pprint(history.model_thoughts(), indent=4)
# close browser
except Exception: except Exception:
import traceback import traceback
traceback.print_exc() traceback.print_exc()
finally: finally:
# 显式关闭持久化上下文
if browser_context: if browser_context:
await browser_context.close() await browser_context.close()
# 关闭 Playwright 对象
if playwright:
await playwright.stop()
if browser: if browser:
await browser.close() await browser.close()
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(test_browser_use_org()) # asyncio.run(test_browser_use_agent())
# asyncio.run(test_browser_use_parallel()) asyncio.run(test_browser_use_parallel())
# asyncio.run(test_browser_use_custom())

View File

@@ -45,33 +45,37 @@ async def test_controller_with_mcp():
from src.controller.custom_controller import CustomController from src.controller.custom_controller import CustomController
from browser_use.controller.registry.views import ActionModel from browser_use.controller.registry.views import ActionModel
test_server_config = { mcp_server_config = {
"playwright": { "mcpServers": {
"command": "npx", "markitdown": {
"args": [ "command": "docker",
"@playwright/mcp@latest", "args": [
], "run",
"transport": "stdio", "--rm",
}, "-i",
"filesystem": { "markitdown-mcp:latest"
"command": "npx", ]
"args": [ },
"-y", "desktop-commander": {
"@modelcontextprotocol/server-filesystem", "command": "npx",
"/Users/xxx/ai_workspace", "args": [
] "-y",
}, "@wonderwhy-er/desktop-commander"
"desktop-commander": { ]
"command": "npx", },
"args": [ # "filesystem": {
"-y", # "command": "npx",
"@wonderwhy-er/desktop-commander" # "args": [
] # "-y",
# "@modelcontextprotocol/server-filesystem",
# "/Users/xxx/ai_workspace",
# ]
# },
} }
} }
controller = CustomController() controller = CustomController()
await controller.setup_mcp_client(test_server_config) await controller.setup_mcp_client(mcp_server_config)
action_name = "mcp.desktop-commander.execute_command" action_name = "mcp.desktop-commander.execute_command"
action_info = controller.registry.registry.actions[action_name] action_info = controller.registry.registry.actions[action_name]
param_model = action_info.param_model param_model = action_info.param_model
@@ -85,7 +89,8 @@ async def test_controller_with_mcp():
result = await controller.act(action_model) result = await controller.act(action_model)
result = result.extracted_content result = result.extracted_content
print(result) print(result)
if result and "Command is still running. Use read_output to get more output." in result and "PID" in result.split("\n")[0]: if result and "Command is still running. Use read_output to get more output." in result and "PID" in \
result.split("\n")[0]:
pid = int(result.split("\n")[0].split("PID")[-1].strip()) pid = int(result.split("\n")[0].split("PID")[-1].strip())
action_name = "mcp.desktop-commander.read_output" action_name = "mcp.desktop-commander.read_output"
action_info = controller.registry.registry.actions[action_name] action_info = controller.registry.registry.actions[action_name]

View File

@@ -144,10 +144,10 @@ def test_ibm_model():
if __name__ == "__main__": if __name__ == "__main__":
# test_openai_model() # test_openai_model()
# test_google_model() # test_google_model()
# test_azure_openai_model() test_azure_openai_model()
# test_deepseek_model() # test_deepseek_model()
# test_ollama_model() # test_ollama_model()
# test_deepseek_r1_model() # test_deepseek_r1_model()
# test_deepseek_r1_ollama_model() # test_deepseek_r1_ollama_model()
# test_mistral_model() # test_mistral_model()
test_ibm_model() # test_ibm_model()

View File

@@ -1,3 +1,5 @@
from dotenv import load_dotenv
load_dotenv()
import argparse import argparse
from src.webui.interface import theme_map, create_ui from src.webui.interface import theme_map, create_ui

107
webui2.py
View File

@@ -42,77 +42,6 @@ _global_browser = None
_global_browser_context = None _global_browser_context = None
_global_agent = None _global_agent = None
# Create the global agent state instance
_global_agent_state = AgentState()
# webui config
webui_config_manager = utils.ConfigManager()
def scan_and_register_components(blocks):
    """Register the interactive, non-button components found under ``blocks``.

    The ``children`` tree of ``blocks`` is walked recursively. Each qualifying
    component is handed to ``webui_config_manager.register_component`` under a
    name built from the nesting prefix plus either the component's label or,
    when it has no label, its position among its siblings. The total number of
    registered components is logged at INFO level.
    """

    def traverse_blocks(block, prefix=""):
        """Register components below ``block`` and return how many were added."""
        count = 0
        for idx, child in enumerate(getattr(block, "children", ())):
            if not isinstance(child, gr.components.Component):
                # Not a component itself: descend only if it is a container.
                if hasattr(child, "children"):
                    count += traverse_blocks(child, f"{prefix}block_{idx}_")
                continue

            # Buttons and non-interactive components are never registered.
            if isinstance(child, gr.Button) or not getattr(child, "interactive", False):
                continue

            # Prefer the label as the name; fall back to the sibling index.
            label = getattr(child, "label", None)
            name = f"{prefix}{label}" if label else f"{prefix}component_{idx}"

            logger.debug(f"Registering component: {name}")
            webui_config_manager.register_component(name, child)
            count += 1
        return count

    total = traverse_blocks(blocks)
    logger.info(f"Total registered components: {total}")
def save_current_config():
    """Delegate to ``webui_config_manager.save_current_config`` and return its result."""
    manager = webui_config_manager
    return manager.save_current_config()
def update_ui_from_config(config_file):
    """Delegate to ``webui_config_manager.update_ui_from_config``.

    Args:
        config_file: Passed through unchanged to the config manager.

    Returns:
        Whatever the config manager's ``update_ui_from_config`` returns.
    """
    manager = webui_config_manager
    return manager.update_ui_from_config(config_file)
def resolve_sensitive_env_variables(text):
    """
    Replace environment variable placeholders ($SENSITIVE_*) with their values.
    Only replaces variables that start with SENSITIVE_.

    Each placeholder is substituted exactly once, in a single left-to-right
    pass, so a placeholder that is a prefix of another one (``$SENSITIVE_A``
    vs. ``$SENSITIVE_AB``) cannot corrupt the longer one, and placeholder-like
    text inside a substituted value is not expanded again.

    Args:
        text: The string to process. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        ``text`` with every ``$SENSITIVE_<NAME>`` whose environment variable
        ``SENSITIVE_<NAME>`` is set replaced by that variable's value.
        Placeholders whose variable is unset are left as-is.
    """
    if not text:
        return text

    import re

    def _substitute(match):
        placeholder = match.group(0)
        # Drop the leading "$" to get the actual environment variable name.
        env_value = os.getenv(placeholder[1:])
        # Leave unknown placeholders untouched instead of blanking them.
        return placeholder if env_value is None else env_value

    # A callable replacement also keeps backslashes in values literal.
    return re.sub(r'\$SENSITIVE_[A-Za-z0-9_]*', _substitute, text)
async def stop_agent(): async def stop_agent():
"""Request the agent to stop and update UI with enhanced feedback""" """Request the agent to stop and update UI with enhanced feedback"""
@@ -140,32 +69,6 @@ async def stop_agent():
) )
async def stop_research_agent():
    """Ask the research agent to stop and return button updates for the UI.

    Calls ``request_stop()`` on the module-level ``_global_agent_state`` and
    logs the request.

    Returns:
        A 2-tuple of ``gr.update`` objects: first for the stop button, second
        for the run button. On success both are made non-interactive and the
        stop button reads "Stopping..."; if anything raises, both are made
        interactive again and the stop button reads "Stop".
    """
    try:
        _global_agent_state.request_stop()

        notice = "Stop requested - the agent will halt at the next safe point"
        logger.info(f"🛑 {notice}")

        # Built inside the try so a failing gr.update still hits the fallback.
        stop_button = gr.update(value="Stopping...", interactive=False)
        run_button = gr.update(interactive=False)
        return stop_button, run_button
    except Exception as exc:
        logger.error(f"Error during stop: {str(exc)}")
        stop_button = gr.update(value="Stop", interactive=True)
        run_button = gr.update(interactive=True)
        return stop_button, run_button
async def run_browser_agent( async def run_browser_agent(
agent_type, agent_type,
llm_provider, llm_provider,
@@ -202,16 +105,6 @@ async def run_browser_agent(
if save_recording_path: if save_recording_path:
os.makedirs(save_recording_path, exist_ok=True) os.makedirs(save_recording_path, exist_ok=True)
# Get the list of existing videos before the agent runs
existing_videos = set()
if save_recording_path:
existing_videos = set(
glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4"))
+ glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]"))
)
task = resolve_sensitive_env_variables(task)
# Run the agent # Run the agent
llm = utils.get_llm_model( llm = utils.get_llm_model(
provider=llm_provider, provider=llm_provider,