Update browser-use to 0.1.43 and fix the deep research agent

This commit is contained in:
vincent
2025-05-09 09:27:12 +08:00
parent fb65ca7ba2
commit eb91cb64ec
10 changed files with 218 additions and 267 deletions

View File

@@ -1,4 +1,4 @@
browser-use==0.1.45
browser-use==0.1.43
pyperclip==1.9.0
gradio==5.27.0
json-repair

View File

@@ -8,9 +8,13 @@ import os
from browser_use.agent.gif import create_history_gif
from browser_use.agent.service import Agent, AgentHookFunc
from browser_use.agent.views import (
ActionResult,
AgentHistory,
AgentHistoryList,
AgentStepInfo,
ToolCallingMethod,
)
from browser_use.browser.views import BrowserStateHistory
from browser_use.telemetry.views import (
AgentEndTelemetryEvent,
)
@@ -21,17 +25,15 @@ load_dotenv()
logger = logging.getLogger(__name__)
SKIP_LLM_API_KEY_VERIFICATION = (
os.environ.get("SKIP_LLM_API_KEY_VERIFICATION", "false").lower()[0] in "ty1"
os.environ.get("SKIP_LLM_API_KEY_VERIFICATION", "false").lower()[0] in "ty1"
)
class BrowserUseAgent(Agent):
@time_execution_async("--run (agent)")
async def run(
self,
max_steps: int = 100,
on_step_start: AgentHookFunc | None = None,
on_step_end: AgentHookFunc | None = None,
self, max_steps: int = 100, on_step_start: AgentHookFunc | None = None,
on_step_end: AgentHookFunc | None = None
) -> AgentHistoryList:
"""Execute the task with maximum number of steps"""
@@ -49,41 +51,28 @@ class BrowserUseAgent(Agent):
)
signal_handler.register()
# Wait for verification task to complete if it exists
if hasattr(self, "_verification_task") and not self._verification_task.done():
try:
await self._verification_task
except Exception:
# Error already logged in the task
pass
try:
self._log_agent_run()
# Execute initial actions if provided
if self.initial_actions:
result = await self.multi_act(
self.initial_actions, check_for_new_elements=False
)
result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
self.state.last_result = result
for step in range(max_steps):
# Check if waiting for user input after Ctrl+C
while self.state.paused:
await asyncio.sleep(0.5)
if self.state.stopped:
break
if self.state.paused:
signal_handler.wait_for_resume()
signal_handler.reset()
# Check if we should stop due to too many failures
if self.state.consecutive_failures >= self.settings.max_failures:
logger.error(
f"❌ Stopping due to {self.settings.max_failures} consecutive failures"
)
logger.error(f'❌ Stopping due to {self.settings.max_failures} consecutive failures')
break
# Check control flags before each step
if self.state.stopped:
logger.info("Agent stopped")
logger.info('Agent stopped')
break
while self.state.paused:
@@ -108,15 +97,30 @@ class BrowserUseAgent(Agent):
await self.log_completion()
break
else:
logger.info("Failed to complete task in maximum steps")
error_message = 'Failed to complete task in maximum steps'
self.state.history.history.append(
AgentHistory(
model_output=None,
result=[ActionResult(error=error_message, include_in_memory=True)],
state=BrowserStateHistory(
url='',
title='',
tabs=[],
interacted_element=[],
screenshot=None,
),
metadata=None,
)
)
logger.info(f'{error_message}')
return self.state.history
except KeyboardInterrupt:
# Already handled by our signal handler, but catch any direct KeyboardInterrupt as well
logger.info(
"Got KeyboardInterrupt during execution, returning current history"
)
logger.info('Got KeyboardInterrupt during execution, returning current history')
return self.state.history
finally:
@@ -136,13 +140,29 @@ class BrowserUseAgent(Agent):
)
)
if self.settings.save_playwright_script_path:
logger.info(
f'Agent run finished. Attempting to save Playwright script to: {self.settings.save_playwright_script_path}'
)
try:
# Extract sensitive data keys if sensitive_data is provided
keys = list(self.sensitive_data.keys()) if self.sensitive_data else None
# Pass browser and context config to the saving method
self.state.history.save_as_playwright_script(
self.settings.save_playwright_script_path,
sensitive_data_keys=keys,
browser_config=self.browser.config,
context_config=self.browser_context.config,
)
except Exception as script_gen_err:
# Log any error during script generation/saving
logger.error(f'Failed to save Playwright script: {script_gen_err}', exc_info=True)
await self.close()
if self.settings.generate_gif:
output_path: str = "agent_history.gif"
output_path: str = 'agent_history.gif'
if isinstance(self.settings.generate_gif, str):
output_path = self.settings.generate_gif
create_history_gif(
task=self.task, history=self.state.history, output_path=output_path
)
create_history_gif(task=self.task, history=self.state.history, output_path=output_path)

View File

@@ -29,9 +29,10 @@ from langchain_core.tools import StructuredTool, Tool
from langgraph.graph import StateGraph
from pydantic import BaseModel, Field
from browser_use.browser.context import BrowserContextWindowSize, BrowserContextConfig
from src.agent.browser_use.browser_use_agent import BrowserUseAgent
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import CustomBrowserContextConfig
from src.controller.custom_controller import CustomController
from src.utils.mcp_client import setup_mcp_client_and_tools
@@ -47,12 +48,12 @@ _BROWSER_AGENT_INSTANCES = {}
async def run_single_browser_task(
task_query: str,
task_id: str,
llm: Any, # Pass the main LLM
browser_config: Dict[str, Any],
stop_event: threading.Event,
use_vision: bool = False,
task_query: str,
task_id: str,
llm: Any, # Pass the main LLM
browser_config: Dict[str, Any],
stop_event: threading.Event,
use_vision: bool = False,
) -> Dict[str, Any]:
"""
Runs a single BrowserUseAgent task.
@@ -104,10 +105,9 @@ async def run_single_browser_task(
)
)
context_config = CustomBrowserContextConfig(
context_config = BrowserContextConfig(
save_downloads_path="./tmp/downloads",
window_width=window_w,
window_height=window_h,
browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
force_new_context=True,
)
bu_browser_context = await bu_browser.new_context(config=context_config)
@@ -198,12 +198,12 @@ class BrowserSearchInput(BaseModel):
async def _run_browser_search_tool(
queries: List[str],
task_id: str, # Injected dependency
llm: Any, # Injected dependency
browser_config: Dict[str, Any],
stop_event: threading.Event,
max_parallel_browsers: int = 1,
queries: List[str],
task_id: str, # Injected dependency
llm: Any, # Injected dependency
browser_config: Dict[str, Any],
stop_event: threading.Event,
max_parallel_browsers: int = 1,
) -> List[Dict[str, Any]]:
"""
Internal function to execute parallel browser searches based on LLM-provided queries.
@@ -267,11 +267,11 @@ async def _run_browser_search_tool(
def create_browser_search_tool(
llm: Any,
browser_config: Dict[str, Any],
task_id: str,
stop_event: threading.Event,
max_parallel_browsers: int = 1,
llm: Any,
browser_config: Dict[str, Any],
task_id: str,
stop_event: threading.Event,
max_parallel_browsers: int = 1,
) -> StructuredTool:
"""Factory function to create the browser search tool with necessary dependencies."""
# Use partial to bind the dependencies that aren't part of the LLM call arguments
@@ -553,7 +553,7 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]:
else:
current_task_message = [
SystemMessage(
content="You are a research assistant executing one step of a research plan. Use the available tools, especially the 'parallel_browser_search' tool, to gather information needed for the current task. Be precise with your search queries if using the browser tool."
content="You are a research assistant executing one step of a research plan. Use the available tools, especially the 'parallel_browser_search' tool, to gather information needed for the current task. Be precise with your search queries if using the browser tool. Please output at least one tool."
),
HumanMessage(
content=f"Research Task (Step {current_step['step']}): {current_step['task']}"
@@ -582,8 +582,11 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]:
_save_plan_to_md(plan, output_dir)
return {
"research_plan": plan,
"current_step_index": current_index + 1,
"error_message": f"LLM failed to call a tool for step {current_step['step']}.",
"status": "pending",
"current_step_index": current_index,
"messages": [
f"LLM failed to call a tool for step {current_step['step']}. Response: {ai_response.content}"
f". Please use tool to do research unless you are thinking or summary"],
}
# Process tool calls
@@ -665,8 +668,8 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]:
browser_tool_called = "parallel_browser_search" in executed_tool_names
# We might need a more nuanced status based on the *content* of tool_results
step_failed = (
any("Error:" in str(tr.content) for tr in tool_results)
or not browser_tool_called
any("Error:" in str(tr.content) for tr in tool_results)
or not browser_tool_called
)
if step_failed:
@@ -695,9 +698,9 @@ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]:
"search_results": current_search_results, # Update with new results
"current_step_index": current_index + 1,
"messages": state["messages"]
+ current_task_message
+ [ai_response]
+ tool_results,
+ current_task_message
+ [ai_response]
+ tool_results,
# Optionally return the tool_results messages if needed by downstream nodes
}
@@ -879,10 +882,10 @@ def should_continue(state: DeepResearchState) -> str:
class DeepResearchAgent:
def __init__(
self,
llm: Any,
browser_config: Dict[str, Any],
mcp_server_config: Optional[Dict[str, Any]] = None,
self,
llm: Any,
browser_config: Dict[str, Any],
mcp_server_config: Optional[Dict[str, Any]] = None,
):
"""
Initializes the DeepSearchAgent.
@@ -904,7 +907,7 @@ class DeepResearchAgent:
self.runner: Optional[asyncio.Task] = None # To hold the asyncio task for run
async def _setup_tools(
self, task_id: str, stop_event: threading.Event, max_parallel_browsers: int = 1
self, task_id: str, stop_event: threading.Event, max_parallel_browsers: int = 1
) -> List[Tool]:
"""Sets up the basic tools (File I/O) and optional MCP tools."""
tools = [
@@ -981,11 +984,11 @@ class DeepResearchAgent:
return app
async def run(
self,
topic: str,
task_id: Optional[str] = None,
save_dir: str = "./tmp/deep_research",
max_parallel_browsers: int = 1,
self,
topic: str,
task_id: Optional[str] = None,
save_dir: str = "./tmp/deep_research",
max_parallel_browsers: int = 1,
) -> Dict[str, Any]:
"""
Starts the deep research process (Async Generator Version).

View File

@@ -26,25 +26,33 @@ from browser_use.browser.utils.screen_resolution import get_screen_resolution, g
from browser_use.utils import time_execution_async
import socket
from .custom_context import CustomBrowserContext, CustomBrowserContextConfig
from .custom_context import CustomBrowserContext
logger = logging.getLogger(__name__)
class CustomBrowser(Browser):
async def new_context(self, config: CustomBrowserContextConfig | None = None) -> CustomBrowserContext:
async def new_context(self, config: BrowserContextConfig | None = None) -> CustomBrowserContext:
"""Create a browser context"""
browser_config = self.config.model_dump() if self.config else {}
context_config = config.model_dump() if config else {}
merged_config = {**browser_config, **context_config}
return CustomBrowserContext(config=CustomBrowserContextConfig(**merged_config), browser=self)
return CustomBrowserContext(config=BrowserContextConfig(**merged_config), browser=self)
async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrowser:
"""Sets up and returns a Playwright Browser instance with anti-detection measures."""
assert self.config.browser_binary_path is None, 'browser_binary_path should be None if trying to use the builtin browsers'
if self.config.headless:
# Use the configured window size from new_context_config if available
if (
not self.config.headless
and hasattr(self.config, 'new_context_config')
and hasattr(self.config.new_context_config, 'browser_window_size')
):
screen_size = self.config.new_context_config.browser_window_size.model_dump()
offset_x, offset_y = get_window_adjustments()
elif self.config.headless:
screen_size = {'width': 1920, 'height': 1080}
offset_x, offset_y = 0, 0
else:
@@ -52,6 +60,7 @@ class CustomBrowser(Browser):
offset_x, offset_y = get_window_adjustments()
chrome_args = {
f'--remote-debugging-port={self.config.chrome_remote_debugging_port}',
*CHROME_ARGS,
*(CHROME_DOCKER_ARGS if IN_DOCKER else []),
*(CHROME_HEADLESS_ARGS if self.config.headless else []),
@@ -70,8 +79,8 @@ class CustomBrowser(Browser):
# check if port 9222 is already taken, if so remove the remote-debugging-port arg to prevent conflicts
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
if s.connect_ex(('localhost', 9222)) == 0:
chrome_args.remove('--remote-debugging-port=9222')
if s.connect_ex(('localhost', self.config.chrome_remote_debugging_port)) == 0:
chrome_args.remove(f'--remote-debugging-port={self.config.chrome_remote_debugging_port}')
browser_class = getattr(playwright, self.config.browser_class)
args = {

View File

@@ -12,10 +12,6 @@ from browser_use.browser.context import BrowserContextState
logger = logging.getLogger(__name__)
class CustomBrowserContextConfig(BrowserContextConfig):
    """Context configuration extended with a webui-specific flag.

    Inherits all standard fields from browser-use's ``BrowserContextConfig``
    and adds ``force_new_context``, which is consulted when deciding whether
    to reuse an existing browser context or always create a fresh one.
    """

    # When True, skip reusing an already-open context on the connected
    # browser and always create a brand-new Playwright context.
    force_new_context: bool = False  # force to create new context
class CustomBrowserContext(BrowserContext):
def __init__(
self,
@@ -24,96 +20,3 @@ class CustomBrowserContext(BrowserContext):
state: Optional[BrowserContextState] = None,
):
super(CustomBrowserContext, self).__init__(browser=browser, config=config, state=state)
async def _create_context(self, browser: PlaywrightBrowser):
    """Creates a new browser context with anti-detection measures and loads cookies if available.

    Reuses an existing context when connected to an external browser (via CDP
    or a user-supplied binary) unless ``force_new_context`` is set; otherwise
    creates a fresh context configured from ``self.config``. Also starts
    tracing, loads cookies from disk, and injects anti-detection JS.

    Args:
        browser: The underlying Playwright browser to create/reuse a context on.

    Returns:
        The Playwright browser context to use for this session.
    """
    # Reuse the first existing context when attached over CDP, unless the
    # config explicitly demands a brand-new context.
    if not self.config.force_new_context and self.browser.config.cdp_url and len(browser.contexts) > 0:
        context = browser.contexts[0]
    elif not self.config.force_new_context and self.browser.config.browser_binary_path and len(
            browser.contexts) > 0:
        # Connect to existing Chrome instance instead of creating new one
        context = browser.contexts[0]
    else:
        # Original code for creating new context
        # NOTE(review): disable_security drives both bypass_csp and
        # ignore_https_errors — a single flag loosens two protections.
        context = await browser.new_context(
            no_viewport=True,
            user_agent=self.config.user_agent,
            java_script_enabled=True,
            bypass_csp=self.config.disable_security,
            ignore_https_errors=self.config.disable_security,
            record_video_dir=self.config.save_recording_path,
            record_video_size={
                "width": self.config.window_width,
                "height": self.config.window_height
            },
            record_har_path=self.config.save_har_path,
            locale=self.config.locale,
            http_credentials=self.config.http_credentials,
            is_mobile=self.config.is_mobile,
            has_touch=self.config.has_touch,
            geolocation=self.config.geolocation,
            permissions=self.config.permissions,
            timezone_id=self.config.timezone_id,
        )
    # Start Playwright tracing when a trace output path is configured.
    if self.config.trace_path:
        await context.tracing.start(screenshots=True, snapshots=True, sources=True)
    # Load cookies if they exist
    if self.config.cookies_file and os.path.exists(self.config.cookies_file):
        with open(self.config.cookies_file, 'r') as f:
            try:
                cookies = json.load(f)
                # Playwright rejects cookies whose sameSite is not one of
                # these exact values; coerce anything else to 'None'.
                valid_same_site_values = ['Strict', 'Lax', 'None']
                for cookie in cookies:
                    if 'sameSite' in cookie:
                        if cookie['sameSite'] not in valid_same_site_values:
                            logger.warning(
                                f"Fixed invalid sameSite value '{cookie['sameSite']}' to 'None' for cookie {cookie.get('name')}"
                            )
                            cookie['sameSite'] = 'None'
                logger.info(f'🍪 Loaded {len(cookies)} cookies from {self.config.cookies_file}')
                await context.add_cookies(cookies)
            except json.JSONDecodeError as e:
                # Malformed cookie file is non-fatal: log and continue without cookies.
                logger.error(f'Failed to parse cookies file: {str(e)}')
    # Expose anti-detection scripts
    # Injected into every page before its own scripts run: masks webdriver,
    # fakes languages/plugins/chrome.runtime, patches the notification
    # permission query, and forces shadow roots open for element inspection.
    await context.add_init_script(
        """
        // Webdriver property
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });

        // Languages
        Object.defineProperty(navigator, 'languages', {
            get: () => ['en-US']
        });

        // Plugins
        Object.defineProperty(navigator, 'plugins', {
            get: () => [1, 2, 3, 4, 5]
        });

        // Chrome runtime
        window.chrome = { runtime: {} };

        // Permissions
        const originalQuery = window.navigator.permissions.query;
        window.navigator.permissions.query = (parameters) => (
            parameters.name === 'notifications' ?
                Promise.resolve({ state: Notification.permission }) :
                originalQuery(parameters)
        );

        (function () {
            const originalAttachShadow = Element.prototype.attachShadow;
            Element.prototype.attachShadow = function attachShadow(options) {
                return originalAttachShadow.call(this, { ...options, mode: "open" });
            };
        })();
        """
    )
    return context

View File

@@ -172,6 +172,10 @@ class CustomController(Controller):
param_model=create_tool_param_model(tool),
)
logger.info(f"Add mcp tool: {tool_name}")
logger.debug(
f"Registered {len(self.mcp_client.server_name_to_tools[server_name])} mcp tools for {server_name}")
else:
logger.warning(f"MCP client not started.")
async def close_mcp_client(self):
if self.mcp_client:

View File

@@ -13,14 +13,13 @@ from browser_use.agent.views import (
AgentOutput,
)
from browser_use.browser.browser import BrowserConfig
from browser_use.browser.context import BrowserContext
from browser_use.browser.context import BrowserContext, BrowserContextWindowSize, BrowserContextConfig
from browser_use.browser.views import BrowserState
from gradio.components import Component
from langchain_core.language_models.chat_models import BaseChatModel
from src.agent.browser_use.browser_use_agent import BrowserUseAgent
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import CustomBrowserContextConfig
from src.controller.custom_controller import CustomController
from src.utils import llm_provider
from src.webui.webui_manager import WebuiManager
@@ -32,12 +31,12 @@ logger = logging.getLogger(__name__)
async def _initialize_llm(
provider: Optional[str],
model_name: Optional[str],
temperature: float,
base_url: Optional[str],
api_key: Optional[str],
num_ctx: Optional[int] = None,
provider: Optional[str],
model_name: Optional[str],
temperature: float,
base_url: Optional[str],
api_key: Optional[str],
num_ctx: Optional[int] = None,
) -> Optional[BaseChatModel]:
"""Initializes the LLM based on settings. Returns None if provider/model is missing."""
if not provider or not model_name:
@@ -68,10 +67,10 @@ async def _initialize_llm(
def _get_config_value(
webui_manager: WebuiManager,
comp_dict: Dict[gr.components.Component, Any],
comp_id_suffix: str,
default: Any = None,
webui_manager: WebuiManager,
comp_dict: Dict[gr.components.Component, Any],
comp_id_suffix: str,
default: Any = None,
) -> Any:
"""Safely get value from component dictionary using its ID suffix relative to the tab."""
# Assumes component ID format is "tab_name.comp_name"
@@ -133,7 +132,7 @@ def _format_agent_output(model_output: AgentOutput) -> str:
async def _handle_new_step(
webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int
webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int
):
"""Callback for each step taken by the agent, including screenshot display."""
@@ -157,12 +156,12 @@ async def _handle_new_step(
try:
# Basic validation: check if it looks like base64
if (
isinstance(screenshot_data, str) and len(screenshot_data) > 100
isinstance(screenshot_data, str) and len(screenshot_data) > 100
): # Arbitrary length check
# *** UPDATED STYLE: Removed centering, adjusted width ***
img_tag = f'<img src="data:image/jpeg;base64,{screenshot_data}" alt="Step {step_num} Screenshot" style="max-width: 800px; max-height: 600px; object-fit:contain;" />'
screenshot_html = (
img_tag + "<br/>"
img_tag + "<br/>"
) # Use <br/> for line break after inline-block image
else:
logger.warning(
@@ -223,7 +222,7 @@ def _handle_done(webui_manager: WebuiManager, history: AgentHistoryList):
async def _ask_assistant_callback(
webui_manager: WebuiManager, query: str, browser_context: BrowserContext
webui_manager: WebuiManager, query: str, browser_context: BrowserContext
) -> Dict[str, Any]:
"""Callback triggered by the agent's ask_for_assistant action."""
logger.info("Agent requires assistance. Waiting for user input.")
@@ -274,7 +273,7 @@ async def _ask_assistant_callback(
async def run_agent_task(
webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]
webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]
) -> AsyncGenerator[Dict[gr.components.Component, Any], None]:
"""Handles the entire lifecycle of initializing and running the agent."""
@@ -358,6 +357,7 @@ async def run_agent_task(
# Planner LLM Settings (Optional)
planner_llm_provider_name = get_setting("planner_llm_provider") or None
planner_llm = None
planner_use_vision = False
if planner_llm_provider_name:
planner_llm_model_name = get_setting("planner_llm_model_name")
planner_llm_temperature = get_setting("planner_llm_temperature", 0.6)
@@ -387,7 +387,7 @@ async def run_agent_task(
) # Logic handled by CDP/WSS presence
keep_browser_open = get_browser_setting("keep_browser_open", False)
headless = get_browser_setting("headless", False)
disable_security = get_browser_setting("disable_security", True)
disable_security = get_browser_setting("disable_security", False)
window_w = int(get_browser_setting("window_w", 1280))
window_h = int(get_browser_setting("window_h", 1100))
cdp_url = get_browser_setting("cdp_url") or None
@@ -422,7 +422,7 @@ async def run_agent_task(
# Pass the webui_manager instance to the callback when wrapping it
async def ask_callback_wrapper(
query: str, browser_context: BrowserContext
query: str, browser_context: BrowserContext
) -> Dict[str, Any]:
return await _ask_assistant_callback(webui_manager, query, browser_context)
@@ -456,7 +456,7 @@ async def run_agent_task(
if use_own_browser:
browser_binary_path = (
os.getenv("CHROME_PATH", None) or browser_binary_path
os.getenv("CHROME_PATH", None) or browser_binary_path
)
if browser_binary_path == "":
browser_binary_path = None
@@ -479,14 +479,13 @@ async def run_agent_task(
# Create Context if needed
if not webui_manager.bu_browser_context:
logger.info("Creating new browser context.")
context_config = CustomBrowserContextConfig(
context_config = BrowserContextConfig(
trace_path=save_trace_path if save_trace_path else None,
save_recording_path=save_recording_path
if save_recording_path
else None,
save_downloads_path=save_download_path if save_download_path else None,
window_width=window_w,
window_height=window_h,
browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
)
if not webui_manager.bu_browser:
raise ValueError("Browser not initialized, cannot create context.")
@@ -513,7 +512,7 @@ async def run_agent_task(
# Pass the webui_manager to callbacks when wrapping them
async def step_callback_wrapper(
state: BrowserState, output: AgentOutput, step_num: int
state: BrowserState, output: AgentOutput, step_num: int
):
await _handle_new_step(webui_manager, state, output, step_num)
@@ -582,7 +581,7 @@ async def run_agent_task(
await asyncio.sleep(0.2)
if (
agent_task.done() or is_stopped
agent_task.done() or is_stopped
): # If stopped or task finished while paused
break
@@ -633,8 +632,8 @@ async def run_agent_task(
yield update_dict
# Wait until response is submitted or task finishes
while (
webui_manager.bu_response_event is not None
and not agent_task.done()
webui_manager.bu_response_event is not None
and not agent_task.done()
):
await asyncio.sleep(0.2)
# Restore UI after response submitted or if task ended unexpectedly
@@ -716,9 +715,9 @@ async def run_agent_task(
except asyncio.CancelledError:
logger.info("Agent task was cancelled.")
if not any(
"Cancelled" in msg.get("content", "")
for msg in webui_manager.bu_chat_history
if msg.get("role") == "assistant"
"Cancelled" in msg.get("content", "")
for msg in webui_manager.bu_chat_history
if msg.get("role") == "assistant"
):
webui_manager.bu_chat_history.append(
{"role": "assistant", "content": "**Task Cancelled**."}
@@ -730,9 +729,9 @@ async def run_agent_task(
f"**Agent Execution Error:**\n```\n{type(e).__name__}: {e}\n```"
)
if not any(
error_message in msg.get("content", "")
for msg in webui_manager.bu_chat_history
if msg.get("role") == "assistant"
error_message in msg.get("content", "")
for msg in webui_manager.bu_chat_history
if msg.get("role") == "assistant"
):
webui_manager.bu_chat_history.append(
{"role": "assistant", "content": error_message}
@@ -788,7 +787,7 @@ async def run_agent_task(
clear_button_comp: gr.update(interactive=True),
chatbot_comp: gr.update(
value=webui_manager.bu_chat_history
+ [{"role": "assistant", "content": f"**Setup Error:** {e}"}]
+ [{"role": "assistant", "content": f"**Setup Error:** {e}"}]
),
}
@@ -797,7 +796,7 @@ async def run_agent_task(
async def handle_submit(
webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]
webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]
):
"""Handles clicks on the main 'Submit' button."""
user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input")
@@ -1048,7 +1047,7 @@ def create_browser_use_agent_tab(webui_manager: WebuiManager):
run_tab_outputs = list(tab_components.values())
async def submit_wrapper(
components_dict: Dict[Component, Any],
components_dict: Dict[Component, Any],
) -> AsyncGenerator[Dict[Component, Any], None]:
"""Wrapper for handle_submit that yields its results."""
async for update in handle_submit(webui_manager, components_dict):

View File

@@ -116,7 +116,7 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon
# LLM Config (from agent_settings tab)
llm_provider_name = get_setting("agent_settings", "llm_provider")
llm_model_name = get_setting("agent_settings", "llm_model_name")
llm_temperature = get_setting("agent_settings", "llm_temperature", 0.5) # Default if not found
llm_temperature = max(get_setting("agent_settings", "llm_temperature", 0.5), 0.5)
llm_base_url = get_setting("agent_settings", "llm_base_url")
llm_api_key = get_setting("agent_settings", "llm_api_key")
ollama_num_ctx = get_setting("agent_settings", "ollama_num_ctx")
@@ -132,7 +132,7 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon
# Note: DeepResearchAgent constructor takes a dict, not full Browser/Context objects
browser_config_dict = {
"headless": get_setting("browser_settings", "headless", False),
"disable_security": get_setting("browser_settings", "disable_security", True),
"disable_security": get_setting("browser_settings", "disable_security", False),
"browser_binary_path": get_setting("browser_settings", "browser_binary_path"),
"user_data_dir": get_setting("browser_settings", "browser_user_data_dir"),
"window_width": int(get_setting("browser_settings", "window_w", 1280)),

View File

@@ -26,9 +26,9 @@ async def test_browser_use_agent():
from browser_use.agent.service import Agent
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import CustomBrowserContextConfig
from src.controller.custom_controller import CustomController
from src.utils import llm_provider
from src.agent.browser_use.browser_use_agent import BrowserUseAgent
# llm = utils.get_llm_model(
# provider="openai",
@@ -77,15 +77,15 @@ async def test_browser_use_agent():
mcp_server_config = {
"mcpServers": {
"markitdown": {
"command": "docker",
"args": [
"run",
"--rm",
"-i",
"markitdown-mcp:latest"
]
},
# "markitdown": {
# "command": "docker",
# "args": [
# "run",
# "--rm",
# "-i",
# "markitdown-mcp:latest"
# ]
# },
"desktop-commander": {
"command": "npx",
"args": [
@@ -97,8 +97,8 @@ async def test_browser_use_agent():
}
controller = CustomController()
await controller.setup_mcp_client(mcp_server_config)
use_own_browser = False
disable_security = True
use_own_browser = True
disable_security = False
use_vision = True # Set to False when using DeepSeek
max_actions_per_step = 10
@@ -125,7 +125,7 @@ async def test_browser_use_agent():
)
)
browser_context = await browser.new_context(
config=CustomBrowserContextConfig(
config=BrowserContextConfig(
trace_path="./tmp/traces",
save_recording_path="./tmp/record_videos",
save_downloads_path="./tmp/downloads",
@@ -135,8 +135,9 @@ async def test_browser_use_agent():
force_new_context=True
)
)
agent = Agent(
task="download pdf from https://arxiv.org/abs/2504.10458 and rename this pdf to 'GUI-r1-test.pdf'",
agent = BrowserUseAgent(
# task="download pdf from https://arxiv.org/pdf/2311.16498 and rename this pdf to 'mcp-test.pdf'",
task="give me nvidia stock price",
llm=llm,
browser=browser,
browser_context=browser_context,
@@ -153,7 +154,6 @@ async def test_browser_use_agent():
print("\nErrors:")
pprint(history.errors(), indent=4)
except Exception:
import traceback
traceback.print_exc()
@@ -182,9 +182,9 @@ async def test_browser_use_parallel():
from browser_use.agent.service import Agent
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import CustomBrowserContextConfig
from src.controller.custom_controller import CustomController
from src.utils import llm_provider
from src.agent.browser_use.browser_use_agent import BrowserUseAgent
# llm = utils.get_llm_model(
# provider="openai",
@@ -233,15 +233,15 @@ async def test_browser_use_parallel():
mcp_server_config = {
"mcpServers": {
"markitdown": {
"command": "docker",
"args": [
"run",
"--rm",
"-i",
"markitdown-mcp:latest"
]
},
# "markitdown": {
# "command": "docker",
# "args": [
# "run",
# "--rm",
# "-i",
# "markitdown-mcp:latest"
# ]
# },
"desktop-commander": {
"command": "npx",
"args": [
@@ -262,7 +262,7 @@ async def test_browser_use_parallel():
controller = CustomController()
await controller.setup_mcp_client(mcp_server_config)
use_own_browser = False
disable_security = True
disable_security = False
use_vision = True # Set to False when using DeepSeek
max_actions_per_step = 10
@@ -289,7 +289,7 @@ async def test_browser_use_parallel():
)
)
browser_context = await browser.new_context(
config=CustomBrowserContextConfig(
config=BrowserContextConfig(
trace_path="./tmp/traces",
save_recording_path="./tmp/record_videos",
save_downloads_path="./tmp/downloads",
@@ -300,7 +300,7 @@ async def test_browser_use_parallel():
)
)
agents = [
Agent(task=task, llm=llm, browser=browser, controller=controller)
BrowserUseAgent(task=task, llm=llm, browser=browser, controller=controller)
for task in [
'Search Google for weather in Tokyo',
# 'Check Reddit front page title',
@@ -332,6 +332,8 @@ async def test_browser_use_parallel():
await browser_context.close()
if browser:
await browser.close()
if controller:
await controller.close_mcp_client()
async def test_deep_research_agent():
@@ -362,8 +364,8 @@ async def test_deep_research_agent():
browser_config = {"headless": False, "window_width": 1280, "window_height": 1100, "use_own_browser": False}
agent = DeepResearchAgent(llm=llm, browser_config=browser_config, mcp_server_config=mcp_server_config)
research_topic = "Impact of Microplastics on Marine Ecosystems"
task_id_to_resume = "815460fb-337a-4850-8fa4-a5f2db301a89" # Set this to resume a previous task ID
research_topic = "Give me a detailed travel plan to Switzerland from June 1st to 10th."
task_id_to_resume = "" # Set this to resume a previous task ID
print(f"Starting research on: {research_topic}")

View File

@@ -14,20 +14,31 @@ async def test_mcp_client():
from src.utils.mcp_client import setup_mcp_client_and_tools, create_tool_param_model
test_server_config = {
"playwright": {
"command": "npx",
"args": [
"@playwright/mcp@latest",
],
"transport": "stdio",
},
"filesystem": {
"command": "npx",
"args": [
"-y",
"@modelcontextprotocol/server-filesystem",
"/Users/warmshao/ai_workspace",
]
"mcpServers": {
# "markitdown": {
# "command": "docker",
# "args": [
# "run",
# "--rm",
# "-i",
# "markitdown-mcp:latest"
# ]
# },
"desktop-commander": {
"command": "npx",
"args": [
"-y",
"@wonderwhy-er/desktop-commander"
]
},
# "filesystem": {
# "command": "npx",
# "args": [
# "-y",
# "@modelcontextprotocol/server-filesystem",
# "/Users/xxx/ai_workspace",
# ]
# },
}
}
@@ -48,15 +59,15 @@ async def test_controller_with_mcp():
mcp_server_config = {
"mcpServers": {
"markitdown": {
"command": "docker",
"args": [
"run",
"--rm",
"-i",
"markitdown-mcp:latest"
]
},
# "markitdown": {
# "command": "docker",
# "args": [
# "run",
# "--rm",
# "-i",
# "markitdown-mcp:latest"
# ]
# },
"desktop-commander": {
"command": "npx",
"args": [