Merge pull request #553 from vvincent1234/feat/mcp

Enable browser-use to support MCP
This commit is contained in:
warmshao
2025-05-01 09:46:15 +08:00
committed by GitHub
34 changed files with 4732 additions and 3417 deletions

1
.gitignore vendored
View File

@@ -187,3 +187,4 @@ data/
# For Config Files (Current Settings)
.config.pkl
*.pdf

View File

@@ -1,8 +1,10 @@
browser-use==0.1.40
browser-use==0.1.41
pyperclip==1.9.0
gradio==5.23.1
gradio==5.27.0
json-repair
langchain-mistralai==0.2.4
langchain-google-genai==2.0.8
MainContentExtractor==0.0.4
langchain-ibm==0.3.10
langchain-ibm==0.3.10
langchain_mcp_adapters==0.0.9
langgraph==0.3.34
langchain-community

View File

@@ -0,0 +1,178 @@
from __future__ import annotations
import asyncio
import gc
import inspect
import json
import logging
import os
import re
import time
from pathlib import Path
from typing import Any, Awaitable, Callable, Dict, Generic, List, Optional, TypeVar, Union
from dotenv import load_dotenv
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import (
BaseMessage,
HumanMessage,
SystemMessage,
)
# from lmnr.sdk.decorators import observe
from pydantic import BaseModel, ValidationError
from browser_use.agent.gif import create_history_gif
from browser_use.agent.memory.service import Memory, MemorySettings
from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings
from browser_use.agent.message_manager.utils import convert_input_messages, extract_json_from_model_output, save_conversation
from browser_use.agent.prompts import AgentMessagePrompt, PlannerPrompt, SystemPrompt
from browser_use.agent.views import (
REQUIRED_LLM_API_ENV_VARS,
ActionResult,
AgentError,
AgentHistory,
AgentHistoryList,
AgentOutput,
AgentSettings,
AgentState,
AgentStepInfo,
StepMetadata,
ToolCallingMethod,
)
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext
from browser_use.browser.views import BrowserState, BrowserStateHistory
from browser_use.controller.registry.views import ActionModel
from browser_use.controller.service import Controller
from browser_use.dom.history_tree_processor.service import (
DOMHistoryElement,
HistoryTreeProcessor,
)
from browser_use.exceptions import LLMException
from browser_use.telemetry.service import ProductTelemetry
from browser_use.telemetry.views import (
AgentEndTelemetryEvent,
AgentRunTelemetryEvent,
AgentStepTelemetryEvent,
)
from browser_use.utils import check_env_variables, time_execution_async, time_execution_sync
from browser_use.agent.service import Agent, AgentHookFunc
load_dotenv()
logger = logging.getLogger(__name__)
# True when the env var starts with 't', 'y' or '1' (case-insensitive).
# An unset or empty variable yields False; the previous `[0]` indexing raised
# IndexError when the variable was defined but empty.
SKIP_LLM_API_KEY_VERIFICATION = (
    os.environ.get('SKIP_LLM_API_KEY_VERIFICATION', 'false').lower().startswith(('t', 'y', '1'))
)
class BrowserUseAgent(Agent):
    """``Agent`` subclass that overrides ``run`` with a polling pause/stop loop."""

    @time_execution_async('--run (agent)')
    async def run(
        self,
        max_steps: int = 100,
        on_step_start: AgentHookFunc | None = None,
        on_step_end: AgentHookFunc | None = None,
    ) -> AgentHistoryList:
        """Execute the task with maximum number of steps

        Args:
            max_steps: Upper bound on the number of ``step`` calls.
            on_step_start: Optional async hook awaited with the agent before each step.
            on_step_end: Optional async hook awaited with the agent after each step.

        Returns:
            ``self.state.history`` as it stands when the loop ends, the agent is
            stopped, or a ``KeyboardInterrupt`` is caught.
        """
        # Inside a coroutine the running loop is the one we want; get_event_loop()
        # is deprecated for this use.
        loop = asyncio.get_running_loop()

        # Set up the Ctrl+C signal handler with callbacks specific to this agent
        from browser_use.utils import SignalHandler

        signal_handler = SignalHandler(
            loop=loop,
            pause_callback=self.pause,
            resume_callback=self.resume,
            custom_exit_callback=None,  # No special cleanup needed on forced exit
            exit_on_second_int=True,
        )
        signal_handler.register()

        # Wait for verification task to complete if it exists
        if hasattr(self, '_verification_task') and not self._verification_task.done():
            try:
                await self._verification_task
            except Exception:
                # Error already logged in the task
                pass

        try:
            self._log_agent_run()

            # Execute initial actions if provided
            if self.initial_actions:
                result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
                self.state.last_result = result

            for step in range(max_steps):
                # Idle while paused (e.g. after Ctrl+C). A stop request ends the wait
                # immediately so the stop check below can terminate the run; previously
                # a second pause loop could fall through and execute another step.
                while self.state.paused and not self.state.stopped:
                    await asyncio.sleep(0.2)  # Small delay to prevent CPU spinning

                # Check if we should stop due to too many failures
                if self.state.consecutive_failures >= self.settings.max_failures:
                    logger.error(f'❌ Stopping due to {self.settings.max_failures} consecutive failures')
                    break

                # Check control flags before each step
                if self.state.stopped:
                    logger.info('Agent stopped')
                    break

                if on_step_start is not None:
                    await on_step_start(self)

                step_info = AgentStepInfo(step_number=step, max_steps=max_steps)
                await self.step(step_info)

                if on_step_end is not None:
                    await on_step_end(self)

                if self.state.history.is_done():
                    if self.settings.validate_output and step < max_steps - 1:
                        # Output rejected by validation: keep stepping
                        if not await self._validate_output():
                            continue

                    await self.log_completion()
                    break
            else:
                # Loop exhausted without a `break`
                logger.info('❌ Failed to complete task in maximum steps')

            return self.state.history

        except KeyboardInterrupt:
            # Already handled by our signal handler, but catch any direct KeyboardInterrupt as well
            logger.info('Got KeyboardInterrupt during execution, returning current history')
            return self.state.history

        finally:
            # Unregister signal handlers before cleanup
            signal_handler.unregister()

            self.telemetry.capture(
                AgentEndTelemetryEvent(
                    agent_id=self.state.agent_id,
                    is_done=self.state.history.is_done(),
                    success=self.state.history.is_successful(),
                    steps=self.state.n_steps,
                    max_steps_reached=self.state.n_steps >= max_steps,
                    errors=self.state.history.errors(),
                    total_input_tokens=self.state.history.total_input_tokens(),
                    total_duration_seconds=self.state.history.total_duration_seconds(),
                )
            )

            await self.close()

            if self.settings.generate_gif:
                # `generate_gif` may be True (default file name) or an explicit path
                output_path: str = 'agent_history.gif'
                if isinstance(self.settings.generate_gif, str):
                    output_path = self.settings.generate_gif

                create_history_gif(task=self.task, history=self.state.history, output_path=output_path)

View File

@@ -1,478 +0,0 @@
import json
import logging
import pdb
import traceback
from typing import Any, Awaitable, Callable, Dict, Generic, List, Optional, Type, TypeVar
from PIL import Image, ImageDraw, ImageFont
import os
import base64
import io
import asyncio
import time
import platform
from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
from browser_use.agent.service import Agent
from browser_use.agent.message_manager.utils import convert_input_messages, extract_json_from_model_output, \
save_conversation
from browser_use.agent.views import (
ActionResult,
AgentError,
AgentHistory,
AgentHistoryList,
AgentOutput,
AgentSettings,
AgentState,
AgentStepInfo,
StepMetadata,
ToolCallingMethod,
)
from browser_use.agent.gif import create_history_gif
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext
from browser_use.browser.views import BrowserStateHistory
from browser_use.controller.service import Controller
from browser_use.telemetry.views import (
AgentEndTelemetryEvent,
AgentRunTelemetryEvent,
AgentStepTelemetryEvent,
)
from browser_use.utils import time_execution_async
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import (
BaseMessage,
HumanMessage,
AIMessage
)
from browser_use.browser.views import BrowserState, BrowserStateHistory
from browser_use.agent.prompts import PlannerPrompt
from json_repair import repair_json
from src.utils.agent_state import AgentState
from .custom_message_manager import CustomMessageManager, CustomMessageManagerSettings
from .custom_views import CustomAgentOutput, CustomAgentStepInfo, CustomAgentState
logger = logging.getLogger(__name__)
Context = TypeVar('Context')
class CustomAgent(Agent):
    """``Agent`` subclass using the custom message manager, output model and memory.

    Compared to the base class (as far as this file shows) it:
    * keeps a running ``memory`` string on the step info (``update_step_info``),
    * accumulates extracted page content in ``self.state.extracted_content``,
    * parses the LLM reply as repaired JSON in ``get_next_action``.
    """

    # Element attributes forwarded to the base agent when the caller gives none.
    # Stored as a tuple so the default can never be mutated across instances.
    _DEFAULT_INCLUDE_ATTRIBUTES = (
        'title',
        'type',
        'name',
        'role',
        'aria-label',
        'placeholder',
        'value',
        'alt',
        'aria-expanded',
        'data-date-format',
    )

    def __init__(
            self,
            task: str,
            llm: BaseChatModel,
            add_infos: str = "",
            # Optional parameters
            browser: Browser | None = None,
            browser_context: BrowserContext | None = None,
            controller: Optional[Controller[Context]] = None,
            # Initial agent run parameters
            sensitive_data: Optional[Dict[str, str]] = None,
            initial_actions: Optional[List[Dict[str, Dict[str, Any]]]] = None,
            # Cloud Callbacks
            register_new_step_callback: Callable[['BrowserState', 'AgentOutput', int], Awaitable[None]] | None = None,
            register_done_callback: Callable[['AgentHistoryList'], Awaitable[None]] | None = None,
            register_external_agent_status_raise_error_callback: Callable[[], Awaitable[bool]] | None = None,
            # Agent settings
            use_vision: bool = True,
            use_vision_for_planner: bool = False,
            save_conversation_path: Optional[str] = None,
            save_conversation_path_encoding: Optional[str] = 'utf-8',
            max_failures: int = 3,
            retry_delay: int = 10,
            system_prompt_class: Type[SystemPrompt] = SystemPrompt,
            agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt,
            max_input_tokens: int = 128000,
            validate_output: bool = False,
            message_context: Optional[str] = None,
            generate_gif: bool | str = False,
            available_file_paths: Optional[list[str]] = None,
            include_attributes: Optional[list[str]] = None,
            max_actions_per_step: int = 10,
            tool_calling_method: Optional[ToolCallingMethod] = 'auto',
            page_extraction_llm: Optional[BaseChatModel] = None,
            planner_llm: Optional[BaseChatModel] = None,
            planner_interval: int = 1,  # Run planner every N steps
            # Inject state
            injected_agent_state: Optional[AgentState] = None,
            context: Context | None = None,
    ):
        """Build the base agent, then install the custom state and message manager.

        ``controller`` and ``include_attributes`` default to ``None`` and are
        replaced by a fresh ``Controller()`` / a copy of the default attribute
        list per instance, so no default object is shared between agents.
        ``add_infos`` is stored and later passed into the step info by ``run``.
        All other arguments are forwarded to ``Agent.__init__`` unchanged,
        except ``agent_prompt_class`` which only goes to the message manager.
        """
        if controller is None:
            controller = Controller()
        if include_attributes is None:
            include_attributes = list(self._DEFAULT_INCLUDE_ATTRIBUTES)

        super().__init__(
            task=task,
            llm=llm,
            browser=browser,
            browser_context=browser_context,
            controller=controller,
            sensitive_data=sensitive_data,
            initial_actions=initial_actions,
            register_new_step_callback=register_new_step_callback,
            register_done_callback=register_done_callback,
            register_external_agent_status_raise_error_callback=register_external_agent_status_raise_error_callback,
            use_vision=use_vision,
            use_vision_for_planner=use_vision_for_planner,
            save_conversation_path=save_conversation_path,
            save_conversation_path_encoding=save_conversation_path_encoding,
            max_failures=max_failures,
            retry_delay=retry_delay,
            system_prompt_class=system_prompt_class,
            max_input_tokens=max_input_tokens,
            validate_output=validate_output,
            message_context=message_context,
            generate_gif=generate_gif,
            available_file_paths=available_file_paths,
            include_attributes=include_attributes,
            max_actions_per_step=max_actions_per_step,
            tool_calling_method=tool_calling_method,
            page_extraction_llm=page_extraction_llm,
            planner_llm=planner_llm,
            planner_interval=planner_interval,
            injected_agent_state=injected_agent_state,
            context=context,
        )
        # Replace whatever state the base class created with the custom state model
        self.state = injected_agent_state or CustomAgentState()
        self.add_infos = add_infos
        self._message_manager = CustomMessageManager(
            task=task,
            system_message=self.settings.system_prompt_class(
                self.available_actions,
                max_actions_per_step=self.settings.max_actions_per_step,
            ).get_system_message(),
            settings=CustomMessageManagerSettings(
                max_input_tokens=self.settings.max_input_tokens,
                include_attributes=self.settings.include_attributes,
                message_context=self.settings.message_context,
                sensitive_data=sensitive_data,
                available_file_paths=self.settings.available_file_paths,
                agent_prompt_class=agent_prompt_class
            ),
            state=self.state.message_manager_state,
        )

    def _log_response(self, response: CustomAgentOutput) -> None:
        """Log the model's response"""
        if "Success" in response.current_state.evaluation_previous_goal:
            emoji = ""
        elif "Failed" in response.current_state.evaluation_previous_goal:
            emoji = ""
        else:
            emoji = "🤷"

        logger.info(f"{emoji} Eval: {response.current_state.evaluation_previous_goal}")
        logger.info(f"🧠 New Memory: {response.current_state.important_contents}")
        logger.info(f"🤔 Thought: {response.current_state.thought}")
        logger.info(f"🎯 Next Goal: {response.current_state.next_goal}")
        for i, action in enumerate(response.action):
            logger.info(
                f"🛠️ Action {i + 1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}"
            )

    def _setup_action_models(self) -> None:
        """Setup dynamic action models from controller's registry"""
        # Get the dynamic action model from controller's registry
        self.ActionModel = self.controller.registry.create_action_model()
        # Create output model with the dynamic actions
        self.AgentOutput = CustomAgentOutput.type_with_custom_actions(self.ActionModel)

    def update_step_info(
            self, model_output: CustomAgentOutput, step_info: Optional[CustomAgentStepInfo] = None
    ):
        """Advance the step counter and append new important contents to the memory.

        Does nothing when ``step_info`` is ``None``. Contents are appended only
        when non-empty, not containing the text "None", and not already present
        in ``step_info.memory``.
        """
        if step_info is None:
            return

        step_info.step_number += 1
        important_contents = model_output.current_state.important_contents
        if (
                important_contents
                and "None" not in important_contents
                and important_contents not in step_info.memory
        ):
            step_info.memory += important_contents + "\n"

        logger.info(f"🧠 All Memory: \n{step_info.memory}")

    @time_execution_async("--get_next_action")
    async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
        """Get next action from LLM based on current state

        Raises:
            ValueError: If the reply cannot be parsed into ``self.AgentOutput``;
                the underlying error is attached as ``__cause__``.
        """
        fixed_input_messages = self._convert_input_messages(input_messages)
        # Use the async API (as the planner does) so the event loop is not
        # blocked for the duration of the LLM call.
        ai_message = await self.llm.ainvoke(fixed_input_messages)
        self.message_manager._add_message_with_tokens(ai_message)

        if hasattr(ai_message, "reasoning_content"):
            logger.info("🤯 Start Deep Thinking: ")
            logger.info(ai_message.reasoning_content)
            logger.info("🤯 End Deep Thinking")

        try:
            ai_content = ai_message.content
            if isinstance(ai_content, list):
                # Multi-part content: only the first part is used
                ai_content = ai_content[0]
            if isinstance(ai_content, dict):
                # Content block form, e.g. {'type': 'text', 'text': '...'}
                ai_content = ai_content.get("text", "")

            ai_content = ai_content.replace("```json", "").replace("```", "")
            ai_content = repair_json(ai_content)
            parsed_json = json.loads(ai_content)
            parsed: AgentOutput = self.AgentOutput(**parsed_json)
        except Exception as e:
            traceback.print_exc()
            logger.debug(ai_message.content)
            # Chain the cause so the real parsing failure stays visible
            raise ValueError('Could not parse response.') from e

        # cut the number of actions to max_actions_per_step if needed
        if len(parsed.action) > self.settings.max_actions_per_step:
            parsed.action = parsed.action[: self.settings.max_actions_per_step]
        self._log_response(parsed)
        return parsed

    async def _run_planner(self) -> Optional[str]:
        """Run the planner to analyze state and suggest next steps"""
        # Skip planning if no planner_llm is set
        if not self.settings.planner_llm:
            return None

        # Create planner message history using full message history
        planner_messages = [
            PlannerPrompt(self.controller.registry.get_prompt_description()).get_system_message(),
            *self.message_manager.get_messages()[1:],  # Use full message history except the first
        ]

        if not self.settings.use_vision_for_planner and self.settings.use_vision:
            last_state_message: HumanMessage = planner_messages[-1]
            # remove image from last state message
            new_msg = ''
            if isinstance(last_state_message.content, list):
                for msg in last_state_message.content:
                    if msg['type'] == 'text':
                        new_msg += msg['text']
                    elif msg['type'] == 'image_url':
                        continue
            else:
                new_msg = last_state_message.content

            planner_messages[-1] = HumanMessage(content=new_msg)

        # Get planner output
        response = await self.settings.planner_llm.ainvoke(planner_messages)
        plan = str(response.content)

        # Append the plan to the text of the most recent state message so the
        # acting LLM sees it on its next call
        last_state_message = self.message_manager.get_messages()[-1]
        if isinstance(last_state_message, HumanMessage):
            if isinstance(last_state_message.content, list):
                for msg in last_state_message.content:
                    if msg['type'] == 'text':
                        msg['text'] += f"\nPlanning Agent outputs plans:\n {plan}\n"
            else:
                last_state_message.content += f"\nPlanning Agent outputs plans:\n {plan}\n "

        try:
            plan_json = json.loads(plan.replace("```json", "").replace("```", ""))
            logger.info(f'📋 Plans:\n{json.dumps(plan_json, indent=4)}')

            if hasattr(response, "reasoning_content"):
                logger.info("🤯 Start Planning Deep Thinking: ")
                logger.info(response.reasoning_content)
                logger.info("🤯 End Planning Deep Thinking")

        except json.JSONDecodeError:
            # Plan is not JSON: log it verbatim
            logger.info(f'📋 Plans:\n{plan}')
        except Exception as e:
            logger.debug(f'Error parsing planning analysis: {e}')
            logger.info(f'📋 Plans: {plan}')
        return plan

    @time_execution_async("--step")
    async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
        """Execute one step of the task"""
        logger.info(f"\n📍 Step {self.state.n_steps}")
        state = None
        model_output = None
        result: list[ActionResult] = []
        step_start_time = time.time()
        tokens = 0

        try:
            state = await self.browser_context.get_state()
            await self._raise_if_stopped_or_paused()

            self.message_manager.add_state_message(
                state, self.state.last_action, self.state.last_result, step_info, self.settings.use_vision
            )

            # Run planner at specified intervals if planner is configured
            if self.settings.planner_llm and self.state.n_steps % self.settings.planner_interval == 0:
                await self._run_planner()
            input_messages = self.message_manager.get_messages()
            tokens = self._message_manager.state.history.current_tokens

            try:
                model_output = await self.get_next_action(input_messages)
                self.update_step_info(model_output, step_info)
                self.state.n_steps += 1

                if self.register_new_step_callback:
                    await self.register_new_step_callback(state, model_output, self.state.n_steps)
                if self.settings.save_conversation_path:
                    target = self.settings.save_conversation_path + f'_{self.state.n_steps}.txt'
                    save_conversation(input_messages, model_output, target,
                                      self.settings.save_conversation_path_encoding)

                if self.model_name != "deepseek-reasoner":
                    # remove prev message
                    self.message_manager._remove_state_message_by_index(-1)
                await self._raise_if_stopped_or_paused()
            except Exception:
                # model call failed, remove last state message from history
                self.message_manager._remove_state_message_by_index(-1)
                raise

            result = await self.multi_act(model_output.action)
            for ret_ in result:
                if ret_.extracted_content and "Extracted page" in ret_.extracted_content:
                    # record every extracted page (de-duplicated on its first 100 characters)
                    if ret_.extracted_content[:100] not in self.state.extracted_content:
                        self.state.extracted_content += ret_.extracted_content
            self.state.last_result = result
            self.state.last_action = model_output.action
            if len(result) > 0 and result[-1].is_done:
                if not self.state.extracted_content:
                    # Fall back to the accumulated memory; step_info is optional,
                    # so guard against None instead of raising AttributeError
                    self.state.extracted_content = step_info.memory if step_info is not None else ''
                result[-1].extracted_content = self.state.extracted_content
                logger.info(f"📄 Result: {result[-1].extracted_content}")

            self.state.consecutive_failures = 0

        except InterruptedError:
            logger.debug('Agent paused')
            self.state.last_result = [
                ActionResult(
                    error='The agent was paused - now continuing actions might need to be repeated',
                    include_in_memory=True
                )
            ]
            return

        except Exception as e:
            result = await self._handle_step_error(e)
            self.state.last_result = result

        finally:
            step_end_time = time.time()
            actions = [a.model_dump(exclude_unset=True) for a in model_output.action] if model_output else []
            self.telemetry.capture(
                AgentStepTelemetryEvent(
                    agent_id=self.state.agent_id,
                    step=self.state.n_steps,
                    actions=actions,
                    consecutive_failures=self.state.consecutive_failures,
                    step_error=[r.error for r in result if r.error] if result else ['No result'],
                )
            )
            # Deliberately no `return` here: returning from `finally` would swallow
            # in-flight exceptions such as task cancellation.
            if result and state:
                metadata = StepMetadata(
                    step_number=self.state.n_steps,
                    step_start_time=step_start_time,
                    step_end_time=step_end_time,
                    input_tokens=tokens,
                )
                self._make_history_item(model_output, state, result, metadata)

    async def run(self, max_steps: int = 100) -> AgentHistoryList:
        """Execute the task with maximum number of steps"""
        try:
            self._log_agent_run()

            # Execute initial actions if provided
            if self.initial_actions:
                result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
                self.state.last_result = result

            step_info = CustomAgentStepInfo(
                task=self.task,
                add_infos=self.add_infos,
                step_number=1,
                max_steps=max_steps,
                memory="",
            )

            for step in range(max_steps):
                # Check if we should stop due to too many failures
                if self.state.consecutive_failures >= self.settings.max_failures:
                    logger.error(f'❌ Stopping due to {self.settings.max_failures} consecutive failures')
                    break

                # Wait out a pause first; a stop request ends the wait so the stop
                # check below terminates the run instead of executing another step.
                while self.state.paused and not self.state.stopped:
                    await asyncio.sleep(0.2)  # Small delay to prevent CPU spinning

                # Check control flags before each step
                if self.state.stopped:
                    logger.info('Agent stopped')
                    break

                await self.step(step_info)

                if self.state.history.is_done():
                    if self.settings.validate_output and step < max_steps - 1:
                        if not await self._validate_output():
                            continue

                    await self.log_completion()
                    break
            else:
                logger.info("❌ Failed to complete task in maximum steps")
                # Attach what was gathered to the last recorded result, if any exists
                # (the history is empty when no step ever produced a result).
                history_items = self.state.history.history
                if history_items and history_items[-1].result:
                    last_result = history_items[-1].result[-1]
                    if not self.state.extracted_content:
                        last_result.extracted_content = step_info.memory
                    else:
                        last_result.extracted_content = self.state.extracted_content

            return self.state.history

        finally:
            self.telemetry.capture(
                AgentEndTelemetryEvent(
                    agent_id=self.state.agent_id,
                    is_done=self.state.history.is_done(),
                    success=self.state.history.is_successful(),
                    steps=self.state.n_steps,
                    max_steps_reached=self.state.n_steps >= max_steps,
                    errors=self.state.history.errors(),
                    total_input_tokens=self.state.history.total_input_tokens(),
                    total_duration_seconds=self.state.history.total_duration_seconds(),
                )
            )

            # Only close resources this agent created itself
            if not self.injected_browser_context:
                await self.browser_context.close()

            if not self.injected_browser and self.browser:
                await self.browser.close()

            if self.settings.generate_gif:
                output_path: str = 'agent_history.gif'
                if isinstance(self.settings.generate_gif, str):
                    output_path = self.settings.generate_gif

                create_history_gif(task=self.task, history=self.state.history, output_path=output_path)

View File

@@ -1,111 +0,0 @@
from __future__ import annotations
import logging
import pdb
from typing import List, Optional, Type, Dict
from browser_use.agent.message_manager.service import MessageManager
from browser_use.agent.message_manager.views import MessageHistory
from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
from browser_use.agent.views import ActionResult, AgentStepInfo, ActionModel
from browser_use.browser.views import BrowserState
from browser_use.agent.message_manager.service import MessageManagerSettings
from browser_use.agent.views import ActionResult, AgentOutput, AgentStepInfo, MessageManagerState
from langchain_core.language_models import BaseChatModel
from langchain_anthropic import ChatAnthropic
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import (
AIMessage,
BaseMessage,
HumanMessage,
ToolMessage,
SystemMessage
)
from langchain_openai import ChatOpenAI
from ..utils.llm import DeepSeekR1ChatOpenAI
from .custom_prompts import CustomAgentMessagePrompt
logger = logging.getLogger(__name__)
class CustomMessageManagerSettings(MessageManagerSettings):
    """Message-manager settings extended with the prompt class used for state messages."""

    # Class instantiated by `CustomMessageManager.add_state_message` to render the browser state
    agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt
class CustomMessageManager(MessageManager):
    """Message manager that builds its own initial context and state messages."""

    def __init__(
            self,
            task: str,
            system_message: SystemMessage,
            settings: Optional[MessageManagerSettings] = None,
            state: Optional[MessageManagerState] = None,
    ):
        """Create the manager.

        ``settings`` and ``state`` default to fresh instances per manager.
        The previous defaults were created once at definition time, so every
        manager built without arguments shared one mutable state object.
        The default settings are ``CustomMessageManagerSettings`` because
        ``add_state_message`` reads ``settings.agent_prompt_class``.
        """
        super().__init__(
            task=task,
            system_message=system_message,
            settings=settings if settings is not None else CustomMessageManagerSettings(),
            state=state if state is not None else MessageManagerState(),
        )

    def _init_messages(self) -> None:
        """Initialize the message history with system message, context, task, and other initial messages"""
        self._add_message_with_tokens(self.system_prompt)
        self.context_content = ""

        if self.settings.message_context:
            self.context_content += 'Context for the task' + self.settings.message_context

        if self.settings.sensitive_data:
            info = f'Here are placeholders for sensitive data: {list(self.settings.sensitive_data.keys())}'
            info += 'To use them, write <secret>the placeholder name</secret>'
            self.context_content += info

        if self.settings.available_file_paths:
            filepaths_msg = f'Here are file paths you can use: {self.settings.available_file_paths}'
            self.context_content += filepaths_msg

        # The context message is only added when there is something to say
        if self.context_content:
            context_message = HumanMessage(content=self.context_content)
            self._add_message_with_tokens(context_message)

    def cut_messages(self):
        """Drop the oldest non-protected messages until the history fits ``max_input_tokens``.

        The system message is always kept; the context message is kept too, but
        only when one was actually added (``context_content`` non-empty). The
        former ``is not None`` test was always true and therefore also protected
        the first ordinary message when no context message existed.
        """
        diff = self.state.history.current_tokens - self.settings.max_input_tokens
        min_message_len = 2 if self.context_content else 1

        while diff > 0 and len(self.state.history.messages) > min_message_len:
            msg = self.state.history.messages.pop(min_message_len)
            self.state.history.current_tokens -= msg.metadata.tokens
            diff = self.state.history.current_tokens - self.settings.max_input_tokens

    def add_state_message(
            self,
            state: BrowserState,
            actions: Optional[List[ActionModel]] = None,
            result: Optional[List[ActionResult]] = None,
            step_info: Optional[AgentStepInfo] = None,
            use_vision=True,
    ) -> None:
        """Add browser state as human message"""
        # otherwise add state message and result to next message (which will not stay in memory)
        state_message = self.settings.agent_prompt_class(
            state,
            actions,
            result,
            include_attributes=self.settings.include_attributes,
            step_info=step_info,
        ).get_user_message(use_vision)
        self._add_message_with_tokens(state_message)

    def _remove_state_message_by_index(self, remove_ind=-1) -> None:
        """Remove state message by index from history

        Walks the history backwards and removes the ``abs(remove_ind)``-th
        ``HumanMessage`` counted from the end, adjusting the token total.
        """
        i = len(self.state.history.messages) - 1
        remove_cnt = 0
        while i >= 0:
            if isinstance(self.state.history.messages[i].message, HumanMessage):
                remove_cnt += 1
            if remove_cnt == abs(remove_ind):
                msg = self.state.history.messages.pop(i)
                self.state.history.current_tokens -= msg.metadata.tokens
                break
            i -= 1

View File

@@ -1,125 +0,0 @@
import pdb
from typing import List, Optional
from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
from browser_use.agent.views import ActionResult, ActionModel
from browser_use.browser.views import BrowserState
from langchain_core.messages import HumanMessage, SystemMessage
from datetime import datetime
import importlib
from .custom_views import CustomAgentStepInfo
class CustomSystemPrompt(SystemPrompt):
    """System prompt that is rendered from ``custom_system_prompt.md`` in ``src.agent``."""

    def _load_prompt_template(self) -> None:
        """Load the prompt template from the markdown file.

        Raises:
            RuntimeError: If the template cannot be located or read; the
                original error is attached as ``__cause__``.
        """
        # Import the submodule explicitly: a bare `import importlib` does not
        # guarantee that `importlib.resources` has been loaded.
        from importlib import resources

        try:
            # This works both in development and when installed as a package
            template = resources.files('src.agent').joinpath('custom_system_prompt.md')
            # Read as UTF-8 explicitly so the result does not depend on the platform's locale encoding
            self.prompt_template = template.read_text(encoding='utf-8')
        except Exception as e:
            raise RuntimeError(f'Failed to load system prompt template: {e}') from e

    def get_system_message(self) -> SystemMessage:
        """
        Get the system prompt for the agent.

        Returns:
            SystemMessage: Formatted system prompt
        """
        prompt = self.prompt_template.format(
            max_actions=self.max_actions_per_step,
            available_actions=self.default_action_description,
        )

        return SystemMessage(content=prompt)
class CustomAgentMessagePrompt(AgentMessagePrompt):
    """State prompt that adds task, hints, memory and the previous actions to the page description."""

    def __init__(
            self,
            state: BrowserState,
            actions: Optional[List[ActionModel]] = None,
            result: Optional[List[ActionResult]] = None,
            include_attributes: Optional[list[str]] = None,
            step_info: Optional[CustomAgentStepInfo] = None,
    ):
        """Store the previous ``actions`` next to what the base prompt keeps.

        ``include_attributes`` defaults to a new empty list per instance rather
        than one list object shared by every call.
        """
        super().__init__(
            state=state,
            result=result,
            include_attributes=[] if include_attributes is None else include_attributes,
            step_info=step_info,
        )
        self.actions = actions

    def get_user_message(self, use_vision: bool = True) -> HumanMessage:
        """Render the current browser state (and previous step outcome) as a ``HumanMessage``.

        When a screenshot is available and ``use_vision`` is truthy the message
        carries a text part plus an ``image_url`` part; otherwise plain text.
        ``step_info`` is optional: without it the task, hints and memory
        sections are left empty instead of raising ``AttributeError``.
        """
        step_info = self.step_info
        if step_info:
            step_info_description = f'Current step: {step_info.step_number}/{step_info.max_steps}\n'
            task, add_infos, memory = step_info.task, step_info.add_infos, step_info.memory
        else:
            step_info_description = ''
            task = add_infos = memory = ''

        time_str = datetime.now().strftime("%Y-%m-%d %H:%M")
        step_info_description += f"Current date and time: {time_str}"

        elements_text = self.state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes)

        has_content_above = (self.state.pixels_above or 0) > 0
        has_content_below = (self.state.pixels_below or 0) > 0

        if elements_text != '':
            # Frame the element list with scroll hints or page-boundary markers
            if has_content_above:
                elements_text = (
                    f'... {self.state.pixels_above} pixels above - scroll or extract content to see more ...\n{elements_text}'
                )
            else:
                elements_text = f'[Start of page]\n{elements_text}'
            if has_content_below:
                elements_text = (
                    f'{elements_text}\n... {self.state.pixels_below} pixels below - scroll or extract content to see more ...'
                )
            else:
                elements_text = f'{elements_text}\n[End of page]'
        else:
            elements_text = 'empty page'

        state_description = f"""
{step_info_description}
1. Task: {task}.
2. Hints(Optional):
{add_infos}
3. Memory:
{memory}
4. Current url: {self.state.url}
5. Available tabs:
{self.state.tabs}
6. Interactive elements:
{elements_text}
"""

        if self.actions and self.result:
            state_description += "\n **Previous Actions** \n"
            if step_info:
                state_description += f'Previous step: {step_info.step_number - 1}/{step_info.max_steps} \n'
            total = len(self.result)
            # zip pairs each result with its action and cannot index past the end of `actions`
            for i, (action, result) in enumerate(zip(self.actions, self.result)):
                state_description += f"Previous action {i + 1}/{total}: {action.model_dump_json(exclude_unset=True)}\n"
                if result.error:
                    # only the last line of the error is reported
                    error = result.error.split('\n')[-1]
                    state_description += (
                        f"Error of previous action {i + 1}/{total}: ...{error}\n"
                    )
                if result.include_in_memory:
                    if result.extracted_content:
                        state_description += f"Result of previous action {i + 1}/{total}: {result.extracted_content}\n"

        if self.state.screenshot and use_vision:
            # Format message for vision model
            return HumanMessage(
                content=[
                    {'type': 'text', 'text': state_description},
                    {
                        'type': 'image_url',
                        'image_url': {'url': f'data:image/png;base64,{self.state.screenshot}'},
                    },
                ]
            )

        return HumanMessage(content=state_description)

View File

@@ -1,80 +0,0 @@
You are an AI agent designed to automate browser tasks. Your goal is to accomplish the ultimate task following the rules.
# Input Format
Task
Previous steps
Current URL
Open Tabs
Interactive Elements
[index]<type>text</type>
- index: Numeric identifier for interaction
- type: HTML element type (button, input, etc.)
- text: Element description
Example:
[33]<button>Submit Form</button>
- Only elements with numeric indexes in [] are interactive
- elements without [] provide only context
# Response Rules
1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
{{
"current_state": {{
"evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Mention if something unexpected happened. Shortly state why/why not.",
"important_contents": "Output important contents closely related to user\'s instruction on the current page. If there is, please output the contents. If not, please output empty string ''.",
"thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If your output of evaluation_previous_goal is 'Failed', please reflect and output your reflection here.",
"next_goal": "Please generate a brief natural language description for the goal of your next actions based on your thought."
}},
"action": [
{{"one_action_name": {{// action-specific parameter}}}}, // ... more actions in sequence
]
}}
2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {max_actions} actions per sequence.
Common action sequences:
- Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}]
- Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}]
- Actions are executed in the given order
- If the page changes after an action, the sequence is interrupted and you get the new state.
- Only provide the action sequence until an action which changes the page state significantly.
- Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page
- only use multiple actions if it makes sense.
- Only choose from the available actions below.
3. ELEMENT INTERACTION:
- Only use indexes of the interactive elements
- Elements marked with "[]Non-interactive text" are non-interactive
4. NAVIGATION & ERROR HANDLING:
- If no suitable elements exist, use other functions to complete the task
- If stuck, try alternative approaches - like going back to a previous page, new search, new tab etc.
- Handle popups/cookies by accepting or closing them
- Use scroll to find elements you are looking for
- If you want to research something, open a new tab instead of using the current tab
- If captcha pops up, try to solve it - else try a different approach
- If the page is not fully loaded, use wait action
5. TASK COMPLETION:
- Use the done action as the last action as soon as the ultimate task is complete
- Don't use "done" before you are done with everything the user asked you, unless you reach the last step of max_steps.
- If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished, set success to true. If not everything the user asked for is completed, set success in done to false!
- If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step.
- Don't hallucinate actions
- Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task.
6. VISUAL CONTEXT:
- When an image is provided, use it to understand the page layout
- Bounding boxes with labels on their top right corner correspond to element indexes
7. Form filling:
- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
8. Long tasks:
- Keep track of the status and subresults in the memory.
9. Extraction:
- If your task is to find information - call extract_content on the specific pages to get and store the information.
Your responses must be always JSON with the specified format.
Available Actions:
{available_actions}

View File

@@ -1,67 +0,0 @@
from dataclasses import dataclass
from typing import Any, Dict, List, Literal, Optional, Type
import uuid
from browser_use.agent.views import AgentOutput, AgentState, ActionResult, AgentHistoryList, MessageManagerState
from browser_use.controller.registry.views import ActionModel
from pydantic import BaseModel, ConfigDict, Field, create_model
@dataclass
class CustomAgentStepInfo:
    """Plain record of per-step context for the custom agent.

    All five fields are required positional dataclass fields (no defaults).
    The per-field notes below are inferred from the field names only —
    TODO confirm against the code that builds and consumes this record.
    """

    step_number: int  # presumably the index of the step being executed
    max_steps: int  # presumably the step budget for the whole run
    task: str  # presumably the task text given to the agent
    add_infos: str  # presumably extra hints supplied alongside the task — verify
    memory: str  # presumably memory text carried from step to step — verify
class CustomAgentBrain(BaseModel):
    """Current state of the agent"""

    # NOTE(review): the class docstring above is left untouched on purpose —
    # pydantic can surface a model docstring as its schema description, so
    # rewording it may change what is sent to the LLM. Confirm before editing.
    #
    # All four fields are required strings (no defaults). The meanings below
    # are inferred from the names — TODO confirm against the prompt format.
    evaluation_previous_goal: str  # presumably the verdict on the previous goal
    important_contents: str  # presumably content worth remembering from the page
    thought: str  # presumably free-form reasoning for this step
    next_goal: str  # presumably the goal for the next action(s)
class CustomAgentOutput(AgentOutput):
    """Output model for agent
    @dev note: this model is extended with custom actions in AgentService. You can also use some fields that are not in this model as provided by the linter, as long as they are registered in the DynamicActions model.
    """

    # Replaces the inherited brain type with the four-field custom brain.
    current_state: CustomAgentBrain

    @staticmethod
    def type_with_custom_actions(
            custom_actions: Type[ActionModel],
    ) -> Type["CustomAgentOutput"]:
        """Extend actions with custom actions"""
        # Required field (Ellipsis default): a list of the supplied action model.
        action_field = (
            list[custom_actions],
            Field(..., description='List of actions to execute', json_schema_extra={'min_items': 1}),
        )
        # Derive a new model from CustomAgentOutput that carries the action field
        # and reports the same defining module as the base class.
        extended_model = create_model(
            "CustomAgentOutput",
            __base__=CustomAgentOutput,
            __module__=CustomAgentOutput.__module__,
            action=action_field,
        )
        extended_model.__doc__ = 'AgentOutput model with custom actions'
        return extended_model
class CustomAgentState(BaseModel):
    # Run state of the custom agent. Every field has a default, so the model
    # can be instantiated with no arguments. Documented with comments rather
    # than a docstring because pydantic can expose docstrings in the schema.
    agent_id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # fresh random UUID string per instance
    n_steps: int = 1  # counter that starts at 1
    consecutive_failures: int = 0  # counter that starts at 0
    last_result: Optional[List['ActionResult']] = None  # None until a result is stored
    history: AgentHistoryList = Field(default_factory=lambda: AgentHistoryList(history=[]))  # new empty history per instance
    last_plan: Optional[str] = None  # None until a plan is stored
    paused: bool = False  # presumably a pause flag honoured by the run loop — verify
    stopped: bool = False  # presumably a stop flag honoured by the run loop — verify
    message_manager_state: MessageManagerState = Field(default_factory=MessageManagerState)  # new default state per instance
    last_action: Optional[List['ActionModel']] = None  # None until an action list is stored
    extracted_content: str = ''  # starts empty

View File

@@ -0,0 +1,991 @@
import asyncio
import json
import logging
import os
import pdb
import uuid
from pathlib import Path
from typing import List, Dict, Any, TypedDict, Optional, Sequence, Annotated
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
# Langchain imports
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, ToolMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.tools import Tool, StructuredTool
from langchain.agents import AgentExecutor # We might use parts, but Langgraph is primary
from langchain_community.tools.file_management import WriteFileTool, ReadFileTool, CopyFileTool, ListDirectoryTool, \
MoveFileTool, FileSearchTool
from langchain_openai import ChatOpenAI # Replace with your actual LLM import
from pydantic import BaseModel, Field
import operator
from browser_use.browser.browser import BrowserConfig
from browser_use.browser.context import BrowserContextWindowSize
# Langgraph imports
from langgraph.graph import StateGraph, END
from src.controller.custom_controller import CustomController
from src.utils import llm_provider
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import CustomBrowserContext, CustomBrowserContextConfig
from src.agent.browser_use.browser_use_agent import BrowserUseAgent
from src.utils.mcp_client import setup_mcp_client_and_tools
logger = logging.getLogger(__name__)
# Constants
REPORT_FILENAME = "report.md"
PLAN_FILENAME = "research_plan.md"
SEARCH_INFO_FILENAME = "search_info.json"
_AGENT_STOP_FLAGS = {}
_BROWSER_AGENT_INSTANCES = {}
async def run_single_browser_task(
        task_query: str,
        task_id: str,
        llm: Any,  # Pass the main LLM
        browser_config: Dict[str, Any],
        stop_event: threading.Event,
        use_vision: bool = False,
) -> Dict[str, Any]:
    """
    Run a single BrowserUseAgent task in a browser created just for it.

    The browser and its context are created here and always closed in the
    ``finally`` clause. While it runs, the agent instance is registered in
    ``_BROWSER_AGENT_INSTANCES`` under a unique key and removed afterwards.

    Args:
        task_query: The research query; embedded in the agent's task prompt.
        task_id: Identifier of the owning research task (prefix of the registry key).
        llm: Language model handed to the BrowserUseAgent.
        browser_config: Browser options read with ``.get`` (``headless``,
            ``window_width``, ``window_height``, ``user_data_dir``,
            ``use_own_browser``, ``browser_binary_path``, ``wss_url``,
            ``cdp_url``, ``disable_security``).
        stop_event: Checked before the agent starts and after it finishes.
        use_vision: Forwarded to the BrowserUseAgent.

    Returns:
        A dict with ``query`` and either ``result`` plus ``status``
        (``"completed"``, ``"stopped"`` or ``"cancelled"``) or ``error`` plus
        ``status`` ``"failed"``. Exceptions are caught and reported as
        ``"failed"`` rather than raised.
    """
    if not BrowserUseAgent:
        return {"query": task_query, "error": "BrowserUseAgent components not available."}

    # --- Browser Setup ---
    # These should ideally come from the main agent's config
    headless = browser_config.get("headless", False)
    window_w = browser_config.get("window_width", 1280)
    window_h = browser_config.get("window_height", 1100)
    browser_user_data_dir = browser_config.get("user_data_dir", None)
    use_own_browser = browser_config.get("use_own_browser", False)
    browser_binary_path = browser_config.get("browser_binary_path", None)
    wss_url = browser_config.get("wss_url", None)
    cdp_url = browser_config.get("cdp_url", None)
    disable_security = browser_config.get("disable_security", False)

    bu_browser = None
    bu_browser_context = None
    # Bound before the try block: the finally clause reads it even when setup
    # fails before the agent is registered. Leaving it unbound would raise
    # UnboundLocalError there and replace the structured "failed" result.
    task_key = None

    try:
        logger.info(f"Starting browser task for query: {task_query}")
        extra_args = [f"--window-size={window_w},{window_h}"]
        if browser_user_data_dir:
            extra_args.append(f"--user-data-dir={browser_user_data_dir}")

        if use_own_browser:
            # Environment variables take precedence over the configured binary path.
            browser_binary_path = os.getenv("CHROME_PATH", None) or browser_binary_path
            if browser_binary_path == "":
                browser_binary_path = None
            chrome_user_data = os.getenv("CHROME_USER_DATA", None)
            if chrome_user_data:
                extra_args += [f"--user-data-dir={chrome_user_data}"]
        else:
            browser_binary_path = None

        bu_browser = CustomBrowser(
            config=BrowserConfig(
                headless=headless,
                disable_security=disable_security,
                browser_binary_path=browser_binary_path,
                extra_browser_args=extra_args,
                wss_url=wss_url,
                cdp_url=cdp_url,
            )
        )

        context_config = CustomBrowserContextConfig(
            save_downloads_path="./tmp/downloads",
            browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
            force_new_context=True
        )
        bu_browser_context = await bu_browser.new_context(config=context_config)

        # Simple controller example, replace with your actual implementation if needed
        bu_controller = CustomController()

        # Construct the task prompt for BrowserUseAgent
        # Instruct it to find specific info and return title/URL
        bu_task_prompt = f"""
Research Task: {task_query}
Objective: Find relevant information answering the query.
Output Requirements: For each relevant piece of information found, please provide:
1. A concise summary of the information.
2. The title of the source page or document.
3. The URL of the source.
Focus on accuracy and relevance. Avoid irrelevant details.
PDF cannot directly extract _content, please try to download first, then using read_file, if you can't save or read, please try other methods.
"""

        bu_agent_instance = BrowserUseAgent(
            task=bu_task_prompt,
            llm=llm,  # Use the passed LLM
            browser=bu_browser,
            browser_context=bu_browser_context,
            controller=bu_controller,
            use_vision=use_vision,
        )

        # Store instance for potential stop() call
        task_key = f"{task_id}_{uuid.uuid4()}"
        _BROWSER_AGENT_INSTANCES[task_key] = bu_agent_instance

        # --- Run with Stop Check ---
        # The stop event is only consulted before the run starts and after it
        # returns; interrupting a run in progress relies on the agent itself.
        if stop_event.is_set():
            logger.info(f"Browser task for '{task_query}' cancelled before start.")
            return {"query": task_query, "result": None, "status": "cancelled"}

        logger.info(f"Running BrowserUseAgent for: {task_query}")
        result = await bu_agent_instance.run()  # Assuming run is the main method
        logger.info(f"BrowserUseAgent finished for: {task_query}")

        final_data = result.final_result()

        if stop_event.is_set():
            logger.info(f"Browser task for '{task_query}' stopped during execution.")
            return {"query": task_query, "result": final_data, "status": "stopped"}
        else:
            logger.info(f"Browser result for '{task_query}': {final_data}")
            return {"query": task_query, "result": final_data, "status": "completed"}

    except Exception as e:
        logger.error(f"Error during browser task for query '{task_query}': {e}", exc_info=True)
        return {"query": task_query, "error": str(e), "status": "failed"}
    finally:
        # Best-effort cleanup: a failure to close one resource must not
        # prevent closing the other or mask the result being returned.
        if bu_browser_context:
            try:
                await bu_browser_context.close()
                bu_browser_context = None
                logger.info("Closed browser context.")
            except Exception as e:
                logger.error(f"Error closing browser context: {e}")
        if bu_browser:
            try:
                await bu_browser.close()
                bu_browser = None
                logger.info("Closed browser.")
            except Exception as e:
                logger.error(f"Error closing browser: {e}")

        if task_key is not None:
            _BROWSER_AGENT_INSTANCES.pop(task_key, None)
class BrowserSearchInput(BaseModel):
    # Argument schema for the "parallel_browser_search" tool: it is passed as
    # `args_schema` when the tool is built in create_browser_search_tool.
    # `queries` is required (no default); its description string is part of
    # the schema, so it is left unchanged here.
    queries: List[str] = Field(
        description=f"List of distinct search queries to find information relevant to the research task.")
async def _run_browser_search_tool(
        queries: List[str],
        task_id: str,  # Injected dependency
        llm: Any,  # Injected dependency
        browser_config: Dict[str, Any],
        stop_event: threading.Event,
        max_parallel_browsers: int = 1
) -> List[Dict[str, Any]]:
    """
    Run one browser task per query concurrently and collect the results.

    Args:
        queries: Search queries proposed by the LLM. Only the first
            ``max_parallel_browsers`` are used; the rest are dropped.
        task_id: Identifier of the owning research task (used in log lines and
            forwarded to each browser task).
        llm: Language model forwarded to each browser task.
        browser_config: Browser options forwarded to each browser task.
        stop_event: When set, queries that have not started yet are returned
            with status ``"cancelled"`` instead of being run.
        max_parallel_browsers: Query cap and semaphore size.

    Returns:
        One dict per used query, in query order. An exception raised by a task
        is converted into ``{"query", "error", "status": "failed"}``.
    """
    # Limit queries just in case LLM ignores the description
    queries = queries[:max_parallel_browsers]
    logger.info(f"[Browser Tool {task_id}] Running search for {len(queries)} queries: {queries}")

    semaphore = asyncio.Semaphore(max_parallel_browsers)

    async def task_wrapper(query):
        async with semaphore:
            if stop_event.is_set():
                logger.info(f"[Browser Tool {task_id}] Skipping task due to stop signal: {query}")
                return {"query": query, "result": None, "status": "cancelled"}
            # Pass necessary injected configs and the stop event
            return await run_single_browser_task(
                query,
                task_id,
                llm,  # Pass the main LLM (or a dedicated one if needed)
                browser_config,
                stop_event
                # use_vision could be added here if needed
            )

    # gather() preserves input order, so results line up with `queries`.
    search_results = await asyncio.gather(
        *(task_wrapper(query) for query in queries),
        return_exceptions=True,
    )

    processed_results = []
    for query, res in zip(queries, search_results):
        if isinstance(res, Exception):
            # We are not inside an `except` block here, so the exception object
            # must be passed explicitly for its traceback to be logged.
            logger.error(f"[Browser Tool {task_id}] Gather caught exception for query '{query}': {res}", exc_info=res)
            processed_results.append({"query": query, "error": str(res), "status": "failed"})
        elif isinstance(res, dict):
            processed_results.append(res)
        else:
            logger.error(f"[Browser Tool {task_id}] Unexpected result type for query '{query}': {type(res)}")
            processed_results.append({"query": query, "error": "Unexpected result type", "status": "failed"})

    logger.info(f"[Browser Tool {task_id}] Finished search. Results count: {len(processed_results)}")
    return processed_results
def create_browser_search_tool(
        llm: Any,
        browser_config: Dict[str, Any],
        task_id: str,
        stop_event: threading.Event,
        max_parallel_browsers: int = 1,
) -> StructuredTool:
    """Build the ``parallel_browser_search`` tool with its dependencies pre-bound.

    Only ``queries`` (described by ``BrowserSearchInput``) is left for the LLM
    to supply; every other argument of ``_run_browser_search_tool`` is fixed
    here through ``functools.partial``.
    """
    from functools import partial

    tool_description = f"""Use this tool to actively search the web for information related to a specific research task or question.
It runs up to {max_parallel_browsers} searches in parallel using a browser agent for better results than simple scraping.
Provide a list of distinct search queries(up to {max_parallel_browsers}) that are likely to yield relevant information."""

    # Everything except `queries` is injected, so the LLM never sees these.
    injected = {
        "task_id": task_id,
        "llm": llm,
        "browser_config": browser_config,
        "stop_event": stop_event,
        "max_parallel_browsers": max_parallel_browsers,
    }
    search_coroutine = partial(_run_browser_search_tool, **injected)

    return StructuredTool.from_function(
        coroutine=search_coroutine,
        name="parallel_browser_search",
        description=tool_description,
        args_schema=BrowserSearchInput,
    )
# --- Langgraph State Definition ---
class ResearchPlanItem(TypedDict):
    """One entry of the research plan, persisted as a markdown checklist line."""

    step: int  # step number of this entry; reported in log and error messages
    task: str  # text of the research task or question
    status: str  # "pending", "completed", "failed"
    queries: Optional[List[str]]  # Queries generated for this task
    result_summary: Optional[str]  # Optional brief summary after execution
class DeepResearchState(TypedDict):
    """State dictionary passed between the LangGraph nodes of the research workflow."""

    task_id: str  # key used to look up the stop event in _AGENT_STOP_FLAGS
    topic: str  # research topic; used by the planning and synthesis prompts
    research_plan: List[ResearchPlanItem]  # ordered plan steps
    search_results: List[Dict[str, Any]]  # Stores results from browser_search_tool_func
    # messages: Sequence[BaseMessage] # History for ReAct-like steps within nodes
    llm: Any  # The LLM instance
    tools: List[Tool]  # tools bound to the LLM in the execution node
    output_dir: Path  # directory receiving the plan, search-result and report files
    browser_config: Dict[str, Any]  # browser options — presumably those given to the browser tool; verify
    final_report: Optional[str]  # markdown report produced by the synthesis node
    current_step_index: int  # To track progress through the plan
    stop_requested: bool  # Flag to signal termination
    # Add other state variables as needed
    error_message: Optional[str]  # To store errors
    messages: List[BaseMessage]  # conversation history extended by the execution node
# --- Langgraph Nodes ---
def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]:
    """Rebuild state entries from the plan and search-result files in *output_dir*.

    Returns a dict of state updates. It may contain ``research_plan`` and
    ``current_step_index`` (from the markdown checklist), ``search_results``
    (from the JSON file) and ``error_message`` when a file could not be read
    or parsed. A missing file contributes nothing. ``task_id`` is not used.
    """
    state_updates = {}
    plan_file = os.path.join(output_dir, PLAN_FILENAME)
    search_file = os.path.join(output_dir, SEARCH_INFO_FILENAME)

    if os.path.exists(plan_file):
        try:
            restored_plan = []
            with open(plan_file, 'r', encoding='utf-8') as plan_handle:
                # Only checklist lines count; headings and blank lines are skipped.
                for raw_line in plan_handle:
                    entry = raw_line.strip()
                    if not entry.startswith(("- [x]", "- [ ]")):
                        continue
                    is_done = entry.startswith("- [x]")
                    restored_plan.append(
                        ResearchPlanItem(
                            step=len(restored_plan) + 1,
                            task=entry[5:].strip(),
                            status="completed" if is_done else "pending",
                            queries=None,
                            result_summary=None,
                        )
                    )
            state_updates['research_plan'] = restored_plan

            # Resume at the first pending entry, or past the end if none is pending.
            next_step = len(restored_plan)
            for position, plan_item in enumerate(restored_plan):
                if plan_item['status'] == 'pending':
                    next_step = position
                    break
            state_updates['current_step_index'] = next_step
            logger.info(f"Loaded research plan from {plan_file}, next step index: {next_step}")
        except Exception as e:
            logger.error(f"Failed to load or parse research plan {plan_file}: {e}")
            state_updates['error_message'] = f"Failed to load research plan: {e}"

    if os.path.exists(search_file):
        try:
            with open(search_file, 'r', encoding='utf-8') as results_handle:
                state_updates['search_results'] = json.load(results_handle)
            logger.info(f"Loaded search results from {search_file}")
        except Exception as e:
            logger.error(f"Failed to load search results {search_file}: {e}")
            state_updates['error_message'] = f"Failed to load search results: {e}"
            # Decide if this is fatal or if we can continue without old results

    return state_updates
def _save_plan_to_md(plan: List[ResearchPlanItem], output_dir: str):
    """Write *plan* to the plan file in *output_dir* as a markdown checklist.

    Completed items are written as ``- [x]``; every other status is written as
    ``- [ ]``. Failures are logged and not raised.
    """
    plan_file = os.path.join(output_dir, PLAN_FILENAME)
    try:
        with open(plan_file, 'w', encoding='utf-8') as plan_handle:
            plan_handle.write("# Research Plan\n\n")
            for entry in plan:
                if entry['status'] == 'completed':
                    checkbox = "- [x]"
                else:
                    checkbox = "- [ ]"
                plan_handle.write(f"{checkbox} {entry['task']}\n")
        logger.info(f"Research plan saved to {plan_file}")
    except Exception as e:
        logger.error(f"Failed to save research plan to {plan_file}: {e}")
def _save_search_results_to_json(results: List[Dict[str, Any]], output_dir: str):
    """Overwrite the search-result JSON file in *output_dir* with *results*.

    The file is rewritten in full on every call (no appending). Non-ASCII text
    is written as-is. Failures are logged and not raised.
    """
    search_file = os.path.join(output_dir, SEARCH_INFO_FILENAME)
    dump_options = {"indent": 2, "ensure_ascii": False}
    try:
        # Simple overwrite for now, could be append
        with open(search_file, 'w', encoding='utf-8') as results_handle:
            json.dump(results, results_handle, **dump_options)
        logger.info(f"Search results saved to {search_file}")
    except Exception as e:
        logger.error(f"Failed to save search results to {search_file}: {e}")
def _save_report_to_md(report: str, output_dir: Path):
    """Write the final *report* text to the report file in *output_dir*.

    Any existing report is overwritten. Failures are logged and not raised.
    """
    report_file = os.path.join(output_dir, REPORT_FILENAME)
    try:
        with open(report_file, 'w', encoding='utf-8') as report_handle:
            report_handle.write(report)
        logger.info(f"Final report saved to {report_file}")
    except Exception as e:
        logger.error(f"Failed to save final report to {report_file}: {e}")
async def planning_node(state: DeepResearchState) -> Dict[str, Any]:
    """Produce the research plan, or reuse a loaded plan when resuming.

    Behaviour:
        * If ``stop_requested`` is set, returns immediately.
        * If a plan already exists and ``current_step_index`` is above zero,
          the plan is re-saved and returned unchanged.
        * Otherwise the LLM is asked for a numbered plan, which is parsed into
          ``ResearchPlanItem`` entries, saved to disk and returned together
          with a reset step index and empty search results.

    Returns:
        A dict of state updates. On failure it contains only ``error_message``.
    """
    logger.info("--- Entering Planning Node ---")
    if state.get('stop_requested'):
        logger.info("Stop requested, skipping planning.")
        return {"stop_requested": True}

    llm = state['llm']
    topic = state['topic']
    existing_plan = state.get('research_plan')
    output_dir = state['output_dir']

    if existing_plan and state.get('current_step_index', 0) > 0:
        logger.info("Resuming with existing plan.")
        # Maybe add logic here to let LLM review and potentially adjust the plan
        # based on existing results, but for now, we just use the loaded plan.
        _save_plan_to_md(existing_plan, output_dir)  # Ensure it's saved initially
        return {"research_plan": existing_plan}  # Return the loaded plan

    logger.info(f"Generating new research plan for topic: {topic}")

    # The topic is supplied as a template variable rather than interpolated
    # with an f-string: text interpolated up front is re-parsed as template
    # syntax, so a topic containing "{" or "}" would break formatting.
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a meticulous research assistant. Your goal is to create a step-by-step research plan to thoroughly investigate a given topic.
The plan should consist of clear, actionable research tasks or questions. Each step should logically build towards a comprehensive understanding.
Format the output as a numbered list. Each item should represent a distinct research step or question.
Example:
1. Define the core concepts and terminology related to [Topic].
2. Identify the key historical developments of [Topic].
3. Analyze the current state-of-the-art and recent advancements in [Topic].
4. Investigate the major challenges and limitations associated with [Topic].
5. Explore the future trends and potential applications of [Topic].
6. Summarize the findings and draw conclusions.
Keep the plan focused and manageable. Aim for 5-10 detailed steps.
"""),
        ("human", "Generate a research plan for the topic: {topic}")
    ])

    try:
        response = await llm.ainvoke(prompt.format_prompt(topic=topic).to_messages())
        plan_text = response.content

        # Parse the numbered list into the plan structure
        new_plan: List[ResearchPlanItem] = []
        for line in plan_text.strip().split('\n'):
            line = line.strip()
            if not line or not (line[0].isdigit() or line.startswith(("*", "-"))):
                continue  # preamble, blank line or other non-list text
            # Simple parsing: remove number/bullet and space
            task_text = line.split('.', 1)[-1].strip() if line[0].isdigit() else line[1:].strip()
            if not task_text:
                continue
            new_plan.append(ResearchPlanItem(
                # Number by accepted items, not by raw line index, so skipped
                # lines do not leave gaps in the step numbers.
                step=len(new_plan) + 1,
                task=task_text,
                status="pending",
                queries=None,
                result_summary=None
            ))

        if not new_plan:
            logger.error("LLM failed to generate a valid plan structure.")
            return {"error_message": "Failed to generate research plan structure."}

        logger.info(f"Generated research plan with {len(new_plan)} steps.")
        _save_plan_to_md(new_plan, output_dir)

        return {
            "research_plan": new_plan,
            "current_step_index": 0,  # Start from the beginning
            "search_results": [],  # Initialize search results
        }

    except Exception as e:
        logger.error(f"Error during planning: {e}", exc_info=True)
        return {"error_message": f"LLM Error during planning: {e}"}
async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]:
    """
    Execute the current plan step by letting the LLM call the bound tools.

    The LLM is invoked once with the step's task. Each tool call it requests is
    executed in turn; output of the ``parallel_browser_search`` tool is added to
    the accumulated search results, other tool output is only logged. The step
    is marked ``completed`` only when the browser tool was requested and no tool
    call failed; otherwise it is marked ``failed``. The plan and the search
    results are written to disk before returning.

    Returns:
        A dict of state updates. It normally advances ``current_step_index`` by
        one. When a stop is requested the index is left unchanged and
        ``stop_requested`` is set.
    """
    logger.info("--- Entering Research Execution Node ---")
    if state.get('stop_requested'):
        logger.info("Stop requested, skipping research execution.")
        return {"stop_requested": True, "current_step_index": state['current_step_index']}  # Keep index same

    plan = state['research_plan']
    current_index = state['current_step_index']
    llm = state['llm']
    tools = state['tools']  # Tools are now passed in state
    output_dir = str(state['output_dir'])
    task_id = state['task_id']
    # Stop event is bound inside the tool function, no need to pass directly here

    if not plan or current_index >= len(plan):
        logger.info("Research plan complete or empty.")
        # This condition should ideally be caught by `should_continue` before reaching here
        return {}

    current_step = plan[current_index]
    if current_step['status'] == 'completed':
        logger.info(f"Step {current_step['step']} already completed, skipping.")
        return {"current_step_index": current_index + 1}  # Move to next step

    logger.info(f"Executing research step {current_step['step']}: {current_step['task']}")

    # Bind tools to the LLM for this call
    llm_with_tools = llm.bind_tools(tools)

    # Tolerate a missing or None history instead of raising KeyError.
    previous_messages = state.get('messages') or []
    task_message = HumanMessage(
        content=f"Research Task (Step {current_step['step']}): {current_step['task']}")
    if previous_messages:
        current_task_message = [task_message]
    else:
        # The first call of the conversation also carries the system instruction.
        current_task_message = [
            SystemMessage(
                content="You are a research assistant executing one step of a research plan. Use the available tools, especially the 'parallel_browser_search' tool, to gather information needed for the current task. Be precise with your search queries if using the browser tool."),
            task_message,
        ]
    invocation_messages = previous_messages + current_task_message

    try:
        # Invoke the LLM, expecting it to make a tool call
        logger.info(f"Invoking LLM with tools for task: {current_step['task']}")
        ai_response: BaseMessage = await llm_with_tools.ainvoke(invocation_messages)
        logger.info("LLM invocation complete.")

        tool_results = []
        executed_tool_names = []
        error_messages = []  # one entry per failed tool call

        # Bound before the loop so it exists even if every tool call is
        # rejected. The state's own list is reused when there is one.
        current_search_results = state.get('search_results')
        if current_search_results is None:
            current_search_results = []

        if not isinstance(ai_response, AIMessage) or not ai_response.tool_calls:
            # LLM didn't call a tool. Maybe it answered directly? Or failed?
            logger.warning(
                f"LLM did not call any tool for step {current_step['step']}. Response: {str(ai_response.content)[:100]}...")
            # Mark as failed, assuming a tool was expected.
            current_step['status'] = 'failed'
            current_step['result_summary'] = "LLM did not use a tool as expected."
            _save_plan_to_md(plan, output_dir)
            return {
                "research_plan": plan,
                "current_step_index": current_index + 1,
                "error_message": f"LLM failed to call a tool for step {current_step['step']}."
            }

        # Process tool calls
        for tool_call in ai_response.tool_calls:
            tool_name = tool_call.get("name")
            tool_args = tool_call.get("args", {})
            tool_call_id = tool_call.get("id")  # Important for ToolMessage

            logger.info(f"LLM requested tool call: {tool_name} with args: {tool_args}")
            executed_tool_names.append(tool_name)

            # Find the corresponding tool instance
            selected_tool = next((t for t in tools if t.name == tool_name), None)

            if not selected_tool:
                logger.error(f"LLM called tool '{tool_name}' which is not available.")
                not_found = f"Error: Tool '{tool_name}' not found."
                error_messages.append(not_found)
                # Create a ToolMessage indicating the error
                tool_results.append(ToolMessage(
                    content=not_found,
                    tool_call_id=tool_call_id
                ))
                continue  # Skip to next tool call if any

            # Execute the tool
            try:
                # Stop check before executing the tool (tool itself also checks)
                stop_event = _AGENT_STOP_FLAGS.get(task_id)
                if stop_event and stop_event.is_set():
                    logger.info(f"Stop requested before executing tool: {tool_name}")
                    current_step['status'] = 'pending'  # Not completed due to stop
                    _save_plan_to_md(plan, output_dir)
                    return {"stop_requested": True, "research_plan": plan}

                logger.info(f"Executing tool: {tool_name}")
                tool_output = await selected_tool.ainvoke(tool_args)
                logger.info(f"Tool '{tool_name}' executed successfully.")

                # Decide per call, by this call's own name. Checking the
                # cumulative list would treat every tool that runs after a
                # browser search as a browser search too.
                if tool_name == "parallel_browser_search":
                    current_search_results.extend(tool_output)
                else:  # Handle other tool outputs (e.g., file tools return strings)
                    # Just log it for now. Need better handling for diverse tool outputs.
                    logger.info(f"Result from tool '{tool_name}': {str(tool_output)[:200]}...")

                # Store result for potential next LLM call. `default=str` keeps
                # a successful call from being reported as failed merely
                # because its output is not JSON-serialisable.
                tool_results.append(ToolMessage(
                    content=json.dumps(tool_output, default=str),
                    tool_call_id=tool_call_id
                ))

            except Exception as e:
                logger.error(f"Error executing tool '{tool_name}': {e}", exc_info=True)
                failure_text = f"Error executing tool {tool_name}: {e}"
                error_messages.append(failure_text)
                tool_results.append(ToolMessage(
                    content=failure_text,
                    tool_call_id=tool_call_id
                ))
                # Record the failure alongside the search results as well.
                current_search_results.append(
                    {"tool_name": tool_name, "args": tool_args, "status": "failed", "error": str(e)})

        # Basic check: Did the browser tool run at all? (More specific checks needed)
        browser_tool_called = "parallel_browser_search" in executed_tool_names
        # Failures are tracked explicitly instead of searching message text for
        # "Error:", which missed execution errors and could match result text.
        step_failed = bool(error_messages) or not browser_tool_called

        if step_failed:
            logger.warning(f"Step {current_step['step']} failed or did not yield results via browser search.")
            current_step['status'] = 'failed'
            current_step[
                'result_summary'] = f"Tool execution failed or browser tool not used. Errors: {error_messages}"
        else:
            logger.info(f"Step {current_step['step']} completed using tool(s): {executed_tool_names}.")
            current_step['status'] = 'completed'
            current_step['result_summary'] = f"Executed tool(s): {', '.join(executed_tool_names)}."

        _save_plan_to_md(plan, output_dir)
        _save_search_results_to_json(current_search_results, output_dir)

        return {
            "research_plan": plan,
            "search_results": current_search_results,  # Update with new results
            "current_step_index": current_index + 1,
            "messages": invocation_messages + [ai_response] + tool_results,
        }

    except Exception as e:
        logger.error(f"Unhandled error during research execution node for step {current_step['step']}: {e}",
                     exc_info=True)
        current_step['status'] = 'failed'
        _save_plan_to_md(plan, output_dir)
        return {
            "research_plan": plan,
            "current_step_index": current_index + 1,  # Move on even if error?
            "error_message": f"Core Execution Error on step {current_step['step']}: {e}"
        }
async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]:
    """Ask the LLM to write the final markdown report from the collected results.

    Completed results contribute their content and failed ones their error;
    entries with any other status are left out. The plan is included as a
    checklist for context. The report is saved to disk and returned as
    ``final_report``. When there are no search results a fixed placeholder
    report is saved and returned without calling the LLM.

    Returns:
        ``{"final_report": ...}`` on success, ``{"stop_requested": True}`` when
        a stop was requested, or ``{"error_message": ...}`` if the LLM fails.
    """
    logger.info("--- Entering Synthesis Node ---")
    if state.get('stop_requested'):
        logger.info("Stop requested, skipping synthesis.")
        return {"stop_requested": True}

    llm = state['llm']
    topic = state['topic']
    search_results = state.get('search_results', [])
    output_dir = state['output_dir']
    plan = state['research_plan']  # Include plan for context

    if not search_results:
        logger.warning("No search results found to synthesize report.")
        report = f"# Research Report: {topic}\n\nNo information was gathered during the research process."
        _save_report_to_md(report, output_dir)
        return {"final_report": report}

    logger.info(f"Synthesizing report from {len(search_results)} collected search result entries.")

    # Prepare context for the LLM
    # NOTE(review): nothing fills `references` yet, so the reference section
    # below is never appended even though the prompt asks for [X] citations.
    references = {}
    finding_parts = []
    for result_entry in search_results:
        query = result_entry.get('query', 'Unknown Query')
        status = result_entry.get('status', 'unknown')
        result_data = result_entry.get('result')
        error = result_entry.get('error')

        if status == 'completed' and result_data:
            finding_parts.append(f"### Finding from Query: \"{query}\"\n")
            finding_parts.append(f"- **Summary:**\n{result_data}\n")
            finding_parts.append("---\n")
        elif status == 'failed':
            finding_parts.append(f"### Failed Query: \"{query}\"\n")
            finding_parts.append(f"- **Error:** {error}\n")
            finding_parts.append("---\n")
        # Ignore cancelled/other statuses for the report content
    formatted_results = "".join(finding_parts)

    # Prepare the research plan context
    plan_lines = ["\nResearch Plan Followed:\n"]
    for item in plan:
        if item['status'] == 'completed':
            marker = "- [x]"
        elif item['status'] == 'failed':
            marker = "- [ ] (Failed)"
        else:
            marker = "- [ ]"
        plan_lines.append(f"{marker} {item['task']}\n")
    plan_summary = "".join(plan_lines)

    # The human message is a real template, not an f-string: findings scraped
    # from the web routinely contain "{" and "}", and interpolating them up
    # front made the template parser treat them as variables and fail. The
    # stray third code fence from the earlier prompt is dropped so the final
    # instruction is no longer swallowed by an unclosed code block.
    synthesis_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a professional researcher tasked with writing a comprehensive and well-structured report based on collected findings.
The report should address the research topic thoroughly, synthesizing the information gathered from various sources.
Structure the report logically:
1. **Introduction:** Briefly introduce the topic and the report's scope (mentioning the research plan followed is good).
2. **Main Body:** Discuss the key findings, organizing them thematically or according to the research plan steps. Analyze, compare, and contrast information from different sources where applicable. **Crucially, cite your sources using bracketed numbers [X] corresponding to the reference list.**
3. **Conclusion:** Summarize the main points and offer concluding thoughts or potential areas for further research.
Ensure the tone is objective, professional, and analytical. Base the report **strictly** on the provided findings. Do not add external knowledge. If findings are contradictory or incomplete, acknowledge this.
"""),
        ("human", """
**Research Topic:** {topic}
{plan_summary}
**Collected Findings:**
```
{formatted_results}
```
Please generate the final research report in Markdown format based **only** on the information above. Ensure all claims derived from the findings are properly cited using the format [Reference_ID].
""")
    ])

    try:
        response = await llm.ainvoke(synthesis_prompt.format_prompt(
            topic=topic,
            plan_summary=plan_summary,
            formatted_results=formatted_results,
        ).to_messages())
        final_report_md = response.content

        # Append the reference list automatically to the end of the generated markdown
        if references:
            # Sort refs by ID for consistent output
            sorted_refs = sorted(references.values(), key=lambda x: x['id'])
            reference_lines = [f"[{ref['id']}] {ref['title']} - {ref['url']}\n" for ref in sorted_refs]
            final_report_md += "\n\n## References\n\n" + "".join(reference_lines)

        logger.info("Successfully synthesized the final report.")
        _save_report_to_md(final_report_md, output_dir)
        return {"final_report": final_report_md}

    except Exception as e:
        logger.error(f"Error during report synthesis: {e}", exc_info=True)
        return {"error_message": f"LLM Error during synthesis: {e}"}
# --- Langgraph Edges and Conditional Logic ---
def should_continue(state: DeepResearchState) -> str:
    """Pick the next graph node after a research execution step.

    Returns ``"end_run"`` when a stop was requested, an error is recorded or no
    plan exists; ``"execute_research"`` while the step index is still inside
    the plan; ``"synthesize_report"`` once every step has been processed.
    """
    logger.info("--- Evaluating Condition: Should Continue? ---")

    # Terminal conditions first: stop request, then recorded error.
    if state.get('stop_requested'):
        logger.info("Stop requested, routing to END.")
        return "end_run"  # Go to a dedicated end node for cleanup if needed

    if state.get('error_message'):
        logger.warning(f"Error detected: {state['error_message']}. Routing to END.")
        # Decide if errors should halt execution or if it should try to synthesize anyway
        return "end_run"  # Stop on error for now

    plan = state.get('research_plan')
    current_index = state.get('current_step_index', 0)

    if not plan:
        logger.warning("No research plan found, cannot continue execution. Routing to END.")
        return "end_run"  # Should not happen if planning node ran correctly

    # Past the last step: nothing left to execute, so write the report.
    if current_index >= len(plan):
        logger.info("All plan steps processed. Routing to Synthesis.")
        return "synthesize_report"

    logger.info(
        f"Plan has pending steps (current index {current_index}/{len(plan)}). Routing to Research Execution.")
    return "execute_research"
# --- DeepSearchAgent Class ---
class DeepResearchAgent:
def __init__(self, llm: Any, browser_config: Dict[str, Any], mcp_server_config: Optional[Dict[str, Any]] = None):
    """
    Create the agent and compile its workflow graph.

    Args:
        llm: The Langchain compatible language model instance.
        browser_config: Configuration dictionary for the BrowserUseAgent tool.
                        Example: {"headless": True, "window_width": 1280, ...}
        mcp_server_config: Optional configuration for the MCP client.
    """
    # Collaborators supplied by the caller.
    self.llm = llm
    self.browser_config = browser_config
    self.mcp_server_config = mcp_server_config

    # The MCP client is created lazily, the first time tools are set up.
    self.mcp_client = None
    self.stopped = False

    # Per-run bookkeeping; nothing is running yet.
    self.current_task_id: Optional[str] = None
    self.stop_event: Optional[threading.Event] = None
    self.runner: Optional[asyncio.Task] = None  # To hold the asyncio task for run

    self.graph = self._compile_graph()
async def _setup_tools(self, task_id: str, stop_event: threading.Event, max_parallel_browsers: int = 1) -> List[
Tool]:
"""Sets up the basic tools (File I/O) and optional MCP tools."""
tools = [WriteFileTool(), ReadFileTool(), ListDirectoryTool()] # Basic file operations
browser_use_tool = create_browser_search_tool(
llm=self.llm,
browser_config=self.browser_config,
task_id=task_id,
stop_event=stop_event,
max_parallel_browsers=max_parallel_browsers
)
tools += [browser_use_tool]
# Add MCP tools if config is provided
if self.mcp_server_config:
try:
logger.info("Setting up MCP client and tools...")
if not self.mcp_client:
self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config)
mcp_tools = self.mcp_client.get_tools()
logger.info(f"Loaded {len(mcp_tools)} MCP tools.")
tools.extend(mcp_tools)
except Exception as e:
logger.error(f"Failed to set up MCP tools: {e}", exc_info=True)
elif self.mcp_server_config:
logger.warning("MCP server config provided, but setup function unavailable.")
tools_map = {tool.name: tool for tool in tools}
return tools_map.values()
async def close_mcp_client(self):
if self.mcp_client:
await self.mcp_client.__aexit__(None, None, None)
self.mcp_client = None
def _compile_graph(self) -> StateGraph:
"""Compiles the Langgraph state machine."""
workflow = StateGraph(DeepResearchState)
# Add nodes
workflow.add_node("plan_research", planning_node)
workflow.add_node("execute_research", research_execution_node)
workflow.add_node("synthesize_report", synthesis_node)
workflow.add_node("end_run", lambda state: logger.info("--- Reached End Run Node ---") or {}) # Simple end node
# Define edges
workflow.set_entry_point("plan_research")
workflow.add_edge("plan_research", "execute_research") # Always execute after planning
# Conditional edge after execution
workflow.add_conditional_edges(
"execute_research",
should_continue,
{
"execute_research": "execute_research", # Loop back if more steps
"synthesize_report": "synthesize_report", # Move to synthesis if done
"end_run": "end_run" # End if stop requested or error
}
)
workflow.add_edge("synthesize_report", "end_run") # End after synthesis
app = workflow.compile()
return app
async def run(self, topic: str, task_id: Optional[str] = None, save_dir: str = "./tmp/deep_research",
max_parallel_browsers: int = 1) -> Dict[
str, Any]:
"""
Starts the deep research process (Async Generator Version).
Args:
topic: The research topic.
task_id: Optional existing task ID to resume. If None, a new ID is generated.
Yields:
Intermediate state updates or messages during execution.
"""
if self.runner and not self.runner.done():
logger.warning("Agent is already running. Please stop the current task first.")
# Return an error status instead of yielding
return {"status": "error", "message": "Agent already running.", "task_id": self.current_task_id}
self.current_task_id = task_id if task_id else str(uuid.uuid4())
output_dir = os.path.join(save_dir, self.current_task_id)
os.makedirs(output_dir, exist_ok=True)
logger.info(f"[AsyncGen] Starting research task ID: {self.current_task_id} for topic: '{topic}'")
logger.info(f"[AsyncGen] Output directory: {output_dir}")
self.stop_event = threading.Event()
_AGENT_STOP_FLAGS[self.current_task_id] = self.stop_event
agent_tools = await self._setup_tools(self.current_task_id, self.stop_event, max_parallel_browsers)
initial_state: DeepResearchState = {
"task_id": self.current_task_id,
"topic": topic,
"research_plan": [],
"search_results": [],
"messages": [],
"llm": self.llm,
"tools": agent_tools,
"output_dir": output_dir,
"browser_config": self.browser_config,
"final_report": None,
"current_step_index": 0,
"stop_requested": False,
"error_message": None,
}
loaded_state = {}
if task_id:
logger.info(f"Attempting to resume task {task_id}...")
loaded_state = _load_previous_state(task_id, output_dir)
initial_state.update(loaded_state)
if loaded_state.get("research_plan"):
logger.info(
f"Resuming with {len(loaded_state['research_plan'])} plan steps and {len(loaded_state.get('search_results', []))} existing results.")
initial_state[
"topic"] = topic # Allow overriding topic even when resuming? Or use stored topic? Let's use new one.
else:
logger.warning(f"Resume requested for {task_id}, but no previous plan found. Starting fresh.")
initial_state["current_step_index"] = 0
# --- Execute Graph using ainvoke ---
final_state = None
status = "unknown"
message = None
try:
logger.info(f"Invoking graph execution for task {self.current_task_id}...")
self.runner = asyncio.create_task(self.graph.ainvoke(initial_state))
final_state = await self.runner
logger.info(f"Graph execution finished for task {self.current_task_id}.")
# Determine status based on final state
if self.stop_event and self.stop_event.is_set():
status = "stopped"
message = "Research process was stopped by request."
logger.info(message)
elif final_state and final_state.get("error_message"):
status = "error"
message = final_state["error_message"]
logger.error(f"Graph execution completed with error: {message}")
elif final_state and final_state.get("final_report"):
status = "completed"
message = "Research process completed successfully."
logger.info(message)
else:
# If it ends without error/report (e.g., empty plan, stopped before synthesis)
status = "finished_incomplete"
message = "Research process finished, but may be incomplete (no final report generated)."
logger.warning(message)
except asyncio.CancelledError:
status = "cancelled"
message = f"Agent run task cancelled for {self.current_task_id}."
logger.info(message)
# final_state will remain None or the state before cancellation if checkpointing was used
except Exception as e:
status = "error"
message = f"Unhandled error during graph execution for {self.current_task_id}: {e}"
logger.error(message, exc_info=True)
# final_state will remain None or the state before the error
finally:
logger.info(f"Cleaning up resources for task {self.current_task_id}")
task_id_to_clean = self.current_task_id
self.stop_event = None
self.current_task_id = None
self.runner = None # Mark runner as finished
if self.mcp_client:
await self.mcp_client.__aexit__(None, None, None)
# Return a result dictionary including the status and the final state if available
return {
"status": status,
"message": message,
"task_id": task_id_to_clean, # Use the stored task_id
"final_state": final_state if final_state else {} # Return the final state dict
}
async def _stop_lingering_browsers(self, task_id):
"""Attempts to stop any BrowserUseAgent instances associated with the task_id."""
keys_to_stop = [key for key in _BROWSER_AGENT_INSTANCES if key.startswith(f"{task_id}_")]
if not keys_to_stop:
return
logger.warning(
f"Found {len(keys_to_stop)} potentially lingering browser agents for task {task_id}. Attempting stop...")
for key in keys_to_stop:
agent_instance = _BROWSER_AGENT_INSTANCES.get(key)
try:
if agent_instance:
# Assuming BU agent has an async stop method
await agent_instance.stop()
logger.info(f"Called stop() on browser agent instance {key}")
except Exception as e:
logger.error(f"Error calling stop() on browser agent instance {key}: {e}")
async def stop(self):
"""Signals the currently running agent task to stop."""
if not self.current_task_id or not self.stop_event:
logger.info("No agent task is currently running.")
return
logger.info(f"Stop requested for task ID: {self.current_task_id}")
self.stop_event.set() # Signal the stop event
self.stopped = True
await self._stop_lingering_browsers(self.current_task_id)
def close(self):
self.stopped = False

View File

@@ -9,20 +9,92 @@ from playwright.async_api import (
Playwright,
async_playwright,
)
from browser_use.browser.browser import Browser
from browser_use.browser.browser import Browser, IN_DOCKER
from browser_use.browser.context import BrowserContext, BrowserContextConfig
from playwright.async_api import BrowserContext as PlaywrightBrowserContext
import logging
from .custom_context import CustomBrowserContext
from browser_use.browser.chrome import (
CHROME_ARGS,
CHROME_DETERMINISTIC_RENDERING_ARGS,
CHROME_DISABLE_SECURITY_ARGS,
CHROME_DOCKER_ARGS,
CHROME_HEADLESS_ARGS,
)
from browser_use.browser.context import BrowserContext, BrowserContextConfig
from browser_use.browser.utils.screen_resolution import get_screen_resolution, get_window_adjustments
from browser_use.utils import time_execution_async
import socket
from .custom_context import CustomBrowserContext, CustomBrowserContextConfig
logger = logging.getLogger(__name__)
class CustomBrowser(Browser):
    """Browser that creates CustomBrowserContext instances and launches the builtin browser itself."""

    async def new_context(self, config: CustomBrowserContextConfig | None = None) -> CustomBrowserContext:
        """Create a browser context.

        The browser-level configuration is used as the base and any values in
        `config` override it.
        """
        browser_config = self.config.model_dump() if self.config else {}
        context_config = config.model_dump() if config else {}
        merged_config = {**browser_config, **context_config}
        return CustomBrowserContext(config=CustomBrowserContextConfig(**merged_config), browser=self)

    async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrowser:
        """Sets up and returns a Playwright Browser instance with anti-detection measures."""
        assert self.config.browser_binary_path is None, 'browser_binary_path should be None if trying to use the builtin browsers'

        if self.config.headless:
            screen_size = {'width': 1920, 'height': 1080}
            offset_x, offset_y = 0, 0
        else:
            screen_size = get_screen_resolution()
            offset_x, offset_y = get_window_adjustments()

        # A set, so repeated flags collapse into one.
        chrome_args = {
            *CHROME_ARGS,
            *(CHROME_DOCKER_ARGS if IN_DOCKER else []),
            *(CHROME_HEADLESS_ARGS if self.config.headless else []),
            *(CHROME_DISABLE_SECURITY_ARGS if self.config.disable_security else []),
            *(CHROME_DETERMINISTIC_RENDERING_ARGS if self.config.deterministic_rendering else []),
            f'--window-position={offset_x},{offset_y}',
            *self.config.extra_browser_args,
        }

        # Only add a default window size when the caller did not supply one.
        if not any('--window-size' in arg for arg in self.config.extra_browser_args):
            chrome_args.add(f'--window-size={screen_size["width"]},{screen_size["height"]}')

        # check if port 9222 is already taken, if so remove the remote-debugging-port arg to prevent conflicts
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            if s.connect_ex(('localhost', 9222)) == 0:
                # discard() rather than remove(): the flag may not be in the
                # set, and its absence must not abort the launch with KeyError.
                chrome_args.discard('--remote-debugging-port=9222')

        browser_class = getattr(playwright, self.config.browser_class)
        args = {
            'chromium': list(chrome_args),
            'firefox': [
                *{
                    '-no-remote',
                    *self.config.extra_browser_args,
                }
            ],
            'webkit': [
                *{
                    '--no-startup-window',
                    *self.config.extra_browser_args,
                }
            ],
        }

        browser = await browser_class.launch(
            headless=self.config.headless,
            args=args[self.config.browser_class],
            proxy=self.config.proxy.model_dump() if self.config.proxy else None,
            handle_sigterm=False,
            handle_sigint=False,
        )
        return browser

View File

@@ -2,18 +2,115 @@ import json
import logging
import os
from browser_use.browser.browser import Browser
from browser_use.browser.browser import Browser, IN_DOCKER
from browser_use.browser.context import BrowserContext, BrowserContextConfig
from playwright.async_api import Browser as PlaywrightBrowser
from playwright.async_api import BrowserContext as PlaywrightBrowserContext
from typing import Optional
from browser_use.browser.context import BrowserContextState
logger = logging.getLogger(__name__)
class CustomBrowserContextConfig(BrowserContextConfig):
    """BrowserContextConfig with one extra switch controlling context reuse."""
    # When False, CustomBrowserContext._create_context may reuse the first
    # existing Playwright context (CDP / own-binary browsers); when True a new
    # context is always created.
    force_new_context: bool = False  # force to create new context
class CustomBrowserContext(BrowserContext):
    """BrowserContext that can reuse an existing Playwright context or create a new one."""

    def __init__(
            self,
            browser: 'Browser',
            config: BrowserContextConfig | None = None,
            state: Optional[BrowserContextState] = None,
    ):
        """Forward browser, config and state to the base BrowserContext."""
        super().__init__(browser=browser, config=config, state=state)

    async def _create_context(self, browser: PlaywrightBrowser):
        """Creates a new browser context with anti-detection measures and loads cookies if available.

        The first existing context of `browser` is reused when the browser is
        attached via `cdp_url` or `browser_binary_path`, at least one context
        exists, and `force_new_context` is not set on the config.
        """
        # The constructor also accepts a plain BrowserContextConfig, which has
        # no `force_new_context` field; treat that as "not forced".
        force_new_context = getattr(self.config, 'force_new_context', False)
        uses_existing_browser = self.browser.config.cdp_url or self.browser.config.browser_binary_path

        if not force_new_context and uses_existing_browser and len(browser.contexts) > 0:
            # Connect to existing Chrome instance instead of creating new one
            context = browser.contexts[0]
        else:
            # Original code for creating new context
            context = await browser.new_context(
                no_viewport=True,
                user_agent=self.config.user_agent,
                java_script_enabled=True,
                bypass_csp=self.config.disable_security,
                ignore_https_errors=self.config.disable_security,
                record_video_dir=self.config.save_recording_path,
                record_video_size=self.config.browser_window_size.model_dump(),
                record_har_path=self.config.save_har_path,
                locale=self.config.locale,
                http_credentials=self.config.http_credentials,
                is_mobile=self.config.is_mobile,
                has_touch=self.config.has_touch,
                geolocation=self.config.geolocation,
                permissions=self.config.permissions,
                timezone_id=self.config.timezone_id,
            )

        if self.config.trace_path:
            await context.tracing.start(screenshots=True, snapshots=True, sources=True)

        # Load cookies if they exist
        if self.config.cookies_file and os.path.exists(self.config.cookies_file):
            with open(self.config.cookies_file, 'r') as f:
                try:
                    cookies = json.load(f)
                except json.JSONDecodeError as e:
                    logger.error(f'Failed to parse cookies file: {str(e)}')
                else:
                    valid_same_site_values = ['Strict', 'Lax', 'None']
                    for cookie in cookies:
                        if 'sameSite' in cookie:
                            if cookie['sameSite'] not in valid_same_site_values:
                                logger.warning(
                                    f"Fixed invalid sameSite value '{cookie['sameSite']}' to 'None' for cookie {cookie.get('name')}"
                                )
                                cookie['sameSite'] = 'None'
                    logger.info(f'🍪 Loaded {len(cookies)} cookies from {self.config.cookies_file}')
                    await context.add_cookies(cookies)

        # Expose anti-detection scripts
        await context.add_init_script(
            """
// Webdriver property
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
});
// Languages
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US']
});
// Plugins
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5]
});
// Chrome runtime
window.chrome = { runtime: {} };
// Permissions
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.query = (parameters) => (
parameters.name === 'notifications' ?
Promise.resolve({ state: Notification.permission }) :
originalQuery(parameters)
);
(function () {
const originalAttachShadow = Element.prototype.attachShadow;
Element.prototype.attachShadow = function attachShadow(options) {
return originalAttachShadow.call(this, { ...options, mode: "open" });
};
})();
"""
        )
        return context

View File

@@ -1,11 +1,12 @@
import pdb
import pyperclip
from typing import Optional, Type
from typing import Optional, Type, Callable, Dict, Any, Union, Awaitable, TypeVar
from pydantic import BaseModel
from browser_use.agent.views import ActionResult
from browser_use.browser.context import BrowserContext
from browser_use.controller.service import Controller, DoneAction
from browser_use.controller.registry.service import Registry, RegisteredAction
from main_content_extractor import MainContentExtractor
from browser_use.controller.views import (
ClickElementAction,
@@ -20,30 +21,158 @@ from browser_use.controller.views import (
SwitchTabAction,
)
import logging
import inspect
import asyncio
import os
from langchain_core.language_models.chat_models import BaseChatModel
from browser_use.agent.views import ActionModel, ActionResult
from src.utils.mcp_client import create_tool_param_model, setup_mcp_client_and_tools
from browser_use.utils import time_execution_sync
logger = logging.getLogger(__name__)
Context = TypeVar('Context')
class CustomController(Controller):
    """Controller with clipboard, human-assistance, file-upload and MCP tool actions."""

    def __init__(self, exclude_actions: list[str] = [],
                 output_model: Optional[Type[BaseModel]] = None,
                 ask_assistant_callback: Optional[Union[Callable[[str, BrowserContext], Dict[str, Any]], Callable[
                     [str, BrowserContext], Awaitable[Dict[str, Any]]]]] = None,
                 ):
        """
        Args:
            exclude_actions: Action names forwarded to the base Controller.
            output_model: Optional output model forwarded to the base Controller.
            ask_assistant_callback: Sync or async callable invoked with
                (query, browser context) when the agent asks for human help;
                its result must be a mapping with a 'response' key.
        """
        # Hand the base class a copy so the shared default list is never exposed.
        super().__init__(exclude_actions=list(exclude_actions), output_model=output_model)
        self._register_custom_actions()
        self.ask_assistant_callback = ask_assistant_callback
        self.mcp_client = None
        self.mcp_server_config = None

    def _register_custom_actions(self):
        """Register all custom browser actions"""

        @self.registry.action("Copy text to clipboard")
        def copy_to_clipboard(text: str):
            pyperclip.copy(text)
            return ActionResult(extracted_content=text)

        @self.registry.action(
            "When executing tasks, prioritize autonomous completion. However, if you encounter a definitive blocker "
            "that prevents you from proceeding independently such as needing credentials you don't possess, "
            "requiring subjective human judgment, needing a physical action performed, encountering complex CAPTCHAs, "
            "or facing limitations in your capabilities you must request human assistance."
        )
        async def ask_for_assistant(query: str, browser: BrowserContext):
            if not self.ask_assistant_callback:
                return ActionResult(extracted_content="Human cannot help you. Please try another way.",
                                    include_in_memory=True)

            # Await whatever awaitable the callback produces; this also covers
            # partials and callable objects that iscoroutinefunction() misses.
            user_response = self.ask_assistant_callback(query, browser)
            if inspect.isawaitable(user_response):
                user_response = await user_response
            msg = f"AI ask: {query}. User response: {user_response['response']}"
            logger.info(msg)
            return ActionResult(extracted_content=msg, include_in_memory=True)

        @self.registry.action("Paste text from clipboard")
        async def paste_from_clipboard(browser: BrowserContext):
            text = pyperclip.paste()
            # send text to browser
            page = await browser.get_current_page()
            await page.keyboard.type(text)
            # NOTE(review): this return sat inside upload_file, where `text`
            # is undefined; it is placed here, where `text` exists - confirm.
            return ActionResult(extracted_content=text)

        @self.registry.action(
            'Upload file to interactive element with file path ',
        )
        async def upload_file(index: int, path: str, browser: BrowserContext, available_file_paths: list[str]):
            if path not in available_file_paths:
                return ActionResult(error=f'File path {path} is not available')

            if not os.path.exists(path):
                return ActionResult(error=f'File {path} does not exist')

            dom_el = await browser.get_dom_element_by_index(index)
            file_upload_dom_el = dom_el.get_file_upload_element()
            if file_upload_dom_el is None:
                msg = f'No file upload element found at index {index}'
                logger.info(msg)
                return ActionResult(error=msg)

            file_upload_el = await browser.get_locate_element(file_upload_dom_el)
            if file_upload_el is None:
                msg = f'No file upload element found at index {index}'
                logger.info(msg)
                return ActionResult(error=msg)

            try:
                await file_upload_el.set_input_files(path)
                msg = f'Successfully uploaded file to index {index}'
                logger.info(msg)
                return ActionResult(extracted_content=msg, include_in_memory=True)
            except Exception as e:
                msg = f'Failed to upload file to index {index}: {str(e)}'
                logger.info(msg)
                return ActionResult(error=msg)

    @time_execution_sync('--act')
    async def act(
            self,
            action: ActionModel,
            browser_context: Optional[BrowserContext] = None,
            #
            page_extraction_llm: Optional[BaseChatModel] = None,
            sensitive_data: Optional[Dict[str, str]] = None,
            available_file_paths: Optional[list[str]] = None,
            #
            context: Context | None = None,
    ) -> ActionResult:
        """Execute an action.

        Only the first action in `action` whose parameters are not None is
        executed. Actions whose name starts with "mcp" are invoked directly
        as MCP tools; all others go through the registry.

        Returns:
            The ActionResult of the action; a string result is wrapped as
            extracted content and a None result becomes an empty ActionResult.

        Raises:
            ValueError: If an MCP action is not registered, or an action
                returns an unsupported result type.
        """
        for action_name, params in action.model_dump(exclude_unset=True).items():
            if params is None:
                continue

            if action_name.startswith("mcp"):
                # this is a mcp tool
                logger.debug(f"Invoke MCP tool: {action_name}")
                registered = self.registry.registry.actions.get(action_name)
                if registered is None:
                    raise ValueError(f'MCP tool {action_name} is not registered')
                result = await registered.function.ainvoke(params)
            else:
                result = await self.registry.execute_action(
                    action_name,
                    params,
                    browser=browser_context,
                    page_extraction_llm=page_extraction_llm,
                    sensitive_data=sensitive_data,
                    available_file_paths=available_file_paths,
                    context=context,
                )

            if isinstance(result, str):
                return ActionResult(extracted_content=result)
            if isinstance(result, ActionResult):
                return result
            if result is None:
                return ActionResult()
            raise ValueError(f'Invalid action result type: {type(result)} of {result}')
        return ActionResult()

    async def setup_mcp_client(self, mcp_server_config: Optional[Dict[str, Any]] = None):
        """Store the MCP server config and, when one is given, connect and register its tools."""
        self.mcp_server_config = mcp_server_config
        if self.mcp_server_config:
            self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config)
            self.register_mcp_tools()

    def register_mcp_tools(self):
        """
        Register the MCP tools used by this controller.

        Each tool is registered as an action named `mcp.<server_name>.<tool_name>`.
        """
        if not self.mcp_client:
            return
        for server_name, server_tools in self.mcp_client.server_name_to_tools.items():
            for tool in server_tools:
                tool_name = f"mcp.{server_name}.{tool.name}"
                self.registry.registry.actions[tool_name] = RegisteredAction(
                    name=tool_name,
                    description=tool.description,
                    function=tool,
                    param_model=create_tool_param_model(tool),
                )
                logger.info(f"Add mcp tool: {tool_name}")

    async def close_mcp_client(self):
        """Close the MCP client, if any, and forget it so it is never closed twice."""
        if self.mcp_client:
            await self.mcp_client.__aexit__(None, None, None)
            self.mcp_client = None

View File

@@ -1,31 +0,0 @@
import asyncio
class AgentState:
    """Process-wide singleton holding a stop flag and the last valid browser state."""

    _instance = None

    def __new__(cls):
        # Create the single shared instance lazily on first construction.
        shared = cls._instance
        if shared is None:
            shared = super().__new__(cls)
            cls._instance = shared
        return shared

    def __init__(self):
        # __init__ runs on every AgentState() call; initialise only once.
        if hasattr(self, '_stop_requested'):
            return
        self._stop_requested = asyncio.Event()
        self.last_valid_state = None  # store the last valid browser state

    def request_stop(self):
        """Raise the stop flag."""
        self._stop_requested.set()

    def clear_stop(self):
        """Lower the stop flag and drop the stored browser state."""
        self._stop_requested.clear()
        self.last_valid_state = None

    def is_stop_requested(self):
        """Return True while the stop flag is raised."""
        return self._stop_requested.is_set()

    def set_last_valid_state(self, state):
        """Remember `state` as the last valid browser state."""
        self.last_valid_state = state

    def get_last_valid_state(self):
        """Return the stored browser state, or None if there is none."""
        return self.last_valid_state

63
src/utils/config.py Normal file
View File

@@ -0,0 +1,63 @@
# Human-readable display name for each provider key.
# NOTE(review): "ollama", "mistral" and "siliconflow" appear in model_names
# below but have no entry here - confirm that consumers fall back sensibly.
PROVIDER_DISPLAY_NAMES = {
    "openai": "OpenAI",
    "azure_openai": "Azure OpenAI",
    "anthropic": "Anthropic",
    "deepseek": "DeepSeek",
    "google": "Google",
    "alibaba": "Alibaba",
    "moonshot": "MoonShot",
    "unbound": "Unbound AI",
    "ibm": "IBM"
}
# Predefined model names for common providers
# Keys are provider identifiers; each value is the list of model names
# predefined for that provider.
model_names = {
    "anthropic": ["claude-3-5-sonnet-20241022", "claude-3-5-sonnet-20240620", "claude-3-opus-20240229"],
    "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo", "o3-mini"],
    "deepseek": ["deepseek-chat", "deepseek-reasoner"],
    "google": ["gemini-2.0-flash", "gemini-2.0-flash-thinking-exp", "gemini-1.5-flash-latest",
               "gemini-1.5-flash-8b-latest", "gemini-2.0-flash-thinking-exp-01-21", "gemini-2.0-pro-exp-02-05",
               "gemini-2.5-pro-preview-03-25", "gemini-2.5-flash-preview-04-17"],
    "ollama": ["qwen2.5:7b", "qwen2.5:14b", "qwen2.5:32b", "qwen2.5-coder:14b", "qwen2.5-coder:32b", "llama2:7b",
               "deepseek-r1:14b", "deepseek-r1:32b"],
    "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"],
    "mistral": ["pixtral-large-latest", "mistral-large-latest", "mistral-small-latest", "ministral-8b-latest"],
    "alibaba": ["qwen-plus", "qwen-max", "qwen-vl-max", "qwen-vl-plus", "qwen-turbo", "qwen-long"],
    "moonshot": ["moonshot-v1-32k-vision-preview", "moonshot-v1-8k-vision-preview"],
    "unbound": ["gemini-2.0-flash", "gpt-4o-mini", "gpt-4o", "gpt-4.5-preview"],
    "siliconflow": [
        "deepseek-ai/DeepSeek-R1",
        "deepseek-ai/DeepSeek-V3",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "deepseek-ai/DeepSeek-V2.5",
        "deepseek-ai/deepseek-vl2",
        "Qwen/Qwen2.5-72B-Instruct-128K",
        "Qwen/Qwen2.5-72B-Instruct",
        "Qwen/Qwen2.5-32B-Instruct",
        "Qwen/Qwen2.5-14B-Instruct",
        "Qwen/Qwen2.5-7B-Instruct",
        "Qwen/Qwen2.5-Coder-32B-Instruct",
        "Qwen/Qwen2.5-Coder-7B-Instruct",
        "Qwen/Qwen2-7B-Instruct",
        "Qwen/Qwen2-1.5B-Instruct",
        "Qwen/QwQ-32B-Preview",
        "Qwen/Qwen2-VL-72B-Instruct",
        "Qwen/Qwen2.5-VL-32B-Instruct",
        "Qwen/Qwen2.5-VL-72B-Instruct",
        "TeleAI/TeleChat2",
        "THUDM/glm-4-9b-chat",
        "Vendor-A/Qwen/Qwen2.5-72B-Instruct",
        "internlm/internlm2_5-7b-chat",
        "internlm/internlm2_5-20b-chat",
        "Pro/Qwen/Qwen2.5-7B-Instruct",
        "Pro/Qwen/Qwen2-7B-Instruct",
        "Pro/Qwen/Qwen2-1.5B-Instruct",
        "Pro/THUDM/chatglm3-6b",
        "Pro/THUDM/glm-4-9b-chat",
    ],
    "ibm": ["ibm/granite-vision-3.1-2b-preview", "meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
            "meta-llama/llama-3-2-90b-vision-instruct"]
}

View File

@@ -1,387 +0,0 @@
import pdb
from dotenv import load_dotenv
load_dotenv()
import asyncio
import os
import sys
import logging
from pprint import pprint
from uuid import uuid4
from src.utils import utils
from src.agent.custom_agent import CustomAgent
import json
import re
from browser_use.agent.service import Agent
from browser_use.browser.browser import BrowserConfig, Browser
from browser_use.agent.views import ActionResult
from browser_use.browser.context import BrowserContext
from browser_use.controller.service import Controller, DoneAction
from main_content_extractor import MainContentExtractor
from langchain_core.messages import (
AIMessage,
BaseMessage,
HumanMessage,
ToolMessage,
SystemMessage
)
from json_repair import repair_json
from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
from src.controller.custom_controller import CustomController
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import BrowserContextConfig, BrowserContext
from browser_use.browser.context import (
BrowserContextConfig,
BrowserContextWindowSize,
)
logger = logging.getLogger(__name__)
async def deep_research(task, llm, agent_state=None, **kwargs):
task_id = str(uuid4())
save_dir = kwargs.get("save_dir", os.path.join(f"./tmp/deep_research/{task_id}"))
logger.info(f"Save Deep Research at: {save_dir}")
os.makedirs(save_dir, exist_ok=True)
# max query num per iteration
max_query_num = kwargs.get("max_query_num", 3)
use_own_browser = kwargs.get("use_own_browser", False)
extra_chromium_args = []
if use_own_browser:
cdp_url = os.getenv("CHROME_CDP", kwargs.get("chrome_cdp", None))
# TODO: if use own browser, max query num must be 1 per iter, how to solve it?
max_query_num = 1
chrome_path = os.getenv("CHROME_PATH", None)
if chrome_path == "":
chrome_path = None
chrome_user_data = os.getenv("CHROME_USER_DATA", None)
if chrome_user_data:
extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
browser = CustomBrowser(
config=BrowserConfig(
headless=kwargs.get("headless", False),
cdp_url=cdp_url,
disable_security=kwargs.get("disable_security", True),
chrome_instance_path=chrome_path,
extra_chromium_args=extra_chromium_args,
)
)
browser_context = await browser.new_context()
else:
browser = None
browser_context = None
controller = CustomController()
@controller.registry.action(
'Extract page content to get the pure markdown.',
)
async def extract_content(browser: BrowserContext):
page = await browser.get_current_page()
# use jina reader
url = page.url
jina_url = f"https://r.jina.ai/{url}"
await page.goto(jina_url)
output_format = 'markdown'
content = MainContentExtractor.extract( # type: ignore
html=await page.content(),
output_format=output_format,
)
# go back to org url
await page.go_back()
msg = f'Extracted page content:\n{content}\n'
logger.info(msg)
return ActionResult(extracted_content=msg)
search_system_prompt = f"""
You are a **Deep Researcher**, an AI agent specializing in in-depth information gathering and research using a web browser with **automated execution capabilities**. Your expertise lies in formulating comprehensive research plans and executing them meticulously to fulfill complex user requests. You will analyze user instructions, devise a detailed research plan, and determine the necessary search queries to gather the required information.
**Your Task:**
Given a user's research topic, you will:
1. **Develop a Research Plan:** Outline the key aspects and subtopics that need to be investigated to thoroughly address the user's request. This plan should be a high-level overview of the research direction.
2. **Generate Search Queries:** Based on your research plan, generate a list of specific search queries to be executed in a web browser. These queries should be designed to efficiently gather relevant information for each aspect of your plan.
**Output Format:**
Your output will be a JSON object with the following structure:
```json
{{
"plan": "A concise, high-level research plan outlining the key areas to investigate.",
"queries": [
"search query 1",
"search query 2",
//... up to a maximum of {max_query_num} search queries
]
}}
```
**Important:**
* Limit your output to a **maximum of {max_query_num}** search queries.
* Make the search queries to help the automated agent find the needed information. Consider what keywords are most likely to lead to useful results.
* If you have gathered for all the information you want and no further search queries are required, output queries with an empty list: `[]`
* Make sure output search queries are different from the history queries.
**Inputs:**
1. **User Instruction:** The original instruction given by the user.
2. **Previous Queries:** History Queries.
3. **Previous Search Results:** Textual data gathered from prior search queries. If there are no previous search results this string will be empty.
"""
search_messages = [SystemMessage(content=search_system_prompt)]
record_system_prompt = """
You are an expert information recorder. Your role is to process user instructions, current search results, and previously recorded information to extract, summarize, and record new, useful information that helps fulfill the user's request. Your output will be a JSON formatted list, where each element represents a piece of extracted information and follows the structure: `{"url": "source_url", "title": "source_title", "summary_content": "concise_summary", "thinking": "reasoning"}`.
**Important Considerations:**
1. **Minimize Information Loss:** While concise, prioritize retaining important details and nuances from the sources. Aim for a summary that captures the essence of the information without over-simplification. **Crucially, ensure to preserve key data and figures within the `summary_content`. This is essential for later stages, such as generating tables and reports.**
2. **Avoid Redundancy:** Do not record information that is already present in the Previous Recorded Information. Check for semantic similarity, not just exact matches. However, if the same information is expressed differently in a new source and this variation adds valuable context or clarity, it should be included.
3. **Source Information:** Extract and include the source title and URL for each piece of information summarized. This is crucial for verification and context. **The Current Search Results are provided in a specific format, where each item starts with "Title:", followed by the title, then "URL Source:", followed by the URL, and finally "Markdown Content:", followed by the content. Please extract the title and URL from this structure.** If a piece of information cannot be attributed to a specific source from the provided search results, use `"url": "unknown"` and `"title": "unknown"`.
4. **Thinking and Report Structure:** For each extracted piece of information, add a `"thinking"` key. This field should contain your assessment of how this information could be used in a report, which section it might belong to (e.g., introduction, background, analysis, conclusion, specific subtopics), and any other relevant thoughts about its significance or connection to other information.
**Output Format:**
Provide your output as a JSON formatted list. Each item in the list must adhere to the following format:
```json
[
{
"url": "source_url_1",
"title": "source_title_1",
"summary_content": "Concise summary of content. Remember to include key data and figures here.",
"thinking": "This could be used in the introduction to set the context. It also relates to the section on the history of the topic."
},
// ... more entries
{
"url": "unknown",
"title": "unknown",
"summary_content": "concise_summary_of_content_without_clear_source",
"thinking": "This might be useful background information, but I need to verify its accuracy. Could be used in the methodology section to explain how data was collected."
}
]
```
**Inputs:**
1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful and how to structure your thinking.
2. **Previous Recorded Information:** Textual data gathered and recorded from previous searches and processing, represented as a single text string.
3. **Current Search Plan:** Research plan for current search.
4. **Current Search Query:** The current search query.
5. **Current Search Results:** Textual data gathered from the most recent search query.
"""
record_messages = [SystemMessage(content=record_system_prompt)]
search_iteration = 0
max_search_iterations = kwargs.get("max_search_iterations", 10) # Limit search iterations to prevent infinite loop
use_vision = kwargs.get("use_vision", False)
history_query = []
history_infos = []
try:
while search_iteration < max_search_iterations:
search_iteration += 1
logger.info(f"Start {search_iteration}th Search...")
history_query_ = json.dumps(history_query, indent=4)
history_infos_ = json.dumps(history_infos, indent=4)
query_prompt = f"This is search {search_iteration} of {max_search_iterations} maximum searches allowed.\n User Instruction:{task} \n Previous Queries:\n {history_query_} \n Previous Search Results:\n {history_infos_}\n"
search_messages.append(HumanMessage(content=query_prompt))
ai_query_msg = llm.invoke(search_messages[:1] + search_messages[1:][-1:])
search_messages.append(ai_query_msg)
if hasattr(ai_query_msg, "reasoning_content"):
logger.info("🤯 Start Search Deep Thinking: ")
logger.info(ai_query_msg.reasoning_content)
logger.info("🤯 End Search Deep Thinking")
ai_query_content = ai_query_msg.content.replace("```json", "").replace("```", "")
ai_query_content = repair_json(ai_query_content)
ai_query_content = json.loads(ai_query_content)
query_plan = ai_query_content["plan"]
logger.info(f"Current Iteration {search_iteration} Planing:")
logger.info(query_plan)
query_tasks = ai_query_content["queries"]
if not query_tasks:
break
else:
query_tasks = query_tasks[:max_query_num]
history_query.extend(query_tasks)
logger.info("Query tasks:")
logger.info(query_tasks)
# 2. Perform Web Search and Auto exec
# Parallel BU agents
add_infos = "1. Please click on the most relevant link to get information and go deeper, instead of just staying on the search page. \n" \
"2. When opening a PDF file, please remember to extract the content using extract_content instead of simply opening it for the user to view.\n"
if use_own_browser:
agent = CustomAgent(
task=query_tasks[0],
llm=llm,
add_infos=add_infos,
browser=browser,
browser_context=browser_context,
use_vision=use_vision,
system_prompt_class=CustomSystemPrompt,
agent_prompt_class=CustomAgentMessagePrompt,
max_actions_per_step=5,
controller=controller
)
agent_result = await agent.run(max_steps=kwargs.get("max_steps", 10))
query_results = [agent_result]
# Manually close all tab
session = await browser_context.get_session()
pages = session.context.pages
await browser_context.create_new_tab()
for page_id, page in enumerate(pages):
await page.close()
else:
agents = [CustomAgent(
task=task,
llm=llm,
add_infos=add_infos,
browser=browser,
browser_context=browser_context,
use_vision=use_vision,
system_prompt_class=CustomSystemPrompt,
agent_prompt_class=CustomAgentMessagePrompt,
max_actions_per_step=5,
controller=controller,
) for task in query_tasks]
query_results = await asyncio.gather(
*[agent.run(max_steps=kwargs.get("max_steps", 10)) for agent in agents])
if agent_state and agent_state.is_stop_requested():
# Stop
break
# 3. Summarize Search Result
query_result_dir = os.path.join(save_dir, "query_results")
os.makedirs(query_result_dir, exist_ok=True)
for i in range(len(query_tasks)):
query_result = query_results[i].final_result()
if not query_result:
continue
querr_save_path = os.path.join(query_result_dir, f"{search_iteration}-{i}.md")
logger.info(f"save query: {query_tasks[i]} at {querr_save_path}")
with open(querr_save_path, "w", encoding="utf-8") as fw:
fw.write(f"Query: {query_tasks[i]}\n")
fw.write(query_result)
# split query result in case the content is too long
query_results_split = query_result.split("Extracted page content:")
for qi, query_result_ in enumerate(query_results_split):
if not query_result_:
continue
else:
# TODO: limit content lenght: 128k tokens, ~3 chars per token
query_result_ = query_result_[:128000 * 3]
history_infos_ = json.dumps(history_infos, indent=4)
record_prompt = f"User Instruction:{task}. \nPrevious Recorded Information:\n {history_infos_}\n Current Search Iteration: {search_iteration}\n Current Search Plan:\n{query_plan}\n Current Search Query:\n {query_tasks[i]}\n Current Search Results: {query_result_}\n "
record_messages.append(HumanMessage(content=record_prompt))
ai_record_msg = llm.invoke(record_messages[:1] + record_messages[-1:])
record_messages.append(ai_record_msg)
if hasattr(ai_record_msg, "reasoning_content"):
logger.info("🤯 Start Record Deep Thinking: ")
logger.info(ai_record_msg.reasoning_content)
logger.info("🤯 End Record Deep Thinking")
record_content = ai_record_msg.content
record_content = repair_json(record_content)
new_record_infos = json.loads(record_content)
history_infos.extend(new_record_infos)
if agent_state and agent_state.is_stop_requested():
# Stop
break
logger.info("\nFinish Searching, Start Generating Report...")
# 5. Report Generation in Markdown (or JSON if you prefer)
return await generate_final_report(task, history_infos, save_dir, llm)
except Exception as e:
logger.error(f"Deep research Error: {e}")
return await generate_final_report(task, history_infos, save_dir, llm, str(e))
finally:
if browser:
await browser.close()
if browser_context:
await browser_context.close()
logger.info("Browser closed.")
async def generate_final_report(task, history_infos, save_dir, llm, error_msg=None):
    """Generate the final Markdown report from the collected research records.

    The recorded information is dumped to ``record_infos.json`` inside
    ``save_dir``, the LLM is asked to write the report, and the result is
    written to ``final_report.md`` in the same directory.

    Args:
        task: The user's original instruction.
        history_infos: JSON-serialisable list of recorded search findings.
        save_dir: Directory that receives ``record_infos.json`` and ``final_report.md``.
        llm: Chat model exposing a synchronous ``invoke(messages)`` method.
        error_msg: Optional description of the error that interrupted the
            research; when given, a warning banner is prepended to the report.

    Returns:
        ``(report_content, report_file_path)`` on success, or
        ``("Error generating report: ...", None)`` if anything fails.
    """
    try:
        logger.info("\nAttempting to generate final report from collected data...")
        writer_system_prompt = """
You are a **Deep Researcher** and a professional report writer tasked with creating polished, high-quality reports that fully meet the user's needs, based on the user's instructions and the relevant information provided. You will write the report using Markdown format, ensuring it is both informative and visually appealing.
**Specific Instructions:**
* **Structure for Impact:** The report must have a clear, logical, and impactful structure. Begin with a compelling introduction that immediately grabs the reader's attention. Develop well-structured body paragraphs that flow smoothly and logically, and conclude with a concise and memorable conclusion that summarizes key takeaways and leaves a lasting impression.
* **Engaging and Vivid Language:** Employ precise, vivid, and descriptive language to make the report captivating and enjoyable to read. Use stylistic techniques to enhance engagement. Tailor your tone, vocabulary, and writing style to perfectly suit the subject matter and the intended audience to maximize impact and readability.
* **Accuracy, Credibility, and Citations:** Ensure that all information presented is meticulously accurate, rigorously truthful, and robustly supported by the available data. **Cite sources exclusively using bracketed sequential numbers within the text (e.g., [1], [2], etc.). If no references are used, omit citations entirely.** These numbers must correspond to a numbered list of references at the end of the report.
* **Publication-Ready Formatting:** Adhere strictly to Markdown formatting for excellent readability and a clean, highly professional visual appearance. Pay close attention to formatting details like headings, lists, emphasis, and spacing to optimize the visual presentation and reader experience. The report should be ready for immediate publication upon completion, requiring minimal to no further editing for style or format.
* **Conciseness and Clarity (Unless Specified Otherwise):** When the user does not provide a specific length, prioritize concise and to-the-point writing, maximizing information density while maintaining clarity.
* **Data-Driven Comparisons with Tables:** **When appropriate and beneficial for enhancing clarity and impact, present data comparisons in well-structured Markdown tables. This is especially encouraged when dealing with numerical data or when a visual comparison can significantly improve the reader's understanding.**
* **Length Adherence:** When the user specifies a length constraint, meticulously stay within reasonable bounds of that specification, ensuring the content is appropriately scaled without sacrificing quality or completeness.
* **Comprehensive Instruction Following:** Pay meticulous attention to all details and nuances provided in the user instructions. Strive to fulfill every aspect of the user's request with the highest degree of accuracy and attention to detail, creating a report that not only meets but exceeds expectations for quality and professionalism.
* **Reference List Formatting:** The reference list at the end must be formatted as follows:
`[1] Title (URL, if available)`
**Each reference must be separated by a blank line to ensure proper spacing.** For example:
```
[1] Title 1 (URL1, if available)
[2] Title 2 (URL2, if available)
```
**Furthermore, ensure that the reference list is free of duplicates. Each unique source should be listed only once, regardless of how many times it is cited in the text.**
* **ABSOLUTE FINAL OUTPUT RESTRICTION:** **Your output must contain ONLY the finished, publication-ready Markdown report. Do not include ANY extraneous text, phrases, preambles, meta-commentary, or markdown code indicators (e.g., "```markdown```"). The report should begin directly with the title and introductory paragraph, and end directly after the conclusion and the reference list (if applicable).** **Your response will be deemed a failure if this instruction is not followed precisely.**
**Inputs:**
1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful and how to structure your thinking.
2. **Search Information:** Information gathered from the search queries.
"""
        # This function is also the error path of the research loop, so make
        # sure the output directory exists before writing into it.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        history_infos_ = json.dumps(history_infos, indent=4)
        record_json_path = os.path.join(save_dir, "record_infos.json")
        logger.info(f"save All recorded information at {record_json_path}")
        with open(record_json_path, "w", encoding="utf-8") as fw:
            json.dump(history_infos, fw, indent=4)

        report_prompt = f"User Instruction:{task} \n Search Information:\n {history_infos_}"
        # Fresh two-message context: the writer only sees its own system
        # prompt plus the collected information.
        report_messages = [
            SystemMessage(content=writer_system_prompt),
            HumanMessage(content=report_prompt),
        ]
        ai_report_msg = llm.invoke(report_messages)
        if hasattr(ai_report_msg, "reasoning_content"):
            logger.info("🤯 Start Report Deep Thinking: ")
            logger.info(ai_report_msg.reasoning_content)
            logger.info("🤯 End Report Deep Thinking")

        # Only a fence that wraps the whole answer is removed; fences around
        # code blocks inside the report are left intact.
        report_content = _strip_wrapping_code_fence(ai_report_msg.content)

        # Add error notification to the report
        if error_msg:
            report_content = f"## ⚠️ Research Incomplete - Partial Results\n" \
                             f"**The research process was interrupted by an error:** {error_msg}\n\n" \
                             f"{report_content}"

        report_file_path = os.path.join(save_dir, "final_report.md")
        with open(report_file_path, "w", encoding="utf-8") as f:
            f.write(report_content)
        logger.info(f"Save Report at: {report_file_path}")
        return report_content, report_file_path
    except Exception as report_error:
        # Best-effort boundary: callers expect a (message, None) tuple rather
        # than an exception, but keep the traceback in the log.
        logger.error(f"Failed to generate partial report: {report_error}", exc_info=True)
        return f"Error generating report: {str(report_error)}", None


def _strip_wrapping_code_fence(text):
    """Return ``text`` stripped, without a Markdown fence that wraps all of it.

    An opening fence is removed only when it is the very first thing in the
    text and is bare or tagged ``markdown``/``md``; the closing fence is
    removed only if such an opening fence was found.
    """
    opening = re.match(r"\s*```[ \t]*(?:markdown|md)?(?![\w+`\-])[ \t]*\n?", text, flags=re.IGNORECASE)
    if opening is None:
        return text.strip()
    unwrapped = re.sub(r"\n?[ \t]*```\s*\Z", "", text[opening.end():])
    return unwrapped.strip()

View File

@@ -1,138 +0,0 @@
from openai import OpenAI
import pdb
from langchain_openai import ChatOpenAI
from langchain_core.globals import get_llm_cache
from langchain_core.language_models.base import (
BaseLanguageModel,
LangSmithParams,
LanguageModelInput,
)
from langchain_core.load import dumpd, dumps
from langchain_core.messages import (
AIMessage,
SystemMessage,
AnyMessage,
BaseMessage,
BaseMessageChunk,
HumanMessage,
convert_to_messages,
message_chunk_to_message,
)
from langchain_core.outputs import (
ChatGeneration,
ChatGenerationChunk,
ChatResult,
LLMResult,
RunInfo,
)
from langchain_ollama import ChatOllama
from langchain_core.output_parsers.base import OutputParserLike
from langchain_core.runnables import Runnable, RunnableConfig
from langchain_core.tools import BaseTool
from typing import (
TYPE_CHECKING,
Any,
Callable,
Literal,
Optional,
Union,
cast, List,
)
class DeepSeekR1ChatOpenAI(ChatOpenAI):
    """ChatOpenAI variant that exposes the ``reasoning_content`` of a reply.

    Requests are sent through a plain OpenAI SDK client and the returned
    message's ``reasoning_content`` is attached to the resulting ``AIMessage``.
    ``config``, ``stop`` and extra keyword arguments of ``invoke``/``ainvoke``
    are accepted for interface compatibility but are not forwarded.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        # Replace the client set up by ChatOpenAI with a raw SDK client so the
        # provider-specific ``reasoning_content`` field stays accessible.
        self.client = OpenAI(
            base_url=kwargs.get("base_url"),
            api_key=kwargs.get("api_key")
        )

    def _as_openai_messages(self, messages) -> list:
        """Convert LangChain messages into OpenAI ``{"role", "content"}`` dicts."""
        converted = []
        for message in messages:
            if isinstance(message, SystemMessage):
                role = "system"
            elif isinstance(message, AIMessage):
                role = "assistant"
            else:
                # Every other message type is sent as a user turn.
                role = "user"
            converted.append({"role": role, "content": message.content})
        return converted

    def _reasoner_chat(self, messages) -> AIMessage:
        """Run one blocking chat completion and wrap the first choice."""
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=self._as_openai_messages(messages)
        )
        reply = response.choices[0].message
        # Not every backend/model returns reasoning; a missing field must not
        # turn a valid answer into an AttributeError.
        reasoning_content = getattr(reply, "reasoning_content", None)
        return AIMessage(content=reply.content or "", reasoning_content=reasoning_content)

    async def ainvoke(
            self,
            input: LanguageModelInput,
            config: Optional[RunnableConfig] = None,
            *,
            stop: Optional[list[str]] = None,
            **kwargs: Any,
    ) -> AIMessage:
        """Async counterpart of :meth:`invoke`.

        The SDK call is synchronous, so it is executed in a worker thread to
        keep the event loop free for other coroutines.
        """
        import asyncio

        return await asyncio.to_thread(self._reasoner_chat, input)

    def invoke(
            self,
            input: LanguageModelInput,
            config: Optional[RunnableConfig] = None,
            *,
            stop: Optional[list[str]] = None,
            **kwargs: Any,
    ) -> AIMessage:
        """Send ``input`` (an iterable of messages) and return the reply."""
        return self._reasoner_chat(input)
class DeepSeekR1ChatOllama(ChatOllama):
    """ChatOllama variant that separates the ``<think>`` block from the answer.

    The text before the first ``</think>`` becomes ``reasoning_content`` and
    the text after it becomes the message content.
    """

    def _split_think_block(self, raw_message) -> AIMessage:
        """Build an AIMessage with reasoning and answer split apart."""
        raw_text = raw_message.content
        reasoning, closing_tag, answer = raw_text.partition("</think>")
        if not closing_tag:
            # No reasoning block in the reply: treat everything as the answer
            # instead of failing on a missing second segment.
            reasoning, answer = "", raw_text
        reasoning = reasoning.replace("<think>", "")
        if "**JSON Response:**" in answer:
            answer = answer.split("**JSON Response:**")[-1]
        return AIMessage(content=answer, reasoning_content=reasoning)

    async def ainvoke(
            self,
            input: LanguageModelInput,
            config: Optional[RunnableConfig] = None,
            *,
            stop: Optional[list[str]] = None,
            **kwargs: Any,
    ) -> AIMessage:
        """Async invoke; forwards ``config``, ``stop`` and ``kwargs`` to ChatOllama."""
        org_ai_message = await super().ainvoke(input, config, stop=stop, **kwargs)
        return self._split_think_block(org_ai_message)

    def invoke(
            self,
            input: LanguageModelInput,
            config: Optional[RunnableConfig] = None,
            *,
            stop: Optional[list[str]] = None,
            **kwargs: Any,
    ) -> AIMessage:
        """Sync invoke; forwards ``config``, ``stop`` and ``kwargs`` to ChatOllama."""
        org_ai_message = super().invoke(input, config, stop=stop, **kwargs)
        return self._split_think_block(org_ai_message)

327
src/utils/llm_provider.py Normal file
View File

@@ -0,0 +1,327 @@
from openai import OpenAI
import pdb
from langchain_openai import ChatOpenAI
from langchain_core.globals import get_llm_cache
from langchain_core.language_models.base import (
BaseLanguageModel,
LangSmithParams,
LanguageModelInput,
)
import os
from langchain_core.load import dumpd, dumps
from langchain_core.messages import (
AIMessage,
SystemMessage,
AnyMessage,
BaseMessage,
BaseMessageChunk,
HumanMessage,
convert_to_messages,
message_chunk_to_message,
)
from langchain_core.outputs import (
ChatGeneration,
ChatGenerationChunk,
ChatResult,
LLMResult,
RunInfo,
)
from langchain_ollama import ChatOllama
from langchain_core.output_parsers.base import OutputParserLike
from langchain_core.runnables import Runnable, RunnableConfig
from langchain_core.tools import BaseTool
from typing import (
TYPE_CHECKING,
Any,
Callable,
Literal,
Optional,
Union,
cast, List,
)
from langchain_anthropic import ChatAnthropic
from langchain_mistralai import ChatMistralAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_ollama import ChatOllama
from langchain_openai import AzureChatOpenAI, ChatOpenAI
from langchain_ibm import ChatWatsonx
from langchain_aws import ChatBedrock
from pydantic import SecretStr
from src.utils import config
class DeepSeekR1ChatOpenAI(ChatOpenAI):
    """ChatOpenAI variant that exposes the ``reasoning_content`` of a reply.

    Requests are sent through a plain OpenAI SDK client and the returned
    message's ``reasoning_content`` is attached to the resulting ``AIMessage``.
    ``config``, ``stop`` and extra keyword arguments of ``invoke``/``ainvoke``
    are accepted for interface compatibility but are not forwarded.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        # Replace the client set up by ChatOpenAI with a raw SDK client so the
        # provider-specific ``reasoning_content`` field stays accessible.
        self.client = OpenAI(
            base_url=kwargs.get("base_url"),
            api_key=kwargs.get("api_key")
        )

    def _as_openai_messages(self, messages) -> list:
        """Convert LangChain messages into OpenAI ``{"role", "content"}`` dicts."""
        converted = []
        for message in messages:
            if isinstance(message, SystemMessage):
                role = "system"
            elif isinstance(message, AIMessage):
                role = "assistant"
            else:
                # Every other message type is sent as a user turn.
                role = "user"
            converted.append({"role": role, "content": message.content})
        return converted

    def _reasoner_chat(self, messages) -> AIMessage:
        """Run one blocking chat completion and wrap the first choice."""
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=self._as_openai_messages(messages)
        )
        reply = response.choices[0].message
        # Not every backend/model returns reasoning; a missing field must not
        # turn a valid answer into an AttributeError.
        reasoning_content = getattr(reply, "reasoning_content", None)
        return AIMessage(content=reply.content or "", reasoning_content=reasoning_content)

    async def ainvoke(
            self,
            input: LanguageModelInput,
            config: Optional[RunnableConfig] = None,
            *,
            stop: Optional[list[str]] = None,
            **kwargs: Any,
    ) -> AIMessage:
        """Async counterpart of :meth:`invoke`.

        The SDK call is synchronous, so it is executed in a worker thread to
        keep the event loop free for other coroutines.
        """
        import asyncio

        return await asyncio.to_thread(self._reasoner_chat, input)

    def invoke(
            self,
            input: LanguageModelInput,
            config: Optional[RunnableConfig] = None,
            *,
            stop: Optional[list[str]] = None,
            **kwargs: Any,
    ) -> AIMessage:
        """Send ``input`` (an iterable of messages) and return the reply."""
        return self._reasoner_chat(input)
class DeepSeekR1ChatOllama(ChatOllama):
    """ChatOllama variant that separates the ``<think>`` block from the answer.

    The text before the first ``</think>`` becomes ``reasoning_content`` and
    the text after it becomes the message content.
    """

    def _split_think_block(self, raw_message) -> AIMessage:
        """Build an AIMessage with reasoning and answer split apart."""
        raw_text = raw_message.content
        reasoning, closing_tag, answer = raw_text.partition("</think>")
        if not closing_tag:
            # No reasoning block in the reply: treat everything as the answer
            # instead of failing on a missing second segment.
            reasoning, answer = "", raw_text
        reasoning = reasoning.replace("<think>", "")
        if "**JSON Response:**" in answer:
            answer = answer.split("**JSON Response:**")[-1]
        return AIMessage(content=answer, reasoning_content=reasoning)

    async def ainvoke(
            self,
            input: LanguageModelInput,
            config: Optional[RunnableConfig] = None,
            *,
            stop: Optional[list[str]] = None,
            **kwargs: Any,
    ) -> AIMessage:
        """Async invoke; forwards ``config``, ``stop`` and ``kwargs`` to ChatOllama."""
        org_ai_message = await super().ainvoke(input, config, stop=stop, **kwargs)
        return self._split_think_block(org_ai_message)

    def invoke(
            self,
            input: LanguageModelInput,
            config: Optional[RunnableConfig] = None,
            *,
            stop: Optional[list[str]] = None,
            **kwargs: Any,
    ) -> AIMessage:
        """Sync invoke; forwards ``config``, ``stop`` and ``kwargs`` to ChatOllama."""
        org_ai_message = super().invoke(input, config, stop=stop, **kwargs)
        return self._split_think_block(org_ai_message)
def get_llm_model(provider: str, **kwargs):
    """
    Get LLM model

    Every provider except ``ollama`` and ``bedrock`` needs an API key, taken
    from ``kwargs["api_key"]`` or the ``<PROVIDER>_API_KEY`` environment
    variable. Where a provider accepts an endpoint, an explicit
    ``kwargs["base_url"]`` wins over the provider's endpoint environment
    variable, which wins over the built-in default.

    :param provider: LLM provider
    :param kwargs: optional ``api_key``, ``base_url``, ``model_name``,
        ``temperature``, ``num_ctx``, ``num_predict``, ``api_version``
    :return: the configured chat model instance
    :raises ValueError: if a required API key is missing or the provider is
        not supported
    """
    api_key = None
    if provider not in ["ollama", "bedrock"]:
        env_var = f"{provider.upper()}_API_KEY"
        api_key = kwargs.get("api_key", "") or os.getenv(env_var, "")
        if not api_key:
            provider_display = config.PROVIDER_DISPLAY_NAMES.get(provider, provider.upper())
            error_msg = f"💥 {provider_display} API key not found! 🔑 Please set the `{env_var}` environment variable or provide it in the UI."
            raise ValueError(error_msg)
        kwargs["api_key"] = api_key

    temperature = kwargs.get("temperature", 0.0)

    if provider == "anthropic":
        return ChatAnthropic(
            model=kwargs.get("model_name", "claude-3-5-sonnet-20241022"),
            temperature=temperature,
            base_url=_resolve_base_url(kwargs, default="https://api.anthropic.com"),
            api_key=api_key,
        )
    elif provider == 'mistral':
        return ChatMistralAI(
            model=kwargs.get("model_name", "mistral-large-latest"),
            temperature=temperature,
            base_url=_resolve_base_url(kwargs, "MISTRAL_ENDPOINT", "https://api.mistral.ai/v1"),
            api_key=api_key,
        )
    elif provider == "openai":
        return ChatOpenAI(
            model=kwargs.get("model_name", "gpt-4o"),
            temperature=temperature,
            base_url=_resolve_base_url(kwargs, "OPENAI_ENDPOINT", "https://api.openai.com/v1"),
            api_key=api_key,
        )
    elif provider == "deepseek":
        base_url = _resolve_base_url(kwargs, "DEEPSEEK_ENDPOINT", "")
        if kwargs.get("model_name", "deepseek-chat") == "deepseek-reasoner":
            # The reasoner model needs the wrapper that surfaces reasoning_content.
            return DeepSeekR1ChatOpenAI(
                model=kwargs.get("model_name", "deepseek-reasoner"),
                temperature=temperature,
                base_url=base_url,
                api_key=api_key,
            )
        else:
            return ChatOpenAI(
                model=kwargs.get("model_name", "deepseek-chat"),
                temperature=temperature,
                base_url=base_url,
                api_key=api_key,
            )
    elif provider == "google":
        return ChatGoogleGenerativeAI(
            model=kwargs.get("model_name", "gemini-2.0-flash-exp"),
            temperature=temperature,
            api_key=api_key,
        )
    elif provider == "ollama":
        base_url = _resolve_base_url(kwargs, "OLLAMA_ENDPOINT", "http://localhost:11434")
        if "deepseek-r1" in kwargs.get("model_name", "qwen2.5:7b"):
            return DeepSeekR1ChatOllama(
                model=kwargs.get("model_name", "deepseek-r1:14b"),
                temperature=temperature,
                num_ctx=kwargs.get("num_ctx", 32000),
                base_url=base_url,
            )
        else:
            return ChatOllama(
                model=kwargs.get("model_name", "qwen2.5:7b"),
                temperature=temperature,
                num_ctx=kwargs.get("num_ctx", 32000),
                num_predict=kwargs.get("num_predict", 1024),
                base_url=base_url,
            )
    elif provider == "azure_openai":
        api_version = kwargs.get("api_version", "") or os.getenv("AZURE_OPENAI_API_VERSION", "2025-01-01-preview")
        return AzureChatOpenAI(
            model=kwargs.get("model_name", "gpt-4o"),
            temperature=temperature,
            api_version=api_version,
            azure_endpoint=_resolve_base_url(kwargs, "AZURE_OPENAI_ENDPOINT", ""),
            api_key=api_key,
        )
    elif provider == "alibaba":
        return ChatOpenAI(
            model=kwargs.get("model_name", "qwen-plus"),
            temperature=temperature,
            base_url=_resolve_base_url(
                kwargs, "ALIBABA_ENDPOINT", "https://dashscope.aliyuncs.com/compatible-mode/v1"),
            api_key=api_key,
        )
    elif provider == "ibm":
        parameters = {
            "temperature": temperature,
            "max_tokens": kwargs.get("num_ctx", 32000)
        }
        return ChatWatsonx(
            model_id=kwargs.get("model_name", "ibm/granite-vision-3.1-2b-preview"),
            url=_resolve_base_url(kwargs, "IBM_ENDPOINT", "https://us-south.ml.cloud.ibm.com"),
            project_id=os.getenv("IBM_PROJECT_ID"),
            # Use the key resolved above so a key supplied by the caller is
            # honoured, not only the IBM_API_KEY environment variable.
            apikey=api_key,
            params=parameters
        )
    elif provider == "moonshot":
        return ChatOpenAI(
            model=kwargs.get("model_name", "moonshot-v1-32k-vision-preview"),
            temperature=temperature,
            base_url=_resolve_base_url(kwargs, "MOONSHOT_ENDPOINT", None),
            # Same as above: honour a caller-supplied key.
            api_key=api_key,
        )
    elif provider == "unbound":
        return ChatOpenAI(
            model=kwargs.get("model_name", "gpt-4o-mini"),
            temperature=temperature,
            base_url=_resolve_base_url(kwargs, "UNBOUND_ENDPOINT", "https://api.getunbound.ai"),
            api_key=api_key,
        )
    elif provider == "siliconflow":
        base_url = kwargs.get("base_url", "")
        if not base_url:
            # The historical mixed-case variable name is kept for existing
            # setups; the upper-case spelling matches SILICONFLOW_API_KEY.
            base_url = os.getenv("SiliconFLOW_ENDPOINT", "") or os.getenv("SILICONFLOW_ENDPOINT", "")
        return ChatOpenAI(
            api_key=api_key,
            base_url=base_url,
            model_name=kwargs.get("model_name", "Qwen/QwQ-32B"),
            temperature=temperature,
        )
    else:
        # NOTE: "bedrock" is exempt from the API-key check above but has no
        # branch here, so it currently ends up as an unsupported provider.
        raise ValueError(f"Unsupported provider: {provider}")


def _resolve_base_url(kwargs, env_var=None, default=""):
    """Pick the endpoint: explicit ``base_url`` kwarg, then ``env_var``, then ``default``."""
    explicit = kwargs.get("base_url", "")
    if explicit:
        return explicit
    if env_var is None:
        return default
    return os.getenv(env_var, default)

267
src/utils/mcp_client.py Normal file
View File

@@ -0,0 +1,267 @@
import os
import asyncio
import base64
import pdb
from typing import List, Tuple, Optional
from langchain_core.tools import BaseTool
from langchain_mcp_adapters.client import MultiServerMCPClient
import base64
import json
import logging
from typing import Optional, Dict, Any, Type
from langchain_core.tools import BaseTool
from pydantic.v1 import BaseModel, Field
from langchain_core.runnables import RunnableConfig
from pydantic import BaseModel, Field, create_model
from typing import Type, Dict, Any, Optional, get_type_hints, List, Union, Annotated, Set
from pydantic import BaseModel, ConfigDict, create_model, Field
from langchain.tools import BaseTool
import inspect
from datetime import datetime, date, time
import uuid
from enum import Enum
import inspect
from browser_use.controller.registry.views import ActionModel
from typing import Type, Dict, Any, Optional, get_type_hints
logger = logging.getLogger(__name__)
async def setup_mcp_client_and_tools(mcp_server_config: Dict[str, Any]) -> Optional[MultiServerMCPClient]:
    """
    Create a MultiServerMCPClient for the given configuration and enter it.

    If the configuration has an ``"mcpServers"`` key, the value under that key
    is what gets passed to the client; otherwise the configuration is passed
    as is.

    Args:
        mcp_server_config: MCP server configuration, optionally wrapped in an
            ``"mcpServers"`` key.

    Returns:
        The entered client instance, or None when the configuration is empty
        or the client could not be set up (the error is logged).
    """
    logger.info("Initializing MultiServerMCPClient...")

    if mcp_server_config:
        try:
            server_map = (
                mcp_server_config["mcpServers"]
                if "mcpServers" in mcp_server_config
                else mcp_server_config
            )
            mcp_client = MultiServerMCPClient(server_map)
            # Entered manually: no matching exit happens in this function.
            await mcp_client.__aenter__()
        except Exception as e:
            logger.error(f"Failed to setup MCP client or fetch tools: {e}", exc_info=True)
            return None
        return mcp_client

    logger.error("No MCP server configuration provided.")
    return None
def create_tool_param_model(tool: BaseTool) -> Type[BaseModel]:
    """Creates a Pydantic model from a LangChain tool's schema

    When the tool has an ``args_schema``, one field is created per entry of
    its JSON-schema ``properties``. Otherwise the fields are derived from the
    signature of the tool's ``_run`` method.

    Args:
        tool: The LangChain tool to describe.

    Returns:
        A model named ``<tool name>_parameters`` that derives from ``ActionModel``.
    """
    # Get tool schema information
    json_schema = tool.args_schema
    tool_name = tool.name
    model_name = f'{tool_name}_parameters'

    # args_schema is not always a plain dict; a pydantic model class has to be
    # turned into its JSON schema before it can be inspected with ``in``.
    if json_schema is not None and not isinstance(json_schema, dict):
        to_schema = getattr(json_schema, 'model_json_schema', None) or getattr(json_schema, 'schema', None)
        if callable(to_schema):
            json_schema = to_schema()

    # If the tool already has a schema defined, convert it to a new param_model
    if json_schema is not None:
        # JSON-schema keyword -> pydantic Field keyword
        constraint_keywords = (
            ('minimum', 'ge'),
            ('maximum', 'le'),
            ('minLength', 'min_length'),
            ('maxLength', 'max_length'),
            ('pattern', 'pattern'),
        )
        params = {}

        # Process properties if they exist
        if 'properties' in json_schema:
            required_fields: Set[str] = set(json_schema.get('required', []))

            for prop_name, prop_details in json_schema['properties'].items():
                field_type = resolve_type(prop_details, f"{tool_name}_{prop_name}")

                # Required fields without an explicit default get ``...``
                # (pydantic's "required" marker); optional ones default to None.
                is_required = prop_name in required_fields
                field_kwargs = {'default': prop_details.get('default', ... if is_required else None)}

                description = prop_details.get('description', '')
                if description:
                    field_kwargs['description'] = description

                for schema_key, field_key in constraint_keywords:
                    if schema_key in prop_details:
                        field_kwargs[field_key] = prop_details[schema_key]

                params[prop_name] = (field_type, Field(**field_kwargs))

        return create_model(
            model_name,
            __base__=ActionModel,
            **params,  # type: ignore
        )

    # If no schema is defined, extract parameters from the _run method
    run_method = tool._run
    sig = inspect.signature(run_method)

    # Get type hints for better type information
    try:
        type_hints = get_type_hints(run_method)
    except Exception:
        # Unresolvable annotations: fall back to the raw signature annotations.
        type_hints = {}

    variadic_kinds = (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)
    params = {}
    for name, param in sig.parameters.items():
        # Skip 'self' and *args/**kwargs: the latter are catch-alls, not
        # parameters a caller can be asked to fill in.
        if name == 'self' or param.kind in variadic_kinds:
            continue

        # Get annotation from type hints if available, otherwise from signature
        annotation = type_hints.get(name, param.annotation)
        if annotation is inspect.Parameter.empty:
            annotation = Any

        # Use default value if available, otherwise make it required.
        # Identity comparison: the default itself may define ``!=`` oddly.
        if param.default is not inspect.Parameter.empty:
            params[name] = (annotation, param.default)
        else:
            params[name] = (annotation, ...)

    return create_model(
        model_name,
        __base__=ActionModel,
        **params,  # type: ignore
    )
def resolve_type(prop_details: Dict[str, Any], prefix: str = "") -> Any:
    """Recursively resolves JSON schema type to Python/Pydantic type

    Args:
        prop_details: JSON-schema fragment describing one value.
        prefix: Name prefix for the enum and model classes created on the fly.

    Returns:
        A Python type, typing construct, dynamically created ``Enum`` or
        pydantic model. ``Any`` is returned for ``$ref`` (references are not
        resolved) and for anything that cannot be mapped.
    """
    # Handle reference types
    if '$ref' in prop_details:
        # In a real application, reference resolution would be needed
        return Any

    # Basic type mapping
    type_mapping = {
        'string': str,
        'integer': int,
        'number': float,
        'boolean': bool,
        'array': List,
        'object': Dict,
        'null': type(None),
    }

    # Handle formatted strings
    if prop_details.get('type') == 'string' and 'format' in prop_details:
        format_mapping = {
            'date-time': datetime,
            'date': date,
            'time': time,
            'email': str,
            'uri': str,
            'url': str,
            'uuid': uuid.UUID,
            'binary': bytes,
        }
        return format_mapping.get(prop_details['format'], str)

    # Handle enum types
    if 'enum' in prop_details:
        enum_dict = {}
        for i, v in enumerate(prop_details['enum']):
            key = f"VALUE_{i}"
            if isinstance(v, str):
                candidate = v.upper().replace(' ', '_').replace('-', '_')
                # Leading underscores are avoided because Enum gives special
                # meaning to some underscore-prefixed names.
                if candidate.isidentifier() and not candidate.startswith('_'):
                    key = candidate
            # Different values can normalise to the same name ("a-b"/"a_b");
            # never let one overwrite the other.
            if key in enum_dict:
                key = f"VALUE_{i}"
            while key in enum_dict:
                key += "_"
            enum_dict[key] = v

        # Only create enum if we have values
        if enum_dict:
            return Enum(f"{prefix}_Enum", enum_dict)
        return str  # Fallback

    # Handle array types
    if prop_details.get('type') == 'array' and 'items' in prop_details:
        item_type = resolve_type(prop_details['items'], f"{prefix}_item")
        return List[item_type]  # type: ignore

    # Handle object types with properties
    if prop_details.get('type') == 'object' and 'properties' in prop_details:
        required_fields = prop_details.get('required', [])
        nested_params = {}
        for nested_name, nested_details in prop_details['properties'].items():
            nested_type = resolve_type(nested_details, f"{prefix}_{nested_name}")
            is_required = nested_name in required_fields
            field_kwargs = {'default': nested_details.get('default', ... if is_required else None)}
            description = nested_details.get('description', '')
            if description:
                field_kwargs['description'] = description
            nested_params[nested_name] = (nested_type, Field(**field_kwargs))

        # Create nested model
        return create_model(f"{prefix}_Model", **nested_params)

    # Handle union types (oneOf, anyOf)
    if 'oneOf' in prop_details or 'anyOf' in prop_details:
        # ``or []`` covers a key that is present but empty.
        union_schema = prop_details.get('oneOf') or prop_details.get('anyOf') or []
        union_types = [resolve_type(t, f"{prefix}_{i}") for i, t in enumerate(union_schema)]
        if union_types:
            return Union[tuple(union_types)]  # type: ignore
        return Any

    # Handle allOf (intersection types)
    if 'allOf' in prop_details:
        nested_params = {}
        for i, schema_part in enumerate(prop_details['allOf']):
            if 'properties' not in schema_part:
                continue
            required_fields = schema_part.get('required', [])
            for nested_name, nested_details in schema_part['properties'].items():
                nested_type = resolve_type(nested_details, f"{prefix}_allOf_{i}_{nested_name}")
                is_required = nested_name in required_fields
                nested_params[nested_name] = (nested_type, ... if is_required else None)

        # Create composite model
        if nested_params:
            return create_model(f"{prefix}_CompositeModel", **nested_params)
        return Dict

    # Default to basic types
    schema_type = prop_details.get('type', 'string')
    if isinstance(schema_type, list):
        # Handle multiple types (e.g., ["string", "null"]); only the first
        # non-null type is used.
        non_null_types = [t for t in schema_type if t != 'null']
        if non_null_types:
            primary_type = type_mapping.get(non_null_types[0], Any)
            if 'null' in schema_type:
                return Optional[primary_type]  # type: ignore
            return primary_type
        return Any

    return type_mapping.get(schema_type, Any)

View File

@@ -8,282 +8,6 @@ import json
import gradio as gr
import uuid
from langchain_anthropic import ChatAnthropic
from langchain_mistralai import ChatMistralAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_ollama import ChatOllama
from langchain_openai import AzureChatOpenAI, ChatOpenAI
from langchain_ibm import ChatWatsonx
from .llm import DeepSeekR1ChatOpenAI, DeepSeekR1ChatOllama
# Human-readable provider names for user-facing messages such as the
# missing-API-key error. A provider that is absent from this map is shown as
# its upper-cased identifier instead.
PROVIDER_DISPLAY_NAMES = {
    "openai": "OpenAI",
    "azure_openai": "Azure OpenAI",
    "anthropic": "Anthropic",
    "deepseek": "DeepSeek",
    "google": "Google",
    "mistral": "Mistral",
    "alibaba": "Alibaba",
    "moonshot": "MoonShot",
    "unbound": "Unbound AI",
    "siliconflow": "SiliconFlow",
    "ibm": "IBM",
}
def get_llm_model(provider: str, **kwargs):
    """
    Build the chat model for the requested LLM provider.

    :param provider: Provider identifier, e.g. "openai", "anthropic", "ollama".
    :param kwargs: Optional settings. Recognised keys are ``model_name``,
        ``temperature``, ``base_url``, ``api_key``, ``api_version``
        (azure_openai), ``num_ctx`` (ollama, ibm) and ``num_predict`` (ollama).
        An empty ``base_url`` / ``api_key`` falls back to the provider's
        environment variable (and then to a built-in default where one exists).
    :return: The chat model instance for the provider.
    :raises MissingAPIKeyError: If the provider needs an API key and none is
        supplied through ``kwargs`` or the environment.
    :raises ValueError: If ``provider`` is not supported.
    """
    api_key = None
    if provider not in ["ollama"]:
        env_var = f"{provider.upper()}_API_KEY"
        api_key = kwargs.get("api_key", "") or os.getenv(env_var, "")
        if not api_key and provider == "siliconflow":
            # The siliconflow branch historically read this mixed-case name;
            # honour it before reporting the key as missing.
            api_key = os.getenv("SiliconFLOW_API_KEY", "")
        if not api_key:
            raise MissingAPIKeyError(provider, env_var)
        kwargs["api_key"] = api_key

    def _resolve_base_url(env_var=None, default=""):
        """Return the explicit ``base_url`` kwarg, else the env var, else ``default``."""
        explicit = kwargs.get("base_url", "")
        if explicit:
            return explicit
        if env_var is None:
            return default
        return os.getenv(env_var, default)

    temperature = kwargs.get("temperature", 0.0)

    if provider == "anthropic":
        return ChatAnthropic(
            model=kwargs.get("model_name", "claude-3-5-sonnet-20241022"),
            temperature=temperature,
            base_url=_resolve_base_url(default="https://api.anthropic.com"),
            api_key=api_key,
        )
    elif provider == 'mistral':
        return ChatMistralAI(
            model=kwargs.get("model_name", "mistral-large-latest"),
            temperature=temperature,
            base_url=_resolve_base_url("MISTRAL_ENDPOINT", "https://api.mistral.ai/v1"),
            api_key=api_key,
        )
    elif provider == "openai":
        return ChatOpenAI(
            model=kwargs.get("model_name", "gpt-4o"),
            temperature=temperature,
            base_url=_resolve_base_url("OPENAI_ENDPOINT", "https://api.openai.com/v1"),
            api_key=api_key,
        )
    elif provider == "deepseek":
        base_url = _resolve_base_url("DEEPSEEK_ENDPOINT", "")
        model_name = kwargs.get("model_name", "deepseek-chat")
        # The reasoner model needs the R1-specific wrapper class.
        model_cls = DeepSeekR1ChatOpenAI if model_name == "deepseek-reasoner" else ChatOpenAI
        return model_cls(
            model=model_name,
            temperature=temperature,
            base_url=base_url,
            api_key=api_key,
        )
    elif provider == "google":
        return ChatGoogleGenerativeAI(
            model=kwargs.get("model_name", "gemini-2.0-flash-exp"),
            temperature=temperature,
            api_key=api_key,
        )
    elif provider == "ollama":
        base_url = _resolve_base_url("OLLAMA_ENDPOINT", "http://localhost:11434")
        model_name = kwargs.get("model_name", "qwen2.5:7b")
        if "deepseek-r1" in model_name:
            return DeepSeekR1ChatOllama(
                model=model_name,
                temperature=temperature,
                num_ctx=kwargs.get("num_ctx", 32000),
                base_url=base_url,
            )
        return ChatOllama(
            model=model_name,
            temperature=temperature,
            num_ctx=kwargs.get("num_ctx", 32000),
            num_predict=kwargs.get("num_predict", 1024),
            base_url=base_url,
        )
    elif provider == "azure_openai":
        api_version = kwargs.get("api_version", "") or os.getenv("AZURE_OPENAI_API_VERSION", "2025-01-01-preview")
        return AzureChatOpenAI(
            model=kwargs.get("model_name", "gpt-4o"),
            temperature=temperature,
            api_version=api_version,
            azure_endpoint=_resolve_base_url("AZURE_OPENAI_ENDPOINT", ""),
            api_key=api_key,
        )
    elif provider == "alibaba":
        return ChatOpenAI(
            model=kwargs.get("model_name", "qwen-plus"),
            temperature=temperature,
            base_url=_resolve_base_url("ALIBABA_ENDPOINT", "https://dashscope.aliyuncs.com/compatible-mode/v1"),
            api_key=api_key,
        )
    elif provider == "ibm":
        parameters = {
            "temperature": temperature,
            "max_tokens": kwargs.get("num_ctx", 32000)
        }
        return ChatWatsonx(
            model_id=kwargs.get("model_name", "ibm/granite-vision-3.1-2b-preview"),
            url=_resolve_base_url("IBM_ENDPOINT", "https://us-south.ml.cloud.ibm.com"),
            project_id=os.getenv("IBM_PROJECT_ID"),
            # Use the resolved key so a key supplied by the caller is honoured,
            # not only the IBM_API_KEY environment variable.
            apikey=api_key,
            params=parameters
        )
    elif provider == "moonshot":
        return ChatOpenAI(
            model=kwargs.get("model_name", "moonshot-v1-32k-vision-preview"),
            temperature=temperature,
            # Explicit base_url / api_key win over the environment.
            base_url=_resolve_base_url("MOONSHOT_ENDPOINT", None),
            api_key=api_key,
        )
    elif provider == "unbound":
        return ChatOpenAI(
            model=kwargs.get("model_name", "gpt-4o-mini"),
            temperature=temperature,
            base_url=_resolve_base_url("UNBOUND_ENDPOINT", "https://api.getunbound.ai"),
            api_key=api_key,
        )
    elif provider == "siliconflow":
        return ChatOpenAI(
            api_key=api_key,
            base_url=_resolve_base_url("SiliconFLOW_ENDPOINT", ""),
            model_name=kwargs.get("model_name", "Qwen/QwQ-32B"),
            temperature=temperature,
        )
    else:
        raise ValueError(f"Unsupported provider: {provider}")
# Predefined model names for common providers.
# Maps a provider identifier to the list of model names offered for it; the
# first entry of each list is the one preselected by the model dropdown.
model_names = {
    "anthropic": ["claude-3-5-sonnet-20241022", "claude-3-5-sonnet-20240620", "claude-3-opus-20240229"],
    "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo", "o3-mini"],
    "deepseek": ["deepseek-chat", "deepseek-reasoner"],
    "google": ["gemini-2.0-flash", "gemini-2.0-flash-thinking-exp", "gemini-1.5-flash-latest",
               "gemini-1.5-flash-8b-latest", "gemini-2.0-flash-thinking-exp-01-21", "gemini-2.0-pro-exp-02-05"],
    "ollama": ["qwen2.5:7b", "qwen2.5:14b", "qwen2.5:32b", "qwen2.5-coder:14b", "qwen2.5-coder:32b", "llama2:7b",
               "deepseek-r1:14b", "deepseek-r1:32b"],
    "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"],
    "mistral": ["pixtral-large-latest", "mistral-large-latest", "mistral-small-latest", "ministral-8b-latest"],
    "alibaba": ["qwen-plus", "qwen-max", "qwen-turbo", "qwen-long"],
    "moonshot": ["moonshot-v1-32k-vision-preview", "moonshot-v1-8k-vision-preview"],
    "unbound": ["gemini-2.0-flash", "gpt-4o-mini", "gpt-4o", "gpt-4.5-preview"],
    "siliconflow": [
        "deepseek-ai/DeepSeek-R1",
        "deepseek-ai/DeepSeek-V3",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "deepseek-ai/DeepSeek-V2.5",
        "deepseek-ai/deepseek-vl2",
        "Qwen/Qwen2.5-72B-Instruct-128K",
        "Qwen/Qwen2.5-72B-Instruct",
        "Qwen/Qwen2.5-32B-Instruct",
        "Qwen/Qwen2.5-14B-Instruct",
        "Qwen/Qwen2.5-7B-Instruct",
        "Qwen/Qwen2.5-Coder-32B-Instruct",
        "Qwen/Qwen2.5-Coder-7B-Instruct",
        "Qwen/Qwen2-7B-Instruct",
        "Qwen/Qwen2-1.5B-Instruct",
        "Qwen/QwQ-32B-Preview",
        "Qwen/Qwen2-VL-72B-Instruct",
        "Qwen/Qwen2.5-VL-32B-Instruct",
        "Qwen/Qwen2.5-VL-72B-Instruct",
        "TeleAI/TeleChat2",
        "THUDM/glm-4-9b-chat",
        "Vendor-A/Qwen/Qwen2.5-72B-Instruct",
        "internlm/internlm2_5-7b-chat",
        "internlm/internlm2_5-20b-chat",
        "Pro/Qwen/Qwen2.5-7B-Instruct",
        "Pro/Qwen/Qwen2-7B-Instruct",
        "Pro/Qwen/Qwen2-1.5B-Instruct",
        "Pro/THUDM/chatglm3-6b",
        "Pro/THUDM/glm-4-9b-chat",
    ],
    "ibm": ["ibm/granite-vision-3.1-2b-preview", "meta-llama/llama-4-maverick-17b-128e-instruct-fp8","meta-llama/llama-3-2-90b-vision-instruct"]
}
# Callback to update the model name dropdown based on the selected provider
def update_model_dropdown(llm_provider, api_key=None, base_url=None):
    """
    Return a model-name dropdown populated for the selected provider.

    :param llm_provider: Provider identifier selected in the UI.
    :param api_key: Unused; accepted so existing callers keep working.
    :param base_url: Unused; accepted so existing callers keep working.
    :return: A ``gr.Dropdown`` listing the provider's predefined models with
        the first one selected, or an empty dropdown that accepts a custom
        value when the provider has no predefined models.
    """
    import gradio as gr

    # The previous environment lookups for api_key / base_url were never used
    # and raised AttributeError when no provider was selected, so they are gone.
    if llm_provider in model_names:
        return gr.Dropdown(choices=model_names[llm_provider], value=model_names[llm_provider][0], interactive=True)
    else:
        return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True)
class MissingAPIKeyError(Exception):
    """Raised when no API key is available for an LLM provider."""

    def __init__(self, provider: str, env_var: str):
        # Prefer the friendly provider name; fall back to the upper-cased id.
        display_name = PROVIDER_DISPLAY_NAMES.get(provider, provider.upper())
        message = (
            f"💥 {display_name} API key not found! 🔑 Please set the "
            f"`{env_var}` environment variable or provide it in the UI."
        )
        super().__init__(message)
def encode_image(img_path):
if not img_path:
@@ -313,108 +37,3 @@ def get_latest_files(directory: str, file_types: list = ['.webm', '.zip']) -> Di
print(f"Error getting latest {file_type} file: {e}")
return latest_files
async def capture_screenshot(browser_context):
    """
    Take a JPEG screenshot of a page and return it base64-encoded.

    The page is taken from the first Playwright context of the browser behind
    ``browser_context``. The last page whose URL is not ``about:blank`` is
    used, or the first page when every page is blank.

    :return: The screenshot as a base64 string, or ``None`` when there is no
        browser, context or page, or when taking the screenshot fails.
    """
    pw_browser = browser_context.browser.playwright_browser
    if not (pw_browser and pw_browser.contexts):
        return None

    pw_context = pw_browser.contexts[0]
    open_pages = pw_context.pages if pw_context else None
    if not open_pages:
        return None

    # Later non-blank pages override earlier ones.
    target_page = open_pages[0]
    for candidate in open_pages:
        if candidate.url != "about:blank":
            target_page = candidate

    try:
        raw_image = await target_page.screenshot(type='jpeg', quality=75, scale="css")
        return base64.b64encode(raw_image).decode('utf-8')
    except Exception:
        # Best effort: any failure simply means "no screenshot".
        return None
class ConfigManager:
    """Keeps named Gradio components so their values can be saved and restored."""

    def __init__(self):
        self.components = {}
        self.component_order = []

    def register_component(self, name: str, component):
        """Store ``component`` under ``name`` and return it unchanged.

        Registering an existing name replaces the component but keeps its
        original position in the ordering.
        """
        self.components[name] = component
        already_known = name in self.component_order
        if not already_known:
            self.component_order.append(name)
        return component

    def save_current_config(self):
        """Write the ``value`` of every registered component to a config file."""
        snapshot = {
            name: getattr(self.components[name], "value", None)
            for name in self.component_order
        }
        return save_config_to_file(snapshot)

    def update_ui_from_config(self, config_file):
        """Build one update per registered component from a config file.

        The returned list has one entry per component, in registration order,
        followed by a status message string.
        """
        if config_file is None:
            return [gr.update() for _ in self.component_order] + ["No file selected."]

        loaded = load_config_from_file(config_file.name)
        if not isinstance(loaded, dict):
            return [gr.update() for _ in self.component_order] + ["Error: Invalid configuration file."]

        # Components missing from the file get a no-op update.
        updates = [
            gr.update(value=loaded[name]) if name in loaded else gr.update()
            for name in self.component_order
        ]
        updates.append("Configuration loaded successfully.")
        return updates

    def get_all_components(self):
        """Return the registered components in registration order."""
        return list(map(self.components.__getitem__, self.component_order))
def load_config_from_file(config_file):
    """
    Read settings from a JSON config file.

    :param config_file: Path of the JSON file to read.
    :return: The parsed JSON content, or an error message string when the file
        cannot be opened or parsed.
    """
    try:
        with open(config_file, 'r') as handle:
            return json.load(handle)
    except Exception as exc:
        return f"Error loading configuration: {str(exc)}"
def save_config_to_file(settings, save_dir="./tmp/webui_settings"):
    """
    Write ``settings`` as indented JSON to a new ``<uuid4>.json`` file.

    :param settings: JSON-serialisable settings to store.
    :param save_dir: Directory for the file; created when it does not exist.
    :return: A status message containing the path of the written file.
    """
    os.makedirs(save_dir, exist_ok=True)
    file_name = f"{uuid.uuid4()}.json"
    target_path = os.path.join(save_dir, file_name)
    with open(target_path, 'w') as out:
        json.dump(settings, out, indent=2)
    return f"Configuration saved to {target_path}"

0
src/webui/__init__.py Normal file
View File

View File

View File

@@ -0,0 +1,269 @@
import json
import os
import gradio as gr
from gradio.components import Component
from typing import Any, Dict, Optional
from src.webui.webui_manager import WebuiManager
from src.utils import config
import logging
from functools import partial
logger = logging.getLogger(__name__)
def update_model_dropdown(llm_provider):
    """
    Return a model-name dropdown populated for the selected provider.

    Providers without predefined models get an empty dropdown that accepts a
    custom value; otherwise the provider's first model is preselected.
    """
    if llm_provider not in config.model_names:
        return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True)

    provider_models = config.model_names[llm_provider]
    return gr.Dropdown(choices=provider_models, value=provider_models[0],
                       interactive=True)
async def update_mcp_server(mcp_file: str, webui_manager: WebuiManager):
    """
    React to a change of the MCP server JSON file.

    Any existing controller on ``webui_manager`` has its MCP client closed and
    is dropped, because it was built for the previous MCP configuration.

    :param mcp_file: Path of the selected MCP JSON file (may be empty/None).
    :param webui_manager: Manager holding the current ``bu_controller``.
    :return: ``(config_text, visibility_update)``. ``config_text`` is the
        pretty-printed JSON, or ``None`` when the file is missing, is not a
        ``.json`` file, or cannot be read or parsed. In that case the
        visibility update hides the config textbox.
    """
    if hasattr(webui_manager, "bu_controller") and webui_manager.bu_controller:
        logger.warning("⚠️ Close controller because mcp file has changed!")
        await webui_manager.bu_controller.close_mcp_client()
        webui_manager.bu_controller = None

    if not mcp_file or not os.path.exists(mcp_file) or not mcp_file.endswith('.json'):
        logger.warning(f"{mcp_file} is not a valid MCP file.")
        return None, gr.update(visible=False)

    try:
        with open(mcp_file, 'r', encoding="utf-8") as f:
            mcp_server = json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses;
        # a malformed upload should hide the textbox, not crash the callback.
        logger.warning(f"Failed to load MCP file {mcp_file}: {e}")
        return None, gr.update(visible=False)

    return json.dumps(mcp_server, indent=2), gr.update(visible=True)
def create_agent_settings_tab(webui_manager: WebuiManager):
    """
    Build the agent settings tab and register its components.

    Creates the system-prompt, MCP, main-LLM, planner-LLM and run-limit
    controls, registers them on ``webui_manager`` under the ``agent_settings``
    tab name, and wires the change handlers between them.

    :param webui_manager: Manager that receives the created components and
        whose controller is reset when the MCP file changes.
    """
    input_components = set(webui_manager.get_components())
    tab_components = {}
    # System prompt customisation
    with gr.Group():
        with gr.Column():
            override_system_prompt = gr.Textbox(label="Override system prompt", lines=4, interactive=True)
            extend_system_prompt = gr.Textbox(label="Extend system prompt", lines=4, interactive=True)
    # MCP server configuration; the textbox starts hidden (visible=False)
    with gr.Group():
        mcp_json_file = gr.File(label="MCP server json", interactive=True, file_types=[".json"])
        mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False)
    # Main LLM settings
    with gr.Group():
        with gr.Row():
            llm_provider = gr.Dropdown(
                choices=[provider for provider, model in config.model_names.items()],
                label="LLM Provider",
                value="openai",
                info="Select LLM provider for LLM",
                interactive=True
            )
            llm_model_name = gr.Dropdown(
                label="LLM Model Name",
                choices=config.model_names['openai'],
                value="gpt-4o",
                interactive=True,
                allow_custom_value=True,
                info="Select a model in the dropdown options or directly type a custom model name"
            )
        with gr.Row():
            llm_temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.6,
                step=0.1,
                label="LLM Temperature",
                info="Controls randomness in model outputs",
                interactive=True
            )
            use_vision = gr.Checkbox(
                label="Use Vision",
                value=True,
                info="Enable Vision(Input highlighted screenshot into LLM)",
                interactive=True
            )
            # Hidden until the provider is "ollama" (see the change handler below)
            ollama_num_ctx = gr.Slider(
                minimum=2 ** 8,
                maximum=2 ** 16,
                value=16000,
                step=1,
                label="Ollama Context Length",
                info="Controls max context length model needs to handle (less = faster)",
                visible=False,
                interactive=True
            )
        with gr.Row():
            llm_base_url = gr.Textbox(
                label="Base URL",
                value="",
                info="API endpoint URL (if required)"
            )
            llm_api_key = gr.Textbox(
                label="API Key",
                type="password",
                value="",
                info="Your API key (leave blank to use .env)"
            )
    # Planner LLM settings; no provider is selected by default
    with gr.Group():
        with gr.Row():
            planner_llm_provider = gr.Dropdown(
                choices=[provider for provider, model in config.model_names.items()],
                label="Planner LLM Provider",
                info="Select LLM provider for LLM",
                value=None,
                interactive=True
            )
            planner_llm_model_name = gr.Dropdown(
                label="Planner LLM Model Name",
                interactive=True,
                allow_custom_value=True,
                info="Select a model in the dropdown options or directly type a custom model name"
            )
        with gr.Row():
            planner_llm_temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.6,
                step=0.1,
                label="Planner LLM Temperature",
                info="Controls randomness in model outputs",
                interactive=True
            )
            planner_use_vision = gr.Checkbox(
                label="Use Vision(Planner LLM)",
                value=False,
                info="Enable Vision(Input highlighted screenshot into LLM)",
                interactive=True
            )
            # Hidden until the planner provider is "ollama"
            planner_ollama_num_ctx = gr.Slider(
                minimum=2 ** 8,
                maximum=2 ** 16,
                value=16000,
                step=1,
                label="Ollama Context Length",
                info="Controls max context length model needs to handle (less = faster)",
                visible=False,
                interactive=True
            )
        with gr.Row():
            planner_llm_base_url = gr.Textbox(
                label="Base URL",
                value="",
                info="API endpoint URL (if required)"
            )
            planner_llm_api_key = gr.Textbox(
                label="API Key",
                type="password",
                value="",
                info="Your API key (leave blank to use .env)"
            )
    # Run limits
    with gr.Row():
        max_steps = gr.Slider(
            minimum=1,
            maximum=1000,
            value=100,
            step=1,
            label="Max Run Steps",
            info="Maximum number of steps the agent will take",
            interactive=True
        )
        max_actions = gr.Slider(
            minimum=1,
            maximum=100,
            value=10,
            step=1,
            label="Max Number of Actions",
            info="Maximum number of actions the agent will take per step",
            interactive=True
        )
    with gr.Row():
        max_input_tokens = gr.Number(
            label="Max Input Tokens",
            value=128000,
            precision=0,
            interactive=True
        )
        tool_calling_method = gr.Dropdown(
            label="Tool Calling Method",
            value="auto",
            interactive=True,
            allow_custom_value=True,
            choices=["auto", "json_schema", "function_calling", "None"],
            visible=True
        )
    # Register every component under its name so other tabs can look it up
    tab_components.update(dict(
        override_system_prompt=override_system_prompt,
        extend_system_prompt=extend_system_prompt,
        llm_provider=llm_provider,
        llm_model_name=llm_model_name,
        llm_temperature=llm_temperature,
        use_vision=use_vision,
        ollama_num_ctx=ollama_num_ctx,
        llm_base_url=llm_base_url,
        llm_api_key=llm_api_key,
        planner_llm_provider=planner_llm_provider,
        planner_llm_model_name=planner_llm_model_name,
        planner_llm_temperature=planner_llm_temperature,
        planner_use_vision=planner_use_vision,
        planner_ollama_num_ctx=planner_ollama_num_ctx,
        planner_llm_base_url=planner_llm_base_url,
        planner_llm_api_key=planner_llm_api_key,
        max_steps=max_steps,
        max_actions=max_actions,
        max_input_tokens=max_input_tokens,
        tool_calling_method=tool_calling_method,
        mcp_json_file=mcp_json_file,
        mcp_server_config=mcp_server_config,
    ))
    webui_manager.add_components("agent_settings", tab_components)
    # Show the Ollama context-length slider only for the "ollama" provider
    llm_provider.change(
        fn=lambda x: gr.update(visible=x == "ollama"),
        inputs=llm_provider,
        outputs=ollama_num_ctx
    )
    # Refresh the model list whenever the provider changes
    llm_provider.change(
        lambda provider: update_model_dropdown(provider),
        inputs=[llm_provider],
        outputs=[llm_model_name]
    )
    planner_llm_provider.change(
        fn=lambda x: gr.update(visible=x == "ollama"),
        inputs=[planner_llm_provider],
        outputs=[planner_ollama_num_ctx]
    )
    planner_llm_provider.change(
        lambda provider: update_model_dropdown(provider),
        inputs=[planner_llm_provider],
        outputs=[planner_llm_model_name]
    )

    async def update_wrapper(mcp_file):
        """Pass the changed MCP file to update_mcp_server and yield its result."""
        update_dict = await update_mcp_server(mcp_file, webui_manager)
        yield update_dict

    # The textbox is listed twice: update_mcp_server returns a pair, first the
    # new text and then the visibility update, both targeting mcp_server_config.
    mcp_json_file.change(
        update_wrapper,
        inputs=[mcp_json_file],
        outputs=[mcp_server_config, mcp_server_config]
    )

View File

@@ -0,0 +1,158 @@
import gradio as gr
import logging
from gradio.components import Component
from src.webui.webui_manager import WebuiManager
from src.utils import config
logger = logging.getLogger(__name__)
async def close_browser(webui_manager: WebuiManager):
    """
    Cancel the running agent task and close the browser context and browser.

    Each resource that is present on ``webui_manager`` is closed and its
    attribute reset to ``None``; the context is closed before the browser.
    """
    running_task = webui_manager.bu_current_task
    if running_task and not running_task.done():
        running_task.cancel()
        webui_manager.bu_current_task = None

    # (attribute name, log message) in the order they must be closed.
    closable = (
        ("bu_browser_context", "⚠️ Closing browser context when changing browser config."),
        ("bu_browser", "⚠️ Closing browser when changing browser config."),
    )
    for attr_name, log_message in closable:
        resource = getattr(webui_manager, attr_name)
        if not resource:
            continue
        logger.info(log_message)
        await resource.close()
        setattr(webui_manager, attr_name, None)
def create_browser_settings_tab(webui_manager: WebuiManager):
    """
    Build the browser settings tab and register its components.

    Creates the browser path, mode, window size, remote-debugging and
    save-path controls, registers them on ``webui_manager`` under the
    ``browser_settings`` tab name, and closes the current browser whenever one
    of the mode checkboxes changes.

    :param webui_manager: Manager that receives the created components and
        holds the browser state to close.
    """
    input_components = set(webui_manager.get_components())
    tab_components = {}
    # Browser binary and profile location
    with gr.Group():
        with gr.Row():
            browser_binary_path = gr.Textbox(
                label="Browser Binary Path",
                lines=1,
                interactive=True,
                placeholder="e.g. '/Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome'"
            )
            browser_user_data_dir = gr.Textbox(
                label="Browser User Data Dir",
                lines=1,
                interactive=True,
                placeholder="Leave it empty if you use your default user data",
            )
    # Browser mode switches
    with gr.Group():
        with gr.Row():
            use_own_browser = gr.Checkbox(
                label="Use Own Browser",
                value=False,
                info="Use your existing browser instance",
                interactive=True
            )
            keep_browser_open = gr.Checkbox(
                label="Keep Browser Open",
                value=True,
                info="Keep Browser Open between Tasks",
                interactive=True
            )
            headless = gr.Checkbox(
                label="Headless Mode",
                value=False,
                info="Run browser without GUI",
                interactive=True
            )
            disable_security = gr.Checkbox(
                label="Disable Security",
                value=False,
                info="Disable browser security",
                interactive=True
            )
    # Window size
    with gr.Group():
        with gr.Row():
            window_w = gr.Number(
                label="Window Width",
                value=1280,
                info="Browser window width",
                interactive=True
            )
            window_h = gr.Number(
                label="Window Height",
                value=1100,
                info="Browser window height",
                interactive=True
            )
    # Remote debugging endpoints
    with gr.Group():
        with gr.Row():
            cdp_url = gr.Textbox(
                label="CDP URL",
                info="CDP URL for browser remote debugging",
                interactive=True,
            )
            wss_url = gr.Textbox(
                label="WSS URL",
                info="WSS URL for browser remote debugging",
                interactive=True,
            )
    # Output locations
    with gr.Group():
        with gr.Row():
            save_recording_path = gr.Textbox(
                label="Recording Path",
                placeholder="e.g. ./tmp/record_videos",
                info="Path to save browser recordings",
                interactive=True,
            )
            save_trace_path = gr.Textbox(
                label="Trace Path",
                placeholder="e.g. ./tmp/traces",
                info="Path to save Agent traces",
                interactive=True,
            )
        with gr.Row():
            save_agent_history_path = gr.Textbox(
                label="Agent History Save Path",
                value="./tmp/agent_history",
                info="Specify the directory where agent history should be saved.",
                interactive=True,
            )
            save_download_path = gr.Textbox(
                label="Save Directory for browser downloads",
                value="./tmp/downloads",
                info="Specify the directory where downloaded files should be saved.",
                interactive=True,
            )
    # Register every component under its name so other tabs can look it up
    tab_components.update(
        dict(
            browser_binary_path=browser_binary_path,
            browser_user_data_dir=browser_user_data_dir,
            use_own_browser=use_own_browser,
            keep_browser_open=keep_browser_open,
            headless=headless,
            disable_security=disable_security,
            save_recording_path=save_recording_path,
            save_trace_path=save_trace_path,
            save_agent_history_path=save_agent_history_path,
            save_download_path=save_download_path,
            cdp_url=cdp_url,
            wss_url=wss_url,
            window_h=window_h,
            window_w=window_w,
        )
    )
    webui_manager.add_components("browser_settings", tab_components)

    async def close_wrapper():
        """Close the current browser via close_browser using this tab's manager."""
        await close_browser(webui_manager)

    # Changing any of these switches closes the running browser
    headless.change(close_wrapper)
    keep_browser_open.change(close_wrapper)
    disable_security.change(close_wrapper)
    use_own_browser.change(close_wrapper)

View File

@@ -0,0 +1,881 @@
import pdb
import gradio as gr
from gradio.components import Component
import asyncio
import os
import json
import uuid
import logging
from datetime import datetime
from typing import List, Dict, Optional, Any, Set, Generator, AsyncGenerator, Union
from collections.abc import Awaitable
from langchain_core.language_models.chat_models import BaseChatModel
import base64
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext, BrowserContextConfig, BrowserContextWindowSize
# from browser_use.agent.service import Agent
from browser_use.agent.views import AgentHistoryList
from browser_use.agent.views import ToolCallingMethod # Adjust import
from browser_use.agent.views import (
REQUIRED_LLM_API_ENV_VARS,
ActionResult,
AgentError,
AgentHistory,
AgentHistoryList,
AgentOutput,
AgentSettings,
AgentState,
AgentStepInfo,
StepMetadata,
ToolCallingMethod,
)
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext
from browser_use.browser.views import BrowserState, BrowserStateHistory
from src.webui.webui_manager import WebuiManager
from src.controller.custom_controller import CustomController
from src.utils import llm_provider
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import CustomBrowserContext, CustomBrowserContextConfig
from src.agent.browser_use.browser_use_agent import BrowserUseAgent
logger = logging.getLogger(__name__)
# --- Helper Functions --- (Defined at module level)
async def _initialize_llm(provider: Optional[str], model_name: Optional[str], temperature: float,
                          base_url: Optional[str], api_key: Optional[str], num_ctx: Optional[int] = None) -> Optional[
    BaseChatModel]:
    """
    Create the chat model described by the UI settings.

    :param provider: LLM provider identifier; ``None``/empty disables the LLM.
    :param model_name: Model name; ``None``/empty disables the LLM.
    :param temperature: Sampling temperature passed to the model.
    :param base_url: Optional endpoint override; empty values are passed as ``None``.
    :param api_key: Optional API key; empty values are passed as ``None``.
    :param num_ctx: Context length, forwarded only for the "ollama" provider.
    :return: The initialised model, or ``None`` when provider/model is missing
        or initialisation fails (a UI warning is shown in the failure case).
    """
    if not provider or not model_name:
        logger.info("LLM Provider or Model Name not specified, LLM will be None.")
        return None

    llm_kwargs = dict(
        provider=provider,
        model_name=model_name,
        temperature=temperature,
        base_url=base_url or None,
        api_key=api_key or None,
    )
    # Only forward num_ctx when it applies. Passing num_ctx=None would defeat
    # any `kwargs.get("num_ctx", default)` fallback inside the factory.
    if provider == "ollama" and num_ctx is not None:
        llm_kwargs["num_ctx"] = num_ctx

    try:
        logger.info(f"Initializing LLM: Provider={provider}, Model={model_name}, Temp={temperature}")
        return llm_provider.get_llm_model(**llm_kwargs)
    except Exception as e:
        logger.error(f"Failed to initialize LLM: {e}", exc_info=True)
        gr.Warning(
            f"Failed to initialize LLM '{model_name}' for provider '{provider}'. Please check settings. Error: {e}")
        return None
def _get_config_value(webui_manager: WebuiManager, comp_dict: Dict[gr.components.Component, Any], comp_id_suffix: str,
default: Any = None) -> Any:
"""Safely get value from component dictionary using its ID suffix relative to the tab."""
# Assumes component ID format is "tab_name.comp_name"
tab_name = "browser_use_agent" # Hardcode or derive if needed
comp_id = f"{tab_name}.{comp_id_suffix}"
# Need to find the component object first using the ID from the manager
try:
comp = webui_manager.get_component_by_id(comp_id)
return comp_dict.get(comp, default)
except KeyError:
# Try accessing settings tabs as well
for prefix in ["agent_settings", "browser_settings"]:
try:
comp_id = f"{prefix}.{comp_id_suffix}"
comp = webui_manager.get_component_by_id(comp_id)
return comp_dict.get(comp, default)
except KeyError:
continue
logger.warning(f"Component with suffix '{comp_id_suffix}' not found in manager for value lookup.")
return default
def _format_agent_output(model_output: AgentOutput) -> str:
"""Formats AgentOutput for display in the chatbot using JSON."""
content = ""
if model_output:
try:
# Directly use model_dump if actions and current_state are Pydantic models
action_dump = [action.model_dump(exclude_none=True) for action in model_output.action]
state_dump = model_output.current_state.model_dump(exclude_none=True)
model_output_dump = {
'current_state': state_dump,
'action': action_dump,
}
# Dump to JSON string with indentation
json_string = json.dumps(model_output_dump, indent=4, ensure_ascii=False)
# Wrap in <pre><code> for proper display in HTML
content = f"<pre><code class='language-json'>{json_string}</code></pre>"
except AttributeError as ae:
logger.error(
f"AttributeError during model dump: {ae}. Check if 'action' or 'current_state' or their items support 'model_dump'.")
content = f"<pre><code>Error: Could not format agent output (AttributeError: {ae}).\nRaw output: {str(model_output)}</code></pre>"
except Exception as e:
logger.error(f"Error formatting agent output: {e}", exc_info=True)
# Fallback to simple string representation on error
content = f"<pre><code>Error formatting agent output.\nRaw output:\n{str(model_output)}</code></pre>"
return content.strip()
# --- Updated Callback Implementation ---
async def _handle_new_step(webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int):
    """
    Per-step agent callback: append the step's screenshot and output to the chat.

    The displayed step number is ``step_num - 1``. The chat entry contains a
    step header, the screenshot as an inline image (or a placeholder when the
    data looks invalid) and the formatted agent output.
    """
    if not hasattr(webui_manager, 'bu_chat_history'):
        logger.error("Attribute 'bu_chat_history' not found in webui_manager! Cannot add chat message.")
        # Create the history so the message below can still be recorded.
        webui_manager.bu_chat_history = []

    shown_step = step_num - 1
    logger.info(f"Step {shown_step} completed.")

    # --- Screenshot ---
    image_html = ""
    shot = getattr(state, 'screenshot', None)
    if not shot:
        logger.debug(f"No screenshot available for step {shown_step}.")
    else:
        try:
            # Rough sanity check: a base64 image is a reasonably long string.
            plausible = isinstance(shot, str) and len(shot) > 100
            if plausible:
                image_html = (
                    f'<img src="data:image/jpeg;base64,{shot}" alt="Step {shown_step} Screenshot" style="max-width: 800px; max-height: 600px; object-fit:contain;" />'
                    + "<br/>"
                )
            else:
                logger.warning(
                    f"Screenshot for step {shown_step} seems invalid (type: {type(shot)}, len: {len(shot) if isinstance(shot, str) else 'N/A'}).")
                image_html = "**[Invalid screenshot data]**<br/>"
        except Exception as e:
            logger.error(f"Error processing or formatting screenshot for step {shown_step}: {e}", exc_info=True)
            image_html = "**[Error displaying screenshot]**<br/>"

    # --- Agent output ---
    output_html = _format_agent_output(output)

    # --- Chat entry: header, image, JSON block ---
    pieces = [f"--- **Step {shown_step}** ---", "<br/>", image_html, output_html]
    webui_manager.bu_chat_history.append({
        "role": "assistant",
        "content": "".join(pieces).strip()
    })
    await asyncio.sleep(0.05)
def _handle_done(webui_manager: WebuiManager, history: AgentHistoryList):
    """
    Completion callback: log the run and append a summary to the chat.

    The summary lists duration, input tokens, the final result when there is
    one, and either the reported errors or a success status.
    """
    logger.info(
        f"Agent task finished. Duration: {history.total_duration_seconds():.2f}s, Tokens: {history.total_input_tokens()}")

    summary_parts = [
        "**Task Completed**\n",
        f"- Duration: {history.total_duration_seconds():.2f} seconds\n",
        f"- Total Input Tokens: {history.total_input_tokens()}\n",
    ]

    outcome = history.final_result()
    if outcome:
        summary_parts.append(f"- Final Result: {outcome}\n")

    reported = history.errors()
    # Only count it as a failure when at least one entry is truthy.
    if reported and any(reported):
        summary_parts.append(f"- **Errors:**\n```\n{reported}\n```\n")
    else:
        summary_parts.append("- Status: Success\n")

    webui_manager.bu_chat_history.append({"role": "assistant", "content": "".join(summary_parts)})
async def _ask_assistant_callback(webui_manager: WebuiManager, query: str, browser_context: BrowserContext) -> Dict[
    str, Any]:
    """
    Callback for the agent's ask_for_assistant action: ask the user and wait.

    Posts the help request to the chat, then waits (up to one hour) on
    ``webui_manager.bu_response_event`` for the user's answer, which is read
    from ``webui_manager.bu_user_help_response``.

    :param webui_manager: Holds the chat history and the response event/state.
    :param query: The agent's question for the user.
    :param browser_context: Not used by this callback.
    :return: ``{"response": <text>}`` with the user's answer, a timeout notice,
        or an internal-error notice when the chat history is unavailable.
    """
    logger.info("Agent requires assistance. Waiting for user input.")
    # The chat history attribute is `bu_chat_history`. Checking the stale name
    # `_chat_history` made this callback bail out before asking the user.
    if not hasattr(webui_manager, 'bu_chat_history'):
        logger.error("Chat history not found in webui_manager during ask_assistant!")
        return {"response": "Internal Error: Cannot display help request."}
    webui_manager.bu_chat_history.append({"role": "assistant",
                                          "content": f"**Need Help:** {query}\nPlease provide information or perform the required action in the browser, then type your response/confirmation below and click 'Submit Response'."})
    # A fresh event per request; the previous answer is discarded.
    webui_manager.bu_response_event = asyncio.Event()
    webui_manager.bu_user_help_response = None
    try:
        logger.info("Waiting for user response event...")
        await asyncio.wait_for(webui_manager.bu_response_event.wait(), timeout=3600.0)  # Long timeout
        logger.info("User response event received.")
    except asyncio.TimeoutError:
        logger.warning("Timeout waiting for user assistance.")
        webui_manager.bu_chat_history.append(
            {"role": "assistant", "content": "**Timeout:** No response received. Trying to proceed."})
        webui_manager.bu_response_event = None  # Clear the event
        return {"response": "Timeout: User did not respond."}  # Inform the agent
    response = webui_manager.bu_user_help_response
    webui_manager.bu_chat_history.append({"role": "user", "content": response})  # Show user response in chat
    webui_manager.bu_response_event = None  # Clear the event for the next potential request
    return {"response": response}
# --- Core Agent Execution Logic --- (Needs access to webui_manager)
async def run_agent_task(webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]) -> AsyncGenerator[
    Dict[gr.components.Component, Any], None]:
    """Handles the entire lifecycle of initializing and running the agent.

    Reads the task and the agent/browser settings from ``components``, creates
    or reuses the controller, browser, browser context and agent stored on
    ``webui_manager``, starts ``agent.run`` as an asyncio task and polls it,
    yielding component-update dicts while it runs (pause/resume, stop, help
    requests, new chat messages, headless screenshots). Once the run ends the
    history JSON is saved, the GIF is surfaced if it exists, the browser is
    closed unless it should be kept open, and the controls are re-enabled.

    Any exception raised after the initial "running" UI update - including
    while parsing settings such as the MCP server JSON - is reported in the
    chat as a setup error and the controls are re-enabled, so the UI is never
    left locked.

    Args:
        webui_manager: Holder of the shared agent/browser state and the
            component registry.
        components: Mapping from Gradio component to its current value.

    Yields:
        Dicts mapping Gradio components to their updates.
    """
    # --- Get Components ---
    # Need handles to specific UI components to update them
    user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input")
    run_button_comp = webui_manager.get_component_by_id("browser_use_agent.run_button")
    stop_button_comp = webui_manager.get_component_by_id("browser_use_agent.stop_button")
    pause_resume_button_comp = webui_manager.get_component_by_id("browser_use_agent.pause_resume_button")
    clear_button_comp = webui_manager.get_component_by_id("browser_use_agent.clear_button")
    chatbot_comp = webui_manager.get_component_by_id("browser_use_agent.chatbot")
    history_file_comp = webui_manager.get_component_by_id("browser_use_agent.agent_history_file")
    gif_comp = webui_manager.get_component_by_id("browser_use_agent.recording_gif")
    browser_view_comp = webui_manager.get_component_by_id("browser_use_agent.browser_view")

    # --- 1. Get Task and Initial UI Update ---
    task = components.get(user_input_comp, "").strip()
    if not task:
        gr.Warning("Please enter a task.")
        yield {run_button_comp: gr.update(interactive=True)}
        return

    # Set running state indirectly via _current_task
    webui_manager.bu_chat_history.append({"role": "user", "content": task})

    yield {
        user_input_comp: gr.Textbox(value="", interactive=False, placeholder="Agent is running..."),
        run_button_comp: gr.Button(value="⏳ Running...", interactive=False),
        stop_button_comp: gr.Button(interactive=True),
        pause_resume_button_comp: gr.Button(value="⏸️ Pause", interactive=True),
        clear_button_comp: gr.Button(interactive=False),
        chatbot_comp: gr.update(value=webui_manager.bu_chat_history),
        history_file_comp: gr.update(value=None),
        gif_comp: gr.update(value=None),
    }

    # Everything after the UI has been switched to "running" is guarded, so
    # that a failure while reading settings (e.g. malformed MCP JSON) or while
    # setting up the controller re-enables the controls instead of leaving
    # the tab locked.
    try:
        # --- Agent Settings ---
        # Access settings values via components dict, getting IDs from webui_manager
        def get_setting(key, default=None):
            comp = webui_manager.id_to_component.get(f"agent_settings.{key}")
            return components.get(comp, default) if comp else default

        override_system_prompt = get_setting("override_system_prompt") or None
        extend_system_prompt = get_setting("extend_system_prompt") or None
        llm_provider_name = get_setting("llm_provider", None)  # Default to None if not found
        llm_model_name = get_setting("llm_model_name", None)
        llm_temperature = get_setting("llm_temperature", 0.6)
        use_vision = get_setting("use_vision", True)
        ollama_num_ctx = get_setting("ollama_num_ctx", 16000)
        llm_base_url = get_setting("llm_base_url") or None
        llm_api_key = get_setting("llm_api_key") or None
        max_steps = get_setting("max_steps", 100)
        max_actions = get_setting("max_actions", 10)
        max_input_tokens = get_setting("max_input_tokens", 128000)
        tool_calling_str = get_setting("tool_calling_method", "auto")
        tool_calling_method = tool_calling_str if tool_calling_str != "None" else None
        mcp_server_config_comp = webui_manager.id_to_component.get("agent_settings.mcp_server_config")
        mcp_server_config_str = components.get(mcp_server_config_comp) if mcp_server_config_comp else None
        mcp_server_config = json.loads(mcp_server_config_str) if mcp_server_config_str else None

        # Planner LLM Settings (Optional)
        planner_llm_provider_name = get_setting("planner_llm_provider") or None
        planner_llm = None
        if planner_llm_provider_name:
            planner_llm_model_name = get_setting("planner_llm_model_name")
            planner_llm_temperature = get_setting("planner_llm_temperature", 0.6)
            planner_ollama_num_ctx = get_setting("planner_ollama_num_ctx", 16000)
            planner_llm_base_url = get_setting("planner_llm_base_url") or None
            planner_llm_api_key = get_setting("planner_llm_api_key") or None
            planner_use_vision = get_setting("planner_use_vision", False)

            planner_llm = await _initialize_llm(
                planner_llm_provider_name, planner_llm_model_name, planner_llm_temperature,
                planner_llm_base_url, planner_llm_api_key,
                planner_ollama_num_ctx if planner_llm_provider_name == "ollama" else None
            )

        # --- Browser Settings ---
        def get_browser_setting(key, default=None):
            comp = webui_manager.id_to_component.get(f"browser_settings.{key}")
            return components.get(comp, default) if comp else default

        browser_binary_path = get_browser_setting("browser_binary_path") or None
        browser_user_data_dir = get_browser_setting("browser_user_data_dir") or None
        use_own_browser = get_browser_setting("use_own_browser", False)  # Logic handled by CDP/WSS presence
        keep_browser_open = get_browser_setting("keep_browser_open", False)
        headless = get_browser_setting("headless", False)
        disable_security = get_browser_setting("disable_security", True)
        window_w = int(get_browser_setting("window_w", 1280))
        window_h = int(get_browser_setting("window_h", 1100))
        cdp_url = get_browser_setting("cdp_url") or None
        wss_url = get_browser_setting("wss_url") or None
        save_recording_path = get_browser_setting("save_recording_path") or None
        save_trace_path = get_browser_setting("save_trace_path") or None
        save_agent_history_path = get_browser_setting("save_agent_history_path", "./tmp/agent_history")
        save_download_path = get_browser_setting("save_download_path", "./tmp/downloads")

        # Size of the streamed screenshot, keeping the window's aspect ratio
        stream_vw = 70
        stream_vh = int(70 * window_h // window_w)

        os.makedirs(save_agent_history_path, exist_ok=True)
        if save_recording_path:
            os.makedirs(save_recording_path, exist_ok=True)
        if save_trace_path:
            os.makedirs(save_trace_path, exist_ok=True)
        if save_download_path:
            os.makedirs(save_download_path, exist_ok=True)

        # --- 2. Initialize LLM ---
        main_llm = await _initialize_llm(
            llm_provider_name, llm_model_name, llm_temperature, llm_base_url, llm_api_key,
            ollama_num_ctx if llm_provider_name == "ollama" else None
        )

        # Pass the webui_manager instance to the callback when wrapping it
        async def ask_callback_wrapper(query: str, browser_context: BrowserContext) -> Dict[str, Any]:
            return await _ask_assistant_callback(webui_manager, query, browser_context)

        if not webui_manager.bu_controller:
            webui_manager.bu_controller = CustomController(ask_assistant_callback=ask_callback_wrapper)
            await webui_manager.bu_controller.setup_mcp_client(mcp_server_config)

        # --- 4. Initialize Browser and Context ---
        should_close_browser_on_finish = not keep_browser_open

        # Close existing resources if not keeping open
        if not keep_browser_open:
            if webui_manager.bu_browser_context:
                logger.info("Closing previous browser context.")
                await webui_manager.bu_browser_context.close()
                webui_manager.bu_browser_context = None
            if webui_manager.bu_browser:
                logger.info("Closing previous browser.")
                await webui_manager.bu_browser.close()
                webui_manager.bu_browser = None

        # Create Browser if needed
        if not webui_manager.bu_browser:
            logger.info("Launching new browser instance.")
            extra_args = [f"--window-size={window_w},{window_h}"]
            if browser_user_data_dir:
                extra_args.append(f"--user-data-dir={browser_user_data_dir}")

            if use_own_browser:
                browser_binary_path = os.getenv("CHROME_PATH", None) or browser_binary_path
                if browser_binary_path == "":
                    browser_binary_path = None
                chrome_user_data = os.getenv("CHROME_USER_DATA", None)
                if chrome_user_data:
                    extra_args += [f"--user-data-dir={chrome_user_data}"]
            else:
                browser_binary_path = None

            webui_manager.bu_browser = CustomBrowser(
                config=BrowserConfig(
                    headless=headless,
                    disable_security=disable_security,
                    browser_binary_path=browser_binary_path,
                    extra_browser_args=extra_args,
                    wss_url=wss_url,
                    cdp_url=cdp_url,
                )
            )

        # Create Context if needed
        if not webui_manager.bu_browser_context:
            logger.info("Creating new browser context.")
            context_config = CustomBrowserContextConfig(
                trace_path=save_trace_path if save_trace_path else None,
                save_recording_path=save_recording_path if save_recording_path else None,
                save_downloads_path=save_download_path if save_download_path else None,
                browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h)
            )
            if not webui_manager.bu_browser:
                raise ValueError("Browser not initialized, cannot create context.")
            webui_manager.bu_browser_context = await webui_manager.bu_browser.new_context(config=context_config)

        # --- 5. Initialize or Update Agent ---
        webui_manager.bu_agent_task_id = str(uuid.uuid4())  # New ID for this task run
        os.makedirs(os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id), exist_ok=True)
        history_file = os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id,
                                    f"{webui_manager.bu_agent_task_id}.json")
        gif_path = os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id,
                                f"{webui_manager.bu_agent_task_id}.gif")

        # Pass the webui_manager to callbacks when wrapping them
        async def step_callback_wrapper(state: BrowserState, output: AgentOutput, step_num: int):
            await _handle_new_step(webui_manager, state, output, step_num)

        def done_callback_wrapper(history: AgentHistoryList):
            _handle_done(webui_manager, history)

        if not webui_manager.bu_agent:
            logger.info(f"Initializing new agent for task: {task}")
            if not webui_manager.bu_browser or not webui_manager.bu_browser_context:
                raise ValueError("Browser or Context not initialized, cannot create agent.")
            webui_manager.bu_agent = BrowserUseAgent(
                task=task,
                llm=main_llm,
                browser=webui_manager.bu_browser,
                browser_context=webui_manager.bu_browser_context,
                controller=webui_manager.bu_controller,
                register_new_step_callback=step_callback_wrapper,
                register_done_callback=done_callback_wrapper,
                use_vision=use_vision,
                override_system_message=override_system_prompt,
                extend_system_message=extend_system_prompt,
                max_input_tokens=max_input_tokens,
                max_actions_per_step=max_actions,
                tool_calling_method=tool_calling_method,
                planner_llm=planner_llm,
                # planner_use_vision only exists when a planner provider was configured
                use_vision_for_planner=planner_use_vision if planner_llm else False
            )
            webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
            webui_manager.bu_agent.settings.generate_gif = gif_path
        else:
            # Reuse the existing agent: give it the new task and current resources
            webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
            webui_manager.bu_agent.add_new_task(task)
            webui_manager.bu_agent.settings.generate_gif = gif_path
            webui_manager.bu_agent.browser = webui_manager.bu_browser
            webui_manager.bu_agent.browser_context = webui_manager.bu_browser_context
            webui_manager.bu_agent.controller = webui_manager.bu_controller

        # --- 6. Run Agent Task and Stream Updates ---
        agent_run_coro = webui_manager.bu_agent.run(max_steps=max_steps)
        agent_task = asyncio.create_task(agent_run_coro)
        webui_manager.bu_current_task = agent_task  # Store the task

        last_chat_len = len(webui_manager.bu_chat_history)
        while not agent_task.done():
            is_paused = webui_manager.bu_agent.state.paused
            is_stopped = webui_manager.bu_agent.state.stopped

            # Check for pause state
            if is_paused:
                yield {
                    pause_resume_button_comp: gr.update(value="▶️ Resume", interactive=True),
                    stop_button_comp: gr.update(interactive=True),
                }
                # Wait until pause is released or task is stopped/done
                while is_paused and not agent_task.done():
                    # Re-check agent state in loop
                    is_paused = webui_manager.bu_agent.state.paused
                    is_stopped = webui_manager.bu_agent.state.stopped
                    if is_stopped:  # Stop signal received while paused
                        break
                    await asyncio.sleep(0.2)

                if agent_task.done() or is_stopped:  # If stopped or task finished while paused
                    break

                # If resumed, yield UI update
                yield {
                    pause_resume_button_comp: gr.update(value="⏸️ Pause", interactive=True),
                    run_button_comp: gr.update(value="⏳ Running...", interactive=False),
                }

            # Check if agent stopped itself or stop button was pressed (which sets agent.state.stopped)
            if is_stopped:
                logger.info("Agent has stopped (internally or via stop button).")
                if not agent_task.done():
                    # Ensure the task coroutine finishes if agent just set flag
                    try:
                        await asyncio.wait_for(agent_task, timeout=1.0)  # Give it a moment to exit run()
                    except asyncio.TimeoutError:
                        logger.warning("Agent task did not finish quickly after stop signal, cancelling.")
                        agent_task.cancel()
                    except Exception:
                        # Not lost: the finished task's exception is re-raised
                        # and reported during finalization below.
                        logger.debug("Agent task raised while stopping.", exc_info=True)
                break  # Exit the streaming loop

            # Check if agent is asking for help (via response_event)
            update_dict = {}
            if webui_manager.bu_response_event is not None:
                update_dict = {
                    user_input_comp: gr.update(placeholder="Agent needs help. Enter response and submit.",
                                               interactive=True),
                    run_button_comp: gr.update(value="✔️ Submit Response", interactive=True),
                    pause_resume_button_comp: gr.update(interactive=False),
                    stop_button_comp: gr.update(interactive=False),
                    chatbot_comp: gr.update(value=webui_manager.bu_chat_history)
                }
                last_chat_len = len(webui_manager.bu_chat_history)
                yield update_dict
                # Wait until response is submitted or task finishes
                while webui_manager.bu_response_event is not None and not agent_task.done():
                    await asyncio.sleep(0.2)
                # Restore UI after response submitted or if task ended unexpectedly
                if not agent_task.done():
                    yield {
                        user_input_comp: gr.update(placeholder="Agent is running...", interactive=False),
                        run_button_comp: gr.update(value="⏳ Running...", interactive=False),
                        pause_resume_button_comp: gr.update(interactive=True),
                        stop_button_comp: gr.update(interactive=True),
                    }
                else:
                    break  # Task finished while waiting for response

            # Update Chatbot if new messages arrived via callbacks
            if len(webui_manager.bu_chat_history) > last_chat_len:
                update_dict[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history)
                last_chat_len = len(webui_manager.bu_chat_history)

            # Update Browser View
            if headless and webui_manager.bu_browser_context:
                try:
                    screenshot_b64 = await webui_manager.bu_browser_context.take_screenshot()
                    if screenshot_b64:
                        html_content = f'<img src="data:image/jpeg;base64,{screenshot_b64}" style="width:{stream_vw}vw; height:{stream_vh}vh ; border:1px solid #ccc;">'
                        update_dict[browser_view_comp] = gr.update(value=html_content, visible=True)
                    else:
                        html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Waiting for browser session...</h1>"
                        update_dict[browser_view_comp] = gr.update(value=html_content,
                                                                   visible=True)
                except Exception as e:
                    logger.debug(f"Failed to capture screenshot: {e}")
                    update_dict[browser_view_comp] = gr.update(value="<div style='...'>Error loading view...</div>",
                                                               visible=True)
            else:
                update_dict[browser_view_comp] = gr.update(visible=False)

            # Yield accumulated updates
            if update_dict:
                yield update_dict

            await asyncio.sleep(0.1)  # Polling interval

        # --- 7. Task Finalization ---
        webui_manager.bu_agent.state.paused = False
        webui_manager.bu_agent.state.stopped = False
        final_update = {}
        try:
            logger.info("Agent task completing...")
            # Await the task ensure completion and catch exceptions if not already caught
            if not agent_task.done():
                await agent_task  # Retrieve result/exception
            elif agent_task.exception():  # Check if task finished with exception
                agent_task.result()  # Raise the exception to be caught below
            logger.info("Agent task completed processing.")

            logger.info(f"Explicitly saving agent history to: {history_file}")
            webui_manager.bu_agent.save_history(history_file)

            if os.path.exists(history_file):
                final_update[history_file_comp] = gr.File(value=history_file)

            if gif_path and os.path.exists(gif_path):
                logger.info(f"GIF found at: {gif_path}")
                final_update[gif_comp] = gr.Image(value=gif_path)

        except asyncio.CancelledError:
            logger.info("Agent task was cancelled.")
            if not any("Cancelled" in msg.get("content", "") for msg in webui_manager.bu_chat_history if
                       msg.get("role") == "assistant"):
                webui_manager.bu_chat_history.append({"role": "assistant", "content": "**Task Cancelled**."})
            final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history)
        except Exception as e:
            logger.error(f"Error during agent execution: {e}", exc_info=True)
            error_message = f"**Agent Execution Error:**\n```\n{type(e).__name__}: {e}\n```"
            if not any(error_message in msg.get("content", "") for msg in webui_manager.bu_chat_history if
                       msg.get("role") == "assistant"):
                webui_manager.bu_chat_history.append({"role": "assistant", "content": error_message})
            final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history)
            gr.Error(f"Agent execution failed: {e}")
        finally:
            webui_manager.bu_current_task = None  # Clear the task reference

            # Close browser/context if requested
            if should_close_browser_on_finish:
                if webui_manager.bu_browser_context:
                    logger.info("Closing browser context after task.")
                    await webui_manager.bu_browser_context.close()
                    webui_manager.bu_browser_context = None
                if webui_manager.bu_browser:
                    logger.info("Closing browser after task.")
                    await webui_manager.bu_browser.close()
                    webui_manager.bu_browser = None

        # --- 8. Final UI Update ---
        # Emitted after cleanup (not from inside `finally`) so the generator
        # never yields while an exception is being unwound.
        final_update.update({
            user_input_comp: gr.update(value="", interactive=True, placeholder="Enter your next task..."),
            run_button_comp: gr.update(value="▶️ Submit Task", interactive=True),
            stop_button_comp: gr.update(value="⏹️ Stop", interactive=False),
            pause_resume_button_comp: gr.update(value="⏸️ Pause", interactive=False),
            clear_button_comp: gr.update(interactive=True),
            # Ensure final chat history is shown
            chatbot_comp: gr.update(value=webui_manager.bu_chat_history)
        })
        yield final_update

    except Exception as e:
        # Catch errors during setup (before agent run starts)
        logger.error(f"Error setting up agent task: {e}", exc_info=True)
        webui_manager.bu_current_task = None  # Ensure state is reset
        yield {
            user_input_comp: gr.update(interactive=True, placeholder="Error during setup. Enter task..."),
            run_button_comp: gr.update(value="▶️ Submit Task", interactive=True),
            stop_button_comp: gr.update(value="⏹️ Stop", interactive=False),
            pause_resume_button_comp: gr.update(value="⏸️ Pause", interactive=False),
            clear_button_comp: gr.update(interactive=True),
            chatbot_comp: gr.update(
                value=webui_manager.bu_chat_history + [{"role": "assistant", "content": f"**Setup Error:** {e}"}]),
        }
# --- Button Click Handlers --- (Need access to webui_manager)
async def handle_submit(webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]):
    """React to the main 'Submit' button (or Enter in the input box).

    Depending on the current state this either delivers the typed text as the
    answer to a pending help request, tells the user that an agent is already
    running, or starts a new task and relays every update it produces.

    Args:
        webui_manager: Holder of the shared agent state and component registry.
        components: Mapping from Gradio component to its current value.

    Yields:
        Dicts mapping Gradio components to their updates.
    """
    textbox = webui_manager.get_component_by_id("browser_use_agent.user_input")
    submitted_text = components.get(textbox, "").strip()

    # Case 1: the agent is waiting for the user's assistance.
    pending_event = webui_manager.bu_response_event
    if pending_event and not pending_event.is_set():
        logger.info(f"User submitted assistance: {submitted_text}")
        webui_manager.bu_user_help_response = submitted_text or "User provided no text response."
        pending_event.set()
        # The streaming loop of the running task restores the rest of the UI.
        yield {
            textbox: gr.update(value="", interactive=False, placeholder="Waiting for agent to continue..."),
            webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(value="⏳ Running...",
                                                                                        interactive=False)
        }
        return

    # Case 2: a task is still running and is not asking for help.
    active_task = webui_manager.bu_current_task
    if active_task and not active_task.done():
        logger.warning("Submit button clicked while agent is already running and not asking for help.")
        gr.Info("Agent is currently running. Please wait or use Stop/Pause.")
        yield {}  # No change
        return

    # Case 3: nothing is running, so treat the text as a new task.
    logger.info("Submit button clicked for new task.")
    async for ui_update in run_agent_task(webui_manager, components):
        yield ui_update
async def handle_stop(webui_manager: WebuiManager):
    """React to the 'Stop' button.

    When an agent task is in flight, its ``stopped`` flag is raised (and
    ``paused`` cleared) and the control buttons are disabled while it winds
    down. Otherwise the buttons are reset to their idle state.

    Args:
        webui_manager: Holder of the shared agent state and component registry.

    Returns:
        Dict mapping Gradio components to their updates.
    """
    logger.info("Stop button clicked.")

    component = webui_manager.get_component_by_id
    agent, running_task = webui_manager.bu_agent, webui_manager.bu_current_task

    if not (agent and running_task and not running_task.done()):
        logger.warning("Stop clicked but agent is not running or task is already done.")
        # Reset UI just in case it's stuck
        return {
            component("browser_use_agent.run_button"): gr.update(interactive=True),
            component("browser_use_agent.stop_button"): gr.update(interactive=False),
            component("browser_use_agent.pause_resume_button"): gr.update(interactive=False),
            component("browser_use_agent.clear_button"): gr.update(interactive=True),
        }

    # Signal the agent through its own state flags; a stopped agent must not stay paused.
    agent.state.stopped = True
    agent.state.paused = False
    return {
        component("browser_use_agent.stop_button"): gr.update(interactive=False,
                                                              value="⏹️ Stopping..."),
        component("browser_use_agent.pause_resume_button"): gr.update(interactive=False),
        component("browser_use_agent.run_button"): gr.update(interactive=False),
    }
async def handle_pause_resume(webui_manager: WebuiManager):
    """React to the 'Pause/Resume' button by toggling the running agent.

    Args:
        webui_manager: Holder of the shared agent state and component registry.

    Returns:
        Dict with the optimistic update for the pause/resume button, or an
        empty dict when no task is running.
    """
    agent = webui_manager.bu_agent
    running_task = webui_manager.bu_current_task

    if not (agent and running_task and not running_task.done()):
        logger.warning("Pause/Resume clicked but agent is not running or doesn't support state.")
        return {}  # No change

    if agent.state.paused:
        logger.info("Resume button clicked.")
        agent.resume()
        next_label = "⏸️ Pause"
    else:
        logger.info("Pause button clicked.")
        agent.pause()
        next_label = "▶️ Resume"

    # Optimistic update: the label shows the action available after the toggle.
    toggle_button = webui_manager.get_component_by_id("browser_use_agent.pause_resume_button")
    return {toggle_button: gr.update(value=next_label, interactive=True)}
async def handle_clear(webui_manager: WebuiManager):
    """React to the 'Clear' button.

    Stops and cancels a task that is still running, closes the controller's
    MCP client, drops the agent and controller, clears the chat/help-request
    state kept on ``webui_manager`` and returns the updates that put the tab
    back into its initial state.

    Args:
        webui_manager: Holder of the shared agent state and component registry.

    Returns:
        Dict mapping Gradio components to their updates.
    """
    logger.info("Clear button clicked.")

    # Stop any running task first
    running_task = webui_manager.bu_current_task
    if running_task and not running_task.done():
        logger.info("Clearing requires stopping the current task.")
        webui_manager.bu_agent.stop()
        running_task.cancel()
        try:
            await asyncio.wait_for(running_task, timeout=2.0)  # Wait briefly
        except (asyncio.CancelledError, asyncio.TimeoutError):
            pass
        except Exception as e:
            logger.warning(f"Error stopping task on clear: {e}")
    webui_manager.bu_current_task = None

    controller = webui_manager.bu_controller
    if controller:
        await controller.close_mcp_client()
        webui_manager.bu_controller = None
    webui_manager.bu_agent = None

    # Reset state stored in manager
    webui_manager.bu_chat_history = []
    for state_attr in ("bu_response_event", "bu_user_help_response", "bu_agent_task_id"):
        setattr(webui_manager, state_attr, None)

    logger.info("Agent state and browser resources cleared.")

    # Reset UI components
    component = webui_manager.get_component_by_id
    return {
        component("browser_use_agent.chatbot"): gr.update(value=[]),
        component("browser_use_agent.user_input"): gr.update(value="",
                                                             placeholder="Enter your task here..."),
        component("browser_use_agent.agent_history_file"): gr.update(value=None),
        component("browser_use_agent.recording_gif"): gr.update(value=None),
        component("browser_use_agent.browser_view"): gr.update(
            value="<div style='...'>Browser Cleared</div>"),
        component("browser_use_agent.run_button"): gr.update(value="▶️ Submit Task",
                                                             interactive=True),
        component("browser_use_agent.stop_button"): gr.update(interactive=False),
        component("browser_use_agent.pause_resume_button"): gr.update(value="⏸️ Pause",
                                                                      interactive=False),
        component("browser_use_agent.clear_button"): gr.update(interactive=True),
    }
# --- Tab Creation Function ---
def create_browser_use_agent_tab(webui_manager: WebuiManager) -> None:
    """
    Create the run agent tab, defining UI, state, and handlers.

    Builds the chat, task input, control buttons, live browser view and task
    output components, registers them with ``webui_manager`` under the
    ``browser_use_agent`` prefix, and connects the button / textbox events to
    the module's handler functions through small async-generator wrappers.

    Args:
        webui_manager: Manager that owns the agent state and the component
            registry used by the handlers.
    """
    # Initialise the agent-related state held by the manager before building the UI.
    webui_manager.init_browser_use_agent()

    # --- Define UI Components ---
    tab_components = {}
    with gr.Column():
        chatbot = gr.Chatbot(
            lambda: webui_manager.bu_chat_history,  # Load history dynamically
            elem_id="browser_use_chatbot",
            label="Agent Interaction",
            type="messages",
            height=600,
            show_copy_button=True,
            bubble_full_width=False,
        )
        user_input = gr.Textbox(
            label="Your Task or Response",
            placeholder="Enter your task here or provide assistance when asked.",
            lines=3,
            interactive=True,
            elem_id="user_input"
        )
        with gr.Row():
            # Stop and Pause start disabled; the handlers enable them while a task runs.
            stop_button = gr.Button("⏹️ Stop", interactive=False, variant="stop", scale=2)
            pause_resume_button = gr.Button("⏸️ Pause", interactive=False, variant="secondary", scale=2, visible=True)
            clear_button = gr.Button("🗑️ Clear", interactive=True, variant="secondary", scale=2)
            run_button = gr.Button("▶️ Submit Task", variant="primary", scale=3)

        # Hidden by default; run_agent_task makes it visible when running headless.
        browser_view = gr.HTML(
            value="<div style='width:100%; height:50vh; display:flex; justify-content:center; align-items:center; border:1px solid #ccc; background-color:#f0f0f0;'><p>Browser View (Requires Headless=True)</p></div>",
            label="Browser Live View",
            elem_id="browser_view",
            visible=False,
        )
        with gr.Column():
            gr.Markdown("### Task Outputs")
            agent_history_file = gr.File(label="Agent History JSON", interactive=False)
            recording_gif = gr.Image(label="Task Recording GIF", format="gif", interactive=False,
                                     type="filepath")

    # --- Store Components in Manager ---
    # Keys here become the "browser_use_agent.<key>" IDs the handlers look up.
    tab_components.update(
        dict(
            chatbot=chatbot, user_input=user_input, clear_button=clear_button,
            run_button=run_button, stop_button=stop_button, pause_resume_button=pause_resume_button,
            agent_history_file=agent_history_file, recording_gif=recording_gif,
            browser_view=browser_view
        )
    )
    webui_manager.add_components("browser_use_agent", tab_components)  # Use "browser_use_agent" as tab_name prefix

    # Inputs are passed as a set so the submit handler receives one dict of
    # component -> value (see the wrapper's signature) rather than positional args.
    all_managed_components = set(webui_manager.get_components())  # Get all components known to manager
    run_tab_outputs = list(tab_components.values())

    async def submit_wrapper(components_dict: Dict[Component, Any]) -> AsyncGenerator[Dict[Component, Any], None]:
        """Wrapper for handle_submit that yields its results."""
        async for update in handle_submit(webui_manager, components_dict):
            yield update

    async def stop_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
        """Wrapper for handle_stop."""
        update_dict = await handle_stop(webui_manager)
        yield update_dict

    async def pause_resume_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
        """Wrapper for handle_pause_resume."""
        update_dict = await handle_pause_resume(webui_manager)
        yield update_dict

    async def clear_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
        """Wrapper for handle_clear."""
        update_dict = await handle_clear(webui_manager)
        yield update_dict

    # --- Connect Event Handlers using the Wrappers --
    # Clicking Submit and pressing Enter in the textbox share the same handler.
    run_button.click(
        fn=submit_wrapper,
        inputs=all_managed_components,
        outputs=run_tab_outputs
    )
    user_input.submit(
        fn=submit_wrapper,
        inputs=all_managed_components,
        outputs=run_tab_outputs
    )
    stop_button.click(
        fn=stop_wrapper,
        inputs=None,
        outputs=run_tab_outputs
    )
    pause_resume_button.click(
        fn=pause_resume_wrapper,
        inputs=None,
        outputs=run_tab_outputs
    )
    clear_button.click(
        fn=clear_wrapper,
        inputs=None,
        outputs=run_tab_outputs
    )

View File

@@ -0,0 +1,451 @@
import gradio as gr
from gradio.components import Component
from functools import partial
from src.webui.webui_manager import WebuiManager
from src.utils import config
import logging
import os
from typing import Any, Dict, AsyncGenerator, Optional, Tuple, Union
import asyncio
import json
from src.agent.deep_research.deep_research_agent import DeepResearchAgent
from src.utils import llm_provider
logger = logging.getLogger(__name__)
async def _initialize_llm(provider: Optional[str], model_name: Optional[str], temperature: float,
                          base_url: Optional[str], api_key: Optional[str], num_ctx: Optional[int] = None):
    """Build the LLM described by the given settings.

    Args:
        provider: Provider name; when empty no LLM is created.
        model_name: Model name; when empty no LLM is created.
        temperature: Sampling temperature forwarded to the provider.
        base_url: Optional endpoint override; empty values are passed as None.
        api_key: Optional API key; empty values are passed as None.
        num_ctx: Context size, forwarded only when the provider is "ollama".

    Returns:
        The model returned by ``llm_provider.get_llm_model``, or None when the
        provider/model is missing or initialization fails (a Gradio warning is
        issued in the failure case).
    """
    if not (provider and model_name):
        logger.info("LLM Provider or Model Name not specified, LLM will be None.")
        return None

    try:
        logger.info(f"Initializing LLM: Provider={provider}, Model={model_name}, Temp={temperature}")
        # Use your actual LLM provider logic here
        llm_kwargs = dict(
            provider=provider,
            model_name=model_name,
            temperature=temperature,
            base_url=base_url or None,
            api_key=api_key or None,
            num_ctx=num_ctx if provider == "ollama" else None,
        )
        return llm_provider.get_llm_model(**llm_kwargs)
    except Exception as e:
        logger.error(f"Failed to initialize LLM: {e}", exc_info=True)
        gr.Warning(
            f"Failed to initialize LLM '{model_name}' for provider '{provider}'. Please check settings. Error: {e}")
        return None
def _read_file_safe(file_path: str) -> Optional[str]:
    """Return the UTF-8 text of ``file_path``, or None if it cannot be read.

    A missing path yields None silently; any error while opening or reading
    an existing path is logged and also yields None.
    """
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                text = handle.read()
        except Exception as e:
            logger.error(f"Error reading file {file_path}: {e}")
        else:
            return text
    return None
# --- Deep Research Agent Specific Logic ---
async def run_deep_research(webui_manager: WebuiManager, components: Dict[Component, Any]) -> AsyncGenerator[
Dict[Component, Any], None]:
"""Handles initializing and running the DeepResearchAgent."""
# --- Get Components ---
research_task_comp = webui_manager.get_component_by_id("deep_research_agent.research_task")
resume_task_id_comp = webui_manager.get_component_by_id("deep_research_agent.resume_task_id")
parallel_num_comp = webui_manager.get_component_by_id("deep_research_agent.parallel_num")
save_dir_comp = webui_manager.get_component_by_id(
"deep_research_agent.max_query") # Note: component ID seems misnamed in original code
start_button_comp = webui_manager.get_component_by_id("deep_research_agent.start_button")
stop_button_comp = webui_manager.get_component_by_id("deep_research_agent.stop_button")
markdown_display_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_display")
markdown_download_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_download")
mcp_server_config_comp = webui_manager.get_component_by_id("deep_research_agent.mcp_server_config")
# --- 1. Get Task and Settings ---
task_topic = components.get(research_task_comp, "").strip()
task_id_to_resume = components.get(resume_task_id_comp, "").strip() or None
max_parallel_agents = int(components.get(parallel_num_comp, 1))
base_save_dir = components.get(save_dir_comp, "./tmp/deep_research")
mcp_server_config_str = components.get(mcp_server_config_comp)
mcp_config = json.loads(mcp_server_config_str) if mcp_server_config_str else None
if not task_topic:
gr.Warning("Please enter a research task.")
yield {start_button_comp: gr.update(interactive=True)} # Re-enable start button
return
# Store base save dir for stop handler
webui_manager.dr_save_dir = base_save_dir
os.makedirs(base_save_dir, exist_ok=True)
# --- 2. Initial UI Update ---
yield {
start_button_comp: gr.update(value="⏳ Running...", interactive=False),
stop_button_comp: gr.update(interactive=True),
research_task_comp: gr.update(interactive=False),
resume_task_id_comp: gr.update(interactive=False),
parallel_num_comp: gr.update(interactive=False),
save_dir_comp: gr.update(interactive=False),
markdown_display_comp: gr.update(value="Starting research..."),
markdown_download_comp: gr.update(value=None, interactive=False)
}
agent_task = None
running_task_id = None
plan_file_path = None
report_file_path = None
last_plan_content = None
last_plan_mtime = 0
try:
# --- 3. Get LLM and Browser Config from other tabs ---
# Access settings values via components dict, getting IDs from webui_manager
def get_setting(tab: str, key: str, default: Any = None):
comp = webui_manager.id_to_component.get(f"{tab}.{key}")
return components.get(comp, default) if comp else default
# LLM Config (from agent_settings tab)
llm_provider_name = get_setting("agent_settings", "llm_provider")
llm_model_name = get_setting("agent_settings", "llm_model_name")
llm_temperature = get_setting("agent_settings", "llm_temperature", 0.5) # Default if not found
llm_base_url = get_setting("agent_settings", "llm_base_url")
llm_api_key = get_setting("agent_settings", "llm_api_key")
ollama_num_ctx = get_setting("agent_settings", "ollama_num_ctx")
llm = await _initialize_llm(
llm_provider_name, llm_model_name, llm_temperature, llm_base_url, llm_api_key,
ollama_num_ctx if llm_provider_name == "ollama" else None
)
if not llm:
raise ValueError("LLM Initialization failed. Please check Agent Settings.")
# Browser Config (from browser_settings tab)
# Note: DeepResearchAgent constructor takes a dict, not full Browser/Context objects
browser_config_dict = {
"headless": get_setting("browser_settings", "headless", False),
"disable_security": get_setting("browser_settings", "disable_security", True),
"browser_binary_path": get_setting("browser_settings", "browser_binary_path"),
"user_data_dir": get_setting("browser_settings", "browser_user_data_dir"),
"window_width": int(get_setting("browser_settings", "window_w", 1280)),
"window_height": int(get_setting("browser_settings", "window_h", 1100)),
# Add other relevant fields if DeepResearchAgent accepts them
}
# --- 4. Initialize or Get Agent ---
if not webui_manager.dr_agent:
webui_manager.dr_agent = DeepResearchAgent(
llm=llm,
browser_config=browser_config_dict,
mcp_server_config=mcp_config
)
logger.info("DeepResearchAgent initialized.")
# --- 5. Start Agent Run ---
agent_run_coro = webui_manager.dr_agent.run(
topic=task_topic,
task_id=task_id_to_resume,
save_dir=base_save_dir,
max_parallel_browsers=max_parallel_agents
)
agent_task = asyncio.create_task(agent_run_coro)
webui_manager.dr_current_task = agent_task
# Wait briefly for the agent to start and potentially create the task ID/folder
await asyncio.sleep(1.0)
# Determine the actual task ID being used (agent sets this)
running_task_id = webui_manager.dr_agent.current_task_id
if not running_task_id:
# Agent might not have set it yet, try to get from result later? Risky.
# Or derive from resume_task_id if provided?
running_task_id = task_id_to_resume
if not running_task_id:
logger.warning("Could not determine running task ID immediately.")
# We can still monitor, but might miss initial plan if ID needed for path
else:
logger.info(f"Assuming task ID based on resume ID: {running_task_id}")
else:
logger.info(f"Agent started with Task ID: {running_task_id}")
webui_manager.dr_task_id = running_task_id # Store for stop handler
# --- 6. Monitor Progress via research_plan.md ---
if running_task_id:
task_specific_dir = os.path.join(base_save_dir, str(running_task_id))
plan_file_path = os.path.join(task_specific_dir, "research_plan.md")
report_file_path = os.path.join(task_specific_dir, "report.md")
logger.info(f"Monitoring plan file: {plan_file_path}")
else:
logger.warning("Cannot monitor plan file: Task ID unknown.")
plan_file_path = None
last_plan_content = None
while not agent_task.done():
update_dict = {}
update_dict[resume_task_id_comp] = gr.update(value=running_task_id)
agent_stopped = getattr(webui_manager.dr_agent, 'stopped', False)
if agent_stopped:
logger.info("Stop signal detected from agent state.")
break # Exit monitoring loop
# Check and update research plan display
if plan_file_path:
try:
current_mtime = os.path.getmtime(plan_file_path) if os.path.exists(plan_file_path) else 0
if current_mtime > last_plan_mtime:
logger.info(f"Detected change in {plan_file_path}")
plan_content = _read_file_safe(plan_file_path)
if last_plan_content is None or (
plan_content is not None and plan_content != last_plan_content):
update_dict[markdown_display_comp] = gr.update(value=plan_content)
last_plan_content = plan_content
last_plan_mtime = current_mtime
elif plan_content is None:
# File might have been deleted or became unreadable
last_plan_mtime = 0 # Reset to force re-read attempt later
except Exception as e:
logger.warning(f"Error checking/reading plan file {plan_file_path}: {e}")
# Avoid continuous logging for the same error
await asyncio.sleep(2.0)
# Yield updates if any
if update_dict:
yield update_dict
await asyncio.sleep(1.0) # Check file changes every second
# --- 7. Task Finalization ---
logger.info("Agent task processing finished. Awaiting final result...")
final_result_dict = await agent_task # Get result or raise exception
logger.info(f"Agent run completed. Result keys: {final_result_dict.keys() if final_result_dict else 'None'}")
# Try to get task ID from result if not known before
if not running_task_id and final_result_dict and 'task_id' in final_result_dict:
running_task_id = final_result_dict['task_id']
webui_manager.dr_task_id = running_task_id
task_specific_dir = os.path.join(base_save_dir, str(running_task_id))
report_file_path = os.path.join(task_specific_dir, "report.md")
logger.info(f"Task ID confirmed from result: {running_task_id}")
final_ui_update = {}
if report_file_path and os.path.exists(report_file_path):
logger.info(f"Loading final report from: {report_file_path}")
report_content = _read_file_safe(report_file_path)
if report_content:
final_ui_update[markdown_display_comp] = gr.update(value=report_content)
final_ui_update[markdown_download_comp] = gr.File(value=report_file_path,
label=f"Report ({running_task_id}.md)",
interactive=True)
else:
final_ui_update[markdown_display_comp] = gr.update(
value="# Research Complete\n\n*Error reading final report file.*")
elif final_result_dict and 'report' in final_result_dict:
logger.info("Using report content directly from agent result.")
# If agent directly returns report content
final_ui_update[markdown_display_comp] = gr.update(value=final_result_dict['report'])
# Cannot offer download if only content is available
final_ui_update[markdown_download_comp] = gr.update(value=None, label="Download Research Report",
interactive=False)
else:
logger.warning("Final report file not found and not in result dict.")
final_ui_update[markdown_display_comp] = gr.update(value="# Research Complete\n\n*Final report not found.*")
yield final_ui_update
except Exception as e:
logger.error(f"Error during Deep Research Agent execution: {e}", exc_info=True)
gr.Error(f"Research failed: {e}")
yield {markdown_display_comp: gr.update(value=f"# Research Failed\n\n**Error:**\n```\n{e}\n```")}
finally:
# --- 8. Final UI Reset ---
webui_manager.dr_current_task = None # Clear task reference
webui_manager.dr_task_id = None # Clear running task ID
yield {
start_button_comp: gr.update(value="▶️ Run", interactive=True),
stop_button_comp: gr.update(interactive=False),
research_task_comp: gr.update(interactive=True),
resume_task_id_comp: gr.update(value="", interactive=True),
parallel_num_comp: gr.update(interactive=True),
save_dir_comp: gr.update(interactive=True),
# Keep download button enabled if file exists
markdown_download_comp: gr.update() if report_file_path and os.path.exists(report_file_path) else gr.update(
interactive=False)
}
async def stop_deep_research(webui_manager: WebuiManager) -> Dict[Component, Any]:
    """
    Handle a click on the Deep Research "Stop" button.

    If an agent and an unfinished task are registered on ``webui_manager``,
    the agent is asked to stop and, after a short grace period, the report
    file for the current task (if one exists on disk) is shown together with
    a "stopped by user" note. Otherwise the tab's input widgets are simply
    re-enabled.

    Args:
        webui_manager: Manager holding the Deep Research agent/task state and
            the registered UI components.

    Returns:
        A mapping of Gradio components to the updates that should be applied.
    """
    logger.info("Stop button clicked for Deep Research.")
    agent = webui_manager.dr_agent
    task = webui_manager.dr_current_task
    # ``dr_task_id`` is only assigned once a run has started, so it may not
    # exist yet when Stop is pressed on a fresh session.
    task_id = getattr(webui_manager, "dr_task_id", None)
    base_save_dir = webui_manager.dr_save_dir

    stop_button_comp = webui_manager.get_component_by_id("deep_research_agent.stop_button")
    start_button_comp = webui_manager.get_component_by_id("deep_research_agent.start_button")
    markdown_display_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_display")
    markdown_download_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_download")

    if not (agent and task and not task.done()):
        logger.warning("Stop clicked but no active research task found.")
        # Reset UI state just in case. Only ids that the tab actually
        # registers are looked up here ("max_query" is the save-dir textbox).
        return {
            start_button_comp: gr.update(interactive=True),
            stop_button_comp: gr.update(interactive=False),
            webui_manager.get_component_by_id("deep_research_agent.research_task"): gr.update(interactive=True),
            webui_manager.get_component_by_id("deep_research_agent.resume_task_id"): gr.update(interactive=True),
            webui_manager.get_component_by_id("deep_research_agent.parallel_num"): gr.update(interactive=True),
            webui_manager.get_component_by_id("deep_research_agent.max_query"): gr.update(interactive=True),
        }

    final_update = {
        stop_button_comp: gr.update(interactive=False, value="⏹️ Stopping...")
    }

    logger.info("Signalling DeepResearchAgent to stop.")
    try:
        await agent.stop()
    except Exception as e:
        # Best effort: a failing stop() must not prevent the UI update below.
        logger.error(f"Error calling agent.stop(): {e}")

    # The run_deep_research loop should detect the stop and exit; it performs
    # the final UI reset. Here we only publish an intermediate state.
    await asyncio.sleep(1.5)  # Give the agent a moment to write final files

    report_file_path = None
    if task_id and base_save_dir:
        report_file_path = os.path.join(base_save_dir, str(task_id), "report.md")

    if report_file_path and os.path.exists(report_file_path):
        report_content = _read_file_safe(report_file_path)
        if report_content:
            final_update[markdown_display_comp] = gr.update(
                value=report_content + "\n\n---\n*Research stopped by user.*")
            final_update[markdown_download_comp] = gr.File(value=report_file_path,
                                                           label=f"Report ({task_id}.md)",
                                                           interactive=True)
        else:
            final_update[markdown_display_comp] = gr.update(
                value="# Research Stopped\n\n*Error reading final report file after stop.*")
    else:
        final_update[markdown_display_comp] = gr.update(value="# Research Stopped by User")

    # Keep start button disabled; run_deep_research's finally block re-enables it.
    final_update[start_button_comp] = gr.update(interactive=False)
    return final_update
async def update_mcp_server(mcp_file: str, webui_manager: WebuiManager):
    """
    React to a change of the uploaded MCP server JSON file.

    Any existing Deep Research agent has its MCP client closed first, because
    the configuration it was built with is no longer current.

    Args:
        mcp_file: Path of the uploaded file (may be empty/None when cleared).
        webui_manager: Manager that may hold a live ``dr_agent``.

    Returns:
        A ``(text, visibility_update)`` tuple: the pretty-printed JSON and an
        update making the config textbox visible, or ``(None, hidden)`` when
        the file is missing, not a ``.json`` file, or cannot be parsed.
    """
    if hasattr(webui_manager, "dr_agent") and webui_manager.dr_agent:
        logger.warning("⚠️ Close controller because mcp file has changed!")
        await webui_manager.dr_agent.close_mcp_client()

    if not mcp_file or not os.path.exists(mcp_file) or not mcp_file.endswith('.json'):
        logger.warning(f"{mcp_file} is not a valid MCP file.")
        return None, gr.update(visible=False)

    try:
        with open(mcp_file, 'r', encoding="utf-8") as f:
            mcp_server = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError; a bad
        # upload should hide the textbox rather than crash the event handler.
        logger.warning(f"{mcp_file} is not a valid MCP file: {e}")
        return None, gr.update(visible=False)

    return json.dumps(mcp_server, indent=2), gr.update(visible=True)
def create_deep_research_agent_tab(webui_manager: WebuiManager):
    """
    Create the Deep Research agent tab and wire up its event handlers.

    The tab's components are registered on ``webui_manager`` under the
    ``deep_research_agent`` prefix, the manager's Deep Research state is
    initialised, and the file-change / Run / Stop callbacks are connected.

    Args:
        webui_manager: Shared manager that tracks every UI component.
    """
    # NOTE(review): the nesting of the layout containers below was
    # reconstructed; confirm it matches the intended layout.
    with gr.Group():
        with gr.Row():
            mcp_json_file = gr.File(label="MCP server json", interactive=True, file_types=[".json"])
            mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False)

    with gr.Group():
        research_task = gr.Textbox(label="Research Task", lines=5,
                                   value="Give me a detailed travel plan to Switzerland from June 1st to 10th.",
                                   interactive=True)
        with gr.Row():
            resume_task_id = gr.Textbox(label="Resume Task ID", value="",
                                        interactive=True)
            parallel_num = gr.Number(label="Parallel Agent Num", value=1,
                                     precision=0,
                                     interactive=True)
            # This is the save-directory textbox; it stays registered under
            # the id "max_query" because other handlers look it up by that id.
            max_query = gr.Textbox(label="Research Save Dir", value="./tmp/deep_research",
                                   interactive=True)
    with gr.Row():
        stop_button = gr.Button("⏹️ Stop", variant="stop", scale=2)
        start_button = gr.Button("▶️ Run", variant="primary", scale=3)
    with gr.Group():
        markdown_display = gr.Markdown(label="Research Report")
        markdown_download = gr.File(label="Download Research Report", interactive=False)

    tab_components = dict(
        research_task=research_task,
        parallel_num=parallel_num,
        max_query=max_query,
        start_button=start_button,
        stop_button=stop_button,
        markdown_display=markdown_display,
        markdown_download=markdown_download,
        resume_task_id=resume_task_id,
        mcp_json_file=mcp_json_file,
        mcp_server_config=mcp_server_config,
    )
    webui_manager.add_components("deep_research_agent", tab_components)
    webui_manager.init_deep_research_agent()

    async def update_wrapper(mcp_file):
        """Wrapper for update_mcp_server."""
        update_dict = await update_mcp_server(mcp_file, webui_manager)
        yield update_dict

    # update_mcp_server returns a (value, visibility) pair; both entries
    # target the same textbox, hence the repeated output component.
    mcp_json_file.change(
        update_wrapper,
        inputs=[mcp_json_file],
        outputs=[mcp_server_config, mcp_server_config]
    )

    dr_tab_outputs = list(tab_components.values())
    # Collected after registration so this tab's own components are included.
    all_managed_inputs = set(webui_manager.get_components())

    # --- Define Event Handler Wrappers ---
    async def start_wrapper(comps: Dict[Component, Any]) -> AsyncGenerator[Dict[Component, Any], None]:
        """Stream UI updates produced by run_deep_research."""
        async for update in run_deep_research(webui_manager, comps):
            yield update

    async def stop_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
        """Yield the single UI update produced by stop_deep_research."""
        update_dict = await stop_deep_research(webui_manager)
        yield update_dict

    # --- Connect Handlers ---
    start_button.click(
        fn=start_wrapper,
        inputs=all_managed_inputs,
        outputs=dr_tab_outputs
    )
    stop_button.click(
        fn=stop_wrapper,
        inputs=None,
        outputs=dr_tab_outputs
    )

View File

@@ -0,0 +1,50 @@
import gradio as gr
from gradio.components import Component
from src.webui.webui_manager import WebuiManager
from src.utils import config
def create_load_save_config_tab(webui_manager: WebuiManager):
    """
    Create the "Load & Save Config" tab.

    Registers the tab's components on ``webui_manager`` under the
    ``load_save_config`` prefix and connects the Save / Load buttons to
    ``webui_manager.save_config`` and ``webui_manager.load_config``.

    Args:
        webui_manager: Shared manager that tracks every UI component.
    """
    config_file = gr.File(
        label="Load UI Settings from json",
        file_types=[".json"],
        interactive=True
    )
    with gr.Row():
        load_config_button = gr.Button("Load Config", variant="primary")
        save_config_button = gr.Button("Save UI Settings", variant="primary")

    config_status = gr.Textbox(
        label="Status",
        lines=2,
        interactive=False
    )

    tab_components = dict(
        load_config_button=load_config_button,
        save_config_button=save_config_button,
        config_status=config_status,
        config_file=config_file,
    )
    webui_manager.add_components("load_save_config", tab_components)

    # Components are collected after registration, so this tab's own widgets
    # are part of both the save inputs and the load outputs.
    save_config_button.click(
        fn=webui_manager.save_config,
        inputs=set(webui_manager.get_components()),
        outputs=[config_status]
    )
    load_config_button.click(
        fn=webui_manager.load_config,
        inputs=[config_file],
        outputs=webui_manager.get_components(),
    )

95
src/webui/interface.py Normal file
View File

@@ -0,0 +1,95 @@
import gradio as gr
from src.webui.webui_manager import WebuiManager
from src.webui.components.agent_settings_tab import create_agent_settings_tab
from src.webui.components.browser_settings_tab import create_browser_settings_tab
from src.webui.components.browser_use_agent_tab import create_browser_use_agent_tab
from src.webui.components.deep_research_agent_tab import create_deep_research_agent_tab
from src.webui.components.load_save_config_tab import create_load_save_config_tab
# Maps the theme names selectable for the UI to instantiated Gradio themes.
# Every theme object is created once, when this module is imported.
theme_map = {
    theme_name: getattr(gr.themes, theme_name)()
    for theme_name in (
        "Default",
        "Soft",
        "Monochrome",
        "Glass",
        "Origin",
        "Citrus",
        "Ocean",
        "Base",
    )
}
def create_ui(theme_name="Ocean"):
    """
    Assemble the complete Browser Use WebUI and return the Gradio Blocks app.

    Args:
        theme_name: Key into ``theme_map`` selecting the Gradio theme.

    Returns:
        The ``gr.Blocks`` instance containing all tabs.
    """
    # Multi-line literals are kept flush-left so their content is unchanged.
    page_css = """
.gradio-container {
width: 70vw !important;
max-width: 70% !important;
margin-left: auto !important;
margin-right: auto !important;
padding-top: 10px !important;
}
.header-text {
text-align: center;
margin-bottom: 20px;
}
.tab-header-text {
text-align: center;
}
.theme-section {
margin-bottom: 10px;
padding: 15px;
border-radius: 10px;
}
"""
    # Script handed to gr.Blocks: adds __theme=dark to the URL if absent.
    dark_mode_js = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
    manager = WebuiManager()

    def _simple_tab(label, build_tab):
        # One top-level tab whose whole content comes from a single builder.
        with gr.TabItem(label):
            build_tab(manager)

    blocks_kwargs = dict(
        title="Browser Use WebUI",
        theme=theme_map[theme_name],
        css=page_css,
        js=dark_mode_js,
    )
    with gr.Blocks(**blocks_kwargs) as demo:
        with gr.Row():
            gr.Markdown(
                """
# 🌐 Browser Use WebUI
### Control your browser with AI assistance
""",
                elem_classes=["header-text"],
            )

        with gr.Tabs():
            for label, build_tab in (
                ("⚙️ Agent Settings", create_agent_settings_tab),
                ("🌐 Browser Settings", create_browser_settings_tab),
                ("🤖 Run Agent", create_browser_use_agent_tab),
            ):
                _simple_tab(label, build_tab)

            with gr.TabItem("🎁 Agent Marketplace"):
                gr.Markdown(
                    """
### Agents built on Browser-Use
""",
                    elem_classes=["tab-header-text"],
                )
                with gr.Tabs():
                    _simple_tab("Deep Research", create_deep_research_agent_tab)

            _simple_tab("📁 Load & Save Config", create_load_save_config_tab)

    return demo

115
src/webui/webui_manager.py Normal file
View File

@@ -0,0 +1,115 @@
import json
from collections.abc import Generator
from typing import TYPE_CHECKING
import os
import gradio as gr
from datetime import datetime
from typing import Optional, Dict, List
import uuid
import asyncio
from gradio.components import Component
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext
from browser_use.agent.service import Agent
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import CustomBrowserContext
from src.controller.custom_controller import CustomController
from src.agent.deep_research.deep_research_agent import DeepResearchAgent
class WebuiManager:
    """
    Registry of the web UI's Gradio components plus per-agent runtime state.

    Components are stored under ids of the form ``"<tab_name>.<comp_name>"``
    and can be looked up in both directions (id -> component and
    component -> id).
    """

    def __init__(self, settings_save_dir: str = "./tmp/webui_settings"):
        """
        Args:
            settings_save_dir: Directory where ``save_config`` writes its JSON
                snapshots; it is created if missing.
        """
        self.id_to_component: dict[str, Component] = {}
        self.component_to_id: dict[Component, str] = {}

        self.settings_save_dir = settings_save_dir
        os.makedirs(self.settings_save_dir, exist_ok=True)

    def init_browser_use_agent(self) -> None:
        """
        Reset all browser-use agent state attributes to their empty values.
        """
        self.bu_agent: Optional[Agent] = None
        self.bu_browser: Optional[CustomBrowser] = None
        self.bu_browser_context: Optional[CustomBrowserContext] = None
        self.bu_controller: Optional[CustomController] = None
        self.bu_chat_history: List[Dict[str, Optional[str]]] = []
        self.bu_response_event: Optional[asyncio.Event] = None
        self.bu_user_help_response: Optional[str] = None
        self.bu_current_task: Optional[asyncio.Task] = None
        self.bu_agent_task_id: Optional[str] = None

    def init_deep_research_agent(self) -> None:
        """
        Reset all deep research agent state attributes to their empty values.
        """
        self.dr_agent: Optional[DeepResearchAgent] = None
        self.dr_current_task = None
        self.dr_agent_task_id: Optional[str] = None
        # The deep research handlers read and write ``dr_task_id``; define it
        # here so it exists before the first run. ``dr_agent_task_id`` is kept
        # for backward compatibility.
        self.dr_task_id: Optional[str] = None
        self.dr_save_dir: Optional[str] = None

    def add_components(self, tab_name: str, components_dict: dict[str, "Component"]) -> None:
        """
        Register a tab's components under ``"<tab_name>.<comp_name>"`` ids.
        """
        for comp_name, component in components_dict.items():
            comp_id = f"{tab_name}.{comp_name}"
            self.id_to_component[comp_id] = component
            self.component_to_id[component] = comp_id

    def get_components(self) -> list["Component"]:
        """
        Return all registered components, in registration order.
        """
        return list(self.id_to_component.values())

    def get_component_by_id(self, comp_id: str) -> "Component":
        """
        Return the component registered under ``comp_id``.

        Raises:
            KeyError: If no component has that id.
        """
        return self.id_to_component[comp_id]

    def get_id_by_component(self, comp: "Component") -> str:
        """
        Return the id under which ``comp`` was registered.

        Raises:
            KeyError: If the component was never registered.
        """
        return self.component_to_id[comp]

    def save_config(self, components: Dict["Component", str]) -> str:
        """
        Write the current values of the given components to a timestamped
        JSON file inside ``settings_save_dir``.

        Buttons, file widgets and components whose ``interactive`` attribute
        renders as "false" are skipped.

        Args:
            components: Mapping of component -> current value.

        Returns:
            Path of the JSON file that was written.
        """
        cur_settings = {}
        for comp in components:
            if isinstance(comp, (gr.Button, gr.File)):
                continue
            if str(getattr(comp, "interactive", True)).lower() == "false":
                continue
            cur_settings[self.get_id_by_component(comp)] = components[comp]

        config_name = datetime.now().strftime("%Y%m%d-%H%M%S")
        config_path = os.path.join(self.settings_save_dir, f"{config_name}.json")
        with open(config_path, "w") as fw:
            json.dump(cur_settings, fw, indent=4)

        return config_path

    def load_config(self, config_path: Optional[str]):
        """
        Load UI settings from a JSON file and yield the component updates.

        Ids in the file that are not registered are ignored. The status
        textbox (``load_save_config.config_status``) is always updated.

        Args:
            config_path: Path of the JSON settings file; empty/None when the
                user has not selected a file.

        Yields:
            A single mapping of component -> new component instance carrying
            the loaded value.
        """
        config_status = self.id_to_component["load_save_config.config_status"]
        if not config_path:
            # Nothing selected: report it instead of failing inside open().
            yield {config_status: config_status.__class__(value="No config file selected.")}
            return

        with open(config_path, "r", encoding="utf-8") as fr:
            ui_settings = json.load(fr)

        update_components = {}
        for comp_id, comp_val in ui_settings.items():
            if comp_id in self.id_to_component:
                comp = self.id_to_component[comp_id]
                update_components[comp] = comp.__class__(value=comp_val)

        update_components[config_status] = config_status.__class__(
            value=f"Successfully loaded config: {config_path}")
        yield update_components

408
tests/test_agents.py Normal file
View File

@@ -0,0 +1,408 @@
import pdb
from dotenv import load_dotenv
load_dotenv()
import sys
sys.path.append(".")
import asyncio
import os
import sys
from pprint import pprint
from browser_use import Agent
from browser_use.agent.views import AgentHistoryList
from src.utils import utils
async def test_browser_use_agent():
    """
    Manual end-to-end run of a single browser-use Agent with MCP tools.

    Sets up the MCP client on a CustomController, launches a CustomBrowser
    with a fresh context, runs one PDF download/rename task and prints the
    final result and errors. The context, browser and MCP client are closed
    in the ``finally`` clause.
    """
    from browser_use.browser.browser import Browser, BrowserConfig
    from browser_use.browser.context import (
        BrowserContextConfig,
        BrowserContextWindowSize,
    )
    from browser_use.agent.service import Agent
    from src.browser.custom_browser import CustomBrowser
    from src.browser.custom_context import CustomBrowserContextConfig
    from src.controller.custom_controller import CustomController
    from src.utils import llm_provider

    # Alternative models previously tried here (provider / model / temperature):
    #   openai / gpt-4o / 0.8 (OPENAI_ENDPOINT, OPENAI_API_KEY)
    #   google / gemini-2.0-flash / 0.6 (GOOGLE_API_KEY)
    #   deepseek / deepseek-reasoner / 0.8
    #   deepseek / deepseek-chat / 0.8
    #   ollama / qwen2.5:7b / 0.5
    #   ollama / deepseek-r1:14b / 0.5
    window_w, window_h = 1280, 1100

    llm = llm_provider.get_llm_model(
        provider="azure_openai",
        model_name="gpt-4o",
        temperature=0.5,
        base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
        api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
    )

    markitdown_server = {
        "command": "docker",
        "args": ["run", "--rm", "-i", "markitdown-mcp:latest"],
    }
    desktop_commander_server = {
        "command": "npx",
        "args": ["-y", "@wonderwhy-er/desktop-commander"],
    }
    mcp_server_config = {
        "mcpServers": {
            "markitdown": markitdown_server,
            "desktop-commander": desktop_commander_server,
        }
    }
    controller = CustomController()
    await controller.setup_mcp_client(mcp_server_config)

    use_own_browser = False
    disable_security = True
    use_vision = True  # Set to False when using DeepSeek
    max_actions_per_step = 10

    browser = None
    browser_context = None
    try:
        launch_args = [f"--window-size={window_w},{window_h}"]
        chrome_path = None
        if use_own_browser:
            # An empty CHROME_PATH is treated the same as an unset one.
            chrome_path = os.getenv("CHROME_PATH", None) or None
            profile_dir = os.getenv("CHROME_USER_DATA", None)
            if profile_dir:
                launch_args.append(f"--user-data-dir={profile_dir}")

        browser_config = BrowserConfig(
            headless=False,
            disable_security=disable_security,
            browser_binary_path=chrome_path,
            extra_browser_args=launch_args,
        )
        browser = CustomBrowser(config=browser_config)

        context_config = CustomBrowserContextConfig(
            trace_path="./tmp/traces",
            save_recording_path="./tmp/record_videos",
            save_downloads_path="./tmp/downloads",
            browser_window_size=BrowserContextWindowSize(
                width=window_w, height=window_h
            ),
            force_new_context=True
        )
        browser_context = await browser.new_context(config=context_config)

        agent = Agent(
            task="download pdf from https://arxiv.org/abs/2504.10458 and rename this pdf to 'GUI-r1-test.pdf'",
            llm=llm,
            browser=browser,
            browser_context=browser_context,
            controller=controller,
            use_vision=use_vision,
            max_actions_per_step=max_actions_per_step,
            generate_gif=True
        )
        history: AgentHistoryList = await agent.run(max_steps=100)

        print("Final Result:")
        pprint(history.final_result(), indent=4)

        print("\nErrors:")
        pprint(history.errors(), indent=4)
    except Exception:
        import traceback
        traceback.print_exc()
    finally:
        if browser_context:
            await browser_context.close()
        if browser:
            await browser.close()
        if controller:
            await controller.close_mcp_client()
async def test_browser_use_parallel():
    """
    Manual run of several browser-use Agents in parallel on one browser.

    All agents share the same CustomBrowser and MCP-enabled controller and
    are awaited together with ``asyncio.gather``; each agent's final result
    and errors are printed afterwards. The browser context, browser and MCP
    client are closed in the ``finally`` clause.
    """
    from browser_use.agent.service import Agent
    from browser_use.browser.browser import BrowserConfig
    from browser_use.browser.context import BrowserContextWindowSize
    from src.browser.custom_browser import CustomBrowser
    from src.browser.custom_context import CustomBrowserContextConfig
    from src.controller.custom_controller import CustomController
    from src.utils import llm_provider

    window_w, window_h = 1280, 1100

    llm = llm_provider.get_llm_model(
        provider="azure_openai",
        model_name="gpt-4o",
        temperature=0.5,
        base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
        api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
    )

    # A "filesystem" server (npx -y @modelcontextprotocol/server-filesystem
    # <workspace dir>) can be added here as well if needed.
    mcp_server_config = {
        "mcpServers": {
            "markitdown": {
                "command": "docker",
                "args": [
                    "run",
                    "--rm",
                    "-i",
                    "markitdown-mcp:latest"
                ]
            },
            "desktop-commander": {
                "command": "npx",
                "args": [
                    "-y",
                    "@wonderwhy-er/desktop-commander"
                ]
            },
        }
    }
    controller = CustomController()
    await controller.setup_mcp_client(mcp_server_config)
    use_own_browser = False
    disable_security = True

    # More ideas: Reddit front page title, NASA image of the day, top story
    # on CNN, latest SpaceX launch date, population of Paris, trending topics.
    tasks = [
        'Search Google for weather in Tokyo',
        'Find current time in Sydney',
        'Check who won last Super Bowl',
    ]

    browser = None
    browser_context = None

    try:
        extra_chromium_args = [f"--window-size={window_w},{window_h}"]
        if use_own_browser:
            chrome_path = os.getenv("CHROME_PATH", None)
            if chrome_path == "":
                chrome_path = None
            chrome_user_data = os.getenv("CHROME_USER_DATA", None)
            if chrome_user_data:
                extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
        else:
            chrome_path = None
        browser = CustomBrowser(
            config=BrowserConfig(
                headless=False,
                disable_security=disable_security,
                browser_binary_path=chrome_path,
                extra_browser_args=extra_chromium_args,
            )
        )
        # NOTE(review): this context is not passed to the agents below (they
        # only receive ``browser``); kept so the set-up matches the original.
        browser_context = await browser.new_context(
            config=CustomBrowserContextConfig(
                trace_path="./tmp/traces",
                save_recording_path="./tmp/record_videos",
                save_downloads_path="./tmp/downloads",
                browser_window_size=BrowserContextWindowSize(
                    width=window_w, height=window_h
                ),
                force_new_context=True
            )
        )
        agents = [
            Agent(task=task, llm=llm, browser=browser, controller=controller)
            for task in tasks
        ]

        # gather() returns one history per agent, in the order of ``agents``.
        histories = await asyncio.gather(*(agent.run() for agent in agents))

        for task, history in zip(tasks, histories):
            print(f"\nTask: {task}")
            print("Final Result:")
            pprint(history.final_result(), indent=4)

            print("\nErrors:")
            pprint(history.errors(), indent=4)

    except Exception:
        import traceback

        traceback.print_exc()
    finally:
        if browser_context:
            await browser_context.close()
        if browser:
            await browser.close()
        if controller:
            await controller.close_mcp_client()
async def test_deep_research_agent():
    """
    Manual run of the DeepResearchAgent on a fixed topic.

    Builds the agent with an OpenAI model and one MCP server, runs (or
    resumes) the research task and prints a summary of the returned result
    dictionary. Any exception raised by the run is printed, not re-raised.
    """
    from src.agent.deep_research.deep_research_agent import DeepResearchAgent, PLAN_FILENAME, REPORT_FILENAME
    from src.utils import llm_provider

    # A "bedrock" provider can be substituted here via get_llm_model.
    llm = llm_provider.get_llm_model(
        provider="openai",
        model_name="gpt-4o",
        temperature=0.5
    )

    desktop_commander = {
        "command": "npx",
        "args": ["-y", "@wonderwhy-er/desktop-commander"],
    }
    mcp_server_config = {"mcpServers": {"desktop-commander": desktop_commander}}

    browser_config = {
        "headless": False,
        "window_width": 1280,
        "window_height": 1100,
        "use_own_browser": False,
    }
    agent = DeepResearchAgent(llm=llm, browser_config=browser_config, mcp_server_config=mcp_server_config)

    research_topic = "Impact of Microplastics on Marine Ecosystems"
    task_id_to_resume = "815460fb-337a-4850-8fa4-a5f2db301a89"  # Set this to resume a previous task ID
    print(f"Starting research on: {research_topic}")

    try:
        # run() is awaited to completion and hands back the result dictionary.
        result = await agent.run(
            research_topic,
            task_id=task_id_to_resume,
            save_dir="./tmp/deep_research",
            max_parallel_browsers=1,
        )

        print("\n--- Research Process Ended ---")
        for label, key in (("Status", "status"), ("Message", "message"), ("Task ID", "task_id")):
            print(f"{label}: {result.get(key)}")

        # The report, if any, is carried inside the final state.
        final_state = result.get('final_state', {})
        if not final_state:
            print("Final state information not available.")
        else:
            print("\n--- Final State Summary ---")
            completed_steps = sum(
                1
                for item in final_state.get('research_plan', [])
                if item.get('status') == 'completed'
            )
            print(f" Plan Steps Completed: {completed_steps}")
            logged_results = len(final_state.get('search_results', []))
            print(f" Total Search Results Logged: {logged_results}")
            if final_state.get("final_report"):
                print(" Final Report: Generated (content omitted). You can find it in the output directory.")
            else:
                print(" Final Report: Not generated.")
    except Exception as exc:
        print("\n--- An unhandled error occurred outside the agent run ---")
        print(exc)
if __name__ == "__main__":
    # Pick the scenario to run; the alternatives are
    # test_browser_use_agent and test_browser_use_parallel.
    selected_scenario = test_deep_research_agent
    asyncio.run(selected_scenario())

View File

@@ -1,364 +0,0 @@
import pdb
from dotenv import load_dotenv
load_dotenv()
import sys
sys.path.append(".")
import asyncio
import os
import sys
from pprint import pprint
from browser_use import Agent
from browser_use.agent.views import AgentHistoryList
from src.utils import utils
async def test_browser_use_org():
    """
    Manual run of the stock browser-use Agent against a local Ollama model.

    Opens a Browser and a context, runs one Google search task and prints
    the final result, errors, model actions and model thoughts. The browser
    is closed in a ``finally`` clause so it does not stay open when the
    agent run raises.
    """
    from browser_use.browser.browser import Browser, BrowserConfig
    from browser_use.browser.context import (
        BrowserContextConfig,
        BrowserContextWindowSize,
    )

    # Alternatives previously used: azure_openai / gpt-4o (temperature 0.8,
    # AZURE_OPENAI_ENDPOINT + AZURE_OPENAI_API_KEY) and deepseek /
    # deepseek-chat (temperature 0.8).
    llm = utils.get_llm_model(
        provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
    )

    window_w, window_h = 1920, 1080
    use_vision = False

    use_own_browser = False
    if use_own_browser:
        chrome_path = os.getenv("CHROME_PATH", None)
        if chrome_path == "":
            chrome_path = None
    else:
        chrome_path = None

    tool_calling_method = "json_schema"  # use json_schema when using ollama

    browser = Browser(
        config=BrowserConfig(
            headless=False,
            disable_security=True,
            chrome_instance_path=chrome_path,
            extra_chromium_args=[f"--window-size={window_w},{window_h}"],
        )
    )
    try:
        async with await browser.new_context(
                config=BrowserContextConfig(
                    trace_path="./tmp/traces",
                    save_recording_path="./tmp/record_videos",
                    no_viewport=False,
                    browser_window_size=BrowserContextWindowSize(
                        width=window_w, height=window_h
                    ),
                )
        ) as browser_context:
            agent = Agent(
                task="go to google.com and type 'OpenAI' click search and give me the first url",
                llm=llm,
                browser_context=browser_context,
                use_vision=use_vision,
                tool_calling_method=tool_calling_method
            )
            history: AgentHistoryList = await agent.run(max_steps=10)

            print("Final Result:")
            pprint(history.final_result(), indent=4)

            print("\nErrors:")
            pprint(history.errors(), indent=4)

            # e.g. xPaths the model clicked on
            print("\nModel Outputs:")
            pprint(history.model_actions(), indent=4)

            print("\nThoughts:")
            pprint(history.model_thoughts(), indent=4)
    finally:
        # Close the browser whether or not the run above succeeded.
        await browser.close()
async def test_browser_use_custom():
    """Run the project's ``CustomAgent`` in a ``CustomBrowser`` with Azure OpenAI gpt-4o.

    The agent is asked to open three sites in separate tabs.  The final
    result, errors, model actions and model thoughts from the run history are
    pretty-printed.  Any exception is printed as a traceback instead of
    propagating, and the browser context and browser are closed in all cases.
    """
    from browser_use.browser.browser import BrowserConfig
    from browser_use.browser.context import BrowserContextWindowSize

    from src.agent.custom_agent import CustomAgent
    from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
    from src.browser.custom_browser import CustomBrowser
    from src.browser.custom_context import BrowserContextConfig
    from src.controller.custom_controller import CustomController

    window_w, window_h = 1280, 1100

    # Alternative models, kept for manual switching:
    # llm = utils.get_llm_model(
    #     provider="openai", model_name="gpt-4o", temperature=0.8,
    #     base_url=os.getenv("OPENAI_ENDPOINT", ""),
    #     api_key=os.getenv("OPENAI_API_KEY", ""),
    # )
    # llm = utils.get_llm_model(
    #     provider="google", model_name="gemini-2.0-flash", temperature=0.6,
    #     api_key=os.getenv("GOOGLE_API_KEY", ""),
    # )
    # llm = utils.get_llm_model(provider="deepseek", model_name="deepseek-reasoner", temperature=0.8)
    # llm = utils.get_llm_model(provider="deepseek", model_name="deepseek-chat", temperature=0.8)
    # llm = utils.get_llm_model(provider="ollama", model_name="qwen2.5:7b", temperature=0.5)
    # llm = utils.get_llm_model(provider="ollama", model_name="deepseek-r1:14b", temperature=0.5)
    llm = utils.get_llm_model(
        provider="azure_openai",
        model_name="gpt-4o",
        temperature=0.5,
        base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
        api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
    )

    controller = CustomController()
    use_own_browser = True
    disable_security = True
    use_vision = True  # Set to False when using DeepSeek
    max_actions_per_step = 10

    # Pre-declared so the ``finally`` clause can tell what was actually created.
    browser = None
    browser_context = None

    try:
        extra_chromium_args = [f"--window-size={window_w},{window_h}"]
        chrome_path = None
        if use_own_browser:
            # An unset or empty CHROME_PATH both mean "no custom Chrome binary".
            chrome_path = os.getenv("CHROME_PATH") or None
            chrome_user_data = os.getenv("CHROME_USER_DATA", None)
            if chrome_user_data:
                extra_chromium_args.append(f"--user-data-dir={chrome_user_data}")

        browser = CustomBrowser(
            config=BrowserConfig(
                headless=False,
                disable_security=disable_security,
                chrome_instance_path=chrome_path,
                extra_chromium_args=extra_chromium_args,
            )
        )
        browser_context = await browser.new_context(
            config=BrowserContextConfig(
                trace_path="./tmp/traces",
                save_recording_path="./tmp/record_videos",
                no_viewport=False,
                browser_window_size=BrowserContextWindowSize(
                    width=window_w, height=window_h
                ),
            )
        )
        agent = CustomAgent(
            task="open youtube in tab 1 , open google email in tab 2, open facebook in tab 3",
            add_infos="",  # some hints for llm to complete the task
            llm=llm,
            browser=browser,
            browser_context=browser_context,
            controller=controller,
            system_prompt_class=CustomSystemPrompt,
            agent_prompt_class=CustomAgentMessagePrompt,
            use_vision=use_vision,
            max_actions_per_step=max_actions_per_step,
            generate_gif=True,
        )
        history: AgentHistoryList = await agent.run(max_steps=100)

        print("Final Result:")
        pprint(history.final_result(), indent=4)

        print("\nErrors:")
        pprint(history.errors(), indent=4)

        # e.g. xPaths the model clicked on
        print("\nModel Outputs:")
        pprint(history.model_actions(), indent=4)

        print("\nThoughts:")
        pprint(history.model_thoughts(), indent=4)

    except Exception:
        # Manual scenario: report the failure but still fall through to cleanup.
        import traceback

        traceback.print_exc()
    finally:
        # Explicitly close the context first, then the browser it was created from.
        if browser_context:
            await browser_context.close()
        if browser:
            await browser.close()
async def test_browser_use_parallel():
    """Run several stock ``Agent`` tasks concurrently in one shared browser.

    One ``Agent`` is created per task, all sharing a single non-headless
    ``Browser``.  The agents are awaited together with ``asyncio.gather`` and
    each agent's history (final result, errors, model actions, model thoughts)
    is pretty-printed under its task.  Any exception is printed as a traceback
    instead of propagating, and the browser is closed in all cases.
    """
    from browser_use.browser.browser import Browser, BrowserConfig

    from src.browser.custom_context import BrowserContextConfig

    # Alternative models, kept for manual switching:
    # llm = utils.get_llm_model(
    #     provider="openai", model_name="gpt-4o", temperature=0.8,
    #     base_url=os.getenv("OPENAI_ENDPOINT", ""),
    #     api_key=os.getenv("OPENAI_API_KEY", ""),
    # )
    # llm = utils.get_llm_model(
    #     provider="azure_openai", model_name="gpt-4o", temperature=0.8,
    #     base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
    #     api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
    # )
    # llm = utils.get_llm_model(provider="deepseek", model_name="deepseek-reasoner", temperature=0.8)
    # llm = utils.get_llm_model(provider="deepseek", model_name="deepseek-chat", temperature=0.8)
    # llm = utils.get_llm_model(provider="ollama", model_name="qwen2.5:7b", temperature=0.5)
    # llm = utils.get_llm_model(provider="ollama", model_name="deepseek-r1:14b", temperature=0.5)
    # NOTE(review): other Gemini configurations pass provider="google" -
    # confirm that "gemini" is still accepted by utils.get_llm_model.
    llm = utils.get_llm_model(
        provider="gemini",
        model_name="gemini-2.0-flash-exp",
        temperature=1.0,
        api_key=os.getenv("GOOGLE_API_KEY", "")
    )

    tasks = [
        'Search Google for weather in Tokyo',
        'Check Reddit front page title',
        'Find NASA image of the day',
        'Check top story on CNN',
        # 'Search latest SpaceX launch date',
        # 'Look up population of Paris',
        # 'Find current time in Sydney',
        # 'Check who won last Super Bowl',
        # 'Search trending topics on Twitter',
    ]

    browser = Browser(
        config=BrowserConfig(
            disable_security=True,
            headless=False,
            new_context_config=BrowserContextConfig(save_recording_path='./tmp/recordings'),
        )
    )

    try:
        agents = [Agent(task=task, llm=llm, browser=browser) for task in tasks]

        # gather() returns a list with one result per awaitable, in the same
        # order as the inputs, so each history must be reported individually.
        histories = await asyncio.gather(*(agent.run() for agent in agents))

        for task, history in zip(tasks, histories):
            print(f"\n===== Task: {task} =====")

            print("Final Result:")
            pprint(history.final_result(), indent=4)

            print("\nErrors:")
            pprint(history.errors(), indent=4)

            # e.g. xPaths the model clicked on
            print("\nModel Outputs:")
            pprint(history.model_actions(), indent=4)

            print("\nThoughts:")
            pprint(history.model_thoughts(), indent=4)

    except Exception:
        # Manual scenario: report the failure but still fall through to cleanup.
        import traceback

        traceback.print_exc()
    finally:
        # The shared browser is the only resource opened by this function.
        await browser.close()
if __name__ == "__main__":
    # Manual entry point: exactly one scenario runs per invocation.
    # Uncomment the scenario you want and comment out the others.
    # asyncio.run(test_browser_use_org())
    # asyncio.run(test_browser_use_parallel())
    asyncio.run(test_browser_use_custom())

120
tests/test_controller.py Normal file
View File

@@ -0,0 +1,120 @@
import asyncio
import pdb
import sys
import time
sys.path.append(".")
from dotenv import load_dotenv
load_dotenv()
async def test_mcp_client():
    """Print name, description and parameter schema of every configured MCP tool.

    Passes a Playwright and a filesystem MCP server configuration (both
    launched through ``npx``) to ``setup_mcp_client_and_tools`` and, for each
    returned tool, prints its name, its description and the JSON schema of the
    parameter model built by ``create_tool_param_model``.
    """
    from src.utils.mcp_client import setup_mcp_client_and_tools, create_tool_param_model

    test_server_config = {
        "playwright": {
            "command": "npx",
            "args": [
                "@playwright/mcp@latest",
            ],
            "transport": "stdio",
        },
        "filesystem": {
            "command": "npx",
            "args": [
                "-y",
                "@modelcontextprotocol/server-filesystem",
                # NOTE(review): developer-specific directory - point this at a
                # path that exists on the machine running the test.
                "/Users/warmshao/ai_workspace",
            ]
        }
    }

    # NOTE(review): mcp_client is never shut down here - confirm whether the
    # object returned by setup_mcp_client_and_tools needs an explicit close.
    mcp_tools, mcp_client = await setup_mcp_client_and_tools(test_server_config)

    for tool in mcp_tools:
        tool_param_model = create_tool_param_model(tool)
        print(tool.name)
        print(tool.description)
        print(tool_param_model.model_json_schema())
async def test_controller_with_mcp():
    """Execute a shell command through the desktop-commander MCP tool.

    Registers the configured MCP servers on a ``CustomController``, runs
    ``python ./tmp/test.py`` through the ``execute_command`` action and prints
    the extracted content.  When the reply says the command is still running
    and its first line carries a PID, the ``read_output`` action is polled once
    per second until it returns non-empty content, which is then printed.
    The MCP client is closed even when a step fails.
    """
    from src.controller.custom_controller import CustomController

    mcp_server_config = {
        "mcpServers": {
            "markitdown": {
                "command": "docker",
                "args": [
                    "run",
                    "--rm",
                    "-i",
                    "markitdown-mcp:latest"
                ]
            },
            "desktop-commander": {
                "command": "npx",
                "args": [
                    "-y",
                    "@wonderwhy-er/desktop-commander"
                ]
            },
            # "filesystem": {
            #     "command": "npx",
            #     "args": [
            #         "-y",
            #         "@modelcontextprotocol/server-filesystem",
            #         "/Users/xxx/ai_workspace",
            #     ]
            # },
        }
    }

    controller = CustomController()
    await controller.setup_mcp_client(mcp_server_config)

    def build_action(action_model_cls, action_name, params):
        """Validate *params* for *action_name* and wrap them in an action model."""
        param_model = controller.registry.registry.actions[action_name].param_model
        print(param_model.model_json_schema())
        validated_params = param_model(**params)
        return action_model_cls(**{action_name: validated_params})

    try:
        action_model_cls = controller.registry.create_action_model()

        execute_action = build_action(
            action_model_cls,
            "mcp.desktop-commander.execute_command",
            {"command": "python ./tmp/test.py"},
        )
        result = (await controller.act(execute_action)).extracted_content
        print(result)

        first_line = result.split("\n")[0] if result else ""
        still_running = bool(result) and (
            "Command is still running. Use read_output to get more output." in result
        )
        if still_running and "PID" in first_line:
            # The text after the last "PID" on the first line is the process id.
            pid = int(first_line.split("PID")[-1].strip())
            read_action = build_action(
                action_model_cls,
                "mcp.desktop-commander.read_output",
                {"pid": pid},
            )

            output_result = ""
            while True:
                # asyncio.sleep yields to the event loop between polls;
                # time.sleep would block the whole loop for the duration.
                await asyncio.sleep(1)
                result = (await controller.act(read_action)).extracted_content
                if result:
                    output_result = result
                    break
            print(output_result)
    finally:
        # Shut the MCP client down even if an action lookup or call failed.
        await controller.close_mcp_client()
if __name__ == '__main__':
    # Manual entry point: exactly one scenario runs per invocation.
    # Uncomment the scenario you want and comment out the other.
    # asyncio.run(test_mcp_client())
    asyncio.run(test_controller_with_mcp())

View File

@@ -1,30 +0,0 @@
import asyncio
import os
from dotenv import load_dotenv
load_dotenv()
import sys
sys.path.append(".")
async def test_deep_research():
    """Run one ``deep_research`` pass on a DeepSeek-R1 report task with a Gemini model.

    The call is limited to one search iteration and three queries and does not
    use the user's own browser.  The returned pair (report content, report
    file path) is unpacked but not inspected further.
    """
    from src.utils.deep_research import deep_research
    from src.utils import utils

    # Model settings, spelled out once and expanded into the factory call.
    model_settings = {
        "provider": "gemini",
        "model_name": "gemini-2.0-flash-thinking-exp-01-21",
        "temperature": 1.0,
        "api_key": os.getenv("GOOGLE_API_KEY", ""),
    }
    gemini_llm = utils.get_llm_model(**model_settings)

    # Keep the run small: a single search iteration with at most three queries.
    research_options = {
        "agent_state": None,
        "max_search_iterations": 1,
        "max_query_num": 3,
        "use_own_browser": False,
    }
    report_content, report_file_path = await deep_research(
        task="write a report about DeepSeek-R1, get its pdf",
        llm=gemini_llm,
        **research_options,
    )
if __name__ == "__main__":
    # Manual entry point: run the deep-research scenario when executed as a script.
    asyncio.run(test_deep_research())

View File

@@ -12,6 +12,7 @@ import sys
sys.path.append(".")
@dataclass
class LLMConfig:
provider: str
@@ -20,6 +21,7 @@ class LLMConfig:
base_url: str = None
api_key: str = None
def create_message_content(text, image_path=None):
content = [{"type": "text", "text": text}]
image_format = "png" if image_path and image_path.endswith(".png") else "jpeg"
@@ -32,6 +34,7 @@ def create_message_content(text, image_path=None):
})
return content
def get_env_value(key, provider):
env_mappings = {
"openai": {"api_key": "OPENAI_API_KEY", "base_url": "OPENAI_ENDPOINT"},
@@ -40,7 +43,7 @@ def get_env_value(key, provider):
"deepseek": {"api_key": "DEEPSEEK_API_KEY", "base_url": "DEEPSEEK_ENDPOINT"},
"mistral": {"api_key": "MISTRAL_API_KEY", "base_url": "MISTRAL_ENDPOINT"},
"alibaba": {"api_key": "ALIBABA_API_KEY", "base_url": "ALIBABA_ENDPOINT"},
"moonshot":{"api_key": "MOONSHOT_API_KEY", "base_url": "MOONSHOT_ENDPOINT"},
"moonshot": {"api_key": "MOONSHOT_API_KEY", "base_url": "MOONSHOT_ENDPOINT"},
"ibm": {"api_key": "IBM_API_KEY", "base_url": "IBM_ENDPOINT"}
}
@@ -48,13 +51,14 @@ def get_env_value(key, provider):
return os.getenv(env_mappings[provider][key], "")
return ""
def test_llm(config, query, image_path=None, system_message=None):
from src.utils import utils
from src.utils import utils, llm_provider
# Special handling for Ollama-based models
if config.provider == "ollama":
if "deepseek-r1" in config.model_name:
from src.utils.llm import DeepSeekR1ChatOllama
from src.utils.llm_provider import DeepSeekR1ChatOllama
llm = DeepSeekR1ChatOllama(model=config.model_name)
else:
llm = ChatOllama(model=config.model_name)
@@ -66,7 +70,7 @@ def test_llm(config, query, image_path=None, system_message=None):
return
# For other providers, use the standard configuration
llm = utils.get_llm_model(
llm = llm_provider.get_llm_model(
provider=config.provider,
model_name=config.model_name,
temperature=config.temperature,
@@ -86,58 +90,70 @@ def test_llm(config, query, image_path=None, system_message=None):
print(ai_msg.reasoning_content)
print(ai_msg.content)
if config.provider == "deepseek" and "deepseek-reasoner" in config.model_name:
print(llm.model_name)
pdb.set_trace()
def test_openai_model():
    """Run ``test_llm`` against OpenAI ``gpt-4o`` with the sample image."""
    test_llm(
        LLMConfig(provider="openai", model_name="gpt-4o"),
        query="Describe this image",
        image_path="assets/examples/test.png",
    )
def test_google_model():
    """Run ``test_llm`` against Google ``gemini-2.0-flash-exp`` with the sample image."""
    # Enable your API key first if you haven't: https://ai.google.dev/palm_docs/oauth_quickstart
    test_llm(
        LLMConfig(provider="google", model_name="gemini-2.0-flash-exp"),
        query="Describe this image",
        image_path="assets/examples/test.png",
    )
def test_azure_openai_model():
    """Run ``test_llm`` against Azure OpenAI ``gpt-4o`` with the sample image."""
    test_llm(
        LLMConfig(provider="azure_openai", model_name="gpt-4o"),
        query="Describe this image",
        image_path="assets/examples/test.png",
    )
def test_deepseek_model():
    """Run ``test_llm`` against DeepSeek ``deepseek-chat`` with a text-only prompt."""
    test_llm(
        LLMConfig(provider="deepseek", model_name="deepseek-chat"),
        query="Who are you?",
    )
def test_deepseek_r1_model():
    """Run ``test_llm`` against DeepSeek ``deepseek-reasoner`` with a system message."""
    test_llm(
        LLMConfig(provider="deepseek", model_name="deepseek-reasoner"),
        query="Which is greater, 9.11 or 9.8?",
        system_message="You are a helpful AI assistant.",
    )
def test_ollama_model():
    """Run ``test_llm`` against Ollama ``qwen2.5:7b`` with a text-only prompt."""
    test_llm(
        LLMConfig(provider="ollama", model_name="qwen2.5:7b"),
        query="Sing a ballad of LangChain.",
    )
def test_deepseek_r1_ollama_model():
    """Run ``test_llm`` against Ollama ``deepseek-r1:14b`` with a text-only prompt."""
    test_llm(
        LLMConfig(provider="ollama", model_name="deepseek-r1:14b"),
        query="How many 'r's are in the word 'strawberry'?",
    )
def test_mistral_model():
    """Run ``test_llm`` against Mistral ``pixtral-large-latest`` with the sample image."""
    test_llm(
        LLMConfig(provider="mistral", model_name="pixtral-large-latest"),
        query="Describe this image",
        image_path="assets/examples/test.png",
    )
def test_moonshot_model():
    """Run ``test_llm`` against Moonshot ``moonshot-v1-32k-vision-preview`` with the sample image."""
    test_llm(
        LLMConfig(provider="moonshot", model_name="moonshot-v1-32k-vision-preview"),
        query="Describe this image",
        image_path="assets/examples/test.png",
    )
def test_ibm_model():
    """Run ``test_llm`` against the IBM-hosted Llama 4 Maverick model with the sample image."""
    test_llm(
        LLMConfig(provider="ibm", model_name="meta-llama/llama-4-maverick-17b-128e-instruct-fp8"),
        query="Describe this image",
        image_path="assets/examples/test.png",
    )
def test_qwen_model():
    """Run ``test_llm`` against Alibaba ``qwen3-30b-a3b`` with a text-only prompt."""
    test_llm(
        LLMConfig(provider="alibaba", model_name="qwen3-30b-a3b"),
        query="How many 'r's are in the word 'strawberry'?",
    )
if __name__ == "__main__":
# test_openai_model()
# test_google_model()
# test_azure_openai_model()
#test_deepseek_model()
# test_deepseek_model()
# test_ollama_model()
# test_deepseek_r1_model()
test_deepseek_r1_model()
# test_deepseek_r1_ollama_model()
# test_mistral_model()
test_ibm_model()
# test_ibm_model()
# test_qwen_model()

1189
webui.py

File diff suppressed because it is too large Load Diff