fixed file formatting

This commit is contained in:
Gregor Žunič
2025-01-07 09:11:13 -08:00
parent 5539a838bf
commit 54266d8edf
10 changed files with 238 additions and 234 deletions

View File

@@ -4,99 +4,85 @@
# @ProjectName: browser-use-webui
# @FileName: custom_agent.py
import asyncio
import base64
import io
import json
import logging
import os
import pdb
import textwrap
import time
import uuid
from io import BytesIO
from pathlib import Path
from typing import Any, Optional, Type, TypeVar
from typing import Optional, Type
from dotenv import load_dotenv
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import (
BaseMessage,
SystemMessage,
)
from openai import RateLimitError
from PIL import Image, ImageDraw, ImageFont
from pydantic import BaseModel, ValidationError
from browser_use.agent.message_manager.service import MessageManager
from browser_use.agent.prompts import AgentMessagePrompt, SystemPrompt
from browser_use.agent.prompts import SystemPrompt
from browser_use.agent.service import Agent
from browser_use.agent.views import (
ActionResult,
AgentError,
AgentHistory,
AgentHistoryList,
AgentOutput,
AgentStepInfo,
)
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext
from browser_use.browser.views import BrowserState, BrowserStateHistory
from browser_use.controller.registry.views import ActionModel
from browser_use.controller.service import Controller
from browser_use.dom.history_tree_processor.service import (
DOMHistoryElement,
HistoryTreeProcessor,
)
from browser_use.telemetry.service import ProductTelemetry
from browser_use.telemetry.views import (
AgentEndTelemetryEvent,
AgentRunTelemetryEvent,
AgentStepErrorTelemetryEvent,
)
from browser_use.utils import time_execution_async
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import (
BaseMessage,
)
from .custom_views import CustomAgentOutput, CustomAgentStepInfo
from .custom_massage_manager import CustomMassageManager
from .custom_views import CustomAgentOutput, CustomAgentStepInfo
logger = logging.getLogger(__name__)
class CustomAgent(Agent):
def __init__(
self,
task: str,
llm: BaseChatModel,
add_infos: str = '',
browser: Browser | None = None,
browser_context: BrowserContext | None = None,
controller: Controller = Controller(),
use_vision: bool = True,
save_conversation_path: Optional[str] = None,
max_failures: int = 5,
retry_delay: int = 10,
system_prompt_class: Type[SystemPrompt] = SystemPrompt,
max_input_tokens: int = 128000,
validate_output: bool = False,
include_attributes: list[str] = [
'title',
'type',
'name',
'role',
'tabindex',
'aria-label',
'placeholder',
'value',
'alt',
'aria-expanded',
],
max_error_length: int = 400,
max_actions_per_step: int = 10,
self,
task: str,
llm: BaseChatModel,
add_infos: str = "",
browser: Browser | None = None,
browser_context: BrowserContext | None = None,
controller: Controller = Controller(),
use_vision: bool = True,
save_conversation_path: Optional[str] = None,
max_failures: int = 5,
retry_delay: int = 10,
system_prompt_class: Type[SystemPrompt] = SystemPrompt,
max_input_tokens: int = 128000,
validate_output: bool = False,
include_attributes: list[str] = [
"title",
"type",
"name",
"role",
"tabindex",
"aria-label",
"placeholder",
"value",
"alt",
"aria-expanded",
],
max_error_length: int = 400,
max_actions_per_step: int = 10,
):
super().__init__(task, llm, browser, browser_context, controller, use_vision, save_conversation_path,
max_failures, retry_delay, system_prompt_class, max_input_tokens, validate_output,
include_attributes, max_error_length, max_actions_per_step)
super().__init__(
task,
llm,
browser,
browser_context,
controller,
use_vision,
save_conversation_path,
max_failures,
retry_delay,
system_prompt_class,
max_input_tokens,
validate_output,
include_attributes,
max_error_length,
max_actions_per_step,
)
self.add_infos = add_infos
self.message_manager = CustomMassageManager(
llm=self.llm,
@@ -118,24 +104,26 @@ class CustomAgent(Agent):
def _log_response(self, response: CustomAgentOutput) -> None:
"""Log the model's response"""
if 'Success' in response.current_state.prev_action_evaluation:
emoji = ''
elif 'Failed' in response.current_state.prev_action_evaluation:
emoji = ''
if "Success" in response.current_state.prev_action_evaluation:
emoji = ""
elif "Failed" in response.current_state.prev_action_evaluation:
emoji = ""
else:
emoji = '🤷'
emoji = "🤷"
logger.info(f'{emoji} Eval: {response.current_state.prev_action_evaluation}')
logger.info(f'🧠 New Memory: {response.current_state.important_contents}')
logger.info(f'⏳ Task Progress: {response.current_state.completed_contents}')
logger.info(f'🤔 Thought: {response.current_state.thought}')
logger.info(f'🎯 Summary: {response.current_state.summary}')
logger.info(f"{emoji} Eval: {response.current_state.prev_action_evaluation}")
logger.info(f"🧠 New Memory: {response.current_state.important_contents}")
logger.info(f"⏳ Task Progress: {response.current_state.completed_contents}")
logger.info(f"🤔 Thought: {response.current_state.thought}")
logger.info(f"🎯 Summary: {response.current_state.summary}")
for i, action in enumerate(response.action):
logger.info(
f'🛠️ Action {i + 1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}'
f"🛠️ Action {i + 1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}"
)
def update_step_info(self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None):
def update_step_info(
self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None
):
"""
update step info
"""
@@ -144,19 +132,23 @@ class CustomAgent(Agent):
step_info.step_number += 1
important_contents = model_output.current_state.important_contents
if important_contents and 'None' not in important_contents and important_contents not in step_info.memory:
step_info.memory += important_contents + '\n'
if (
important_contents
and "None" not in important_contents
and important_contents not in step_info.memory
):
step_info.memory += important_contents + "\n"
completed_contents = model_output.current_state.completed_contents
if completed_contents and 'None' not in completed_contents:
if completed_contents and "None" not in completed_contents:
step_info.task_progress = completed_contents
@time_execution_async('--get_next_action')
@time_execution_async("--get_next_action")
async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
"""Get next action from LLM based on current state"""
ret = self.llm.invoke(input_messages)
parsed_json = json.loads(ret.content.replace('```json', '').replace("```", ""))
parsed_json = json.loads(ret.content.replace("```json", "").replace("```", ""))
parsed: AgentOutput = self.AgentOutput(**parsed_json)
# cut the number of actions to max_actions_per_step
parsed.action = parsed.action[: self.max_actions_per_step]
@@ -165,10 +157,10 @@ class CustomAgent(Agent):
return parsed
@time_execution_async('--step')
@time_execution_async("--step")
async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
"""Execute one step of the task"""
logger.info(f'\n📍 Step {self.n_steps}')
logger.info(f"\n📍 Step {self.n_steps}")
state = None
model_output = None
result: list[ActionResult] = []
@@ -179,7 +171,7 @@ class CustomAgent(Agent):
input_messages = self.message_manager.get_messages()
model_output = await self.get_next_action(input_messages)
self.update_step_info(model_output, step_info)
logger.info(f'🧠 All Memory: {step_info.memory}')
logger.info(f"🧠 All Memory: {step_info.memory}")
self._save_conversation(input_messages, model_output)
self.message_manager._remove_last_state_message() # we don't want the whole state in the chat history
self.message_manager.add_model_output(model_output)
@@ -190,7 +182,7 @@ class CustomAgent(Agent):
self._last_result = result
if len(result) > 0 and result[-1].is_done:
logger.info(f'📄 Result: {result[-1].extracted_content}')
logger.info(f"📄 Result: {result[-1].extracted_content}")
self.consecutive_failures = 0
@@ -215,7 +207,7 @@ class CustomAgent(Agent):
async def run(self, max_steps: int = 100) -> AgentHistoryList:
"""Execute the task with maximum number of steps"""
try:
logger.info(f'🚀 Starting task: {self.task}')
logger.info(f"🚀 Starting task: {self.task}")
self.telemetry.capture(
AgentRunTelemetryEvent(
@@ -224,13 +216,14 @@ class CustomAgent(Agent):
)
)
step_info = CustomAgentStepInfo(task=self.task,
add_infos=self.add_infos,
step_number=1,
max_steps=max_steps,
memory='',
task_progress=''
)
step_info = CustomAgentStepInfo(
task=self.task,
add_infos=self.add_infos,
step_number=1,
max_steps=max_steps,
memory="",
task_progress="",
)
for step in range(max_steps):
if self._too_many_failures():
@@ -240,15 +233,15 @@ class CustomAgent(Agent):
if self.history.is_done():
if (
self.validate_output and step < max_steps - 1
self.validate_output and step < max_steps - 1
): # on the last step, we don't need to validate
if not await self._validate_output():
continue
logger.info('✅ Task completed successfully')
logger.info("✅ Task completed successfully")
break
else:
logger.info('❌ Failed to complete task in maximum steps')
logger.info("❌ Failed to complete task in maximum steps")
return self.history

View File

@@ -7,23 +7,17 @@
from __future__ import annotations
import logging
from datetime import datetime
from typing import List, Optional, Type
from langchain_anthropic import ChatAnthropic
from browser_use.agent.message_manager.service import MessageManager
from browser_use.agent.message_manager.views import MessageHistory
from browser_use.agent.prompts import SystemPrompt
from browser_use.agent.views import ActionResult, AgentStepInfo
from browser_use.browser.views import BrowserState
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import (
AIMessage,
BaseMessage,
HumanMessage,
)
from langchain_openai import ChatOpenAI
from browser_use.agent.message_manager.views import MessageHistory, MessageMetadata
from browser_use.agent.prompts import AgentMessagePrompt, SystemPrompt
from browser_use.agent.views import ActionResult, AgentOutput, AgentStepInfo
from browser_use.browser.views import BrowserState
from browser_use.agent.message_manager.service import MessageManager
from .custom_prompts import CustomAgentMessagePrompt
@@ -32,31 +26,40 @@ logger = logging.getLogger(__name__)
class CustomMassageManager(MessageManager):
def __init__(
self,
llm: BaseChatModel,
task: str,
action_descriptions: str,
system_prompt_class: Type[SystemPrompt],
max_input_tokens: int = 128000,
estimated_tokens_per_character: int = 3,
image_tokens: int = 800,
include_attributes: list[str] = [],
max_error_length: int = 400,
max_actions_per_step: int = 10,
self,
llm: BaseChatModel,
task: str,
action_descriptions: str,
system_prompt_class: Type[SystemPrompt],
max_input_tokens: int = 128000,
estimated_tokens_per_character: int = 3,
image_tokens: int = 800,
include_attributes: list[str] = [],
max_error_length: int = 400,
max_actions_per_step: int = 10,
):
super().__init__(llm, task, action_descriptions, system_prompt_class, max_input_tokens,
estimated_tokens_per_character, image_tokens, include_attributes, max_error_length,
max_actions_per_step)
super().__init__(
llm,
task,
action_descriptions,
system_prompt_class,
max_input_tokens,
estimated_tokens_per_character,
image_tokens,
include_attributes,
max_error_length,
max_actions_per_step,
)
# Move Task info to state_message
self.history = MessageHistory()
self._add_message_with_tokens(self.system_prompt)
def add_state_message(
self,
state: BrowserState,
result: Optional[List[ActionResult]] = None,
step_info: Optional[AgentStepInfo] = None,
self,
state: BrowserState,
result: Optional[List[ActionResult]] = None,
step_info: Optional[AgentStepInfo] = None,
) -> None:
"""Add browser state as human message"""
@@ -68,7 +71,9 @@ class CustomMassageManager(MessageManager):
msg = HumanMessage(content=str(r.extracted_content))
self._add_message_with_tokens(msg)
if r.error:
msg = HumanMessage(content=str(r.error)[-self.max_error_length:])
msg = HumanMessage(
content=str(r.error)[-self.max_error_length :]
)
self._add_message_with_tokens(msg)
result = None # if the result is already in the history, we don't want to add it again

View File

@@ -4,14 +4,12 @@
# @ProjectName: browser-use-webui
# @FileName: custom_prompts.py
from datetime import datetime
from typing import List, Optional
from langchain_core.messages import HumanMessage, SystemMessage
from browser_use.agent.views import ActionResult, AgentStepInfo
from browser_use.agent.prompts import SystemPrompt
from browser_use.agent.views import ActionResult
from browser_use.browser.views import BrowserState
from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
from langchain_core.messages import HumanMessage, SystemMessage
from .custom_views import CustomAgentStepInfo
@@ -93,7 +91,7 @@ class CustomSystemPrompt(SystemPrompt):
- Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page like saving, extracting, checkboxes...
- only use multiple actions if it makes sense.
"""
text += f' - use maximum {self.max_actions_per_step} actions per sequence'
text += f" - use maximum {self.max_actions_per_step} actions per sequence"
return text
def input_format(self) -> str:
@@ -128,7 +126,7 @@ class CustomSystemPrompt(SystemPrompt):
Returns:
str: Formatted system prompt
"""
time_str = self.current_date.strftime('%Y-%m-%d %H:%M')
time_str = self.current_date.strftime("%Y-%m-%d %H:%M")
AGENT_PROMPT = f"""You are a precise browser automation agent that interacts with websites through structured commands. Your role is to:
1. Analyze the provided webpage elements and structure
@@ -150,12 +148,12 @@ class CustomSystemPrompt(SystemPrompt):
class CustomAgentMessagePrompt:
def __init__(
self,
state: BrowserState,
result: Optional[List[ActionResult]] = None,
include_attributes: list[str] = [],
max_error_length: int = 400,
step_info: Optional[CustomAgentStepInfo] = None,
self,
state: BrowserState,
result: Optional[List[ActionResult]] = None,
include_attributes: list[str] = [],
max_error_length: int = 400,
step_info: Optional[CustomAgentStepInfo] = None,
):
self.state = state
self.result = result
@@ -182,22 +180,24 @@ class CustomAgentMessagePrompt:
if self.result:
for i, result in enumerate(self.result):
if result.extracted_content:
state_description += (
f'\nResult of action {i + 1}/{len(self.result)}: {result.extracted_content}'
)
state_description += f"\nResult of action {i + 1}/{len(self.result)}: {result.extracted_content}"
if result.error:
# only use the last max_error_length characters of the error
error = result.error[-self.max_error_length:]
state_description += f'\nError of action {i + 1}/{len(self.result)}: ...{error}'
error = result.error[-self.max_error_length :]
state_description += (
f"\nError of action {i + 1}/{len(self.result)}: ...{error}"
)
if self.state.screenshot:
# Format message for vision model
return HumanMessage(
content=[
{'type': 'text', 'text': state_description},
{"type": "text", "text": state_description},
{
'type': 'image_url',
'image_url': {'url': f'data:image/png;base64,{self.state.screenshot}'},
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{self.state.screenshot}"
},
},
]
)

View File

@@ -6,9 +6,10 @@
from dataclasses import dataclass
from typing import Type
from pydantic import BaseModel, ConfigDict, Field, ValidationError, create_model
from browser_use.controller.registry.views import ActionModel
from browser_use.agent.views import AgentOutput
from browser_use.controller.registry.views import ActionModel
from pydantic import BaseModel, ConfigDict, Field, create_model
@dataclass
@@ -43,11 +44,16 @@ class CustomAgentOutput(AgentOutput):
action: list[ActionModel]
@staticmethod
def type_with_custom_actions(custom_actions: Type[ActionModel]) -> Type['CustomAgentOutput']:
def type_with_custom_actions(
custom_actions: Type[ActionModel],
) -> Type["CustomAgentOutput"]:
"""Extend actions with custom actions"""
return create_model(
'AgentOutput',
"AgentOutput",
__base__=CustomAgentOutput,
action=(list[custom_actions], Field(...)), # Properly annotated field with no default
action=(
list[custom_actions],
Field(...),
), # Properly annotated field with no default
__module__=CustomAgentOutput.__module__,
)

View File

@@ -4,16 +4,17 @@
# @ProjectName: browser-use-webui
# @FileName: browser.py
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContextConfig, BrowserContext
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext, BrowserContextConfig
from .custom_context import CustomBrowserContext
class CustomBrowser(Browser):
async def new_context(
self, config: BrowserContextConfig = BrowserContextConfig(), context: CustomBrowserContext = None
self,
config: BrowserContextConfig = BrowserContextConfig(),
context: CustomBrowserContext = None,
) -> BrowserContext:
"""Create a browser context"""
return CustomBrowserContext(config=config, browser=self, context=context)

View File

@@ -5,26 +5,23 @@
# @Project : browser-use-webui
# @FileName: context.py
import asyncio
import base64
import json
import logging
import os
from playwright.async_api import Browser as PlaywrightBrowser
from browser_use.browser.context import BrowserContext, BrowserContextConfig
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext, BrowserContextConfig
from playwright.async_api import Browser as PlaywrightBrowser
logger = logging.getLogger(__name__)
class CustomBrowserContext(BrowserContext):
def __init__(
self,
browser: 'Browser',
config: BrowserContextConfig = BrowserContextConfig(),
context: BrowserContext = None
self,
browser: "Browser",
config: BrowserContextConfig = BrowserContextConfig(),
context: BrowserContext = None,
):
super(CustomBrowserContext, self).__init__(browser, config)
self.context = context
@@ -42,14 +39,14 @@ class CustomBrowserContext(BrowserContext):
viewport=self.config.browser_window_size,
no_viewport=False,
user_agent=(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36"
),
java_script_enabled=True,
bypass_csp=self.config.disable_security,
ignore_https_errors=self.config.disable_security,
record_video_dir=self.config.save_recording_path,
record_video_size=self.config.browser_window_size # set record video size
record_video_size=self.config.browser_window_size, # set record video size
)
if self.config.trace_path:
@@ -57,9 +54,11 @@ class CustomBrowserContext(BrowserContext):
# Load cookies if they exist
if self.config.cookies_file and os.path.exists(self.config.cookies_file):
with open(self.config.cookies_file, 'r') as f:
with open(self.config.cookies_file, "r") as f:
cookies = json.load(f)
logger.info(f'Loaded {len(cookies)} cookies from {self.config.cookies_file}')
logger.info(
f"Loaded {len(cookies)} cookies from {self.config.cookies_file}"
)
await context.add_cookies(cookies)
# Expose anti-detection scripts

View File

@@ -5,10 +5,9 @@
# @FileName: custom_action.py
import pyperclip
from browser_use.controller.service import Controller
from browser_use.agent.views import ActionResult
from browser_use.browser.context import BrowserContext
from browser_use.controller.service import Controller
class CustomController(Controller):
@@ -19,12 +18,12 @@ class CustomController(Controller):
def _register_custom_actions(self):
"""Register all custom browser actions"""
@self.registry.action('Copy text to clipboard')
@self.registry.action("Copy text to clipboard")
def copy_to_clipboard(text: str):
pyperclip.copy(text)
return ActionResult(extracted_content=text)
@self.registry.action('Paste text from clipboard', requires_browser=True)
@self.registry.action("Paste text from clipboard", requires_browser=True)
async def paste_from_clipboard(browser: BrowserContext):
text = pyperclip.paste()
# send text to browser

View File

@@ -8,10 +8,10 @@
import base64
import os
from langchain_openai import ChatOpenAI, AzureChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_ollama import ChatOllama
from langchain_openai import AzureChatOpenAI, ChatOpenAI
def get_llm_model(provider: str, **kwargs):
@@ -21,7 +21,7 @@ def get_llm_model(provider: str, **kwargs):
:param kwargs:
:return:
"""
if provider == 'anthropic':
if provider == "anthropic":
if not kwargs.get("base_url", ""):
base_url = "https://api.anthropic.com"
else:
@@ -33,12 +33,12 @@ def get_llm_model(provider: str, **kwargs):
api_key = kwargs.get("api_key")
return ChatAnthropic(
model_name=kwargs.get("model_name", 'claude-3-5-sonnet-20240620'),
model_name=kwargs.get("model_name", "claude-3-5-sonnet-20240620"),
temperature=kwargs.get("temperature", 0.0),
base_url=base_url,
api_key=api_key
api_key=api_key,
)
elif provider == 'openai':
elif provider == "openai":
if not kwargs.get("base_url", ""):
base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1")
else:
@@ -50,12 +50,12 @@ def get_llm_model(provider: str, **kwargs):
api_key = kwargs.get("api_key")
return ChatOpenAI(
model=kwargs.get("model_name", 'gpt-4o'),
model=kwargs.get("model_name", "gpt-4o"),
temperature=kwargs.get("temperature", 0.0),
base_url=base_url,
api_key=api_key
api_key=api_key,
)
elif provider == 'deepseek':
elif provider == "deepseek":
if not kwargs.get("base_url", ""):
base_url = os.getenv("DEEPSEEK_ENDPOINT", "")
else:
@@ -67,24 +67,24 @@ def get_llm_model(provider: str, **kwargs):
api_key = kwargs.get("api_key")
return ChatOpenAI(
model=kwargs.get("model_name", 'deepseek-chat'),
model=kwargs.get("model_name", "deepseek-chat"),
temperature=kwargs.get("temperature", 0.0),
base_url=base_url,
api_key=api_key
api_key=api_key,
)
elif provider == 'gemini':
elif provider == "gemini":
if not kwargs.get("api_key", ""):
api_key = os.getenv("GOOGLE_API_KEY", "")
else:
api_key = kwargs.get("api_key")
return ChatGoogleGenerativeAI(
model=kwargs.get("model_name", 'gemini-2.0-flash-exp'),
model=kwargs.get("model_name", "gemini-2.0-flash-exp"),
temperature=kwargs.get("temperature", 0.0),
google_api_key=api_key,
)
elif provider == 'ollama':
elif provider == "ollama":
return ChatOllama(
model=kwargs.get("model_name", 'qwen2.5:7b'),
model=kwargs.get("model_name", "qwen2.5:7b"),
temperature=kwargs.get("temperature", 0.0),
)
elif provider == "azure_openai":
@@ -97,14 +97,14 @@ def get_llm_model(provider: str, **kwargs):
else:
api_key = kwargs.get("api_key")
return AzureChatOpenAI(
model=kwargs.get("model_name", 'gpt-4o'),
model=kwargs.get("model_name", "gpt-4o"),
temperature=kwargs.get("temperature", 0.0),
api_version="2024-05-01-preview",
azure_endpoint=base_url,
api_key=api_key
api_key=api_key,
)
else:
raise ValueError(f'Unsupported provider: {provider}')
raise ValueError(f"Unsupported provider: {provider}")
def encode_image(img_path):

View File

@@ -3,7 +3,6 @@
# @Author : wenshao
# @ProjectName: browser-use-webui
# @FileName: test_browser_use.py
import pdb
from dotenv import load_dotenv
@@ -11,11 +10,11 @@ load_dotenv()
import sys
sys.path.append(".")
import asyncio
import os
import sys
from pprint import pprint
import asyncio
from browser_use import Agent
from browser_use.agent.views import AgentHistoryList
@@ -25,16 +24,16 @@ from src.utils import utils
async def test_browser_use_org():
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import (
BrowserContext,
BrowserContextConfig,
BrowserContextWindowSize,
)
llm = utils.get_llm_model(
provider="azure_openai",
model_name="gpt-4o",
temperature=0.8,
base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
api_key=os.getenv("AZURE_OPENAI_API_KEY", "")
api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
)
window_w, window_h = 1920, 1080
@@ -43,16 +42,18 @@ async def test_browser_use_org():
config=BrowserConfig(
headless=False,
disable_security=True,
extra_chromium_args=[f'--window-size={window_w},{window_h}'],
extra_chromium_args=[f"--window-size={window_w},{window_h}"],
)
)
async with await browser.new_context(
config=BrowserContextConfig(
trace_path='./tmp/traces',
save_recording_path="./tmp/record_videos",
no_viewport=False,
browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
)
config=BrowserContextConfig(
trace_path="./tmp/traces",
save_recording_path="./tmp/record_videos",
no_viewport=False,
browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h
),
)
) as browser_context:
agent = Agent(
task="go to google.com and type 'OpenAI' click search and give me the first url",
@@ -61,32 +62,31 @@ async def test_browser_use_org():
)
history: AgentHistoryList = await agent.run(max_steps=10)
print('Final Result:')
print("Final Result:")
pprint(history.final_result(), indent=4)
print('\nErrors:')
print("\nErrors:")
pprint(history.errors(), indent=4)
# e.g. xPaths the model clicked on
print('\nModel Outputs:')
print("\nModel Outputs:")
pprint(history.model_actions(), indent=4)
print('\nThoughts:')
print("\nThoughts:")
pprint(history.model_thoughts(), indent=4)
# close browser
await browser.close()
async def test_browser_use_custom():
from playwright.async_api import async_playwright
from browser_use.browser.context import BrowserContextWindowSize
from playwright.async_api import async_playwright
from src.browser.custom_browser import CustomBrowser, BrowserConfig
from src.browser.custom_context import BrowserContext, BrowserContextConfig
from src.controller.custom_controller import CustomController
from src.agent.custom_agent import CustomAgent
from src.agent.custom_prompts import CustomSystemPrompt
from src.browser.custom_context import CustomBrowserContext
from src.browser.custom_browser import BrowserConfig, CustomBrowser
from src.browser.custom_context import BrowserContextConfig
from src.controller.custom_controller import CustomController
window_w, window_h = 1920, 1080
@@ -112,9 +112,7 @@ async def test_browser_use_custom():
# )
llm = utils.get_llm_model(
provider="ollama",
model_name="qwen2.5:7b",
temperature=0.8
provider="ollama", model_name="qwen2.5:7b", temperature=0.8
)
controller = CustomController()
@@ -134,14 +132,14 @@ async def test_browser_use_custom():
no_viewport=False,
headless=False, # keep the browser window visible
user_agent=(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36"
),
java_script_enabled=True,
bypass_csp=disable_security,
ignore_https_errors=disable_security,
record_video_dir="./tmp/record_videos",
record_video_size={'width': window_w, 'height': window_h}
record_video_size={"width": window_w, "height": window_h},
)
else:
browser_context_ = None
@@ -150,18 +148,20 @@ async def test_browser_use_custom():
config=BrowserConfig(
headless=False,
disable_security=True,
extra_chromium_args=[f'--window-size={window_w},{window_h}'],
extra_chromium_args=[f"--window-size={window_w},{window_h}"],
)
)
async with await browser.new_context(
config=BrowserContextConfig(
trace_path='./tmp/result_processing',
save_recording_path="./tmp/record_videos",
no_viewport=False,
browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
config=BrowserContextConfig(
trace_path="./tmp/result_processing",
save_recording_path="./tmp/record_videos",
no_viewport=False,
browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h
),
context=browser_context_
),
context=browser_context_,
) as browser_context:
agent = CustomAgent(
task="go to google.com and type 'OpenAI' click search and give me the first url",
@@ -170,25 +170,26 @@ async def test_browser_use_custom():
browser_context=browser_context,
controller=controller,
system_prompt_class=CustomSystemPrompt,
use_vision=use_vision
use_vision=use_vision,
)
history: AgentHistoryList = await agent.run(max_steps=10)
print('Final Result:')
print("Final Result:")
pprint(history.final_result(), indent=4)
print('\nErrors:')
print("\nErrors:")
pprint(history.errors(), indent=4)
# e.g. xPaths the model clicked on
print('\nModel Outputs:')
print("\nModel Outputs:")
pprint(history.model_actions(), indent=4)
print('\nThoughts:')
print("\nThoughts:")
pprint(history.model_thoughts(), indent=4)
# close browser
except Exception as e:
except Exception:
import traceback
traceback.print_exc()
finally:
# Explicitly close the persistent context
@@ -202,6 +203,6 @@ async def test_browser_use_custom():
await browser.close()
if __name__ == '__main__':
if __name__ == "__main__":
# asyncio.run(test_browser_use_org())
asyncio.run(test_browser_use_custom())

View File

@@ -22,7 +22,7 @@ from playwright.async_api import async_playwright
from src.agent.custom_agent import CustomAgent
from src.agent.custom_prompts import CustomSystemPrompt
from src.browser.custom_browser import BrowserConfig, CustomBrowser
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import BrowserContextConfig
from src.controller.custom_controller import CustomController
from src.utils import utils