From 517c8e0cf81592679f4816f577e62dd79c3321ec Mon Sep 17 00:00:00 2001 From: meshkatshb Date: Fri, 10 Jan 2025 13:24:44 +0330 Subject: [PATCH 1/9] feat: add default value to suppress UserWarning --- webui.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webui.py b/webui.py index f594569..42c98fb 100644 --- a/webui.py +++ b/webui.py @@ -382,12 +382,12 @@ def create_ui(theme_name="Ocean"): llm_provider = gr.Dropdown( ["anthropic", "openai", "deepseek", "gemini", "ollama", "azure_openai"], label="LLM Provider", - value="", + value="deepseek", info="Select your preferred language model provider" ) llm_model_name = gr.Dropdown( label="Model Name", - value="", + value="deepseek-chat", interactive=True, allow_custom_value=True, # Allow users to input custom model names info="Select a model from the dropdown or type a custom model name" From 91f89e70453f2b812438680467d6a20a476b6f07 Mon Sep 17 00:00:00 2001 From: meshkatshb Date: Fri, 10 Jan 2025 13:55:10 +0330 Subject: [PATCH 2/9] feat: initialize browser and close it to resolve UnboundLocalError. 
--- webui.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/webui.py b/webui.py index f594569..d51ad61 100644 --- a/webui.py +++ b/webui.py @@ -199,6 +199,7 @@ async def run_custom_agent( controller = CustomController() playwright = None browser_context_ = None + browser = None # Initialize browser to None try: if use_own_browser: playwright = await async_playwright().start() @@ -278,14 +279,18 @@ async def run_custom_agent( model_actions = "" model_thoughts = "" finally: - # 显式关闭持久化上下文 + # Close persistent context if it was initialized if browser_context_: await browser_context_.close() - # 关闭 Playwright 对象 + # Stop Playwright if it was started if playwright: await playwright.stop() - await browser.close() + + # Close the browser if it was initialized + if browser: + await browser.close() + return final_result, errors, model_actions, model_thoughts # Define the theme map globally From ab0ba4589bd9ebaa50ef27c7aec48dcd04bd9d7b Mon Sep 17 00:00:00 2001 From: meshkatshb Date: Fri, 10 Jan 2025 17:01:15 +0330 Subject: [PATCH 3/9] feat: add openai provider and mode dropdown menu --- webui.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/webui.py b/webui.py index 42c98fb..36547ba 100644 --- a/webui.py +++ b/webui.py @@ -382,12 +382,13 @@ def create_ui(theme_name="Ocean"): llm_provider = gr.Dropdown( ["anthropic", "openai", "deepseek", "gemini", "ollama", "azure_openai"], label="LLM Provider", - value="deepseek", + value="openai", info="Select your preferred language model provider" ) llm_model_name = gr.Dropdown( label="Model Name", - value="deepseek-chat", + choices=["gpt-4o", "gpt-4o-mini", "gpt-4", "gpt-3.5-turbo",], + value="gpt-4o", interactive=True, allow_custom_value=True, # Allow users to input custom model names info="Select a model from the dropdown or type a custom model name" From dab1693bce5f11212539be7ecac51c4e7f55651e Mon Sep 17 00:00:00 2001 From: meshkatshb Date: Fri, 10 Jan 2025 17:39:58 +0330 
Subject: [PATCH 4/9] refactor(chore): read llm provider and model from dictionary --- webui.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webui.py b/webui.py index ff3db24..c14029e 100644 --- a/webui.py +++ b/webui.py @@ -385,14 +385,14 @@ def create_ui(theme_name="Ocean"): with gr.TabItem("🔧 LLM Configuration", id=2): with gr.Group(): llm_provider = gr.Dropdown( - ["anthropic", "openai", "deepseek", "gemini", "ollama", "azure_openai"], + choices=[provider for provider,model in utils.model_names.items()], label="LLM Provider", value="openai", info="Select your preferred language model provider" ) llm_model_name = gr.Dropdown( label="Model Name", - choices=["gpt-4o", "gpt-4o-mini", "gpt-4", "gpt-3.5-turbo",], + choices=utils.model_names['openai'], value="gpt-4o", interactive=True, allow_custom_value=True, # Allow users to input custom model names From d988bf1c95dcf12ed7f0594e9b1b10707140a51d Mon Sep 17 00:00:00 2001 From: meshkatshb Date: Fri, 10 Jan 2025 17:59:20 +0330 Subject: [PATCH 5/9] revert changes --- webui.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/webui.py b/webui.py index c14029e..f1b784b 100644 --- a/webui.py +++ b/webui.py @@ -199,7 +199,6 @@ async def run_custom_agent( controller = CustomController() playwright = None browser_context_ = None - browser = None # Initialize browser to None try: if use_own_browser: playwright = await async_playwright().start() @@ -279,17 +278,15 @@ async def run_custom_agent( model_actions = "" model_thoughts = "" finally: - # Close persistent context if it was initialized + # 显式关闭持久化上下文 if browser_context_: await browser_context_.close() - # Stop Playwright if it was started + # 关闭 Playwright 对象 if playwright: await playwright.stop() + await browser.close() - # Close the browser if it was initialized - if browser: - await browser.close() return final_result, errors, model_actions, model_thoughts From 8198d68e4dd7b6328fe957ad64f2e3cdfb0f1061 Mon Sep 17 00:00:00 
2001 From: meshkatshb Date: Fri, 10 Jan 2025 18:00:10 +0330 Subject: [PATCH 6/9] revert changes --- webui.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/webui.py b/webui.py index f1b784b..8c603d0 100644 --- a/webui.py +++ b/webui.py @@ -286,8 +286,6 @@ async def run_custom_agent( if playwright: await playwright.stop() await browser.close() - - return final_result, errors, model_actions, model_thoughts # Define the theme map globally From ed38f5bb7ebcb7a6dbc6474a49f1479e8f6b3548 Mon Sep 17 00:00:00 2001 From: vvincent1234 Date: Sat, 11 Jan 2025 11:12:34 +0800 Subject: [PATCH 7/9] remove default api key --- webui.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webui.py b/webui.py index f3f7b89..f1e9452 100644 --- a/webui.py +++ b/webui.py @@ -425,13 +425,13 @@ def create_ui(theme_name="Ocean"): with gr.Row(): llm_base_url = gr.Textbox( label="Base URL", - value=os.getenv(f"{llm_provider.value.upper()}_BASE_URL ", ""), # Default to .env value + value='', info="API endpoint URL (if required)" ) llm_api_key = gr.Textbox( label="API Key", type="password", - value=os.getenv(f"{llm_provider.value.upper()}_API_KEY", ""), # Default to .env value + value='', info="Your API key (leave blank to use .env)" ) From db73db1f7cc1ccc66abf701bb4d051547dd54176 Mon Sep 17 00:00:00 2001 From: vvincent1234 Date: Sat, 11 Jan 2025 16:30:31 +0800 Subject: [PATCH 8/9] fix macos cannot use own browser --- src/browser/custom_browser.py | 130 +++++++++++++++++++------ src/browser/custom_context.py | 16 +--- tests/test_browser_use.py | 139 +++++++++++++++++++++++++-- webui.py | 173 +++++++++++++++++++--------------- 4 files changed, 337 insertions(+), 121 deletions(-) diff --git a/src/browser/custom_browser.py b/src/browser/custom_browser.py index 4c511ab..829e06e 100644 --- a/src/browser/custom_browser.py +++ b/src/browser/custom_browser.py @@ -4,6 +4,16 @@ # @ProjectName: browser-use-webui # @FileName: browser.py +import asyncio + +from playwright.async_api 
import Browser as PlaywrightBrowser +from playwright.async_api import ( + BrowserContext as PlaywrightBrowserContext, +) +from playwright.async_api import ( + Playwright, + async_playwright, +) from browser_use.browser.browser import Browser from browser_use.browser.context import BrowserContext, BrowserContextConfig from playwright.async_api import BrowserContext as PlaywrightBrowserContext @@ -15,36 +25,102 @@ from .custom_context import CustomBrowserContext logger = logging.getLogger(__name__) class CustomBrowser(Browser): - _global_context = None async def new_context( self, - config: BrowserContextConfig = BrowserContextConfig(), - context: PlaywrightBrowserContext = None, + config: BrowserContextConfig = BrowserContextConfig() ) -> CustomBrowserContext: - """Create a browser context with persistence support""" - persistence_config = BrowserPersistenceConfig.from_env() - - if persistence_config.persistent_session: - if CustomBrowser._global_context is not None: - logger.info("Reusing existing persistent browser context") - return CustomBrowser._global_context - - context_instance = CustomBrowserContext(config=config, browser=self, context=context) - CustomBrowser._global_context = context_instance - logger.info("Created new persistent browser context") - return context_instance - - logger.info("Creating non-persistent browser context") - return CustomBrowserContext(config=config, browser=self, context=context) + return CustomBrowserContext(config=config, browser=self) + + async def _setup_browser(self, playwright: Playwright) -> PlaywrightBrowser: + """Sets up and returns a Playwright Browser instance with anti-detection measures.""" + if self.config.wss_url: + browser = await playwright.chromium.connect(self.config.wss_url) + return browser + elif self.config.chrome_instance_path: + import subprocess + + import requests + + try: + # Check if browser is already running + response = requests.get('http://localhost:9222/json/version', timeout=2) + if 
response.status_code == 200: + logger.info('Reusing existing Chrome instance') + browser = await playwright.chromium.connect_over_cdp( + endpoint_url='http://localhost:9222', + timeout=20000, # 20 second timeout for connection + ) + return browser + except requests.ConnectionError: + logger.debug('No existing Chrome instance found, starting a new one') + + # Start a new Chrome instance + subprocess.Popen( + [ + self.config.chrome_instance_path, + '--remote-debugging-port=9222', + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + + # Attempt to connect again after starting a new instance + for _ in range(10): + try: + response = requests.get('http://localhost:9222/json/version', timeout=2) + if response.status_code == 200: + break + except requests.ConnectionError: + pass + await asyncio.sleep(1) + + try: + browser = await playwright.chromium.connect_over_cdp( + endpoint_url='http://localhost:9222', + timeout=20000, # 20 second timeout for connection + ) + return browser + except Exception as e: + logger.error(f'Failed to start a new Chrome instance.: {str(e)}') + raise RuntimeError( + ' To start chrome in Debug mode, you need to close all existing Chrome instances and try again otherwise we can not connect to the instance.' 
+ ) - async def close(self): - """Override close to respect persistence setting""" - persistence_config = BrowserPersistenceConfig.from_env() - if not persistence_config.persistent_session: - if CustomBrowser._global_context is not None: - await CustomBrowser._global_context.close() - CustomBrowser._global_context = None - await super().close() else: - logger.info("Skipping browser close due to persistent session") + try: + disable_security_args = [] + if self.config.disable_security: + disable_security_args = [ + '--disable-web-security', + '--disable-site-isolation-trials', + '--disable-features=IsolateOrigins,site-per-process', + ] + + browser = await playwright.chromium.launch( + headless=self.config.headless, + args=[ + '--no-sandbox', + '--disable-blink-features=AutomationControlled', + '--disable-infobars', + '--disable-background-timer-throttling', + '--disable-popup-blocking', + '--disable-backgrounding-occluded-windows', + '--disable-renderer-backgrounding', + '--disable-window-activation', + '--disable-focus-on-load', + '--no-first-run', + '--no-default-browser-check', + '--no-startup-window', + '--window-position=0,0', + # '--window-size=1280,1000', + ] + + disable_security_args + + self.config.extra_chromium_args, + proxy=self.config.proxy, + ) + + return browser + except Exception as e: + logger.error(f'Failed to initialize Playwright browser: {str(e)}') + raise diff --git a/src/browser/custom_context.py b/src/browser/custom_context.py index 43ff776..6de991b 100644 --- a/src/browser/custom_context.py +++ b/src/browser/custom_context.py @@ -22,22 +22,17 @@ class CustomBrowserContext(BrowserContext): def __init__( self, browser: "Browser", - config: BrowserContextConfig = BrowserContextConfig(), - context: PlaywrightBrowserContext = None, + config: BrowserContextConfig = BrowserContextConfig() ): super(CustomBrowserContext, self).__init__(browser=browser, config=config) - self.context = context - self._persistence_config = 
BrowserPersistenceConfig.from_env() async def _create_context(self, browser: PlaywrightBrowser) -> PlaywrightBrowserContext: """Creates a new browser context with anti-detection measures and loads cookies if available.""" # If we have a context, return it directly - if self.context: - return self.context # Check if we should use existing context for persistence - if self._persistence_config.persistent_session and len(browser.contexts) > 0: - logger.info("Using existing persistent context") + if self.browser.config.chrome_instance_path and len(browser.contexts) > 0: + # Connect to existing Chrome instance instead of creating new one context = browser.contexts[0] else: # Original code for creating new context @@ -99,8 +94,3 @@ class CustomBrowserContext(BrowserContext): ) return context - - async def close(self): - """Override close to respect persistence setting""" - if not self._persistence_config.persistent_session: - await super().close() diff --git a/tests/test_browser_use.py b/tests/test_browser_use.py index 4ced1db..b13aa26 100644 --- a/tests/test_browser_use.py +++ b/tests/test_browser_use.py @@ -3,6 +3,7 @@ # @Author : wenshao # @ProjectName: browser-use-webui # @FileName: test_browser_use.py +import pdb from dotenv import load_dotenv @@ -28,20 +29,29 @@ async def test_browser_use_org(): BrowserContextWindowSize, ) + # llm = utils.get_llm_model( + # provider="azure_openai", + # model_name="gpt-4o", + # temperature=0.8, + # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), + # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + # ) + llm = utils.get_llm_model( - provider="azure_openai", - model_name="gpt-4o", - temperature=0.8, - base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), - api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + provider="deepseek", + model_name="deepseek-chat", + temperature=0.8 ) window_w, window_h = 1920, 1080 + use_vision = False + chrome_path = os.getenv("CHROME_PATH", None) browser = Browser( config=BrowserConfig( headless=False, 
disable_security=True, + chrome_instance_path=chrome_path, extra_chromium_args=[f"--window-size={window_w},{window_h}"], ) ) @@ -59,6 +69,7 @@ async def test_browser_use_org(): task="go to google.com and type 'OpenAI' click search and give me the first url", llm=llm, browser_context=browser_context, + use_vision=use_vision ) history: AgentHistoryList = await agent.run(max_steps=10) @@ -208,6 +219,122 @@ async def test_browser_use_custom(): await browser.close() +async def test_browser_use_custom_v2(): + from browser_use.browser.context import BrowserContextWindowSize + from browser_use.browser.browser import BrowserConfig + from playwright.async_api import async_playwright + + from src.agent.custom_agent import CustomAgent + from src.agent.custom_prompts import CustomSystemPrompt + from src.browser.custom_browser import CustomBrowser + from src.browser.custom_context import BrowserContextConfig + from src.controller.custom_controller import CustomController + + window_w, window_h = 1920, 1080 + + # llm = utils.get_llm_model( + # provider="azure_openai", + # model_name="gpt-4o", + # temperature=0.8, + # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), + # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + # ) + + # llm = utils.get_llm_model( + # provider="gemini", + # model_name="gemini-2.0-flash-exp", + # temperature=1.0, + # api_key=os.getenv("GOOGLE_API_KEY", "") + # ) + + llm = utils.get_llm_model( + provider="deepseek", + model_name="deepseek-chat", + temperature=0.8 + ) + + # llm = utils.get_llm_model( + # provider="ollama", model_name="qwen2.5:7b", temperature=0.8 + # ) + + controller = CustomController() + use_own_browser = True + disable_security = True + use_vision = False # Set to False when using DeepSeek + tool_call_in_content = True # Set to True when using Ollama + max_actions_per_step = 1 + playwright = None + browser = None + browser_context = None + + try: + if use_own_browser: + chrome_path = os.getenv("CHROME_PATH", None) + if chrome_path == "": + 
chrome_path = None + else: + chrome_path = None + browser = CustomBrowser( + config=BrowserConfig( + headless=False, + disable_security=disable_security, + chrome_instance_path=chrome_path, + extra_chromium_args=[f"--window-size={window_w},{window_h}"], + ) + ) + browser_context = await browser.new_context( + config=BrowserContextConfig( + trace_path="./tmp/traces", + save_recording_path="./tmp/record_videos", + no_viewport=False, + browser_window_size=BrowserContextWindowSize( + width=window_w, height=window_h + ), + ) + ) + agent = CustomAgent( + task="go to google.com and type 'OpenAI' click search and give me the first url", + add_infos="", # some hints for llm to complete the task + llm=llm, + browser=browser, + browser_context=browser_context, + controller=controller, + system_prompt_class=CustomSystemPrompt, + use_vision=use_vision, + tool_call_in_content=tool_call_in_content, + max_actions_per_step=max_actions_per_step + ) + history: AgentHistoryList = await agent.run(max_steps=10) + + print("Final Result:") + pprint(history.final_result(), indent=4) + + print("\nErrors:") + pprint(history.errors(), indent=4) + + # e.g. 
xPaths the model clicked on + print("\nModel Outputs:") + pprint(history.model_actions(), indent=4) + + print("\nThoughts:") + pprint(history.model_thoughts(), indent=4) + # close browser + except Exception: + import traceback + + traceback.print_exc() + finally: + # 显式关闭持久化上下文 + if browser_context: + await browser_context.close() + + # 关闭 Playwright 对象 + if playwright: + await playwright.stop() + if browser: + await browser.close() + if __name__ == "__main__": # asyncio.run(test_browser_use_org()) - asyncio.run(test_browser_use_custom()) + # asyncio.run(test_browser_use_custom()) + asyncio.run(test_browser_use_custom_v2()) diff --git a/webui.py b/webui.py index 39e6421..bf20c12 100644 --- a/webui.py +++ b/webui.py @@ -44,7 +44,6 @@ from browser_use.browser.context import BrowserContextConfig, BrowserContextWind # Global variables for persistence _global_browser = None _global_browser_context = None -_global_playwright = None async def run_browser_agent( agent_type, @@ -54,6 +53,7 @@ async def run_browser_agent( llm_base_url, llm_api_key, use_own_browser, + keep_browser_open, headless, disable_security, window_w, @@ -95,6 +95,8 @@ async def run_browser_agent( if agent_type == "org": final_result, errors, model_actions, model_thoughts = await run_org_agent( llm=llm, + use_own_browser=use_own_browser, + keep_browser_open=keep_browser_open, headless=headless, disable_security=disable_security, window_w=window_w, @@ -111,6 +113,7 @@ async def run_browser_agent( final_result, errors, model_actions, model_thoughts = await run_custom_agent( llm=llm, use_own_browser=use_own_browser, + keep_browser_open=keep_browser_open, headless=headless, disable_security=disable_security, window_w=window_w, @@ -142,6 +145,8 @@ async def run_browser_agent( async def run_org_agent( llm, + use_own_browser, + keep_browser_open, headless, disable_security, window_w, @@ -155,28 +160,43 @@ async def run_org_agent( tool_call_in_content ): - browser = Browser( - config=BrowserConfig( - 
headless=headless, - disable_security=disable_security, - extra_chromium_args=[f"--window-size={window_w},{window_h}"], - ) - ) - async with await browser.new_context( - config=BrowserContextConfig( - trace_path=save_trace_path if save_trace_path else None, - save_recording_path=save_recording_path if save_recording_path else None, - no_viewport=False, - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), + try: + global _global_browser, _global_browser_context + if use_own_browser: + chrome_path = os.getenv("CHROME_PATH", None) + if chrome_path == "": + chrome_path = None + else: + chrome_path = None + + if _global_browser is None: + _global_browser = Browser( + config=BrowserConfig( + headless=headless, + disable_security=disable_security, + chrome_instance_path=chrome_path, + extra_chromium_args=[f"--window-size={window_w},{window_h}"], + ) ) - ) as browser_context: + + if _global_browser_context is None: + _global_browser_context = await _global_browser.new_context( + config=BrowserContextConfig( + trace_path=save_trace_path if save_trace_path else None, + save_recording_path=save_recording_path if save_recording_path else None, + no_viewport=False, + browser_window_size=BrowserContextWindowSize( + width=window_w, height=window_h + ), + ) + ) + agent = Agent( task=task, llm=llm, use_vision=use_vision, - browser_context=browser_context, + browser=_global_browser, + browser_context=_global_browser_context, max_actions_per_step=max_actions_per_step, tool_call_in_content=tool_call_in_content ) @@ -186,13 +206,28 @@ async def run_org_agent( errors = history.errors() model_actions = history.model_actions() model_thoughts = history.model_thoughts() - await browser.close() - return final_result, errors, model_actions, model_thoughts + return final_result, errors, model_actions, model_thoughts + except Exception as e: + import traceback + traceback.print_exc() + errors = str(e) + "\n" + traceback.format_exc() + return '', errors, '', '' 
+ finally: + # Handle cleanup based on persistence configuration + if not keep_browser_open: + if _global_browser_context: + await _global_browser_context.close() + _global_browser_context = None + + if _global_browser: + await _global_browser.close() + _global_browser = None async def run_custom_agent( llm, use_own_browser, + keep_browser_open, headless, disable_security, window_w, @@ -206,67 +241,40 @@ async def run_custom_agent( max_actions_per_step, tool_call_in_content ): - global _global_browser, _global_browser_context, _global_playwright - - controller = CustomController() - persistence_config = BrowserPersistenceConfig.from_env() - try: + global _global_browser, _global_browser_context + + if use_own_browser: + chrome_path = os.getenv("CHROME_PATH", None) + if chrome_path == "": + chrome_path = None + else: + chrome_path = None + + controller = CustomController() + # Initialize global browser if needed if _global_browser is None: _global_browser = CustomBrowser( config=BrowserConfig( headless=headless, disable_security=disable_security, + chrome_instance_path=chrome_path, extra_chromium_args=[f"--window-size={window_w},{window_h}"], ) ) - # Handle browser context based on configuration - if use_own_browser: - if _global_browser_context is None: - _global_playwright = await async_playwright().start() - chrome_exe = os.getenv("CHROME_PATH", "") - chrome_use_data = os.getenv("CHROME_USER_DATA", "") - - browser_context = await _global_playwright.chromium.launch_persistent_context( - user_data_dir=chrome_use_data, - executable_path=chrome_exe, + if _global_browser_context is None: + _global_browser_context = await _global_browser.new_context( + config=BrowserContextConfig( + trace_path=save_trace_path if save_trace_path else None, + save_recording_path=save_recording_path if save_recording_path else None, no_viewport=False, - headless=headless, - user_agent=( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) 
Chrome/85.0.4183.102 Safari/537.36" - ), - java_script_enabled=True, - bypass_csp=disable_security, - ignore_https_errors=disable_security, - record_video_dir=save_recording_path if save_recording_path else None, - record_video_size={"width": window_w, "height": window_h}, - ) - _global_browser_context = await _global_browser.new_context( - config=BrowserContextConfig( - trace_path=save_trace_path if save_trace_path else None, - save_recording_path=save_recording_path if save_recording_path else None, - no_viewport=False, - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), - ), - context=browser_context, - ) - else: - if _global_browser_context is None: - _global_browser_context = await _global_browser.new_context( - config=BrowserContextConfig( - trace_path=save_trace_path if save_trace_path else None, - save_recording_path=save_recording_path if save_recording_path else None, - no_viewport=False, - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), + browser_window_size=BrowserContextWindowSize( + width=window_w, height=window_h ), ) + ) # Create and run agent agent = CustomAgent( @@ -274,6 +282,7 @@ async def run_custom_agent( add_infos=add_infos, use_vision=use_vision, llm=llm, + browser=_global_browser, browser_context=_global_browser_context, controller=controller, system_prompt_class=CustomSystemPrompt, @@ -286,28 +295,24 @@ async def run_custom_agent( errors = history.errors() model_actions = history.model_actions() model_thoughts = history.model_thoughts() + return final_result, errors, model_actions, model_thoughts except Exception as e: import traceback traceback.print_exc() errors = str(e) + "\n" + traceback.format_exc() - + return '', errors, '', '' finally: # Handle cleanup based on persistence configuration - if not persistence_config.persistent_session: + if not keep_browser_open: if _global_browser_context: await _global_browser_context.close() _global_browser_context = None - if 
_global_playwright: - await _global_playwright.stop() - _global_playwright = None - if _global_browser: await _global_browser.close() _global_browser = None - return final_result, errors, model_actions, model_thoughts # Define the theme map globally theme_map = { @@ -321,6 +326,16 @@ theme_map = { "Base": Base() } +async def close_global_browser(): + global _global_browser, _global_browser_context + + if _global_browser_context: + await _global_browser_context.close() + _global_browser_context = None + + if _global_browser: + await _global_browser.close() + _global_browser = None def create_ui(theme_name="Ocean"): css = """ @@ -443,6 +458,11 @@ def create_ui(theme_name="Ocean"): value=False, info="Use your existing browser instance", ) + keep_browser_open = gr.Checkbox( + label="Keep Browser Open", + value=os.getenv("CHROME_PERSISTENT_SESSION", "False").lower() == "true", + info="Keep Browser Open between Tasks", + ) headless = gr.Checkbox( label="Headless Mode", value=False, @@ -578,12 +598,15 @@ def create_ui(theme_name="Ocean"): outputs=save_recording_path ) + use_own_browser.change(fn=close_global_browser) + keep_browser_open.change(fn=close_global_browser) + # Run button click handler run_button.click( fn=run_browser_agent, inputs=[ agent_type, llm_provider, llm_model_name, llm_temperature, llm_base_url, llm_api_key, - use_own_browser, headless, disable_security, window_w, window_h, save_recording_path, save_trace_path, + use_own_browser, keep_browser_open, headless, disable_security, window_w, window_h, save_recording_path, save_trace_path, enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step, tool_call_in_content ], outputs=[final_result_output, errors_output, model_actions_output, model_thoughts_output, recording_display], From a234f0ca7e51640732eabdd88e8559c5397bfd98 Mon Sep 17 00:00:00 2001 From: vvincent1234 Date: Sat, 11 Jan 2025 16:58:41 +0800 Subject: [PATCH 9/9] add generate gif --- src/agent/custom_agent.py | 120 
++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/src/agent/custom_agent.py b/src/agent/custom_agent.py index 3bf5496..f4c1df5 100644 --- a/src/agent/custom_agent.py +++ b/src/agent/custom_agent.py @@ -9,6 +9,10 @@ import logging import pdb import traceback from typing import Optional, Type +from PIL import Image, ImageDraw, ImageFont +import os +import base64 +import io from browser_use.agent.prompts import SystemPrompt from browser_use.agent.service import Agent @@ -227,6 +231,119 @@ class CustomAgent(Agent): ) if state: self._make_history_item(model_output, state, result) + def create_history_gif( + self, + output_path: str = 'agent_history.gif', + duration: int = 3000, + show_goals: bool = True, + show_task: bool = True, + show_logo: bool = False, + font_size: int = 40, + title_font_size: int = 56, + goal_font_size: int = 44, + margin: int = 40, + line_spacing: float = 1.5, + ) -> None: + """Create a GIF from the agent's history with overlaid task and goal text.""" + if not self.history.history: + logger.warning('No history to create GIF from') + return + + images = [] + # if history is empty or first screenshot is None, we can't create a gif + if not self.history.history or not self.history.history[0].state.screenshot: + logger.warning('No history or first screenshot to create GIF from') + return + + # Try to load nicer fonts + try: + # Try different font options in order of preference + font_options = ['Helvetica', 'Arial', 'DejaVuSans', 'Verdana'] + font_loaded = False + + for font_name in font_options: + try: + import platform + if platform.system() == "Windows": + # Need to specify the abs font path on Windows + font_name = os.path.join(os.getenv("WIN_FONT_DIR", "C:\\Windows\\Fonts"), font_name + ".ttf") + regular_font = ImageFont.truetype(font_name, font_size) + title_font = ImageFont.truetype(font_name, title_font_size) + goal_font = ImageFont.truetype(font_name, goal_font_size) + font_loaded = True + break + except 
OSError: + continue + + if not font_loaded: + raise OSError('No preferred fonts found') + + except OSError: + regular_font = ImageFont.load_default() + title_font = ImageFont.load_default() + + goal_font = regular_font + + # Load logo if requested + logo = None + if show_logo: + try: + logo = Image.open('./static/browser-use.png') + # Resize logo to be small (e.g., 40px height) + logo_height = 150 + aspect_ratio = logo.width / logo.height + logo_width = int(logo_height * aspect_ratio) + logo = logo.resize((logo_width, logo_height), Image.Resampling.LANCZOS) + except Exception as e: + logger.warning(f'Could not load logo: {e}') + + # Create task frame if requested + if show_task and self.task: + task_frame = self._create_task_frame( + self.task, + self.history.history[0].state.screenshot, + title_font, + regular_font, + logo, + line_spacing, + ) + images.append(task_frame) + + # Process each history item + for i, item in enumerate(self.history.history, 1): + if not item.state.screenshot: + continue + + # Convert base64 screenshot to PIL Image + img_data = base64.b64decode(item.state.screenshot) + image = Image.open(io.BytesIO(img_data)) + + if show_goals and item.model_output: + image = self._add_overlay_to_image( + image=image, + step_number=i, + goal_text=item.model_output.current_state.thought, + regular_font=regular_font, + title_font=title_font, + margin=margin, + logo=logo, + ) + + images.append(image) + + if images: + # Save the GIF + images[0].save( + output_path, + save_all=True, + append_images=images[1:], + duration=duration, + loop=0, + optimize=False, + ) + logger.info(f'Created GIF at {output_path}') + else: + logger.warning('No images found in history to create GIF') async def run(self, max_steps: int = 100) -> AgentHistoryList: """Execute the task with maximum number of steps""" @@ -283,3 +400,6 @@ class CustomAgent(Agent): if not self.injected_browser and self.browser: await self.browser.close() + + if self.generate_gif: + self.create_history_gif()