add webui and readme

2026-03-22 11:17:17 +08:00 · 2025-01-03 01:02:36 +08:00
parent 2024ad300f
commit 0b822fa1c2
6 changed files with 391 additions and 9 deletions
--- a/.env.example
+++ b/.env.example
@@ -12,4 +12,7 @@ AZURE_OPENAI_API_KEY=
 ANONYMIZED_TELEMETRY=true

 # LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info
-BROWSER_USE_LOGGING_LEVEL=info
+BROWSER_USE_LOGGING_LEVEL=info
+
+CHROME_PATH=
+CHROME_USER_DATA=
--- a/README.md
+++ b/README.md
@@ -0,0 +1,47 @@
+# Browser-Use WebUI
+
+## Background
+
+This project builds upon the foundation of [browser-use](https://github.com/browser-use/browser-use), which is designed to make websites accessible to AI agents. We have enhanced the original capabilities by providing:
+
+1.  **A Brand New WebUI:** We offer a comprehensive web interface that supports a wide range of `browser-use` functionalities. This UI is designed to be user-friendly and enables easy interaction with the browser agent.
+
+2.  **Expanded LLM Support:** We've integrated support for various Large Language Models (LLMs), including Gemini, OpenAI, Azure OpenAI, and Anthropic. We plan to add support for even more models in the future.
+
+3.  **Custom Browser Support:** You can use your own browser with our tool, eliminating the need to re-login to sites or deal with other authentication challenges. This feature also supports high-definition screen recording.
+
+4.  **Customized Agent:** We've implemented a custom agent that enhances `browser-use` with optimized prompts.
+
+<video src="https://github.com/user-attachments/assets/cc4ca59f-e4a5-43d8-86db-bb0e6edbedef" controls="controls" width="500" height="300" >Your browser does not support playing this video!</video>
+
+## Environment Installation
+
+1.  **Python Version:** Ensure you have Python 3.11 or higher installed.
+2.  **Install `browser-use`:**
+    ```bash
+    pip install browser-use
+    ```
+3.  **Install Playwright:**
+    ```bash
+    playwright install
+    ```
+4.  **Install Dependencies:**
+    ```bash
+    pip install -r requirements.txt
+    ```
+5.  **Configure Environment Variables:**
+    - Copy `.env.example` to `.env` and set your environment variables, including API keys for the LLM.
+    - **If using your own browser:**
+      - Set `CHROME_PATH` to the executable path of your browser (e.g., `C:\Program Files\Google\Chrome\Application\chrome.exe` on Windows).
+      - Set `CHROME_USER_DATA` to the user data directory of your browser (e.g., `C:\Users\<YourUsername>\AppData\Local\Google\Chrome\User Data` on Windows).
+
+## Usage
+
+1.  **Run the WebUI:**
+    ```bash
+    python webui.py --ip 127.0.0.1 --port 7788
+    ```
+2.  **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
+3.  **Using Your Own Browser:**
+    - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
+    - Check the "Use Own Browser" option within the Browser Settings.
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 browser-use
 langchain-google-genai
 pyperclip
+gradio
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@@ -6,6 +6,8 @@
 # @FileName: utils.py

 import base64
+import os
+
 from langchain_openai import ChatOpenAI, AzureChatOpenAI
 from langchain_anthropic import ChatAnthropic
 from langchain_google_genai import ChatGoogleGenerativeAI
@@ -19,32 +21,64 @@ def get_llm_model(provider: str, **kwargs):
    :return:
    """
    if provider == 'anthropic':
+        if not kwargs.get("base_url", ""):
+            base_url = "https://api.anthropic.com"
+        else:
+            base_url = kwargs.get("base_url")
+
+        if not kwargs.get("api_key", ""):
+            api_key = os.getenv("ANTHROPIC_API_KEY", "")
+        else:
+            api_key = kwargs.get("api_key")
+
        return ChatAnthropic(
            model_name=kwargs.get("model_name", 'claude-3-5-sonnet-20240620'),
            temperature=kwargs.get("temperature", 0.0),
-            base_url=kwargs.get("base_url", "https://api.anthropic.com"),
-            api_key=kwargs.get("api_key", None)
+            base_url=base_url,
+            api_key=api_key
        )
    elif provider == 'openai':
+        if not kwargs.get("base_url", ""):
+            base_url = "https://api.openai.com/v1"
+        else:
+            base_url = kwargs.get("base_url")
+
+        if not kwargs.get("api_key", ""):
+            api_key = os.getenv("OPENAI_API_KEY", "")
+        else:
+            api_key = kwargs.get("api_key")
+
        return ChatOpenAI(
            model=kwargs.get("model_name", 'gpt-4o'),
            temperature=kwargs.get("temperature", 0.0),
-            base_url=kwargs.get("base_url", "https://api.openai.com/v1/"),
-            api_key=kwargs.get("api_key", None)
+            base_url=base_url,
+            api_key=api_key
        )
    elif provider == 'gemini':
+        if not kwargs.get("api_key", ""):
+            api_key = os.getenv("GOOGLE_API_KEY", "")
+        else:
+            api_key = kwargs.get("api_key")
        return ChatGoogleGenerativeAI(
            model=kwargs.get("model_name", 'gemini-2.0-flash-exp'),
            temperature=kwargs.get("temperature", 0.0),
-            google_api_key=kwargs.get("api_key", None),
+            google_api_key=api_key,
        )
    elif provider == "azure_openai":
+        if not kwargs.get("base_url", ""):
+            base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "")
+        else:
+            base_url = kwargs.get("base_url")
+        if not kwargs.get("api_key", ""):
+            api_key = os.getenv("AZURE_OPENAI_API_KEY", "")
+        else:
+            api_key = kwargs.get("api_key")
        return AzureChatOpenAI(
            model=kwargs.get("model_name", 'gpt-4o'),
            temperature=kwargs.get("temperature", 0.0),
            api_version="2024-05-01-preview",
-            azure_endpoint=kwargs.get("base_url", ""),
-            api_key=kwargs.get("api_key", None)
+            azure_endpoint=base_url,
+            api_key=api_key
        )
    else:
        raise ValueError(f'Unsupported provider: {provider}')
--- a/tests/test_browser_use.py
+++ b/tests/test_browser_use.py
@@ -106,7 +106,7 @@ async def test_browser_use_custom():
    )

    controller = CustomController()
-    use_own_browser = True
+    use_own_browser = False
    disable_security = True
    playwright = None
    browser_context_ = None
--- a/webui.py
+++ b/webui.py
@@ -4,3 +4,300 @@
 # @Email   : wenshaoguo1026@gmail.com
 # @Project : browser-use-webui
 # @FileName: webui.py
+import pdb
+
+from dotenv import load_dotenv
+
+load_dotenv()
+import argparse
+
+import asyncio
+
+import gradio as gr
+import asyncio
+import os
+from pprint import pprint
+from typing import List, Dict, Any
+
+from playwright.async_api import async_playwright
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import (
+    BrowserContext,
+    BrowserContextConfig,
+    BrowserContextWindowSize,
+)
+from browser_use.agent.service import Agent
+
+from src.browser.custom_browser import CustomBrowser, BrowserConfig
+from src.browser.custom_context import BrowserContext, BrowserContextConfig
+from src.controller.custom_controller import CustomController
+from src.agent.custom_agent import CustomAgent
+from src.agent.custom_prompts import CustomSystemPrompt
+
+from src.utils import utils
+
+
+async def run_browser_agent(
+        agent_type,
+        llm_provider,
+        llm_model_name,
+        llm_temperature,
+        llm_base_url,
+        llm_api_key,
+        use_own_browser,
+        headless,
+        disable_security,
+        window_w,
+        window_h,
+        save_recording_path,
+        task,
+        add_infos,
+        progress=gr.Progress()
+):
+    """
+    Runs the browser agent based on user configurations.
+    """
+
+    llm = utils.get_llm_model(
+        provider=llm_provider,
+        model_name=llm_model_name,
+        temperature=llm_temperature,
+        base_url=llm_base_url,
+        api_key=llm_api_key
+    )
+    if agent_type == "org":
+        return await run_org_agent(
+            llm=llm,
+            headless=headless,
+            disable_security=disable_security,
+            window_w=window_w,
+            window_h=window_h,
+            save_recording_path=save_recording_path,
+            task=task,
+            progress=progress,
+        )
+    elif agent_type == "custom":
+        return await run_custom_agent(
+            llm=llm,
+            use_own_browser=use_own_browser,
+            headless=headless,
+            disable_security=disable_security,
+            window_w=window_w,
+            window_h=window_h,
+            save_recording_path=save_recording_path,
+            task=task,
+            add_infos=add_infos,
+            progress=progress,
+        )
+    else:
+        raise ValueError(f"Invalid agent type: {agent_type}")
+
+
+async def run_org_agent(
+        llm,
+        headless,
+        disable_security,
+        window_w,
+        window_h,
+        save_recording_path,
+        task,
+        progress
+):
+    browser = Browser(
+        config=BrowserConfig(
+            headless=headless,
+            disable_security=disable_security,
+            extra_chromium_args=[f'--window-size={window_w},{window_h}'],
+        )
+    )
+    async with await browser.new_context(
+            config=BrowserContextConfig(
+                trace_path='./tmp/traces',
+                save_recording_path=save_recording_path if save_recording_path else None,
+                no_viewport=False,
+                browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
+            )
+    ) as browser_context:
+        agent = Agent(
+            task=task,
+            llm=llm,
+            browser_context=browser_context,
+        )
+        history = await agent.run(max_steps=10)
+
+        final_result = history.final_result()
+        errors = history.errors()
+        model_actions = history.model_actions()
+        model_thoughts = history.model_thoughts()
+    await browser.close()
+    return final_result, errors, model_actions, model_thoughts
+
+
+async def run_custom_agent(
+        llm,
+        use_own_browser,
+        headless,
+        disable_security,
+        window_w,
+        window_h,
+        save_recording_path,
+        task,
+        add_infos,
+        progress
+):
+    controller = CustomController()
+    playwright = None
+    browser_context_ = None
+    try:
+        if use_own_browser:
+            playwright = await async_playwright().start()
+            chrome_exe = os.getenv("CHROME_PATH", "")
+            chrome_use_data = os.getenv("CHROME_USER_DATA", "")
+            browser_context_ = await playwright.chromium.launch_persistent_context(
+                user_data_dir=chrome_use_data,
+                executable_path=chrome_exe,
+                no_viewport=False,
+                headless=headless,  # 保持浏览器窗口可见
+                user_agent=(
+                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+                    '(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
+                ),
+                java_script_enabled=True,
+                bypass_csp=disable_security,
+                ignore_https_errors=disable_security,
+                record_video_dir=save_recording_path if save_recording_path else None,
+                record_video_size={'width': window_w, 'height': window_h}
+            )
+        else:
+            browser_context_ = None
+
+        browser = CustomBrowser(
+            config=BrowserConfig(
+                headless=headless,
+                disable_security=disable_security,
+                extra_chromium_args=[f'--window-size={window_w},{window_h}'],
+            )
+        )
+        async with await browser.new_context(
+                config=BrowserContextConfig(
+                    trace_path='./tmp/result_processing',
+                    save_recording_path=save_recording_path if save_recording_path else None,
+                    no_viewport=False,
+                    browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
+                ),
+                context=browser_context_
+        ) as browser_context:
+            agent = CustomAgent(
+                task=task,
+                add_infos=add_infos,
+                llm=llm,
+                browser_context=browser_context,
+                controller=controller,
+                system_prompt_class=CustomSystemPrompt
+            )
+            history = await agent.run(max_steps=10)
+
+            final_result = history.final_result()
+            errors = history.errors()
+            model_actions = history.model_actions()
+            model_thoughts = history.model_thoughts()
+
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        final_result = ""
+        errors = str(e) + "\n" + traceback.format_exc()
+        model_actions = ""
+        model_thoughts = ""
+    finally:
+        # 显式关闭持久化上下文
+        if browser_context_:
+            await browser_context_.close()
+
+        # 关闭 Playwright 对象
+        if playwright:
+            await playwright.stop()
+        await browser.close()
+    return final_result, errors, model_actions, model_thoughts
+
+
+def main():
+    """
+    Parse the command-line options, build the Gradio UI and start the server.
+
+    Command-line options:
+        --ip    address passed to ``demo.launch`` as ``server_name`` (default 127.0.0.1)
+        --port  port passed to ``demo.launch`` as ``server_port`` (default 7788)
+    """
+    parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent")
+    parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to")
+    parser.add_argument("--port", type=int, default=7788, help="Port to listen on")
+    args = parser.parse_args()
+
+    # JavaScript handed to gr.Blocks: if the URL lacks `__theme=dark`, it adds
+    # the query parameter and navigates to the updated URL.
+    js_func = """
+        function refresh() {
+            const url = new URL(window.location);
+
+            if (url.searchParams.get('__theme') !== 'dark') {
+                url.searchParams.set('__theme', 'dark');
+                window.location.href = url.href;
+            }
+        }
+        """
+
+    # Gradio UI setup
+    with gr.Blocks(title="Browser Use WebUI", theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")]),
+                   js=js_func) as demo:
+        gr.Markdown("<center><h1>Browser Use WebUI</h1></center>")
+        # Agent selection: the values match the branches in run_browser_agent.
+        with gr.Row():
+            agent_type = gr.Radio(["org", "custom"], label="Agent Type", value="custom")
+        # LLM settings.
+        with gr.Row():
+            llm_provider = gr.Dropdown(
+                ["anthropic", "openai", "gemini", "azure_openai"], label="LLM Provider", value="gemini"
+            )
+            llm_model_name = gr.Textbox(label="LLM Model Name", value="gemini-2.0-flash-exp")
+            llm_temperature = gr.Number(label="LLM Temperature", value=1.0)
+        with gr.Row():
+            llm_base_url = gr.Textbox(label="LLM Base URL")
+            llm_api_key = gr.Textbox(label="LLM API Key", type="password")
+
+        # Browser settings (collapsed by default).
+        with gr.Accordion("Browser Settings", open=False):
+            use_own_browser = gr.Checkbox(label="Use Own Browser", value=False)
+            headless = gr.Checkbox(label="Headless", value=False)
+            disable_security = gr.Checkbox(label="Disable Security", value=True)
+            with gr.Row():
+                window_w = gr.Number(label="Window Width", value=1920)
+                window_h = gr.Number(label="Window Height", value=1080)
+            save_recording_path = gr.Textbox(label="Save Recording Path", placeholder="e.g. ./tmp/record_videos",
+                                             value="./tmp/record_videos")
+        # Task description and optional extra information for the agent.
+        with gr.Accordion("Task Settings", open=True):
+            task = gr.Textbox(label="Task", lines=10,
+                              value="go to google.com and type 'OpenAI' click search and give me the first url")
+            add_infos = gr.Textbox(label="Additional Infos", lines=10)
+
+        run_button = gr.Button("Run Agent", variant="primary")
+        # Output boxes, one per element of the tuple returned by the agent runners.
+        with gr.Column():
+            final_result_output = gr.Textbox(label="Final Result", lines=5)
+            errors_output = gr.Textbox(label="Errors", lines=5, )
+            model_actions_output = gr.Textbox(label="Model Actions", lines=5)
+            model_thoughts_output = gr.Textbox(label="Model Thoughts", lines=5)
+
+        # The inputs are listed in the same order as run_browser_agent's
+        # parameters; `progress` is not listed and keeps its default value.
+        run_button.click(
+            fn=run_browser_agent,
+            inputs=[
+                agent_type,
+                llm_provider,
+                llm_model_name,
+                llm_temperature,
+                llm_base_url,
+                llm_api_key,
+                use_own_browser,
+                headless,
+                disable_security,
+                window_w,
+                window_h,
+                save_recording_path,
+                task,
+                add_infos,
+            ],
+            outputs=[final_result_output, errors_output, model_actions_output, model_thoughts_output],
+        )
+
+    # Start the web server on the address and port given on the command line.
+    demo.launch(server_name=args.ip, server_port=args.port)
+
+
+# Start the WebUI only when this file is executed as a script.
+if __name__ == '__main__':
+    main()