This commit is contained in:
katiue
2025-01-07 21:44:26 +07:00
7 changed files with 106 additions and 30 deletions

View File

@@ -5,4 +5,56 @@ sdk: gradio
sdk_version: 5.9.1
python_version: 3.12
startup_duration_timeout: 2h
---
---
# Browser-Use WebUI
## Background
This project builds upon the foundation of the [browser-use](https://github.com/browser-use/browser-use), which is designed to make websites accessible for AI agents. We have enhanced the original capabilities by providing:
1. **A Brand New WebUI:** We offer a comprehensive web interface that supports a wide range of `browser-use` functionalities. This UI is designed to be user-friendly and enables easy interaction with the browser agent.
2. **Expanded LLM Support:** We've integrated support for various Large Language Models (LLMs), including: Gemini, OpenAI, Azure OpenAI, Anthropic, DeepSeek, Ollama etc. And we plan to add support for even more models in the future.
3. **Custom Browser Support:** You can use your own browser with our tool, eliminating the need to re-login to sites or deal with other authentication challenges. This feature also supports high-definition screen recording.
4. **Customized Agent:** We've implemented a custom agent that enhances `browser-use` with optimized prompts.
<video src="https://github.com/user-attachments/assets/58c0f59e-02b4-4413-aba8-6184616bf181" controls="controls" width="500" height="300" >Your browser does not support playing this video!</video>
**Changelog**
- [x] **2025/01/06:** Thanks to @richard-devbot, a New and Well-Designed WebUI is released. [Video tutorial demo](https://github.com/warmshao/browser-use-webui/issues/1#issuecomment-2573393113).
## Environment Installation
1. **Python Version:** Ensure you have Python 3.11 or higher installed.
2. **Install `browser-use`:**
```bash
pip install browser-use
```
3. **Install Playwright:**
```bash
playwright install
```
4. **Install Dependencies:**
```bash
pip install -r requirements.txt
```
5. **Configure Environment Variables:**
- Copy `.env.example` to `.env` and set your environment variables, including API keys for the LLM.
- **If using your own browser:**
- Set `CHROME_PATH` to the executable path of your browser (e.g., `C:\Program Files\Google\Chrome\Application\chrome.exe` on Windows).
- Set `CHROME_USER_DATA` to the user data directory of your browser (e.g., `C:\Users\<YourUsername>\AppData\Local\Google\Chrome\User Data`).
## Usage
1. **Run the WebUI:**
```bash
python webui.py --ip 127.0.0.1 --port 7788
```
2. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
3. **Using Your Own Browser:**
- Close all Chrome windows
- Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
- Check the "Use Own Browser" option within the Browser Settings.

View File

@@ -3,4 +3,6 @@ langchain-google-genai
pyperclip
gradio
python-dotenv
argparse
argparse
langchain-ollama

View File

@@ -82,7 +82,7 @@ class CustomSystemPrompt(SystemPrompt):
- sometimes labels overlap, so use the context to verify the correct element
7. Form filling:
- If you fill a input field and your action sequence is interrupted, most often a list with suggestions poped up under the field and you need to first select the right element from the suggestion list.
- If you fill an input field and your action sequence is interrupted, most often a list with suggestions poped up under the field and you need to first select the right element from the suggestion list.
8. ACTION SEQUENCING:
- Actions are executed in the order they appear in the list

View File

@@ -11,6 +11,7 @@ import os
from langchain_openai import ChatOpenAI, AzureChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_ollama import ChatOllama
def get_llm_model(provider: str, **kwargs):
@@ -39,7 +40,7 @@ def get_llm_model(provider: str, **kwargs):
)
elif provider == 'openai':
if not kwargs.get("base_url", ""):
base_url = "https://api.openai.com/v1"
base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1")
else:
base_url = kwargs.get("base_url")
@@ -66,7 +67,7 @@ def get_llm_model(provider: str, **kwargs):
api_key = kwargs.get("api_key")
return ChatOpenAI(
model=kwargs.get("model_name", 'gpt-4o'),
model=kwargs.get("model_name", 'deepseek-chat'),
temperature=kwargs.get("temperature", 0.0),
base_url=base_url,
api_key=api_key
@@ -81,6 +82,11 @@ def get_llm_model(provider: str, **kwargs):
temperature=kwargs.get("temperature", 0.0),
google_api_key=api_key,
)
elif provider == 'ollama':
return ChatOllama(
model=kwargs.get("model_name", 'qwen2.5:7b'),
temperature=kwargs.get("temperature", 0.0),
)
elif provider == "azure_openai":
if not kwargs.get("base_url", ""):
base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "")

View File

@@ -105,9 +105,15 @@ async def test_browser_use_custom():
# api_key=os.getenv("GOOGLE_API_KEY", "")
# )
# llm = utils.get_llm_model(
# provider="deepseek",
# model_name="deepseek-chat",
# temperature=0.8
# )
llm = utils.get_llm_model(
provider="deepseek",
model_name="deepseek-chat",
provider="ollama",
model_name="qwen2.5:7b",
temperature=0.8
)

View File

@@ -106,7 +106,6 @@ def test_deepseek_model():
base_url=os.getenv("DEEPSEEK_ENDPOINT", ""),
api_key=os.getenv("DEEPSEEK_API_KEY", "")
)
pdb.set_trace()
message = HumanMessage(
content=[
{"type": "text", "text": "who are you?"}
@@ -116,8 +115,17 @@ def test_deepseek_model():
print(ai_msg.content)
def test_ollama_model():
from langchain_ollama import ChatOllama
llm = ChatOllama(model="qwen2.5:7b")
ai_msg = llm.invoke("Sing a ballad of LangChain.")
print(ai_msg.content)
if __name__ == '__main__':
# test_openai_model()
# test_gemini_model()
# test_azure_openai_model()
test_deepseek_model()
# test_deepseek_model()
test_ollama_model()

View File

@@ -46,10 +46,14 @@ async def run_browser_agent(
use_vision,
browser_context=None # Added optional argument
):
"""
Runs the browser agent based on user configurations.
"""
# Ensure the recording directory exists
os.makedirs(save_recording_path, exist_ok=True)
# Get the list of existing videos before the agent runs
existing_videos = set(glob.glob(os.path.join(save_recording_path, '*.[mM][pP]4')) +
glob.glob(os.path.join(save_recording_path, '*.[wW][eE][bB][mM]')))
# Run the agent
llm = utils.get_llm_model(
provider=llm_provider,
model_name=llm_model_name,
@@ -58,7 +62,7 @@ async def run_browser_agent(
api_key=llm_api_key
)
if agent_type == "org":
return await run_org_agent(
final_result, errors, model_actions, model_thoughts = await run_org_agent(
llm=llm,
headless=headless,
disable_security=disable_security,
@@ -71,7 +75,7 @@ async def run_browser_agent(
browser_context=browser_context # pass context
)
elif agent_type == "custom":
return await run_custom_agent(
final_result, errors, model_actions, model_thoughts = await run_custom_agent(
llm=llm,
use_own_browser=use_own_browser,
headless=headless,
@@ -88,6 +92,16 @@ async def run_browser_agent(
else:
raise ValueError(f"Invalid agent type: {agent_type}")
# Get the list of videos after the agent runs
new_videos = set(glob.glob(os.path.join(save_recording_path, '*.[mM][pP]4')) +
glob.glob(os.path.join(save_recording_path, '*.[wW][eE][bB][mM]')))
# Find the newly created video
latest_video = None
if new_videos - existing_videos:
latest_video = list(new_videos - existing_videos)[0] # Get the first new video
return final_result, errors, model_actions, model_thoughts, latest_video
async def run_org_agent(
llm,
@@ -420,22 +434,10 @@ def main():
run_button.click(
fn=run_with_stream,
inputs=[
agent_type,
llm_provider,
llm_model_name,
llm_temperature,
llm_base_url,
llm_api_key,
use_own_browser,
headless,
disable_security,
window_w,
window_h,
save_recording_path,
task,
add_infos,
max_steps,
use_vision
agent_type, llm_provider, llm_model_name, llm_temperature,
llm_base_url, llm_api_key, use_own_browser, headless,
disable_security, window_w, window_h, save_recording_path,
task, add_infos, max_steps, use_vision
],
outputs=[
browser_view,