diff --git a/.env.example b/.env.example index e2d3a7a..2a41049 100644 --- a/.env.example +++ b/.env.example @@ -12,4 +12,7 @@ AZURE_OPENAI_API_KEY= ANONYMIZED_TELEMETRY=true # LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info -BROWSER_USE_LOGGING_LEVEL=info \ No newline at end of file +BROWSER_USE_LOGGING_LEVEL=info + +CHROME_PATH= +CHROME_USER_DATA= \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..7d5d262 --- /dev/null +++ b/README.md @@ -0,0 +1,47 @@ +# Browser-Use WebUI + +## Background + +This project builds upon the foundation of [browser-use](https://github.com/browser-use/browser-use), which is designed to make websites accessible for AI agents. We have enhanced the original capabilities by providing: + +1. **A Brand New WebUI:** We offer a comprehensive web interface that supports a wide range of `browser-use` functionalities. This UI is designed to be user-friendly and enables easy interaction with the browser agent. + +2. **Expanded LLM Support:** We've integrated support for various Large Language Models (LLMs), including Gemini, OpenAI, Azure OpenAI, and Anthropic. We plan to add support for even more models in the future. + +3. **Custom Browser Support:** You can use your own browser with our tool, eliminating the need to re-login to sites or deal with other authentication challenges. This feature also supports high-definition screen recording. + +4. **Customized Agent:** We've implemented a custom agent that enhances `browser-use` with optimized prompts. + + + +## Environment Installation + +1. **Python Version:** Ensure you have Python 3.11 or higher installed. +2. **Install `browser-use`:** + ```bash + pip install browser-use + ``` +3. **Install Playwright:** + ```bash + playwright install + ``` +4. **Install Dependencies:** + ```bash + pip install -r requirements.txt + ``` +5. 
**Configure Environment Variables:** + - Copy `.env.example` to `.env` and set your environment variables, including API keys for the LLM. + - **If using your own browser:** + - Set `CHROME_PATH` to the executable path of your browser (e.g., `C:\Program Files\Google\Chrome\Application\chrome.exe` on Windows). + - Set `CHROME_USER_DATA` to the user data directory of your browser (e.g., `C:\Users\<YourUsername>\AppData\Local\Google\Chrome\User Data`). + +## Usage + +1. **Run the WebUI:** + ```bash + python webui.py --ip 127.0.0.1 --port 7788 + ``` +2. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`. +3. **Using Your Own Browser:** + - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent. + - Check the "Use Own Browser" option within the Browser Settings. diff --git a/requirements.txt b/requirements.txt index 2e53672..eb339d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ browser-use langchain-google-genai pyperclip +gradio \ No newline at end of file diff --git a/src/utils/utils.py b/src/utils/utils.py index e81806f..1b8d7af 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -6,6 +6,8 @@ # @FileName: utils.py import base64 +import os + from langchain_openai import ChatOpenAI, AzureChatOpenAI from langchain_anthropic import ChatAnthropic from langchain_google_genai import ChatGoogleGenerativeAI @@ -19,32 +21,64 @@ def get_llm_model(provider: str, **kwargs): :return: """ if provider == 'anthropic': + if not kwargs.get("base_url", ""): + base_url = "https://api.anthropic.com" + else: + base_url = kwargs.get("base_url") + + if not kwargs.get("api_key", ""): + api_key = os.getenv("ANTHROPIC_API_KEY", "") + else: + api_key = kwargs.get("api_key") + return ChatAnthropic( model_name=kwargs.get("model_name", 'claude-3-5-sonnet-20240620'), temperature=kwargs.get("temperature", 0.0), - 
base_url=kwargs.get("base_url", "https://api.anthropic.com"), - api_key=kwargs.get("api_key", None) + base_url=base_url, + api_key=api_key ) elif provider == 'openai': + if not kwargs.get("base_url", ""): + base_url = "https://api.openai.com/v1" + else: + base_url = kwargs.get("base_url") + + if not kwargs.get("api_key", ""): + api_key = os.getenv("OPENAI_API_KEY", "") + else: + api_key = kwargs.get("api_key") + return ChatOpenAI( model=kwargs.get("model_name", 'gpt-4o'), temperature=kwargs.get("temperature", 0.0), - base_url=kwargs.get("base_url", "https://api.openai.com/v1/"), - api_key=kwargs.get("api_key", None) + base_url=base_url, + api_key=api_key ) elif provider == 'gemini': + if not kwargs.get("api_key", ""): + api_key = os.getenv("GOOGLE_API_KEY", "") + else: + api_key = kwargs.get("api_key") return ChatGoogleGenerativeAI( model=kwargs.get("model_name", 'gemini-2.0-flash-exp'), temperature=kwargs.get("temperature", 0.0), - google_api_key=kwargs.get("api_key", None), + google_api_key=api_key, ) elif provider == "azure_openai": + if not kwargs.get("base_url", ""): + base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "") + else: + base_url = kwargs.get("base_url") + if not kwargs.get("api_key", ""): + api_key = os.getenv("AZURE_OPENAI_API_KEY", "") + else: + api_key = kwargs.get("api_key") return AzureChatOpenAI( model=kwargs.get("model_name", 'gpt-4o'), temperature=kwargs.get("temperature", 0.0), api_version="2024-05-01-preview", - azure_endpoint=kwargs.get("base_url", ""), - api_key=kwargs.get("api_key", None) + azure_endpoint=base_url, + api_key=api_key ) else: raise ValueError(f'Unsupported provider: {provider}') diff --git a/tests/test_browser_use.py b/tests/test_browser_use.py index dc8a0b6..42ed270 100644 --- a/tests/test_browser_use.py +++ b/tests/test_browser_use.py @@ -106,7 +106,7 @@ async def test_browser_use_custom(): ) controller = CustomController() - use_own_browser = True + use_own_browser = False disable_security = True playwright = None 
# @Email : wenshaoguo1026@gmail.com
# @Project : browser-use-webui
# @FileName: webui.py
"""Gradio web UI that configures an LLM and runs a browser agent with it.

The UI collects LLM and browser settings plus a task description, then runs
either the stock ``browser_use`` agent ("org") or the project's custom agent
("custom") and shows the final result, errors, actions and thoughts.
"""
import pdb

from dotenv import load_dotenv

# Runs before the remaining imports, as in the original file.
load_dotenv()
import argparse

import asyncio

import gradio as gr
import asyncio
import os
from pprint import pprint
from typing import List, Dict, Any

from playwright.async_api import async_playwright
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import (
    BrowserContext,
    BrowserContextConfig,
    BrowserContextWindowSize,
)
from browser_use.agent.service import Agent

# NOTE: the next two imports rebind BrowserConfig, BrowserContext and
# BrowserContextConfig, shadowing the browser_use names imported above.
from src.browser.custom_browser import CustomBrowser, BrowserConfig
from src.browser.custom_context import BrowserContext, BrowserContextConfig
from src.controller.custom_controller import CustomController
from src.agent.custom_agent import CustomAgent
from src.agent.custom_prompts import CustomSystemPrompt

from src.utils import utils


def _to_pixels(value, default):
    """Return ``value`` as an int pixel count, or ``default`` when it is None.

    The window dimensions come from ``gr.Number`` inputs, which may deliver
    floats (e.g. ``1920.0``); those would otherwise end up verbatim in the
    ``--window-size`` Chromium argument.
    """
    return default if value is None else int(value)


async def run_browser_agent(
        agent_type,
        llm_provider,
        llm_model_name,
        llm_temperature,
        llm_base_url,
        llm_api_key,
        use_own_browser,
        headless,
        disable_security,
        window_w,
        window_h,
        save_recording_path,
        task,
        add_infos,
        progress=gr.Progress()
):
    """Build the LLM from the UI settings and dispatch to the chosen agent.

    :param agent_type: ``"org"`` for the stock agent, ``"custom"`` for the
        project's custom agent.
    :return: ``(final_result, errors, model_actions, model_thoughts)`` as
        produced by the selected runner.
    :raises ValueError: if ``agent_type`` is neither ``"org"`` nor ``"custom"``.
    """
    llm = utils.get_llm_model(
        provider=llm_provider,
        model_name=llm_model_name,
        temperature=llm_temperature,
        base_url=llm_base_url,
        api_key=llm_api_key
    )
    if agent_type == "org":
        return await run_org_agent(
            llm=llm,
            headless=headless,
            disable_security=disable_security,
            window_w=window_w,
            window_h=window_h,
            save_recording_path=save_recording_path,
            task=task,
            progress=progress,
        )
    elif agent_type == "custom":
        return await run_custom_agent(
            llm=llm,
            use_own_browser=use_own_browser,
            headless=headless,
            disable_security=disable_security,
            window_w=window_w,
            window_h=window_h,
            save_recording_path=save_recording_path,
            task=task,
            add_infos=add_infos,
            progress=progress,
        )
    else:
        raise ValueError(f"Invalid agent type: {agent_type}")


async def run_org_agent(
        llm,
        headless,
        disable_security,
        window_w,
        window_h,
        save_recording_path,
        task,
        progress
):
    """Run the stock ``Agent`` for at most 10 steps in a fresh browser.

    Exceptions from the run propagate to the caller; the browser is closed
    either way.

    :return: ``(final_result, errors, model_actions, model_thoughts)`` taken
        from the agent history.
    """
    window_w = _to_pixels(window_w, 1920)
    window_h = _to_pixels(window_h, 1080)

    browser = Browser(
        config=BrowserConfig(
            headless=headless,
            disable_security=disable_security,
            extra_chromium_args=[f'--window-size={window_w},{window_h}'],
        )
    )
    try:
        async with await browser.new_context(
                config=BrowserContextConfig(
                    trace_path='./tmp/traces',
                    save_recording_path=save_recording_path if save_recording_path else None,
                    no_viewport=False,
                    browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
                )
        ) as browser_context:
            agent = Agent(
                task=task,
                llm=llm,
                browser_context=browser_context,
            )
            history = await agent.run(max_steps=10)

            final_result = history.final_result()
            errors = history.errors()
            model_actions = history.model_actions()
            model_thoughts = history.model_thoughts()
    finally:
        # Close the browser even when the run raised, so it is not leaked.
        await browser.close()
    return final_result, errors, model_actions, model_thoughts


async def run_custom_agent(
        llm,
        use_own_browser,
        headless,
        disable_security,
        window_w,
        window_h,
        save_recording_path,
        task,
        add_infos,
        progress
):
    """Run ``CustomAgent`` for at most 10 steps, optionally in the user's Chrome.

    When ``use_own_browser`` is true, a persistent Playwright context is
    launched from the ``CHROME_PATH`` / ``CHROME_USER_DATA`` environment
    variables and handed to the custom browser. Any exception is caught and
    reported through the ``errors`` element of the result instead of being
    raised.

    :return: ``(final_result, errors, model_actions, model_thoughts)``; on
        failure ``errors`` holds the message plus traceback and the other
        three are empty strings.
    """
    controller = CustomController()
    playwright = None
    browser_context_ = None
    # Pre-bind so the cleanup below is safe if setup fails before creation.
    browser = None
    try:
        window_w = _to_pixels(window_w, 1920)
        window_h = _to_pixels(window_h, 1080)

        if use_own_browser:
            playwright = await async_playwright().start()
            chrome_exe = os.getenv("CHROME_PATH", "")
            chrome_use_data = os.getenv("CHROME_USER_DATA", "")
            browser_context_ = await playwright.chromium.launch_persistent_context(
                user_data_dir=chrome_use_data,
                executable_path=chrome_exe,
                no_viewport=False,
                headless=headless,  # keep the browser window visible unless headless is set
                user_agent=(
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                    '(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
                ),
                java_script_enabled=True,
                bypass_csp=disable_security,
                ignore_https_errors=disable_security,
                record_video_dir=save_recording_path if save_recording_path else None,
                record_video_size={'width': window_w, 'height': window_h}
            )

        browser = CustomBrowser(
            config=BrowserConfig(
                headless=headless,
                disable_security=disable_security,
                extra_chromium_args=[f'--window-size={window_w},{window_h}'],
            )
        )
        async with await browser.new_context(
                config=BrowserContextConfig(
                    trace_path='./tmp/result_processing',
                    save_recording_path=save_recording_path if save_recording_path else None,
                    no_viewport=False,
                    browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
                ),
                context=browser_context_
        ) as browser_context:
            agent = CustomAgent(
                task=task,
                add_infos=add_infos,
                llm=llm,
                browser_context=browser_context,
                controller=controller,
                system_prompt_class=CustomSystemPrompt
            )
            history = await agent.run(max_steps=10)

            final_result = history.final_result()
            errors = history.errors()
            model_actions = history.model_actions()
            model_thoughts = history.model_thoughts()

    except Exception as e:
        # UI boundary: report the failure in the "Errors" box instead of raising.
        import traceback
        traceback.print_exc()
        final_result = ""
        errors = str(e) + "\n" + traceback.format_exc()
        model_actions = ""
        model_thoughts = ""
    finally:
        # Explicitly close the persistent context.
        if browser_context_:
            await browser_context_.close()

        # Stop the Playwright driver started for the persistent context.
        if playwright:
            await playwright.stop()
        # The browser may never have been created if setup failed early.
        if browser:
            await browser.close()
    return final_result, errors, model_actions, model_thoughts


def main():
    """Parse ``--ip`` / ``--port``, build the Gradio UI and launch the server."""
    parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent")
    parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to")
    parser.add_argument("--port", type=int, default=7788, help="Port to listen on")
    args = parser.parse_args()

    # Reloads the page with ?__theme=dark unless that parameter is already set.
    js_func = """
    function refresh() {
        const url = new URL(window.location);

        if (url.searchParams.get('__theme') !== 'dark') {
            url.searchParams.set('__theme', 'dark');
            window.location.href = url.href;
        }
    }
    """

    # Gradio UI setup
    # NOTE(review): the nesting of rows/accordions was reconstructed from a
    # whitespace-collapsed diff -- confirm against the intended layout.
    with gr.Blocks(title="Browser Use WebUI", theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")]),
                   js=js_func) as demo:
        # NOTE(review): the heading markup was stripped in the reviewed text;
        # a centred <h1> is assumed here -- confirm the original tags.
        gr.Markdown("<center><h1>Browser Use WebUI</h1></center>")
        with gr.Row():
            agent_type = gr.Radio(["org", "custom"], label="Agent Type", value="custom")
        with gr.Row():
            llm_provider = gr.Dropdown(
                ["anthropic", "openai", "gemini", "azure_openai"], label="LLM Provider", value="gemini"
            )
            llm_model_name = gr.Textbox(label="LLM Model Name", value="gemini-2.0-flash-exp")
            llm_temperature = gr.Number(label="LLM Temperature", value=1.0)
        with gr.Row():
            llm_base_url = gr.Textbox(label="LLM Base URL")
            llm_api_key = gr.Textbox(label="LLM API Key", type="password")

        with gr.Accordion("Browser Settings", open=False):
            use_own_browser = gr.Checkbox(label="Use Own Browser", value=False)
            headless = gr.Checkbox(label="Headless", value=False)
            disable_security = gr.Checkbox(label="Disable Security", value=True)
            with gr.Row():
                window_w = gr.Number(label="Window Width", value=1920)
                window_h = gr.Number(label="Window Height", value=1080)
            save_recording_path = gr.Textbox(label="Save Recording Path", placeholder="e.g. ./tmp/record_videos",
                                             value="./tmp/record_videos")
        with gr.Accordion("Task Settings", open=True):
            task = gr.Textbox(label="Task", lines=10,
                              value="go to google.com and type 'OpenAI' click search and give me the first url")
            add_infos = gr.Textbox(label="Additional Infos", lines=10)

        run_button = gr.Button("Run Agent", variant="primary")
        with gr.Column():
            final_result_output = gr.Textbox(label="Final Result", lines=5)
            errors_output = gr.Textbox(label="Errors", lines=5, )
            model_actions_output = gr.Textbox(label="Model Actions", lines=5)
            model_thoughts_output = gr.Textbox(label="Model Thoughts", lines=5)

        # The input order must match run_browser_agent's positional parameters.
        run_button.click(
            fn=run_browser_agent,
            inputs=[
                agent_type,
                llm_provider,
                llm_model_name,
                llm_temperature,
                llm_base_url,
                llm_api_key,
                use_own_browser,
                headless,
                disable_security,
                window_w,
                window_h,
                save_recording_path,
                task,
                add_infos,
            ],
            outputs=[final_result_output, errors_output, model_actions_output, model_thoughts_output],
        )

    demo.launch(server_name=args.ip, server_port=args.port)


if __name__ == '__main__':
    main()