diff --git a/.env.example b/.env.example index e13240b..2ebe67b 100644 --- a/.env.example +++ b/.env.example @@ -17,5 +17,17 @@ ANONYMIZED_TELEMETRY=true # LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info BROWSER_USE_LOGGING_LEVEL=info +# Chrome settings CHROME_PATH= -CHROME_USER_DATA= \ No newline at end of file +CHROME_USER_DATA= +CHROME_DEBUGGING_PORT=9222 +CHROME_DEBUGGING_HOST=localhost +CHROME_PERSISTENT_SESSION=false # Set to true to keep browser open between AI tasks + +# Display settings +RESOLUTION=1920x1080x24 # Format: WIDTHxHEIGHTxDEPTH +RESOLUTION_WIDTH=1920 # Width in pixels +RESOLUTION_HEIGHT=1080 # Height in pixels + +# VNC settings +VNC_PASSWORD=youvncpassword \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..af1d438 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,82 @@ +FROM python:3.11-slim + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + wget \ + gnupg \ + curl \ + unzip \ + xvfb \ + libgconf-2-4 \ + libxss1 \ + libnss3 \ + libnspr4 \ + libasound2 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libdbus-1-3 \ + libdrm2 \ + libgbm1 \ + libgtk-3-0 \ + libxcomposite1 \ + libxdamage1 \ + libxfixes3 \ + libxrandr2 \ + xdg-utils \ + fonts-liberation \ + dbus \ + xauth \ + xvfb \ + x11vnc \ + tigervnc-tools \ + supervisor \ + net-tools \ + procps \ + git \ + python3-numpy \ + && rm -rf /var/lib/apt/lists/* + +# Install noVNC +RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \ + && git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \ + && ln -s /opt/novnc/vnc.html /opt/novnc/index.html + +# Install Chrome +RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ + && echo "deb http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list \ + && apt-get update \ + && apt-get install -y google-chrome-stable \ + 
&& rm -rf /var/lib/apt/lists/* + +# Set up working directory +WORKDIR /app + +# Copy requirements and install Python dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Install Playwright and browsers with system dependencies +ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright +RUN playwright install --with-deps chromium +RUN playwright install-deps + +# Copy the application code +COPY . . + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV BROWSER_USE_LOGGING_LEVEL=info +ENV CHROME_PATH=/usr/bin/google-chrome +ENV ANONYMIZED_TELEMETRY=false +ENV DISPLAY=:99 +ENV RESOLUTION=1920x1080x24 +ENV VNC_PASSWORD=vncpassword + +# Set up supervisor configuration +RUN mkdir -p /var/log/supervisor +COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf + +EXPOSE 7788 6080 5900 + +CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] \ No newline at end of file diff --git a/README.md b/README.md index 1ebee46..184eeb9 100644 --- a/README.md +++ b/README.md @@ -17,9 +17,13 @@ We would like to officially thank [WarmShao](https://github.com/warmshao) for hi **Custom Browser Support:** You can use your own browser with our tool, eliminating the need to re-login to sites or deal with other authentication challenges. This feature also supports high-definition screen recording. - +**Persistent Browser Sessions:** You can choose to keep the browser window open between AI tasks, allowing you to see the complete history and state of AI interactions. -## Installation Guide + + +## Installation Options + +### Option 1: Local Installation Read the [quickstart guide](https://docs.browser-use.com/quickstart#prepare-the-environment) or follow the steps below to get started. @@ -49,84 +53,132 @@ Then install playwright: playwright install ``` +### Option 2: Docker Installation + +1. **Prerequisites:** + - Docker and Docker Compose installed on your system + - Git to clone the repository + +2. 
**Setup:** + ```bash + # Clone the repository + git clone https://github.com/browser-use/web-ui.git + cd web-ui + + # Copy and configure environment variables + cp .env.example .env + # Edit .env with your preferred text editor and add your API keys + ``` + +3. **Run with Docker:** + ```bash + # Build and start the container with default settings (browser closes after AI tasks) + docker compose up --build + + # Or run with persistent browser (browser stays open between AI tasks) + CHROME_PERSISTENT_SESSION=true docker compose up --build + ``` + +4. **Access the Application:** + - WebUI: `http://localhost:7788` + - VNC Viewer (to see browser interactions): `http://localhost:6080/vnc.html` + + Default VNC password is "vncpassword". You can change it by setting the `VNC_PASSWORD` environment variable in your `.env` file. + + ## Usage -1. **Run the WebUI:** +### Local Setup +1. Copy `.env.example` to `.env` and set your environment variables, including API keys for the LLM. `cp .env.example .env` +2. **Run the WebUI:** ```bash python webui.py --ip 127.0.0.1 --port 7788 ``` -2. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`. -3. **Using Your Own Browser:** - - Close all chrome windows +4. WebUI options: + - `--ip`: The IP address to bind the WebUI to. Default is `127.0.0.1`. + - `--port`: The port to bind the WebUI to. Default is `7788`. + - `--theme`: The theme for the user interface. Default is `Ocean`. + - **Default**: The standard theme with a balanced design. + - **Soft**: A gentle, muted color scheme for a relaxed viewing experience. + - **Monochrome**: A grayscale theme with minimal color for simplicity and focus. + - **Glass**: A sleek, semi-transparent design for a modern appearance. + - **Origin**: A classic, retro-inspired theme for a nostalgic feel. + - **Citrus**: A vibrant, citrus-inspired palette with bright and fresh colors. + - **Ocean** (default): A blue, ocean-inspired theme providing a calming effect. 
+ - `--dark-mode`: Enables dark mode for the user interface. +3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`. +4. **Using Your Own Browser (Optional):** + - Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. + - Windows + ```env + CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe" + CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data" + ``` + > Note: Replace `YourUsername` with your actual Windows username for Windows systems. + - Mac + ```env + CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" + CHROME_USER_DATA="~/Library/Application Support/Google/Chrome/Profile 1" + ``` + - Close all Chrome windows - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent. - Check the "Use Own Browser" option within the Browser Settings. +5. **Keep Browser Open (Optional):** + - Set `CHROME_PERSISTENT_SESSION=true` in the `.env` file. -### Options: +### Docker Setup +1. **Environment Variables:** + - All configuration is done through the `.env` file + - Available environment variables: + ``` + # LLM API Keys + OPENAI_API_KEY=your_key_here + ANTHROPIC_API_KEY=your_key_here + GOOGLE_API_KEY=your_key_here -### `--theme` + # Browser Settings + CHROME_PERSISTENT_SESSION=true # Set to true to keep browser open between AI tasks + RESOLUTION=1920x1080x24 # Custom resolution format: WIDTHxHEIGHTxDEPTH + RESOLUTION_WIDTH=1920 # Custom width in pixels + RESOLUTION_HEIGHT=1080 # Custom height in pixels -- **Type**: `str` -- **Default**: `Ocean` -- **Description**: Specifies the theme for the user interface. -- **Options**: - The available themes are defined in the `theme_map` dictionary. Below are the options you can choose from: - - **Default**: The standard theme with a balanced design. 
- - **Soft**: A gentle, muted color scheme for a relaxed viewing experience. - - **Monochrome**: A grayscale theme with minimal color for simplicity and focus. - - **Glass**: A sleek, semi-transparent design for a modern appearance. - - **Origin**: A classic, retro-inspired theme for a nostalgic feel. - - **Citrus**: A vibrant, citrus-inspired palette with bright and fresh colors. - - **Ocean** (default): A blue, ocean-inspired theme providing a calming effect. + # VNC Settings + VNC_PASSWORD=your_vnc_password # Optional, defaults to "vncpassword" + ``` -**Example**: +2. **Browser Persistence Modes:** + - **Default Mode (CHROME_PERSISTENT_SESSION=false):** + - Browser opens and closes with each AI task + - Clean state for each interaction + - Lower resource usage -```bash -python webui.py --ip 127.0.0.1 --port 7788 --theme Glass -``` + - **Persistent Mode (CHROME_PERSISTENT_SESSION=true):** + - Browser stays open between AI tasks + - Maintains history and state + - Allows viewing previous AI interactions + - Set in `.env` file or via environment variable when starting container -### `--dark-mode` +3. **Viewing Browser Interactions:** + - Access the noVNC viewer at `http://localhost:6080/vnc.html` + - Enter the VNC password (default: "vncpassword" or what you set in VNC_PASSWORD) + - You can now see all browser interactions in real-time -- **Type**: `boolean` -- **Default**: Disabled -- **Description**: Enables dark mode for the user interface. This is a simple toggle; including the flag activates dark mode, while omitting it keeps the interface in light mode. -- **Options**: - - **Enabled (`--dark-mode`)**: Activates dark mode, switching the interface to a dark color scheme for better visibility in low-light environments. - - **Disabled (default)**: Keeps the interface in the default light mode. +4. 
**Container Management:** + ```bash + # Start with persistent browser + CHROME_PERSISTENT_SESSION=true docker compose up -d -**Example**: + # Start with default mode (browser closes after tasks) + docker compose up -d -```bash -python webui.py --ip 127.0.0.1 --port 7788 --dark-mode -``` + # View logs + docker compose logs -f -## (Optional) Configure Environment Variables - -Copy `.env.example` to `.env` and set your environment variables, including API keys for the LLM. With - -```bash -cp .env.example .env -``` - -**If using your own browser:** - Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. - -You can just copy examples down below to your `.env` file. - -### Windows - -```env -CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe" -CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data" -``` - -> Note: Replace `YourUsername` with your actual Windows username for Windows systems. - -### Mac - -```env -CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" -CHROME_USER_DATA="~/Library/Application Support/Google/Chrome/Profile 1" -``` + # Stop the container + docker compose down + ``` ## Changelog -- [x] **2025/01/06:** Thanks to @richard-devbot, a New and Well-Designed WebUI is released. [Video tutorial demo](https://github.com/warmshao/browser-use-webui/issues/1#issuecomment-2573393113). +- [x] **2025/01/10:** Thanks to @casistack. Now we have a Docker setup option and also support keeping the browser open between tasks. [Video tutorial demo](https://github.com/browser-use/web-ui/issues/1#issuecomment-2582511750). +- [x] **2025/01/06:** Thanks to @richard-devbot. A New and Well-Designed WebUI is released. [Video tutorial demo](https://github.com/warmshao/browser-use-webui/issues/1#issuecomment-2573393113). 
\ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..6253a4a --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,51 @@ +services: + browser-use-webui: + build: + context: . + dockerfile: Dockerfile + ports: + - "7788:7788" # Gradio default port + - "6080:6080" # noVNC web interface + - "5900:5900" # VNC port + - "9222:9222" # Chrome remote debugging port + environment: + - OPENAI_ENDPOINT=${OPENAI_ENDPOINT:-https://api.openai.com/v1} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} + - GOOGLE_API_KEY=${GOOGLE_API_KEY:-} + - AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_ENDPOINT:-} + - AZURE_OPENAI_API_KEY=${AZURE_OPENAI_API_KEY:-} + - DEEPSEEK_ENDPOINT=${DEEPSEEK_ENDPOINT:-https://api.deepseek.com} + - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-} + - BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info} + - ANONYMIZED_TELEMETRY=false + - CHROME_PATH=/usr/bin/google-chrome + - CHROME_USER_DATA=/app/data/chrome_data + - CHROME_PERSISTENT_SESSION=${CHROME_PERSISTENT_SESSION:-false} + - DISPLAY=:99 + - PLAYWRIGHT_BROWSERS_PATH=/ms-playwright + - RESOLUTION=${RESOLUTION:-1920x1080x24} + - RESOLUTION_WIDTH=${RESOLUTION_WIDTH:-1920} + - RESOLUTION_HEIGHT=${RESOLUTION_HEIGHT:-1080} + - VNC_PASSWORD=${VNC_PASSWORD:-vncpassword} + - PERSISTENT_BROWSER_PORT=9222 + - PERSISTENT_BROWSER_HOST=localhost + - CHROME_DEBUGGING_PORT=9222 + - CHROME_DEBUGGING_HOST=localhost + volumes: + - ./data:/app/data + - ./data/chrome_data:/app/data/chrome_data + - /tmp/.X11-unix:/tmp/.X11-unix + restart: unless-stopped + shm_size: '2gb' + cap_add: + - SYS_ADMIN + security_opt: + - seccomp=unconfined + tmpfs: + - /tmp + healthcheck: + test: ["CMD", "nc", "-z", "localhost", "5900"] + interval: 10s + timeout: 5s + retries: 3 \ No newline at end of file diff --git a/src/browser/config.py b/src/browser/config.py new file mode 100644 index 0000000..32329c4 --- /dev/null +++ b/src/browser/config.py @@ -0,0 +1,30 @@ 
+# -*- coding: utf-8 -*- +# @Time : 2025/1/6 +# @Author : wenshao +# @ProjectName: browser-use-webui +# @FileName: config.py + +import os +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class BrowserPersistenceConfig: + """Configuration for browser persistence""" + + persistent_session: bool = False + user_data_dir: Optional[str] = None + debugging_port: Optional[int] = None + debugging_host: Optional[str] = None + + @classmethod + def from_env(cls) -> "BrowserPersistenceConfig": + """Create config from environment variables""" + return cls( + persistent_session=os.getenv("CHROME_PERSISTENT_SESSION", "").lower() + == "true", + user_data_dir=os.getenv("CHROME_USER_DATA"), + debugging_port=int(os.getenv("CHROME_DEBUGGING_PORT", "9222")), + debugging_host=os.getenv("CHROME_DEBUGGING_HOST", "localhost"), + ) \ No newline at end of file diff --git a/src/browser/custom_browser.py b/src/browser/custom_browser.py index 790eb95..287cd06 100644 --- a/src/browser/custom_browser.py +++ b/src/browser/custom_browser.py @@ -6,15 +6,45 @@ from browser_use.browser.browser import Browser from browser_use.browser.context import BrowserContext, BrowserContextConfig +from playwright.async_api import BrowserContext as PlaywrightBrowserContext +import logging +from .config import BrowserPersistenceConfig from .custom_context import CustomBrowserContext +logger = logging.getLogger(__name__) class CustomBrowser(Browser): + _global_context = None + async def new_context( self, config: BrowserContextConfig = BrowserContextConfig(), - context: CustomBrowserContext = None, - ) -> BrowserContext: - """Create a browser context""" + context: PlaywrightBrowserContext = None, + ) -> CustomBrowserContext: + """Create a browser context with persistence support""" + persistence_config = BrowserPersistenceConfig.from_env() + + if persistence_config.persistent_session: + if CustomBrowser._global_context is not None: + logger.info("Reusing existing persistent browser 
context") + return CustomBrowser._global_context + + context_instance = CustomBrowserContext(config=config, browser=self, context=context) + CustomBrowser._global_context = context_instance + logger.info("Created new persistent browser context") + return context_instance + + logger.info("Creating non-persistent browser context") return CustomBrowserContext(config=config, browser=self, context=context) + + async def close(self): + """Override close to respect persistence setting""" + persistence_config = BrowserPersistenceConfig.from_env() + if not persistence_config.persistent_session: + if CustomBrowser._global_context is not None: + await CustomBrowser._global_context.close() + CustomBrowser._global_context = None + await super().close() + else: + logger.info("Skipping browser close due to persistent session") \ No newline at end of file diff --git a/src/browser/custom_context.py b/src/browser/custom_context.py index 2fe7e7c..b46dddb 100644 --- a/src/browser/custom_context.py +++ b/src/browser/custom_context.py @@ -9,84 +9,77 @@ import json import logging import os -from playwright.async_api import Browser as PlaywrightBrowser, Page, BrowserContext as PlaywrightContext from browser_use.browser.browser import Browser from browser_use.browser.context import BrowserContext, BrowserContextConfig +from playwright.async_api import Browser as PlaywrightBrowser +from playwright.async_api import BrowserContext as PlaywrightBrowserContext + +from .config import BrowserPersistenceConfig logger = logging.getLogger(__name__) + + class CustomBrowserContext(BrowserContext): def __init__( self, - browser: "CustomBrowser", # Forward declaration for CustomBrowser + browser: "Browser", config: BrowserContextConfig = BrowserContextConfig(), - context: PlaywrightContext = None + context: PlaywrightBrowserContext = None, ): super(CustomBrowserContext, self).__init__(browser=browser, config=config) - self.context = context # Rename to avoid confusion + self.context = context self._page 
= None + self._persistence_config = BrowserPersistenceConfig.from_env() @property - def impl_context(self) -> PlaywrightContext: + def impl_context(self) -> PlaywrightBrowserContext: """Returns the underlying Playwright context implementation""" return self.context - async def _create_context(self, browser: PlaywrightBrowser = None): + async def _create_context(self, browser: PlaywrightBrowser) -> PlaywrightBrowserContext: """Creates a new browser context with anti-detection measures and loads cookies if available.""" if self.context: + logger.info("Browser context already exists, returning existing context.") return self.context - # If a Playwright browser is not provided, get it from our custom browser - pw_browser = browser or await self.browser.get_playwright_browser() - - context_args = { - 'viewport': self.config.browser_window_size, - 'no_viewport': False, - 'bypass_csp': self.config.disable_security, - 'ignore_https_errors': self.config.disable_security - } - - if self.config.save_recording_path: - context_args.update({ - 'record_video_dir': self.config.save_recording_path, - 'record_video_size': self.config.browser_window_size - }) - - self.context = await pw_browser.new_context(**context_args) + # Check for persistent context + if self._persistence_config.persistent_session and len(browser.contexts) > 0: + logger.info("Using existing persistent context.") + self.context = browser.contexts[0] + else: + logger.info("Creating a new browser context.") + self.context = await browser.new_context( + viewport=self.config.browser_window_size, + no_viewport=False, + user_agent=( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36" + ), + java_script_enabled=True, + bypass_csp=self.config.disable_security, + ignore_https_errors=self.config.disable_security, + record_video_dir=self.config.save_recording_path, + record_video_size=self.config.browser_window_size, + ) + # Handle tracing if 
self.config.trace_path: await self.context.tracing.start(screenshots=True, snapshots=True, sources=True) - # Load cookies if they exist + # Load cookies if self.config.cookies_file and os.path.exists(self.config.cookies_file): with open(self.config.cookies_file, "r") as f: cookies = json.load(f) - logger.info( - f"Loaded {len(cookies)} cookies from {self.config.cookies_file}" - ) + logger.info(f"Loaded {len(cookies)} cookies from {self.config.cookies_file}.") await self.context.add_cookies(cookies) - # Expose anti-detection scripts + # Inject anti-detection scripts await self.context.add_init_script( """ - // Webdriver property - Object.defineProperty(navigator, 'webdriver', { - get: () => undefined - }); - - // Languages - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en'] - }); - - // Plugins - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5] - }); - - // Chrome runtime + Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); + Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); + Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); window.chrome = { runtime: {} }; - - // Permissions const originalQuery = window.navigator.permissions.query; window.navigator.permissions.query = (parameters) => ( parameters.name === 'notifications' ? 
@@ -96,42 +89,39 @@ class CustomBrowserContext(BrowserContext): """ ) - # Create an initial page - self._page = await self.context.new_page() - await self._page.goto('about:blank') # Ensure page is ready - + # Create initial page if none exists + if not self.context.pages: + self._page = await self.context.new_page() + await self._page.goto('about:blank') + return self.context - async def new_page(self) -> Page: - """Creates and returns a new page in this context""" + async def new_page(self): + """Creates and returns a new page in this context.""" if not self.context: - await self._create_context() + await self._create_context(await self.browser.get_playwright_browser()) return await self.context.new_page() - async def __aenter__(self): + async def get_current_page(self): + """Returns the current page or creates one if none exists.""" if not self.context: - await self._create_context() - return self + await self._create_context(await self.browser.get_playwright_browser()) + pages = self.context.pages + if not pages: + logger.warning("No existing pages in the context. 
Creating a new page.") + return await self.context.new_page() + return pages[0] - async def __aexit__(self, *args): - if self.context: + async def close(self): + """Override close to respect persistence setting.""" + if not self._persistence_config.persistent_session and self.context: await self.context.close() self.context = None @property def pages(self): - """Returns list of pages in context""" - return self.context.pages if self.context else [] - - async def get_state(self, **kwargs): - if self.context: - pages = self.context.pages - if pages: - return await super().get_state(**kwargs) - return None - - async def get_pages(self): - """Get pages in a way that works""" + """Returns list of pages in the context.""" if not self.context: + logger.warning("Attempting to access pages but context is not initialized.") return [] return self.context.pages diff --git a/src/utils/stream_utils.py b/src/utils/stream_utils.py index e3bdc22..f4dde56 100644 --- a/src/utils/stream_utils.py +++ b/src/utils/stream_utils.py @@ -3,24 +3,38 @@ import asyncio from typing import AsyncGenerator from playwright.async_api import BrowserContext, Error as PlaywrightError -async def capture_screenshot(browser_context: BrowserContext) -> str: +async def capture_screenshot(browser_context) -> str: """Capture and encode a screenshot""" try: - # Get the implementation context - context = getattr(browser_context, 'impl_context', None) + # Get the implementation context - handle both direct Playwright context and wrapped context + context = browser_context + if hasattr(browser_context, 'context'): + context = browser_context.context + if not context: - return "
No browser context implementation available
" + return "
No browser context available
" # Get all pages - all_pages = context.pages - if not all_pages: + pages = context.pages + if not pages: return "
Waiting for page to be available...
" - # Use the first page - for page in all_pages: + + # Use the first non-blank page or fallback to first page + active_page = None + for page in pages: if page.url != 'about:blank': + active_page = page break + + if not active_page and pages: + active_page = pages[0] + + if not active_page: + return "
No active page available
" + + # Take screenshot try: - screenshot = await page.screenshot( + screenshot = await active_page.screenshot( type='jpeg', quality=75, scale="css" @@ -29,5 +43,6 @@ async def capture_screenshot(browser_context: BrowserContext) -> str: return f'' except Exception as e: return f"
Screenshot failed: {str(e)}
" + except Exception as e: return f"
Screenshot error: {str(e)}
" \ No newline at end of file diff --git a/supervisord.conf b/supervisord.conf new file mode 100644 index 0000000..ff884c8 --- /dev/null +++ b/supervisord.conf @@ -0,0 +1,83 @@ +[supervisord] +nodaemon=true +logfile=/dev/stdout +logfile_maxbytes=0 +loglevel=debug + +[program:xvfb] +command=Xvfb :99 -screen 0 %(ENV_RESOLUTION)s -ac +extension GLX +render -noreset +autorestart=true +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 +priority=100 +startsecs=3 + +[program:vnc_setup] +command=bash -c "mkdir -p ~/.vnc && echo '%(ENV_VNC_PASSWORD)s' | vncpasswd -f > ~/.vnc/passwd && chmod 600 ~/.vnc/passwd && ls -la ~/.vnc/passwd" +autorestart=false +startsecs=0 +priority=150 +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 + +[program:x11vnc] +command=bash -c "sleep 3 && DISPLAY=:99 x11vnc -display :99 -forever -shared -rfbauth /root/.vnc/passwd -rfbport 5900 -bg -o /var/log/x11vnc.log" +autorestart=true +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 +priority=200 +startretries=5 +startsecs=5 +depends_on=vnc_setup + +[program:x11vnc_log] +command=tail -f /var/log/x11vnc.log +autorestart=true +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 +priority=250 + +[program:novnc] +command=bash -c "sleep 5 && cd /opt/novnc && ./utils/novnc_proxy --vnc localhost:5900 --listen 0.0.0.0:6080 --web /opt/novnc" +autorestart=true +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 +priority=300 +startretries=5 +startsecs=3 +depends_on=x11vnc + +[program:persistent_browser] +command=bash -c 'if [ "%(ENV_CHROME_PERSISTENT_SESSION)s" = "true" ]; then mkdir -p /app/data/chrome_data && sleep 8 && google-chrome --user-data-dir=/app/data/chrome_data --window-position=0,0 
--window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=9222 --remote-debugging-address=0.0.0.0 "data:text/html,

Browser Ready for AI Interaction

"; else echo "Persistent browser disabled"; fi' +autorestart=%(ENV_CHROME_PERSISTENT_SESSION)s +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 +priority=350 +startretries=3 +startsecs=3 +depends_on=novnc + +[program:webui] +command=python webui.py --ip 0.0.0.0 --port 7788 +directory=/app +autorestart=true +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 +priority=400 +startretries=3 +startsecs=3 +depends_on=persistent_browser \ No newline at end of file diff --git a/webui.py b/webui.py index b41b827..49ff324 100644 --- a/webui.py +++ b/webui.py @@ -30,6 +30,16 @@ from src.browser.custom_context import BrowserContextConfig from src.controller.custom_controller import CustomController from src.utils import utils from src.utils.utils import update_model_dropdown +from src.browser.config import BrowserPersistenceConfig +from src.browser.custom_browser import CustomBrowser +from src.browser.custom_context import CustomBrowserContext +from browser_use.browser.browser import BrowserConfig +from browser_use.browser.context import BrowserContextConfig, BrowserContextWindowSize + +# Global variables for persistence +_global_browser = None +_global_browser_context = None +_global_playwright = None from src.utils.file_utils import get_latest_files from src.utils.stream_utils import capture_screenshot @@ -196,121 +206,113 @@ async def run_custom_agent( tool_call_in_content, browser_context=None, # receive context ): + global _global_browser, _global_browser_context, _global_playwright + controller = CustomController() playwright = None browser = None + persistence_config = BrowserPersistenceConfig.from_env() + try: - if use_own_browser: - playwright = await async_playwright().start() - chrome_exe = os.getenv("CHROME_PATH", "") - chrome_use_data = os.getenv("CHROME_USER_DATA", "") - - if chrome_exe == "": - chrome_exe = None - elif not 
os.path.exists(chrome_exe): - raise ValueError(f"Chrome executable not found at {chrome_exe}") - - if chrome_use_data == "": - chrome_use_data = None - - browser_context_ = await playwright.chromium.launch_persistent_context( - user_data_dir=chrome_use_data if chrome_use_data else "", - executable_path=chrome_exe, - no_viewport=False, - headless=headless, # 保持浏览器窗口可见 - user_agent=( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36" - ), - java_script_enabled=True, - bypass_csp=disable_security, - ignore_https_errors=disable_security, - record_video_dir=save_recording_path if save_recording_path else None, - record_video_size={"width": window_w, "height": window_h}, - ) - else: - browser_context_ = None - - if browser_context is not None: - # Reuse context - agent = CustomAgent( - task=task, - add_infos=add_infos, - use_vision=use_vision, - llm=llm, - browser_context=browser_context, - controller=controller, - system_prompt_class=CustomSystemPrompt - ) - history = await agent.run(max_steps=max_steps) - final_result = history.final_result() - errors = history.errors() - model_actions = history.model_actions() - model_thoughts = history.model_thoughts() - recorded_files = get_latest_files(save_recording_path) - trace_file = get_latest_files(save_recording_path + "/../traces") - return final_result, errors, model_actions, model_thoughts, recorded_files.get('.webm'), trace_file.get('.zip') - else: - browser = CustomBrowser( + # Initialize global browser if needed + if _global_browser is None: + _global_browser = CustomBrowser( config=BrowserConfig( headless=headless, disable_security=disable_security, extra_chromium_args=[f"--window-size={window_w},{window_h}"], ) ) - async with await browser.new_context( + + # Handle browser context based on configuration + if use_own_browser: + if _global_browser_context is None: + _global_playwright = await async_playwright().start() + chrome_exe = 
os.getenv("CHROME_PATH", "") + chrome_use_data = os.getenv("CHROME_USER_DATA", "") + + browser_context = await _global_playwright.chromium.launch_persistent_context( + user_data_dir=chrome_use_data, + executable_path=chrome_exe, + no_viewport=False, + headless=headless, + user_agent=( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36" + ), + java_script_enabled=True, + bypass_csp=disable_security, + ignore_https_errors=disable_security, + record_video_dir=save_recording_path if save_recording_path else None, + record_video_size={"width": window_w, "height": window_h}, + ) + _global_browser_context = await _global_browser.new_context( config=BrowserContextConfig( trace_path=save_trace_path if save_trace_path else None, - save_recording_path=save_recording_path - if save_recording_path - else None, + save_recording_path=save_recording_path if save_recording_path else None, no_viewport=False, browser_window_size=BrowserContextWindowSize( width=window_w, height=window_h ), ), - context=browser_context_, - ) as browser_context: - agent = CustomAgent( - task=task, - add_infos=add_infos, - use_vision=use_vision, - llm=llm, - browser_context=browser_context, - controller=controller, - system_prompt_class=CustomSystemPrompt, - max_actions_per_step=max_actions_per_step, - tool_call_in_content=tool_call_in_content + context=browser_context, ) - history = await agent.run(max_steps=max_steps) - final_result = history.final_result() - errors = history.errors() - model_actions = history.model_actions() - model_thoughts = history.model_thoughts() - recorded_files = get_latest_files(save_recording_path) - trace_file = get_latest_files(save_recording_path + "/../traces") - return final_result, errors, model_actions, model_thoughts, recorded_files.get('.webm'), trace_file.get('.zip') + else: + if _global_browser_context is None: + _global_browser_context = await _global_browser.new_context( + 
config=BrowserContextConfig( + trace_path=save_trace_path if save_trace_path else None, + save_recording_path=save_recording_path if save_recording_path else None, + no_viewport=False, + browser_window_size=BrowserContextWindowSize( + width=window_w, height=window_h + ), + ), + ) + + # Create and run agent + agent = CustomAgent( + task=task, + add_infos=add_infos, + use_vision=use_vision, + llm=llm, + browser_context=_global_browser_context, + controller=controller, + system_prompt_class=CustomSystemPrompt, + max_actions_per_step=max_actions_per_step, + tool_call_in_content=tool_call_in_content + ) + history = await agent.run(max_steps=max_steps) + + final_result = history.final_result() + errors = history.errors() + model_actions = history.model_actions() + model_thoughts = history.model_thoughts() + recorded_files = get_latest_files(save_recording_path) + trace_file = get_latest_files(save_trace_path) except Exception as e: import traceback - traceback.print_exc() - final_result = "" errors = str(e) + "\n" + traceback.format_exc() model_actions = "" model_thoughts = "" recorded_files = {} trace_file = {} finally: - # 显式关闭持久化上下文 - if browser_context_: - await browser_context_.close() + # Handle cleanup based on persistence configuration + if not persistence_config.persistent_session: + if _global_browser_context: + await _global_browser_context.close() + _global_browser_context = None - # 关闭 Playwright 对象 - if playwright: - await playwright.stop() - if browser: - await browser.close() + if _global_playwright: + await _global_playwright.stop() + _global_playwright = None + + if _global_browser: + await _global_browser.close() + _global_browser = None return final_result, errors, model_actions, model_thoughts, trace_file.get('.webm'), recorded_files.get('.zip') async def run_with_stream( @@ -336,95 +338,71 @@ async def run_with_stream( tool_call_in_content, ): """Wrapper to run the agent and handle streaming.""" - browser = None + global _global_browser, 
_global_browser_context + try: - # Initialize the browser - browser = CustomBrowser( - config=BrowserConfig( - headless=False, - disable_security=disable_security, - extra_chromium_args=[f"--window-size={window_w},{window_h}"], - ) - ) - - # Create a new browser context - async with await browser.new_context( - config=BrowserContextConfig( - trace_path=save_trace_path if save_trace_path else None, - save_recording_path=save_recording_path if save_recording_path else None, - no_viewport=False, - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), - ) - ) as browser_context: - # Run the browser agent in the background - agent_task = asyncio.create_task( - run_browser_agent( - agent_type=agent_type, - llm_provider=llm_provider, - llm_model_name=llm_model_name, - llm_temperature=llm_temperature, - llm_base_url=llm_base_url, - llm_api_key=llm_api_key, - use_own_browser=use_own_browser, - headless=headless, + # Initialize the global browser if it doesn't exist + if _global_browser is None: + _global_browser = CustomBrowser( + config=BrowserConfig( + headless=False, disable_security=disable_security, - window_w=window_w, - window_h=window_h, - save_recording_path=save_recording_path, - save_trace_path=save_trace_path, - enable_recording=enable_recording, - task=task, - add_infos=add_infos, - max_steps=max_steps, - use_vision=use_vision, - max_actions_per_step=max_actions_per_step, - tool_call_in_content=tool_call_in_content, - browser_context=browser_context # Explicit keyword argument + extra_chromium_args=[f"--window-size={window_w},{window_h}"], ) ) - # Initialize values for streaming - html_content = "
Starting browser...
" - final_result = errors = model_actions = model_thoughts = "" - recording = trace = None + # Create or reuse browser context + if _global_browser_context is None: + _global_browser_context = await _global_browser.new_context( + config=BrowserContextConfig( + trace_path=save_trace_path if save_trace_path else None, + save_recording_path=save_recording_path if save_recording_path else None, + no_viewport=False, + browser_window_size=BrowserContextWindowSize( + width=window_w, height=window_h + ), + ) + ) - # Periodically update the stream while the agent task is running - while not agent_task.done(): - try: - html_content = await capture_screenshot(browser_context) - except Exception as e: - html_content = f"
Screenshot error: {str(e)}
" - - yield [ - html_content, - final_result, - errors, - model_actions, - model_thoughts, - recording, - trace, - ] - await asyncio.sleep(0.01) + # Run the browser agent in the background + agent_task = asyncio.create_task( + run_browser_agent( + agent_type=agent_type, + llm_provider=llm_provider, + llm_model_name=llm_model_name, + llm_temperature=llm_temperature, + llm_base_url=llm_base_url, + llm_api_key=llm_api_key, + use_own_browser=use_own_browser, + headless=headless, + disable_security=disable_security, + window_w=window_w, + window_h=window_h, + save_recording_path=save_recording_path, + save_trace_path=save_trace_path, + enable_recording=enable_recording, + task=task, + add_infos=add_infos, + max_steps=max_steps, + use_vision=use_vision, + max_actions_per_step=max_actions_per_step, + tool_call_in_content=tool_call_in_content, + browser_context=_global_browser_context + ) + ) - # Once the agent task completes, get the results + # Initialize values for streaming + html_content = "
Using browser...
" + final_result = errors = model_actions = model_thoughts = "" + recording = trace = None + + # Periodically update the stream while the agent task is running + while not agent_task.done(): try: - result = await agent_task - if isinstance(result, tuple) and len(result) == 6: - ( - final_result, - errors, - model_actions, - model_thoughts, - recording, - trace, - ) = result - else: - errors = "Unexpected result format from agent" + html_content = await capture_screenshot(_global_browser_context) except Exception as e: - errors = f"Agent error: {str(e)}" - + html_content = f"
Screenshot error: {str(e)}
" + yield [ html_content, final_result, @@ -434,10 +412,30 @@ async def run_with_stream( recording, trace, ] + await asyncio.sleep(0.01) + + # Once the agent task completes, get the results + try: + result = await agent_task + if isinstance(result, tuple) and len(result) == 6: + final_result, errors, model_actions, model_thoughts, recording, trace = result + else: + errors = "Unexpected result format from agent" + except Exception as e: + errors = f"Agent error: {str(e)}" + + yield [ + html_content, + final_result, + errors, + model_actions, + model_thoughts, + recording, + trace, + ] except Exception as e: import traceback - yield [ f"
Browser error: {str(e)}
", "", @@ -447,9 +445,30 @@ async def run_with_stream( None, None, ] + +# Update the main function to handle cleanup +def main(): + async def cleanup(): + global _global_browser, _global_browser_context + if _global_browser_context: + await _global_browser_context.close() + if _global_browser: + await _global_browser.close() + _global_browser = None + _global_browser_context = None + + parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent") + parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to") + parser.add_argument("--port", type=int, default=7788, help="Port to listen on") + parser.add_argument("--theme", type=str, default="Ocean", choices=theme_map.keys(), help="Theme to use for the UI") + parser.add_argument("--dark-mode", action="store_true", help="Enable dark mode") + args = parser.parse_args() + + try: + demo = create_ui(theme_name=args.theme) + demo.launch(server_name=args.ip, server_port=args.port) finally: - if browser: - await browser.close() + asyncio.get_event_loop().run_until_complete(cleanup()) from gradio.themes import Citrus, Default, Glass, Monochrome, Ocean, Origin, Soft, Base @@ -733,16 +752,5 @@ def create_ui(theme_name="Ocean"): return demo -def main(): - parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent") - parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to") - parser.add_argument("--port", type=int, default=7788, help="Port to listen on") - parser.add_argument("--theme", type=str, default="Ocean", choices=theme_map.keys(), help="Theme to use for the UI") - parser.add_argument("--dark-mode", action="store_true", help="Enable dark mode") - args = parser.parse_args() - - demo = create_ui(theme_name=args.theme) - demo.launch(server_name=args.ip, server_port=args.port) - if __name__ == '__main__': main() \ No newline at end of file