Merge pull request #583 from vvincent1234/fix/docker

Fix/docker
This commit is contained in:
warmshao
2025-05-10 20:50:45 +08:00
committed by GitHub
19 changed files with 831 additions and 931 deletions

View File

@@ -1,2 +1,5 @@
data
tmp
tmp
results
.env

View File

@@ -40,14 +40,14 @@ ANONYMIZED_TELEMETRY=false
# LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info
BROWSER_USE_LOGGING_LEVEL=info
# Chrome settings
CHROME_PATH=
CHROME_USER_DATA=
CHROME_DEBUGGING_PORT=9222
CHROME_DEBUGGING_HOST=localhost
# Browser settings
BROWSER_PATH=
BROWSER_USER_DATA=
BROWSER_DEBUGGING_PORT=9222
BROWSER_DEBUGGING_HOST=localhost
# Set to true to keep browser open between AI tasks
CHROME_PERSISTENT_SESSION=false
CHROME_CDP=
KEEP_BROWSER_OPEN=true
BROWSER_CDP=
# Display settings
# Format: WIDTHxHEIGHTxDEPTH
RESOLUTION=1920x1080x24

View File

@@ -1,5 +1,9 @@
FROM python:3.11-slim
# Set platform for multi-arch builds (Docker Buildx will set this)
ARG TARGETPLATFORM
ARG NODE_MAJOR=20
# Install system dependencies
RUN apt-get update && apt-get install -y \
wget \
@@ -28,7 +32,6 @@ RUN apt-get update && apt-get install -y \
fonts-liberation \
dbus \
xauth \
xvfb \
x11vnc \
tigervnc-tools \
supervisor \
@@ -40,6 +43,7 @@ RUN apt-get update && apt-get install -y \
fonts-dejavu \
fonts-dejavu-core \
fonts-dejavu-extra \
vim \
&& rm -rf /var/lib/apt/lists/*
# Install noVNC
@@ -47,40 +51,50 @@ RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \
&& git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \
&& ln -s /opt/novnc/vnc.html /opt/novnc/index.html
# Set platform for ARM64 compatibility
ARG TARGETPLATFORM=linux/amd64
# Install Node.js using NodeSource PPA
RUN mkdir -p /etc/apt/keyrings \
&& curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
&& echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list \
&& apt-get update \
&& apt-get install nodejs -y \
&& rm -rf /var/lib/apt/lists/*
# Verify Node.js and npm installation (optional, but good for debugging)
RUN node -v && npm -v && npx -v
# Set up working directory
WORKDIR /app
# Copy requirements and install Python dependencies
COPY requirements.txt .
# Ensure 'patchright' is in your requirements.txt or install it directly
# RUN pip install --no-cache-dir -r requirements.txt patchright # If not in requirements
RUN pip install --no-cache-dir -r requirements.txt
# Install patchright and browsers with system dependencies
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-patchright
RUN patchright install --with-deps chromium
RUN patchright install-deps
# Install Patchright browsers and dependencies
# Patchright documentation suggests PLAYWRIGHT_BROWSERS_PATH is still relevant
# or that Patchright installs to a similar default location that Playwright would.
# Let's assume Patchright respects PLAYWRIGHT_BROWSERS_PATH or its default install location is findable.
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-browsers
RUN mkdir -p $PLAYWRIGHT_BROWSERS_PATH
# Install recommended: Google Chrome (instead of just Chromium for better undetectability)
# The 'patchright install chrome' command might download and place it.
# The '--with-deps' equivalent for patchright install is to run 'patchright install-deps chrome' after.
# RUN patchright install chrome --with-deps
# Alternative: Install Chromium if Google Chrome is problematic in certain environments
RUN patchright install chromium --with-deps
# Copy the application code
COPY . .
# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV BROWSER_USE_LOGGING_LEVEL=info
ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome
ENV ANONYMIZED_TELEMETRY=false
ENV DISPLAY=:99
ENV RESOLUTION=1920x1080x24
ENV VNC_PASSWORD=vncpassword
ENV CHROME_PERSISTENT_SESSION=true
ENV RESOLUTION_WIDTH=1920
ENV RESOLUTION_HEIGHT=1080
# Set up supervisor configuration
RUN mkdir -p /var/log/supervisor
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
EXPOSE 7788 6080 5901
EXPOSE 7788 6080 5901 9222
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
#CMD ["/bin/bash"]

View File

@@ -1,85 +0,0 @@
FROM python:3.11-slim
# Install system dependencies
RUN apt-get update && apt-get install -y \
wget \
gnupg \
curl \
unzip \
xvfb \
libgconf-2-4 \
libxss1 \
libnss3 \
libnspr4 \
libasound2 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libcups2 \
libdbus-1-3 \
libdrm2 \
libgbm1 \
libgtk-3-0 \
libxcomposite1 \
libxdamage1 \
libxfixes3 \
libxrandr2 \
xdg-utils \
fonts-liberation \
dbus \
xauth \
xvfb \
x11vnc \
tigervnc-tools \
supervisor \
net-tools \
procps \
git \
python3-numpy \
fontconfig \
fonts-dejavu \
fonts-dejavu-core \
fonts-dejavu-extra \
&& rm -rf /var/lib/apt/lists/*
# Install noVNC
RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \
&& git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \
&& ln -s /opt/novnc/vnc.html /opt/novnc/index.html
# Set platform explicitly for ARM64
ARG TARGETPLATFORM=linux/arm64
# Set up working directory
WORKDIR /app
# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Install Playwright and browsers with system dependencies optimized for ARM64
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
RUN PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 pip install playwright && \
playwright install --with-deps chromium
# Copy the application code
COPY . .
# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV BROWSER_USE_LOGGING_LEVEL=info
ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome
ENV ANONYMIZED_TELEMETRY=false
ENV DISPLAY=:99
ENV RESOLUTION=1920x1080x24
ENV VNC_PASSWORD=vncpassword
ENV CHROME_PERSISTENT_SESSION=true
ENV RESOLUTION_WIDTH=1920
ENV RESOLUTION_HEIGHT=1080
# Set up supervisor configuration
RUN mkdir -p /var/log/supervisor
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
EXPOSE 7788 6080 5901
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]

153
README.md
View File

@@ -23,10 +23,6 @@ We would like to officially thank [WarmShao](https://github.com/warmshao) for hi
## Installation Guide
### Prerequisites
- Python 3.11 or higher
- Git (for cloning the repository)
### Option 1: Local Installation
Read the [quickstart guide](https://docs.browser-use.com/quickstart#prepare-the-environment) or follow the steps below to get started.
@@ -65,10 +61,13 @@ Install Python packages:
uv pip install -r requirements.txt
```
Install Browsers in Playwright:
You can install specific browsers by running:
Install Browsers in Patchright.
```bash
patchright install chromium
patchright install --with-deps
```
Or you can install specific browsers by running:
```bash
patchright install chromium --with-deps
```
#### Step 4: Configure Environment
@@ -83,6 +82,29 @@ cp .env.example .env
```
2. Open `.env` in your preferred text editor and add your API keys and other settings
#### Step 5: Enjoy the web-ui
1. **Run the WebUI:**
```bash
python webui.py --ip 127.0.0.1 --port 7788
```
2. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
3. **Using Your Own Browser (Optional):**
- Set `BROWSER_PATH` to the executable path of your browser and `BROWSER_USER_DATA` to the user data directory of your browser. Leave `BROWSER_USER_DATA` empty if you want to use local user data.
- Windows
```env
BROWSER_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe"
BROWSER_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data"
```
> Note: Replace `YourUsername` with your actual Windows username for Windows systems.
- Mac
```env
BROWSER_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
BROWSER_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome"
```
- Close all Chrome windows
- Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
- Check the "Use Own Browser" option within the Browser Settings.
### Option 2: Docker Installation
#### Prerequisites
@@ -90,14 +112,14 @@ cp .env.example .env
- [Docker Desktop](https://www.docker.com/products/docker-desktop/) (For Windows/macOS)
- [Docker Engine](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/) (For Linux)
#### Installation Steps
1. Clone the repository:
#### Step 1: Clone the Repository
```bash
git clone https://github.com/browser-use/web-ui.git
cd web-ui
```
2. Create and configure environment file:
#### Step 2: Configure Environment
1. Create a copy of the example environment file:
- Windows (Command Prompt):
```bash
copy .env.example .env
@@ -106,122 +128,23 @@ copy .env.example .env
```bash
cp .env.example .env
```
Edit `.env` with your preferred text editor and add your API keys
2. Open `.env` in your preferred text editor and add your API keys and other settings
3. Run with Docker:
#### Step 3: Docker Build and Run
```bash
# Build and start the container with default settings (browser closes after AI tasks)
docker compose up --build
```
For ARM64 systems (e.g., Apple Silicon Macs), please run the following command:
```bash
# Or run with persistent browser (browser stays open between AI tasks)
CHROME_PERSISTENT_SESSION=true docker compose up --build
TARGETPLATFORM=linux/arm64 docker compose up --build
```
4. Access the Application:
- Web Interface: Open `http://localhost:7788` in your browser
#### Step 4: Enjoy the Web-UI and VNC
- Web-UI: Open `http://localhost:7788` in your browser
- VNC Viewer (for watching browser interactions): Open `http://localhost:6080/vnc.html`
- Default VNC password: "youvncpassword"
- Can be changed by setting `VNC_PASSWORD` in your `.env` file
## Usage
### Local Setup
1. **Run the WebUI:**
After completing the installation steps above, start the application:
```bash
python webui.py --ip 127.0.0.1 --port 7788
```
2. WebUI options:
- `--ip`: The IP address to bind the WebUI to. Default is `127.0.0.1`.
- `--port`: The port to bind the WebUI to. Default is `7788`.
- `--theme`: The theme for the user interface. Default is `Ocean`.
- **Default**: The standard theme with a balanced design.
- **Soft**: A gentle, muted color scheme for a relaxed viewing experience.
- **Monochrome**: A grayscale theme with minimal color for simplicity and focus.
- **Glass**: A sleek, semi-transparent design for a modern appearance.
- **Origin**: A classic, retro-inspired theme for a nostalgic feel.
- **Citrus**: A vibrant, citrus-inspired palette with bright and fresh colors.
- **Ocean** (default): A blue, ocean-inspired theme providing a calming effect.
- `--dark-mode`: Enables dark mode for the user interface.
3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
4. **Using Your Own Browser (Optional):**
- Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. Leave `CHROME_USER_DATA` empty if you want to use local user data.
- Windows
```env
CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe"
CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data"
```
> Note: Replace `YourUsername` with your actual Windows username for Windows systems.
- Mac
```env
CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
CHROME_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome"
```
- Close all Chrome windows
- Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
- Check the "Use Own Browser" option within the Browser Settings.
5. **Keep Browser Open(Optional):**
- Set `CHROME_PERSISTENT_SESSION=true` in the `.env` file.
### Docker Setup
1. **Environment Variables:**
- All configuration is done through the `.env` file
- Available environment variables:
```
# LLM API Keys
OPENAI_API_KEY=your_key_here
ANTHROPIC_API_KEY=your_key_here
GOOGLE_API_KEY=your_key_here
# Browser Settings
CHROME_PERSISTENT_SESSION=true # Set to true to keep browser open between AI tasks
RESOLUTION=1920x1080x24 # Custom resolution format: WIDTHxHEIGHTxDEPTH
RESOLUTION_WIDTH=1920 # Custom width in pixels
RESOLUTION_HEIGHT=1080 # Custom height in pixels
# VNC Settings
VNC_PASSWORD=your_vnc_password # Optional, defaults to "vncpassword"
```
2. **Platform Support:**
- Supports both AMD64 and ARM64 architectures
- For ARM64 systems (e.g., Apple Silicon Macs), the container will automatically use the appropriate image
3. **Browser Persistence Modes:**
- **Default Mode (CHROME_PERSISTENT_SESSION=false):**
- Browser opens and closes with each AI task
- Clean state for each interaction
- Lower resource usage
- **Persistent Mode (CHROME_PERSISTENT_SESSION=true):**
- Browser stays open between AI tasks
- Maintains history and state
- Allows viewing previous AI interactions
- Set in `.env` file or via environment variable when starting container
4. **Viewing Browser Interactions:**
- Access the noVNC viewer at `http://localhost:6080/vnc.html`
- Enter the VNC password (default: "vncpassword" or what you set in VNC_PASSWORD)
- Direct VNC access available on port 5900 (mapped to container port 5901)
- You can now see all browser interactions in real-time
5. **Container Management:**
```bash
# Start with persistent browser
CHROME_PERSISTENT_SESSION=true docker compose up -d
# Start with default mode (browser closes after tasks)
docker compose up -d
# View logs
docker compose logs -f
# Stop the container
docker compose down
```
## Changelog
- [x] **2025/01/26:** Thanks to @vvincent1234. Now browser-use-webui can combine with DeepSeek-r1 to engage in deep thinking!
- [x] **2025/01/10:** Thanks to @casistack. Now we have Docker Setup option and also Support keep browser open between tasks.[Video tutorial demo](https://github.com/browser-use/web-ui/issues/1#issuecomment-2582511750).

View File

@@ -1,62 +1,79 @@
services:
# debug: docker compose run --rm -it browser-use-webui bash
browser-use-webui:
platform: linux/amd64
build:
context: .
dockerfile: ${DOCKERFILE:-Dockerfile}
dockerfile: Dockerfile
args:
TARGETPLATFORM: ${TARGETPLATFORM:-linux/amd64}
ports:
- "7788:7788" # Gradio default port
- "6080:6080" # noVNC web interface
- "5901:5901" # VNC port
- "9222:9222" # Chrome remote debugging port
- "7788:7788"
- "6080:6080"
- "5901:5901"
- "9222:9222"
environment:
# LLM API Keys & Endpoints
- OPENAI_ENDPOINT=${OPENAI_ENDPOINT:-https://api.openai.com/v1}
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- ANTHROPIC_ENDPOINT=${ANTHROPIC_ENDPOINT:-https://api.anthropic.com}
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- GOOGLE_API_KEY=${GOOGLE_API_KEY:-}
- AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_ENDPOINT:-}
- AZURE_OPENAI_API_KEY=${AZURE_OPENAI_API_KEY:-}
- AZURE_OPENAI_API_VERSION=${AZURE_OPENAI_API_VERSION:-2025-01-01-preview}
- DEEPSEEK_ENDPOINT=${DEEPSEEK_ENDPOINT:-https://api.deepseek.com}
- DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
- OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-http://localhost:11434}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
- MISTRAL_ENDPOINT=${MISTRAL_ENDPOINT:-https://api.mistral.ai/v1}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
- ALIBABA_ENDPOINT=${ALIBABA_ENDPOINT:-https://dashscope.aliyuncs.com/compatible-mode/v1}
- ALIBABA_API_KEY=${ALIBABA_API_KEY:-}
- MOONSHOT_ENDPOINT=${MOONSHOT_ENDPOINT:-https://api.moonshot.cn/v1}
- MOONSHOT_API_KEY=${MOONSHOT_API_KEY:-}
- IBM_API_KEY=${IBM_API_KEY:-}
- UNBOUND_ENDPOINT=${UNBOUND_ENDPOINT:-https://api.getunbound.ai}
- UNBOUND_API_KEY=${UNBOUND_API_KEY:-}
- SiliconFLOW_ENDPOINT=${SiliconFLOW_ENDPOINT:-https://api.siliconflow.cn/v1/}
- SiliconFLOW_API_KEY=${SiliconFLOW_API_KEY:-}
- IBM_ENDPOINT=${IBM_ENDPOINT:-https://us-south.ml.cloud.ibm.com}
- IBM_API_KEY=${IBM_API_KEY:-}
- IBM_PROJECT_ID=${IBM_PROJECT_ID:-}
- BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info}
# Application Settings
- ANONYMIZED_TELEMETRY=${ANONYMIZED_TELEMETRY:-false}
- CHROME_PATH=/usr/bin/google-chrome
- CHROME_USER_DATA=/app/data/chrome_data
- CHROME_PERSISTENT_SESSION=${CHROME_PERSISTENT_SESSION:-false}
- CHROME_CDP=${CHROME_CDP:-http://localhost:9222}
- BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info}
# Browser Settings
- BROWSER_PATH=
- BROWSER_USER_DATA=
- BROWSER_DEBUGGING_PORT=${BROWSER_DEBUGGING_PORT:-9222}
- BROWSER_DEBUGGING_HOST=localhost
- USE_OWN_BROWSER=false
- KEEP_BROWSER_OPEN=true
- BROWSER_CDP=${BROWSER_CDP:-} # e.g., http://localhost:9222
# Display Settings
- DISPLAY=:99
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
# This ENV is used by the Dockerfile during build time if Patchright respects it.
# It's not strictly needed at runtime by docker-compose unless your app or scripts also read it.
- PLAYWRIGHT_BROWSERS_PATH=/ms-browsers # Matches Dockerfile ENV
- RESOLUTION=${RESOLUTION:-1920x1080x24}
- RESOLUTION_WIDTH=${RESOLUTION_WIDTH:-1920}
- RESOLUTION_HEIGHT=${RESOLUTION_HEIGHT:-1080}
- VNC_PASSWORD=${VNC_PASSWORD:-vncpassword}
- CHROME_DEBUGGING_PORT=9222
- CHROME_DEBUGGING_HOST=localhost
# VNC Settings
- VNC_PASSWORD=${VNC_PASSWORD:-youvncpassword}
volumes:
- /tmp/.X11-unix:/tmp/.X11-unix
# - ./my_chrome_data:/app/data/chrome_data # Optional: persist browser data
restart: unless-stopped
shm_size: '2gb'
cap_add:
- SYS_ADMIN
security_opt:
- seccomp=unconfined
tmpfs:
- /tmp
healthcheck:
test: ["CMD", "nc", "-z", "localhost", "5901"]
test: ["CMD", "nc", "-z", "localhost", "5901"] # VNC port
interval: 10s
timeout: 5s
retries: 3
retries: 3

View File

@@ -1,4 +0,0 @@
#!/bin/bash
# Start supervisord in the foreground to properly manage child processes
exec /usr/bin/supervisord -n -c /etc/supervisor/conf.d/supervisord.conf

View File

@@ -8,30 +8,49 @@ import os
from browser_use.agent.gif import create_history_gif
from browser_use.agent.service import Agent, AgentHookFunc
from browser_use.agent.views import (
ActionResult,
AgentHistory,
AgentHistoryList,
AgentStepInfo,
ToolCallingMethod,
)
from browser_use.browser.views import BrowserStateHistory
from browser_use.telemetry.views import (
AgentEndTelemetryEvent,
)
from browser_use.utils import time_execution_async
from dotenv import load_dotenv
from browser_use.agent.message_manager.utils import is_model_without_tool_support
load_dotenv()
logger = logging.getLogger(__name__)
SKIP_LLM_API_KEY_VERIFICATION = (
os.environ.get("SKIP_LLM_API_KEY_VERIFICATION", "false").lower()[0] in "ty1"
os.environ.get("SKIP_LLM_API_KEY_VERIFICATION", "false").lower()[0] in "ty1"
)
class BrowserUseAgent(Agent):
def _set_tool_calling_method(self) -> ToolCallingMethod | None:
tool_calling_method = self.settings.tool_calling_method
if tool_calling_method == 'auto':
if is_model_without_tool_support(self.model_name):
return 'raw'
elif self.chat_model_library == 'ChatGoogleGenerativeAI':
return None
elif self.chat_model_library == 'ChatOpenAI':
return 'function_calling'
elif self.chat_model_library == 'AzureChatOpenAI':
return 'function_calling'
else:
return None
else:
return tool_calling_method
@time_execution_async("--run (agent)")
async def run(
self,
max_steps: int = 100,
on_step_start: AgentHookFunc | None = None,
on_step_end: AgentHookFunc | None = None,
self, max_steps: int = 100, on_step_start: AgentHookFunc | None = None,
on_step_end: AgentHookFunc | None = None
) -> AgentHistoryList:
"""Execute the task with maximum number of steps"""
@@ -49,41 +68,28 @@ class BrowserUseAgent(Agent):
)
signal_handler.register()
# Wait for verification task to complete if it exists
if hasattr(self, "_verification_task") and not self._verification_task.done():
try:
await self._verification_task
except Exception:
# Error already logged in the task
pass
try:
self._log_agent_run()
# Execute initial actions if provided
if self.initial_actions:
result = await self.multi_act(
self.initial_actions, check_for_new_elements=False
)
result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
self.state.last_result = result
for step in range(max_steps):
# Check if waiting for user input after Ctrl+C
while self.state.paused:
await asyncio.sleep(0.5)
if self.state.stopped:
break
if self.state.paused:
signal_handler.wait_for_resume()
signal_handler.reset()
# Check if we should stop due to too many failures
if self.state.consecutive_failures >= self.settings.max_failures:
logger.error(
f"❌ Stopping due to {self.settings.max_failures} consecutive failures"
)
logger.error(f'❌ Stopping due to {self.settings.max_failures} consecutive failures')
break
# Check control flags before each step
if self.state.stopped:
logger.info("Agent stopped")
logger.info('Agent stopped')
break
while self.state.paused:
@@ -108,15 +114,30 @@ class BrowserUseAgent(Agent):
await self.log_completion()
break
else:
logger.info("Failed to complete task in maximum steps")
error_message = 'Failed to complete task in maximum steps'
self.state.history.history.append(
AgentHistory(
model_output=None,
result=[ActionResult(error=error_message, include_in_memory=True)],
state=BrowserStateHistory(
url='',
title='',
tabs=[],
interacted_element=[],
screenshot=None,
),
metadata=None,
)
)
logger.info(f'{error_message}')
return self.state.history
except KeyboardInterrupt:
# Already handled by our signal handler, but catch any direct KeyboardInterrupt as well
logger.info(
"Got KeyboardInterrupt during execution, returning current history"
)
logger.info('Got KeyboardInterrupt during execution, returning current history')
return self.state.history
finally:
@@ -136,13 +157,29 @@ class BrowserUseAgent(Agent):
)
)
if self.settings.save_playwright_script_path:
logger.info(
f'Agent run finished. Attempting to save Playwright script to: {self.settings.save_playwright_script_path}'
)
try:
# Extract sensitive data keys if sensitive_data is provided
keys = list(self.sensitive_data.keys()) if self.sensitive_data else None
# Pass browser and context config to the saving method
self.state.history.save_as_playwright_script(
self.settings.save_playwright_script_path,
sensitive_data_keys=keys,
browser_config=self.browser.config,
context_config=self.browser_context.config,
)
except Exception as script_gen_err:
# Log any error during script generation/saving
logger.error(f'Failed to save Playwright script: {script_gen_err}', exc_info=True)
await self.close()
if self.settings.generate_gif:
output_path: str = "agent_history.gif"
output_path: str = 'agent_history.gif'
if isinstance(self.settings.generate_gif, str):
output_path = self.settings.generate_gif
create_history_gif(
task=self.task, history=self.state.history, output_path=output_path
)
create_history_gif(task=self.task, history=self.state.history, output_path=output_path)

File diff suppressed because it is too large Load Diff

View File

@@ -26,25 +26,33 @@ from browser_use.browser.utils.screen_resolution import get_screen_resolution, g
from browser_use.utils import time_execution_async
import socket
from .custom_context import CustomBrowserContext, CustomBrowserContextConfig
from .custom_context import CustomBrowserContext
logger = logging.getLogger(__name__)
class CustomBrowser(Browser):
async def new_context(self, config: CustomBrowserContextConfig | None = None) -> CustomBrowserContext:
async def new_context(self, config: BrowserContextConfig | None = None) -> CustomBrowserContext:
"""Create a browser context"""
browser_config = self.config.model_dump() if self.config else {}
context_config = config.model_dump() if config else {}
merged_config = {**browser_config, **context_config}
return CustomBrowserContext(config=CustomBrowserContextConfig(**merged_config), browser=self)
return CustomBrowserContext(config=BrowserContextConfig(**merged_config), browser=self)
async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrowser:
"""Sets up and returns a Playwright Browser instance with anti-detection measures."""
assert self.config.browser_binary_path is None, 'browser_binary_path should be None if trying to use the builtin browsers'
if self.config.headless:
# Use the configured window size from new_context_config if available
if (
not self.config.headless
and hasattr(self.config, 'new_context_config')
and hasattr(self.config.new_context_config, 'browser_window_size')
):
screen_size = self.config.new_context_config.browser_window_size.model_dump()
offset_x, offset_y = get_window_adjustments()
elif self.config.headless:
screen_size = {'width': 1920, 'height': 1080}
offset_x, offset_y = 0, 0
else:
@@ -52,6 +60,7 @@ class CustomBrowser(Browser):
offset_x, offset_y = get_window_adjustments()
chrome_args = {
f'--remote-debugging-port={self.config.chrome_remote_debugging_port}',
*CHROME_ARGS,
*(CHROME_DOCKER_ARGS if IN_DOCKER else []),
*(CHROME_HEADLESS_ARGS if self.config.headless else []),
@@ -70,8 +79,8 @@ class CustomBrowser(Browser):
# check if port 9222 is already taken, if so remove the remote-debugging-port arg to prevent conflicts
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
if s.connect_ex(('localhost', 9222)) == 0:
chrome_args.remove('--remote-debugging-port=9222')
if s.connect_ex(('localhost', self.config.chrome_remote_debugging_port)) == 0:
chrome_args.remove(f'--remote-debugging-port={self.config.chrome_remote_debugging_port}')
browser_class = getattr(playwright, self.config.browser_class)
args = {

View File

@@ -12,10 +12,6 @@ from browser_use.browser.context import BrowserContextState
logger = logging.getLogger(__name__)
class CustomBrowserContextConfig(BrowserContextConfig):
force_new_context: bool = False # force to create new context
class CustomBrowserContext(BrowserContext):
def __init__(
self,
@@ -24,96 +20,3 @@ class CustomBrowserContext(BrowserContext):
state: Optional[BrowserContextState] = None,
):
super(CustomBrowserContext, self).__init__(browser=browser, config=config, state=state)
async def _create_context(self, browser: PlaywrightBrowser):
"""Creates a new browser context with anti-detection measures and loads cookies if available."""
if not self.config.force_new_context and self.browser.config.cdp_url and len(browser.contexts) > 0:
context = browser.contexts[0]
elif not self.config.force_new_context and self.browser.config.browser_binary_path and len(
browser.contexts) > 0:
# Connect to existing Chrome instance instead of creating new one
context = browser.contexts[0]
else:
# Original code for creating new context
context = await browser.new_context(
no_viewport=True,
user_agent=self.config.user_agent,
java_script_enabled=True,
bypass_csp=self.config.disable_security,
ignore_https_errors=self.config.disable_security,
record_video_dir=self.config.save_recording_path,
record_video_size={
"width": self.config.window_width,
"height": self.config.window_height
},
record_har_path=self.config.save_har_path,
locale=self.config.locale,
http_credentials=self.config.http_credentials,
is_mobile=self.config.is_mobile,
has_touch=self.config.has_touch,
geolocation=self.config.geolocation,
permissions=self.config.permissions,
timezone_id=self.config.timezone_id,
)
if self.config.trace_path:
await context.tracing.start(screenshots=True, snapshots=True, sources=True)
# Load cookies if they exist
if self.config.cookies_file and os.path.exists(self.config.cookies_file):
with open(self.config.cookies_file, 'r') as f:
try:
cookies = json.load(f)
valid_same_site_values = ['Strict', 'Lax', 'None']
for cookie in cookies:
if 'sameSite' in cookie:
if cookie['sameSite'] not in valid_same_site_values:
logger.warning(
f"Fixed invalid sameSite value '{cookie['sameSite']}' to 'None' for cookie {cookie.get('name')}"
)
cookie['sameSite'] = 'None'
logger.info(f'🍪 Loaded {len(cookies)} cookies from {self.config.cookies_file}')
await context.add_cookies(cookies)
except json.JSONDecodeError as e:
logger.error(f'Failed to parse cookies file: {str(e)}')
# Expose anti-detection scripts
await context.add_init_script(
"""
// Webdriver property
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
});
// Languages
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US']
});
// Plugins
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5]
});
// Chrome runtime
window.chrome = { runtime: {} };
// Permissions
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.query = (parameters) => (
parameters.name === 'notifications' ?
Promise.resolve({ state: Notification.permission }) :
originalQuery(parameters)
);
(function () {
const originalAttachShadow = Element.prototype.attachShadow;
Element.prototype.attachShadow = function attachShadow(options) {
return originalAttachShadow.call(this, { ...options, mode: "open" });
};
})();
"""
)
return context

View File

@@ -172,6 +172,10 @@ class CustomController(Controller):
param_model=create_tool_param_model(tool),
)
logger.info(f"Add mcp tool: {tool_name}")
logger.debug(
f"Registered {len(self.mcp_client.server_name_to_tools[server_name])} mcp tools for {server_name}")
else:
logger.warning(f"MCP client not started.")
async def close_mcp_client(self):
if self.mcp_client:

View File

@@ -1,3 +1,5 @@
import os
import gradio as gr
import logging
from gradio.components import Component
@@ -56,7 +58,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager):
)
keep_browser_open = gr.Checkbox(
label="Keep Browser Open",
value=True,
value=os.getenv("KEEP_BROWSER_OPEN", True),
info="Keep Browser Open between Tasks",
interactive=True
)
@@ -91,6 +93,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager):
with gr.Row():
cdp_url = gr.Textbox(
label="CDP URL",
value=os.getenv("BROWSER_CDP", None),
info="CDP URL for browser remote debugging",
interactive=True,
)

View File

@@ -13,14 +13,13 @@ from browser_use.agent.views import (
AgentOutput,
)
from browser_use.browser.browser import BrowserConfig
from browser_use.browser.context import BrowserContext
from browser_use.browser.context import BrowserContext, BrowserContextConfig
from browser_use.browser.views import BrowserState
from gradio.components import Component
from langchain_core.language_models.chat_models import BaseChatModel
from src.agent.browser_use.browser_use_agent import BrowserUseAgent
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import CustomBrowserContextConfig
from src.controller.custom_controller import CustomController
from src.utils import llm_provider
from src.webui.webui_manager import WebuiManager
@@ -32,12 +31,12 @@ logger = logging.getLogger(__name__)
async def _initialize_llm(
provider: Optional[str],
model_name: Optional[str],
temperature: float,
base_url: Optional[str],
api_key: Optional[str],
num_ctx: Optional[int] = None,
provider: Optional[str],
model_name: Optional[str],
temperature: float,
base_url: Optional[str],
api_key: Optional[str],
num_ctx: Optional[int] = None,
) -> Optional[BaseChatModel]:
"""Initializes the LLM based on settings. Returns None if provider/model is missing."""
if not provider or not model_name:
@@ -68,10 +67,10 @@ async def _initialize_llm(
def _get_config_value(
webui_manager: WebuiManager,
comp_dict: Dict[gr.components.Component, Any],
comp_id_suffix: str,
default: Any = None,
webui_manager: WebuiManager,
comp_dict: Dict[gr.components.Component, Any],
comp_id_suffix: str,
default: Any = None,
) -> Any:
"""Safely get value from component dictionary using its ID suffix relative to the tab."""
# Assumes component ID format is "tab_name.comp_name"
@@ -133,7 +132,7 @@ def _format_agent_output(model_output: AgentOutput) -> str:
async def _handle_new_step(
webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int
webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int
):
"""Callback for each step taken by the agent, including screenshot display."""
@@ -157,12 +156,12 @@ async def _handle_new_step(
try:
# Basic validation: check if it looks like base64
if (
isinstance(screenshot_data, str) and len(screenshot_data) > 100
isinstance(screenshot_data, str) and len(screenshot_data) > 100
): # Arbitrary length check
# *** UPDATED STYLE: Removed centering, adjusted width ***
img_tag = f'<img src="data:image/jpeg;base64,{screenshot_data}" alt="Step {step_num} Screenshot" style="max-width: 800px; max-height: 600px; object-fit:contain;" />'
screenshot_html = (
img_tag + "<br/>"
img_tag + "<br/>"
) # Use <br/> for line break after inline-block image
else:
logger.warning(
@@ -223,7 +222,7 @@ def _handle_done(webui_manager: WebuiManager, history: AgentHistoryList):
async def _ask_assistant_callback(
webui_manager: WebuiManager, query: str, browser_context: BrowserContext
webui_manager: WebuiManager, query: str, browser_context: BrowserContext
) -> Dict[str, Any]:
"""Callback triggered by the agent's ask_for_assistant action."""
logger.info("Agent requires assistance. Waiting for user input.")
@@ -274,7 +273,7 @@ async def _ask_assistant_callback(
async def run_agent_task(
webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]
webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]
) -> AsyncGenerator[Dict[gr.components.Component, Any], None]:
"""Handles the entire lifecycle of initializing and running the agent."""
@@ -358,6 +357,7 @@ async def run_agent_task(
# Planner LLM Settings (Optional)
planner_llm_provider_name = get_setting("planner_llm_provider") or None
planner_llm = None
planner_use_vision = False
if planner_llm_provider_name:
planner_llm_model_name = get_setting("planner_llm_model_name")
planner_llm_temperature = get_setting("planner_llm_temperature", 0.6)
@@ -387,7 +387,7 @@ async def run_agent_task(
) # Logic handled by CDP/WSS presence
keep_browser_open = get_browser_setting("keep_browser_open", False)
headless = get_browser_setting("headless", False)
disable_security = get_browser_setting("disable_security", True)
disable_security = get_browser_setting("disable_security", False)
window_w = int(get_browser_setting("window_w", 1280))
window_h = int(get_browser_setting("window_h", 1100))
cdp_url = get_browser_setting("cdp_url") or None
@@ -422,7 +422,7 @@ async def run_agent_task(
# Pass the webui_manager instance to the callback when wrapping it
async def ask_callback_wrapper(
query: str, browser_context: BrowserContext
query: str, browser_context: BrowserContext
) -> Dict[str, Any]:
return await _ask_assistant_callback(webui_manager, query, browser_context)
@@ -451,20 +451,16 @@ async def run_agent_task(
if not webui_manager.bu_browser:
logger.info("Launching new browser instance.")
extra_args = [f"--window-size={window_w},{window_h}"]
if browser_user_data_dir:
extra_args.append(f"--user-data-dir={browser_user_data_dir}")
if use_own_browser:
browser_binary_path = (
os.getenv("CHROME_PATH", None) or browser_binary_path
)
browser_binary_path = os.getenv("BROWSER_PATH", None) or browser_binary_path
if browser_binary_path == "":
browser_binary_path = None
chrome_user_data = os.getenv("CHROME_USER_DATA", None)
if chrome_user_data:
extra_args += [f"--user-data-dir={chrome_user_data}"]
browser_user_data = browser_user_data_dir or os.getenv("BROWSER_USER_DATA", None)
if browser_user_data:
extra_args += [f"--user-data-dir={browser_user_data}"]
else:
browser_binary_path = None
webui_manager.bu_browser = CustomBrowser(
config=BrowserConfig(
headless=headless,
@@ -479,14 +475,14 @@ async def run_agent_task(
# Create Context if needed
if not webui_manager.bu_browser_context:
logger.info("Creating new browser context.")
context_config = CustomBrowserContextConfig(
context_config = BrowserContextConfig(
trace_path=save_trace_path if save_trace_path else None,
save_recording_path=save_recording_path
if save_recording_path
else None,
save_downloads_path=save_download_path if save_download_path else None,
window_width=window_w,
window_height=window_h,
window_height=window_h,
window_width=window_w,
)
if not webui_manager.bu_browser:
raise ValueError("Browser not initialized, cannot create context.")
@@ -513,7 +509,7 @@ async def run_agent_task(
# Pass the webui_manager to callbacks when wrapping them
async def step_callback_wrapper(
state: BrowserState, output: AgentOutput, step_num: int
state: BrowserState, output: AgentOutput, step_num: int
):
await _handle_new_step(webui_manager, state, output, step_num)
@@ -582,7 +578,7 @@ async def run_agent_task(
await asyncio.sleep(0.2)
if (
agent_task.done() or is_stopped
agent_task.done() or is_stopped
): # If stopped or task finished while paused
break
@@ -633,8 +629,8 @@ async def run_agent_task(
yield update_dict
# Wait until response is submitted or task finishes
while (
webui_manager.bu_response_event is not None
and not agent_task.done()
webui_manager.bu_response_event is not None
and not agent_task.done()
):
await asyncio.sleep(0.2)
# Restore UI after response submitted or if task ended unexpectedly
@@ -716,9 +712,9 @@ async def run_agent_task(
except asyncio.CancelledError:
logger.info("Agent task was cancelled.")
if not any(
"Cancelled" in msg.get("content", "")
for msg in webui_manager.bu_chat_history
if msg.get("role") == "assistant"
"Cancelled" in msg.get("content", "")
for msg in webui_manager.bu_chat_history
if msg.get("role") == "assistant"
):
webui_manager.bu_chat_history.append(
{"role": "assistant", "content": "**Task Cancelled**."}
@@ -730,9 +726,9 @@ async def run_agent_task(
f"**Agent Execution Error:**\n```\n{type(e).__name__}: {e}\n```"
)
if not any(
error_message in msg.get("content", "")
for msg in webui_manager.bu_chat_history
if msg.get("role") == "assistant"
error_message in msg.get("content", "")
for msg in webui_manager.bu_chat_history
if msg.get("role") == "assistant"
):
webui_manager.bu_chat_history.append(
{"role": "assistant", "content": error_message}
@@ -788,7 +784,7 @@ async def run_agent_task(
clear_button_comp: gr.update(interactive=True),
chatbot_comp: gr.update(
value=webui_manager.bu_chat_history
+ [{"role": "assistant", "content": f"**Setup Error:** {e}"}]
+ [{"role": "assistant", "content": f"**Setup Error:** {e}"}]
),
}
@@ -797,7 +793,7 @@ async def run_agent_task(
async def handle_submit(
webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]
webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]
):
"""Handles clicks on the main 'Submit' button."""
user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input")
@@ -1048,7 +1044,7 @@ def create_browser_use_agent_tab(webui_manager: WebuiManager):
run_tab_outputs = list(tab_components.values())
async def submit_wrapper(
components_dict: Dict[Component, Any],
components_dict: Dict[Component, Any],
) -> AsyncGenerator[Dict[Component, Any], None]:
"""Wrapper for handle_submit that yields its results."""
async for update in handle_submit(webui_manager, components_dict):

View File

@@ -116,7 +116,7 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon
# LLM Config (from agent_settings tab)
llm_provider_name = get_setting("agent_settings", "llm_provider")
llm_model_name = get_setting("agent_settings", "llm_model_name")
llm_temperature = get_setting("agent_settings", "llm_temperature", 0.5) # Default if not found
llm_temperature = max(get_setting("agent_settings", "llm_temperature", 0.5), 0.5)
llm_base_url = get_setting("agent_settings", "llm_base_url")
llm_api_key = get_setting("agent_settings", "llm_api_key")
ollama_num_ctx = get_setting("agent_settings", "ollama_num_ctx")
@@ -132,7 +132,7 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon
# Note: DeepResearchAgent constructor takes a dict, not full Browser/Context objects
browser_config_dict = {
"headless": get_setting("browser_settings", "headless", False),
"disable_security": get_setting("browser_settings", "disable_security", True),
"disable_security": get_setting("browser_settings", "disable_security", False),
"browser_binary_path": get_setting("browser_settings", "browser_binary_path"),
"user_data_dir": get_setting("browser_settings", "browser_user_data_dir"),
"window_width": int(get_setting("browser_settings", "window_w", 1280)),

View File

@@ -3,7 +3,7 @@ user=root
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
loglevel=debug
loglevel=error
[program:xvfb]
command=Xvfb :99 -screen 0 %(ENV_RESOLUTION)s -ac +extension GLX +render -noreset
@@ -65,21 +65,6 @@ startretries=5
startsecs=3
depends_on=x11vnc
[program:persistent_browser]
environment=START_URL="data:text/html,<html><body><h1>Browser Ready</h1></body></html>"
command=bash -c "mkdir -p /app/data/chrome_data && sleep 8 && $(find /ms-playwright/chromium-*/chrome-linux -name chrome) --user-data-dir=/app/data/chrome_data --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=9222 --remote-debugging-address=0.0.0.0 \"$START_URL\""
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
priority=350
startretries=5
startsecs=10
stopsignal=TERM
stopwaitsecs=15
depends_on=novnc
[program:webui]
command=python webui.py --ip 0.0.0.0 --port 7788
directory=/app
@@ -92,5 +77,4 @@ priority=400
startretries=3
startsecs=3
stopsignal=TERM
stopwaitsecs=10
depends_on=persistent_browser
stopwaitsecs=10

View File

@@ -20,15 +20,14 @@ from src.utils import utils
async def test_browser_use_agent():
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import (
BrowserContextConfig,
BrowserContextWindowSize,
BrowserContextConfig
)
from browser_use.agent.service import Agent
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import CustomBrowserContextConfig
from src.controller.custom_controller import CustomController
from src.utils import llm_provider
from src.agent.browser_use.browser_use_agent import BrowserUseAgent
# llm = utils.get_llm_model(
# provider="openai",
@@ -38,12 +37,12 @@ async def test_browser_use_agent():
# api_key=os.getenv("OPENAI_API_KEY", ""),
# )
# llm = utils.get_llm_model(
# provider="google",
# model_name="gemini-2.0-flash",
# temperature=0.6,
# api_key=os.getenv("GOOGLE_API_KEY", "")
# )
llm = llm_provider.get_llm_model(
provider="google",
model_name="gemini-2.0-flash",
temperature=0.6,
api_key=os.getenv("GOOGLE_API_KEY", "")
)
# llm = utils.get_llm_model(
# provider="deepseek",
@@ -67,25 +66,25 @@ async def test_browser_use_agent():
window_w, window_h = 1280, 1100
llm = llm_provider.get_llm_model(
provider="azure_openai",
model_name="gpt-4o",
temperature=0.5,
base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
)
# llm = llm_provider.get_llm_model(
# provider="azure_openai",
# model_name="gpt-4o",
# temperature=0.5,
# base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
# api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
# )
mcp_server_config = {
"mcpServers": {
"markitdown": {
"command": "docker",
"args": [
"run",
"--rm",
"-i",
"markitdown-mcp:latest"
]
},
# "markitdown": {
# "command": "docker",
# "args": [
# "run",
# "--rm",
# "-i",
# "markitdown-mcp:latest"
# ]
# },
"desktop-commander": {
"command": "npx",
"args": [
@@ -97,8 +96,7 @@ async def test_browser_use_agent():
}
controller = CustomController()
await controller.setup_mcp_client(mcp_server_config)
use_own_browser = False
disable_security = True
use_own_browser = True
use_vision = True # Set to False when using DeepSeek
max_actions_per_step = 10
@@ -106,37 +104,35 @@ async def test_browser_use_agent():
browser_context = None
try:
extra_chromium_args = [f"--window-size={window_w},{window_h}"]
extra_browser_args = [f"--window-size={window_w},{window_h}"]
if use_own_browser:
chrome_path = os.getenv("CHROME_PATH", None)
if chrome_path == "":
chrome_path = None
chrome_user_data = os.getenv("CHROME_USER_DATA", None)
if chrome_user_data:
extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
browser_binary_path = os.getenv("BROWSER_PATH", None)
if browser_binary_path == "":
browser_binary_path = None
browser_user_data = os.getenv("BROWSER_USER_DATA", None)
if browser_user_data:
extra_browser_args += [f"--user-data-dir={browser_user_data}"]
else:
chrome_path = None
browser_binary_path = None
browser = CustomBrowser(
config=BrowserConfig(
headless=False,
disable_security=disable_security,
browser_binary_path=chrome_path,
extra_browser_args=extra_chromium_args,
browser_binary_path=browser_binary_path,
extra_browser_args=extra_browser_args,
)
)
browser_context = await browser.new_context(
config=CustomBrowserContextConfig(
trace_path="./tmp/traces",
save_recording_path="./tmp/record_videos",
config=BrowserContextConfig(
trace_path=None,
save_recording_path=None,
save_downloads_path="./tmp/downloads",
browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h
),
force_new_context=True
window_height=window_h,
window_width=window_w,
)
)
agent = Agent(
task="download pdf from https://arxiv.org/abs/2504.10458 and rename this pdf to 'GUI-r1-test.pdf'",
agent = BrowserUseAgent(
# task="download pdf from https://arxiv.org/pdf/2311.16498 and rename this pdf to 'mcp-test.pdf'",
task="give me nvidia stock price",
llm=llm,
browser=browser,
browser_context=browser_context,
@@ -153,7 +149,6 @@ async def test_browser_use_agent():
print("\nErrors:")
pprint(history.errors(), indent=4)
except Exception:
import traceback
traceback.print_exc()
@@ -167,24 +162,16 @@ async def test_browser_use_agent():
async def test_browser_use_parallel():
from browser_use.browser.context import BrowserContextWindowSize
from browser_use.browser.browser import BrowserConfig
from patchright.async_api import async_playwright
from browser_use.browser.browser import Browser
from src.browser.custom_context import BrowserContextConfig
from src.controller.custom_controller import CustomController
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import (
BrowserContextConfig,
BrowserContextWindowSize,
)
from browser_use.agent.service import Agent
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import CustomBrowserContextConfig
from src.controller.custom_controller import CustomController
from src.utils import llm_provider
from src.agent.browser_use.browser_use_agent import BrowserUseAgent
# llm = utils.get_llm_model(
# provider="openai",
@@ -233,15 +220,15 @@ async def test_browser_use_parallel():
mcp_server_config = {
"mcpServers": {
"markitdown": {
"command": "docker",
"args": [
"run",
"--rm",
"-i",
"markitdown-mcp:latest"
]
},
# "markitdown": {
# "command": "docker",
# "args": [
# "run",
# "--rm",
# "-i",
# "markitdown-mcp:latest"
# ]
# },
"desktop-commander": {
"command": "npx",
"args": [
@@ -261,8 +248,7 @@ async def test_browser_use_parallel():
}
controller = CustomController()
await controller.setup_mcp_client(mcp_server_config)
use_own_browser = False
disable_security = True
use_own_browser = True
use_vision = True # Set to False when using DeepSeek
max_actions_per_step = 10
@@ -270,37 +256,35 @@ async def test_browser_use_parallel():
browser_context = None
try:
extra_chromium_args = [f"--window-size={window_w},{window_h}"]
extra_browser_args = [f"--window-size={window_w},{window_h}"]
if use_own_browser:
chrome_path = os.getenv("CHROME_PATH", None)
if chrome_path == "":
chrome_path = None
chrome_user_data = os.getenv("CHROME_USER_DATA", None)
if chrome_user_data:
extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
browser_binary_path = os.getenv("BROWSER_PATH", None)
if browser_binary_path == "":
browser_binary_path = None
browser_user_data = os.getenv("BROWSER_USER_DATA", None)
if browser_user_data:
extra_browser_args += [f"--user-data-dir={browser_user_data}"]
else:
chrome_path = None
browser_binary_path = None
browser = CustomBrowser(
config=BrowserConfig(
headless=False,
disable_security=disable_security,
browser_binary_path=chrome_path,
extra_browser_args=extra_chromium_args,
browser_binary_path=browser_binary_path,
extra_browser_args=extra_browser_args,
)
)
browser_context = await browser.new_context(
config=CustomBrowserContextConfig(
trace_path="./tmp/traces",
save_recording_path="./tmp/record_videos",
config=BrowserContextConfig(
trace_path=None,
save_recording_path=None,
save_downloads_path="./tmp/downloads",
browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h
),
window_height=window_h,
window_width=window_w,
force_new_context=True
)
)
agents = [
Agent(task=task, llm=llm, browser=browser, controller=controller)
BrowserUseAgent(task=task, llm=llm, browser=browser, controller=controller)
for task in [
'Search Google for weather in Tokyo',
# 'Check Reddit front page title',
@@ -332,6 +316,8 @@ async def test_browser_use_parallel():
await browser_context.close()
if browser:
await browser.close()
if controller:
await controller.close_mcp_client()
async def test_deep_research_agent():
@@ -362,8 +348,8 @@ async def test_deep_research_agent():
browser_config = {"headless": False, "window_width": 1280, "window_height": 1100, "use_own_browser": False}
agent = DeepResearchAgent(llm=llm, browser_config=browser_config, mcp_server_config=mcp_server_config)
research_topic = "Impact of Microplastics on Marine Ecosystems"
task_id_to_resume = "815460fb-337a-4850-8fa4-a5f2db301a89" # Set this to resume a previous task ID
research_topic = "Give me investment advices of nvidia and tesla."
task_id_to_resume = "" # Set this to resume a previous task ID
print(f"Starting research on: {research_topic}")
@@ -403,6 +389,6 @@ async def test_deep_research_agent():
if __name__ == "__main__":
# asyncio.run(test_browser_use_agent())
asyncio.run(test_browser_use_agent())
# asyncio.run(test_browser_use_parallel())
asyncio.run(test_deep_research_agent())
# asyncio.run(test_deep_research_agent())

View File

@@ -14,20 +14,31 @@ async def test_mcp_client():
from src.utils.mcp_client import setup_mcp_client_and_tools, create_tool_param_model
test_server_config = {
"playwright": {
"command": "npx",
"args": [
"@playwright/mcp@latest",
],
"transport": "stdio",
},
"filesystem": {
"command": "npx",
"args": [
"-y",
"@modelcontextprotocol/server-filesystem",
"/Users/warmshao/ai_workspace",
]
"mcpServers": {
# "markitdown": {
# "command": "docker",
# "args": [
# "run",
# "--rm",
# "-i",
# "markitdown-mcp:latest"
# ]
# },
"desktop-commander": {
"command": "npx",
"args": [
"-y",
"@wonderwhy-er/desktop-commander"
]
},
# "filesystem": {
# "command": "npx",
# "args": [
# "-y",
# "@modelcontextprotocol/server-filesystem",
# "/Users/xxx/ai_workspace",
# ]
# },
}
}
@@ -48,15 +59,15 @@ async def test_controller_with_mcp():
mcp_server_config = {
"mcpServers": {
"markitdown": {
"command": "docker",
"args": [
"run",
"--rm",
"-i",
"markitdown-mcp:latest"
]
},
# "markitdown": {
# "command": "docker",
# "args": [
# "run",
# "--rm",
# "-i",
# "markitdown-mcp:latest"
# ]
# },
"desktop-commander": {
"command": "npx",
"args": [

View File

@@ -142,17 +142,17 @@ def test_ibm_model():
def test_qwen_model():
config = LLMConfig(provider="alibaba", model_name="qwen3-30b-a3b")
config = LLMConfig(provider="alibaba", model_name="qwen-vl-max")
test_llm(config, "How many 'r's are in the word 'strawberry'?")
if __name__ == "__main__":
# test_openai_model()
# test_google_model()
# test_azure_openai_model()
test_azure_openai_model()
# test_deepseek_model()
# test_ollama_model()
test_deepseek_r1_model()
# test_deepseek_r1_model()
# test_deepseek_r1_ollama_model()
# test_mistral_model()
# test_ibm_model()